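"""Research-paper analysis pipeline built on LangChain.

Summarizes documents with a map-reduce chain, extracts key technical concepts,
and surfaces cross-paper themes using a DeepSeek model behind an
OpenAI-compatible endpoint.
"""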
from langchain.schema import Document
from langchain.chains.summarize import load_summarize_chain
from langchain_community.llms import OpenAI
from typing import List, Dict
import yaml
import os


class TechnicalConceptExtractor:
    """Extracts key technical concepts from paper text with a single LLM call."""

    def __init__(self, llm):
        self.llm = llm

    def extract_concepts(self, text: str) -> List[str]:
        # Truncate the input so the prompt stays within the model's context window.
        prompt = f"""Identify key technical concepts from this text:

{text[:5000]}

List them as comma-separated values:"""
        result = self.llm(prompt)
        return [c.strip() for c in result.split(",")]


class InsightAggregator:
    """Finds cross-cutting themes across a set of paper summaries."""

    def __init__(self, llm):
        self.llm = llm

    def find_connections(self, papers: List[Dict]) -> str:
        summaries = "\n\n".join([p["summary"] for p in papers])
        # Cap the combined summaries so the prompt stays within the context window.
        prompt = f"""Analyze these paper summaries and identify cross-cutting themes:

{summaries[:10000]}

Provide a structured analysis:"""
        return self.llm(prompt)


class PaperAnalyzer:
    """Runs summarization, concept extraction, and insight aggregation over papers."""

    def __init__(self, config_path: str = "config/settings.yaml"):
        with open(config_path) as f:
            self.config = yaml.safe_load(f)

        self.llm = OpenAI(
            temperature=0.5,  # More conservative temperature for academic analysis
            model_name="deepseek-r1",
            openai_api_key=os.getenv("DEEPSEEK_API_KEY"),
            openai_api_base="https://api.deepseek.com/v1",
            max_tokens=4096,  # Increased token limit for complex papers
        )
        self.summary_chain = load_summarize_chain(self.llm, chain_type="map_reduce")
        self.concept_extractor = TechnicalConceptExtractor(self.llm)
        self.insight_aggregator = InsightAggregator(self.llm)

    def analyze_document(self, document: Document) -> Dict:
        summary = self.summary_chain.run([document])
        concepts = self.concept_extractor.extract_concepts(document.page_content)
        return {
            "summary": summary,
            "concepts": concepts,
            "metadata": document.metadata,
        }
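

# Example usage sketch: assumes config/settings.yaml exists and DEEPSEEK_API_KEY
# is set; the sample text and metadata below are placeholder values.
if __name__ == "__main__":
    sample = Document(
        page_content="Transformer architectures rely on self-attention ...",
        metadata={"title": "Sample Paper", "source": "placeholder"},
    )

    analyzer = PaperAnalyzer()
    result = analyzer.analyze_document(sample)
    print(result["summary"])
    print(result["concepts"])

    # Cross-paper themes once several analysis results have been collected.
    print(analyzer.insight_aggregator.find_connections([result]))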