import os
from typing import Dict, List

import yaml
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document
from langchain_community.llms import OpenAI

# Prompt-truncation limits keep request sizes within the model's context
# window; values preserved from the original implementation.
_CONCEPT_TEXT_LIMIT = 5000
_SUMMARY_TEXT_LIMIT = 10000


class TechnicalConceptExtractor:
    """Extracts a flat list of technical concepts from raw text via an LLM."""

    def __init__(self, llm):
        # llm is expected to be callable with a prompt string and return a
        # string (legacy LangChain LLM __call__ convention) — TODO confirm
        # against the caller.
        self.llm = llm

    def extract_concepts(self, text: str) -> List[str]:
        """Return key technical concepts found in ``text``.

        Only the first 5000 characters are sent to the model. Blank
        entries are filtered out, so an empty or whitespace-only model
        response yields ``[]`` (the original returned ``['']`` because
        ``"".split(",")`` produces one empty element).

        Args:
            text: Source text to mine for concepts.

        Returns:
            Stripped, non-empty concept strings.
        """
        prompt = f"""Identify key technical concepts from this text: {text[:_CONCEPT_TEXT_LIMIT]} List them as comma-separated values:"""
        result = self.llm(prompt)
        # Filter blanks: splitting an empty response on "," yields [""].
        return [c.strip() for c in result.split(",") if c.strip()]


class InsightAggregator:
    """Finds cross-cutting themes across multiple paper summaries via an LLM."""

    def __init__(self, llm):
        self.llm = llm

    def find_connections(self, papers: List[Dict]) -> str:
        """Return an LLM-written analysis of themes shared across ``papers``.

        Args:
            papers: Dicts each containing at least a ``"summary"`` key.

        Returns:
            The model's structured analysis as a single string.
        """
        summaries = "\n\n".join([p["summary"] for p in papers])
        # Truncate the combined summaries to stay within the context window.
        prompt = f"""Analyze these paper summaries and identify cross-cutting themes: {summaries[:_SUMMARY_TEXT_LIMIT]} Provide a structured analysis:"""
        return self.llm(prompt)


class PaperAnalyzer:
    """Summarizes academic papers and extracts concepts using a DeepSeek LLM."""

    def __init__(self, config_path: str = "config/settings.yaml"):
        """Load settings and wire up the LLM, summarize chain, and helpers.

        Args:
            config_path: Path to a YAML settings file.

        Raises:
            FileNotFoundError: If ``config_path`` does not exist.
            yaml.YAMLError: If the settings file is not valid YAML.
        """
        with open(config_path) as f:
            self.config = yaml.safe_load(f)
        # NOTE(review): api_key is None if DEEPSEEK_API_KEY is unset; the
        # failure then surfaces later at request time — consider validating
        # here for a clearer error.
        self.llm = OpenAI(
            temperature=0.5,  # conservative temperature for academic analysis
            model_name="deepseek-r1",
            api_key=os.getenv("DEEPSEEK_API_KEY"),
            base_url="https://api.deepseek.com/v1",
            max_tokens=4096,  # higher token limit for complex papers
        )
        # map_reduce summarization handles documents longer than one context.
        self.summary_chain = load_summarize_chain(self.llm, chain_type="map_reduce")
        self.concept_extractor = TechnicalConceptExtractor(self.llm)
        self.insight_aggregator = InsightAggregator(self.llm)

    def analyze_document(self, document: Document) -> Dict:
        """Summarize one document and extract its technical concepts.

        Args:
            document: LangChain ``Document`` with ``page_content`` and
                ``metadata``.

        Returns:
            Dict with keys ``"summary"`` (str), ``"concepts"``
            (list of str), and ``"metadata"`` (the document's metadata).
        """
        summary = self.summary_chain.run([document])
        concepts = self.concept_extractor.extract_concepts(document.page_content)
        return {
            "summary": summary,
            "concepts": concepts,
            "metadata": document.metadata,
        }