diff --git a/src/utils/agent_controller.py b/src/utils/agent_controller.py
index c644c3e..f229aa4 100644
--- a/src/utils/agent_controller.py
+++ b/src/utils/agent_controller.py
@@ -12,6 +12,7 @@ from dotenv import load_dotenv
 from src.analysis.llm_analyzer import LLMAnalyzer
 from src.storage.paper_store import PaperStore
 from src.storage.vector_store import VectorStore
+from src.data_acquisition.pdf_downloader import PDFDownloader
 
 logger = logging.getLogger(__name__)
 
@@ -24,6 +25,7 @@ class AgentController:
         self.llm_analyzer = None
         self.paper_store = None
         self.vector_store = None
+        self.pdf_downloader = None
         self.papers_dir = Path("papers")
 
     async def __aenter__(self):
@@ -57,6 +59,7 @@ class AgentController:
             self.llm_analyzer = LLMAnalyzer(api_key=deepseek_key, provider='deepseek')
             self.paper_store = PaperStore()
             self.vector_store = VectorStore()
+            self.pdf_downloader = PDFDownloader()
 
             # Initialize database
             await self.paper_store.initialize()
@@ -86,6 +89,15 @@ class AgentController:
             if paper_data.get('abstract'):
                 paper_text += f"Abstract: {paper_data['abstract']}\n\n"
 
+            # Download PDF if URL is available
+            if paper_data.get('pdf_url'):
+                try:
+                    pdf_path = await self.pdf_downloader.download_pdf(paper_data['pdf_url'], paper_data['entry_id'].split('/')[-1])
+                    if pdf_path:
+                        logger.info(f"Successfully downloaded PDF to {pdf_path}")
+                except Exception as e:
+                    logger.error(f"Error downloading PDF: {e}")
+
             # Analyze paper using LLM
             analysis = await self.llm_analyzer.analyze_paper(paper_text)
 
@@ -107,14 +119,6 @@ class AgentController:
                 await self._repair_orphaned_paper(paper_id, paper_data)
                 return {}
 
-            # Clean paper_id to use as filename
-            safe_id = paper_id.replace('.', '_')
-
-            # Save paper content to file
-            paper_path = self.papers_dir / f"{safe_id}.json"
-            with open(paper_path, 'w', encoding='utf-8') as f:
-                json.dump(paper_data, f, indent=2, ensure_ascii=False)
-
             # Store metadata in PostgreSQL
             metadata = {
                 'id': paper_id,
@@ -140,9 +144,9 @@ class AgentController:
                 {'paper_id': paper_id, 'type': 'technical_concepts'}
             ]
             chunk_ids = [
-                f"{safe_id}_text",
-                f"{safe_id}_summary",
-                f"{safe_id}_concepts"
+                f"{paper_id}_text",
+                f"{paper_id}_summary",
+                f"{paper_id}_concepts"
             ]
 
             self.vector_store.add_chunks(chunks, chunk_metadata, chunk_ids)
@@ -180,13 +184,6 @@ class AgentController:
         for paper_id in paper_ids:
             paper = await self.paper_store.get_paper(paper_id)
             if paper:
-                # Load full paper data if needed
-                safe_id = paper_id.replace('.', '_')
-                paper_path = self.papers_dir / f"{safe_id}.json"
-                if paper_path.exists():
-                    with open(paper_path, 'r', encoding='utf-8') as f:
-                        full_paper = json.load(f)
-                    paper['full_data'] = full_paper
                 papers.append(paper)
 
         return papers
@@ -207,6 +204,8 @@ class AgentController:
             await self.paper_store.close()
         if self.vector_store:
             await self.vector_store.close()
+        if self.pdf_downloader:
+            await self.pdf_downloader.close()
 
         self.initialized = False
         logger.info("Agent controller closed successfully")