removed unnecessary json save/load in papers directory

This commit is contained in:
kpcto 2025-02-12 23:45:21 +00:00
parent e4adde5484
commit e35cd9a906

View File

@ -12,6 +12,7 @@ from dotenv import load_dotenv
from src.analysis.llm_analyzer import LLMAnalyzer
from src.storage.paper_store import PaperStore
from src.storage.vector_store import VectorStore
from src.data_acquisition.pdf_downloader import PDFDownloader
logger = logging.getLogger(__name__)
@ -24,6 +25,7 @@ class AgentController:
self.llm_analyzer = None
self.paper_store = None
self.vector_store = None
self.pdf_downloader = None
self.papers_dir = Path("papers")
async def __aenter__(self):
@ -57,6 +59,7 @@ class AgentController:
self.llm_analyzer = LLMAnalyzer(api_key=deepseek_key, provider='deepseek')
self.paper_store = PaperStore()
self.vector_store = VectorStore()
self.pdf_downloader = PDFDownloader()
# Initialize database
await self.paper_store.initialize()
@ -86,6 +89,15 @@ class AgentController:
if paper_data.get('abstract'):
paper_text += f"Abstract: {paper_data['abstract']}\n\n"
# Download PDF if URL is available
if paper_data.get('pdf_url'):
try:
pdf_path = await self.pdf_downloader.download_pdf(paper_data['pdf_url'], paper_data['entry_id'].split('/')[-1])
if pdf_path:
logger.info(f"Successfully downloaded PDF to {pdf_path}")
except Exception as e:
logger.error(f"Error downloading PDF: {e}")
# Analyze paper using LLM
analysis = await self.llm_analyzer.analyze_paper(paper_text)
@ -107,14 +119,6 @@ class AgentController:
await self._repair_orphaned_paper(paper_id, paper_data)
return {}
# Clean paper_id to use as filename
safe_id = paper_id.replace('.', '_')
# Save paper content to file
paper_path = self.papers_dir / f"{safe_id}.json"
with open(paper_path, 'w', encoding='utf-8') as f:
json.dump(paper_data, f, indent=2, ensure_ascii=False)
# Store metadata in PostgreSQL
metadata = {
'id': paper_id,
@ -140,9 +144,9 @@ class AgentController:
{'paper_id': paper_id, 'type': 'technical_concepts'}
]
chunk_ids = [
f"{safe_id}_text",
f"{safe_id}_summary",
f"{safe_id}_concepts"
f"{paper_id}_text",
f"{paper_id}_summary",
f"{paper_id}_concepts"
]
self.vector_store.add_chunks(chunks, chunk_metadata, chunk_ids)
@ -180,13 +184,6 @@ class AgentController:
for paper_id in paper_ids:
paper = await self.paper_store.get_paper(paper_id)
if paper:
# Load full paper data if needed
safe_id = paper_id.replace('.', '_')
paper_path = self.papers_dir / f"{safe_id}.json"
if paper_path.exists():
with open(paper_path, 'r', encoding='utf-8') as f:
full_paper = json.load(f)
paper['full_data'] = full_paper
papers.append(paper)
return papers
@ -207,6 +204,8 @@ class AgentController:
await self.paper_store.close()
if self.vector_store:
await self.vector_store.close()
if self.pdf_downloader:
await self.pdf_downloader.close()
self.initialized = False
logger.info("Agent controller closed successfully")