Removed unnecessary JSON download in papers directory
This commit is contained in:
parent
e4adde5484
commit
e35cd9a906
@ -12,6 +12,7 @@ from dotenv import load_dotenv
|
||||
from src.analysis.llm_analyzer import LLMAnalyzer
|
||||
from src.storage.paper_store import PaperStore
|
||||
from src.storage.vector_store import VectorStore
|
||||
from src.data_acquisition.pdf_downloader import PDFDownloader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -24,6 +25,7 @@ class AgentController:
|
||||
self.llm_analyzer = None
|
||||
self.paper_store = None
|
||||
self.vector_store = None
|
||||
self.pdf_downloader = None
|
||||
self.papers_dir = Path("papers")
|
||||
|
||||
async def __aenter__(self):
|
||||
@ -57,6 +59,7 @@ class AgentController:
|
||||
self.llm_analyzer = LLMAnalyzer(api_key=deepseek_key, provider='deepseek')
|
||||
self.paper_store = PaperStore()
|
||||
self.vector_store = VectorStore()
|
||||
self.pdf_downloader = PDFDownloader()
|
||||
|
||||
# Initialize database
|
||||
await self.paper_store.initialize()
|
||||
@ -86,6 +89,15 @@ class AgentController:
|
||||
if paper_data.get('abstract'):
|
||||
paper_text += f"Abstract: {paper_data['abstract']}\n\n"
|
||||
|
||||
# Download PDF if URL is available
|
||||
if paper_data.get('pdf_url'):
|
||||
try:
|
||||
pdf_path = await self.pdf_downloader.download_pdf(paper_data['pdf_url'], paper_data['entry_id'].split('/')[-1])
|
||||
if pdf_path:
|
||||
logger.info(f"Successfully downloaded PDF to {pdf_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error downloading PDF: {e}")
|
||||
|
||||
# Analyze paper using LLM
|
||||
analysis = await self.llm_analyzer.analyze_paper(paper_text)
|
||||
|
||||
@ -107,14 +119,6 @@ class AgentController:
|
||||
await self._repair_orphaned_paper(paper_id, paper_data)
|
||||
return {}
|
||||
|
||||
# Clean paper_id to use as filename
|
||||
safe_id = paper_id.replace('.', '_')
|
||||
|
||||
# Save paper content to file
|
||||
paper_path = self.papers_dir / f"{safe_id}.json"
|
||||
with open(paper_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(paper_data, f, indent=2, ensure_ascii=False)
|
||||
|
||||
# Store metadata in PostgreSQL
|
||||
metadata = {
|
||||
'id': paper_id,
|
||||
@ -140,9 +144,9 @@ class AgentController:
|
||||
{'paper_id': paper_id, 'type': 'technical_concepts'}
|
||||
]
|
||||
chunk_ids = [
|
||||
f"{safe_id}_text",
|
||||
f"{safe_id}_summary",
|
||||
f"{safe_id}_concepts"
|
||||
f"{paper_id}_text",
|
||||
f"{paper_id}_summary",
|
||||
f"{paper_id}_concepts"
|
||||
]
|
||||
self.vector_store.add_chunks(chunks, chunk_metadata, chunk_ids)
|
||||
|
||||
@ -180,13 +184,6 @@ class AgentController:
|
||||
for paper_id in paper_ids:
|
||||
paper = await self.paper_store.get_paper(paper_id)
|
||||
if paper:
|
||||
# Load full paper data if needed
|
||||
safe_id = paper_id.replace('.', '_')
|
||||
paper_path = self.papers_dir / f"{safe_id}.json"
|
||||
if paper_path.exists():
|
||||
with open(paper_path, 'r', encoding='utf-8') as f:
|
||||
full_paper = json.load(f)
|
||||
paper['full_data'] = full_paper
|
||||
papers.append(paper)
|
||||
|
||||
return papers
|
||||
@ -207,6 +204,8 @@ class AgentController:
|
||||
await self.paper_store.close()
|
||||
if self.vector_store:
|
||||
await self.vector_store.close()
|
||||
if self.pdf_downloader:
|
||||
await self.pdf_downloader.close()
|
||||
|
||||
self.initialized = False
|
||||
logger.info("Agent controller closed successfully")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user