Removed unnecessary JSON download in papers directory
This commit is contained in:
parent
e4adde5484
commit
e35cd9a906
@ -12,6 +12,7 @@ from dotenv import load_dotenv
|
|||||||
from src.analysis.llm_analyzer import LLMAnalyzer
|
from src.analysis.llm_analyzer import LLMAnalyzer
|
||||||
from src.storage.paper_store import PaperStore
|
from src.storage.paper_store import PaperStore
|
||||||
from src.storage.vector_store import VectorStore
|
from src.storage.vector_store import VectorStore
|
||||||
|
from src.data_acquisition.pdf_downloader import PDFDownloader
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -24,6 +25,7 @@ class AgentController:
|
|||||||
self.llm_analyzer = None
|
self.llm_analyzer = None
|
||||||
self.paper_store = None
|
self.paper_store = None
|
||||||
self.vector_store = None
|
self.vector_store = None
|
||||||
|
self.pdf_downloader = None
|
||||||
self.papers_dir = Path("papers")
|
self.papers_dir = Path("papers")
|
||||||
|
|
||||||
async def __aenter__(self):
|
async def __aenter__(self):
|
||||||
@ -57,6 +59,7 @@ class AgentController:
|
|||||||
self.llm_analyzer = LLMAnalyzer(api_key=deepseek_key, provider='deepseek')
|
self.llm_analyzer = LLMAnalyzer(api_key=deepseek_key, provider='deepseek')
|
||||||
self.paper_store = PaperStore()
|
self.paper_store = PaperStore()
|
||||||
self.vector_store = VectorStore()
|
self.vector_store = VectorStore()
|
||||||
|
self.pdf_downloader = PDFDownloader()
|
||||||
|
|
||||||
# Initialize database
|
# Initialize database
|
||||||
await self.paper_store.initialize()
|
await self.paper_store.initialize()
|
||||||
@ -86,6 +89,15 @@ class AgentController:
|
|||||||
if paper_data.get('abstract'):
|
if paper_data.get('abstract'):
|
||||||
paper_text += f"Abstract: {paper_data['abstract']}\n\n"
|
paper_text += f"Abstract: {paper_data['abstract']}\n\n"
|
||||||
|
|
||||||
|
# Download PDF if URL is available
|
||||||
|
if paper_data.get('pdf_url'):
|
||||||
|
try:
|
||||||
|
pdf_path = await self.pdf_downloader.download_pdf(paper_data['pdf_url'], paper_data['entry_id'].split('/')[-1])
|
||||||
|
if pdf_path:
|
||||||
|
logger.info(f"Successfully downloaded PDF to {pdf_path}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error downloading PDF: {e}")
|
||||||
|
|
||||||
# Analyze paper using LLM
|
# Analyze paper using LLM
|
||||||
analysis = await self.llm_analyzer.analyze_paper(paper_text)
|
analysis = await self.llm_analyzer.analyze_paper(paper_text)
|
||||||
|
|
||||||
@ -107,14 +119,6 @@ class AgentController:
|
|||||||
await self._repair_orphaned_paper(paper_id, paper_data)
|
await self._repair_orphaned_paper(paper_id, paper_data)
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
# Clean paper_id to use as filename
|
|
||||||
safe_id = paper_id.replace('.', '_')
|
|
||||||
|
|
||||||
# Save paper content to file
|
|
||||||
paper_path = self.papers_dir / f"{safe_id}.json"
|
|
||||||
with open(paper_path, 'w', encoding='utf-8') as f:
|
|
||||||
json.dump(paper_data, f, indent=2, ensure_ascii=False)
|
|
||||||
|
|
||||||
# Store metadata in PostgreSQL
|
# Store metadata in PostgreSQL
|
||||||
metadata = {
|
metadata = {
|
||||||
'id': paper_id,
|
'id': paper_id,
|
||||||
@ -140,9 +144,9 @@ class AgentController:
|
|||||||
{'paper_id': paper_id, 'type': 'technical_concepts'}
|
{'paper_id': paper_id, 'type': 'technical_concepts'}
|
||||||
]
|
]
|
||||||
chunk_ids = [
|
chunk_ids = [
|
||||||
f"{safe_id}_text",
|
f"{paper_id}_text",
|
||||||
f"{safe_id}_summary",
|
f"{paper_id}_summary",
|
||||||
f"{safe_id}_concepts"
|
f"{paper_id}_concepts"
|
||||||
]
|
]
|
||||||
self.vector_store.add_chunks(chunks, chunk_metadata, chunk_ids)
|
self.vector_store.add_chunks(chunks, chunk_metadata, chunk_ids)
|
||||||
|
|
||||||
@ -180,13 +184,6 @@ class AgentController:
|
|||||||
for paper_id in paper_ids:
|
for paper_id in paper_ids:
|
||||||
paper = await self.paper_store.get_paper(paper_id)
|
paper = await self.paper_store.get_paper(paper_id)
|
||||||
if paper:
|
if paper:
|
||||||
# Load full paper data if needed
|
|
||||||
safe_id = paper_id.replace('.', '_')
|
|
||||||
paper_path = self.papers_dir / f"{safe_id}.json"
|
|
||||||
if paper_path.exists():
|
|
||||||
with open(paper_path, 'r', encoding='utf-8') as f:
|
|
||||||
full_paper = json.load(f)
|
|
||||||
paper['full_data'] = full_paper
|
|
||||||
papers.append(paper)
|
papers.append(paper)
|
||||||
|
|
||||||
return papers
|
return papers
|
||||||
@ -207,6 +204,8 @@ class AgentController:
|
|||||||
await self.paper_store.close()
|
await self.paper_store.close()
|
||||||
if self.vector_store:
|
if self.vector_store:
|
||||||
await self.vector_store.close()
|
await self.vector_store.close()
|
||||||
|
if self.pdf_downloader:
|
||||||
|
await self.pdf_downloader.close()
|
||||||
|
|
||||||
self.initialized = False
|
self.initialized = False
|
||||||
logger.info("Agent controller closed successfully")
|
logger.info("Agent controller closed successfully")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user