Initial commit: Academic paper processing system

kpcto 2025-01-31 02:00:36 +00:00
commit ec6a6b0cd0
9 changed files with 470 additions and 0 deletions

.env.example
@@ -0,0 +1,20 @@
# LLM API Keys
DEEPSEEK_API_KEY=your-deepseek-key-here
OPENAI_API_KEY=your-openai-key-here  # Optional backup provider

# Database Credentials
POSTGRES_USER=your-postgres-user
POSTGRES_PASSWORD=your-postgres-password
POSTGRES_DB=paper_review

# Redis Configuration
REDIS_URL=redis://localhost:6379/0

# Storage Paths
CACHE_DIR=cache/papers
VECTOR_STORE_PATH=data/chroma
LOG_PATH=logs/agent.log

# Security
ENCRYPTION_KEY=your-encryption-key-here
MAX_REQUESTS_PER_MINUTE=100
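
These variables are presumably consumed with `python-dotenv` (listed in `requirements.txt`) and `os.getenv`, as the analysis engine later in this commit does. A minimal loading sketch; the variable names match `.env.example`, everything else is illustrative:

```python
import os

from dotenv import load_dotenv

# Read .env into the process environment (deployments may inject these directly instead)
load_dotenv()

deepseek_key = os.getenv("DEEPSEEK_API_KEY")
redis_url = os.getenv("REDIS_URL", "redis://localhost:6379/0")
if deepseek_key is None:
    raise RuntimeError("DEEPSEEK_API_KEY is not set")
```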

.gitignore
@@ -0,0 +1,9 @@
venv/
.venv/
.env
__pycache__/
*.pyc
.vscode/
.idea/
*.sqlite3
*.comments

architecture.md
@@ -0,0 +1,62 @@
# AI Paper Review Agent Architecture

## Overview

Modular system for fetching, processing, and summarizing academic papers using LangChain.

### Core Components

1. **Data Acquisition Layer**
   - arXiv API Client (REST interface)
   - PDF Downloader Service
   - Metadata Extraction Module
2. **Processing Pipeline**
   - PDF Text Extractor (pypdf/Unstructured)
   - Semantic Chunker
   - Metadata Enricher (author institutions, citations)
3. **Analysis Engine**
   - LangChain Document Loaders
   - Multi-stage Summary Chain (DeepSeek-R1)
   - Technical Concept Extractor
   - Cross-Paper Insight Aggregator
4. **Storage Layer**
   - Relational Storage (PostgreSQL)
   - Vector Store (Chroma; see the sketch after this list)
   - Cache System (Redis)
5. **Orchestration**
   - Agent Controller Class
   - Retry Mechanism with Exponential Backoff
   - Quality Assurance Checks
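
The vector store is only declared at this level (`chromadb` appears in `requirements.txt`, and the `data/chroma` path and `papers` collection in `config/settings.yaml`); nothing in this commit writes to it yet. A minimal sketch of how chunked paper text could be indexed, assuming Chroma's default embedding function; the id scheme and metadata are illustrative:

```python
import chromadb

# Persist location and collection name taken from config/settings.yaml
client = chromadb.PersistentClient(path="data/chroma")
collection = client.get_or_create_collection("papers")

def index_chunks(paper_id: str, chunks: list[str]) -> None:
    # One entry per chunk, keyed by paper id plus chunk index (illustrative scheme)
    collection.add(
        ids=[f"{paper_id}-{i}" for i in range(len(chunks))],
        documents=chunks,
        metadatas=[{"paper_id": paper_id, "chunk": i} for i in range(len(chunks))],
    )
```
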
## Architectural Diagram
```mermaid
graph LR
A[User Query] --> B(arXiv API)
B --> C[PDF Storage]
C --> D{Processing Queue}
D --> E[Text Extraction]
E --> F[Chunking]
F --> G[Embedding]
G --> H[Vector Store]
H --> I[LLM Analysis]
I --> J[Report Generation]
```
## Key Decisions

1. **Modular Design**: Components communicate via clean interfaces for easy replacement
2. **Batch Processing**: Asynchronous pipeline for parallel paper processing (see the sketch after this list)
3. **Caching Layer**: Reduces API calls and improves performance
4. **Fallback Strategies**: Multiple PDF parsers with automatic fallback
5. **Security**: Environment variables for credentials, encrypted storage
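
Decision 2 is a design intent; the controller later in this commit processes papers sequentially. A minimal sketch of one way the batch step could be parallelised, assuming the `AgentController.process_paper` interface defined below; the concurrency limit is an illustrative choice:

```python
import asyncio
from typing import Dict, List

async def process_papers_concurrently(controller, papers: List[Dict], limit: int = 4) -> List[Dict]:
    # Bound concurrency so the arXiv and LLM APIs are not flooded
    semaphore = asyncio.Semaphore(limit)

    async def process_one(paper: Dict):
        async with semaphore:
            # process_paper is synchronous, so run each call in a worker thread
            return await asyncio.to_thread(
                controller.process_paper, paper["id"], paper["pdf_url"]
            )

    results = await asyncio.gather(*(process_one(p) for p in papers))
    return [r for r in results if r is not None]
```
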
## Dependencies

```text
langchain==0.2.1
arxiv==2.1.0
unstructured==0.12.2
openai==1.30.1
faiss-cpu==1.8.0
sqlalchemy==2.0.30
```

config/settings.yaml
@@ -0,0 +1,63 @@
# AI Paper Review Agent Configuration

# LLM Settings
llm:
  temperature: 0.5
  max_tokens: 4096
  model: deepseek-r1

# ArXiv Client Settings
arxiv:
  max_results_per_query: 10
  cache_dir: cache/papers
  retry_attempts: 3
  retry_delay: 2  # seconds

# PDF Processing
pdf:
  chunk_size: 1000
  chunk_overlap: 100
  fallback_enabled: true
  supported_formats:
    - pdf
    - PDF

# Analysis Settings
analysis:
  summary_chain_type: map_reduce
  min_concepts: 3
  max_summary_length: 2000

# Storage Configuration
storage:
  # PostgreSQL settings
  postgres:
    host: localhost
    port: 5432
    database: paper_review
    user: ${POSTGRES_USER}
    password: ${POSTGRES_PASSWORD}
  # Vector store settings
  chroma:
    persist_directory: data/chroma
    collection_name: papers
  # Redis cache settings
  redis:
    host: localhost
    port: 6379
    db: 0
    ttl: 86400  # 24 hours

# Security
security:
  encrypt_storage: true
  api_rate_limit: 100  # requests per minute
  max_file_size: 50000000  # 50 MB

# Logging
logging:
  level: INFO
  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  file: logs/agent.log
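
Note that `yaml.safe_load` does not expand the `${POSTGRES_USER}`-style placeholders above on its own; a minimal loading sketch that substitutes them from the environment first (the `load_settings` helper is illustrative, not part of this commit):

```python
import os

import yaml

def load_settings(path: str = "config/settings.yaml") -> dict:
    # Expand ${VAR} placeholders against the process environment before parsing
    with open(path) as f:
        raw = os.path.expandvars(f.read())
    return yaml.safe_load(raw)
```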

requirements.txt
@@ -0,0 +1,12 @@
langchain==0.2.1
arxiv==2.1.0
unstructured==0.12.2
deepseek-ai==1.0.0
faiss-cpu==1.8.0
sqlalchemy==2.0.30
python-dotenv==1.0.0
pyyaml==6.0.1
pypdf==4.2.0
redis==5.0.3
chromadb==0.5.0
requests>=2.31.0

@@ -0,0 +1,53 @@
from langchain.schema import Document
from langchain.chains.summarize import load_summarize_chain
from langchain_community.llms import OpenAI
from typing import List, Dict
import yaml
import os


class TechnicalConceptExtractor:
    def __init__(self, llm):
        self.llm = llm

    def extract_concepts(self, text: str) -> List[str]:
        prompt = f"""Identify key technical concepts from this text:
{text[:5000]}
List them as comma-separated values:"""
        result = self.llm(prompt)
        return [c.strip() for c in result.split(",")]


class InsightAggregator:
    def __init__(self, llm):
        self.llm = llm

    def find_connections(self, papers: List[Dict]) -> str:
        summaries = "\n\n".join([p["summary"] for p in papers])
        prompt = f"""Analyze these paper summaries and identify cross-cutting themes:
{summaries[:10000]}
Provide a structured analysis:"""
        return self.llm(prompt)


class PaperAnalyzer:
    def __init__(self, config_path: str = "config/settings.yaml"):
        with open(config_path) as f:
            self.config = yaml.safe_load(f)
        self.llm = OpenAI(
            temperature=0.5,  # More conservative temperature for academic analysis
            model_name="deepseek-r1",
            api_key=os.getenv("DEEPSEEK_API_KEY"),
            base_url="https://api.deepseek.com/v1",
            max_tokens=4096  # Increased token limit for complex papers
        )
        self.summary_chain = load_summarize_chain(self.llm, chain_type="map_reduce")
        self.concept_extractor = TechnicalConceptExtractor(self.llm)
        self.insight_aggregator = InsightAggregator(self.llm)

    def analyze_document(self, document: Document) -> Dict:
        summary = self.summary_chain.run([document])
        concepts = self.concept_extractor.extract_concepts(document.page_content)
        return {
            "summary": summary,
            "concepts": concepts,
            "metadata": document.metadata
        }
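
A brief usage sketch for `PaperAnalyzer` above; the document text and metadata are made up for illustration, and `DEEPSEEK_API_KEY` must be set in the environment:

```python
from langchain.schema import Document

analyzer = PaperAnalyzer()  # reads config/settings.yaml and DEEPSEEK_API_KEY

doc = Document(
    page_content="We study retrieval-augmented generation for scientific question answering ...",  # illustrative text
    metadata={"id": "0000.00000", "title": "Illustrative paper"},  # illustrative metadata
)
report = analyzer.analyze_document(doc)
print(report["concepts"])
```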

@@ -0,0 +1,61 @@
import arxiv
import os
from typing import List, Dict
import requests
from pathlib import Path


class ArxivClient:
    def __init__(self, cache_dir: str = "cache/papers"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def search_papers(self, query: str, max_results: int = 10) -> List[Dict]:
        """Search arXiv for papers matching the query"""
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )
        results = []
        for paper in search.results():
            results.append({
                "id": paper.entry_id.split("/")[-1],
                "title": paper.title,
                "authors": [author.name for author in paper.authors],
                "summary": paper.summary,
                "pdf_url": paper.pdf_url,
                "published": paper.published.isoformat(),
                "categories": paper.categories
            })
        return results

    def download_paper(self, paper_id: str, pdf_url: str) -> str:
        """Download paper PDF and return local path"""
        cache_path = self.cache_dir / f"{paper_id}.pdf"
        if cache_path.exists():
            return str(cache_path)
        response = requests.get(pdf_url, stream=True)
        response.raise_for_status()
        with open(cache_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return str(cache_path)

    def extract_metadata(self, paper_id: str) -> Dict:
        """Extract additional metadata from the arXiv API"""
        url = f"http://export.arxiv.org/api/query?id_list={paper_id}"
        response = requests.get(url)
        response.raise_for_status()
        # Basic metadata extraction from the API response;
        # could be enhanced with XML parsing for more detailed info
        return {
            "id": paper_id,
            "retrieved_at": response.headers.get("Last-Modified"),
            "size": len(response.content)
        }
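
A brief usage sketch for `ArxivClient` above; the query string is illustrative:

```python
client = ArxivClient()
papers = client.search_papers("large language model evaluation", max_results=3)
for paper in papers:
    pdf_path = client.download_paper(paper["id"], paper["pdf_url"])
    print(paper["title"], "->", pdf_path)
```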

@@ -0,0 +1,107 @@
import time
import json
from typing import Dict, List, Optional
import logging
from pathlib import Path

import redis
from langchain.schema import Document

from ..data_acquisition.arxiv_client import ArxivClient
from ..processing.pdf_processor import PDFProcessor
from ..analysis.analysis_engine import PaperAnalyzer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class AgentController:
    def __init__(self, config: Dict):
        self.config = config
        self.arxiv_client = ArxivClient()
        self.pdf_processor = PDFProcessor()
        self.paper_analyzer = PaperAnalyzer()
        # Initialize Redis for caching (settings.yaml nests these under storage.redis)
        redis_config = config.get("storage", {}).get("redis", {})
        self.cache = redis.Redis(
            host=redis_config.get("host", "localhost"),
            port=redis_config.get("port", 6379),
            db=redis_config.get("db", 0)
        )
        self.cache_ttl = redis_config.get("ttl", 86400)  # Default 24h TTL

    def process_paper(self, paper_id: str, pdf_url: str, max_retries: int = 3) -> Optional[Dict]:
        """Process a single paper with retry mechanism"""
        for attempt in range(max_retries):
            try:
                # Download PDF
                pdf_path = self.arxiv_client.download_paper(paper_id, pdf_url)
                # Extract text with fallback options
                text = self.pdf_processor.extract_text(pdf_path)
                if not text.strip():
                    raise ValueError("Extracted text is empty")
                # Create document with metadata
                metadata = {
                    **self.arxiv_client.extract_metadata(paper_id),
                    **self.pdf_processor.extract_metadata(pdf_path)
                }
                document = Document(page_content=text, metadata=metadata)
                # Analyze content
                return self.paper_analyzer.analyze_document(document)
            except Exception as e:
                logger.error(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt < max_retries - 1:
                    wait_time = 2 ** attempt  # Exponential backoff
                    time.sleep(wait_time)
                else:
                    logger.error(f"All attempts failed for paper {paper_id}")
                    return None

    def process_query(self, query: str, max_papers: int = 5) -> List[Dict]:
        """Process multiple papers for a given query"""
        # Search for papers
        papers = self.arxiv_client.search_papers(query, max_results=max_papers)
        results = []
        for paper in papers:
            # Check cache first
            cache_key = f"paper:{paper['id']}"
            cached_result = self.cache.get(cache_key)
            if cached_result:
                # Cached entries are stored as JSON (avoids eval on untrusted data)
                results.append(json.loads(cached_result))
                continue
            # Process paper if not in cache
            result = self.process_paper(paper["id"], paper["pdf_url"])
            if result:
                # Cache successful results
                self.cache.setex(
                    cache_key,
                    self.cache_ttl,
                    json.dumps(result)
                )
                results.append(result)
        return results

    def run_quality_checks(self, results: List[Dict]) -> bool:
        """Verify quality of processed results"""
        if not results:
            return False
        for result in results:
            # Check for required fields
            if not all(k in result for k in ["summary", "concepts", "metadata"]):
                return False
            # Validate summary length
            if len(result["summary"]) < 100:  # Arbitrary minimum length
                return False
            # Validate concepts extraction
            if not result["concepts"] or len(result["concepts"]) < 3:
                return False
        return True
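
A minimal end-to-end driver sketch for the controller above; import paths depend on the repository's package layout, which this diff does not show, and the query string is illustrative:

```python
# Illustrative driver; assumes AgentController has been imported from its package
# and that config/settings.yaml and the required environment variables are in place.
import yaml

with open("config/settings.yaml") as f:
    config = yaml.safe_load(f)

controller = AgentController(config)
results = controller.process_query("graph neural networks for molecules", max_papers=3)

if controller.run_quality_checks(results):
    for paper in results:
        print(paper["summary"][:200])
else:
    print("Quality checks failed; see logs/agent.log")
```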

@@ -0,0 +1,83 @@
from typing import Dict, List
from pathlib import Path
import logging

import pypdf
from unstructured.partition.pdf import partition_pdf

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class PDFProcessor:
    def __init__(self, fallback_enabled: bool = True):
        self.fallback_enabled = fallback_enabled

    def extract_text(self, pdf_path: str) -> str:
        """Extract text from PDF with fallback options"""
        try:
            # First attempt: use unstructured
            elements = partition_pdf(pdf_path)
            text = "\n".join([str(element) for element in elements])
            if text.strip():
                return text
            if not self.fallback_enabled:
                raise ValueError("Primary extraction failed and fallback disabled")
            # Fallback: use pypdf
            logger.info("Falling back to pypdf for extraction")
            return self._extract_with_pypdf(pdf_path)
        except Exception as e:
            logger.error(f"PDF extraction failed: {str(e)}")
            if self.fallback_enabled:
                logger.info("Attempting pypdf fallback")
                return self._extract_with_pypdf(pdf_path)
            raise

    def _extract_with_pypdf(self, pdf_path: str) -> str:
        """Fallback extraction using pypdf"""
        text = []
        with open(pdf_path, "rb") as file:
            reader = pypdf.PdfReader(file)
            for page in reader.pages:
                text.append(page.extract_text())
        return "\n".join(text)

    def chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
        """Split text into overlapping chunks for processing"""
        chunks = []
        start = 0
        text_len = len(text)
        while start < text_len:
            end = start + chunk_size
            chunk = text[start:end]
            # Adjust chunk to end at a sentence boundary if possible
            if end < text_len:
                last_period = chunk.rfind(".")
                # Only shrink to the boundary if the chunk stays longer than the
                # overlap window; otherwise the start index would stop advancing
                if last_period > overlap:
                    end = start + last_period + 1
                    chunk = text[start:end]
            chunks.append(chunk)
            start = end - overlap
        return chunks

    def extract_metadata(self, pdf_path: str) -> Dict:
        """Extract PDF metadata"""
        with open(pdf_path, "rb") as file:
            reader = pypdf.PdfReader(file)
            info = reader.metadata
            if info:
                return {
                    "title": info.get("/Title", ""),
                    "author": info.get("/Author", ""),
                    "subject": info.get("/Subject", ""),
                    "keywords": info.get("/Keywords", ""),
                    "creator": info.get("/Creator", ""),
                    "producer": info.get("/Producer", ""),
                    "page_count": len(reader.pages)
                }
            return {"page_count": len(reader.pages)}