spignotti · spignotti · May 10, 2026 · May 10, 2026 · May 10, 2026
diff --git a/litresearch.toml.example b/litresearch.toml.example
@@ -95,6 +95,19 @@ abstract_fallback = true
 # PDFs should be named: {paper_id}.pdf or {doi}.pdf (slashes replaced with underscores)
 # inject_pdf_dir = "/path/to/pdfs"
 
+# ============================================================================
+# Query Expansion (Optional)
+# ============================================================================
+
+# Enable iterative query expansion after initial enrichment (default: true)
+enable_query_expansion = true
+
+# Maximum number of expansion queries to generate
+max_expansion_queries = 2
+
+# Number of top candidates, ranked by citation count, to sample for expansion analysis (default: 30)
+expansion_candidate_sample = 30
+
 # ============================================================================
 # Citation Expansion (Optional)
 # ============================================================================
@@ -105,6 +118,12 @@ expand_citations = false
 # Minimum number of cross-references required to include a paper
 min_cross_refs = 3
 
+# Enable detection of foundational papers, i.e. papers cited by many candidates (default: true)
+enable_foundational_detection = true
+
+# Number of foundational papers to identify
+foundational_papers_count = 5
+
 # ============================================================================
 # Zotero Export (Optional)
 # ============================================================================

diff --git a/src/litresearch/cli.py b/src/litresearch/cli.py
@@ -20,7 +20,6 @@ def _build_settings(
     top_n: int | None = None,
     output_dir: str | None = None,
     threshold: int | None = None,
-    inject_pdf_dir: str | None = None,
 ) -> Settings:
     """Load settings and apply CLI overrides."""
     overrides = {
@@ -30,7 +29,6 @@ def _build_settings(
             "top_n": top_n,
             "output_dir": output_dir,
             "screening_threshold": threshold,
-            "inject_pdf_dir": inject_pdf_dir,
         }.items()
         if value is not None
     }
@@ -51,9 +49,6 @@ def config() -> None:
     console.print(f"screening_threshold={settings.screening_threshold}")
     console.print(f"top_n={settings.top_n}")
     console.print(f"max_results_per_query={settings.max_results_per_query}")
-    console.print(f"pdf_first_pages={settings.pdf_first_pages}")
-    console.print(f"pdf_last_pages={settings.pdf_last_pages}")
-    console.print(f"inject_pdf_dir={settings.inject_pdf_dir}")
     console.print(f"output_dir={settings.output_dir}")
     console.print(f"s2_api_key_configured={bool(settings.s2_api_key)}")
     console.print(f"llm_api_key_configured={settings.has_llm_api_key}")
@@ -73,35 +68,19 @@ def run(
         bool,
         typer.Option("--overwrite", help="Overwrite existing output directory."),
     ] = False,
-    inject_pdfs: Annotated[
-        Path | None,
-        typer.Option(
-            "--inject-pdfs", help="Directory containing PDFs to inject by paper_id or DOI"
-        ),
-    ] = None,
-    stop_after_screening: Annotated[
-        bool,
-        typer.Option(
-            "--stop-after-screening",
-            help="Stop after screening to review papers needing PDFs before analysis",
-        ),
-    ] = False,
 ) -> None:
     """Run the literature research pipeline."""
     settings = _build_settings(
         model=model,
         top_n=top_n,
         output_dir=output_dir,
         threshold=threshold,
-        inject_pdf_dir=str(inject_pdfs) if inject_pdfs is not None else None,
     )
 
     state = run_pipeline(
         questions,
         settings,
         overwrite=overwrite,
-        inject_pdfs_dir=inject_pdfs,
-        stop_after_screening=stop_after_screening,
     )
     if state.screened_papers_completed and not state.analyses:
         console.print(
@@ -121,23 +100,16 @@ def resume(
         int | None,
         typer.Option("--threshold", help="Override the screening threshold."),
     ] = None,
-    inject_pdfs: Annotated[
-        Path | None,
-        typer.Option(
-            "--inject-pdfs", help="Directory containing PDFs to inject by paper_id or DOI"
-        ),
-    ] = None,
 ) -> None:
     """Resume the literature research pipeline from saved state."""
     settings = _build_settings(
         model=model,
         top_n=top_n,
         output_dir=output_dir,
         threshold=threshold,
-        inject_pdf_dir=str(inject_pdfs) if inject_pdfs is not None else None,
     )
 
-    state = run_pipeline([], settings, resume_path=Path(state_file), inject_pdfs_dir=inject_pdfs)
+    state = run_pipeline([], settings, resume_path=Path(state_file))
     console.print(f"[green]Resume complete.[/green] Output: {state.output_dir}")
 
 

diff --git a/src/litresearch/config.py b/src/litresearch/config.py
@@ -47,7 +47,7 @@ def settings_customise_sources(
     max_retries: int = 3
     retry_base_delay: float = 1.0
     llm_timeout: int = 120
-    default_model: str = "openai/gpt-4o-mini"
+    default_model: str = "openai/gpt-5.4-mini"
     screening_selection_mode: Literal["top_percent", "threshold", "top_k"] = "top_percent"
     screening_top_percent: float = 0.3  # 0-1; used when screening_selection_mode=top_percent
     screening_top_k: int | None = None  # used when screening_selection_mode=top_k
@@ -59,9 +59,16 @@ def settings_customise_sources(
     discovery_sources: list[str] = ["s2"]
     openalex_email: str | None = None
 
+    # Query expansion
+    enable_query_expansion: bool = True
+    max_expansion_queries: int = 2
+    expansion_candidate_sample: int = 30
+
     # Citation expansion
     expand_citations: bool = False
     min_cross_refs: int = 3
+    enable_foundational_detection: bool = True
+    foundational_papers_count: int = 5
 
     # Zotero export
     zotero_library_id: str | None = None
@@ -71,12 +78,6 @@ def settings_customise_sources(
     zotero_tag: str | None = None
     zotero_export: bool = False
 
-    pdf_first_pages: int = 4
-    pdf_last_pages: int = 2
-    pdf_extraction_mode: Literal["budget", "pages"] = "budget"
-    pdf_token_budget: int = 4000
-    abstract_fallback: bool = True
-    inject_pdf_dir: str | None = None
     output_dir: str = "output"
 
     @computed_field

diff --git a/src/litresearch/exporters/zotero.py b/src/litresearch/exporters/zotero.py
@@ -1,6 +1,5 @@
 """Zotero export integration."""
 
-from pathlib import Path
 from typing import Any
 
 from rich.console import Console
@@ -73,9 +72,6 @@ def export_to_zotero(
             if paper.doi:
                 item["DOI"] = paper.doi
 
-            if paper.open_access_pdf_url:
-                item["url"] = paper.open_access_pdf_url
-
             if collection_key:
                 item["collections"] = [collection_key]
 
@@ -90,17 +86,6 @@ def create_item(payload: dict[str, Any] = item) -> dict[str, Any]:
 
             if result.get("successful"):
                 successful += 1
-
-                if paper.pdf_path:
-                    try:
-                        pdf_full_path = Path(paper.pdf_path)
-                        if pdf_full_path.exists():
-                            item_key = list(result["successful"].values())[0]["key"]
-                            zot.attachment_simple([str(pdf_full_path)], item_key)
-                    except Exception as exc:  # noqa: BLE001
-                        console.print(
-                            f"[yellow]Failed to attach PDF for {paper.title}:[/yellow] {exc}"
-                        )
             else:
                 failed.append(f"{paper.title}: {result.get('failed', 'Unknown error')}")
 

diff --git a/src/litresearch/models.py b/src/litresearch/models.py
@@ -27,7 +27,6 @@ class S2PaperLike(Protocol):
     citationCount: int | None
     venue: str | None
     externalIds: dict[str, str] | None
-    openAccessPdf: dict[str, str] | None
     citationStyles: dict[str, str] | None
 
 
@@ -57,26 +56,14 @@ class Paper(BaseModel):
     citation_count: int = 0
     venue: str | None = None
     doi: str | None = None
-    open_access_pdf_url: str | None = None
     bibtex: str | None = None
     source: Literal["s2", "openalex", "both", "citation_expansion"] = "s2"
-    pdf_path: str | None = None
-    pdf_status: Literal["not_attempted", "downloaded", "unavailable", "user_provided"] = (
-        "not_attempted"
-    )
-    data_completeness: Literal["full", "abstract_only", "metadata_only"] = "full"
-
-    @property
-    def pdf_downloaded(self) -> bool:
-        """Backwards-compatible indicator for downloaded or provided PDFs."""
-        return self.pdf_status in {"downloaded", "user_provided"} or self.pdf_path is not None
 
     @classmethod
     def from_s2(cls, s2_paper: S2PaperLike) -> "Paper":
         """Create a normalized paper model from a Semantic Scholar paper object."""
 
         external_ids = s2_paper.externalIds or {}
-        open_access_pdf = s2_paper.openAccessPdf or {}
         citation_styles = s2_paper.citationStyles or {}
         authors = s2_paper.authors or []
 
@@ -90,7 +77,6 @@ def from_s2(cls, s2_paper: S2PaperLike) -> "Paper":
             citation_count=s2_paper.citationCount or 0,
             venue=html.unescape(s2_paper.venue) if s2_paper.venue else None,
             doi=external_ids.get("DOI"),
-            open_access_pdf_url=open_access_pdf.get("url"),
             bibtex=citation_styles.get("bibtex"),
             source="s2",
         )
@@ -141,13 +127,11 @@ class RunMetrics(BaseModel):
     total_analyzed: int = 0
     total_exported: int = 0
     citation_expanded: int = 0
+    expansion_queries_generated: int = 0
+    foundational_papers: int = 0
 
     sources: dict[str, int] = Field(default_factory=dict)
 
-    pdfs_downloaded: int = 0
-    pdfs_user_provided: int = 0
-    pdfs_unavailable: int = 0
-
 
 class PipelineState(BaseModel):
     """Serializable pipeline state for fresh runs and resume."""
@@ -159,7 +143,9 @@ class PipelineState(BaseModel):
     screening_results: list[ScreeningResult] = Field(default_factory=list)
     analyses: list[AnalysisResult] = Field(default_factory=list)
     ranked_paper_ids: list[str] = Field(default_factory=list)
+    foundational_paper_ids: list[str] = Field(default_factory=list)
     screened_papers_completed: bool = False
+    query_expansion_run: bool = False
     current_stage: str
     output_dir: str
     created_at: str

diff --git a/src/litresearch/pdf.py b/src/litresearch/pdf.py