Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions litresearch.toml.example
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,19 @@ abstract_fallback = true
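# PDFs should be named {paper_id}.pdf or {doi}.pdf, with slashes replaced by underscores
# NOTE(review): confirm `inject_pdf_dir` is still a recognised setting — the Settings
# fields and CLI options for PDF injection appear to be removed in this change.
# inject_pdf_dir = "/path/to/pdfs"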

# ============================================================================
# Query Expansion (Optional)
# ============================================================================

# Enable iterative query expansion after initial enrichment (on by default; set to false to disable)
enable_query_expansion = true

# Maximum number of expansion queries to generate
max_expansion_queries = 2

# Number of top candidates (by citation count) to sample for expansion analysis
expansion_candidate_sample = 30

# ============================================================================
# Citation Expansion (Optional)
# ============================================================================
Expand All @@ -105,6 +118,12 @@ expand_citations = false
# Minimum number of cross-references required to include a paper
min_cross_refs = 3

# Enable foundational paper detection, i.e. identifying papers cited by many candidates (on by default)
enable_foundational_detection = true

# Number of foundational papers to identify
foundational_papers_count = 5

# ============================================================================
# Zotero Export (Optional)
# ============================================================================
Expand Down
30 changes: 1 addition & 29 deletions src/litresearch/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ def _build_settings(
top_n: int | None = None,
output_dir: str | None = None,
threshold: int | None = None,
inject_pdf_dir: str | None = None,
) -> Settings:
"""Load settings and apply CLI overrides."""
overrides = {
Expand All @@ -30,7 +29,6 @@ def _build_settings(
"top_n": top_n,
"output_dir": output_dir,
"screening_threshold": threshold,
"inject_pdf_dir": inject_pdf_dir,
}.items()
if value is not None
}
Expand All @@ -51,9 +49,6 @@ def config() -> None:
console.print(f"screening_threshold={settings.screening_threshold}")
console.print(f"top_n={settings.top_n}")
console.print(f"max_results_per_query={settings.max_results_per_query}")
console.print(f"pdf_first_pages={settings.pdf_first_pages}")
console.print(f"pdf_last_pages={settings.pdf_last_pages}")
console.print(f"inject_pdf_dir={settings.inject_pdf_dir}")
console.print(f"output_dir={settings.output_dir}")
console.print(f"s2_api_key_configured={bool(settings.s2_api_key)}")
console.print(f"llm_api_key_configured={settings.has_llm_api_key}")
Expand All @@ -73,35 +68,19 @@ def run(
bool,
typer.Option("--overwrite", help="Overwrite existing output directory."),
] = False,
inject_pdfs: Annotated[
Path | None,
typer.Option(
"--inject-pdfs", help="Directory containing PDFs to inject by paper_id or DOI"
),
] = None,
stop_after_screening: Annotated[
bool,
typer.Option(
"--stop-after-screening",
help="Stop after screening to review papers needing PDFs before analysis",
),
] = False,
) -> None:
"""Run the literature research pipeline."""
settings = _build_settings(
model=model,
top_n=top_n,
output_dir=output_dir,
threshold=threshold,
inject_pdf_dir=str(inject_pdfs) if inject_pdfs is not None else None,
)

state = run_pipeline(
questions,
settings,
overwrite=overwrite,
inject_pdfs_dir=inject_pdfs,
stop_after_screening=stop_after_screening,
)
if state.screened_papers_completed and not state.analyses:
console.print(
Expand All @@ -121,23 +100,16 @@ def resume(
int | None,
typer.Option("--threshold", help="Override the screening threshold."),
] = None,
inject_pdfs: Annotated[
Path | None,
typer.Option(
"--inject-pdfs", help="Directory containing PDFs to inject by paper_id or DOI"
),
] = None,
) -> None:
"""Resume the literature research pipeline from saved state."""
settings = _build_settings(
model=model,
top_n=top_n,
output_dir=output_dir,
threshold=threshold,
inject_pdf_dir=str(inject_pdfs) if inject_pdfs is not None else None,
)

state = run_pipeline([], settings, resume_path=Path(state_file), inject_pdfs_dir=inject_pdfs)
state = run_pipeline([], settings, resume_path=Path(state_file))
console.print(f"[green]Resume complete.[/green] Output: {state.output_dir}")


Expand Down
15 changes: 8 additions & 7 deletions src/litresearch/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def settings_customise_sources(
max_retries: int = 3
retry_base_delay: float = 1.0
llm_timeout: int = 120
default_model: str = "openai/gpt-4o-mini"
default_model: str = "openai/gpt-5.4-mini"
screening_selection_mode: Literal["top_percent", "threshold", "top_k"] = "top_percent"
screening_top_percent: float = 0.3 # 0-1; used when screening_selection_mode=top_percent
screening_top_k: int | None = None # used when screening_selection_mode=top_k
Expand All @@ -59,9 +59,16 @@ def settings_customise_sources(
discovery_sources: list[str] = ["s2"]
openalex_email: str | None = None

# Query expansion
enable_query_expansion: bool = True
max_expansion_queries: int = 2
expansion_candidate_sample: int = 30

# Citation expansion
expand_citations: bool = False
min_cross_refs: int = 3
enable_foundational_detection: bool = True
foundational_papers_count: int = 5

# Zotero export
zotero_library_id: str | None = None
Expand All @@ -71,12 +78,6 @@ def settings_customise_sources(
zotero_tag: str | None = None
zotero_export: bool = False

pdf_first_pages: int = 4
pdf_last_pages: int = 2
pdf_extraction_mode: Literal["budget", "pages"] = "budget"
pdf_token_budget: int = 4000
abstract_fallback: bool = True
inject_pdf_dir: str | None = None
output_dir: str = "output"

@computed_field
Expand Down
15 changes: 0 additions & 15 deletions src/litresearch/exporters/zotero.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Zotero export integration."""

from pathlib import Path
from typing import Any

from rich.console import Console
Expand Down Expand Up @@ -73,9 +72,6 @@ def export_to_zotero(
if paper.doi:
item["DOI"] = paper.doi

if paper.open_access_pdf_url:
item["url"] = paper.open_access_pdf_url

if collection_key:
item["collections"] = [collection_key]

Expand All @@ -90,17 +86,6 @@ def create_item(payload: dict[str, Any] = item) -> dict[str, Any]:

if result.get("successful"):
successful += 1

if paper.pdf_path:
try:
pdf_full_path = Path(paper.pdf_path)
if pdf_full_path.exists():
item_key = list(result["successful"].values())[0]["key"]
zot.attachment_simple([str(pdf_full_path)], item_key)
except Exception as exc: # noqa: BLE001
console.print(
f"[yellow]Failed to attach PDF for {paper.title}:[/yellow] {exc}"
)
else:
failed.append(f"{paper.title}: {result.get('failed', 'Unknown error')}")

Expand Down
22 changes: 4 additions & 18 deletions src/litresearch/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ class S2PaperLike(Protocol):
citationCount: int | None
venue: str | None
externalIds: dict[str, str] | None
openAccessPdf: dict[str, str] | None
citationStyles: dict[str, str] | None


Expand Down Expand Up @@ -57,26 +56,14 @@ class Paper(BaseModel):
citation_count: int = 0
venue: str | None = None
doi: str | None = None
open_access_pdf_url: str | None = None
bibtex: str | None = None
source: Literal["s2", "openalex", "both", "citation_expansion"] = "s2"
pdf_path: str | None = None
pdf_status: Literal["not_attempted", "downloaded", "unavailable", "user_provided"] = (
"not_attempted"
)
data_completeness: Literal["full", "abstract_only", "metadata_only"] = "full"

@property
def pdf_downloaded(self) -> bool:
"""Backwards-compatible indicator for downloaded or provided PDFs."""
return self.pdf_status in {"downloaded", "user_provided"} or self.pdf_path is not None

@classmethod
def from_s2(cls, s2_paper: S2PaperLike) -> "Paper":
"""Create a normalized paper model from a Semantic Scholar paper object."""

external_ids = s2_paper.externalIds or {}
open_access_pdf = s2_paper.openAccessPdf or {}
citation_styles = s2_paper.citationStyles or {}
authors = s2_paper.authors or []

Expand All @@ -90,7 +77,6 @@ def from_s2(cls, s2_paper: S2PaperLike) -> "Paper":
citation_count=s2_paper.citationCount or 0,
venue=html.unescape(s2_paper.venue) if s2_paper.venue else None,
doi=external_ids.get("DOI"),
open_access_pdf_url=open_access_pdf.get("url"),
bibtex=citation_styles.get("bibtex"),
source="s2",
)
Expand Down Expand Up @@ -141,13 +127,11 @@ class RunMetrics(BaseModel):
total_analyzed: int = 0
total_exported: int = 0
citation_expanded: int = 0
expansion_queries_generated: int = 0
foundational_papers: int = 0

sources: dict[str, int] = Field(default_factory=dict)

pdfs_downloaded: int = 0
pdfs_user_provided: int = 0
pdfs_unavailable: int = 0


class PipelineState(BaseModel):
"""Serializable pipeline state for fresh runs and resume."""
Expand All @@ -159,7 +143,9 @@ class PipelineState(BaseModel):
screening_results: list[ScreeningResult] = Field(default_factory=list)
analyses: list[AnalysisResult] = Field(default_factory=list)
ranked_paper_ids: list[str] = Field(default_factory=list)
foundational_paper_ids: list[str] = Field(default_factory=list)
screened_papers_completed: bool = False
query_expansion_run: bool = False
current_stage: str
output_dir: str
created_at: str
Expand Down
90 changes: 0 additions & 90 deletions src/litresearch/pdf.py

This file was deleted.

Loading