diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 278e954..dee8694 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,7 +23,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e65701..8a8237a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,26 @@ All notable changes to **LongParser** are documented here. This project follows [Semantic Versioning](https://semver.org/) and [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +## [0.1.3] — 2026-04-13 + +### Fixed + +- **Source code**: Added `DocumentPipeline` as a public alias for `PipelineOrchestrator` — + docs, quickstart, and all examples now use this name consistently +- **Documentation**: Fixed wrong coverage path `long_parser` → `longparser` in `CONTRIBUTING.md` +- **Documentation**: Replaced stale `cleanrag-api` reference in Docker deployment docs +- **Documentation**: Standardized Gemini API key env var to `GOOGLE_API_KEY` across all docs +- **Source code**: Updated default LLM model fallback from `gpt-4o` to `gpt-5.3` in + `schemas.py`, `llm_chain.py`, and `engine.py` +- **Source code**: Renamed stale `cleanrag:` Redis key prefix to `longparser:` in embeddings + +### Changed + +- Python 3.13 added to CI matrix, badges, and installation docs +- `SECURITY.md` updated with Redis rate-limiting and CORS threat mitigations + +--- + ## [0.1.2] — 2026-04-05 ### Changed diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f44546e..06acdab 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -84,7 +84,7 @@ Use Python 3.10+ type hints. All public API must be fully annotated. 
uv run pytest tests/unit/ -v # With coverage: -uv run pytest tests/unit/ --cov=src/long_parser --cov-report=term-missing +uv run pytest tests/unit/ --cov=src/longparser --cov-report=term-missing # Full test suite (requires MongoDB + Redis): uv run pytest tests/ -v diff --git a/README.md b/README.md index 3b4f72a..dce377d 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Monthly Downloads - Python + Python MIT License @@ -105,9 +105,9 @@ pip install "longparser[cpu]" ### Python SDK ```python -from longparser import PipelineOrchestrator, ProcessingConfig +from longparser import DocumentPipeline, ProcessingConfig -pipeline = PipelineOrchestrator() +pipeline = DocumentPipeline(ProcessingConfig()) result = pipeline.process_file("document.pdf") print(f"Pages: {result.document.metadata.total_pages}") @@ -186,7 +186,7 @@ src/longparser/ ├── schemas.py ← core Pydantic models (Document, Block, Chunk, …) ├── extractors/ ← Docling, LaTeX OCR backends ├── chunkers/ ← HybridChunker -├── pipeline/ ← PipelineOrchestrator +├── pipeline/ ← DocumentPipeline ├── integrations/ ← LangChain loader & LlamaIndex reader ├── utils/ ← shared helpers (RTL detection, …) └── server/ ← REST API layer diff --git a/SECURITY.md b/SECURITY.md index ba315cf..9932f71 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -35,6 +35,8 @@ Key risks: | **MongoDB injection** | Motor driver + typed Pydantic inputs prevent injection | | **SSRF via webhook** | No outbound HTTP made based on user input | | **Hallucinated citations** | Citation IDs validated against retrieved set before returning to client | +| **DDoS / Spam via API** | Route-level Rate Limiting strictly isolated per tenant via Redis | +| **Cross-Origin attacks** | Configurable CORS restrictions and strict Tenant Isolation | ## Dependency Security diff --git a/docs/changelog.md b/docs/changelog.md index 9523c0c..2fa3957 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -5,6 +5,26 @@ All notable changes to **LongParser** are documented 
here. This project follows [Semantic Versioning](https://semver.org/) and [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +## [0.1.3] — 2026-04-13 + +### Fixed + +- **Source code**: Added `DocumentPipeline` as a public alias for `PipelineOrchestrator` — + docs, quickstart, and all examples now use this name consistently +- **Documentation**: Fixed wrong coverage path `long_parser` → `longparser` in `CONTRIBUTING.md` +- **Documentation**: Replaced stale `cleanrag-api` reference in Docker deployment docs +- **Documentation**: Standardized Gemini API key env var to `GOOGLE_API_KEY` across all docs +- **Source code**: Updated default LLM model fallback from `gpt-4o` to `gpt-5.3` in + `schemas.py`, `llm_chain.py`, and `engine.py` +- **Source code**: Renamed stale `cleanrag:` Redis key prefix to `longparser:` in embeddings + +### Changed + +- Python 3.13 added to CI matrix, badges, and installation docs +- `SECURITY.md` updated with Redis rate-limiting and CORS threat mitigations + +--- + ## [0.1.2] — 2026-04-05 ### Changed diff --git a/docs/contributing.md b/docs/contributing.md index e8b7196..72727c9 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -84,7 +84,7 @@ Use Python 3.10+ type hints. All public API must be fully annotated. 
uv run pytest tests/unit/ -v # With coverage: -uv run pytest tests/unit/ --cov=src/long_parser --cov-report=term-missing +uv run pytest tests/unit/ --cov=src/longparser --cov-report=term-missing # Full test suite (requires MongoDB + Redis): uv run pytest tests/ -v diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index e462ce5..8ffeac7 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -49,5 +49,5 @@ docker compose up --scale longparser=3 ```bash curl http://localhost:8000/health -# {"status": "ok", "service": "cleanrag-api"} +# {"status": "ok", "service": "longparser-api"} ``` diff --git a/docs/deployment/environment.md b/docs/deployment/environment.md index 3245c88..0d8d28c 100644 --- a/docs/deployment/environment.md +++ b/docs/deployment/environment.md @@ -16,7 +16,7 @@ Copy `.env.example` to `.env` and configure for your deployment. | `LONGPARSER_LLM_PROVIDER` | `openai` | LLM provider | | `LONGPARSER_LLM_MODEL` | _(provider default)_ | Model name | | `OPENAI_API_KEY` | — | OpenAI API key | -| `GEMINI_API_KEY` | — | Google Gemini API key | +| `GOOGLE_API_KEY` | — | Google Gemini API key | | `GROQ_API_KEY` | — | Groq API key | | `OPENROUTER_API_KEY` | — | OpenRouter API key | diff --git a/docs/getting-started/configuration.md b/docs/getting-started/configuration.md index efd370f..859c2c1 100644 --- a/docs/getting-started/configuration.md +++ b/docs/getting-started/configuration.md @@ -33,7 +33,7 @@ cp .env.example .env |---|---| | `LONGPARSER_LLM_PROVIDER` | `openai` / `gemini` / `groq` / `openrouter` | | `LONGPARSER_LLM_MODEL` | Model name (uses provider default if unset) | -| `GEMINI_API_KEY` | For Google Gemini | +| `GOOGLE_API_KEY` | For Google Gemini | | `GROQ_API_KEY` | For Groq | ## Vector Store diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index 908f659..5356c04 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -2,7 +2,7 @@ ## 
Requirements -- Python 3.10, 3.11, or 3.12 +- Python 3.10, 3.11, 3.12, or 3.13 - Tesseract OCR (`brew install tesseract` / `apt install tesseract-ocr`) --- @@ -104,5 +104,5 @@ The server starts on `http://localhost:8000`. ```python import longparser -print(longparser.__version__) # 0.1.2 +print(longparser.__version__) # 0.1.3 ``` diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md index b779f4b..e501288 100644 --- a/docs/getting-started/quickstart.md +++ b/docs/getting-started/quickstart.md @@ -17,11 +17,11 @@ from longparser import DocumentPipeline, ProcessingConfig pipeline = DocumentPipeline(ProcessingConfig()) # Parse a PDF -doc = pipeline.process("research_paper.pdf") +result = pipeline.process_file("research_paper.pdf") -print(f"Pages: {len(doc.pages)}") -print(f"Blocks: {len(doc.blocks)}") -print(f"Chunks: {len(doc.chunks)}") +print(f"Pages: {result.document.metadata.total_pages}") +print(f"Chunks: {len(result.chunks)}") +print(result.chunks[0].text) ``` ## 3. Inspect Chunks diff --git a/docs/guide/chat.md b/docs/guide/chat.md index a3fb8e6..7ddc175 100644 --- a/docs/guide/chat.md +++ b/docs/guide/chat.md @@ -70,6 +70,6 @@ Every answer's `cited_chunk_ids` are validated against the retrieved set. 
IDs no | Provider | Key | |---|---| | OpenAI | `OPENAI_API_KEY` | -| Google Gemini | `GEMINI_API_KEY` | +| Google Gemini | `GOOGLE_API_KEY` | | Groq | `GROQ_API_KEY` | | OpenRouter | `OPENROUTER_API_KEY` | diff --git a/docs/guide/parsing.md b/docs/guide/parsing.md index 171c5b9..93c6386 100644 --- a/docs/guide/parsing.md +++ b/docs/guide/parsing.md @@ -18,7 +18,7 @@ LongParser uses **Docling** with Tesseract CLI OCR as its extraction engine — from longparser import DocumentPipeline, ProcessingConfig pipeline = DocumentPipeline(ProcessingConfig()) -doc = pipeline.process("paper.pdf") +result = pipeline.process_file("paper.pdf") ``` ## Formula Modes @@ -36,15 +36,15 @@ config = ProcessingConfig(formula_mode="smart") ```python # Pages -for page in doc.pages: +for page in result.document.pages: print(f"Page {page.page_number}: {page.width}x{page.height}") # Blocks (semantic units) -for block in doc.blocks: +for block in result.document.blocks: print(f"[{block.type}] p={block.provenance.page_number}: {block.text[:80]}") # Chunks (RAG-ready) -for chunk in doc.chunks: +for chunk in result.chunks: print(f"{chunk.chunk_type} | {chunk.token_count} tokens | pages={chunk.page_numbers}") ``` diff --git a/docs/index.md b/docs/index.md index 650ed63..4e7ff6e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -16,7 +16,7 @@ Monthly Downloads   - Python + Python   MIT License @@ -57,9 +57,10 @@ pip install longparser from longparser import DocumentPipeline, ProcessingConfig pipeline = DocumentPipeline(ProcessingConfig()) -doc = pipeline.process("report.pdf") +result = pipeline.process_file("report.pdf") -print(f"Extracted {len(doc.blocks)} blocks, {len(doc.chunks)} chunks") +print(f"Chunks: {len(result.chunks)}") +print(result.chunks[0].text) ``` --- diff --git a/docs/reference/pipeline.md b/docs/reference/pipeline.md index 8f3e5a4..7cdfbf9 100644 --- a/docs/reference/pipeline.md +++ b/docs/reference/pipeline.md @@ -7,39 +7,49 @@ The `DocumentPipeline` is the main entry point for 
LongParser's extraction pipel ```python from longparser import DocumentPipeline, ProcessingConfig -pipeline = DocumentPipeline(config=ProcessingConfig()) -doc = pipeline.process("document.pdf") +pipeline = DocumentPipeline(ProcessingConfig()) +result = pipeline.process_file("document.pdf") ``` ### Constructor ```python -DocumentPipeline(config: ProcessingConfig) +DocumentPipeline(config: ProcessingConfig | None = None) ``` | Parameter | Type | Description | |---|---|---| -| `config` | `ProcessingConfig` | Extraction and chunking configuration | +| `config` | `ProcessingConfig \| None` | Extraction and chunking configuration (uses defaults if `None`) | ### Methods -#### `process(file_path)` +#### `process_file(file_path)` Process a document end-to-end through Extract → Validate → Chunk. ```python -result = pipeline.process_file("report.pdf") -# Returns: longparser.schemas.Document +result = pipeline.process_file("report.pdf") +# Returns: longparser.pipeline.PipelineResult ``` -**Returns:** `Document` with `.pages`, `.blocks`, `.chunks` populated. +**Returns:** `PipelineResult` with `.document` and `.chunks` populated. + +#### `process(request)` + +Process a document from a `JobRequest` object. + +```python +from longparser import JobRequest +request = JobRequest(file_path="report.pdf") +result = pipeline.process(request) +``` #### `process_batch(file_paths)` Process multiple documents sequentially. ```python -docs = pipeline.process_batch(["a.pdf", "b.docx", "c.pptx"]) +results = pipeline.process_batch(["a.pdf", "b.docx", "c.pptx"]) ``` ## ProcessingConfig diff --git a/docs/reference/schemas.md b/docs/reference/schemas.md index 7e33ac6..e4dda21 100644 --- a/docs/reference/schemas.md +++ b/docs/reference/schemas.md @@ -4,7 +4,7 @@ Core data models used throughout LongParser. ## Document -Top-level container returned by `DocumentPipeline.process()`. +Top-level container, available as the `.document` attribute of the `PipelineResult` returned by `DocumentPipeline.process_file()`.
```python class Document: diff --git a/pyproject.toml b/pyproject.toml index 38330da..afea16d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "longparser" -version = "0.1.2" +version = "0.1.3" description = "Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines." readme = {file = "README.md", content-type = "text/markdown"} requires-python = ">=3.10" diff --git a/src/longparser/__init__.py b/src/longparser/__init__.py index 5de272e..7d00c7e 100755 --- a/src/longparser/__init__.py +++ b/src/longparser/__init__.py @@ -9,9 +9,9 @@ Quick start:: - from longparser import PipelineOrchestrator, ProcessingConfig + from longparser import DocumentPipeline, ProcessingConfig - pipeline = PipelineOrchestrator() + pipeline = DocumentPipeline(ProcessingConfig()) result = pipeline.process_file("document.pdf") print(result.chunks[0].text) @@ -19,13 +19,13 @@ uv run uvicorn longparser.server.app:app --reload --port 8000 -See :class:`~longparser.pipeline.PipelineOrchestrator` for the main SDK entry +See :class:`~longparser.pipeline.DocumentPipeline` for the main SDK entry point and :mod:`longparser.server` for the REST API layer. 
""" from __future__ import annotations -__version__ = "0.1.2" +__version__ = "0.1.3" __author__ = "ENDEVSOLS Team" __license__ = "MIT" @@ -62,6 +62,9 @@ def __getattr__(name: str): if name == "PipelineOrchestrator": from .pipeline import PipelineOrchestrator return PipelineOrchestrator + if name == "DocumentPipeline": + from .pipeline import DocumentPipeline + return DocumentPipeline if name == "PipelineResult": from .pipeline import PipelineResult return PipelineResult @@ -99,6 +102,7 @@ def __getattr__(name: str): # Lazily imported (require extras) "DoclingExtractor", "PipelineOrchestrator", + "DocumentPipeline", "PipelineResult", "HybridChunker", ] diff --git a/src/longparser/pipeline/__init__.py b/src/longparser/pipeline/__init__.py index 6b775d9..710800e 100755 --- a/src/longparser/pipeline/__init__.py +++ b/src/longparser/pipeline/__init__.py @@ -2,7 +2,11 @@ from .orchestrator import PipelineOrchestrator, PipelineResult +# Public alias — docs and quickstart use this name +DocumentPipeline = PipelineOrchestrator + __all__ = [ "PipelineOrchestrator", + "DocumentPipeline", "PipelineResult", ] diff --git a/src/longparser/server/chat/engine.py b/src/longparser/server/chat/engine.py index b55b7cf..d50a7af 100755 --- a/src/longparser/server/chat/engine.py +++ b/src/longparser/server/chat/engine.py @@ -76,7 +76,7 @@ # Token Counting (model-aware) — kept as custom logic # --------------------------------------------------------------------------- -def count_tokens(text: str, model: str = "gpt-4o") -> int: +def count_tokens(text: str, model: str = "gpt-5.3") -> int: """Count tokens — exact for OpenAI models, conservative approx for others.""" try: import tiktoken @@ -96,7 +96,7 @@ def budget_trim( recent_turns: list[dict], rolling_summary: str, long_term_facts: list[dict], - model: str = "gpt-4o", + model: str = "gpt-5.3", max_prompt_tokens: int = 6000, ) -> dict: """Priority-ordered truncation of prompt variables to fit token budget. 
diff --git a/src/longparser/server/chat/llm_chain.py b/src/longparser/server/chat/llm_chain.py index f2cb8e7..b32bb2f 100755 --- a/src/longparser/server/chat/llm_chain.py +++ b/src/longparser/server/chat/llm_chain.py @@ -115,7 +115,7 @@ def get_chat_model( """ config = config or ChatConfig() provider = provider or config.llm_provider - model = model or config.llm_model or DEFAULT_MODELS.get(provider, "gpt-4o") + model = model or config.llm_model or DEFAULT_MODELS.get(provider, "gpt-5.3") max_tokens = max_tokens or config.max_output_tokens creator = _CREATORS.get(provider) diff --git a/src/longparser/server/chat/schemas.py b/src/longparser/server/chat/schemas.py index 0405a84..0479cf7 100755 --- a/src/longparser/server/chat/schemas.py +++ b/src/longparser/server/chat/schemas.py @@ -33,7 +33,7 @@ class ChatConfig(BaseModel): default_factory=lambda: os.getenv("LONGPARSER_LLM_PROVIDER", "openai") ) llm_model: str = Field( - default_factory=lambda: os.getenv("LONGPARSER_LLM_MODEL", "gpt-4o") + default_factory=lambda: os.getenv("LONGPARSER_LLM_MODEL", "gpt-5.3") ) max_input_tokens: int = Field( default_factory=lambda: int(os.getenv("LONGPARSER_CHAT_MAX_INPUT_TOKENS", "1000")) diff --git a/src/longparser/server/embeddings.py b/src/longparser/server/embeddings.py index e59f513..e0b2bbc 100755 --- a/src/longparser/server/embeddings.py +++ b/src/longparser/server/embeddings.py @@ -108,7 +108,7 @@ def dim(self) -> int: return self._dim fp = self.get_fingerprint() - cache_key = f"cleanrag:embed_dim:{fp}" + cache_key = f"longparser:embed_dim:{fp}" # 1) Try Redis cross-process cache if available try: