From 90a83c246a8ad008a7b46dd30e8aedfa04eb0cce Mon Sep 17 00:00:00 2001 From: ionfwsrijan Date: Sat, 13 Jun 2026 15:57:01 +0530 Subject: [PATCH 01/10] fix: re-raise exception in ingest_document so Celery retry mechanism fires (#561) --- backend/app/services/document_ingestion.py | 1 + backend/app/tasks.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/app/services/document_ingestion.py b/backend/app/services/document_ingestion.py index 6e76d79d..f4027328 100644 --- a/backend/app/services/document_ingestion.py +++ b/backend/app/services/document_ingestion.py @@ -206,5 +206,6 @@ def ingest_document(document_id: str, filepath: str, original_name: str, user_id db.commit() except Exception: logger.exception("Failed to mark document %s as failed", document_id) + raise finally: db.close() diff --git a/backend/app/tasks.py b/backend/app/tasks.py index 811cad06..e02d407a 100644 --- a/backend/app/tasks.py +++ b/backend/app/tasks.py @@ -41,6 +41,7 @@ def process_document( doc.status = "processing" # Set explicitly to show UI activity db.commit() +<<<<<<< HEAD logger.info("Starting Advanced Layout-Aware Ingestion for document: %s", original_name) # 2. Trigger your advanced structural parser @@ -87,9 +88,9 @@ def process_document( doc.status = "completed" doc.processing_progress = 100 db.commit() + status = doc.status - return {"document_id": document_id, "status": "completed"} - + return {"document_id": document_id, "status": status} except Exception as exc: logger.error("Document %s processing failed (attempt %s): %s", document_id, self.request.retries + 1, exc) with get_db_session() as db: From d6cca443a39ccd31d5477e10a2e1bbacdbd1def2 Mon Sep 17 00:00:00 2001 From: ionfwsrijan Date: Sat, 13 Jun 2026 16:16:15 +0530 Subject: [PATCH 02/10] fix: update test to match new AdvancedPDFParser-based ingestion pipeline (#561) --- backend/app/tasks.py | 1 - backend/tests/test_celery_ingestion.py | 22 ++++++++-------------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/backend/app/tasks.py b/backend/app/tasks.py index e02d407a..ef5d9ecc 100644 --- a/backend/app/tasks.py +++ b/backend/app/tasks.py @@ -41,7 +41,6 @@ def process_document( doc.status = "processing" # Set explicitly to show UI activity db.commit() -<<<<<<< HEAD logger.info("Starting Advanced Layout-Aware Ingestion for document: %s", original_name) # 2. Trigger your advanced structural parser diff --git a/backend/tests/test_celery_ingestion.py b/backend/tests/test_celery_ingestion.py index 2e359e63..c35ba4db 100644 --- a/backend/tests/test_celery_ingestion.py +++ b/backend/tests/test_celery_ingestion.py @@ -27,20 +27,14 @@ def test_process_document_ingestion_pipeline(db_session): mock_session_factory.return_value.__enter__.return_value = db_session mock_session_factory.return_value = db_session - # Patch the factory globally, and patch ingest_document right where app.tasks calls it + # Patch the factory globally, and patch AdvancedPDFParser to avoid real file I/O with patch("app.database.SessionLocal", mock_session_factory, create=True), \ - patch("app.services.document_ingestion.SessionLocal", mock_session_factory, create=True), \ - patch("app.tasks.ingest_document") as mock_ingest: - - # Simulate what the underlying service does upon a successful processing run - def simulate_successful_ingestion(*args, **kwargs): - doc = db_session.query(Document).filter_by(id="test-doc-123").first() - if doc: - doc.status = "ready" - db_session.commit() - return {"status": "success"} - - mock_ingest.side_effect = simulate_successful_ingestion + patch("app.tasks.AdvancedPDFParser") as mock_parser: + + # Return empty chunks so the vectorization loop is a no-op + mock_parser_instance = MagicMock() + mock_parser.return_value = mock_parser_instance + mock_parser_instance.ingest_document.return_value = [] task_result = process_document.apply( kwargs={ @@ -57,4 +51,4 @@ def simulate_successful_ingestion(*args, **kwargs): # Query the database to verify the state update updated_doc = db_session.query(Document).filter_by(id="test-doc-123").first() assert updated_doc is not None - assert updated_doc.status == "ready" \ No newline at end of file + assert updated_doc.status == "completed" \ No newline at end of file From ab3dd3f90cdb9e823731d2a5efb0805a922bc3f3 Mon Sep 17 00:00:00 2001 From: ionfwsrijan Date: Sat, 13 Jun 2026 16:26:02 +0530 Subject: [PATCH 03/10] chore: add missing pymupdf4llm dependency to requirements.txt --- backend/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/requirements.txt b/backend/requirements.txt index f46463bf..a4c8d700 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -29,6 +29,7 @@ httpx # Document Processing PyMuPDF +pymupdf4llm pdfplumber python-docx unstructured[pdf] From 6d66c763657b97e930725655efe61d6d3aea0eb9 Mon Sep 17 00:00:00 2001 From: ionfwsrijan Date: Sat, 13 Jun 2026 16:30:27 +0530 Subject: [PATCH 04/10] chore: add missing google-generativeai dependency to requirements.txt --- backend/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/requirements.txt b/backend/requirements.txt index a4c8d700..74d26a84 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -55,6 +55,7 @@ spacy>=3.7 neo4j>=5.0 # LLM Inference +google-generativeai huggingface-hub # Production From 22a0f08ddce85a624b03ae3702435eae7f76b6d4 Mon Sep 17 00:00:00 2001 From: ionfwsrijan Date: Sat, 13 Jun 2026 16:39:06 +0530 Subject: [PATCH 05/10] ci: add explicit pip install for pymupdf4llm and google-generativeai --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 74838149..47a0bef2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,6 +40,8 @@ jobs: pip install flake8 flake8-bugbear # Install project deps (skip heavy ML libs with stub extras) pip install -r backend/requirements.txt --quiet || true + # Install document-processing dependencies added after CI was broken + pip install pymupdf4llm google-generativeai - name: Flake8 lint (errors only, no style noise) run: | From 038f947e731dcb708fa43d72d921cf3281c245ad Mon Sep 17 00:00:00 2001 From: ionfwsrijan Date: Sat, 13 Jun 2026 16:51:56 +0530 Subject: [PATCH 06/10] ci: force-reinstall pymupdf4llm and google-generativeai to fix stale cache --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 47a0bef2..ffbd04e4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,8 +40,8 @@ jobs: pip install flake8 flake8-bugbear # Install project deps (skip heavy ML libs with stub extras) pip install -r backend/requirements.txt --quiet || true - # Install document-processing dependencies added after CI was broken - pip install pymupdf4llm google-generativeai + # Install document-processing dependencies (force reinstall to fix cached stale files) + pip install --force-reinstall pymupdf4llm google-generativeai - name: Flake8 lint (errors only, no style noise) run: | From e9bfccac98dcacb8030ccf17b6f83f8c802cb189 Mon Sep 17 00:00:00 2001 From: ionfwsrijan Date: Sat, 13 Jun 2026 16:57:04 +0530 Subject: [PATCH 07/10] fix: use google-genai (not deprecated google-generativeai) for genai import --- .github/workflows/ci.yml | 2 +- backend/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ffbd04e4..09d82ea8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -41,7 +41,7 @@ jobs: # Install project deps (skip heavy ML libs with stub extras) pip install -r backend/requirements.txt --quiet || true # Install document-processing dependencies (force reinstall to fix cached stale files) - pip install --force-reinstall pymupdf4llm google-generativeai + pip install --force-reinstall pymupdf4llm google-genai - name: Flake8 lint (errors only, no style noise) run: | diff --git a/backend/requirements.txt b/backend/requirements.txt index 74d26a84..8c3db07f 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -55,7 +55,7 @@ spacy>=3.7 neo4j>=5.0 # LLM Inference -google-generativeai +google-genai huggingface-hub # Production From 3d6572bc3c85075a3e8a978c5804cd26ff9b129b Mon Sep 17 00:00:00 2001 From: ionfwsrijan Date: Sat, 13 Jun 2026 17:01:43 +0530 Subject: [PATCH 08/10] ci: add GOOGLE_API_KEY env var for genai.Client() at import time --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 09d82ea8..06101910 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -58,6 +58,7 @@ jobs: DATABASE_URL: sqlite:///./ci_test.db DEBUG: "false" HF_TOKEN: ci-dummy-token + GOOGLE_API_KEY: ci-dummy-key UPLOAD_DIR: /tmp/uploads CHROMA_PERSIST_DIR: /tmp/chroma run: | @@ -72,6 +73,7 @@ jobs: DATABASE_URL: sqlite:///./ci_test.db DEBUG: "false" HF_TOKEN: ci-dummy-token + GOOGLE_API_KEY: ci-dummy-key UPLOAD_DIR: /tmp/uploads CHROMA_PERSIST_DIR: /tmp/chroma run: | From 754f21950f1e8e93963f052d175c387eee85e65f Mon Sep 17 00:00:00 2001 From: ionfwsrijan Date: Sat, 13 Jun 2026 17:07:24 +0530 Subject: [PATCH 09/10] fix: update test to mock AdvancedPDFParser instead of removed ingest_document --- backend/tests/test_celery_ingestion.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/backend/tests/test_celery_ingestion.py b/backend/tests/test_celery_ingestion.py index c35ba4db..de997965 100644 --- a/backend/tests/test_celery_ingestion.py +++ b/backend/tests/test_celery_ingestion.py @@ -5,10 +5,11 @@ from app.models import Document from app.tasks import process_document + def test_process_document_ingestion_pipeline(db_session): """ - Test that the Celery task updates document status from pending to ready - by executing the ingestion engine inside the active test database session. + Test that the Celery task updates document status from pending to completed + by executing the layout-aware parser pipeline inside the active test database session. """ # 1. SETUP: Create a mock document that starts as 'pending' @@ -17,7 +18,7 @@ def test_process_document_ingestion_pipeline(db_session): filename="sample.pdf", original_name="sample.pdf", status="pending", - user_id="user-456" + user_id="user-456", ) db_session.add(test_doc) db_session.commit() @@ -27,14 +28,16 @@ def test_process_document_ingestion_pipeline(db_session): mock_session_factory.return_value.__enter__.return_value = db_session mock_session_factory.return_value = db_session - # Patch the factory globally, and patch AdvancedPDFParser to avoid real file I/O + # Patch the factory globally, and mock AdvancedPDFParser so no real PDF is parsed with patch("app.database.SessionLocal", mock_session_factory, create=True), \ - patch("app.tasks.AdvancedPDFParser") as mock_parser: + patch("app.services.document_ingestion.SessionLocal", mock_session_factory, create=True), \ + patch("app.services.layout_parser.AdvancedPDFParser") as mock_parser_cls: - # Return empty chunks so the vectorization loop is a no-op - mock_parser_instance = MagicMock() - mock_parser.return_value = mock_parser_instance - mock_parser_instance.ingest_document.return_value = [] + mock_parser = MagicMock() + mock_parser_cls.return_value = mock_parser + mock_parser.ingest_document.return_value = [ + {"text": "mock chunk 1", "page_number": 1, "type": "text_layout"}, + ] task_result = process_document.apply( kwargs={ @@ -47,7 +50,7 @@ def test_process_document_ingestion_pipeline(db_session): # 3. ASSERT: Verify the task metrics and status changes inside the session context assert task_result.status == "SUCCESS" - + # Query the database to verify the state update updated_doc = db_session.query(Document).filter_by(id="test-doc-123").first() assert updated_doc is not None From 2454af3a26eb04e6667bb17d26959909a6338fad Mon Sep 17 00:00:00 2001 From: ionfwsrijan Date: Sat, 13 Jun 2026 17:14:04 +0530 Subject: [PATCH 10/10] Fix test: patch AdvancedPDFParser at app.tasks namespace (not layout_parser) to account for module-level import --- backend/tests/test_celery_ingestion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/tests/test_celery_ingestion.py b/backend/tests/test_celery_ingestion.py index de997965..bb4d3001 100644 --- a/backend/tests/test_celery_ingestion.py +++ b/backend/tests/test_celery_ingestion.py @@ -31,7 +31,7 @@ def test_process_document_ingestion_pipeline(db_session): # Patch the factory globally, and mock AdvancedPDFParser so no real PDF is parsed with patch("app.database.SessionLocal", mock_session_factory, create=True), \ patch("app.services.document_ingestion.SessionLocal", mock_session_factory, create=True), \ - patch("app.services.layout_parser.AdvancedPDFParser") as mock_parser_cls: + patch("app.tasks.AdvancedPDFParser") as mock_parser_cls: mock_parser = MagicMock() mock_parser_cls.return_value = mock_parser