diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 74838149..06101910 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,6 +40,8 @@ jobs: pip install flake8 flake8-bugbear # Install project deps (skip heavy ML libs with stub extras) pip install -r backend/requirements.txt --quiet || true + # Install document-processing dependencies (force reinstall to fix cached stale files) + pip install --force-reinstall pymupdf4llm google-genai - name: Flake8 lint (errors only, no style noise) run: | @@ -56,6 +58,7 @@ jobs: DATABASE_URL: sqlite:///./ci_test.db DEBUG: "false" HF_TOKEN: ci-dummy-token + GOOGLE_API_KEY: ci-dummy-key UPLOAD_DIR: /tmp/uploads CHROMA_PERSIST_DIR: /tmp/chroma run: | @@ -70,6 +73,7 @@ jobs: DATABASE_URL: sqlite:///./ci_test.db DEBUG: "false" HF_TOKEN: ci-dummy-token + GOOGLE_API_KEY: ci-dummy-key UPLOAD_DIR: /tmp/uploads CHROMA_PERSIST_DIR: /tmp/chroma run: | diff --git a/backend/app/services/document_ingestion.py b/backend/app/services/document_ingestion.py index 6e76d79d..f4027328 100644 --- a/backend/app/services/document_ingestion.py +++ b/backend/app/services/document_ingestion.py @@ -206,5 +206,6 @@ def ingest_document(document_id: str, filepath: str, original_name: str, user_id db.commit() except Exception: logger.exception("Failed to mark document %s as failed", document_id) + raise finally: db.close() diff --git a/backend/app/tasks.py b/backend/app/tasks.py index 811cad06..ef5d9ecc 100644 --- a/backend/app/tasks.py +++ b/backend/app/tasks.py @@ -87,9 +87,9 @@ def process_document( doc.status = "completed" doc.processing_progress = 100 db.commit() + status = doc.status - return {"document_id": document_id, "status": "completed"} - + return {"document_id": document_id, "status": status} except Exception as exc: logger.error("Document %s processing failed (attempt %s): %s", document_id, self.request.retries + 1, exc) with get_db_session() as db: diff --git a/backend/requirements.txt b/backend/requirements.txt index f46463bf..8c3db07f 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -29,6 +29,7 @@ httpx # Document Processing PyMuPDF +pymupdf4llm pdfplumber python-docx unstructured[pdf] @@ -54,6 +55,7 @@ spacy>=3.7 neo4j>=5.0 # LLM Inference +google-genai huggingface-hub # Production diff --git a/backend/tests/test_celery_ingestion.py b/backend/tests/test_celery_ingestion.py index 2e359e63..bb4d3001 100644 --- a/backend/tests/test_celery_ingestion.py +++ b/backend/tests/test_celery_ingestion.py @@ -5,10 +5,11 @@ from app.models import Document from app.tasks import process_document + def test_process_document_ingestion_pipeline(db_session): """ - Test that the Celery task updates document status from pending to ready - by executing the ingestion engine inside the active test database session. + Test that the Celery task updates document status from pending to completed + by executing the layout-aware parser pipeline inside the active test database session. """ # 1. SETUP: Create a mock document that starts as 'pending' @@ -17,7 +18,7 @@ def test_process_document_ingestion_pipeline(db_session): filename="sample.pdf", original_name="sample.pdf", status="pending", - user_id="user-456" + user_id="user-456", ) db_session.add(test_doc) db_session.commit() @@ -27,20 +28,16 @@ def test_process_document_ingestion_pipeline(db_session): mock_session_factory.return_value.__enter__.return_value = db_session mock_session_factory.return_value = db_session - # Patch the factory globally, and patch ingest_document right where app.tasks calls it + # Patch the factory globally, and mock AdvancedPDFParser so no real PDF is parsed with patch("app.database.SessionLocal", mock_session_factory, create=True), \ patch("app.services.document_ingestion.SessionLocal", mock_session_factory, create=True), \ - patch("app.tasks.ingest_document") as mock_ingest: - - # Simulate what the underlying service does upon a successful processing run - def simulate_successful_ingestion(*args, **kwargs): - doc = db_session.query(Document).filter_by(id="test-doc-123").first() - if doc: - doc.status = "ready" - db_session.commit() - return {"status": "success"} - - mock_ingest.side_effect = simulate_successful_ingestion + patch("app.tasks.AdvancedPDFParser") as mock_parser_cls: + + mock_parser = MagicMock() + mock_parser_cls.return_value = mock_parser + mock_parser.ingest_document.return_value = [ + {"text": "mock chunk 1", "page_number": 1, "type": "text_layout"}, + ] task_result = process_document.apply( kwargs={ @@ -53,8 +50,8 @@ def simulate_successful_ingestion(*args, **kwargs): # 3. ASSERT: Verify the task metrics and status changes inside the session context assert task_result.status == "SUCCESS" - + # Query the database to verify the state update updated_doc = db_session.query(Document).filter_by(id="test-doc-123").first() assert updated_doc is not None - assert updated_doc.status == "ready" \ No newline at end of file + assert updated_doc.status == "completed" \ No newline at end of file