Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 11 additions & 8 deletions document_qa/document_qa_engine.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import copy
import os
from pathlib import Path
from typing import Union, Any, List
from typing import Union, Any, List, Tuple

import tiktoken
from langchain.chains import create_extraction_chain
Expand All @@ -15,7 +15,7 @@
from langchain_core.vectorstores import VectorStore
from tqdm import tqdm

from document_qa.grobid_processors import GrobidProcessor
from document_qa.grobid_processors import GrobidProcessor, GrobidServiceError
from document_qa.langchain import ChromaAdvancedRetrieval


Expand Down Expand Up @@ -209,7 +209,8 @@ def __init__(self,
llm,
data_storage: DataStorage,
grobid_url=None,
memory=None
memory=None,
ping_grobid_server: bool = False
):

self.llm = llm
Expand All @@ -219,7 +220,7 @@ def __init__(self,
self.data_storage = data_storage

if grobid_url:
self.grobid_processor = GrobidProcessor(grobid_url)
self.grobid_processor = GrobidProcessor(grobid_url, ping_server=ping_grobid_server)

Comment on lines 222 to 224
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Sanakhamassi why was this removed?

def query_document(
self,
Expand All @@ -229,7 +230,7 @@ def query_document(
context_size=4,
extraction_schema=None,
verbose=False
) -> (Any, str):
) -> Tuple[Any, str]:
# self.load_embeddings(self.embeddings_root_path)

if verbose:
Expand Down Expand Up @@ -258,7 +259,7 @@ def query_document(
else:
return None, response, coordinates

def query_storage(self, query: str, doc_id, context_size=4) -> (List[Document], list):
def query_storage(self, query: str, doc_id, context_size=4) -> Tuple[List[Document], list]:
"""
Returns the context related to a given query
"""
Expand Down Expand Up @@ -329,12 +330,12 @@ def _parse_json(self, response, output_parser):

return parsed_output

def _run_query(self, doc_id, query, context_size=4) -> (List[Document], list):
def _run_query(self, doc_id, query, context_size=4) -> Tuple[List[Document], list]:
relevant_documents, relevant_document_coordinates = self._get_context(doc_id, query, context_size)
response = self.chain.invoke({"context": relevant_documents, "question": query})
return response, relevant_document_coordinates

def _get_context(self, doc_id, query, context_size=4) -> (List[Document], list):
def _get_context(self, doc_id, query, context_size=4) -> Tuple[List[Document], list]:
db = self.data_storage.embeddings_dict[doc_id]
retriever = db.as_retriever(search_kwargs={"k": context_size})
relevant_documents = retriever.invoke(query)
Expand Down Expand Up @@ -376,6 +377,8 @@ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1,
filename = Path(pdf_file_path).stem
coordinates = True # if chunk_size == -1 else False
structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates)
if not structure:
raise GrobidServiceError("Grobid did not return a response.")
Comment thread
lfoppiano marked this conversation as resolved.

biblio = structure['biblio']
biblio['filename'] = filename.replace(" ", "_")
Expand Down
36 changes: 25 additions & 11 deletions document_qa/grobid_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@
from grobid_client.grobid_client import GrobidClient


class GrobidServiceError(RuntimeError):
    """Signals that the Grobid service could not process a document.

    The optional ``status_code`` carries the HTTP status returned by
    Grobid; it is ``None`` for transport-level failures (no response).
    """

    def __init__(self, message="Grobid service error", status_code=None):
        # Record the HTTP status first; None means no response was received.
        self.status_code = status_code
        # Delegate message storage to RuntimeError so str(exc) behaves normally.
        super().__init__(message)


def get_span_start(type, title=None):
title_ = ' title="' + title + '"' if title is not None else ""
return '<span class="label ' + type + '"' + title_ + '>'
Expand Down Expand Up @@ -97,18 +105,24 @@ def __init__(self, grobid_url, ping_server=True):
self.grobid_client = grobid_client

def process_structure(self, input_path, coordinates=False):
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
input_path,
consolidate_header=True,
consolidate_citations=False,
segment_sentences=False,
tei_coordinates=coordinates,
include_raw_citations=False,
include_raw_affiliations=False,
generateIDs=True)
try:
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
input_path,
consolidate_header=True,
consolidate_citations=False,
segment_sentences=False,
tei_coordinates=coordinates,
include_raw_citations=False,
include_raw_affiliations=False,
generateIDs=True)
except Exception as exc:
raise GrobidServiceError("Grobid service did not respond.") from exc
Comment thread
lfoppiano marked this conversation as resolved.

Comment on lines +116 to 120
if status != 200:
return
raise GrobidServiceError(
f"Grobid service returned status {status}.",
status_code=status
)
Comment thread
lfoppiano marked this conversation as resolved.
Comment on lines +108 to +125

document_object = self.parse_grobid_xml(text, coordinates=coordinates)
document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
Expand Down Expand Up @@ -137,7 +151,7 @@ def parse_grobid_xml(self, text, coordinates=False):
try:
year = dateparser.parse(doc_biblio.header.date).year
biblio["publication_year"] = year
except:
except Exception:
pass

output_data['biblio'] = biblio
Expand Down
49 changes: 32 additions & 17 deletions streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,19 @@
from tempfile import NamedTemporaryFile

import dotenv
import streamlit as st
from grobid_quantities.quantities import QuantitiesAPI
from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI
from streamlit_pdf_viewer import pdf_viewer

from document_qa.custom_embeddings import ModalEmbeddings
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations, GrobidServiceError
from document_qa.ner_client_generic import NERClientGeneric

dotenv.load_dotenv(override=True)

import streamlit as st
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations

API_MODELS = {
"microsoft/Phi-4-mini-instruct": os.environ["PHI_URL"],
"Qwen/Qwen3-0.6B": os.environ["QWEN_URL"]
Expand Down Expand Up @@ -314,19 +313,35 @@ def play_old_messages(container):
st.stop()

with left_column:
with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
binary = uploaded_file.getvalue()
tmp_file = NamedTemporaryFile()
tmp_file.write(bytearray(binary))
st.session_state['binary'] = binary

st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
tmp_file.name,
chunk_size=chunk_size,
perc_overlap=0.1
)
st.session_state['loaded_embeddings'] = True
st.session_state.messages = []
try:
with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
binary = uploaded_file.getvalue()
tmp_path = None
try:
with NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
tmp_file.write(bytearray(binary))
tmp_file.flush()
tmp_path = tmp_file.name
st.session_state['binary'] = binary

st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
tmp_path,
chunk_size=chunk_size,
perc_overlap=0.1
)
Comment on lines +325 to +331
finally:
if tmp_path and os.path.exists(tmp_path):
os.unlink(tmp_path)
st.session_state['loaded_embeddings'] = True
st.session_state.messages = []
except GrobidServiceError as exc:
message = str(exc).strip() or "Grobid is not responding"
status = f" (status {exc.status_code})" if exc.status_code else ""
st.session_state['doc_id'] = None
st.session_state['loaded_embeddings'] = False
st.session_state['uploaded'] = False
st.error(f"{message}{status} Please try later.")
st.stop()


def rgb_to_hex(rgb):
Expand Down
82 changes: 80 additions & 2 deletions tests/test_grobid_processors.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
import os

from unittest.mock import MagicMock, patch
import pytest
from bs4 import BeautifulSoup
from document_qa.grobid_processors import get_xml_nodes_body, get_xml_nodes_figures, get_xml_nodes_header
from document_qa.grobid_processors import (
GrobidProcessor,
GrobidServiceError,
get_xml_nodes_body,
get_xml_nodes_figures,
get_xml_nodes_header,
)
from tests.resources import TEST_DATA_PATH


Expand Down Expand Up @@ -48,3 +55,74 @@ def test_get_xml_nodes_header_sentences():
children = get_xml_nodes_header(soup, use_paragraphs=False)

assert sum([len(child) for k, child in children.items()]) == 15

def test_grobid_service_error_default_status_code():
    """status_code defaults to None when only a message is supplied."""
    exc = GrobidServiceError("Something went wrong")
    assert str(exc) == "Something went wrong"
    assert exc.status_code is None


def test_grobid_service_error_stores_status_code():
    """An explicit status_code is stored and the message survives str()."""
    exc = GrobidServiceError("Bad gateway", status_code=502)
    assert "Bad gateway" in str(exc)
    assert exc.status_code == 502

@pytest.fixture
def grobid_processor():
    """Yield a GrobidProcessor whose underlying GrobidClient is mocked out.

    ping_server=False avoids any network access during construction.
    """
    with patch("document_qa.grobid_processors.GrobidClient") as client_cls:
        client_cls.return_value = MagicMock()
        yield GrobidProcessor("http://fake-url", ping_server=False)


# Connection/timeout failures
def test_process_structure_raises_on_connection_error(grobid_processor):
    """A transport failure is wrapped in GrobidServiceError with no status."""
    mock_pdf = grobid_processor.grobid_client.process_pdf
    mock_pdf.side_effect = ConnectionError("Connection refused")

    with pytest.raises(GrobidServiceError) as raised:
        grobid_processor.process_structure("fake.pdf")

    # Transport errors carry no HTTP status and a "did not respond" message.
    assert raised.value.status_code is None
    assert "did not respond" in str(raised.value).lower()


def test_process_structure_raises_on_timeout(grobid_processor):
    """A timeout is wrapped in GrobidServiceError with no status code."""
    grobid_processor.grobid_client.process_pdf.side_effect = TimeoutError("Request timed out")

    with pytest.raises(GrobidServiceError) as raised:
        grobid_processor.process_structure("fake.pdf")

    assert raised.value.status_code is None


# Non-200 HTTP status codes
def test_process_structure_raises_on_503_status(grobid_processor):
    """HTTP 503 from Grobid surfaces as GrobidServiceError(status_code=503)."""
    grobid_processor.grobid_client.process_pdf.return_value = ("fake.pdf", 503, None)

    with pytest.raises(GrobidServiceError) as raised:
        grobid_processor.process_structure("fake.pdf")

    # The status is kept on the exception and echoed in its message.
    assert raised.value.status_code == 503
    assert "503" in str(raised.value)


def test_process_structure_raises_on_500_status(grobid_processor):
    """HTTP 500 from Grobid surfaces as GrobidServiceError(status_code=500)."""
    grobid_processor.grobid_client.process_pdf.return_value = ("fake.pdf", 500, None)

    with pytest.raises(GrobidServiceError) as raised:
        grobid_processor.process_structure("fake.pdf")

    # The status is kept on the exception and echoed in its message.
    assert raised.value.status_code == 500
    assert "500" in str(raised.value)


def test_process_structure_raises_on_404_status(grobid_processor):
    """HTTP 404 from Grobid surfaces as GrobidServiceError(status_code=404).

    Also asserts the status appears in the message, matching the 503/500
    sibling tests (the original test omitted that check).
    """
    grobid_processor.grobid_client.process_pdf.return_value = ("fake.pdf", 404, None)

    with pytest.raises(GrobidServiceError) as exc_info:
        grobid_processor.process_structure("fake.pdf")

    assert exc_info.value.status_code == 404
    # Consistency with the 503/500 tests: status is echoed in the message.
    assert "404" in str(exc_info.value)