From 6a83c20bcc14ea72161ddb16edc8c8ff170ee009 Mon Sep 17 00:00:00 2001
From: sanakhamassi
Date: Thu, 23 Apr 2026 19:40:05 +0100
Subject: [PATCH 1/6] Add error handling for Grobid service when not responding

---
 document_qa/document_qa_engine.py |  4 +++-
 document_qa/grobid_processors.py  | 36 +++++++++++++++++++++----------
 streamlit_app.py                  | 29 +++++++++++++++----------
 3 files changed, 46 insertions(+), 23 deletions(-)

diff --git a/document_qa/document_qa_engine.py b/document_qa/document_qa_engine.py
index 7560ecf..8dd7f86 100644
--- a/document_qa/document_qa_engine.py
+++ b/document_qa/document_qa_engine.py
@@ -15,7 +15,7 @@
 from langchain_core.vectorstores import VectorStore
 from tqdm import tqdm
 
-from document_qa.grobid_processors import GrobidProcessor
+from document_qa.grobid_processors import GrobidProcessor, GrobidServiceError
 from document_qa.langchain import ChromaAdvancedRetrieval
 
 
@@ -376,6 +376,8 @@ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1,
         filename = Path(pdf_file_path).stem
         coordinates = True  # if chunk_size == -1 else False
         structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates)
+        if not structure:
+            raise GrobidServiceError("Grobid did not return a response.")
 
         biblio = structure['biblio']
         biblio['filename'] = filename.replace(" ", "_")
diff --git a/document_qa/grobid_processors.py b/document_qa/grobid_processors.py
index 0aae0ee..55ec695 100644
--- a/document_qa/grobid_processors.py
+++ b/document_qa/grobid_processors.py
@@ -9,6 +9,14 @@
 from grobid_client.grobid_client import GrobidClient
 
 
+class GrobidServiceError(RuntimeError):
+    """Raised when the Grobid service fails to process a document."""
+
+    def __init__(self, message="Grobid service error", status_code=None):
+        super().__init__(message)
+        self.status_code = status_code
+
+
 def get_span_start(type, title=None):
     title_ = ' title="' + title + '"' if title is not None else ""
     return ''
@@ -97,18 +105,24 @@ def __init__(self, grobid_url, ping_server=True):
         self.grobid_client = grobid_client
 
     def process_structure(self, input_path, coordinates=False):
-        pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
-                                                                input_path,
-                                                                consolidate_header=True,
-                                                                consolidate_citations=False,
-                                                                segment_sentences=False,
-                                                                tei_coordinates=coordinates,
-                                                                include_raw_citations=False,
-                                                                include_raw_affiliations=False,
-                                                                generateIDs=True)
+        try:
+            pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
+                                                                    input_path,
+                                                                    consolidate_header=True,
+                                                                    consolidate_citations=False,
+                                                                    segment_sentences=False,
+                                                                    tei_coordinates=coordinates,
+                                                                    include_raw_citations=False,
+                                                                    include_raw_affiliations=False,
+                                                                    generateIDs=True)
+        except Exception as exc:
+            raise GrobidServiceError("Grobid service did not respond.") from exc
 
         if status != 200:
-            return
+            raise GrobidServiceError(
+                f"Grobid service returned status {status}.",
+                status_code=status
+            )
 
         document_object = self.parse_grobid_xml(text, coordinates=coordinates)
         document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
@@ -137,7 +151,7 @@ def parse_grobid_xml(self, text, coordinates=False):
         try:
             year = dateparser.parse(doc_biblio.header.date).year
             biblio["publication_year"] = year
-        except:
+        except Exception:
             pass
 
         output_data['biblio'] = biblio
diff --git a/streamlit_app.py b/streamlit_app.py
index c01ad2b..a10dc7d 100644
--- a/streamlit_app.py
+++ b/streamlit_app.py
@@ -4,20 +4,19 @@
 from tempfile import NamedTemporaryFile
 
 import dotenv
+import streamlit as st
 from grobid_quantities.quantities import QuantitiesAPI
 from langchain.memory import ConversationBufferMemory
 from langchain_openai import ChatOpenAI
 from streamlit_pdf_viewer import pdf_viewer
 
 from document_qa.custom_embeddings import ModalEmbeddings
+from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
+from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations, GrobidServiceError
 from document_qa.ner_client_generic import NERClientGeneric
 
 dotenv.load_dotenv(override=True)
 
-import streamlit as st
-from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
-from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations
-
 API_MODELS = {
     "microsoft/Phi-4-mini-instruct": os.environ["PHI_URL"],
     "Qwen/Qwen3-0.6B": os.environ["QWEN_URL"]
@@ -320,13 +319,21 @@ def play_old_messages(container):
             tmp_file.write(bytearray(binary))
             st.session_state['binary'] = binary
 
-            st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
-                tmp_file.name,
-                chunk_size=chunk_size,
-                perc_overlap=0.1
-            )
-            st.session_state['loaded_embeddings'] = True
-            st.session_state.messages = []
+            try:
+                st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
+                    tmp_file.name,
+                    chunk_size=chunk_size,
+                    perc_overlap=0.1
+                )
+                st.session_state['loaded_embeddings'] = True
+                st.session_state.messages = []
+            except GrobidServiceError as exc:
+                status = f" (status {exc.status_code})" if exc.status_code else ""
+                st.session_state['doc_id'] = None
+                st.session_state['loaded_embeddings'] = False
+                st.session_state['uploaded'] = False
+                st.error(f"Grobid is not responding{status}. Please try later.")
+                st.stop()
 
 
 def rgb_to_hex(rgb):

From 5670b4b5d078fc093407341ed6500ef82f6538ba Mon Sep 17 00:00:00 2001
From: sanakhamassi
Date: Thu, 30 Apr 2026 10:52:34 +0100
Subject: [PATCH 2/6] Update GrobidProcessor initialization and enhance error
 handling in Streamlit app

---
 document_qa/document_qa_engine.py |  2 +-
 streamlit_app.py                  | 26 +++++++++++++-------------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/document_qa/document_qa_engine.py b/document_qa/document_qa_engine.py
index 8dd7f86..f42e94e 100644
--- a/document_qa/document_qa_engine.py
+++ b/document_qa/document_qa_engine.py
@@ -219,7 +219,7 @@ def __init__(self,
         self.data_storage = data_storage
 
         if grobid_url:
-            self.grobid_processor = GrobidProcessor(grobid_url)
+            self.grobid_processor = GrobidProcessor(grobid_url, ping_server=False)
 
     def query_document(
             self,
diff --git a/streamlit_app.py b/streamlit_app.py
index a10dc7d..b98673d 100644
--- a/streamlit_app.py
+++ b/streamlit_app.py
@@ -313,13 +313,13 @@ def play_old_messages(container):
         st.stop()
 
     with left_column:
-        with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
-            binary = uploaded_file.getvalue()
-            tmp_file = NamedTemporaryFile()
-            tmp_file.write(bytearray(binary))
-            st.session_state['binary'] = binary
+        try:
+            with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
+                binary = uploaded_file.getvalue()
+                tmp_file = NamedTemporaryFile()
+                tmp_file.write(bytearray(binary))
+                st.session_state['binary'] = binary
 
-            try:
                 st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
                     tmp_file.name,
                     chunk_size=chunk_size,
@@ -327,13 +327,13 @@ def play_old_messages(container):
                 )
                 st.session_state['loaded_embeddings'] = True
                 st.session_state.messages = []
-            except GrobidServiceError as exc:
-                status = f" (status {exc.status_code})" if exc.status_code else ""
-                st.session_state['doc_id'] = None
-                st.session_state['loaded_embeddings'] = False
-                st.session_state['uploaded'] = False
-                st.error(f"Grobid is not responding{status}. Please try later.")
-                st.stop()
+        except GrobidServiceError as exc:
+            status = f" (status {exc.status_code})" if exc.status_code else ""
+            st.session_state['doc_id'] = None
+            st.session_state['loaded_embeddings'] = False
+            st.session_state['uploaded'] = False
+            st.error(f"Grobid is not responding{status}. Please try later.")
+            st.stop()
 
 
 def rgb_to_hex(rgb):

From cf2978129ce192929b07bdf6617637fbfb61333a Mon Sep 17 00:00:00 2001
From: sanakhamassi
Date: Thu, 30 Apr 2026 11:14:38 +0100
Subject: [PATCH 3/6] Fix missing space in error message

---
 streamlit_app.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/streamlit_app.py b/streamlit_app.py
index b98673d..bad17f1 100644
--- a/streamlit_app.py
+++ b/streamlit_app.py
@@ -328,11 +328,12 @@ def play_old_messages(container):
                 st.session_state['loaded_embeddings'] = True
                 st.session_state.messages = []
         except GrobidServiceError as exc:
+            message = str(exc).strip() or "Grobid is not responding"
             status = f" (status {exc.status_code})" if exc.status_code else ""
             st.session_state['doc_id'] = None
             st.session_state['loaded_embeddings'] = False
             st.session_state['uploaded'] = False
-            st.error(f"Grobid is not responding{status}. Please try later.")
+            st.error(f"{message} Please try later.")
             st.stop()
 
 

From a7f35463e51b3413693e0a627d6c84cf9f73afcb Mon Sep 17 00:00:00 2001
From: sanakhamassi
Date: Thu, 7 May 2026 11:23:54 +0100
Subject: [PATCH 4/6] use temporary file with suffix and ensure cleanup

---
Review note: the upload is written to a named ".pdf" temporary file that is
closed before Grobid reads it and removed in a `finally` block. The error
banner keeps the `f"{message} Please try later."` form from the previous
commit: GrobidServiceError messages for non-200 responses already contain the
HTTP status, so appending " (status N)" again would show it twice. The
now-unused `status` local is removed.

 streamlit_app.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/streamlit_app.py b/streamlit_app.py
index bad17f1..f75dae2 100644
--- a/streamlit_app.py
+++ b/streamlit_app.py
@@ -316,20 +316,26 @@ def play_old_messages(container):
         try:
             with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
                 binary = uploaded_file.getvalue()
-                tmp_file = NamedTemporaryFile()
-                tmp_file.write(bytearray(binary))
-                st.session_state['binary'] = binary
-
-                st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
-                    tmp_file.name,
-                    chunk_size=chunk_size,
-                    perc_overlap=0.1
-                )
+                tmp_path = None
+                try:
+                    with NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
+                        tmp_file.write(bytearray(binary))
+                        tmp_file.flush()
+                        tmp_path = tmp_file.name
+                    st.session_state['binary'] = binary
+
+                    st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
+                        tmp_path,
+                        chunk_size=chunk_size,
+                        perc_overlap=0.1
+                    )
+                finally:
+                    if tmp_path and os.path.exists(tmp_path):
+                        os.unlink(tmp_path)
                 st.session_state['loaded_embeddings'] = True
                 st.session_state.messages = []
         except GrobidServiceError as exc:
             message = str(exc).strip() or "Grobid is not responding"
-            status = f" (status {exc.status_code})" if exc.status_code else ""
             st.session_state['doc_id'] = None
             st.session_state['loaded_embeddings'] = False
             st.session_state['uploaded'] = False

From 76673e2af80f7759a1f6157522e768dfdeb163f9 Mon Sep 17 00:00:00 2001
From: sanakhamassi
Date: Thu, 7 May 2026 11:24:56 +0100
Subject: [PATCH 5/6] Enhance DocumentQAEngine with ping_grobid_server parameter and
 update type hints

---
Review note: `ping_grobid_server` defaults to False, so constructing the
engine does not contact Grobid unless the caller asks for it. The return
annotations are written with `typing.Tuple` and follow the values actually
returned: `query_document` returns three values (`None, response,
coordinates`), and `_run_query` returns the chain response together with the
coordinates list.

 document_qa/document_qa_engine.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/document_qa/document_qa_engine.py b/document_qa/document_qa_engine.py
index f42e94e..8a55a66 100644
--- a/document_qa/document_qa_engine.py
+++ b/document_qa/document_qa_engine.py
@@ -1,7 +1,7 @@
 import copy
 import os
 from pathlib import Path
-from typing import Union, Any, List
+from typing import Union, Any, List, Tuple
 
 import tiktoken
 from langchain.chains import create_extraction_chain
@@ -209,7 +209,8 @@ def __init__(self,
                  llm,
                  data_storage: DataStorage,
                  grobid_url=None,
-                 memory=None
+                 memory=None,
+                 ping_grobid_server: bool = False
                  ):
 
         self.llm = llm
@@ -219,7 +220,7 @@ def __init__(self,
         self.data_storage = data_storage
 
         if grobid_url:
-            self.grobid_processor = GrobidProcessor(grobid_url, ping_server=False)
+            self.grobid_processor = GrobidProcessor(grobid_url, ping_server=ping_grobid_server)
 
     def query_document(
             self,
@@ -229,7 +230,7 @@ def query_document(
             context_size=4,
             extraction_schema=None,
             verbose=False
-    ) -> (Any, str):
+    ) -> Tuple[Any, str, list]:
         # self.load_embeddings(self.embeddings_root_path)
 
         if verbose:
@@ -258,7 +259,7 @@ def query_document(
         else:
             return None, response, coordinates
 
-    def query_storage(self, query: str, doc_id, context_size=4) -> (List[Document], list):
+    def query_storage(self, query: str, doc_id, context_size=4) -> Tuple[List[Document], list]:
         """
         Returns the context related to a given query
         """
@@ -329,12 +330,12 @@ def _parse_json(self, response, output_parser):
 
         return parsed_output
 
-    def _run_query(self, doc_id, query, context_size=4) -> (List[Document], list):
+    def _run_query(self, doc_id, query, context_size=4) -> Tuple[Any, list]:
         relevant_documents, relevant_document_coordinates = self._get_context(doc_id, query, context_size)
         response = self.chain.invoke({"context": relevant_documents, "question": query})
         return response, relevant_document_coordinates
 
-    def _get_context(self, doc_id, query, context_size=4) -> (List[Document], list):
+    def _get_context(self, doc_id, query, context_size=4) -> Tuple[List[Document], list]:
         db = self.data_storage.embeddings_dict[doc_id]
         retriever = db.as_retriever(search_kwargs={"k": context_size})
         relevant_documents = retriever.invoke(query)

From 24c0f84ec2a4040eaec168134b715d862fea38ee Mon Sep 17 00:00:00 2001
From: sanakhamassi
Date: Thu, 7 May 2026 11:42:29 +0100
Subject: [PATCH 6/6] add tests for GrobidServiceError handling

---
Review note: the tests replace `GrobidClient` with a mock, so no Grobid
server is contacted. They cover the exception's `status_code` attribute, the
wrapping of client-side failures (connection error, timeout) and the
translation of non-200 responses (503, 500, 404). The file now ends with a
newline.

 tests/test_grobid_processors.py | 82 ++++++++++++++++++++++++++++++++-
 1 file changed, 80 insertions(+), 2 deletions(-)

diff --git a/tests/test_grobid_processors.py b/tests/test_grobid_processors.py
index 9edf1ea..158c88d 100644
--- a/tests/test_grobid_processors.py
+++ b/tests/test_grobid_processors.py
@@ -1,7 +1,14 @@
 import os
-
+from unittest.mock import MagicMock, patch
+import pytest
 from bs4 import BeautifulSoup
-from document_qa.grobid_processors import get_xml_nodes_body, get_xml_nodes_figures, get_xml_nodes_header
+from document_qa.grobid_processors import (
+    GrobidProcessor,
+    GrobidServiceError,
+    get_xml_nodes_body,
+    get_xml_nodes_figures,
+    get_xml_nodes_header,
+)
 from tests.resources import TEST_DATA_PATH
 
 
@@ -48,3 +55,74 @@ def test_get_xml_nodes_header_sentences():
     children = get_xml_nodes_header(soup, use_paragraphs=False)
 
     assert sum([len(child) for k, child in children.items()]) == 15
+
+def test_grobid_service_error_default_status_code():
+    error = GrobidServiceError("Something went wrong")
+    assert error.status_code is None
+    assert str(error) == "Something went wrong"
+
+
+def test_grobid_service_error_stores_status_code():
+    error = GrobidServiceError("Bad gateway", status_code=502)
+    assert error.status_code == 502
+    assert "Bad gateway" in str(error)
+
+@pytest.fixture
+def grobid_processor():
+    with patch("document_qa.grobid_processors.GrobidClient") as mock_client_class:
+        mock_client = MagicMock()
+        mock_client_class.return_value = mock_client
+        processor = GrobidProcessor("http://fake-url", ping_server=False)
+        yield processor
+
+
+# Connection/timeout failures
+def test_process_structure_raises_on_connection_error(grobid_processor):
+    grobid_processor.grobid_client.process_pdf.side_effect = ConnectionError(
+        "Connection refused"
+    )
+    with pytest.raises(GrobidServiceError) as exc_info:
+        grobid_processor.process_structure("fake.pdf")
+
+    assert "did not respond" in str(exc_info.value).lower()
+    assert exc_info.value.status_code is None
+
+
+def test_process_structure_raises_on_timeout(grobid_processor):
+    grobid_processor.grobid_client.process_pdf.side_effect = TimeoutError(
+        "Request timed out"
+    )
+    with pytest.raises(GrobidServiceError) as exc_info:
+        grobid_processor.process_structure("fake.pdf")
+
+    assert exc_info.value.status_code is None
+
+
+# Non-200 HTTP status codes
+def test_process_structure_raises_on_503_status(grobid_processor):
+    grobid_processor.grobid_client.process_pdf.return_value = ("fake.pdf", 503, None)
+
+    with pytest.raises(GrobidServiceError) as exc_info:
+        grobid_processor.process_structure("fake.pdf")
+
+    assert exc_info.value.status_code == 503
+    assert "503" in str(exc_info.value)
+
+
+def test_process_structure_raises_on_500_status(grobid_processor):
+    grobid_processor.grobid_client.process_pdf.return_value = ("fake.pdf", 500, None)
+
+    with pytest.raises(GrobidServiceError) as exc_info:
+        grobid_processor.process_structure("fake.pdf")
+
+    assert exc_info.value.status_code == 500
+    assert "500" in str(exc_info.value)
+
+
+def test_process_structure_raises_on_404_status(grobid_processor):
+    grobid_processor.grobid_client.process_pdf.return_value = ("fake.pdf", 404, None)
+
+    with pytest.raises(GrobidServiceError) as exc_info:
+        grobid_processor.process_structure("fake.pdf")
+
+    assert exc_info.value.status_code == 404