Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 11 additions & 8 deletions document_qa/document_qa_engine.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import copy
import os
from pathlib import Path
from typing import Union, Any, List
from typing import Union, Any, List, Tuple

import tiktoken
from langchain.chains import create_extraction_chain
Expand All @@ -15,7 +15,7 @@
from langchain_core.vectorstores import VectorStore
from tqdm import tqdm

from document_qa.grobid_processors import GrobidProcessor
from document_qa.grobid_processors import GrobidProcessor, GrobidServiceError
from document_qa.langchain import ChromaAdvancedRetrieval


Expand Down Expand Up @@ -209,7 +209,8 @@ def __init__(self,
llm,
data_storage: DataStorage,
grobid_url=None,
memory=None
memory=None,
ping_grobid_server: bool = False
):

self.llm = llm
Expand All @@ -219,7 +220,7 @@ def __init__(self,
self.data_storage = data_storage

if grobid_url:
self.grobid_processor = GrobidProcessor(grobid_url)
self.grobid_processor = GrobidProcessor(grobid_url, ping_server=ping_grobid_server)

Comment on lines 222 to 224
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Sanakhamassi why was this removed?

def query_document(
self,
Expand All @@ -229,7 +230,7 @@ def query_document(
context_size=4,
extraction_schema=None,
verbose=False
) -> (Any, str):
) -> Tuple[Any, str]:
# self.load_embeddings(self.embeddings_root_path)

if verbose:
Expand Down Expand Up @@ -258,7 +259,7 @@ def query_document(
else:
return None, response, coordinates

def query_storage(self, query: str, doc_id, context_size=4) -> (List[Document], list):
def query_storage(self, query: str, doc_id, context_size=4) -> Tuple[List[Document], list]:
"""
Returns the context related to a given query
"""
Expand Down Expand Up @@ -329,12 +330,12 @@ def _parse_json(self, response, output_parser):

return parsed_output

def _run_query(self, doc_id, query, context_size=4) -> (List[Document], list):
def _run_query(self, doc_id, query, context_size=4) -> Tuple[List[Document], list]:
relevant_documents, relevant_document_coordinates = self._get_context(doc_id, query, context_size)
response = self.chain.invoke({"context": relevant_documents, "question": query})
return response, relevant_document_coordinates

def _get_context(self, doc_id, query, context_size=4) -> (List[Document], list):
def _get_context(self, doc_id, query, context_size=4) -> Tuple[List[Document], list]:
db = self.data_storage.embeddings_dict[doc_id]
retriever = db.as_retriever(search_kwargs={"k": context_size})
relevant_documents = retriever.invoke(query)
Expand Down Expand Up @@ -376,6 +377,8 @@ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1,
filename = Path(pdf_file_path).stem
coordinates = True # if chunk_size == -1 else False
structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates)
if not structure:
raise GrobidServiceError("Grobid did not return a response.")
Comment thread
lfoppiano marked this conversation as resolved.

biblio = structure['biblio']
biblio['filename'] = filename.replace(" ", "_")
Expand Down
36 changes: 25 additions & 11 deletions document_qa/grobid_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@
from grobid_client.grobid_client import GrobidClient


class GrobidServiceError(RuntimeError):
    """Signals that the Grobid service could not process a document.

    The optional ``status_code`` carries the HTTP status returned by
    Grobid; it is ``None`` for transport-level failures (no response).
    """

    def __init__(self, message="Grobid service error", status_code=None):
        # Record the HTTP status first; None means no response was received.
        self.status_code = status_code
        # Delegate message storage to RuntimeError so str(exc) behaves normally.
        super().__init__(message)


def get_span_start(type, title=None):
title_ = ' title="' + title + '"' if title is not None else ""
return '<span class="label ' + type + '"' + title_ + '>'
Expand Down Expand Up @@ -97,18 +105,24 @@ def __init__(self, grobid_url, ping_server=True):
self.grobid_client = grobid_client

def process_structure(self, input_path, coordinates=False):
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
input_path,
consolidate_header=True,
consolidate_citations=False,
segment_sentences=False,
tei_coordinates=coordinates,
include_raw_citations=False,
include_raw_affiliations=False,
generateIDs=True)
try:
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
input_path,
consolidate_header=True,
consolidate_citations=False,
segment_sentences=False,
tei_coordinates=coordinates,
include_raw_citations=False,
include_raw_affiliations=False,
generateIDs=True)
except Exception as exc:
raise GrobidServiceError("Grobid service did not respond.") from exc
Comment thread
lfoppiano marked this conversation as resolved.

Comment on lines +116 to 120
if status != 200:
return
raise GrobidServiceError(
f"Grobid service returned status {status}.",
status_code=status
)
Comment thread
lfoppiano marked this conversation as resolved.
Comment on lines +108 to +125

document_object = self.parse_grobid_xml(text, coordinates=coordinates)
document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
Expand Down Expand Up @@ -137,7 +151,7 @@ def parse_grobid_xml(self, text, coordinates=False):
try:
year = dateparser.parse(doc_biblio.header.date).year
biblio["publication_year"] = year
except:
except Exception:
pass

output_data['biblio'] = biblio
Expand Down
49 changes: 32 additions & 17 deletions streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,19 @@
from tempfile import NamedTemporaryFile

import dotenv
import streamlit as st
from grobid_quantities.quantities import QuantitiesAPI
from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI
from streamlit_pdf_viewer import pdf_viewer

from document_qa.custom_embeddings import ModalEmbeddings
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations, GrobidServiceError
from document_qa.ner_client_generic import NERClientGeneric

dotenv.load_dotenv(override=True)

import streamlit as st
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations

API_MODELS = {
"microsoft/Phi-4-mini-instruct": os.environ["PHI_URL"],
"Qwen/Qwen3-0.6B": os.environ["QWEN_URL"]
Expand Down Expand Up @@ -314,19 +313,35 @@ def play_old_messages(container):
st.stop()

with left_column:
with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
binary = uploaded_file.getvalue()
tmp_file = NamedTemporaryFile()
tmp_file.write(bytearray(binary))
st.session_state['binary'] = binary

st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
tmp_file.name,
chunk_size=chunk_size,
perc_overlap=0.1
)
st.session_state['loaded_embeddings'] = True
st.session_state.messages = []
try:
with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
binary = uploaded_file.getvalue()
tmp_path = None
try:
with NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
tmp_file.write(bytearray(binary))
tmp_file.flush()
tmp_path = tmp_file.name
st.session_state['binary'] = binary

st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
tmp_path,
chunk_size=chunk_size,
perc_overlap=0.1
)
Comment on lines +325 to +331
finally:
if tmp_path and os.path.exists(tmp_path):
os.unlink(tmp_path)
st.session_state['loaded_embeddings'] = True
st.session_state.messages = []
except GrobidServiceError as exc:
message = str(exc).strip() or "Grobid is not responding"
status = f" (status {exc.status_code})" if exc.status_code else ""
st.session_state['doc_id'] = None
st.session_state['loaded_embeddings'] = False
st.session_state['uploaded'] = False
st.error(f"{message}{status} Please try later.")
st.stop()


def rgb_to_hex(rgb):
Expand Down
82 changes: 80 additions & 2 deletions tests/test_grobid_processors.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
import os

from unittest.mock import MagicMock, patch
import pytest
from bs4 import BeautifulSoup
from document_qa.grobid_processors import get_xml_nodes_body, get_xml_nodes_figures, get_xml_nodes_header
from document_qa.grobid_processors import (
GrobidProcessor,
GrobidServiceError,
get_xml_nodes_body,
get_xml_nodes_figures,
get_xml_nodes_header,
)
from tests.resources import TEST_DATA_PATH


Expand Down Expand Up @@ -48,3 +55,74 @@ def test_get_xml_nodes_header_sentences():
children = get_xml_nodes_header(soup, use_paragraphs=False)

assert sum([len(child) for k, child in children.items()]) == 15

def test_grobid_service_error_default_status_code():
    """status_code defaults to None when only a message is supplied."""
    exc = GrobidServiceError("Something went wrong")
    assert str(exc) == "Something went wrong"
    assert exc.status_code is None


def test_grobid_service_error_stores_status_code():
    """An explicit status_code is stored and the message survives str()."""
    exc = GrobidServiceError("Bad gateway", status_code=502)
    assert "Bad gateway" in str(exc)
    assert exc.status_code == 502

@pytest.fixture
def grobid_processor():
    """Yield a GrobidProcessor whose underlying GrobidClient is mocked out.

    ping_server=False avoids any network access during construction.
    """
    with patch("document_qa.grobid_processors.GrobidClient") as client_cls:
        client_cls.return_value = MagicMock()
        yield GrobidProcessor("http://fake-url", ping_server=False)


# Connection/timeout failures
def test_process_structure_raises_on_connection_error(grobid_processor):
    """A transport failure is wrapped in GrobidServiceError with no status."""
    mock_pdf = grobid_processor.grobid_client.process_pdf
    mock_pdf.side_effect = ConnectionError("Connection refused")

    with pytest.raises(GrobidServiceError) as raised:
        grobid_processor.process_structure("fake.pdf")

    # Transport errors carry no HTTP status and a "did not respond" message.
    assert raised.value.status_code is None
    assert "did not respond" in str(raised.value).lower()


def test_process_structure_raises_on_timeout(grobid_processor):
    """A timeout is wrapped in GrobidServiceError with no status code."""
    grobid_processor.grobid_client.process_pdf.side_effect = TimeoutError("Request timed out")

    with pytest.raises(GrobidServiceError) as raised:
        grobid_processor.process_structure("fake.pdf")

    assert raised.value.status_code is None


# Non-200 HTTP status codes
def test_process_structure_raises_on_503_status(grobid_processor):
    """HTTP 503 from Grobid surfaces as GrobidServiceError(status_code=503)."""
    grobid_processor.grobid_client.process_pdf.return_value = ("fake.pdf", 503, None)

    with pytest.raises(GrobidServiceError) as raised:
        grobid_processor.process_structure("fake.pdf")

    # The status is kept on the exception and echoed in its message.
    assert raised.value.status_code == 503
    assert "503" in str(raised.value)


def test_process_structure_raises_on_500_status(grobid_processor):
    """HTTP 500 from Grobid surfaces as GrobidServiceError(status_code=500)."""
    grobid_processor.grobid_client.process_pdf.return_value = ("fake.pdf", 500, None)

    with pytest.raises(GrobidServiceError) as raised:
        grobid_processor.process_structure("fake.pdf")

    # The status is kept on the exception and echoed in its message.
    assert raised.value.status_code == 500
    assert "500" in str(raised.value)


def test_process_structure_raises_on_404_status(grobid_processor):
    """HTTP 404 from Grobid surfaces as GrobidServiceError(status_code=404).

    Also asserts the status appears in the message, matching the 503/500
    sibling tests (the original test omitted that check).
    """
    grobid_processor.grobid_client.process_pdf.return_value = ("fake.pdf", 404, None)

    with pytest.raises(GrobidServiceError) as exc_info:
        grobid_processor.process_structure("fake.pdf")

    assert exc_info.value.status_code == 404
    # Consistency with the 503/500 tests: status is echoed in the message.
    assert "404" in str(exc_info.value)