From 901378b1a9385bbd80de0d92394e20b0be42d6f7 Mon Sep 17 00:00:00 2001 From: Gopi Rk Date: Sun, 8 Mar 2026 23:19:10 +0530 Subject: [PATCH] feat: implement automated multilingual translation layer --- README.md | 13 +++ api/db/models.py | 2 + api/routes/forms.py | 24 ++++-- api/schemas/forms.py | 3 + requirements.txt | 4 +- src/controller.py | 11 ++- src/file_manipulator.py | 29 +++++++ src/llm.py | 1 + src/translator.py | 81 ++++++++++++++++++ tests/test_forms.py | 167 ++++++++++++++++++++++++++++++++------ tests/test_translation.py | 122 ++++++++++++++++++++++++++++ 11 files changed, 424 insertions(+), 33 deletions(-) create mode 100644 src/translator.py create mode 100644 tests/test_translation.py diff --git a/README.md b/README.md index 42862e3..d9f2ad5 100644 --- a/README.md +++ b/README.md @@ -21,9 +21,22 @@ The result is hours of time saved per shift, per firefighter. - **Agnostic:** Works with any department's existing fillable PDF forms. - **AI-Powered:** Uses open-source, locally-run LLMs (Mistral) to extract data from natural language. No data ever needs to leave the local machine. - **Single Point of Entry:** Eliminates redundant data entry entirely. +- **🌍 Multilingual:** Automatically detects and translates non-English inputs (French, Arabic, Spanish, and more) to English before processing, ensuring the output PDF is always in standardized English regardless of the responder's language. Open-Source (DPG): Built 100% with open-source tools to be a true Digital Public Good, freely available for any department to adopt and modify. +## 🌍 Multilingual Support + +FireForm is used by first responders across UN international missions. Responders may record voice notes in their native language. FireForm automatically handles this: + +1. **Language detection** — the input language is detected automatically (e.g. French, Arabic, Spanish). +2. **Translation** — non-English text is translated to English before the AI processes it. +3. 
@router.post("/fill", response_model=FormFillResponse)
def fill_form(form: FormFill, db: Session = Depends(get_db)):
    """Fill a template's PDF from free-text input and record the submission.

    The input is translated to English exactly once, and that English text
    is what is handed to ``Controller.fill_form``. Previously the handler
    ran ``extract_data`` (translation + extraction), discarded the extracted
    data, and then called ``fill_form`` with the raw, untranslated input, so
    the translation never influenced the generated PDF.

    The stored submission keeps the responder's original ``input_text``
    together with the detected source language code.

    Raises:
        AppError: with status 404 when ``form.template_id`` matches no template.
    """
    # Local import keeps this fix self-contained within the handler.
    from src.translator import Translator

    fetched_template = get_template(db, form.template_id)
    if not fetched_template:
        raise AppError("Template not found", status_code=404)

    # translate_to_english falls back to the original text on failure, so the
    # request is never blocked by the translation step.
    translated_input, detected_language = Translator().translate_to_english(
        form.input_text
    )

    controller = Controller()
    path = controller.fill_form(
        user_input=translated_input,
        fields=fetched_template.fields,
        pdf_form_path=fetched_template.pdf_path,
    )

    submission = FormSubmission(
        **form.model_dump(),
        output_pdf_path=path,
        detected_language=detected_language,
    )
    return create_form(db, submission)
+ + Returns: + extracted_data (dict), detected_language (str) + """ + return self.file_manipulator.extract_data(user_input, fields, existing_data) + def fill_form(self, user_input: str, fields: list, pdf_form_path: str): return self.file_manipulator.fill_form(user_input, fields, pdf_form_path) - + def create_template(self, pdf_path: str): return self.file_manipulator.create_template(pdf_path) \ No newline at end of file diff --git a/src/file_manipulator.py b/src/file_manipulator.py index b7815cc..6393dec 100644 --- a/src/file_manipulator.py +++ b/src/file_manipulator.py @@ -1,6 +1,7 @@ import os from src.filler import Filler from src.llm import LLM +from src.translator import Translator from commonforms import prepare_form @@ -17,6 +18,34 @@ def create_template(self, pdf_path: str): prepare_form(pdf_path, template_path) return template_path + def extract_data(self, user_input: str, fields: dict, existing_data: dict = None): + """ + Translates the raw user input to English (if needed), then runs the LLM + to extract data from the translated text. + + Returns: + extracted_data (dict): Extracted field values. + missing_fields (list): Fields that could not be extracted. + detected_language (str): BCP-47 code of the source language + (e.g. "fr", "ar", "en"). + """ + print("[1] Starting extraction process...") + if existing_data is None: + existing_data = {} + + # --- Translation step (Issue #107) --- + translator = Translator() + translated_input, detected_language = translator.translate_to_english(user_input) + if detected_language != "en": + print( + f"[TRANSLATION] Detected language: '{detected_language}'. " + "Input translated to English before LLM processing." 
# langdetect reports Chinese as lowercase "zh-cn" / "zh-tw", while the Google
# backend of deep-translator lists the same languages as "zh-CN" / "zh-TW".
# Codes not listed here are passed through unchanged.
_GOOGLE_SOURCE_CODES = {"zh-cn": "zh-CN", "zh-tw": "zh-TW"}


def _detect_language(text: str) -> str:
    """Return the language code langdetect assigns to *text* (e.g. 'fr', 'ar', 'en').

    Falls back to ``'en'`` (and prints a warning) if detection fails for any
    reason, including ``langdetect`` not being importable, so that the
    pipeline can always continue.
    """
    try:
        from langdetect import DetectorFactory, detect  # type: ignore

        # langdetect is randomised by default; a fixed seed makes repeated
        # detections of the same text return the same code.
        DetectorFactory.seed = 0
        return detect(text)
    except Exception as exc:
        # Deliberately broad: detection is best-effort and must never block
        # form filling. The failure is reported instead of silently hidden.
        print(f"[WARNING] Language detection failed: {exc}. Assuming English.")
        return "en"


class Translator:
    """Lightweight translation wrapper for the FireForm pipeline.

    Translation is delegated to ``deep_translator.GoogleTranslator``, i.e. the
    non-English input text is sent to an external service.

    Example usage::

        translator = Translator()
        english_text, lang_code = translator.translate_to_english(
            "Le nom de l'employé est Jean Dupont."
        )
        # english_text -> "The employee's name is Jean Dupont."
        # lang_code    -> "fr"
    """

    def translate_to_english(self, text: str) -> tuple[str, str]:
        """Translate *text* to English and return ``(translated_text, source_lang_code)``.

        If the text is empty/blank, or the detected language is already
        English (``"en"``), the original text is returned as-is without
        calling a translation service.

        If the translation call raises, or yields no usable text, a warning
        is printed and the original text is returned so the pipeline is
        never blocked.

        Args:
            text: Raw input string (may be any language).

        Returns:
            A tuple of:
            - ``translated_text`` – the English version of *text*, or *text*
              itself when no translation was needed or possible.
            - ``source_lang`` – the code returned by language detection
              (e.g. ``"fr"``, ``"ar"``, ``"en"``).
        """
        if not text or not text.strip():
            return text, "en"

        source_lang = _detect_language(text)
        if source_lang == "en":
            return text, "en"

        try:
            from deep_translator import GoogleTranslator  # type: ignore

            translated = GoogleTranslator(
                source=_GOOGLE_SOURCE_CODES.get(source_lang, source_lang),
                target="en",
            ).translate(text)
        except Exception as exc:
            # If translation fails (e.g. network issue, unsupported code),
            # fall back to the original text.
            print(
                f"[WARNING] Translation failed (source={source_lang}): {exc}. "
                "Falling back to original text."
            )
            return text, source_lang

        # Guard the declared (str, str) contract: never hand None or an empty
        # string to the extraction step in place of the responder's input.
        if not isinstance(translated, str) or not translated.strip():
            print(
                f"[WARNING] Translation returned no text (source={source_lang}). "
                "Falling back to original text."
            )
            return text, source_lang

        return translated, source_lang
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _create_template(client):
    """Create a two-field template through the API and return its ID.

    ``prepare_form`` is patched for the duration of the request.
    """
    template_payload = {
        "name": "Test Form",
        "pdf_path": "src/inputs/test.pdf",
        "fields": {
            "Employee name": "",
            "Job title": "",
        },
    }
    with patch("src.file_manipulator.prepare_form"):
        response = client.post("/templates/create", json=template_payload)
    assert response.status_code == 200
    return response.json()["id"]


def _ollama_mock(field_responses: dict):
    """Return a side_effect for requests.post that responds per field name.

    The first key of *field_responses* found in the request's prompt decides
    the reply; when none matches, the reply is ``"-1"``.
    """

    def _side_effect(*args, **kwargs):
        prompt = kwargs.get("json", {}).get("prompt", "")
        answer = next(
            (value for field, value in field_responses.items() if field in prompt),
            "-1",
        )
        reply = MagicMock()
        reply.json.return_value = {"response": answer}
        return reply

    return _side_effect


def _google_translator_mock(translated_text):
    """Return ``(class_mock, instance_mock)`` standing in for GoogleTranslator."""
    instance = MagicMock()
    instance.translate.return_value = translated_text
    return MagicMock(return_value=instance), instance


def _post_fill(client, payload, detected_language, llm_answers, translator_cls):
    """POST *payload* to /forms/fill with every external dependency patched.

    Patches language detection, the Google translator class, the LLM HTTP
    call, the PDF filling step and ``os.path.exists``, then returns the
    response object.
    """
    from contextlib import ExitStack

    with ExitStack() as stack:
        stack.enter_context(
            patch("src.translator._detect_language", return_value=detected_language)
        )
        stack.enter_context(patch("deep_translator.GoogleTranslator", translator_cls))
        stack.enter_context(
            patch("src.llm.requests.post", side_effect=_ollama_mock(llm_answers))
        )
        stack.enter_context(
            patch(
                "src.file_manipulator.FileManipulator.fill_form",
                return_value="output.pdf",
            )
        )
        stack.enter_context(patch("os.path.exists", return_value=True))
        return client.post("/forms/fill", json=payload)


# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------

def test_submit_form_english(client):
    """English input succeeds and never reaches the translation service."""
    template_id = _create_template(client)
    form_payload = {
        "template_id": template_id,
        "input_text": "The employee name is John Doe. His job title is Engineer.",
    }
    translator_cls, _ = _google_translator_mock("unused")

    res = _post_fill(
        client,
        form_payload,
        "en",
        {"Employee name": "John Doe", "Job title": "Engineer"},
        translator_cls,
    )

    assert res.status_code == 200
    assert res.json()["detected_language"] == "en"
    translator_cls.assert_not_called()


def test_submit_form_french_input_translates(client):
    """
    French-language input should be detected, sent for translation to
    English, and the API response should include detected_language='fr'.
    """
    template_id = _create_template(client)
    french_input = "Le nom de l'employé est Jean Dupont. Son titre de poste est Ingénieur."
    english_translation = "The employee's name is Jean Dupont. His job title is Engineer."
    form_payload = {"template_id": template_id, "input_text": french_input}
    translator_cls, translator = _google_translator_mock(english_translation)

    res = _post_fill(
        client,
        form_payload,
        "fr",
        {"Employee name": "Jean Dupont", "Job title": "Engineer"},
        translator_cls,
    )

    assert res.status_code == 200
    assert res.json()["detected_language"] == "fr"
    # The test name promises translation: verify it actually happened.
    translator_cls.assert_called_once_with(source="fr", target="en")
    translator.translate.assert_called_once_with(french_input)


def test_submit_form_arabic_input_translates(client):
    """
    Arabic-language input should be detected, sent for translation, and the
    API response should reflect detected_language='ar'.
    """
    template_id = _create_template(client)
    arabic_input = "اسم الموظف هو محمد علي. مسمى وظيفته مهندس."
    english_translation = "The employee's name is Mohammed Ali. His job title is Engineer."
    form_payload = {"template_id": template_id, "input_text": arabic_input}
    translator_cls, translator = _google_translator_mock(english_translation)

    res = _post_fill(
        client,
        form_payload,
        "ar",
        {"Employee name": "Mohammed Ali", "Job title": "Engineer"},
        translator_cls,
    )

    assert res.status_code == 200
    assert res.json()["detected_language"] == "ar"
    translator_cls.assert_called_once_with(source="ar", target="en")
    translator.translate.assert_called_once_with(arabic_input)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _make_google_translator_mock(translated_text: str):
    """Return a class mock that mimics deep_translator.GoogleTranslator.

    Calling the returned mock yields an instance whose ``translate`` method
    returns *translated_text*.
    """
    instance = MagicMock()
    instance.translate.return_value = translated_text
    return MagicMock(return_value=instance)


# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------

class TestTranslator:
    @staticmethod
    def _assert_translates(source_text, lang_code, expected_english):
        """Shared body of the per-language translation tests.

        Besides the returned tuple, this checks that the translator was built
        with the detected source code and target ``"en"`` and was given the
        raw input, which the returned tuple alone does not prove.
        """
        translator_cls = _make_google_translator_mock(expected_english)

        with patch("src.translator._detect_language", return_value=lang_code):
            with patch("deep_translator.GoogleTranslator", translator_cls):
                translated, lang = Translator().translate_to_english(source_text)

        assert translated == expected_english
        assert lang == lang_code
        translator_cls.assert_called_once_with(source=lang_code, target="en")
        translator_cls.return_value.translate.assert_called_once_with(source_text)

    def test_french_input_translates_to_english(self):
        """French text should be translated to English and lang code returned."""
        self._assert_translates(
            "Le nom de l'employé est Jean Dupont.",
            "fr",
            "The employee's name is Jean Dupont.",
        )

    def test_arabic_input_translates_to_english(self):
        """Arabic text should be translated to English and lang code returned."""
        self._assert_translates(
            "اسم الموظف هو محمد علي.",
            "ar",
            "The employee's name is Mohammed Ali.",
        )

    def test_spanish_input_translates_to_english(self):
        """Spanish text should be translated to English and lang code returned."""
        self._assert_translates(
            "El nombre del empleado es Carlos García.",
            "es",
            "The employee's name is Carlos García.",
        )

    def test_english_input_passes_through_unchanged(self):
        """English text should be returned as-is without any translation call."""
        english_text = "The employee's name is John Doe."

        with patch("src.translator._detect_language", return_value="en"):
            # GoogleTranslator should NOT be called for English input
            with patch("deep_translator.GoogleTranslator") as mock_gt:
                translated, lang = Translator().translate_to_english(english_text)

        assert translated == english_text
        assert lang == "en"
        mock_gt.assert_not_called()

    def test_empty_string_returns_en(self):
        """Empty input should be returned unchanged with language 'en'."""
        translated, lang = Translator().translate_to_english("")

        assert translated == ""
        assert lang == "en"

    def test_translation_failure_falls_back_gracefully(self):
        """If the translation service raises, the original text is returned."""
        french_text = "Le nom de l'employé est Jean Dupont."

        with patch("src.translator._detect_language", return_value="fr"):
            with patch(
                "deep_translator.GoogleTranslator",
                side_effect=Exception("Network error"),
            ):
                translated, lang = Translator().translate_to_english(french_text)

        # Falls back to original text, but still reports the detected language
        assert translated == french_text
        assert lang == "fr"

    def test_detection_failure_defaults_to_english(self):
        """If language detection itself fails, the text is returned as-is."""
        english_text = "Hello world."

        # langdetect.detect raises; _detect_language catches it and reports
        # 'en', so no translation is attempted.
        with patch("langdetect.detect", side_effect=Exception("detect failed")):
            translated, lang = Translator().translate_to_english(english_text)

        assert translated == english_text
        assert lang == "en"