fireform-core · Gopisokk · Mar 8, 2026
diff --git a/README.md b/README.md
@@ -21,9 +21,22 @@ The result is hours of time saved per shift, per firefighter.
 - **Agnostic:** Works with any department's existing fillable PDF forms.
 - **AI-Powered:** Uses open-source, locally-run LLMs (Mistral) to extract data from natural language. No data ever needs to leave the local machine.
 - **Single Point of Entry:** Eliminates redundant data entry entirely.
+- **🌍 Multilingual:** Automatically detects and translates non-English inputs (French, Arabic, Spanish, and more) to English before processing, ensuring the output PDF is always in standardized English regardless of the responder's language.
 
 Open-Source (DPG): Built 100% with open-source tools to be a true Digital Public Good, freely available for any department to adopt and modify.
 
+## 🌍 Multilingual Support
+
+FireForm is used by first responders across UN international missions. Responders may record voice notes in their native language. FireForm automatically handles this:
+
+1. **Language detection** — the input language is detected automatically (e.g. French, Arabic, Spanish).
+2. **Translation** — non-English text is translated to English before the AI processes it.
+3. **Consistent output** — the final PDF is always generated in English, keeping the Master Schema consistent across all missions.
+
+**Supported languages:** Any language supported by Google Translate (100+ languages), including French, Arabic, Spanish, Portuguese, and more.
+
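+> **Note:** Translation uses the `deep-translator` library (Google Translate backend). No API key is required for typical usage volumes. Unlike the locally-run LLM step, translating non-English input requires network access and sends the input text to Google Translate, so that text does leave the local machine. Input detected as English is processed without calling the translation service, and if translation fails the original text is passed on unchanged.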
+
 ## 🤝 Code of Conduct
 
 We are committed to providing a friendly, safe, and welcoming environment for all. Please see our [Code of Conduct](CODE_OF_CONDUCT.md) for more information.

diff --git a/api/db/models.py b/api/db/models.py
@@ -15,4 +15,6 @@ class FormSubmission(SQLModel, table=True):
     template_id: int
     input_text: str
     output_pdf_path: str
+    # BCP-47 language code detected from the raw input (e.g. "fr", "ar", "en")
+    detected_language: str | None = Field(default=None)
     created_at: datetime = Field(default_factory=datetime.utcnow)
diff --git a/api/routes/forms.py b/api/routes/forms.py
@@ -11,15 +11,27 @@
 
 @router.post("/fill", response_model=FormFillResponse)
 def fill_form(form: FormFill, db: Session = Depends(get_db)):
+    # Fill the requested PDF template from the submitted text and persist a
+    # FormSubmission row that also records the detected input language.
+    # Raises AppError (404) when the template id is unknown.
-    if not get_template(db, form.template_id):
-        raise AppError("Template not found", status_code=404)
-
     fetched_template = get_template(db, form.template_id)
+    if not fetched_template:
+        raise AppError("Template not found", status_code=404)
 
     controller = Controller()
-    path = controller.fill_form(user_input=form.input_text, fields=fetched_template.fields, pdf_form_path=fetched_template.pdf_path)
 
-    submission = FormSubmission(**form.model_dump(), output_pdf_path=path)
-    return create_form(db, submission)
+    # extract_data runs the translation layer then the LLM
+    # NOTE(review): `extracted_data` is never used below; only
+    # `detected_language` is consumed. Confirm whether the extracted values
+    # were meant to be handed to the PDF-filling step.
+    extracted_data, detected_language = controller.extract_data(
+        user_input=form.input_text,
+        fields=fetched_template.fields,
+    )
 
+    # NOTE(review): fill_form still receives the raw, untranslated
+    # `form.input_text`. If fill_form performs its own extraction, the
+    # translation above does not influence the generated PDF and the LLM
+    # work is done twice. TODO confirm against FileManipulator.fill_form.
+    path = controller.fill_form(
+        user_input=form.input_text,
+        fields=fetched_template.fields,
+        pdf_form_path=fetched_template.pdf_path,
+    )
 
+    # The stored submission keeps the original input text alongside the
+    # output path and the detected source-language code.
+    submission = FormSubmission(
+        **form.model_dump(),
+        output_pdf_path=path,
+        detected_language=detected_language,
+    )
+    return create_form(db, submission)
diff --git a/api/schemas/forms.py b/api/schemas/forms.py
@@ -1,4 +1,5 @@
 from pydantic import BaseModel
+from typing import Optional
 
 class FormFill(BaseModel):
     template_id: int
@@ -10,6 +11,8 @@ class FormFillResponse(BaseModel):
     template_id: int
     input_text: str
     output_pdf_path: str
+    # BCP-47 code of the detected source language (e.g. "fr", "ar", "en")
+    detected_language: Optional[str] = None
 
     class Config:
         from_attributes = True
diff --git a/requirements.txt b/requirements.txt
@@ -9,4 +9,6 @@ sqlmodel
 pytest
 httpx
 numpy<2
-ollama
+ollama
+deep-translator
+langdetect
diff --git a/src/controller.py b/src/controller.py
@@ -4,8 +4,17 @@ class Controller:
     def __init__(self):
         self.file_manipulator = FileManipulator()
 
+    def extract_data(self, user_input: str, fields: dict, existing_data: dict | None = None) -> tuple:
+        """
+        Extract form data from user_input.
+
+        Thin pass-through to ``FileManipulator.extract_data``; all arguments
+        are forwarded positionally and its result is returned unchanged.
+
+        Args:
+            user_input: Raw text supplied by the user.
+            fields: Target fields to extract.
+                NOTE(review): annotated ``dict`` here but ``list`` on
+                ``fill_form`` below -- confirm the real type.
+            existing_data: Optional previously extracted values; ``None``
+                is forwarded as-is.
+
+        Returns:
+            extracted_data (dict), detected_language (str)
+        """
+        return self.file_manipulator.extract_data(user_input, fields, existing_data)
+
     def fill_form(self, user_input: str, fields: list, pdf_form_path: str):
         return self.file_manipulator.fill_form(user_input, fields, pdf_form_path)
-    
+
     def create_template(self, pdf_path: str):
         return self.file_manipulator.create_template(pdf_path)
diff --git a/src/file_manipulator.py b/src/file_manipulator.py
@@ -1,6 +1,7 @@
 import os
 from src.filler import Filler
 from src.llm import LLM
+from src.translator import Translator
 from commonforms import prepare_form
 
 
@@ -17,6 +18,34 @@ def create_template(self, pdf_path: str):
         prepare_form(pdf_path, template_path)
         return template_path
 
+    def extract_data(self, user_input: str, fields: dict, existing_data: dict | None = None) -> tuple:
+        """
+        Translates the raw user input to English (if needed), then runs the LLM
+        to extract data from the translated text.
+
+        Args:
+            user_input: Raw input text, in any language.
+            fields: Target fields handed to the LLM as ``target_fields``.
+            existing_data: Optional data passed to the LLM as ``json``;
+                ``None`` is replaced with an empty dict.
+
+        Returns:
+            A 2-tuple ``(extracted_data, detected_language)``:
+            extracted_data: Whatever ``llm.get_data()`` returns
+                (presumably a dict of field values -- confirm in src/llm.py).
+            detected_language (str): Code of the source language reported
+                by the translator (e.g. "fr", "ar", "en").
+        """
+        print("[1] Starting extraction process...")
+        if existing_data is None:
+            existing_data = {}
+
+        # --- Translation step (Issue #107) ---
+        # translate_to_english returns the text to feed the LLM plus the
+        # detected source-language code.
+        translator = Translator()
+        translated_input, detected_language = translator.translate_to_english(user_input)
+        if detected_language != "en":
+            # NOTE(review): this message is printed for any non-"en"
+            # detection; it does not by itself prove the translation call
+            # succeeded -- confirm against Translator's failure handling.
+            print(
+                f"[TRANSLATION] Detected language: '{detected_language}'. "
+                "Input translated to English before LLM processing."
+            )
+
+        # The LLM only ever sees the (possibly translated) text.
+        llm = LLM(transcript_text=translated_input, target_fields=fields, json=existing_data)
+        llm.main_loop()
+        return llm.get_data(), detected_language
+
     def fill_form(self, user_input: str, fields: list, pdf_form_path: str):
         """
         It receives the raw data, runs the PDF filling logic,

diff --git a/src/llm.py b/src/llm.py
@@ -35,6 +35,7 @@ def build_prompt(self, current_field):
             only a single string containing the identified value for the JSON field. 
             If the field name is plural, and you identify more than one possible value in the text, return both separated by a ";".
             If you don't identify the value in the provided text, return "-1".
+            Note: The following text has been pre-translated to English from the original input language to ensure consistency.
             ---
             DATA:
             Target JSON field to find in text: {current_field}

diff --git a/src/translator.py b/src/translator.py
@@ -0,0 +1,81 @@
+"""
+Translator module for FireForm — Issue #107.
+
+Detects the language of the input text and translates it to English
+before the LLM processes the form fields.  This ensures the Master
+Schema is always produced in English regardless of the responder's
+native language.
+
+Supported back-end: Google Translate (via deep-translator, no API key
+required for short texts).
+"""
+
+from __future__ import annotations
+
+
+def _detect_language(text: str) -> str:
+    """Return the language code detected for *text* (e.g. 'fr', 'ar', 'en').
+
+    Detection is delegated to ``langdetect``.  The detector is seeded before
+    every call because langdetect is non-deterministic by default and can
+    return different codes for the same short or ambiguous input.
+
+    Falls back to ``'en'`` if detection fails for any reason (langdetect
+    missing, text without detectable features, ...) so that the pipeline
+    can always continue.  The failure is logged rather than swallowed,
+    because a wrong ``'en'`` means the text is passed on untranslated.
+
+    Args:
+        text: Raw input string in any language.
+
+    Returns:
+        The code reported by langdetect, or ``'en'`` on failure.
+    """
+    try:
+        from langdetect import DetectorFactory, detect  # type: ignore
+
+        # Fixed seed => the same input always yields the same language code.
+        DetectorFactory.seed = 0
+        return detect(text)
+    except Exception as exc:
+        # Deliberate best-effort: never block the pipeline on detection.
+        print(
+            f"[WARNING] Language detection failed: {exc!r}. "
+            "Assuming English input."
+        )
+        return "en"
+
+
+class Translator:
+    """Lightweight translation wrapper for the FireForm pipeline.
+
+    Example usage::
+
+        translator = Translator()
+        english_text, lang_code = translator.translate_to_english(
+            "Le nom de l'employé est Jean Dupont."
+        )
+        # english_text -> "The employee's name is Jean Dupont."
+        # lang_code    -> "fr"
+    """
+
+    # langdetect reports lower-case region tags, while the Google backend of
+    # deep-translator expects the region in upper case.
+    _GOOGLE_CODE_OVERRIDES = {"zh-cn": "zh-CN", "zh-tw": "zh-TW"}
+
+    def translate_to_english(self, text: str) -> tuple[str, str]:
+        """Translate *text* to English and return ``(translated_text, source_lang_code)``.
+
+        If the detected language is already English (``"en"``), or the
+        input is empty / whitespace only, the original text is returned
+        as-is without calling a translation service.
+
+        If the translation service fails or returns nothing, a warning is
+        printed and the original text is returned together with the
+        detected language code, so the pipeline is never blocked.
+
+        Args:
+            text: Raw input string (may be any language).
+
+        Returns:
+            A tuple of:
+            - ``translated_text`` (str) – the English version of *text*,
+              or *text* itself when no translation was needed or possible.
+            - ``source_lang`` (str) – code of the detected source language
+              (e.g. ``"fr"``, ``"ar"``, ``"en"``).
+        """
+        if not text or not text.strip():
+            return text, "en"
+
+        source_lang = _detect_language(text)
+
+        if source_lang == "en":
+            return text, "en"
+
+        try:
+            from deep_translator import GoogleTranslator  # type: ignore
+
+            google_code = self._GOOGLE_CODE_OVERRIDES.get(source_lang, source_lang)
+            try:
+                engine = GoogleTranslator(source=google_code, target="en")
+            except Exception:
+                # The detected code is not one the backend recognises;
+                # let Google auto-detect instead of giving up on translation.
+                engine = GoogleTranslator(source="auto", target="en")
+            translated = engine.translate(text)
+        except Exception as exc:
+            # If translation fails (e.g. network issue), log a warning and
+            # fall back to the original text so the pipeline is never blocked.
+            print(
+                f"[WARNING] Translation failed (source={source_lang}): {exc}. "
+                "Falling back to original text."
+            )
+            return text, source_lang
+
+        if not translated or not translated.strip():
+            # Never hand None / an empty string to the LLM in place of real input.
+            print(
+                f"[WARNING] Translation returned no text (source={source_lang}). "
+                "Falling back to original text."
+            )
+            return text, source_lang
+
+        return translated, source_lang
-Original file line number
+Diff line change
@@ Expand Up / @@ -9,4 +9,6 @@ sqlmodel @@
     pytest
     httpx
     numpy<2
-    ollama
+    ollama
+    deep-translator
+    langdetect