Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,22 @@ The result is hours of time saved per shift, per firefighter.
- **Agnostic:** Works with any department's existing fillable PDF forms.
- **AI-Powered:** Uses open-source, locally-run LLMs (Mistral) to extract data from natural language. No data ever needs to leave the local machine.
- **Single Point of Entry:** Eliminates redundant data entry entirely.
- **🌍 Multilingual:** Automatically detects and translates non-English inputs (French, Arabic, Spanish, and more) to English before processing, ensuring the output PDF is always in standardized English regardless of the responder's language.

- **Open-Source (DPG):** Built 100% with open-source tools to be a true Digital Public Good, freely available for any department to adopt and modify.

## 🌍 Multilingual Support

FireForm is used by first responders across UN international missions. Responders may record voice notes in their native language. FireForm automatically handles this:

1. **Language detection** — the input language is detected automatically (e.g. French, Arabic, Spanish).
2. **Translation** — non-English text is translated to English before the AI processes it.
3. **Consistent output** — the final PDF is always generated in English, keeping the Master Schema consistent across all missions.

**Supported languages:** Any language that can be both detected by `langdetect` (55 languages) and translated by Google Translate, including French, Arabic, Spanish, Portuguese, and more.

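> **Note:** Translation uses the `deep-translator` library (Google Translate backend), and language detection uses `langdetect`. No API key is required for typical usage volumes. Non-English input is sent to Google Translate over the network; if translation fails (for example, when offline), FireForm falls back to processing the original, untranslated text.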

## 🤝 Code of Conduct

We are committed to providing a friendly, safe, and welcoming environment for all. Please see our [Code of Conduct](CODE_OF_CONDUCT.md) for more information.
Expand Down
2 changes: 2 additions & 0 deletions api/db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,6 @@ class FormSubmission(SQLModel, table=True):
template_id: int
input_text: str
output_pdf_path: str
# BCP-47 language code detected from the raw input (e.g. "fr", "ar", "en")
detected_language: str | None = Field(default=None)
created_at: datetime = Field(default_factory=datetime.utcnow)
24 changes: 18 additions & 6 deletions api/routes/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,27 @@

@router.post("/fill", response_model=FormFillResponse)
def fill_form(form: FormFill, db: Session = Depends(get_db)):
    """Fill a PDF template from free-text input and record the submission.

    Looks up the template once, runs the translation-aware extraction to
    learn the input language, fills the PDF, and stores a ``FormSubmission``
    that includes the detected language.

    Raises:
        AppError: with status 404 when ``form.template_id`` matches no template.
    """
    fetched_template = get_template(db, form.template_id)
    if not fetched_template:
        raise AppError("Template not found", status_code=404)

    controller = Controller()

    # extract_data runs the translation layer then the LLM.
    # NOTE(review): the extracted values are not forwarded to fill_form below,
    # which still receives the raw (untranslated) input text. Only the
    # detected language is used here -- confirm whether fill_form should
    # consume the translated text / extracted data instead.
    _extracted_data, detected_language = controller.extract_data(
        user_input=form.input_text,
        fields=fetched_template.fields,
    )

    path = controller.fill_form(
        user_input=form.input_text,
        fields=fetched_template.fields,
        pdf_form_path=fetched_template.pdf_path,
    )

    submission = FormSubmission(
        **form.model_dump(),
        output_pdf_path=path,
        detected_language=detected_language,
    )
    return create_form(db, submission)
3 changes: 3 additions & 0 deletions api/schemas/forms.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pydantic import BaseModel
from typing import Optional

class FormFill(BaseModel):
template_id: int
Expand All @@ -10,6 +11,8 @@ class FormFillResponse(BaseModel):
template_id: int
input_text: str
output_pdf_path: str
# BCP-47 code of the detected source language (e.g. "fr", "ar", "en")
detected_language: Optional[str] = None

class Config:
from_attributes = True
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ sqlmodel
pytest
httpx
numpy<2
ollama
ollama
deep-translator
langdetect
11 changes: 10 additions & 1 deletion src/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,17 @@ class Controller:
def __init__(self):
self.file_manipulator = FileManipulator()

def extract_data(self, user_input: str, fields: dict, existing_data: dict = None):
    """Delegate data extraction to the file manipulator.

    Args:
        user_input: Raw text supplied by the user.
        fields: Target fields to extract.
        existing_data: Optional previously extracted data, passed through
            unchanged (``None`` when omitted).

    Returns:
        Whatever ``file_manipulator.extract_data`` returns; callers unpack
        it as ``(extracted_data, detected_language)``.
    """
    manipulator = self.file_manipulator
    outcome = manipulator.extract_data(user_input, fields, existing_data)
    return outcome

def fill_form(self, user_input: str, fields: list, pdf_form_path: str):
    """Delegate PDF filling to the file manipulator.

    Passes ``user_input``, ``fields`` and ``pdf_form_path`` through
    positionally and returns the manipulator's result unchanged.
    """
    manipulator = self.file_manipulator
    filled = manipulator.fill_form(user_input, fields, pdf_form_path)
    return filled

def create_template(self, pdf_path: str):
    """Delegate template creation for *pdf_path* to the file manipulator.

    Returns the manipulator's result unchanged.
    """
    manipulator = self.file_manipulator
    template = manipulator.create_template(pdf_path)
    return template
29 changes: 29 additions & 0 deletions src/file_manipulator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
from src.filler import Filler
from src.llm import LLM
from src.translator import Translator
from commonforms import prepare_form


Expand All @@ -17,6 +18,34 @@ def create_template(self, pdf_path: str):
prepare_form(pdf_path, template_path)
return template_path

def extract_data(self, user_input: str, fields: dict, existing_data: dict = None):
    """
    Translate the raw user input to English (if needed), then run the LLM
    on the translated text to extract the field values.

    Args:
        user_input: Raw text supplied by the user, in any language.
        fields: Target fields handed to the LLM.
        existing_data: Optional data passed to the LLM as its starting
            JSON; a fresh empty dict is used when omitted.

    Returns:
        A 2-tuple ``(extracted_data, detected_language)``:
            extracted_data: Result of ``llm.get_data()`` after the main loop.
            detected_language (str): Language code of the source text as
                reported by the translator (e.g. "fr", "ar", "en").
    """
    print("[1] Starting extraction process...")
    if existing_data is None:
        # Build a new dict per call rather than sharing a mutable default.
        existing_data = {}

    # --- Translation step (Issue #107) ---
    translator = Translator()
    translated_input, detected_language = translator.translate_to_english(user_input)
    if detected_language != "en":
        print(
            f"[TRANSLATION] Detected language: '{detected_language}'. "
            "Input translated to English before LLM processing."
        )

    # The LLM only ever sees the (possibly translated) English text.
    llm = LLM(transcript_text=translated_input, target_fields=fields, json=existing_data)
    llm.main_loop()
    return llm.get_data(), detected_language

def fill_form(self, user_input: str, fields: list, pdf_form_path: str):
"""
It receives the raw data, runs the PDF filling logic,
Expand Down
1 change: 1 addition & 0 deletions src/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def build_prompt(self, current_field):
only a single string containing the identified value for the JSON field.
If the field name is plural, and you identify more than one possible value in the text, return both separated by a ";".
If you don't identify the value in the provided text, return "-1".
Note: The following text has been pre-translated to English from the original input language to ensure consistency.
---
DATA:
Target JSON field to find in text: {current_field}
Expand Down
81 changes: 81 additions & 0 deletions src/translator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""
Translator module for FireForm — Issue #107.

Detects the language of the input text and translates it to English
before the LLM processes the form fields. This ensures the Master
Schema is always produced in English regardless of the responder's
native language.

Supported back-end: Google Translate (via deep-translator, no API key
required for short texts).
"""

from __future__ import annotations


def _detect_language(text: str) -> str:
    """Return the language code langdetect reports for *text*.

    langdetect returns ISO 639-1 style codes (e.g. ``'fr'``, ``'ar'``,
    ``'en'``) plus ``'zh-cn'`` / ``'zh-tw'`` for Chinese.

    Falls back to ``'en'`` if langdetect is not installed or detection
    fails (for example on text with no alphabetic features) so that the
    pipeline can always continue.
    """
    try:
        from langdetect import DetectorFactory, detect  # type: ignore
    except ImportError:
        # Without the detector we cannot tell; assume English and carry on.
        return "en"

    # langdetect is randomized by default and can give different answers
    # for the same short text; pinning the seed makes results repeatable.
    DetectorFactory.seed = 0

    try:
        return detect(text)
    except Exception:
        # Deliberate best-effort: any detection error means "treat as English".
        return "en"


class Translator:
    """Lightweight translation wrapper for the FireForm pipeline.

    Example usage::

        translator = Translator()
        english_text, lang_code = translator.translate_to_english(
            "Le nom de l'employé est Jean Dupont."
        )
        # english_text -> "The employee's name is Jean Dupont."
        # lang_code    -> "fr"
    """

    # langdetect and deep-translator's GoogleTranslator disagree on a few
    # codes; GoogleTranslator rejects the langdetect spelling outright, which
    # previously forced these languages onto the untranslated fallback path.
    _GOOGLE_CODE_OVERRIDES = {
        "zh-cn": "zh-CN",
        "zh-tw": "zh-TW",
        "he": "iw",
    }

    @classmethod
    def _to_google_code(cls, lang_code: str) -> str:
        """Map a detected language code to the code GoogleTranslator expects."""
        return cls._GOOGLE_CODE_OVERRIDES.get(lang_code, lang_code)

    def translate_to_english(self, text: str) -> tuple[str, str]:
        """Translate *text* to English and return ``(translated_text, source_lang_code)``.

        If the detected language is already English (``"en"``), or *text*
        is empty/whitespace, the original text is returned as-is without
        calling a translation service.

        If translation fails or yields no usable text, a warning is printed
        and the original text is returned together with the detected code,
        so the pipeline is never blocked.

        Args:
            text: Raw input string (may be any language).

        Returns:
            A tuple of:
            - ``translated_text`` (str) – the English version of *text*
              (or *text* itself on fallback).
            - ``source_lang`` (str) – code of the detected source
              language (e.g. ``"fr"``, ``"ar"``, ``"en"``).
        """
        if not text or not text.strip():
            return text, "en"

        source_lang = _detect_language(text)

        if source_lang == "en":
            return text, "en"

        try:
            from deep_translator import GoogleTranslator  # type: ignore

            translated = GoogleTranslator(
                source=self._to_google_code(source_lang), target="en"
            ).translate(text)
        except Exception as exc:
            # If translation fails (e.g. network issue), log a warning and
            # fall back to the original text so the pipeline is never blocked.
            print(
                f"[WARNING] Translation failed (source={source_lang}): {exc}. "
                "Falling back to original text."
            )
            return text, source_lang

        if not isinstance(translated, str) or not translated.strip():
            # Never hand None/blank text to the LLM; keep the original input.
            print(
                f"[WARNING] Translation returned no text (source={source_lang}). "
                "Falling back to original text."
            )
            return text, source_lang

        return translated, source_lang
Loading