Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions api/db/repositories.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,33 @@
from sqlmodel import Session, select
from api.db.models import Template, FormSubmission

# Templates

# ── Templates ─────────────────────────────────────────────────

def create_template(session: Session, template: Template) -> Template:
    """Persist a new template and return the same instance.

    The instance is added to the session, committed, and then refreshed
    from the database before it is handed back to the caller.
    """
    session.add(template)
    session.commit()
    # Reload the row after the commit so the returned object reflects stored values.
    session.refresh(template)
    return template


def get_template(session: Session, template_id: int) -> Template | None:
    """Look up a template by its identifier; ``None`` when no row matches."""
    found = session.get(Template, template_id)
    return found

# Forms

def get_all_templates(session: Session, limit: int = 100, offset: int = 0) -> list[Template]:
    """Return one page of templates in a stable order.

    Args:
        session: Open database session used to run the query.
        limit: Maximum number of rows to return.
        offset: Number of rows to skip before the page starts.

    Returns:
        A list holding at most ``limit`` templates.
    """
    # Without ORDER BY the database is free to return rows in any order, so
    # consecutive pages could overlap or skip rows.
    statement = (
        select(Template)
        .order_by(Template.id)
        .offset(offset)
        .limit(limit)
    )
    # Build a real list so the value matches the declared return type.
    return list(session.exec(statement).all())


# ── Forms ─────────────────────────────────────────────────────

def create_form(session: Session, form: FormSubmission) -> FormSubmission:
    """Persist a form submission and return the same instance.

    Args:
        session: Open database session used for the write.
        form: Submission to store.

    Returns:
        ``form`` after it has been committed and refreshed from the database.
    """
    session.add(form)
    session.commit()
    # Reload the row after the commit so the returned object reflects stored values.
    session.refresh(form)
    return form


def get_form(session: Session, submission_id: int) -> FormSubmission | None:
    """Look up a form submission by its identifier; ``None`` when no row matches."""
    found = session.get(FormSubmission, submission_id)
    return found
20 changes: 19 additions & 1 deletion api/main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,25 @@
from fastapi import FastAPI
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from api.routes import templates, forms
from api.errors.base import AppError
from typing import Union

# Application instance that the middleware, error handler and routers below attach to.
app = FastAPI()

# CORS: every origin, HTTP method and request header is allowed.
# NOTE(review): wildcard origins look like a development setting — confirm before production use.
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_methods=["*"],
allow_headers=["*"],
)

@app.exception_handler(AppError)
def app_error_handler(request: Request, exc: AppError):
    """Translate an ``AppError`` into a JSON response.

    The response uses the error's own status code and a body of the form
    ``{"detail": <message>}``.
    """
    payload = {"detail": exc.message}
    return JSONResponse(status_code=exc.status_code, content=payload)

# Register the template and form route groups on the application.
app.include_router(templates.router)
app.include_router(forms.router)
205 changes: 198 additions & 7 deletions api/routes/forms.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,216 @@
import os
from fastapi import APIRouter, Depends
from fastapi.responses import FileResponse
from sqlmodel import Session
from api.deps import get_db
from api.schemas.forms import FormFill, FormFillResponse
from api.db.repositories import create_form, get_template
from api.schemas.forms import FormFill, FormFillResponse, BatchFormFill, BatchFormFillResponse, BatchResultItem
from api.db.repositories import create_form, get_template, get_form
from api.db.models import FormSubmission
from api.errors.base import AppError
from src.controller import Controller
from src.llm import LLM
from src.filler import Filler

# Every route declared on this router is served under the /forms prefix.
router = APIRouter(prefix="/forms", tags=["forms"])


@router.post("/fill", response_model=FormFillResponse)
def fill_form(form: FormFill, db: Session = Depends(get_db)):
    """Fill one template's PDF from free-text input and record the submission.

    Args:
        form: Request body. ``template_id`` and ``input_text`` are read here,
            and the whole model is copied into the stored submission.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        The persisted ``FormSubmission``.

    Raises:
        AppError: 404 when the template row or its PDF file is missing,
            503 when the fill step raises ``ConnectionError``, and 500 when
            filling fails for another reason or produces no output file.
    """
    # Single lookup: the same object serves the existence check and the fill.
    template = get_template(db, form.template_id)
    if not template:
        raise AppError("Template not found", status_code=404)

    # Validate PDF exists on disk (#235)
    if not os.path.exists(template.pdf_path):
        raise AppError(
            f"Template PDF not found on disk: {template.pdf_path}. "
            "Please re-upload the template.",
            status_code=404
        )

    # Templates may store fields as a mapping or as a plain list of names;
    # the controller is handed the names either way.
    if isinstance(template.fields, dict):
        fields_list = list(template.fields.keys())
    else:
        fields_list = template.fields

    try:
        controller = Controller()
        path = controller.fill_form(
            user_input=form.input_text,
            fields=fields_list,
            pdf_form_path=template.pdf_path
        )
    except AppError:
        # Already carries its own status code; do not rewrap it as a 500.
        raise
    except ConnectionError as exc:
        raise AppError(
            "Could not connect to Ollama. Make sure ollama serve is running.",
            status_code=503
        ) from exc
    except Exception as exc:
        raise AppError(f"PDF filling failed: {str(exc)}", status_code=500) from exc

    if not path:
        raise AppError(
            "PDF generation failed — no output file was produced. "
            "Check that the PDF template is a valid fillable form and Ollama is running.",
            status_code=500
        )

    if not os.path.exists(path):
        raise AppError(
            f"PDF was generated but file not found at: {path}",
            status_code=500
        )

    submission = FormSubmission(
        **form.model_dump(),
        output_pdf_path=path
    )
    return create_form(db, submission)


@router.post("/fill/batch", response_model=BatchFormFillResponse)
def fill_batch(batch: BatchFormFill, db: Session = Depends(get_db)):
    """
    Batch multi-template form filling — closes #156.

    KEY DESIGN: LLM extraction runs ONCE for the entire batch.
    All templates share the same extracted JSON — no redundant Ollama calls.

    Flow:
    1. Validate all templates exist upfront
    2. Merge ALL fields from ALL templates into one superset
    3. ONE LLM call extracts all values from transcript
    4. Each template PDF filled using its relevant subset of extracted values

    Args:
        batch: Request body providing ``template_ids`` and ``input_text``.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        A ``BatchFormFillResponse`` with overall counts and one
        ``BatchResultItem`` per template. A failure while filling a single
        template is reported in its item and does not abort the batch.

    Raises:
        AppError: 400 when ``template_ids`` is empty, 404 when a template row
            or its PDF file is missing, 503 when extraction raises
            ``ConnectionError``, and 500 when extraction fails otherwise.
    """
    if not batch.template_ids:
        raise AppError("template_ids must not be empty", status_code=400)

    # ── Step 1: Validate all templates upfront ────────────────
    templates = []
    for tid in batch.template_ids:
        tpl = get_template(db, tid)
        if not tpl:
            raise AppError(f"Template {tid} not found", status_code=404)
        if not os.path.exists(tpl.pdf_path):
            raise AppError(
                f"Template '{tpl.name}' (id={tid}) PDF not found on disk. "
                "Please re-upload the template.",
                status_code=404
            )
        templates.append(tpl)

    print(f"[BATCH] Starting batch fill for {len(templates)} template(s)...")
    print(f"[BATCH] Templates: {[t.name for t in templates]}")

    # ── Step 2: Merge ALL fields from ALL templates into superset
    # One LLM call covers every field needed across all templates
    merged_fields = {}
    for tpl in templates:
        if isinstance(tpl.fields, dict):
            merged_fields.update(tpl.fields)
        else:
            # List-style templates carry names only; use the name as its own
            # label. ``or ()`` keeps a template without fields from raising here.
            merged_fields.update((name, name) for name in (tpl.fields or ()))

    print(f"[BATCH] Merged superset: {len(merged_fields)} unique field(s) across all templates")

    # ── Step 3: ONE LLM call for entire batch ─────────────────
    print("[BATCH] Running single LLM extraction (no redundant calls)...")
    try:
        llm = LLM(
            transcript_text=batch.input_text,
            target_fields=merged_fields
        )
        llm.main_loop()
        extracted_json = llm.get_data()
        print(f"[BATCH] Extraction complete — {len(extracted_json)} fields extracted")
    except AppError:
        # Already carries its own status code; do not rewrap it as a 500.
        raise
    except ConnectionError as e:
        raise AppError(
            "Could not connect to Ollama. Make sure ollama serve is running.",
            status_code=503
        ) from e
    except Exception as e:
        raise AppError(f"LLM extraction failed: {str(e)}", status_code=500) from e

    # ── Step 4: Fill each PDF with pre-extracted data ─────────
    # No new LLM calls — just PDF writing per template
    results = []
    success_count = 0
    fail_count = 0
    filler = Filler()

    for tpl in templates:
        # Read identifying values before any work so the failure branch never
        # has to touch the ORM object while the session is in a failed state.
        tpl_id = tpl.id
        tpl_name = tpl.name
        print(f"[BATCH] Filling PDF: '{tpl_name}' (id={tpl_id})...")
        try:
            # Subset extracted data to only this template's fields
            tpl_field_keys = list(tpl.fields.keys()) if isinstance(tpl.fields, dict) else tpl.fields
            tpl_data = {k: extracted_json.get(k) for k in tpl_field_keys}

            # Fill PDF directly — no LLM call
            output_path = filler.fill_form_with_data(
                pdf_form=tpl.pdf_path,
                data=tpl_data
            )

            if not output_path or not os.path.exists(output_path):
                raise RuntimeError("No output file produced")

            submission = FormSubmission(
                template_id=tpl_id,
                input_text=batch.input_text,
                output_pdf_path=output_path
            )
            saved = create_form(db, submission)

            results.append(BatchResultItem(
                template_id=tpl_id,
                template_name=tpl_name,
                success=True,
                submission_id=saved.id,
                download_url=f"/forms/download/{saved.id}",
                error=None
            ))
            success_count += 1
            print(f"[BATCH] ✅ '{tpl_name}' done (submission #{saved.id})")

        except Exception as e:
            # If the failure came from the commit, the session rejects further
            # work until it is rolled back; reset it so the remaining
            # templates can still be saved.
            db.rollback()
            fail_count += 1
            results.append(BatchResultItem(
                template_id=tpl_id,
                template_name=tpl_name,
                success=False,
                submission_id=None,
                download_url=None,
                error=str(e)
            ))
            print(f"[BATCH] ✗ '{tpl_name}' failed: {e}")

    print(f"[BATCH] Complete — {success_count} succeeded, {fail_count} failed")

    return BatchFormFillResponse(
        total=len(templates),
        succeeded=success_count,
        failed=fail_count,
        results=results
    )


@router.get("/{submission_id}", response_model=FormFillResponse)
def get_submission(submission_id: int, db: Session = Depends(get_db)):
    """Return the stored submission with the given id, or fail with a 404 ``AppError``."""
    record = get_form(db, submission_id)
    if record:
        return record
    raise AppError("Submission not found", status_code=404)


@router.get("/download/{submission_id}")
def download_filled_pdf(submission_id: int, db: Session = Depends(get_db)):
    """Send the filled PDF that belongs to a stored submission.

    Args:
        submission_id: Identifier of the submission whose PDF is requested.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        A ``FileResponse`` with media type ``application/pdf``; the download
        name is the base name of the stored output path.

    Raises:
        AppError: 404 when the submission does not exist, or when its output
            path is empty or does not point at a regular file.
    """
    submission = get_form(db, submission_id)
    if not submission:
        raise AppError("Submission not found", status_code=404)

    file_path = submission.output_pdf_path
    # An empty path would make the filesystem check raise TypeError, and a
    # directory would pass an existence check yet cannot be served as a file.
    if not file_path or not os.path.isfile(file_path):
        raise AppError("PDF file not found on server", status_code=404)

    return FileResponse(
        path=file_path,
        media_type="application/pdf",
        filename=os.path.basename(file_path)
    )
91 changes: 82 additions & 9 deletions api/routes/templates.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,89 @@
from fastapi import APIRouter, Depends
import os
import shutil
import uuid
from fastapi import APIRouter, Depends, UploadFile, File, Form
from sqlmodel import Session
from api.deps import get_db
from api.schemas.templates import TemplateCreate, TemplateResponse
from api.db.repositories import create_template
from api.schemas.templates import TemplateResponse
from api.db.repositories import create_template, get_all_templates
from api.db.models import Template
from src.controller import Controller
from api.errors.base import AppError

# Every route declared on this router is served under the /templates prefix.
router = APIRouter(prefix="/templates", tags=["templates"])

# Save directly into src/inputs/ — stable location, won't get wiped
# The path is relative, so it resolves against the process's working directory.
TEMPLATES_DIR = os.path.join("src", "inputs")
# Runs at import time; exist_ok=True makes this a no-op when the directory already exists.
os.makedirs(TEMPLATES_DIR, exist_ok=True)


def _prettify_field_name(internal_name: str) -> str:
    """Derive a readable label from an internal PDF field name.

    Examples: ``"JobTitle"`` becomes ``"Job Title"`` and ``"DATE7_af_date"``
    becomes ``"Date7"``.
    """
    import re

    label = re.sub(r'([a-z])([A-Z])', r'\1 \2', internal_name)
    label = re.sub(r'_af_.*$', '', label)  # strip "_af_date" style suffix
    return label.replace('_', ' ').strip().title()


@router.post("/create", response_model=TemplateResponse)
async def create(
    name: str = Form(...),
    file: UploadFile = File(...),
    db: Session = Depends(get_db)
):
    """Store an uploaded PDF template and register it in the database.

    Args:
        name: Display name for the template.
        file: Uploaded PDF; its client-supplied filename is sanitised before use.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        The persisted ``Template``. ``fields`` maps each internal PDF field
        name to a label, or is an empty list when field extraction failed.

    Raises:
        AppError: 400 when the upload has no filename ending in ``.pdf``
            (compared case-insensitively).
    """
    # NOTE(review): the file copy and PDF parsing below are blocking calls
    # inside an ``async def`` route — confirm that is acceptable here.

    # The filename comes from the client: keep only its last path component
    # so values such as "../../x.pdf" cannot escape TEMPLATES_DIR.
    original_name = os.path.basename((file.filename or "").replace("\\", "/"))

    # Validate PDF
    if not original_name.lower().endswith(".pdf"):
        raise AppError("Only PDF files are allowed", status_code=400)

    # Save uploaded file with unique name into src/inputs/
    unique_name = f"{uuid.uuid4().hex}_{original_name}"
    save_path = os.path.join(TEMPLATES_DIR, unique_name)

    with open(save_path, "wb") as out_file:
        shutil.copyfileobj(file.file, out_file)

    # Extract fields using pypdf.
    # Best effort: a PDF whose fields cannot be read is still registered,
    # just without any fields.
    try:
        from pypdf import PdfReader

        # Read real field names directly from the original PDF
        reader = PdfReader(save_path)
        raw_fields = reader.get_fields() or {}

        fields = {}
        for internal_name, field_data in raw_fields.items():
            # Use /TU tooltip if available, otherwise prettify the /T name
            label = field_data.get("/TU") if isinstance(field_data, dict) else None
            fields[internal_name] = label or _prettify_field_name(internal_name)

    except Exception as e:
        print(f"Field extraction failed: {e}")
        fields = []

    # Save to DB
    tpl = Template(name=name, pdf_path=save_path, fields=fields)
    return create_template(db, tpl)


@router.get("", response_model=list[TemplateResponse])
def list_templates(
    limit: int = 100,
    offset: int = 0,
    db: Session = Depends(get_db)
):
    """Return one page of stored templates, selected by ``limit`` and ``offset``."""
    page = get_all_templates(db, limit=limit, offset=offset)
    return page


@router.get("/{template_id}", response_model=TemplateResponse)
def get_template_by_id(
    template_id: int,
    db: Session = Depends(get_db)
):
    """Return the template with the given id, or fail with a 404 ``AppError``."""
    # Imported here because the module-level import does not include it.
    from api.db.repositories import get_template

    found = get_template(db, template_id)
    if found:
        return found
    raise AppError("Template not found", status_code=404)
Loading