diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 02c9240..3f17f9c 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -4,7 +4,7 @@ User-facing overview, screenshots, and quick start: [README.md](README.md).
 
 GrillKit is an AI-powered technical interview trainer. The stack is **FastAPI** (HTTP + WebSocket), **SQLAlchemy** (SQLite), **Alembic** (schema and data migrations), **Jinja2** templates, and **OpenAI-compatible** plus **faster-whisper** adapters in `ai/`. Code is organized **by feature** (`interview/`, `theory/`, `coding/`, `speech/`, `question_voice/`, `platform/`) with cross-cutting code in `shared/`.
 
-**Session orchestration** lives in `interview/`: setup, dashboard, session shell (`Interview`), page composition, phase order, completion, and `selection_spec` v2 (`session_mode`). **Theory flow** lives in `theory/`: questions, tasks, timer, WebSocket/audio submit, and AI evaluation. **Coding flow** lives in `coding/`: YAML task banks, Monaco UI, Judge0 Run attempts, WebSocket submit, and AI evaluation. The interview shell does not own section tasks; `InterviewRead` composes theory task rows at read time via `theory_sections` + `answers`.
+**Session orchestration** lives in `interview/`: setup, dashboard, session shell (`Interview`), page composition, phase order, completion, results hub, and `selection_spec` v2 (`session_mode`). **Theory flow** lives in `theory/`: questions, tasks, timer, WebSocket/audio submit, AI evaluation, and post-session review. **Coding flow** lives in `coding/`: YAML task banks, Monaco UI, Judge0 Run attempts, WebSocket submit, AI evaluation, and post-session review. The interview shell does not own section tasks; `InterviewRead` composes theory task rows at read time via `theory_sections` + `answers`, and coding context from `coding_sections` + `coding_tasks`.
 
 Within each feature: transport in `api/`, orchestration in `services/`, Pydantic read models in `schemas/` (where present), persistence in `repositories/`. Domain layers use frozen aggregates and value objects separate from ORM and DTOs. Transactions use `InterviewUnitOfWork` / `TheoryUnitOfWork` extending `shared/infrastructure/uow.py`. APIs do not expose SQLAlchemy models on the wire.
 
@@ -29,10 +29,14 @@ grillkit/
 │   │   ├── questions.py        # YAML theory question loader (data/questions/)
 │   │   ├── coding.py           # YAML coding task loader (data/coding/)
 │   │   ├── locales.py          # SUPPORTED_LOCALES, normalize_locale()
+│   │   ├── structured_evaluation.py  # Shared LLM JSON parse helpers
+│   │   ├── evaluation_models.py      # Section/session evaluation DTOs
+│   │   ├── task_timer.py             # Per-round timer helpers
 │   │   ├── infrastructure/
 │   │   │   ├── database.py     # engine, SessionLocal, DATABASE_URL env, run_migrations()
-│   │   │   ├── models.py       # Interview, TheorySection, Answer (theory tasks) ORM
+│   │   │   ├── models.py       # Interview, TheorySection, Answer, CodingSection, CodingTask, CodeRunAttempt
 │   │   │   ├── audio_wav.py    # Canonical mono 16 kHz WAV validation
+│   │   │   ├── hf_hub_runtime.py, hf_download_progress.py, artifact_*
 │   │   │   └── uow.py          # Base UnitOfWork: session, commit, rollback
 │   │   └── repositories/
 │   │       └── base.py         # Repository[T], SqlAlchemyRepository[T]
@@ -73,13 +77,16 @@ grillkit/
 │   │   │   ├── sections.py     # Section registry and shared section DTOs
 │   │   │   ├── evaluation_aggregator.py
 │   │   │   ├── session_evaluator.py
-│   │   │   └── events.py
+│   │   │   ├── results_page.py # SessionResultsPageService (completed hub)
+│   │   │   ├── section_feedback.py, section_evaluation.py, scoring.py
+│   │   │   └── events.py       # Shared WS/NDJSON event types (theory + coding)
 │   │   └── api/
 │   │       ├── deps.py
 │   │       ├── dashboard.py    # GET /
-│   │       ├── setup.py        # GET/POST /setup
+│   │       ├── setup.py        # GET/POST /setup, cascaded options
 │   │       ├── setup_form.py
 │   │       ├── routes.py       # GET /interview/{id}, question-audio
+│   │       ├── results.py      # GET /interview/{id}/results, /theory, /coding (completed sessions)
 │   │       └── errors.py
 │   ├── coding/                 # Coding section (tasks, Judge0 runner, WS/API, evaluator)
 │   │   ├── domain/             # CodingSection, CodingTask, CodeRunAttempt aggregates
@@ -91,7 +98,7 @@ grillkit/
 │   │   │   ├── runner.py       # CodingRunnerService (public/hidden tests, compile-only)
 │   │   │   ├── run_execution.py, submission.py, navigation.py, state.py, page.py
 │   │   │   ├── judge0_client.py, judge0_config.py, harness.py
-│   │   │   ├── section.py, query.py
+│   │   │   ├── section.py, query.py, review.py
 │   │   │   └── evaluator/      # CodingEvaluatorService
 │   │   ├── api/
 │   │   │   ├── routes.py       # POST /coding/run, GET /coding/state, WS /coding/ws
@@ -106,7 +113,7 @@ grillkit/
 │   │   │   ├── creation.py     # TheorySectionCreationService
 │   │   │   ├── submission.py   # answer/timeout/audio orchestration
 │   │   │   ├── navigation.py, timer.py, evaluation_persistence.py
-│   │   │   ├── page.py, query.py, section.py
+│   │   │   ├── page.py, query.py, section.py, review.py
 │   │   │   └── evaluator/      # TheoryEvaluatorService
 │   │   └── api/
 │   │       ├── routes.py       # WS /theory/ws, POST /theory/audio-answer
@@ -136,10 +143,20 @@ grillkit/
 │   └── questions/              # YAML banks: {track}/{level}/{category}.yaml
 ├── alembic/                    # Schema and data migrations
 ├── alembic.ini
-├── docker-compose.yml          # app service only
+├── docker-compose.yml          # app (+ optional Judge0 profile `coding`)
 ├── docker-entrypoint.sh        # PUID/PGID, ensures data/db writable
 ├── Dockerfile                  # Multi-stage uv build → uvicorn
-└── tests/
+└── tests/                      # Mirrors app/ layout (see Tests)
+    ├── conftest.py, fakes.py
+    ├── helpers/                # Flat shared seeds (interview_seed, coding_seed, …)
+    ├── ai/, app/
+    ├── interview/{api,repositories,services/rules,services}/
+    ├── theory/{api,services,repositories,integration}/
+    ├── coding/{api,services,repositories}/
+    ├── speech/{api,services}/
+    ├── question_voice/{api,services}/
+    ├── platform/{api,services}/
+    └── shared/                 # + shared/infrastructure/
 ```
 
 ## HTTP Routes
@@ -149,7 +166,9 @@ grillkit/
 | GET | `/` | `interview/api/dashboard.py` | Interview history (last 20) |
 | GET | `/setup` | `interview/api/setup.py` | New interview form (redirects to `/config` if unset) |
 | POST | `/setup` | `interview/api/setup.py` | Create interview → redirect `/interview/{id}` |
-| GET | `/setup/options` | `interview/api/setup.py` | Cascaded JSON: tracks → levels → categories |
+| GET | `/setup/options` | `interview/api/setup.py` | Cascaded JSON: theory tracks → levels → categories |
+| GET | `/setup/coding-options` | `interview/api/setup.py` | Cascaded JSON: coding tracks → levels → categories |
+| GET | `/setup/coding-available` | `interview/api/setup.py` | JSON: whether coding modes are offered (Judge0 health) |
 | GET | `/config` | `platform/api/config.py` | AI provider configuration form |
 | POST | `/config` | `platform/api/config.py` | Test connection (via form dependency), then save |
 | POST | `/config/test` | `platform/api/config.py` | Test connection without saving |
@@ -160,10 +179,16 @@ grillkit/
 | GET | `/speech/model/options` | `speech/api/routes.py` | JSON size trade-off metadata |
 | GET | `/speech/tts/status` | `question_voice/api/routes.py` | Piper voice status (HTML fragment or JSON) when question voice is enabled |
 | POST | `/speech/tts/voice/download` | `question_voice/api/routes.py` | Start Piper voice download for configured `tts_voice_id` |
-| GET | `/interview/{interview_id}` | `interview/api/routes.py` | Session page (composed shell + theory context) |
+| GET | `/interview/{interview_id}` | `interview/api/routes.py` | Active session page (theory and/or coding by phase); completed sessions redirect to `/interview/{interview_id}/results` |
+| GET | `/interview/{interview_id}/results` | `interview/api/results.py` | Completed session hub: overall evaluation + section cards |
+| GET | `/interview/{interview_id}/theory` | `interview/api/results.py` | Theory review: chat history and section feedback (completed only) |
+| GET | `/interview/{interview_id}/coding` | `interview/api/results.py` | Coding review: per-task accordion with submits and feedback (completed only) |
 | GET | `/interview/{interview_id}/question-audio` | `interview/api/routes.py` | WAV for current theory task (`answer_id` query param) |
 | POST | `/interview/{interview_id}/theory/audio-answer` | `theory/api/routes.py` | Multipart WAV theory answer → NDJSON |
 | WS | `/interview/{interview_id}/theory/ws` | `theory/api/routes.py` | Real-time theory task submit, timeout, session complete |
+| POST | `/interview/{interview_id}/coding/run` | `coding/api/routes.py` | Run public tests via Judge0; persist `CodeRunAttempt` |
+| GET | `/interview/{interview_id}/coding/state` | `coding/api/routes.py` | Current coding task, progress, run history |
+| WS | `/interview/{interview_id}/coding/ws` | `coding/api/routes.py` | Coding submit, hidden tests, AI evaluation stream |
 | WS | `/interview/{interview_id}/dictation` | `speech/api/dictation.py` | PCM dictation: `start` → `ready`, audio chunks, `stop` → `final` |
 | — | `/static/*` | `main.py` | CSS, JS, and assets |
 
@@ -175,6 +200,7 @@ grillkit/
 | `*/api/deps.py` | Inject service **classes** via `Depends` (handlers call static methods) |
 | `interview/domain/` | Interview session shell aggregate, `SessionSelection`, serialization, domain exceptions |
 | `theory/domain/` | `TheorySection` / `TheoryTask` aggregates and theory-specific exceptions |
+| `coding/domain/` | `CodingSection` / `CodingTask` / `CodeRunAttempt` aggregates and coding exceptions |
 | `interview/schemas/` | Session read models (`InterviewRead`, dashboard/page context) |
 | `theory/schemas/` | Theory read models and WebSocket wire message types |
 | `interview/repositories/mappers.py` | Shell ORM ↔ domain; composes `InterviewRead` with theory tasks |
@@ -190,6 +216,9 @@ grillkit/
 | `shared/infrastructure/uow.py` | Base transaction boundary (session lifecycle) |
 | `interview/repositories/uow.py` | `InterviewUnitOfWork`: `uow.interviews`, `uow.theory_sections` |
 | `theory/repositories/uow.py` | `TheoryUnitOfWork`: theory section persistence |
+| `coding/repositories/uow.py` | `CodingUnitOfWork`: coding section + run attempts |
+| `interview/services/results_page.py` | Completed session hub context (`SessionResultsPageService`) |
+| `theory/services/review.py`, `coding/services/review.py` | Post-session section review page builders |
 | `shared/infrastructure/models.py` | ORM models |
 | `ai/` | Provider adapters (`AIProvider`, `SpeechTranscriber`) |
 | `shared/questions.py` | Read-only YAML question bank access |
@@ -221,18 +250,24 @@ question_voice/services/
   └── tts_cache.py ──► data/tts-cache/v2/{locale}/
 
 interview/services/
-  ├── creation.py ──► SessionCreationService, TheorySectionCreationService
-  ├── page.py ──► SessionPageService, TheoryPageService
+  ├── creation.py ──► SessionCreationService + section creation services
+  ├── page.py ──► SessionPageService, TheoryPageService, CodingPageService
   ├── completion.py ──► SessionCompletionService, SessionEvaluationAggregator
+  ├── results_page.py ──► completed hub; review links via section registry
   ├── query.py, dashboard.py, phases.py, sections.py
-  └── session_evaluator.py ──► session-level narrative (delegates section eval to theory)
+  └── session_evaluator.py ──► session-level narrative (theory + coding sections)
 
 theory/services/
   ├── planning.py ──► app/shared/questions.py (filters type=coding)
-  ├── creation.py, submission.py, navigation.py, timer.py
+  ├── creation.py, submission.py, navigation.py, timer.py, review.py
   ├── section.py ──► section registry hooks + prefetch
   └── evaluator/ ──► TheoryEvaluatorService (per-task + section narrative)
 
+coding/services/
+  ├── planning.py ──► app/shared/coding.py
+  ├── runner.py, submission.py, section.py, review.py
+  └── evaluator/ ──► CodingEvaluatorService (per-task + section narrative)
+
 interview/api/deps.py ──► platform/services/ai_context (yields AIProvider for WS/routes)
 
 platform/services/config.py ──► ai/factory, speech/schemas, data/config.json
@@ -243,7 +278,7 @@ speech/services/
   └── dictation.py ──► ai/speech_transcriber
 
 shared/infrastructure/uow.py
-  └── interview/repositories/, theory/repositories/ ──► shared/repositories/base, models
+  └── interview/, theory/, coding/ repositories ──► shared/repositories/base, models
 ```
 
 On GitHub, the same graph is also available as Mermaid (rendered on github.com only):
@@ -284,8 +319,20 @@ flowchart TB
     interview_creation[creation]
     interview_query[query]
     interview_completion[completion]
-    answer_processing
-    interview_evaluator[evaluator]
+    interview_phases[phases]
+    session_evaluator[session_evaluator]
+    results_page[results_page]
+  end
+  subgraph theory_svc [theory/services]
+    theory_submission[submission]
+    theory_evaluator[evaluator]
+    theory_review[review]
+  end
+  subgraph coding_svc [coding/services]
+    coding_submission[submission]
+    coding_runner[runner]
+    coding_evaluator[evaluator]
+    coding_review[review]
   end
   subgraph platform_svc [platform/services]
     config_service[config]
@@ -304,8 +351,12 @@ flowchart TB
   interview_svc --> uow
   interview_svc --> questions_mod[questions]
   interview_creation --> questions_mod
-  interview_completion --> interview_evaluator
-  answer_processing --> interview_evaluator
+  interview_completion --> session_evaluator
+  theory_submission --> theory_evaluator
+  coding_submission --> coding_runner
+  coding_submission --> coding_evaluator
+  results_page --> theory_review
+  results_page --> coding_review
   ai_context --> config_service
   ai_context --> ai_layer
   subgraph ai_layer [ai]
@@ -316,9 +367,14 @@ flowchart TB
   uow --> repos
   subgraph interview_repos [interview/repositories]
     interview_repo[interview]
-    answer_repo[answer]
     repo_mappers[mappers]
   end
+  subgraph theory_repos [theory/repositories]
+    theory_section_repo[theory_section]
+  end
+  subgraph coding_repos [coding/repositories]
+    coding_section_repo[coding_section]
+  end
   interview_repos --> models
   repo_mappers --> interview_domain
 ```
@@ -331,16 +387,21 @@ flowchart TB
 |---------|----------------|
 | Session shell aggregate | `app.interview.domain.entities.Interview` |
 | Theory section aggregate | `app.theory.domain.entities.TheorySection` |
+| Coding section aggregate | `app.coding.domain.entities.CodingSection` |
 | Interview ORM model | `shared.infrastructure.models.Interview` (table `interviews`) |
 | Theory task ORM | `shared.infrastructure.models.Answer` (table `answers`, FK `theory_section_id`) |
+| Coding task ORM | `shared.infrastructure.models.CodingTask` (table `coding_tasks`) |
+| Coding run snapshot ORM | `shared.infrastructure.models.CodeRunAttempt` |
 | Session read DTO | `app.interview.schemas.interview.InterviewRead` (composes theory tasks) |
 | Theory task read DTO | `app.theory.schemas.theory.TheoryTaskRead` |
 | Route / WS path param | `interview_id` (same value as `Interview.id`) |
-| Create flow | `SessionCreationService.create_session()` + `TheorySectionCreationService.create()` |
+| Create flow | `SessionCreationService.create_session()` + section creation services when enabled |
 | Read flow | `InterviewQuery.get_interview()`, `DashboardBuilder.list_rows()` |
-| Theory submit | `TheorySubmissionService` (WS + audio) |
 | Complete flow | `SessionCompletionService.complete_session()` |
-| UoW repositories | `uow.interviews`, `uow.theory_sections` |
+| Results hub | `SessionResultsPageService.prepare_page()` |
+| UoW repositories | `uow.interviews`, `uow.theory_sections`, `uow.coding_sections` (per-feature UoW) |
+| Theory submit | `TheorySubmissionService` (WS + audio + timeouts) |
+| Coding submit | `CodingSubmissionService` (WS submit; earlier Run attempts provide AI context) |
 | SQLAlchemy session | `uow.session` |
 
 ## Key Models
@@ -386,6 +447,35 @@ flowchart TB
 
 Initial task rows are created with the theory section; follow-ups append via `TheorySectionRepository.save_aggregate`.
 
+### CodingSection (`coding_sections`)
+
+| Field | Type | Notes |
+|-------|------|-------|
+| `id` | `int` | Auto-increment PK |
+| `interview_id` | `str` | FK to `interviews.id` (1:0..1) |
+| `selection_spec` | `str` | Coding branch selection JSON |
+| `task_count` | `int` | Number of coding tasks in section |
+| `task_time_limit_seconds` | `int \| None` | Per-task timer (`None` = off) |
+| `status` | `str` | `pending`, `active`, `completed`, or `skipped` |
+| `section_score`, `section_feedback` | | Section narrative (prefetched after phase complete) |
+| `locale` | `str` | Section locale snapshot |
+
+### CodingTask (`coding_tasks`)
+
+| Field | Type | Notes |
+|-------|------|-------|
+| `id` | `int` | Auto-increment PK |
+| `coding_section_id` | `int` | FK to `coding_sections.id` |
+| `task_id` | `str` | ID from coding YAML bank |
+| `order` | `int` | 1-based display order |
+| `round` | `int` | `0` = initial; `1+` = AI follow-up (code or explanation) |
+| `prompt_text`, `task_spec` | `str` | Snapshot at ask time (`task_spec` is JSON) |
+| `submitted_code` | `str \| None` | Final code for the round |
+| `submit_test_summary` | `str \| None` | JSON hidden-test outcome on submit |
+| `score`, `feedback` | | After AI evaluation (1–5) |
+
+`CodeRunAttempt` rows store each **Run** snapshot (code, stderr, public test results) for AI context on submit.
+
 ## Data Flow: Configure Provider
 
 ```
@@ -516,6 +606,29 @@ Client → WS /interview/{id}/theory/ws {"type":"complete"}
 
 Display score sums `score_breakdown.theory.score` and `score_breakdown.coding.score` when both sections exist. Ending early marks an incomplete enabled section as skipped (score 0 for that section).
 
+## Data Flow: Results and Review Pages
+
+```
+GET /interview/{id} on completed session
+  → SessionPageService redirects 303 → /interview/{id}/results
+
+GET /interview/{id}/results
+  → SessionResultsPageService.prepare_page()
+       → load completed InterviewRead + overall_feedback JSON
+       → section registry builds cards (theory/coding) with review URLs
+  → session_results.html
+
+GET /interview/{id}/theory
+  → TheoryReviewService.build_context() — answered rounds + section_feedback
+  → theory_review.html (redirect to /interview/{id}/results if section missing)
+
+GET /interview/{id}/coding
+  → CodingReviewService.build_context() — tasks grouped by task_id with rounds
+  → coding_review.html
+```
+
+Dashboard history links to `/interview/{id}/results` for completed sessions.
+
 ## Data Access Pattern
 
 ```python
@@ -649,6 +762,32 @@ Follow-up rounds use the same pipeline (cache key from localized `question_text`
 | Audio flag | `accepts_audio_input` on `LLMModelEntry` — enables interview audio-answer UI and config audio probe |
 | Effective config | `ConfigService.resolve_effective_config()` applies catalog `base_url`, `model`, and `api_key` |
 
+## Tests
+
+Pytest discovers modules under `tests/` (`pyproject.toml` → `testpaths = ["tests"]`). Layout **mirrors `app/`** so each feature owns its tests:
+
+| `app/` package | `tests/` mirror | Typical modules |
+|----------------|-----------------|-----------------|
+| `ai/` | `tests/ai/` | `test_base.py`, `test_factory.py`, `test_openai_compatible.py` |
+| `interview/` | `tests/interview/{api,repositories,services}/` | `test_creation.py`, `test_phases.py`, `test_results.py` |
+| `theory/` | `tests/theory/{api,services,repositories,integration}/` | `test_submission.py`, `test_ws_routes.py`, `test_review.py` |
+| `coding/` | `tests/coding/{api,services,repositories}/` | `test_runner.py`, `test_evaluator.py`, `test_review.py` |
+| `speech/`, `question_voice/` | `tests/speech/`, `tests/question_voice/` | API + service tests |
+| `platform/` | `tests/platform/{api,services}/` | `test_config.py`, `test_llm_catalog.py` |
+| `shared/` | `tests/shared/` (+ `infrastructure/`) | `test_questions.py`, `test_coding.py`, `test_uow.py` |
+| `main.py` | `tests/app/` | `test_main.py` |
+
+Shared fixtures live in `tests/conftest.py` (`client`, `isolated_db`, `fake_ai_provider`, `override_ws_ai_provider`). Cross-feature seeds stay **flat** in `tests/helpers/` (`interview_seed.py`, `coding_seed.py`, `completed_session_seed.py`, …). `tests/fakes.py` provides `FakeProvider` and canned evaluation JSON.
+
+`tests/shared/test_questions.py` is loaded via `pytest_plugins` in `conftest.py` for the `temp_questions_dir` fixture used by creation tests.
+
+Run the suite:
+
+```bash
+uv run pytest
+uv run pytest tests/theory/services/test_submission.py   # single module
+```
+
 ## Current Limitations
 
 - Only one AI adapter type is implemented: `openai-compatible` (`ProviderFactory`)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1dc1119..af26e5f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,62 +8,26 @@ Work in progress is accumulated under `[Unreleased]`; on release, that section b
 
 ### Added
 
-- **Session results hub** — completed interviews redirect to `/interview/{id}/results` with overall evaluation and per-section summary cards linking to dedicated review pages
-- **Theory review page** — `/interview/{id}/theory` shows section feedback and full Q&A chat history with per-round scores after session completion
-- **Coding review page** — `/interview/{id}/coding` shows section feedback and an accordion of coding tasks with final submit, test summary, and per-round feedback on one page
-- **Coding section evaluator** — `CodingEvaluatorService.evaluate_section()` prefetches `coding_sections.section_feedback` when the coding phase completes and before session completion
-- **Coding interview UI** — separate coding panel with Monaco editor (CDN), Run (`POST /coding/run`), Submit (`WS /coding/ws`), run output with test progress, `sessionStorage` drafts, and phase switch between theory and coding by `session_mode`
-- **CodingEvaluatorService** — AI scoring for coding submit with run history and hidden test context in prompts; `follow_up_mode: code | explanation`; hidden test failures cap score at 3
-- **Coding Run API** — `POST /interview/{id}/coding/run` executes public tests via Judge0 and persists `CodeRunAttempt`; `GET /interview/{id}/coding/state` returns current task, progress, and run history; `WS /interview/{id}/coding/ws` accepts submit and streams `feedback`
-- **Judge0 coding runner** — `CodingRunnerService` executes public tests and compile-only checks via `Judge0Client`; Python harness wraps candidate code for entrypoint tasks; setup blocks coding when Judge0 is unhealthy (`CODING_ENABLED` + health probe)
-- **Judge0 Docker profile** — `docker compose --profile coding up` starts Judge0 CE (server, worker, Postgres, Redis); `deploy/judge0.conf` and env vars `JUDGE0_URL`, `JUDGE0_AUTH_TOKEN`
-- **Coding setup and planning** — all four `session_mode` options on setup when coding is available; `GET /setup/coding-options` and `GET /setup/coding-available`; `app/coding/services/planning.py` picks tasks from `data/coding/`; `SessionCreationService` creates coding sections via `CodingSectionCreationService`
-- **Dashboard session mode badge** — history rows show Theory, Coding, or Theory+Coding from `session_mode`
-- **`app/theory/` module scaffold** — domain (`TheorySection`, `TheoryTask`), repositories, read schemas, and `theory_sections` table with backfill from existing interviews
-- **Theory section tasks** — `answers.theory_section_id` links tasks to sections; theory repository loads full aggregate; interview creation dual-writes theory section rows
-- **Theory submission services** — answer processing, navigation, timer, and evaluation persistence moved to `app/theory/services/`; WebSocket and audio API use `TheorySubmissionService`
-- **Theory API routes** — canonical `POST /interview/{id}/theory/audio-answer` and `WS /interview/{id}/theory/ws`; legacy `/audio-answer` and `/ws` delegate with deprecation log; interview page uses new paths
-- **Theory evaluator** — `app/theory/services/evaluator/` with `TheoryEvaluatorService`; per-task evaluation used by theory submission; `InterviewEvaluatorService` remains a compat alias
-- **Session creation split** — `SessionCreationService` persists an interview shell plus `TheorySectionCreationService`; `Interview.start_shell` and theory-aware `interview_from_orm` reads
-- **Selection spec v2** — `SessionSelection` with `session_mode`, theory/coding branches; setup form session-mode picker (coding modes shown as coming soon); Alembic backfill for legacy rows
-- **Session page composition** — `SessionPageService` merges shell + `TheoryPageContext`; phase order from `session_mode`
-- **Session evaluation pipeline** — `SessionEvaluationAggregator`, `SessionEvaluatorService`, and `InterviewSection` protocol with theory prefetch via `on_phase_complete`
-
 ### Changed
 
-- **Section orchestration consolidation** — typed `SectionService` protocol with `is_user_facing` / `activate_if_pending`, shared section evaluation/review helpers, session evaluation models moved to `app/shared/evaluation_models.py`, multi-section score fallback sums both sections, unified results hub card builder via section registry, `score_breakdown` attached only at session completion via `attach_session_score_breakdown`
-- **Session orchestration refactor** — unified `SESSION_MODE_LABELS`, section service registry instead of unused `InterviewSection` protocol, single `InterviewUnitOfWork` for cross-section phase reads, shared section-feedback prefetch and task timer helpers, score resolution moved out of mappers
-- **Completed session navigation** — dashboard history links to `/interview/{id}/results`; active interview pages no longer embed final evaluation in the sidebar
-- **Session completion scoring** — `SessionCompletionService` merges theory and coding section summaries; `score_breakdown` exposes separate `theory` and `coding` totals; display score sums both sections
-- **Theory question planning** — excludes legacy `type: coding` rows still present in theory YAML banks
-- **Documentation** — `ARCHITECTURE.md` coding data flows and scoring; `README.md` setup/coding env vars; `CONTRIBUTING.md` coding task YAML format
-- **Coding naming** — domain/ORM fields use `task_count`, `task_id`, and `prompt_text` instead of legacy `question_*` names; `CodingSectionCreationService` requires shared `InterviewUnitOfWork` like theory
-- **Shared paths and questions** — `app/paths.py` and `app/questions.py` moved to `app/shared/paths.py` and `app/shared/questions.py`
-- **Theory question planning** — moved to `app/theory/services/planning.py`; excludes YAML `type: coding` rows
-- **Session read models** — `AnswerRead` is an alias of `TheoryTaskRead`; interview domain no longer defines an `Answer` entity
-- **Interview aggregate** — `Interview` is a session shell only; answers and theory config are composed at read time from `theory_sections`
-- **Interview completion** — `SessionCompletionService` loads read models and scores from merged section breakdown
-- **Interview creation** — setup uses `SessionCreationService.create_session` with shell + theory section persistence
-- **Setup form** — posts v2 `selection_json`; theory question count and timer stored on the theory branch
-
 ### Fixed
 
-- **Coding session UI** — dedicated `coding_interview.html` layout (assignment panel + editor); evaluating spinner no longer visible on load (`[hidden]` vs `display:flex` clash)
-- **Coding task bank** — tasks use `coding.assignment` (technical brief) instead of theory-style `question.text` prompts
-- **Coding-only session pages** — dashboard and interview page no longer 500 when theory sources are empty; titles and selection summary use coding branch data
-- **Coding phase activation** — `theory_then_coding` sessions promote coding sections from `pending` to `active` when theory finishes (`SessionPhaseOrchestrator`, `CodingPageService.activate_timer`)
-- **Theory-to-coding handoff** — completing the theory section auto-reloads into the coding page via shared `session_phases.js`; theory-complete state shows a **Continue to Coding** button as fallback
-- Configuration speech model panel tracks the selected Whisper size and locale in the form (status, download, and save now refer to the same model)
-- Piper and Whisper downloads in Docker no longer fail with ``Permission denied: '/.cache'`` (Hub cache uses ``data/.cache/huggingface``)
-- Per-question timer stops when the interview is ended or completed (including during final evaluation)
-- Configuration question voice panel tracks the selected interview language in the form (status and download now refer to the matching Piper voice)
-- Whisper and Piper voices can be downloaded from Configuration before any LLM model is saved; adding an audio-capable catalog entry no longer requires Whisper to be installed first
-
 ### Removed
 
-- **Legacy interview columns** — `question_count`, `question_ids`, `question_time_limit_seconds`, and `score` dropped from `interviews`; `answers.interview_id` removed (Alembic `20260608_0007`)
-- **Deprecated interview API paths** — `POST /interview/{id}/audio-answer` and `WS /interview/{id}/ws`; use `/theory/audio-answer` and `/theory/ws`
-- **Interview compat re-exports** — `AnswerProcessingService`, `InterviewPageService`, `InterviewCreationService`, `InterviewCompletionService`, and `app/interview/services/evaluator/`
+## 2026.6.12
+
+### Added
+
+- **Coding interviews** — practice live coding in the browser: editor, Run on public tests, Submit for evaluation, and a review page after the session; run `docker compose --profile coding up` to start Judge0 for code execution
+- **Coding question bank** — 33 Python language-focused tasks (junior: basics, strings, functions, control flow, exceptions, OOP, collections; middle: refactor, bug hunt, complete code, implement)
+
+### Changed
+
+- **New interview setup** — choose session mode (theory only, coding only, or both in sequence) and configure theory and coding topics separately on one screen
+
+### Fixed
+
+- **First-time configuration** — saving provider settings and downloading Whisper or Piper models works on a fresh install, including in Docker
 
 ## 2026.5.31
 
diff --git a/README.md b/README.md
index adb90de..ed0f2f3 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,9 @@
 
 [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
 [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-yellow.svg)](https://opensource.org/licenses/Apache-2.0)
-[![Version](https://img.shields.io/badge/version-2026.5.31-blue.svg)](CHANGELOG.md)
+[![Version](https://img.shields.io/badge/version-2026.6.12-blue.svg)](CHANGELOG.md)
 
-Open-source AI technical interview trainer. Practice from curated YAML question banks, get structured scoring and follow-ups, and optionally use voice — with your own LLM (cloud or local).
+Open-source AI technical interview trainer. Practice **theory Q&A**, **live coding**, or **both in one session** from curated YAML banks — with structured scoring, follow-ups, optional voice, and a local results history. Bring your own LLM (cloud or local).
 
 [Why GrillKit](#why-grillkit-not-just-chatgpt) · [Quick start](#quick-start) · [Changelog](CHANGELOG.md) · [Architecture](ARCHITECTURE.md)
 
@@ -15,9 +15,10 @@ A general chat assistant is flexible, but it does not run an **interview** for y
 | What you need | ChatGPT-style chat | GrillKit |
 |---------------|-------------------|----------|
 | Curated technical questions | You prompt each time | Built-in **tracks** (Python, Kafka, System Design, …), **levels**, and **topics** |
-| Interview flow | Free-form thread | Fixed session: N questions, up to **2 AI follow-ups** per question, **1–5 scoring**, session summary |
-| Practice history | Scattered chats | **Dashboard** with past sessions stored locally |
-| Time pressure | None | Optional **per-round timer** (expired round → 0, move on) |
+| Interview flow | Free-form thread | Fixed session: theory Q&A and/or coding tasks, up to **2 AI follow-ups** per item, **1–5 scoring**, session summary |
+| Live coding practice | Paste code in chat | **Monaco editor**, **Run** against public tests, **Submit** for hidden tests + AI review (needs Judge0) |
+| Practice history | Scattered chats | **Dashboard** with past sessions; open **results** and per-section **review** pages after completion |
+| Time pressure | None | Optional **per-round timer** on theory and coding (expired round → 0, move on) |
 | Voice practice | Depends on product | Offline **Whisper** dictation; optional **Piper** question audio; **audio answers** when your model supports it |
 | Where data lives | Vendor cloud | **Self-hosted**: SQLite + `data/` on your machine; use **Ollama**, vLLM, or any OpenAI-compatible API |
 
@@ -45,7 +46,13 @@ A general chat assistant is flexible, but it does not run an **interview** for y
   <img src="./assets/interview-setup.png" alt="Interview setup" width="900" />
 </p>
 
-**Interview session** — real-time Q&A with AI scoring and final evaluation
+**Coding section** — Monaco editor, Run on public tests, Submit for AI evaluation
+
+<p align="center">
+  <img src="./assets/coding.png" alt="Coding interview session" width="900" />
+</p>
+
+**Theory section** — real-time Q&A with AI scoring and final evaluation
 
 <p align="center">
   <img src="./assets/interview-session.png" alt="Completed interview with evaluation" width="900" />
@@ -53,13 +60,30 @@ A general chat assistant is flexible, but it does not run an **interview** for y
 
 ## Features
 
-- **Interviews** — multi-track setup, several topics per session, WebSocket Q&A, AI scoring 1–5, up to 2 follow-ups per question
-- **Question banks** — Python, Database/SQL, System Design, Kafka, RabbitMQ, Docker, Kubernetes, Observability, Airflow, and more under `data/questions/{track}/` (junior / middle / senior where applicable)
-- **Timer** — optional per-round time limit; expired rounds score 0 and the session moves on
-- **Voice** — offline Whisper dictation for typed answers; optional Piper TTS to read questions aloud
-- **Audio answers** — when the configured model supports audio input and Whisper is ready, record and send a WAV answer from the interview page
-- **Setup** — model catalog on `/config`, interview locale (AI feedback language), Whisper/Piper downloads from the UI
-- **Dashboard** — recent interview history on the home page
+### Session modes
+
+Pick one mode on **New interview** (`/setup`):
+
+| Mode | What you practice |
+|------|-------------------|
+| **Theory only** | Technical Q&A from `data/questions/` — type, dictate, or record answers |
+| **Coding only** | Programming tasks from `data/coding/` — edit, Run, Submit |
+| **Theory then coding** | Q&A first, then coding panel when theory finishes |
+| **Coding then theory** | Coding first, then theory |
+
+Coding modes need a running [Judge0](https://github.com/judge0/judge0) instance (see **Coding sessions** below).
+
+### Practice tools
+
+- **Theory** — WebSocket Q&A, AI scoring 1–5, up to 2 follow-ups per question
+- **Coding** — Monaco editor, Run (`POST /coding/run`) on public tests, Submit (`WS /coding/ws`) with hidden tests and AI feedback
+- **Question banks** — Python, Database/SQL, System Design, Kafka, RabbitMQ, Docker, Kubernetes, Observability, Airflow, and more (junior / middle / senior where applicable)
+- **Timer** — optional per-round limit on theory and coding; expired rounds score 0 and the session moves on
+- **Voice** — offline Whisper dictation; optional Piper TTS to read theory questions aloud
+- **Audio answers** — record a WAV theory answer when your model supports audio input and Whisper is ready
+- **Results hub** — after you finish, `/interview/{id}/results` shows overall evaluation and links to **theory** and **coding** review pages with full chat/code history
+- **Dashboard** — recent sessions on the home page (completed sessions link to results)
+- **Setup** — model catalog on `/config`, interview locale, Whisper/Piper downloads from the UI
 - **Deployment** — Docker Compose on port 8000 with `./data` volume for config, DB, and models
 
 ## Quick start
@@ -106,9 +130,10 @@ On some Linux hosts Judge0 needs **cgroup v1** (`systemd.unified_cgroup_hierarch
 
 ### First-time flow
 
-1. **Configuration** (`/config`) — add one or more OpenAI-compatible models to the catalog, select an interview model, set interview locale; test connection, then save.
-2. **New interview** (`/setup`) — pick a **session mode** (theory only, coding only, or combined). Configure theory and/or coding tracks, topics, task counts, and per-task timers. Coding modes require Judge0 (see **Coding sessions** above).
-3. **Interview** (`/interview/{id}`) — theory answers over `WS /theory/ws`; coding uses Monaco + Run (`POST /coding/run`) and Submit (`WS /coding/ws`). End interview from the sidebar at any time.
+1. **Configuration** (`/config`) — add one or more OpenAI-compatible models to the catalog, select an interview model, set interview locale; test connection, then save. Download Whisper (and optionally a Piper voice) from the same page if you want voice features.
+2. **New interview** (`/setup`) — pick a **session mode** (theory only, coding only, or combined). Choose tracks, levels, topics, how many questions/tasks, and optional per-round timers. Coding modes require Judge0 (see **Coding sessions** above).
+3. **Practice** (`/interview/{id}`) — answer theory questions in the chat (type, dictate, or record audio). On coding phases, use the editor: **Run** to check public tests, **Submit** when ready. Combined sessions switch panels automatically when a section ends (or use **Continue to Coding**). End the interview from the sidebar at any time.
+4. **Review** (`/interview/{id}/results`) — after completion, read the overall evaluation, then open **Theory** or **Coding** review for full conversation history, scores, and feedback.
 
 Without saved provider config, `/setup` redirects to `/config`.
 
@@ -168,8 +193,8 @@ Optional environment variables (full list in [ARCHITECTURE.md](ARCHITECTURE.md#p
 
 | Document | Contents |
 |----------|----------|
-| [ARCHITECTURE.md](ARCHITECTURE.md) | Layers, HTTP/WebSocket routes, data flows, persistence, question banks |
-| [CONTRIBUTING.md](CONTRIBUTING.md) | Dev setup, tests, ruff/mypy/pytest, contribution workflow |
+| [ARCHITECTURE.md](ARCHITECTURE.md) | Feature modules, routes, data flows, persistence, test layout |
+| [CONTRIBUTING.md](CONTRIBUTING.md) | Dev setup, quality checks, question/coding YAML guidelines |
 | [CHANGELOG.md](CHANGELOG.md) | Release history |
 
 ## Security
diff --git a/app/main.py b/app/main.py
index 8be4d66..4bf1597 100644
--- a/app/main.py
+++ b/app/main.py
@@ -49,7 +49,7 @@ def create_app() -> FastAPI:
     app = FastAPI(
         title="GrillKit",
         description="AI Interview Trainer",
-        version="2026.5.31",
+        version="2026.6.12",
         lifespan=lifespan,
     )
 
diff --git a/assets/coding.png b/assets/coding.png
new file mode 100644
index 0000000..5eb48fb
Binary files /dev/null and b/assets/coding.png differ
diff --git a/data/coding/python/junior/basics.yaml b/data/coding/python/junior/basics.yaml
index 0445ae0..032024c 100644
--- a/data/coding/python/junior/basics.yaml
+++ b/data/coding/python/junior/basics.yaml
@@ -5,6 +5,108 @@ level: "junior"
 description: "Core Python fundamentals: types, variables, operators, and language essentials"
 
 tasks:
+  - id: "bas-001"
+    difficulty: 1
+    tags: ["f-strings", "formatting"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          The greeting uses old-style `%` formatting. Modern Python code prefers f-strings
+          for readability.
+
+          Your task:
+          Rewrite the `greeting` assignment to use an f-string. Keep the same output.
+        ru: |
+          Контекст:
+          Приветствие собирается через `%`-форматирование.
+
+          Задача:
+          Перепишите присваивание `greeting` на f-string с тем же результатом.
+      starter_code: |
+        name = "Alice"
+        score = 95
+
+        greeting = "Hello, %s! Your score is %d." % (name, score)
+        print(greeting)
+    expected_points:
+      - "Uses f-string with name and score interpolated"
+      - "Same printed output as original"
+
+  - id: "bas-002"
+    difficulty: 1
+    tags: ["none", "identity", "comparison"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          The caller checks the result of `find_user` with `== None`. In Python,
+          singletons like `None` should be compared with `is` / `is not`.
+
+          Your task:
+          Fix the None check. Do not change behavior for valid users.
+        ru: |
+          Контекст:
+          `find_user` сравнивает результат с `None` через `==`.
+
+          Задача:
+          Исправьте проверку на `None` через `is` / `is not`. Поведение для найденных пользователей не меняйте.
+      starter_code: |
+        users = {"alice": "Alice", "bob": "Bob"}
+
+
+        def find_user(user_id):
+            return users.get(user_id)
+
+
+        result = find_user("charlie")
+        if result == None:
+            print("User not found")
+        else:
+            print(f"Found: {result}")
+    expected_points:
+      - "Uses `is None` or `is not None` instead of == None"
+      - "Same output for missing and existing users"
+
+  - id: "bas-003"
+    difficulty: 2
+    tags: ["truthiness", "conditionals"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `is_valid` treats any truthy value as valid, so non-empty strings like `"0"`
+          pass even when they should not.
+
+          Your task:
+          Rewrite `is_valid` so only actual boolean `True` is accepted.
+          Use an explicit identity check against `True`.
+        ru: |
+          Контекст:
+          `is_valid` принимает любое truthy-значение, включая строку `"0"`.
+
+          Задача:
+          Перепишите `is_valid`: валидным считается только булев `True` (явная проверка идентичности).
+      starter_code: |
+        def is_valid(flag):
+            if flag:
+                return "ok"
+            return "invalid"
+
+
+        print(is_valid(True))
+        print(is_valid("0"))
+        print(is_valid(1))
+    expected_points:
+      - "Checks `flag is True` (or equivalent explicit boolean check)"
+      - "String \"0\" and integer 1 return invalid"
+
   - id: "bas-004"
     difficulty: 2
     tags: ["type-conversion", "type-hints"]
diff --git a/data/coding/python/junior/collections.yaml b/data/coding/python/junior/collections.yaml
new file mode 100644
index 0000000..1d6b6fb
--- /dev/null
+++ b/data/coding/python/junior/collections.yaml
@@ -0,0 +1,76 @@
+category: "Collections"
+track: "python"
+level: "junior"
+
+description: "Lists, dicts, sets, and common collection operations"
+
+tasks:
+  - id: "col-001"
+    difficulty: 1
+    tags: ["set", "deduplication"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `unique_tags` removes duplicates manually with a loop and a list membership
+          check. That is slow and harder to read than built-in tools.
+
+          Your task:
+          Rewrite `unique_tags` using `set` (preserving order is not required).
+          Return a list of unique tags.
+        ru: |
+          Контекст:
+          `unique_tags` убирает дубликаты циклом с проверкой `not in` по списку.
+
+          Задача:
+          Перепишите через `set`. Верните список уникальных тегов (порядок не важен).
+      starter_code: |
+        def unique_tags(tags):
+            result = []
+            for tag in tags:
+                if tag not in result:
+                    result.append(tag)
+            return result
+
+
+        print(unique_tags(["python", "web", "python", "api", "web"]))
+    expected_points:
+      - "Uses set for deduplication"
+      - "Returns list without duplicates"
+
+  - id: "col-002"
+    difficulty: 2
+    tags: ["dict", "get", "counting"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `count_words` should return how many times each word appears in a list.
+          The skeleton uses a plain dict.
+
+          Your task:
+          Complete the loop using `dict.get` (or `.setdefault`) to increment counts.
+          Return the frequency dictionary.
+        ru: |
+          Контекст:
+          `count_words` считает частоту слов в списке.
+
+          Задача:
+          Допишите цикл через `dict.get` (или `.setdefault`). Верните словарь частот.
+      starter_code: |
+        def count_words(words):
+            counts = {}
+            for word in words:
+                # increment counts[word]
+                pass
+            return counts
+
+
+        print(count_words(["a", "b", "a", "c", "b", "a"]))
+    expected_points:
+      - "Increments count with get/setdefault or equivalent"
+      - "Correct frequencies for repeated words"
diff --git a/data/coding/python/junior/control-flow.yaml b/data/coding/python/junior/control-flow.yaml
index f968988..f441750 100644
--- a/data/coding/python/junior/control-flow.yaml
+++ b/data/coding/python/junior/control-flow.yaml
@@ -5,6 +5,69 @@ level: "junior"
 description: "Python control flow constructs: conditionals, loops, iterators, and context managers"
 
 tasks:
+  - id: "cf-001"
+    difficulty: 1
+    tags: ["break", "loops"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `find_first_even` scans the entire list even after the first even number is found.
+
+          Your task:
+          Stop the loop early with `break` once the first even number is found.
+          Return `None` if no even number exists.
+        ru: |
+          Контекст:
+          `find_first_even` проходит весь список, хотя первое чётное уже найдено.
+
+          Задача:
+          Остановите цикл через `break` после первого чётного. Если чётных нет — верните `None`.
+      starter_code: |
+        def find_first_even(numbers):
+            result = None
+            for n in numbers:
+                result = n if result is None and n % 2 == 0 else result
+            return result
+
+
+        print(find_first_even([1, 3, 4, 6, 8]))
+        print(find_first_even([1, 3, 5]))
+    expected_points:
+      - "Uses break when even number found (or equivalent early exit)"
+      - "Returns first even or None"
+
+  - id: "cf-002"
+    difficulty: 1
+    tags: ["dict", "items", "iteration"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          The loop prints scores by indexing into `scores` with each key from `scores.keys()`.
+          That pattern is verbose and non-idiomatic.
+
+          Your task:
+          Refactor the loop to iterate with `.items()` while keeping the same output.
+        ru: |
+          Контекст:
+          Баллы выводятся через индексацию по ключам из `scores.keys()`.
+
+          Задача:
+          Перепишите цикл на `.items()` с тем же выводом.
+      starter_code: |
+        scores = {"Alice": 85, "Bob": 92, "Charlie": 78}
+
+        for name in scores.keys():
+            print(name, scores[name])
+    expected_points:
+      - "Uses for name, score in scores.items()"
+      - "Same print output as original"
+
   - id: "cf-003"
     difficulty: 2
     tags: ["range", "enumerate", "iteration"]
diff --git a/data/coding/python/junior/exceptions.yaml b/data/coding/python/junior/exceptions.yaml
index 200885b..b4df1e6 100644
--- a/data/coding/python/junior/exceptions.yaml
+++ b/data/coding/python/junior/exceptions.yaml
@@ -5,6 +5,66 @@ level: "junior"
 description: "Python exception handling: try/except/finally, raising exceptions, and exception hierarchy"
 
 tasks:
+  - id: "exc-001"
+    difficulty: 1
+    tags: ["try-except", "value-error"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `to_int` crashes on invalid input because `int()` raises `ValueError`.
+
+          Your task:
+          Wrap the conversion in try/except. Return `None` when conversion fails.
+        ru: |
+          Контекст:
+          `to_int` падает на невалидном вводе.
+
+          Задача:
+          Оберните преобразование в try/except. При ошибке возвращайте `None`.
+      starter_code: |
+        def to_int(value):
+            return int(value)
+
+
+        print(to_int("42"))
+        print(to_int("abc"))
+        print(to_int(""))
+    expected_points:
+      - "Catches ValueError (or broader Exception) around int()"
+      - "Returns None on invalid input"
+      - "Returns int for valid numeric strings"
+
+  - id: "exc-002"
+    difficulty: 2
+    tags: ["finally", "cleanup"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `read_lines` opens a file but never closes it, whether reading succeeds or fails.
+
+          Your task:
+          Ensure the file is always closed using a `finally` block (do not switch to `with` here).
+        ru: |
+          Контекст:
+          `read_lines` не закрывает файл — ни при успешном чтении, ни при ошибке.
+
+          Задача:
+          Гарантируйте закрытие файла через `finally` (без перехода на `with`).
+      starter_code: |
+        def read_lines(path):
+            f = open(path, "r")
+            lines = f.readlines()
+            return [line.strip() for line in lines]
+    expected_points:
+      - "Uses try/finally to close the file handle"
+      - "File closed even when readlines raises"
+
   - id: "exc-005"
     difficulty: 1
     tags: ["assert", "debugging"]
diff --git a/data/coding/python/junior/functions.yaml b/data/coding/python/junior/functions.yaml
index 26af8f2..28eb8af 100644
--- a/data/coding/python/junior/functions.yaml
+++ b/data/coding/python/junior/functions.yaml
@@ -5,6 +5,99 @@ level: "junior"
 description: "Python functions: parameters, return values, scoping, and advanced function concepts"
 
 tasks:
+  - id: "func-001"
+    difficulty: 1
+    tags: ["default-arguments"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `greet` always requires a prefix argument. Callers want a sensible default.
+
+          Your task:
+          Add a default value `"Hello"` to the `prefix` parameter. Keep the function body unchanged.
+        ru: |
+          Контекст:
+          `greet` всегда требует аргумент `prefix`.
+
+          Задача:
+          Задайте значение по умолчанию `"Hello"` для `prefix`. Тело функции не меняйте.
+      starter_code: |
+        def greet(name, prefix):
+            return f"{prefix}, {name}!"
+
+
+        print(greet("Alice"))
+        print(greet("Bob", "Hi"))
+    expected_points:
+      - "prefix has default value \"Hello\""
+      - "greet(\"Alice\") works without second argument"
+
+  - id: "func-002"
+    difficulty: 2
+    tags: ["args", "variadic"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `total` should accept any number of numeric arguments and return their sum.
+
+          Your task:
+          Implement `total` using `*args`. Return `0` when called with no arguments.
+        ru: |
+          Контекст:
+          `total` должна суммировать произвольное число аргументов.
+
+          Задача:
+          Реализуйте `total` через `*args`. Без аргументов возвращайте `0`.
+      starter_code: |
+        def total(*args):
+            pass
+
+
+        print(total(1, 2, 3))
+        print(total())
+        print(total(10, -5, 2.5))
+    expected_points:
+      - "Uses *args in signature"
+      - "Returns sum of all arguments"
+      - "Empty call returns 0"
+
+  - id: "func-003"
+    difficulty: 2
+    tags: ["keyword-only", "parameters"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `connect` accepts host and port, but callers sometimes pass the port positionally
+          by mistake. The port should be keyword-only.
+
+          Your task:
+          Make `port` a keyword-only parameter (use `*` in the signature).
+          Keep the return format unchanged.
+        ru: |
+          Контекст:
+          В `connect` порт иногда передают позиционно по ошибке.
+
+          Задача:
+          Сделайте `port` keyword-only (через `*` в сигнатуре). Формат возврата не меняйте.
+      starter_code: |
+        def connect(host, port):
+            return f"{host}:{port}"
+
+
+        print(connect("localhost", port=5432))
+    expected_points:
+      - "port is keyword-only after bare *"
+      - "connect(\"localhost\", port=5432) still works"
+
   - id: "func-006"
     difficulty: 2
     tags: ["docstrings", "annotations"]
diff --git a/data/coding/python/junior/oop.yaml b/data/coding/python/junior/oop.yaml
new file mode 100644
index 0000000..36a89a1
--- /dev/null
+++ b/data/coding/python/junior/oop.yaml
@@ -0,0 +1,77 @@
+category: "OOP"
+track: "python"
+level: "junior"
+
+description: "Classes, instances, methods, and basic object-oriented patterns"
+
+tasks:
+  - id: "oop-001"
+    difficulty: 2
+    tags: ["str", "repr", "dunder"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `Point` stores coordinates but printing an instance shows the default
+          `<__main__.Point object at 0x...>` representation.
+
+          Your task:
+          Add `__str__` so `print(Point(3, 4))` outputs `Point(x=3, y=4)`.
+        ru: |
+          Контекст:
+          У `Point` нет читаемого строкового представления.
+
+          Задача:
+          Добавьте `__str__`, чтобы `print(Point(3, 4))` выводил `Point(x=3, y=4)`.
+      starter_code: |
+        class Point:
+            def __init__(self, x, y):
+                self.x = x
+                self.y = y
+
+
+        p = Point(3, 4)
+        print(p)
+    expected_points:
+      - "Defines __str__ returning Point(x=..., y=...) format"
+      - "Uses self.x and self.y"
+
+  - id: "oop-002"
+    difficulty: 2
+    tags: ["methods", "encapsulation"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `BankAccount` stores a balance but allows direct mutation via `account.balance`.
+          Add a method to deposit money safely.
+
+          Your task:
+          Implement `deposit(amount)` that adds a positive amount to `balance`.
+          Raise `ValueError` when `amount` is zero or negative.
+        ru: |
+          Контекст:
+          `BankAccount` хранит баланс; нужен безопасный способ пополнения.
+
+          Задача:
+          Реализуйте `deposit(amount)`: прибавляет положительную сумму к `balance`.
+          При нуле или отрицательной сумме — `ValueError`.
+      starter_code: |
+        class BankAccount:
+            def __init__(self, balance=0):
+                self.balance = balance
+
+            def deposit(self, amount):
+                pass
+
+
+        account = BankAccount(100)
+        account.deposit(50)
+        print(account.balance)
+    expected_points:
+      - "Increases balance for positive amount"
+      - "Raises ValueError for zero or negative deposit"
diff --git a/data/coding/python/junior/strings.yaml b/data/coding/python/junior/strings.yaml
index 7880e6c..77030dd 100644
--- a/data/coding/python/junior/strings.yaml
+++ b/data/coding/python/junior/strings.yaml
@@ -5,6 +5,72 @@ level: "junior"
 description: "Python string operations, formatting, and manipulation"
 
 tasks:
+  - id: "str-001"
+    difficulty: 1
+    tags: ["split", "strip", "parsing"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          A log line stores key-value pairs separated by commas (`key=value`).
+          The parser must extract the value for a given key.
+
+          Your task:
+          Complete `parse_value` so it splits the line, strips whitespace, and returns
+          the value for `key`, or `None` if the key is absent.
+        ru: |
+          Контекст:
+          Строка лога содержит пары `key=value` через запятую.
+
+          Задача:
+          Допишите `parse_value`: разбейте строку, уберите пробелы, верните значение для `key`
+          или `None`, если ключа нет.
+      starter_code: |
+        def parse_value(line, key):
+            # split by comma, then by '=', strip parts
+            pass
+
+
+        line = "user=alice, role=admin, active=true"
+        print(parse_value(line, "role"))
+        print(parse_value(line, "missing"))
+    expected_points:
+      - "Splits on comma and equals with strip"
+      - "Returns correct value for existing key"
+      - "Returns None when key is missing"
+
+  - id: "str-002"
+    difficulty: 1
+    tags: ["case", "normalization"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          User emails are compared case-sensitively, so `"User@Mail.com"` and
+          `"user@mail.com"` are treated as different accounts.
+
+          Your task:
+          Normalize both emails with `.lower()` before comparison in `emails_match`.
+        ru: |
+          Контекст:
+          Email сравниваются с учётом регистра — дубликаты не находятся.
+
+          Задача:
+          Нормализуйте оба email через `.lower()` в `emails_match` перед сравнением.
+      starter_code: |
+        def emails_match(a, b):
+            return a == b
+
+
+        print(emails_match("User@Mail.com", "user@mail.com"))
+    expected_points:
+      - "Calls .lower() on both operands before =="
+      - "Returns True for case-insensitive match"
+
   - id: "str-004"
     difficulty: 2
     tags: ["join", "split", "concatenation"]
diff --git a/data/coding/python/middle/bug-hunt.yaml b/data/coding/python/middle/bug-hunt.yaml
index 1a2d9e5..48bb451 100644
--- a/data/coding/python/middle/bug-hunt.yaml
+++ b/data/coding/python/middle/bug-hunt.yaml
@@ -47,3 +47,69 @@ tasks:
       - "Non-numeric lines cause ValueError without handling"
       - "Fix skips blank lines and catches ValueError per line"
       - "Negative numbers are ignored as required"
+
+  - id: "bh-mutable-default-002"
+    difficulty: 2
+    tags: ["mutable-default", "functions"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `add_item` uses a mutable list as a default argument. Repeated calls share
+          the same list, which surprises callers.
+
+          Your task:
+          1. Explain the bug (in a comment at the top of the file).
+          2. Fix `add_item` so each call without `items` gets a fresh empty list.
+        ru: |
+          Контекст:
+          `add_item` использует изменяемый список по умолчанию — вызовы делят один список.
+
+          Задача:
+          1. Опишите баг в комментарии в начале файла.
+          2. Исправьте `add_item`: без `items` каждый вызов получает новый пустой список.
+      starter_code: |
+        def add_item(value, items=[]):
+            items.append(value)
+            return items
+
+
+        print(add_item("a"))
+        print(add_item("b"))
+    expected_points:
+      - "Comment describes shared mutable default"
+      - "Uses None sentinel with an `is None` check to create a fresh list (or equivalent)"
+      - "Second call without items does not contain first call's value"
+
+  - id: "bh-string-is-003"
+    difficulty: 2
+    tags: ["identity", "strings", "comparison"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `is_admin_role` compares role strings with `is`. String content should be
+          compared with `==`, not identity.
+
+          Your task:
+          Fix the comparison so `"admin"` matches regardless of how the string was created.
+        ru: |
+          Контекст:
+          `is_admin_role` сравнивает строки через `is` вместо сравнения значений.
+
+          Задача:
+          Исправьте сравнение: роль `"admin"` должна определяться по содержимому.
+      starter_code: |
+        def is_admin_role(role):
+            return role is "admin"
+
+
+        user_input = "".join(["ad", "min"])
+        print(is_admin_role(user_input))
+    expected_points:
+      - "Uses == for string equality"
+      - "Returns True for role equal to admin"
diff --git a/data/coding/python/middle/complete-code.yaml b/data/coding/python/middle/complete-code.yaml
index 00faa64..1744216 100644
--- a/data/coding/python/middle/complete-code.yaml
+++ b/data/coding/python/middle/complete-code.yaml
@@ -55,3 +55,83 @@ tasks:
       - "set removes old queue entry before re-appending on update"
       - "eviction uses popleft on order and deletes key from data"
       - "FIFO semantics preserved after updates and inserts"
+
+  - id: "cc-freq-002"
+    difficulty: 2
+    tags: ["dict", "counting", "collections"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `top_n` should return the `n` most frequent items from a list as `(item, count)` pairs,
+          sorted by count descending.
+
+          Your task:
+          Complete `top_n`: build frequencies, then return the top `n` pairs.
+          You may use `sorted` with a key; ties can be broken arbitrarily.
+        ru: |
+          Контекст:
+          `top_n` возвращает `n` самых частых элементов как пары `(элемент, счётчик)`.
+
+          Задача:
+          Допишите `top_n`: посчитайте частоты, верните топ-`n` по убыванию счётчика.
+      starter_code: |
+        def top_n(items, n):
+            counts = {}
+            for item in items:
+                counts[item] = counts.get(item, 0) + 1
+            # return n most common (item, count) pairs
+            pass
+
+
+        print(top_n(["a", "b", "a", "c", "a", "b"], 2))
+    expected_points:
+      - "Builds frequency dict correctly"
+      - "Returns up to n pairs sorted by count descending"
+
+  - id: "cc-context-003"
+    difficulty: 3
+    tags: ["context-managers", "dunder"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `Timer` is a context manager skeleton. It should record elapsed wall time
+          between entering and exiting the block.
+
+          Your task:
+          Implement `__enter__` and `__exit__` so that after the `with` block,
+          `timer.elapsed` holds the duration in seconds (float).
+        ru: |
+          Контекст:
+          `Timer` — заготовка контекстного менеджера для замера времени блока.
+
+          Задача:
+          Реализуйте `__enter__` и `__exit__`: после `with` в `timer.elapsed` — длительность в секундах.
+      starter_code: |
+        import time
+
+
+        class Timer:
+            def __init__(self):
+                self.elapsed = 0.0
+
+            def __enter__(self):
+                pass
+
+            def __exit__(self, exc_type, exc, tb):
+                pass
+
+
+        with Timer() as timer:
+            time.sleep(0.01)
+
+        print(timer.elapsed > 0)
+    expected_points:
+      - "__enter__ records start time and returns self"
+      - "__exit__ sets elapsed from monotonic or perf counter"
+      - "elapsed is positive after block"
diff --git a/data/coding/python/middle/implement.yaml b/data/coding/python/middle/implement.yaml
index 81c52b4..fe4f2ea 100644
--- a/data/coding/python/middle/implement.yaml
+++ b/data/coding/python/middle/implement.yaml
@@ -57,3 +57,44 @@ tasks:
       - "Only lists are flattened; scalars appended in order"
       - "Handles empty input and deeply nested single value"
       - "Includes runnable tests covering examples and edge cases"
+
+  - id: "im-config-002"
+    difficulty: 2
+    tags: ["dict", "validation", "types"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `parse_config` receives a plain dict from JSON. Required keys are `host` (str)
+          and `port` (int). Optional `debug` defaults to `False`.
+
+          Your task:
+          Implement validation:
+          - raise `ValueError` with a clear message if `host` or `port` is missing
+          - raise `TypeError` if `port` is not an int
+          - return a new dict with `host`, `port`, and `debug` (default False)
+        ru: |
+          Контекст:
+          `parse_config` валидирует словарь конфигурации из JSON.
+
+          Задача:
+          - `ValueError`, если нет `host` или `port`
+          - `TypeError`, если `port` не int
+          - вернуть dict с `host`, `port`, `debug` (по умолчанию False)
+      starter_code: |
+        def parse_config(raw):
+            """Validate and normalize application config from a JSON dict."""
+            raise NotImplementedError
+
+
+        cfg = parse_config({"host": "localhost", "port": 8080})
+        print(cfg)
+
+        cfg_debug = parse_config({"host": "api", "port": 443, "debug": True})
+        print(cfg_debug)
+    expected_points:
+      - "Raises ValueError on missing host or port"
+      - "Raises TypeError when port is not int"
+      - "Returns dict with debug defaulting to False"
diff --git a/data/coding/python/middle/refactor.yaml b/data/coding/python/middle/refactor.yaml
index 9b253d2..43c9a1b 100644
--- a/data/coding/python/middle/refactor.yaml
+++ b/data/coding/python/middle/refactor.yaml
@@ -78,3 +78,69 @@ tasks:
       - "Type hints on public methods"
       - "Docstrings describe return semantics"
       - "PEP 8 spacing after commas and around operators"
+
+  - id: "rf-list-comp-002"
+    difficulty: 2
+    tags: ["list-comprehension", "idioms"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `square_evens` builds a result list with append in a loop. A list comprehension
+          is shorter and idiomatic for simple filters and transforms.
+
+          Your task:
+          Rewrite the function body as a single list comprehension. Keep the same behavior:
+          return squares of even numbers only.
+        ru: |
+          Контекст:
+          `square_evens` собирает результат через append в цикле.
+
+          Задача:
+          Перепишите тело функции одним list comprehension. Квадраты только чётных чисел.
+      starter_code: |
+        def square_evens(numbers):
+            result = []
+            for n in numbers:
+                if n % 2 == 0:
+                    result.append(n * n)
+            return result
+
+
+        print(square_evens([1, 2, 3, 4, 5, 6]))
+    expected_points:
+      - "Single list comprehension with filter for even n"
+      - "Same output as loop version"
+
+  - id: "rf-with-open-003"
+    difficulty: 2
+    tags: ["context-managers", "files"]
+    coding:
+      language: python
+      evaluation_mode: ai
+      assignment:
+        en: |
+          Context:
+          `read_config` opens a file and closes it manually. If `read()` raises,
+          the handle may leak.
+
+          Your task:
+          Refactor to use `with open(...) as f`. Preserve the return value (file contents).
+        ru: |
+          Контекст:
+          `read_config` закрывает файл вручную — при ошибке чтения возможна утечка дескриптора.
+
+          Задача:
+          Перепишите на `with open(...) as f`. Возвращайте содержимое файла как раньше.
+      starter_code: |
+        def read_config(path):
+            f = open(path, "r")
+            data = f.read()
+            f.close()
+            return data
+    expected_points:
+      - "Uses with open for reading"
+      - "Returns full file contents"
+      - "No manual close after refactor"
diff --git a/tests/ai/__init__.py b/tests/ai/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_audio_probe.py b/tests/ai/test_audio_probe.py
similarity index 100%
rename from tests/test_audio_probe.py
rename to tests/ai/test_audio_probe.py
diff --git a/tests/test_ai_base.py b/tests/ai/test_base.py
similarity index 100%
rename from tests/test_ai_base.py
rename to tests/ai/test_base.py
diff --git a/tests/test_ai_factory.py b/tests/ai/test_factory.py
similarity index 100%
rename from tests/test_ai_factory.py
rename to tests/ai/test_factory.py
diff --git a/tests/test_openai_compatible.py b/tests/ai/test_openai_compatible.py
similarity index 100%
rename from tests/test_openai_compatible.py
rename to tests/ai/test_openai_compatible.py
diff --git a/tests/app/__init__.py b/tests/app/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_main.py b/tests/app/test_main.py
similarity index 98%
rename from tests/test_main.py
rename to tests/app/test_main.py
index f41f269..ac6efc5 100644
--- a/tests/test_main.py
+++ b/tests/app/test_main.py
@@ -19,7 +19,7 @@ def test_app_creation(self):
         assert app is not None
         assert app.title == "GrillKit"
         assert app.description == "AI Interview Trainer"
-        assert app.version == "2026.5.31"
+        assert app.version == "2026.6.12"
 
     def test_static_files_mounted(self):
         """Test that static files are mounted."""
diff --git a/tests/coding/__init__.py b/tests/coding/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/coding/api/__init__.py b/tests/coding/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_coding_api.py b/tests/coding/api/test_routes.py
similarity index 100%
rename from tests/test_coding_api.py
rename to tests/coding/api/test_routes.py
diff --git a/tests/coding/repositories/__init__.py b/tests/coding/repositories/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_coding_repository.py b/tests/coding/repositories/test_coding_section.py
similarity index 100%
rename from tests/test_coding_repository.py
rename to tests/coding/repositories/test_coding_section.py
diff --git a/tests/coding/services/__init__.py b/tests/coding/services/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_coding_availability.py b/tests/coding/services/test_availability.py
similarity index 100%
rename from tests/test_coding_availability.py
rename to tests/coding/services/test_availability.py
diff --git a/tests/test_coding_evaluator.py b/tests/coding/services/test_evaluator.py
similarity index 66%
rename from tests/test_coding_evaluator.py
rename to tests/coding/services/test_evaluator.py
index 61ee2be..5db3d57 100644
--- a/tests/test_coding_evaluator.py
+++ b/tests/coding/services/test_evaluator.py
@@ -48,3 +48,28 @@ async def test_evaluate_submission_uses_run_history_context() -> None:
     assert follow_up_needed is True
     assert follow_up_text == "Add type hints."
     assert follow_up_mode == "code"
+
+
+@pytest.mark.asyncio
+async def test_coding_evaluator_evaluate_section() -> None:
+    """Coding section evaluation returns parsed section narrative."""
+    from tests.fakes import FakeProvider, section_evaluation_json
+
+    provider = FakeProvider(
+        replies=[section_evaluation_json(section_feedback="Strong coding section.")]
+    )
+    result = await CodingEvaluatorService.evaluate_section(
+        provider=provider,
+        task_submissions=[
+            {
+                "task_id": "cod-001",
+                "round": 0,
+                "prompt_text": "Solve it.",
+                "submitted_code": "return 1",
+                "score": 4,
+            }
+        ],
+        sources_text="Python / junior: basics",
+        locale="en",
+    )
+    assert result.section_feedback == "Strong coding section."
diff --git a/tests/test_coding_harness.py b/tests/coding/services/test_harness.py
similarity index 100%
rename from tests/test_coding_harness.py
rename to tests/coding/services/test_harness.py
diff --git a/tests/test_judge0_client.py b/tests/coding/services/test_judge0_client.py
similarity index 100%
rename from tests/test_judge0_client.py
rename to tests/coding/services/test_judge0_client.py
diff --git a/tests/test_coding_page.py b/tests/coding/services/test_page.py
similarity index 100%
rename from tests/test_coding_page.py
rename to tests/coding/services/test_page.py
diff --git a/tests/test_coding_planning.py b/tests/coding/services/test_planning.py
similarity index 98%
rename from tests/test_coding_planning.py
rename to tests/coding/services/test_planning.py
index c41c1f4..34a6ed6 100644
--- a/tests/test_coding_planning.py
+++ b/tests/coding/services/test_planning.py
@@ -65,7 +65,7 @@ def test_build_coding_task_plan_from_bank() -> None:
     )
     planned = build_coding_task_plan(selection, task_count=1, locale="en")
     assert len(planned) == 1
-    assert planned[0].id == "bas-004"
+    assert planned[0].id.startswith("bas-")
     assert planned[0].task_spec["language"] == "python"
 
 
diff --git a/tests/coding/services/test_review.py b/tests/coding/services/test_review.py
new file mode 100644
index 0000000..a1271fa
--- /dev/null
+++ b/tests/coding/services/test_review.py
@@ -0,0 +1,46 @@
+# Copyright 2026 GrillKit Contributors
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for CodingReviewService."""
+
+import json
+
+from app.coding.services.review import CodingReviewService
+from app.interview.repositories.uow import InterviewUnitOfWork
+from app.shared.infrastructure.models import CodingTask
+from tests.helpers.completed_session_seed import seed_completed_coding_interview
+
+
+def test_coding_review_service_groups_task_rounds(isolated_db) -> None:
+    """Coding review groups submitted rounds on one page."""
+    interview_id = seed_completed_coding_interview()
+    with InterviewUnitOfWork(auto_commit=True) as uow:
+        section = uow.coding_sections.get_aggregate(interview_id)
+        assert section is not None
+        follow_up = CodingTask(
+            coding_section_id=section.id,
+            task_id="cod-001",
+            order=1,
+            round=1,
+            prompt_text="Explain your approach.",
+            task_spec=json.dumps({"language": "python"}),
+            submitted_code="I used a direct return.",
+            score=3,
+            feedback="Explanation was brief.",
+        )
+        uow.session.add(follow_up)
+
+    context = CodingReviewService.build_context(interview_id)
+    assert context is not None
+    assert len(context.tasks) == 1
+    assert len(context.tasks[0].rounds) == 2
+    assert context.tasks[0].total_score == 7
+
+
+def test_coding_review_page_renders_task_accordion(client, isolated_db) -> None:
+    """Coding review page renders per-task accordion with final submit."""
+    interview_id = seed_completed_coding_interview("results-coding-page-1")
+    response = client.get(f"/interview/{interview_id}/coding")
+    assert response.status_code == 200
+    assert "Coding Tasks" in response.text
+    assert "cod-001" in response.text
+    assert "Works for the sample case." in response.text
diff --git a/tests/test_coding_runner.py b/tests/coding/services/test_runner.py
similarity index 100%
rename from tests/test_coding_runner.py
rename to tests/coding/services/test_runner.py
diff --git a/tests/test_coding_section_service.py b/tests/coding/services/test_section.py
similarity index 100%
rename from tests/test_coding_section_service.py
rename to tests/coding/services/test_section.py
diff --git a/tests/conftest.py b/tests/conftest.py
index 8077652..9e523c0 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -116,4 +116,4 @@ def uow(isolated_db):
         yield work
 
 
-pytest_plugins = ["tests.test_questions"]
+pytest_plugins = ["tests.shared.test_questions"]
diff --git a/tests/helpers/completed_session_seed.py b/tests/helpers/completed_session_seed.py
new file mode 100644
index 0000000..b759b7e
--- /dev/null
+++ b/tests/helpers/completed_session_seed.py
@@ -0,0 +1,124 @@
+# Copyright 2026 GrillKit Contributors
+# SPDX-License-Identifier: Apache-2.0
+"""Test helpers for seeding completed interview sessions."""
+
+import json
+
+from app.interview.repositories.uow import InterviewUnitOfWork
+from app.shared.infrastructure.models import Answer, Interview
+from tests.helpers.coding_seed import (
+    attach_coding_tasks,
+    create_coding_section_for_interview,
+)
+from tests.helpers.interview_seed import persist_interview_with_answers
+from tests.helpers.selection import minimal_selection_spec
+
+
+def seed_completed_theory_interview(interview_id: str = "results-theory-1") -> str:
+    """Persist a completed theory interview with one answered question.
+
+    Args:
+        interview_id: Interview primary key.
+
+    Returns:
+        Interview UUID.
+    """
+    persist_interview_with_answers(
+        Interview(
+            id=interview_id,
+            locale="en",
+            selection_spec=minimal_selection_spec(categories=["basics"]),
+            status="active",
+        ),
+        [
+            Answer(
+                question_id="q1",
+                order=1,
+                round=0,
+                question_text="What is Python?",
+                answer_text="A programming language",
+                score=4,
+                feedback="Clear and concise.",
+            )
+        ],
+    )
+    overall_feedback = {
+        "overall_feedback": "Good theory performance.",
+        "strengths_summary": ["basics"],
+        "topics_to_review": [],
+        "score_breakdown": {
+            "theory": {
+                "score": 4,
+                "max": 5,
+                "skipped": False,
+                "questions": {"q1": {"score": 4, "max": 5}},
+            }
+        },
+    }
+    with InterviewUnitOfWork(auto_commit=True) as uow:
+        aggregate = uow.interviews.get_aggregate(interview_id)
+        assert aggregate is not None
+        completed = aggregate.with_session_completed(overall_feedback)
+        uow.interviews.save_aggregate(completed)
+    return interview_id
+
+
+def seed_completed_coding_interview(interview_id: str = "results-coding-1") -> str:
+    """Persist a completed coding-only interview with one submitted task.
+
+    Args:
+        interview_id: Interview primary key.
+
+    Returns:
+        Interview UUID.
+    """
+    with InterviewUnitOfWork(auto_commit=True) as uow:
+        interview = Interview(
+            id=interview_id,
+            locale="en",
+            selection_spec=json.dumps(
+                {
+                    "version": 2,
+                    "session_mode": "coding_only",
+                    "theory": {"enabled": False},
+                    "coding": {"enabled": True},
+                }
+            ),
+            session_mode="coding_only",
+            status="active",
+        )
+        uow.interviews.add(interview)
+        uow.flush()
+        section = create_coding_section_for_interview(
+            uow.session,
+            interview,
+            task_count=1,
+            status="completed",
+        )
+        tasks = attach_coding_tasks(uow.session, section, task_ids=["cod-001"])
+        task = tasks[0]
+        task.submitted_code = "def solve():\n    return 1"
+        task.score = 4
+        task.feedback = "Works for the sample case."
+        task.submit_test_summary = json.dumps(
+            {"status": "success", "tests_passed": 2, "tests_total": 2}
+        )
+        uow.session.add(task)
+        overall_feedback = {
+            "overall_feedback": "Good coding performance.",
+            "strengths_summary": ["problem solving"],
+            "topics_to_review": [],
+            "score_breakdown": {
+                "coding": {
+                    "score": 4,
+                    "max": 5,
+                    "skipped": False,
+                    "questions": {"cod-001": {"score": 4, "max": 5}},
+                }
+            },
+        }
+        aggregate = uow.interviews.get_aggregate(interview_id)
+        assert aggregate is not None
+        completed = aggregate.with_session_completed(overall_feedback)
+        uow.interviews.save_aggregate(completed)
+    return interview_id
diff --git a/tests/interview/__init__.py b/tests/interview/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/interview/api/__init__.py b/tests/interview/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_interview_errors.py b/tests/interview/api/test_errors.py
similarity index 100%
rename from tests/test_interview_errors.py
rename to tests/interview/api/test_errors.py
diff --git a/tests/interview/api/test_results.py b/tests/interview/api/test_results.py
new file mode 100644
index 0000000..418cb8b
--- /dev/null
+++ b/tests/interview/api/test_results.py
@@ -0,0 +1,23 @@
+# Copyright 2026 GrillKit Contributors
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for completed session results HTTP routes."""
+
+from tests.helpers.completed_session_seed import seed_completed_theory_interview
+
+
+def test_completed_interview_page_redirects_to_results(client, isolated_db) -> None:
+    """Completed sessions no longer render the active interview page."""
+    interview_id = seed_completed_theory_interview("results-redirect-1")
+    response = client.get(f"/interview/{interview_id}", follow_redirects=False)
+    assert response.status_code == 303
+    assert response.headers["location"] == f"/interview/{interview_id}/results"
+
+
+def test_results_page_renders_for_completed_session(client, isolated_db) -> None:
+    """Results hub renders overall feedback and section cards."""
+    interview_id = seed_completed_theory_interview("results-page-1")
+    response = client.get(f"/interview/{interview_id}/results")
+    assert response.status_code == 200
+    assert "Overall Evaluation" in response.text
+    assert "View details" in response.text
+    assert "Good theory performance." in response.text
diff --git a/tests/interview/api/test_routes.py b/tests/interview/api/test_routes.py
new file mode 100644
index 0000000..b2fa603
--- /dev/null
+++ b/tests/interview/api/test_routes.py
@@ -0,0 +1,64 @@
+# Copyright 2026 GrillKit Contributors
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for interview HTTP routes (dashboard and legacy endpoints)."""
+
+from unittest.mock import patch
+
+
+class TestDashboardRouter:
+    """Tests for the dashboard home page (GET /), with list_rows patched."""
+
+    def test_dashboard_includes_interview_history(self, client):
+        """Dashboard renders a history row supplied by the patched list_rows."""
+        mock_rows = [
+            type(  # throwaway class; its attributes stand in for a history row
+                "Row",
+                (),
+                {
+                    "id": "id-1",
+                    "title": "Python Interview",
+                    "question_count": 5,
+                    "score_display": "10 / 15",
+                    "status": "completed",
+                    "status_label": "Completed",
+                    "datetime_display": "18 May 2026, 14:30",
+                    "url": "/interview/id-1",
+                },
+            )(),
+        ]
+        with patch(
+            "app.interview.services.dashboard.DashboardBuilder.list_rows",
+            return_value=mock_rows,
+        ):
+            response = client.get("/")
+            assert response.status_code == 200
+            assert "Interview history" in response.text
+            assert "Python Interview" in response.text
+
+    def test_dashboard_returns_html(self, client):
+        """Dashboard returns HTML when the patched list_rows yields no history."""
+        with patch(
+            "app.interview.services.dashboard.DashboardBuilder.list_rows",
+            return_value=[],
+        ):
+            response = client.get("/")
+        assert response.status_code == 200
+        assert "text/html" in response.headers.get("content-type", "")
+        assert "Dashboard" in response.text
+
+
+class TestInterviewHttpRoutes:
+    """Tests for interview HTTP surface (page only; interaction is WebSocket)."""
+
+    def test_legacy_post_answer_removed(self, client):
+        """Legacy form POST answer endpoint is no longer registered."""
+        response = client.post(
+            "/interview/test-id/answer",
+            data={"question_id": "q1", "answer_text": "text"},
+        )
+        assert response.status_code == 404
+
+    def test_legacy_post_complete_removed(self, client):
+        """Legacy form POST complete endpoint is no longer registered."""
+        response = client.post("/interview/test-id/complete")
+        assert response.status_code == 404
diff --git a/tests/test_setup_api.py b/tests/interview/api/test_setup.py
similarity index 100%
rename from tests/test_setup_api.py
rename to tests/interview/api/test_setup.py
diff --git a/tests/interview/repositories/__init__.py b/tests/interview/repositories/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_repositories.py b/tests/interview/repositories/test_interview.py
similarity index 100%
rename from tests/test_repositories.py
rename to tests/interview/repositories/test_interview.py
diff --git a/tests/interview/services/__init__.py b/tests/interview/services/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/interview/services/rules/__init__.py b/tests/interview/services/rules/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_interview_timer.py b/tests/interview/services/rules/test_feedback.py
similarity index 100%
rename from tests/test_interview_timer.py
rename to tests/interview/services/rules/test_feedback.py
diff --git a/tests/test_interview_completion.py b/tests/interview/services/test_completion.py
similarity index 100%
rename from tests/test_interview_completion.py
rename to tests/interview/services/test_completion.py
diff --git a/tests/test_interview_creation.py b/tests/interview/services/test_creation.py
similarity index 99%
rename from tests/test_interview_creation.py
rename to tests/interview/services/test_creation.py
index a191570..f782678 100644
--- a/tests/test_interview_creation.py
+++ b/tests/interview/services/test_creation.py
@@ -209,7 +209,7 @@ def test_create_coding_only_session(isolated_db, monkeypatch) -> None:
     assert section.status == "active"
     assert section.task_count == 1
     assert len(section.tasks) == 1
-    assert section.tasks[0].task_id == "bas-004"
+    assert section.tasks[0].task_id.startswith("bas-")
     assert section.task_time_limit_seconds == 600
 
 
diff --git a/tests/test_dashboard_query.py b/tests/interview/services/test_dashboard.py
similarity index 100%
rename from tests/test_dashboard_query.py
rename to tests/interview/services/test_dashboard.py
diff --git a/tests/test_interview_page.py b/tests/interview/services/test_page.py
similarity index 100%
rename from tests/test_interview_page.py
rename to tests/interview/services/test_page.py
diff --git a/tests/test_session_phases.py b/tests/interview/services/test_phases.py
similarity index 100%
rename from tests/test_session_phases.py
rename to tests/interview/services/test_phases.py
diff --git a/tests/interview/services/test_results_page.py b/tests/interview/services/test_results_page.py
new file mode 100644
index 0000000..846a168
--- /dev/null
+++ b/tests/interview/services/test_results_page.py
@@ -0,0 +1,20 @@
+# Copyright 2026 GrillKit Contributors
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for SessionResultsPageService."""
+
+from app.interview.repositories.uow import InterviewUnitOfWork
+from app.interview.services.results_page import SessionResultsPageService
+from tests.helpers.completed_session_seed import seed_completed_theory_interview
+
+
+def test_session_results_page_service_builds_section_cards(isolated_db) -> None:
+    """Results hub includes enabled section cards with review links."""
+    interview_id = seed_completed_theory_interview("results-hub-1")
+    with InterviewUnitOfWork() as uow:
+        interview = uow.interviews.get_read_model(interview_id)
+    assert interview is not None
+    context = SessionResultsPageService.build_context(interview)
+    assert context is not None
+    assert context.theory_review_url == f"/interview/{interview_id}/theory"
+    assert len(context.section_cards) == 1
+    assert context.section_cards[0].section == "theory"
diff --git a/tests/test_section_feedback.py b/tests/interview/services/test_section_feedback.py
similarity index 100%
rename from tests/test_section_feedback.py
rename to tests/interview/services/test_section_feedback.py
diff --git a/tests/test_interview_selection.py b/tests/interview/services/test_selection.py
similarity index 100%
rename from tests/test_interview_selection.py
rename to tests/interview/services/test_selection.py
diff --git a/tests/test_session_evaluation.py b/tests/interview/services/test_session_evaluation.py
similarity index 100%
rename from tests/test_session_evaluation.py
rename to tests/interview/services/test_session_evaluation.py
diff --git a/tests/platform/__init__.py b/tests/platform/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/platform/api/__init__.py b/tests/platform/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/platform/api/test_config.py b/tests/platform/api/test_config.py
new file mode 100644
index 0000000..8bb5e54
--- /dev/null
+++ b/tests/platform/api/test_config.py
@@ -0,0 +1,241 @@
+# Copyright 2026 GrillKit Contributors
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for platform config HTTP routes."""
+
+from unittest.mock import patch
+
+import pytest
+
+from app.ai.llm_models import LLMModelEntry
+from app.platform.services.config import AppConfig
+
+
+class TestConfigRouter:
+    """Tests for config router endpoints."""
+
+    _catalog_entry = LLMModelEntry(  # stub returned by the patched get_model calls
+        id="cloud",
+        display_name="Cloud",
+        provider_type="openai-compatible",
+        model="gpt-4",
+        base_url="https://api.openai.com",
+        api_key_required=True,
+        api_key="stored-secret",  # same key as `existing` in the blank-key test
+    )
+
+    def _config_form_data(self, **overrides):
+        """Build a valid config form payload."""
+        data = {
+            "llm_preset_id": "cloud",
+            "api_key": "test-key",
+            "timeout": 60.0,
+            "locale": "en",
+        }
+        data.update(overrides)
+        return data
+
+    def test_config_page_get(self, client):
+        """Test GET /config endpoint returns HTML."""
+        mock_config = AppConfig(
+            provider_type="openai-compatible",
+            base_url="https://api.openai.com",
+            model="gpt-4",
+            api_key="test-key",
+        )
+
+        with (
+            patch(
+                "app.platform.services.config.ConfigService.get_config",
+                return_value=mock_config,
+            ),
+        ):
+            response = client.get("/config")
+            assert response.status_code == 200
+            assert "text/html" in response.headers.get("content-type", "")
+            assert "Interview model" in response.text
+            assert "Add model to catalog" in response.text
+
+    def test_config_page_get_no_config(self, client):
+        """Test GET /config without existing config."""
+        with (
+            patch(
+                "app.platform.services.config.ConfigService.get_config",
+                return_value=None,
+            ),
+        ):
+            response = client.get("/config")
+            assert response.status_code == 200
+            assert "Interview model" in response.text
+            assert "Speech recognition model" in response.text
+            assert "Question voice (TTS)" in response.text
+
+    async def test_save_config_preserves_api_key_when_field_empty(self, client):
+        """POST /config keeps the stored key when the password field is left blank."""
+        existing = AppConfig(
+            provider_type="openai-compatible",
+            base_url="https://api.openai.com",
+            model="gpt-4",
+            api_key="stored-secret",
+            llm_preset_id="cloud",
+        )
+        with (
+            patch(
+                "app.platform.services.config.ConfigService.get_config",
+                return_value=existing,
+            ),
+            patch(
+                "app.platform.services.config_form.normalize_model_id",
+                return_value="cloud",
+            ),
+            patch(
+                "app.platform.api.config.LLMCatalogService.get_model",
+                return_value=self._catalog_entry,
+            ),
+            patch(
+                "app.platform.services.config.LLMCatalogService.get_model",
+                return_value=self._catalog_entry,
+            ),
+            patch(
+                "app.platform.services.config.ConfigService.test_connection",
+                return_value=(True, "OK"),
+            ),
+            patch(
+                "app.platform.services.config.ConfigService.save_config"
+            ) as mock_save,
+        ):
+            response = client.post(
+                "/config",
+                data=self._config_form_data(api_key=""),
+            )
+
+        assert response.status_code == 200
+        saved = mock_save.call_args[0][0]
+        assert saved.api_key == "stored-secret"
+
+    @pytest.mark.asyncio
+    async def test_save_config_success(self, client):
+        """Test POST /config with successful connection test."""
+        with (
+            patch(
+                "app.platform.services.config_form.normalize_model_id",
+                return_value="cloud",
+            ),
+            patch(
+                "app.platform.api.config.LLMCatalogService.get_model",
+                return_value=self._catalog_entry,
+            ),
+            patch(
+                "app.platform.services.config.LLMCatalogService.get_model",
+                return_value=self._catalog_entry,
+            ),
+            patch(
+                "app.platform.services.config.ConfigService.test_connection",
+                return_value=(True, "OK"),
+            ),
+            patch(
+                "app.platform.services.config.ConfigService.save_config"
+            ) as mock_save,
+        ):
+            response = client.post(
+                "/config",
+                data=self._config_form_data(),
+            )
+
+            assert response.status_code == 200
+            mock_save.assert_called_once()
+
+    @pytest.mark.asyncio
+    async def test_save_config_failure(self, client):
+        """Test POST /config with failed connection test."""
+        with (
+            patch(
+                "app.platform.services.config_form.normalize_model_id",
+                return_value="cloud",
+            ),
+            patch(
+                "app.platform.api.config.LLMCatalogService.get_model",
+                return_value=self._catalog_entry,
+            ),
+            patch(
+                "app.platform.services.config.LLMCatalogService.get_model",
+                return_value=self._catalog_entry,
+            ),
+            patch(
+                "app.platform.services.config.ConfigService.test_connection",
+                return_value=(False, "Connection failed"),
+            ),
+        ):
+            response = client.post(
+                "/config",
+                data=self._config_form_data(),
+            )
+
+            assert response.status_code == 200
+
+    def test_delete_config(self, client):
+        """DELETE /config returns 200 and calls ConfigService.delete_config once."""
+        with (
+            patch(
+                "app.platform.services.config.ConfigService.delete_config"
+            ) as mock_delete,
+        ):
+            response = client.delete("/config")
+
+            assert response.status_code == 200
+            mock_delete.assert_called_once()  # the mocked service method was invoked
+
+    @pytest.mark.asyncio
+    async def test_test_config_success(self, client):  # NOTE(review): body never awaits
+        """POST /config/test returns 200 when the mocked connection test succeeds."""
+        with (
+            patch(
+                "app.platform.services.config_form.normalize_model_id",
+                return_value="cloud",
+            ),
+            patch(
+                "app.platform.api.config.LLMCatalogService.get_model",
+                return_value=self._catalog_entry,
+            ),
+            patch(
+                "app.platform.services.config.LLMCatalogService.get_model",
+                return_value=self._catalog_entry,
+            ),
+            patch(
+                "app.platform.services.config.ConfigService.test_connection",
+                return_value=(True, "Connection successful"),  # mocked success result
+            ),
+        ):
+            response = client.post(
+                "/config/test",
+                data=self._config_form_data(),
+            )
+
+            assert response.status_code == 200  # only the HTTP status is checked
+
+    @pytest.mark.asyncio
+    async def test_test_config_failure(self, client):  # NOTE(review): body never awaits
+        """POST /config/test returns 200 when the mocked connection test fails."""
+        with (
+            patch(
+                "app.platform.services.config_form.normalize_model_id",
+                return_value="cloud",
+            ),
+            patch(
+                "app.platform.api.config.LLMCatalogService.get_model",
+                return_value=self._catalog_entry,
+            ),
+            patch(
+                "app.platform.services.config.LLMCatalogService.get_model",
+                return_value=self._catalog_entry,
+            ),
+            patch(
+                "app.platform.services.config.ConfigService.test_connection",
+                return_value=(False, "Invalid API key"),  # mocked failure result
+            ),
+        ):
+            response = client.post(
+                "/config/test",
+                data=self._config_form_data(api_key="invalid-key"),
+            )
+
+            assert response.status_code == 200  # only the HTTP status is checked
diff --git a/tests/platform/services/__init__.py b/tests/platform/services/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_config_service.py b/tests/platform/services/test_config.py
similarity index 100%
rename from tests/test_config_service.py
rename to tests/platform/services/test_config.py
diff --git a/tests/test_llm_catalog.py b/tests/platform/services/test_llm_catalog.py
similarity index 100%
rename from tests/test_llm_catalog.py
rename to tests/platform/services/test_llm_catalog.py
diff --git a/tests/question_voice/__init__.py b/tests/question_voice/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/question_voice/api/__init__.py b/tests/question_voice/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_tts_api.py b/tests/question_voice/api/test_tts.py
similarity index 100%
rename from tests/test_tts_api.py
rename to tests/question_voice/api/test_tts.py
diff --git a/tests/question_voice/services/__init__.py b/tests/question_voice/services/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_piper_storage.py b/tests/question_voice/services/test_piper_storage.py
similarity index 100%
rename from tests/test_piper_storage.py
rename to tests/question_voice/services/test_piper_storage.py
diff --git a/tests/test_tts_cache.py b/tests/question_voice/services/test_tts_cache.py
similarity index 100%
rename from tests/test_tts_cache.py
rename to tests/question_voice/services/test_tts_cache.py
diff --git a/tests/shared/__init__.py b/tests/shared/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/shared/infrastructure/__init__.py b/tests/shared/infrastructure/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_alembic_migrations.py b/tests/shared/infrastructure/test_alembic_migrations.py
similarity index 100%
rename from tests/test_alembic_migrations.py
rename to tests/shared/infrastructure/test_alembic_migrations.py
diff --git a/tests/test_artifact_download.py b/tests/shared/infrastructure/test_artifact_download.py
similarity index 100%
rename from tests/test_artifact_download.py
rename to tests/shared/infrastructure/test_artifact_download.py
diff --git a/tests/test_artifact_status.py b/tests/shared/infrastructure/test_artifact_status.py
similarity index 100%
rename from tests/test_artifact_status.py
rename to tests/shared/infrastructure/test_artifact_status.py
diff --git a/tests/test_audio_wav.py b/tests/shared/infrastructure/test_audio_wav.py
similarity index 100%
rename from tests/test_audio_wav.py
rename to tests/shared/infrastructure/test_audio_wav.py
diff --git a/tests/test_database.py b/tests/shared/infrastructure/test_database.py
similarity index 100%
rename from tests/test_database.py
rename to tests/shared/infrastructure/test_database.py
diff --git a/tests/test_hf_download_progress.py b/tests/shared/infrastructure/test_hf_download_progress.py
similarity index 100%
rename from tests/test_hf_download_progress.py
rename to tests/shared/infrastructure/test_hf_download_progress.py
diff --git a/tests/test_hf_hub_runtime.py b/tests/shared/infrastructure/test_hf_hub_runtime.py
similarity index 100%
rename from tests/test_hf_hub_runtime.py
rename to tests/shared/infrastructure/test_hf_hub_runtime.py
diff --git a/tests/test_uow.py b/tests/shared/infrastructure/test_uow.py
similarity index 100%
rename from tests/test_uow.py
rename to tests/shared/infrastructure/test_uow.py
diff --git a/tests/test_coding_tasks.py b/tests/shared/test_coding.py
similarity index 93%
rename from tests/test_coding_tasks.py
rename to tests/shared/test_coding.py
index 52afa9a..fe1f9ae 100644
--- a/tests/test_coding_tasks.py
+++ b/tests/shared/test_coding.py
@@ -168,11 +168,12 @@ def test_load_categories_merges_and_dedupes(self, temp_coding_dir) -> None:
         assert [task.id for task in tasks] == ["algo-001", "algo-002", "algo-003"]
 
     def test_load_real_python_junior_basics(self) -> None:
-        """Load migrated production task bank entry."""
+        """Load the production basics bank and check the bas-004 type-hints task."""
         tasks = load_category("python", "junior", "basics", locale="en")
-        assert len(tasks) == 1
-        assert tasks[0].id == "bas-004"
-        assert tasks[0].coding.evaluation_mode == "ai"
-        assert tasks[0].coding.starter_code is not None
-        assert "def process" in tasks[0].coding.starter_code
-        assert "type hints" in tasks[0].text.lower()
+        by_id = {task.id: task for task in tasks}  # look up by id, not position
+        assert "bas-004" in by_id
+        task = by_id["bas-004"]
+        assert task.coding.evaluation_mode == "ai"
+        assert task.coding.starter_code is not None
+        assert "def process" in task.coding.starter_code
+        assert "type hints" in task.text.lower()
diff --git a/tests/test_locales.py b/tests/shared/test_locales.py
similarity index 100%
rename from tests/test_locales.py
rename to tests/shared/test_locales.py
diff --git a/tests/test_questions.py b/tests/shared/test_questions.py
similarity index 100%
rename from tests/test_questions.py
rename to tests/shared/test_questions.py
diff --git a/tests/test_speech_models.py b/tests/shared/test_speech_models.py
similarity index 100%
rename from tests/test_speech_models.py
rename to tests/shared/test_speech_models.py
diff --git a/tests/shared/test_structured_evaluation.py b/tests/shared/test_structured_evaluation.py
new file mode 100644
index 0000000..6ee54a8
--- /dev/null
+++ b/tests/shared/test_structured_evaluation.py
@@ -0,0 +1,102 @@
+# Copyright 2026 GrillKit Contributors
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for shared structured LLM evaluation helpers."""
+
+import json
+
+import pytest
+
+from app.ai.base import GenerationResult, Message
+from app.shared.structured_evaluation import generate_and_parse_json_response
+from app.theory.services.evaluator.models import AnswerEvaluation
+
+
+class _SequencedGenerateProvider:
+    """Provider stub that replays queued generation results in call order."""
+
+    def __init__(self, results: list[GenerationResult]) -> None:
+        self._results = list(results)  # private copy of the queued results
+        self.calls = 0  # number of results handed out so far
+        self.max_tokens_history: list[int] = []  # max_tokens of each generate() call
+
+    async def generate(
+        self,
+        messages: list[Message],
+        temperature: float = 0.7,
+        max_tokens: int = 2000,
+    ) -> GenerationResult:
+        del messages, temperature  # accepted for signature parity, unused here
+        self.max_tokens_history.append(max_tokens)  # recorded before the queue check
+        if self.calls >= len(self._results):
+            raise ValueError("No more queued provider results")
+        result = self._results[self.calls]
+        self.calls += 1  # not incremented when the queue is exhausted
+        return result
+
+
+@pytest.mark.asyncio
+async def test_generate_and_parse_json_response_retries_truncated_json() -> None:
+    """A truncated, unparseable JSON reply is retried once with max_tokens doubled."""
+    valid_payload = json.dumps(
+        {
+            "score": 4,
+            "feedback": "Solid answer with minor gaps.",
+            "strengths": ["clear structure"],
+            "weaknesses": ["missed edge cases"],
+            "follow_up_needed": False,
+            "follow_up_question": None,
+        }
+    )
+    provider = _SequencedGenerateProvider(
+        [
+            GenerationResult(
+                content='{"score": 4, "feedback": "Solid answer but cut off',
+                finish_reason="length",  # first reply: cut off mid-string
+            ),
+            GenerationResult(content=valid_payload, finish_reason="stop"),  # retry reply
+        ]
+    )
+    messages = [
+        Message(role="system", content="Evaluate the answer."),
+        Message(role="user", content="Question and answer text."),
+    ]
+
+    result = await generate_and_parse_json_response(
+        provider,
+        messages=messages,
+        response_model=AnswerEvaluation,
+        max_tokens=1000,
+    )
+
+    assert result.score == 4
+    assert provider.calls == 2  # both queued replies were consumed
+    assert provider.max_tokens_history == [1000, 2000]  # retry used twice the budget
+
+
+@pytest.mark.asyncio
+async def test_generate_and_parse_json_response_does_not_retry_validation_error() -> (
+    None
+):
+    """Parseable JSON that fails schema validation raises without a retry."""
+    provider = _SequencedGenerateProvider(
+        [
+            GenerationResult(
+                content=json.dumps({"score": 9, "feedback": "Too high"}),
+                finish_reason="stop",  # complete reply, so the JSON itself parses
+            ),
+        ]
+    )
+    messages = [
+        Message(role="system", content="Evaluate the answer."),
+        Message(role="user", content="Question and answer text."),
+    ]
+
+    with pytest.raises(ValueError, match="validation failed"):
+        await generate_and_parse_json_response(
+            provider,
+            messages=messages,
+            response_model=AnswerEvaluation,
+            max_tokens=1000,
+        )
+
+    assert provider.calls == 1  # a second call would exhaust the one-item queue
diff --git a/tests/speech/__init__.py b/tests/speech/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/speech/api/__init__.py b/tests/speech/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_dictation_ws.py b/tests/speech/api/test_dictation_ws.py
similarity index 100%
rename from tests/test_dictation_ws.py
rename to tests/speech/api/test_dictation_ws.py
diff --git a/tests/test_speech_api.py b/tests/speech/api/test_routes.py
similarity index 100%
rename from tests/test_speech_api.py
rename to tests/speech/api/test_routes.py
diff --git a/tests/speech/services/__init__.py b/tests/speech/services/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_speech_recognition.py b/tests/speech/services/test_dictation.py
similarity index 100%
rename from tests/test_speech_recognition.py
rename to tests/speech/services/test_dictation.py
diff --git a/tests/test_whisper_runtime.py b/tests/speech/services/test_whisper_runtime.py
similarity index 100%
rename from tests/test_whisper_runtime.py
rename to tests/speech/services/test_whisper_runtime.py
diff --git a/tests/test_api_routers.py b/tests/test_api_routers.py
deleted file mode 100644
index 463c9c6..0000000
--- a/tests/test_api_routers.py
+++ /dev/null
@@ -1,540 +0,0 @@
-# Copyright 2026 GrillKit Contributors
-# SPDX-License-Identifier: Apache-2.0
-"""Tests for API routers."""
-
-import time
-from typing import Any
-from unittest.mock import ANY, AsyncMock, patch
-
-from fastapi.testclient import TestClient
-import pytest
-
-from app.ai.llm_models import LLMModelEntry
-from app.interview.domain.exceptions import (
-    InterviewNotActiveError,
-    InterviewNotFoundError,
-)
-from app.main import create_app
-from app.platform.services.config import AppConfig
-
-
-async def _raising_answer_stream(
-    exc: Exception,
-    interview_id: str,
-    question_id: str,
-    answer_text: str,
-    **kwargs: Any,
-) -> None:
-    raise exc
-    yield  # type: ignore[misc, unreachable]
-
-
-@pytest.fixture
-def client():
-    """Create a test client with mocked database."""
-    from app.interview.api.deps import get_ai_provider
-    from tests.fakes import FakeProvider
-
-    async def _fake_ai_provider():
-        yield FakeProvider([])
-
-    with (
-        patch("app.main.run_migrations"),
-        patch(
-            "app.platform.services.speech_runtime.SpeechRuntimeCoordinator.startup",
-            new=AsyncMock(),
-        ),
-        patch(
-            "app.platform.services.speech_runtime.SpeechRuntimeCoordinator.unload_all",
-        ),
-    ):
-        app = create_app()
-        app.dependency_overrides[get_ai_provider] = _fake_ai_provider
-        with TestClient(app) as test_client:
-            yield test_client
-        app.dependency_overrides.clear()
-
-
-class MockInterview:
-    """Minimal mock of Interview for WebSocket tests."""
-
-    def __init__(self, status: str = "active"):
-        self.id = "test-session-id"
-        self.status = status
-        self.answers = []
-        self.question_count = 5
-        self.locale = "en"
-        self.selection_spec = (
-            '{"sources":[{"track":"python","level":"junior",'
-            '"categories":["data-structures"]}]}'
-        )
-        self.score = None
-        self.overall_feedback = None
-
-
-class TestDashboardRouter:
-    """Tests for the dashboard home page."""
-
-    def test_dashboard_includes_interview_history(self, client):
-        """Dashboard passes interview history to the template."""
-        mock_rows = [
-            type(
-                "Row",
-                (),
-                {
-                    "id": "id-1",
-                    "title": "Python Interview",
-                    "question_count": 5,
-                    "score_display": "10 / 15",
-                    "status": "completed",
-                    "status_label": "Completed",
-                    "datetime_display": "18 May 2026, 14:30",
-                    "url": "/interview/id-1",
-                },
-            )(),
-        ]
-        with patch(
-            "app.interview.services.dashboard.DashboardBuilder.list_rows",
-            return_value=mock_rows,
-        ):
-            response = client.get("/")
-            assert response.status_code == 200
-            assert "Interview history" in response.text
-            assert "Python Interview" in response.text
-
-    def test_dashboard_returns_html(self, client):
-        """Dashboard always returns HTML, even without provider config."""
-        with patch(
-            "app.interview.services.dashboard.DashboardBuilder.list_rows",
-            return_value=[],
-        ):
-            response = client.get("/")
-        assert response.status_code == 200
-        assert "text/html" in response.headers.get("content-type", "")
-        assert "Dashboard" in response.text
-
-
-class TestConfigRouter:
-    """Tests for config router endpoints."""
-
-    _catalog_entry = LLMModelEntry(
-        id="cloud",
-        display_name="Cloud",
-        provider_type="openai-compatible",
-        model="gpt-4",
-        base_url="https://api.openai.com",
-        api_key_required=True,
-        api_key="stored-secret",
-    )
-
-    def _config_form_data(self, **overrides):
-        """Build a valid config form payload."""
-        data = {
-            "llm_preset_id": "cloud",
-            "api_key": "test-key",
-            "timeout": 60.0,
-            "locale": "en",
-        }
-        data.update(overrides)
-        return data
-
-    def test_config_page_get(self, client):
-        """Test GET /config endpoint returns HTML."""
-        mock_config = AppConfig(
-            provider_type="openai-compatible",
-            base_url="https://api.openai.com",
-            model="gpt-4",
-            api_key="test-key",
-        )
-
-        with (
-            patch(
-                "app.platform.services.config.ConfigService.get_config",
-                return_value=mock_config,
-            ),
-        ):
-            response = client.get("/config")
-            assert response.status_code == 200
-            assert "text/html" in response.headers.get("content-type", "")
-            assert "Interview model" in response.text
-            assert "Add model to catalog" in response.text
-
-    def test_config_page_get_no_config(self, client):
-        """Test GET /config without existing config."""
-        with (
-            patch(
-                "app.platform.services.config.ConfigService.get_config",
-                return_value=None,
-            ),
-        ):
-            response = client.get("/config")
-            assert response.status_code == 200
-            assert "Interview model" in response.text
-            assert "Speech recognition model" in response.text
-            assert "Question voice (TTS)" in response.text
-
-    async def test_save_config_preserves_api_key_when_field_empty(self, client):
-        """POST /config keeps the stored key when the password field is left blank."""
-        existing = AppConfig(
-            provider_type="openai-compatible",
-            base_url="https://api.openai.com",
-            model="gpt-4",
-            api_key="stored-secret",
-            llm_preset_id="cloud",
-        )
-        with (
-            patch(
-                "app.platform.services.config.ConfigService.get_config",
-                return_value=existing,
-            ),
-            patch(
-                "app.platform.services.config_form.normalize_model_id",
-                return_value="cloud",
-            ),
-            patch(
-                "app.platform.api.config.LLMCatalogService.get_model",
-                return_value=self._catalog_entry,
-            ),
-            patch(
-                "app.platform.services.config.LLMCatalogService.get_model",
-                return_value=self._catalog_entry,
-            ),
-            patch(
-                "app.platform.services.config.ConfigService.test_connection",
-                return_value=(True, "OK"),
-            ),
-            patch(
-                "app.platform.services.config.ConfigService.save_config"
-            ) as mock_save,
-        ):
-            response = client.post(
-                "/config",
-                data=self._config_form_data(api_key=""),
-            )
-
-        assert response.status_code == 200
-        saved = mock_save.call_args[0][0]
-        assert saved.api_key == "stored-secret"
-
-    @pytest.mark.asyncio
-    async def test_save_config_success(self, client):
-        """Test POST /config with successful connection test."""
-        with (
-            patch(
-                "app.platform.services.config_form.normalize_model_id",
-                return_value="cloud",
-            ),
-            patch(
-                "app.platform.api.config.LLMCatalogService.get_model",
-                return_value=self._catalog_entry,
-            ),
-            patch(
-                "app.platform.services.config.LLMCatalogService.get_model",
-                return_value=self._catalog_entry,
-            ),
-            patch(
-                "app.platform.services.config.ConfigService.test_connection",
-                return_value=(True, "OK"),
-            ),
-            patch(
-                "app.platform.services.config.ConfigService.save_config"
-            ) as mock_save,
-        ):
-            response = client.post(
-                "/config",
-                data=self._config_form_data(),
-            )
-
-            assert response.status_code == 200
-            mock_save.assert_called_once()
-
-    @pytest.mark.asyncio
-    async def test_save_config_failure(self, client):
-        """Test POST /config with failed connection test."""
-        with (
-            patch(
-                "app.platform.services.config_form.normalize_model_id",
-                return_value="cloud",
-            ),
-            patch(
-                "app.platform.api.config.LLMCatalogService.get_model",
-                return_value=self._catalog_entry,
-            ),
-            patch(
-                "app.platform.services.config.LLMCatalogService.get_model",
-                return_value=self._catalog_entry,
-            ),
-            patch(
-                "app.platform.services.config.ConfigService.test_connection",
-                return_value=(False, "Connection failed"),
-            ),
-        ):
-            response = client.post(
-                "/config",
-                data=self._config_form_data(),
-            )
-
-            assert response.status_code == 200
-
-    def test_delete_config(self, client):
-        """Test DELETE /config endpoint."""
-        with (
-            patch(
-                "app.platform.services.config.ConfigService.delete_config"
-            ) as mock_delete,
-        ):
-            response = client.delete("/config")
-
-            assert response.status_code == 200
-            mock_delete.assert_called_once()
-
-    @pytest.mark.asyncio
-    async def test_test_config_success(self, client):
-        """Test POST /config/test with successful connection."""
-        with (
-            patch(
-                "app.platform.services.config_form.normalize_model_id",
-                return_value="cloud",
-            ),
-            patch(
-                "app.platform.api.config.LLMCatalogService.get_model",
-                return_value=self._catalog_entry,
-            ),
-            patch(
-                "app.platform.services.config.LLMCatalogService.get_model",
-                return_value=self._catalog_entry,
-            ),
-            patch(
-                "app.platform.services.config.ConfigService.test_connection",
-                return_value=(True, "Connection successful"),
-            ),
-        ):
-            response = client.post(
-                "/config/test",
-                data=self._config_form_data(),
-            )
-
-            assert response.status_code == 200
-
-    @pytest.mark.asyncio
-    async def test_test_config_failure(self, client):
-        """Test POST /config/test with failed connection."""
-        with (
-            patch(
-                "app.platform.services.config_form.normalize_model_id",
-                return_value="cloud",
-            ),
-            patch(
-                "app.platform.api.config.LLMCatalogService.get_model",
-                return_value=self._catalog_entry,
-            ),
-            patch(
-                "app.platform.services.config.LLMCatalogService.get_model",
-                return_value=self._catalog_entry,
-            ),
-            patch(
-                "app.platform.services.config.ConfigService.test_connection",
-                return_value=(False, "Invalid API key"),
-            ),
-        ):
-            response = client.post(
-                "/config/test",
-                data=self._config_form_data(api_key="invalid-key"),
-            )
-
-            assert response.status_code == 200
-
-
-class TestInterviewHttpRoutes:
-    """Tests for interview HTTP surface (page only; interaction is WebSocket)."""
-
-    def test_legacy_post_answer_removed(self, client):
-        """Legacy form POST answer endpoint is no longer registered."""
-        response = client.post(
-            "/interview/test-id/answer",
-            data={"question_id": "q1", "answer_text": "text"},
-        )
-        assert response.status_code == 404
-
-    def test_legacy_post_complete_removed(self, client):
-        """Legacy form POST complete endpoint is no longer registered."""
-        response = client.post("/interview/test-id/complete")
-        assert response.status_code == 404
-
-
-class TestInterviewWebSocket:
-    """Tests for WebSocket interview endpoint."""
-
-    def test_websocket_unknown_message(self, client):
-        """Test WebSocket returns error for unknown message type."""
-        with (
-            patch("app.interview.services.query.InterviewQuery.get_interview"),
-            client.websocket_connect("/interview/test-id/theory/ws") as ws,
-        ):
-            ws.send_json({"type": "unknown_command"})
-            response = ws.receive_json()
-            assert response["type"] == "error"
-            assert "Unknown message type" in response["message"]
-
-    def test_websocket_answer_success(self, client):
-        """Test WebSocket answer submission invokes stream_answer_submission."""
-        stream_calls: list[tuple[str, str, str]] = []
-
-        async def mock_stream(
-            interview_id: str,
-            question_id: str,
-            answer_text: str,
-            **kwargs: Any,
-        ) -> None:
-            stream_calls.append((interview_id, question_id, answer_text))
-            return
-            yield  # type: ignore[misc, unreachable]
-
-        with (
-            patch(
-                "app.theory.services.submission.TheorySubmissionService.stream_answer_submission",
-                side_effect=mock_stream,
-            ),
-            client.websocket_connect("/interview/test-id/theory/ws") as ws,
-        ):
-            ws.send_json(
-                {
-                    "type": "answer",
-                    "question_id": "ds-001",
-                    "answer_text": "My answer",
-                }
-            )
-            for _ in range(100):
-                if stream_calls:
-                    break
-                time.sleep(0.01)
-            assert stream_calls == [("test-id", "ds-001", "My answer")]
-
-    def test_websocket_answer_missing_fields(self, client):
-        """Test WebSocket returns error when question_id or answer_text is missing."""
-        with (
-            patch("app.interview.services.query.InterviewQuery.get_interview"),
-            client.websocket_connect("/interview/test-id/theory/ws") as ws,
-        ):
-            ws.send_json({"type": "answer", "question_id": ""})
-            response = ws.receive_json()
-            assert response["type"] == "error"
-            assert "Both" in response["message"]
-
-    def test_websocket_answer_completed_session(self, client):
-        """Test WebSocket rejects answer on completed session."""
-        with (
-            patch(
-                "app.theory.services.submission.TheorySubmissionService.stream_answer_submission",
-                side_effect=lambda *args, **kwargs: _raising_answer_stream(
-                    InterviewNotActiveError("test-id"), *args, **kwargs
-                ),
-            ),
-            client.websocket_connect("/interview/test-id/theory/ws") as ws,
-        ):
-            ws.send_json(
-                {
-                    "type": "answer",
-                    "question_id": "ds-001",
-                    "answer_text": "My answer",
-                }
-            )
-            response = ws.receive_json()
-            assert response["type"] == "error"
-            assert "completed" in response["message"].lower()
-
-    def test_websocket_answer_session_not_found(self, client):
-        """Test WebSocket returns error when session is not found."""
-        with (
-            patch(
-                "app.theory.services.submission.TheorySubmissionService.stream_answer_submission",
-                side_effect=lambda *args, **kwargs: _raising_answer_stream(
-                    InterviewNotFoundError("test-id"), *args, **kwargs
-                ),
-            ),
-            client.websocket_connect("/interview/test-id/theory/ws") as ws,
-        ):
-            ws.send_json(
-                {
-                    "type": "answer",
-                    "question_id": "ds-001",
-                    "answer_text": "My answer",
-                }
-            )
-            response = ws.receive_json()
-            assert response["type"] == "error"
-            assert "not found" in response["message"].lower()
-
-    def test_websocket_ping_pong(self, client):
-        """Test WebSocket ping/pong returns session status."""
-        mock_session = MockInterview(status="active")
-
-        with (
-            patch(
-                "app.interview.services.query.InterviewQuery.get_interview",
-                return_value=mock_session,
-            ),
-            client.websocket_connect("/interview/test-id/theory/ws") as ws,
-        ):
-            ws.send_json({"type": "ping"})
-            response = ws.receive_json()
-            assert response["type"] == "pong"
-            assert response["status"] == "active"
-
-    def test_websocket_ping_completed_session(self, client):
-        """Test ping returns completed status."""
-        mock_session = MockInterview(status="completed")
-
-        with (
-            patch(
-                "app.interview.services.query.InterviewQuery.get_interview",
-                return_value=mock_session,
-            ),
-            client.websocket_connect("/interview/test-id/theory/ws") as ws,
-        ):
-            ws.send_json({"type": "ping"})
-            response = ws.receive_json()
-            assert response["type"] == "pong"
-            assert response["status"] == "completed"
-
-    def test_websocket_complete_success(self, client):
-        """Test WebSocket complete message triggers session completion."""
-        with (
-            patch(
-                "app.interview.services.completion.SessionCompletionService.complete_session",
-                new_callable=AsyncMock,
-                return_value=[],
-            ) as mock_complete,
-            client.websocket_connect("/interview/test-id/theory/ws") as ws,
-        ):
-            ws.send_json({"type": "complete"})
-            for _ in range(100):
-                if mock_complete.await_count:
-                    break
-                time.sleep(0.01)
-            mock_complete.assert_awaited_once_with(
-                interview_id="test-id",
-                provider=ANY,
-            )
-
-    def test_websocket_answer_service_error(self, client):
-        """Test WebSocket handles ValueError from service layer."""
-        with (
-            patch(
-                "app.theory.services.submission.TheorySubmissionService.stream_answer_submission",
-                side_effect=lambda *args, **kwargs: _raising_answer_stream(
-                    ValueError("Invalid question"), *args, **kwargs
-                ),
-            ),
-            client.websocket_connect("/interview/test-id/theory/ws") as ws,
-        ):
-            ws.send_json(
-                {
-                    "type": "answer",
-                    "question_id": "ds-001",
-                    "answer_text": "My answer",
-                }
-            )
-            response = ws.receive_json()
-            assert response["type"] == "error"
-            assert "Invalid question" in response["message"]
diff --git a/tests/test_audio_answer_processing.py b/tests/test_audio_answer_processing.py
deleted file mode 100644
index a20f4e3..0000000
--- a/tests/test_audio_answer_processing.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# Copyright 2026 GrillKit Contributors
-# SPDX-License-Identifier: Apache-2.0
-"""Tests for audio answer submission orchestration."""
-
-import asyncio
-
-import pytest
-
-from app.ai.audio_probe import minimal_wav_bytes
-from app.interview.services.events import (
-    AnswerFeedbackEvent,
-    AnswerSavedEvent,
-    EvaluatingEvent,
-    TranscriptEvent,
-)
-from app.interview.services.query import InterviewQuery
-from app.shared.infrastructure.models import Answer, Interview
-from app.theory.services.evaluator.service import TheoryEvaluatorService
-from app.theory.services.submission import TheorySubmissionService
-from tests.fakes import answer_evaluation_json, follow_up_evaluation_json
-from tests.helpers.interview_seed import (
-    persist_interview_with_answers,
-    seed_two_question_interview,
-)
-from tests.helpers.selection import minimal_selection_spec
-from tests.helpers.transcription import FakeTranscriber
-
-
-@pytest.mark.asyncio
-async def test_process_audio_answer_runs_transcription_and_evaluation(
-    isolated_db, fake_ai_provider, monkeypatch
-):
-    """Audio answers yield saved, evaluating, transcript, and feedback events."""
-    monkeypatch.setattr(
-        TheorySubmissionService,
-        "require_audio_answer_enabled",
-        staticmethod(lambda: None),
-    )
-    interview_id = seed_two_question_interview("audio-ap-1")
-    provider = fake_ai_provider(
-        [answer_evaluation_json(score=5, follow_up_needed=False)]
-    )
-    transcriber = FakeTranscriber("spoken answer text")
-    wav_bytes = minimal_wav_bytes(duration_sec=0.2)
-
-    events = await TheorySubmissionService.process_audio_answer_submission(
-        interview_id=interview_id,
-        question_id="q1",
-        wav_bytes=wav_bytes,
-        provider=provider,
-        transcriber=transcriber,
-    )
-
-    assert [type(event) for event in events] == [
-        AnswerSavedEvent,
-        EvaluatingEvent,
-        TranscriptEvent,
-        AnswerFeedbackEvent,
-    ]
-    transcript = events[2]
-    assert isinstance(transcript, TranscriptEvent)
-    assert transcript.text == "spoken answer text"
-    assert transcriber.last_audio is not None
-
-    reloaded = InterviewQuery.get_interview(interview_id)
-    assert reloaded is not None
-    answer = next(a for a in reloaded.answers if a.question_id == "q1" and a.round == 0)
-    assert answer.answer_text == "spoken answer text"
-    assert answer.score == 5
-
-
-@pytest.mark.asyncio
-async def test_process_audio_answer_rejects_invalid_wav(
-    isolated_db, fake_ai_provider, monkeypatch
-):
-    """Invalid WAV payloads fail before any events are emitted."""
-    monkeypatch.setattr(
-        TheorySubmissionService,
-        "require_audio_answer_enabled",
-        staticmethod(lambda: None),
-    )
-    interview_id = seed_two_question_interview("audio-ap-1")
-    provider = fake_ai_provider([answer_evaluation_json()])
-    transcriber = FakeTranscriber()
-
-    with pytest.raises(ValueError, match="valid WAV"):
-        await TheorySubmissionService.process_audio_answer_submission(
-            interview_id=interview_id,
-            question_id="q1",
-            wav_bytes=b"not-wav",
-            provider=provider,
-            transcriber=transcriber,
-        )
-
-
-@pytest.mark.asyncio
-async def test_process_audio_answer_last_follow_up_fast_path(
-    isolated_db, fake_ai_provider, monkeypatch
-):
-    """Last follow-up round advances immediately and transcribes in-band."""
-    monkeypatch.setattr(
-        TheorySubmissionService,
-        "require_audio_answer_enabled",
-        staticmethod(lambda: None),
-    )
-    interview_id = "audio-ap-last-follow-up"
-    initial = Answer(
-        question_id="q1",
-        order=1,
-        round=0,
-        question_text="Original question?",
-    )
-    initial.answer_text = "First answer"
-    initial.score = 3
-    initial.feedback = "OK"
-    first_follow_up = Answer(
-        question_id="q1",
-        order=1,
-        round=1,
-        question_text="First follow-up?",
-    )
-    first_follow_up.answer_text = "First follow-up answer"
-    first_follow_up.score = 3
-    first_follow_up.feedback = "OK"
-    persist_interview_with_answers(
-        Interview(
-            id=interview_id,
-            locale="en",
-            selection_spec=minimal_selection_spec(categories=["basics"]),
-            status="active",
-        ),
-        [
-            initial,
-            first_follow_up,
-            Answer(
-                question_id="q1",
-                order=1,
-                round=2,
-                question_text="Second follow-up?",
-            ),
-            Answer(
-                question_id="q2",
-                order=2,
-                round=0,
-                question_text="Question two?",
-            ),
-        ],
-        question_count=2,
-    )
-
-    provider = fake_ai_provider(
-        [
-            follow_up_evaluation_json(
-                score=4,
-                needs_further_follow_up=False,
-            )
-        ]
-    )
-    transcriber = FakeTranscriber("second follow-up spoken")
-    wav_bytes = minimal_wav_bytes()
-
-    orig_eval = TheoryEvaluatorService.evaluate_submission
-
-    async def slow_audio_eval(**kwargs):
-        await asyncio.sleep(0.05)
-        return await orig_eval(**kwargs)
-
-    monkeypatch.setattr(
-        TheoryEvaluatorService,
-        "evaluate_submission",
-        staticmethod(slow_audio_eval),
-    )
-
-    events = await TheorySubmissionService.process_audio_answer_submission(
-        interview_id=interview_id,
-        question_id="q1",
-        wav_bytes=wav_bytes,
-        provider=provider,
-        transcriber=transcriber,
-    )
-
-    assert len(events) == 3
-    assert isinstance(events[0], AnswerSavedEvent)
-    assert isinstance(events[1], AnswerFeedbackEvent)
-    assert isinstance(events[2], TranscriptEvent)
-    assert not any(isinstance(event, EvaluatingEvent) for event in events)
-
-    reloaded = InterviewQuery.get_interview(interview_id)
-    assert reloaded is not None
-    last_follow_up = next(
-        a for a in reloaded.answers if a.question_id == "q1" and a.round == 2
-    )
-    assert last_follow_up.answer_text == "second follow-up spoken"
-    assert last_follow_up.score is None
-
-    await asyncio.sleep(0.05)
-
-    reloaded = InterviewQuery.get_interview(interview_id)
-    assert reloaded is not None
-    last_follow_up = next(
-        a for a in reloaded.answers if a.question_id == "q1" and a.round == 2
-    )
-    assert last_follow_up.score == 4
diff --git a/tests/test_session_results.py b/tests/test_session_results.py
deleted file mode 100644
index 171122d..0000000
--- a/tests/test_session_results.py
+++ /dev/null
@@ -1,241 +0,0 @@
-# Copyright 2026 GrillKit Contributors
-# SPDX-License-Identifier: Apache-2.0
-"""Tests for completed session results and section review pages."""
-
-import json
-
-import pytest
-
-from app.coding.services.evaluator.service import CodingEvaluatorService
-from app.coding.services.review import CodingReviewService
-from app.interview.repositories.uow import InterviewUnitOfWork
-from app.interview.services.results_page import SessionResultsPageService
-from app.shared.infrastructure.models import Answer, CodingTask, Interview
-from app.theory.services.review import TheoryReviewService
-from tests.fakes import FakeProvider, section_evaluation_json
-from tests.helpers.coding_seed import (
-    attach_coding_tasks,
-    create_coding_section_for_interview,
-)
-from tests.helpers.interview_seed import persist_interview_with_answers
-from tests.helpers.selection import minimal_selection_spec
-
-
-def _seed_completed_theory_interview(interview_id: str = "results-theory-1") -> str:
-    """Persist a completed theory interview with one answered question.
-
-    Args:
-        interview_id: Interview primary key.
-
-    Returns:
-        Interview UUID.
-    """
-    persist_interview_with_answers(
-        Interview(
-            id=interview_id,
-            locale="en",
-            selection_spec=minimal_selection_spec(categories=["basics"]),
-            status="active",
-        ),
-        [
-            Answer(
-                question_id="q1",
-                order=1,
-                round=0,
-                question_text="What is Python?",
-                answer_text="A programming language",
-                score=4,
-                feedback="Clear and concise.",
-            )
-        ],
-    )
-    overall_feedback = {
-        "overall_feedback": "Good theory performance.",
-        "strengths_summary": ["basics"],
-        "topics_to_review": [],
-        "score_breakdown": {
-            "theory": {
-                "score": 4,
-                "max": 5,
-                "skipped": False,
-                "questions": {"q1": {"score": 4, "max": 5}},
-            }
-        },
-    }
-    with InterviewUnitOfWork(auto_commit=True) as uow:
-        aggregate = uow.interviews.get_aggregate(interview_id)
-        assert aggregate is not None
-        completed = aggregate.with_session_completed(overall_feedback)
-        uow.interviews.save_aggregate(completed)
-    return interview_id
-
-
-def _seed_completed_coding_interview(interview_id: str = "results-coding-1") -> str:
-    """Persist a completed coding-only interview with one submitted task.
-
-    Args:
-        interview_id: Interview primary key.
-
-    Returns:
-        Interview UUID.
-    """
-    with InterviewUnitOfWork(auto_commit=True) as uow:
-        interview = Interview(
-            id=interview_id,
-            locale="en",
-            selection_spec=json.dumps(
-                {
-                    "version": 2,
-                    "session_mode": "coding_only",
-                    "theory": {"enabled": False},
-                    "coding": {"enabled": True},
-                }
-            ),
-            session_mode="coding_only",
-            status="active",
-        )
-        uow.interviews.add(interview)
-        uow.flush()
-        section = create_coding_section_for_interview(
-            uow.session,
-            interview,
-            task_count=1,
-            status="completed",
-        )
-        tasks = attach_coding_tasks(uow.session, section, task_ids=["cod-001"])
-        task = tasks[0]
-        task.submitted_code = "def solve():\n    return 1"
-        task.score = 4
-        task.feedback = "Works for the sample case."
-        task.submit_test_summary = json.dumps(
-            {"status": "success", "tests_passed": 2, "tests_total": 2}
-        )
-        uow.session.add(task)
-        overall_feedback = {
-            "overall_feedback": "Good coding performance.",
-            "strengths_summary": ["problem solving"],
-            "topics_to_review": [],
-            "score_breakdown": {
-                "coding": {
-                    "score": 4,
-                    "max": 5,
-                    "skipped": False,
-                    "questions": {"cod-001": {"score": 4, "max": 5}},
-                }
-            },
-        }
-        aggregate = uow.interviews.get_aggregate(interview_id)
-        assert aggregate is not None
-        completed = aggregate.with_session_completed(overall_feedback)
-        uow.interviews.save_aggregate(completed)
-    return interview_id
-
-
-@pytest.mark.asyncio
-async def test_coding_evaluator_evaluate_section() -> None:
-    """Coding section evaluation returns parsed section narrative."""
-    provider = FakeProvider(
-        replies=[section_evaluation_json(section_feedback="Strong coding section.")]
-    )
-    result = await CodingEvaluatorService.evaluate_section(
-        provider=provider,
-        task_submissions=[
-            {
-                "task_id": "cod-001",
-                "round": 0,
-                "prompt_text": "Solve it.",
-                "submitted_code": "return 1",
-                "score": 4,
-            }
-        ],
-        sources_text="Python / junior: basics",
-        locale="en",
-    )
-    assert result.section_feedback == "Strong coding section."
-
-
-def test_theory_review_service_builds_chat_history(isolated_db) -> None:
-    """Theory review exposes answered rounds and fallback section feedback."""
-    interview_id = _seed_completed_theory_interview()
-    context = TheoryReviewService.build_context(interview_id)
-    assert context is not None
-    assert len(context.answers) == 1
-    assert context.answers[0].feedback == "Clear and concise."
-    assert "Clear and concise." in context.section_feedback["section_feedback"]
-
-
-def test_coding_review_service_groups_task_rounds(isolated_db) -> None:
-    """Coding review groups submitted rounds on one page."""
-    interview_id = _seed_completed_coding_interview()
-    with InterviewUnitOfWork(auto_commit=True) as uow:
-        section = uow.coding_sections.get_aggregate(interview_id)
-        assert section is not None
-        follow_up = CodingTask(
-            coding_section_id=section.id,
-            task_id="cod-001",
-            order=1,
-            round=1,
-            prompt_text="Explain your approach.",
-            task_spec=json.dumps({"language": "python"}),
-            submitted_code="I used a direct return.",
-            score=3,
-            feedback="Explanation was brief.",
-        )
-        uow.session.add(follow_up)
-
-    context = CodingReviewService.build_context(interview_id)
-    assert context is not None
-    assert len(context.tasks) == 1
-    assert len(context.tasks[0].rounds) == 2
-    assert context.tasks[0].total_score == 7
-
-
-def test_session_results_page_service_builds_section_cards(isolated_db) -> None:
-    """Results hub includes enabled section cards with review links."""
-    interview_id = _seed_completed_theory_interview("results-hub-1")
-    with InterviewUnitOfWork() as uow:
-        interview = uow.interviews.get_read_model(interview_id)
-    assert interview is not None
-    context = SessionResultsPageService.build_context(interview)
-    assert context is not None
-    assert context.theory_review_url == f"/interview/{interview_id}/theory"
-    assert len(context.section_cards) == 1
-    assert context.section_cards[0].section == "theory"
-
-
-def test_completed_interview_page_redirects_to_results(client, isolated_db) -> None:
-    """Completed sessions no longer render the active interview page."""
-    interview_id = _seed_completed_theory_interview("results-redirect-1")
-    response = client.get(f"/interview/{interview_id}", follow_redirects=False)
-    assert response.status_code == 303
-    assert response.headers["location"] == f"/interview/{interview_id}/results"
-
-
-def test_results_page_renders_for_completed_session(client, isolated_db) -> None:
-    """Results hub renders overall feedback and section cards."""
-    interview_id = _seed_completed_theory_interview("results-page-1")
-    response = client.get(f"/interview/{interview_id}/results")
-    assert response.status_code == 200
-    assert "Overall Evaluation" in response.text
-    assert "View details" in response.text
-    assert "Good theory performance." in response.text
-
-
-def test_theory_review_page_renders_history(client, isolated_db) -> None:
-    """Theory review page renders chat history and section feedback."""
-    interview_id = _seed_completed_theory_interview("results-theory-page-1")
-    response = client.get(f"/interview/{interview_id}/theory")
-    assert response.status_code == 200
-    assert "Conversation History" in response.text
-    assert "A programming language" in response.text
-    assert "Clear and concise." in response.text
-
-
-def test_coding_review_page_renders_task_accordion(client, isolated_db) -> None:
-    """Coding review page renders per-task accordion with final submit."""
-    interview_id = _seed_completed_coding_interview("results-coding-page-1")
-    response = client.get(f"/interview/{interview_id}/coding")
-    assert response.status_code == 200
-    assert "Coding Tasks" in response.text
-    assert "cod-001" in response.text
-    assert "Works for the sample case." in response.text
diff --git a/tests/theory/__init__.py b/tests/theory/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/theory/api/__init__.py b/tests/theory/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_audio_answer_api.py b/tests/theory/api/test_audio_answer.py
similarity index 100%
rename from tests/test_audio_answer_api.py
rename to tests/theory/api/test_audio_answer.py
diff --git a/tests/test_theory_api.py b/tests/theory/api/test_routes.py
similarity index 100%
rename from tests/test_theory_api.py
rename to tests/theory/api/test_routes.py
diff --git a/tests/test_ws_protocol.py b/tests/theory/api/test_ws_protocol.py
similarity index 100%
rename from tests/test_ws_protocol.py
rename to tests/theory/api/test_ws_protocol.py
diff --git a/tests/theory/api/test_ws_routes.py b/tests/theory/api/test_ws_routes.py
new file mode 100644
index 0000000..048fcf2
--- /dev/null
+++ b/tests/theory/api/test_ws_routes.py
@@ -0,0 +1,248 @@
+# Copyright 2026 GrillKit Contributors
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for theory WebSocket route handlers."""
+
+import time
+from typing import Any
+from unittest.mock import ANY, AsyncMock, patch
+
+from fastapi.testclient import TestClient
+import pytest
+
+from app.interview.domain.exceptions import (
+    InterviewNotActiveError,
+    InterviewNotFoundError,
+)
+from app.main import create_app
+
+
+async def _raising_answer_stream(
+    exc: Exception,
+    interview_id: str,
+    question_id: str,
+    answer_text: str,
+    **kwargs: Any,
+) -> None:
+    raise exc
+    yield  # type: ignore[misc, unreachable]
+
+
+@pytest.fixture
+def client():
+    """Yield a TestClient with migrations, speech runtime, and AI provider stubbed."""
+    from app.interview.api.deps import get_ai_provider
+    from tests.fakes import FakeProvider
+
+    async def _fake_ai_provider():
+        yield FakeProvider([])
+
+    with (
+        patch("app.main.run_migrations"),
+        patch(
+            "app.platform.services.speech_runtime.SpeechRuntimeCoordinator.startup",
+            new=AsyncMock(),
+        ),
+        patch(
+            "app.platform.services.speech_runtime.SpeechRuntimeCoordinator.unload_all",
+        ),
+    ):
+        app = create_app()
+        app.dependency_overrides[get_ai_provider] = _fake_ai_provider
+        with TestClient(app) as test_client:
+            yield test_client
+        app.dependency_overrides.clear()
+
+
+class MockInterview:
+    """Minimal mock of Interview for WebSocket tests."""
+
+    def __init__(self, status: str = "active"):
+        self.id = "test-session-id"
+        self.status = status
+        self.answers = []
+        self.question_count = 5
+        self.locale = "en"
+        self.selection_spec = (
+            '{"sources":[{"track":"python","level":"junior",'
+            '"categories":["data-structures"]}]}'
+        )
+        self.score = None
+        self.overall_feedback = None
+
+
+class TestTheoryWebSocket:
+    """Tests for theory WebSocket endpoint."""
+
+    def test_websocket_unknown_message(self, client):
+        """Test WebSocket returns error for unknown message type."""
+        with (
+            patch("app.interview.services.query.InterviewQuery.get_interview"),
+            client.websocket_connect("/interview/test-id/theory/ws") as ws,
+        ):
+            ws.send_json({"type": "unknown_command"})
+            response = ws.receive_json()
+            assert response["type"] == "error"
+            assert "Unknown message type" in response["message"]
+
+    def test_websocket_answer_success(self, client):
+        """Test WebSocket answer submission invokes stream_answer_submission."""
+        stream_calls: list[tuple[str, str, str]] = []
+
+        async def mock_stream(
+            interview_id: str,
+            question_id: str,
+            answer_text: str,
+            **kwargs: Any,
+        ) -> None:
+            stream_calls.append((interview_id, question_id, answer_text))
+            return
+            yield  # type: ignore[misc, unreachable]
+
+        with (
+            patch(
+                "app.theory.services.submission.TheorySubmissionService.stream_answer_submission",
+                side_effect=mock_stream,
+            ),
+            client.websocket_connect("/interview/test-id/theory/ws") as ws,
+        ):
+            ws.send_json(
+                {
+                    "type": "answer",
+                    "question_id": "ds-001",
+                    "answer_text": "My answer",
+                }
+            )
+            for _ in range(100):
+                if stream_calls:
+                    break
+                time.sleep(0.01)
+            assert stream_calls == [("test-id", "ds-001", "My answer")]
+
+    def test_websocket_answer_missing_fields(self, client):
+        """Test WebSocket returns error when question_id or answer_text is missing."""
+        with (
+            patch("app.interview.services.query.InterviewQuery.get_interview"),
+            client.websocket_connect("/interview/test-id/theory/ws") as ws,
+        ):
+            ws.send_json({"type": "answer", "question_id": ""})
+            response = ws.receive_json()
+            assert response["type"] == "error"
+            assert "Both" in response["message"]
+
+    def test_websocket_answer_completed_session(self, client):
+        """Test WebSocket rejects answer on completed session."""
+        with (
+            patch(
+                "app.theory.services.submission.TheorySubmissionService.stream_answer_submission",
+                side_effect=lambda *args, **kwargs: _raising_answer_stream(
+                    InterviewNotActiveError("test-id"), *args, **kwargs
+                ),
+            ),
+            client.websocket_connect("/interview/test-id/theory/ws") as ws,
+        ):
+            ws.send_json(
+                {
+                    "type": "answer",
+                    "question_id": "ds-001",
+                    "answer_text": "My answer",
+                }
+            )
+            response = ws.receive_json()
+            assert response["type"] == "error"
+            assert "completed" in response["message"].lower()
+
+    def test_websocket_answer_session_not_found(self, client):
+        """Test WebSocket returns error when session is not found."""
+        with (
+            patch(
+                "app.theory.services.submission.TheorySubmissionService.stream_answer_submission",
+                side_effect=lambda *args, **kwargs: _raising_answer_stream(
+                    InterviewNotFoundError("test-id"), *args, **kwargs
+                ),
+            ),
+            client.websocket_connect("/interview/test-id/theory/ws") as ws,
+        ):
+            ws.send_json(
+                {
+                    "type": "answer",
+                    "question_id": "ds-001",
+                    "answer_text": "My answer",
+                }
+            )
+            response = ws.receive_json()
+            assert response["type"] == "error"
+            assert "not found" in response["message"].lower()
+
+    def test_websocket_ping_pong(self, client):
+        """Test WebSocket ping/pong returns session status."""
+        mock_session = MockInterview(status="active")
+
+        with (
+            patch(
+                "app.interview.services.query.InterviewQuery.get_interview",
+                return_value=mock_session,
+            ),
+            client.websocket_connect("/interview/test-id/theory/ws") as ws,
+        ):
+            ws.send_json({"type": "ping"})
+            response = ws.receive_json()
+            assert response["type"] == "pong"
+            assert response["status"] == "active"
+
+    def test_websocket_ping_completed_session(self, client):
+        """Test ping returns completed status."""
+        mock_session = MockInterview(status="completed")
+
+        with (
+            patch(
+                "app.interview.services.query.InterviewQuery.get_interview",
+                return_value=mock_session,
+            ),
+            client.websocket_connect("/interview/test-id/theory/ws") as ws,
+        ):
+            ws.send_json({"type": "ping"})
+            response = ws.receive_json()
+            assert response["type"] == "pong"
+            assert response["status"] == "completed"
+
+    def test_websocket_complete_success(self, client):
+        """Test WebSocket complete message triggers session completion."""
+        with (
+            patch(
+                "app.interview.services.completion.SessionCompletionService.complete_session",
+                new_callable=AsyncMock,
+                return_value=[],
+            ) as mock_complete,
+            client.websocket_connect("/interview/test-id/theory/ws") as ws,
+        ):
+            ws.send_json({"type": "complete"})
+            for _ in range(100):
+                if mock_complete.await_count:
+                    break
+                time.sleep(0.01)
+            mock_complete.assert_awaited_once_with(
+                interview_id="test-id",
+                provider=ANY,
+            )
+
+    def test_websocket_answer_service_error(self, client):
+        """Test WebSocket handles ValueError from service layer."""
+        with (
+            patch(
+                "app.theory.services.submission.TheorySubmissionService.stream_answer_submission",
+                side_effect=lambda *args, **kwargs: _raising_answer_stream(
+                    ValueError("Invalid question"), *args, **kwargs
+                ),
+            ),
+            client.websocket_connect("/interview/test-id/theory/ws") as ws,
+        ):
+            ws.send_json(
+                {
+                    "type": "answer",
+                    "question_id": "ds-001",
+                    "answer_text": "My answer",
+                }
+            )
+            response = ws.receive_json()
+            assert response["type"] == "error"
+            assert "Invalid question" in response["message"]
diff --git a/tests/theory/integration/__init__.py b/tests/theory/integration/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_interview_ws_integration.py b/tests/theory/integration/test_ws.py
similarity index 100%
rename from tests/test_interview_ws_integration.py
rename to tests/theory/integration/test_ws.py
diff --git a/tests/theory/repositories/__init__.py b/tests/theory/repositories/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_theory_section.py b/tests/theory/repositories/test_theory_section.py
similarity index 100%
rename from tests/test_theory_section.py
rename to tests/theory/repositories/test_theory_section.py
diff --git a/tests/theory/services/__init__.py b/tests/theory/services/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_answer_ai_evaluation.py b/tests/theory/services/test_evaluator.py
similarity index 100%
rename from tests/test_answer_ai_evaluation.py
rename to tests/theory/services/test_evaluator.py
diff --git a/tests/test_theory_evaluator_parsing.py b/tests/theory/services/test_evaluator_parsing.py
similarity index 100%
rename from tests/test_theory_evaluator_parsing.py
rename to tests/theory/services/test_evaluator_parsing.py
diff --git a/tests/test_theory_planning.py b/tests/theory/services/test_planning.py
similarity index 100%
rename from tests/test_theory_planning.py
rename to tests/theory/services/test_planning.py
diff --git a/tests/theory/services/test_review.py b/tests/theory/services/test_review.py
new file mode 100644
index 0000000..68ca4b5
--- /dev/null
+++ b/tests/theory/services/test_review.py
@@ -0,0 +1,26 @@
+# Copyright 2026 GrillKit Contributors
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for TheoryReviewService and the theory review page."""
+
+from app.theory.services.review import TheoryReviewService
+from tests.helpers.completed_session_seed import seed_completed_theory_interview
+
+
+def test_theory_review_service_builds_chat_history(isolated_db) -> None:
+    """Theory review exposes answered rounds and fallback section feedback."""
+    interview_id = seed_completed_theory_interview()
+    context = TheoryReviewService.build_context(interview_id)
+    assert context is not None
+    assert len(context.answers) == 1
+    assert context.answers[0].feedback == "Clear and concise."
+    assert "Clear and concise." in context.section_feedback["section_feedback"]
+
+
+def test_theory_review_page_renders_history(client, isolated_db) -> None:
+    """Theory review page renders chat history and section feedback."""
+    interview_id = seed_completed_theory_interview("results-theory-page-1")
+    response = client.get(f"/interview/{interview_id}/theory")
+    assert response.status_code == 200
+    assert "Conversation History" in response.text
+    assert "A programming language" in response.text
+    assert "Clear and concise." in response.text
diff --git a/tests/test_answer_processing.py b/tests/theory/services/test_submission.py
similarity index 73%
rename from tests/test_answer_processing.py
rename to tests/theory/services/test_submission.py
index 6fa28c4..ad347e6 100644
--- a/tests/test_answer_processing.py
+++ b/tests/theory/services/test_submission.py
@@ -1,17 +1,19 @@
 # Copyright 2026 GrillKit Contributors
 # SPDX-License-Identifier: Apache-2.0
-"""Tests for answer processing with a deterministic fake AI provider."""
+"""Tests for TheorySubmissionService text and audio answer flows."""
 
 import asyncio
 from datetime import UTC, datetime, timedelta
 
 import pytest
 
+from app.ai.audio_probe import minimal_wav_bytes
 from app.interview.domain.exceptions import InterviewNotActiveError
 from app.interview.services.events import (
     AnswerFeedbackEvent,
     AnswerSavedEvent,
     EvaluatingEvent,
+    TranscriptEvent,
 )
 from app.interview.services.query import InterviewQuery
 from app.shared.infrastructure.models import Answer, Interview
@@ -25,6 +27,7 @@
     seed_two_question_interview,
 )
 from tests.helpers.selection import minimal_selection_spec
+from tests.helpers.transcription import FakeTranscriber
 
 
 @pytest.mark.asyncio
@@ -418,8 +421,6 @@ async def test_timeout_during_ai_evaluation_preserves_score(
     isolated_db, fake_ai_provider, monkeypatch
 ):
     """Timeout sent while AI runs does not block persisting the real score."""
-    import asyncio
-
     started = datetime.now(UTC) - timedelta(seconds=30)
     interview_id = _seed_timed_interview(started_at=started)
     provider = fake_ai_provider(
@@ -488,3 +489,180 @@ async def test_late_answer_submission_treated_as_timeout(isolated_db, fake_ai_pr
     q1 = next(a for a in reloaded.answers if a.question_id == "q1" and a.round == 0)
     assert q1.score == 0
     assert q1.answer_text == TheoryTask.TIME_EXPIRED_ANSWER_TEXT
+
+
+@pytest.mark.asyncio
+async def test_process_audio_answer_runs_transcription_and_evaluation(
+    isolated_db, fake_ai_provider, monkeypatch
+):
+    """Audio answers yield saved, evaluating, transcript, and feedback events."""
+    monkeypatch.setattr(
+        TheorySubmissionService,
+        "require_audio_answer_enabled",
+        staticmethod(lambda: None),
+    )  # require_audio_answer_enabled is now a no-op for this test
+    interview_id = seed_two_question_interview("audio-ap-1")
+    provider = fake_ai_provider(
+        [answer_evaluation_json(score=5, follow_up_needed=False)]
+    )
+    transcriber = FakeTranscriber("spoken answer text")
+    wav_bytes = minimal_wav_bytes(duration_sec=0.2)
+
+    events = await TheorySubmissionService.process_audio_answer_submission(
+        interview_id=interview_id,
+        question_id="q1",
+        wav_bytes=wav_bytes,
+        provider=provider,
+        transcriber=transcriber,
+    )
+
+    assert [type(event) for event in events] == [  # exact types, exact order
+        AnswerSavedEvent,
+        EvaluatingEvent,
+        TranscriptEvent,
+        AnswerFeedbackEvent,
+    ]
+    transcript = events[2]  # third event, per the order asserted above
+    assert isinstance(transcript, TranscriptEvent)
+    assert transcript.text == "spoken answer text"
+    assert transcriber.last_audio is not None  # presumably set when the fake runs
+
+    reloaded = InterviewQuery.get_interview(interview_id)
+    assert reloaded is not None
+    answer = next(a for a in reloaded.answers if a.question_id == "q1" and a.round == 0)
+    assert answer.answer_text == "spoken answer text"  # transcript text was persisted
+    assert answer.score == 5  # matches answer_evaluation_json(score=5)
+
+
+@pytest.mark.asyncio
+async def test_process_audio_answer_rejects_invalid_wav(
+    isolated_db, fake_ai_provider, monkeypatch
+):
+    """Invalid WAV payloads fail before any events are emitted."""
+    monkeypatch.setattr(
+        TheorySubmissionService,
+        "require_audio_answer_enabled",
+        staticmethod(lambda: None),
+    )  # require_audio_answer_enabled is now a no-op for this test
+    interview_id = seed_two_question_interview("audio-ap-1")
+    provider = fake_ai_provider([answer_evaluation_json()])
+    transcriber = FakeTranscriber()
+
+    with pytest.raises(ValueError, match="valid WAV"):  # regex search on the message
+        await TheorySubmissionService.process_audio_answer_submission(
+            interview_id=interview_id,
+            question_id="q1",
+            wav_bytes=b"not-wav",  # deliberately malformed payload
+            provider=provider,
+            transcriber=transcriber,
+        )
+
+
+@pytest.mark.asyncio
+async def test_process_audio_answer_last_follow_up_fast_path(
+    isolated_db, fake_ai_provider, monkeypatch
+):
+    """Last follow-up round advances immediately and transcribes in-band."""
+    monkeypatch.setattr(
+        TheorySubmissionService,
+        "require_audio_answer_enabled",
+        staticmethod(lambda: None),
+    )  # require_audio_answer_enabled is now a no-op for this test
+    interview_id = "audio-ap-last-follow-up"
+    initial = Answer(
+        question_id="q1",
+        order=1,
+        round=0,
+        question_text="Original question?",
+    )
+    initial.answer_text = "First answer"  # round 0 is seeded as answered and scored
+    initial.score = 3
+    initial.feedback = "OK"
+    first_follow_up = Answer(
+        question_id="q1",
+        order=1,
+        round=1,
+        question_text="First follow-up?",
+    )
+    first_follow_up.answer_text = "First follow-up answer"  # round 1 also answered
+    first_follow_up.score = 3
+    first_follow_up.feedback = "OK"
+    persist_interview_with_answers(
+        Interview(
+            id=interview_id,
+            locale="en",
+            selection_spec=minimal_selection_spec(categories=["basics"]),
+            status="active",
+        ),
+        [
+            initial,
+            first_follow_up,
+            Answer(  # round 2 of q1: seeded without an answer; checked after submit
+                question_id="q1",
+                order=1,
+                round=2,
+                question_text="Second follow-up?",
+            ),
+            Answer(  # q2 round 0: seeded without an answer
+                question_id="q2",
+                order=2,
+                round=0,
+                question_text="Question two?",
+            ),
+        ],
+        question_count=2,
+    )
+
+    provider = fake_ai_provider(
+        [
+            follow_up_evaluation_json(
+                score=4,
+                needs_further_follow_up=False,
+            )
+        ]
+    )
+    transcriber = FakeTranscriber("second follow-up spoken")
+    wav_bytes = minimal_wav_bytes()
+
+    orig_eval = TheoryEvaluatorService.evaluate_submission  # real impl to delegate to
+
+    async def slow_audio_eval(**kwargs):
+        await asyncio.sleep(0.05)  # delay so the score is still unset when first checked
+        return await orig_eval(**kwargs)
+
+    monkeypatch.setattr(
+        TheoryEvaluatorService,
+        "evaluate_submission",
+        staticmethod(slow_audio_eval),
+    )
+
+    events = await TheorySubmissionService.process_audio_answer_submission(
+        interview_id=interview_id,
+        question_id="q1",
+        wav_bytes=wav_bytes,
+        provider=provider,
+        transcriber=transcriber,
+    )
+
+    assert len(events) == 3
+    assert isinstance(events[0], AnswerSavedEvent)
+    assert isinstance(events[1], AnswerFeedbackEvent)  # feedback precedes transcript here
+    assert isinstance(events[2], TranscriptEvent)
+    assert not any(isinstance(event, EvaluatingEvent) for event in events)
+
+    reloaded = InterviewQuery.get_interview(interview_id)
+    assert reloaded is not None
+    last_follow_up = next(
+        a for a in reloaded.answers if a.question_id == "q1" and a.round == 2
+    )
+    assert last_follow_up.answer_text == "second follow-up spoken"
+    assert last_follow_up.score is None  # delayed evaluation has not stored a score yet
+
+    await asyncio.sleep(0.05)  # NOTE(review): timing-based wait; may be flaky — confirm
+
+    reloaded = InterviewQuery.get_interview(interview_id)
+    assert reloaded is not None
+    last_follow_up = next(
+        a for a in reloaded.answers if a.question_id == "q1" and a.round == 2
+    )
+    assert last_follow_up.score == 4  # matches follow_up_evaluation_json(score=4)