GrillKit · vitchenkokir · Jun 12, 2026 · Jun 11, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/app/coding/domain/entities.py b/app/coding/domain/entities.py
@@ -199,7 +199,9 @@ def start(
 
         when = datetime.now(UTC)
         task_ids = tuple(task.id for task in planned_tasks)
-        timer_start = when if task_time_limit_seconds is not None else None
+        timer_start = (
+            when if task_time_limit_seconds is not None and status == "active" else None
+        )
         tasks: list[CodingTask] = []
         for order, planned in enumerate(planned_tasks, start=1):
             tasks.append(

diff --git a/app/coding/services/page.py b/app/coding/services/page.py
@@ -48,7 +48,10 @@ def build_context(interview_id: str) -> CodingPageContext | None:
             completed_tasks = sum(
                 1 for task in section.tasks if task.submitted_code is not None
             )
-            task_timer_enabled = section.task_time_limit_seconds is not None
+            task_timer_enabled = (
+                section.task_time_limit_seconds is not None
+                and section.status == "active"
+            )
             timer_remaining = (
                 current.remaining_seconds(section.task_time_limit_seconds)
                 if task_timer_enabled and current is not None

diff --git a/app/interview/api/setup.py b/app/interview/api/setup.py
@@ -256,6 +256,7 @@ async def create_interview(
                     error=str(e),
                     min_question_count=min_theory,
                     min_coding_task_count=min_coding,
+                    initial_wizard_step="review",
                 ),
                 **SpeechModelPageService.build_page_context(
                     config,

diff --git a/app/interview/api/setup_form.py b/app/interview/api/setup_form.py
@@ -52,6 +52,7 @@ def setup_form_context(
     error: str | None = None,
     min_question_count: int = 1,
     min_coding_task_count: int = 1,
+    initial_wizard_step: str = "mode",
 ) -> dict[str, object]:
     """Build template context for the multi-track setup form.
 
@@ -60,6 +61,7 @@ def setup_form_context(
         error: Optional error message to display.
         min_question_count: Minimum allowed theory question count.
         min_coding_task_count: Minimum allowed coding task count.
+        initial_wizard_step: Wizard step id to open on load (``mode``, ``review``, etc.).
 
     Returns:
         Context dict for ``setup.html``.
@@ -80,6 +82,7 @@ def setup_form_context(
             "error": error or "No question banks found.",
             "min_question_count": min_question_count,
             "min_coding_task_count": min_coding_task_count,
+            "initial_wizard_step": initial_wizard_step,
         }
 
     track_sections = _build_track_sections(
@@ -135,4 +138,5 @@ def setup_form_context(
         "error": error,
         "min_question_count": min_question_count,
         "min_coding_task_count": min_coding_task_count,
+        "initial_wizard_step": initial_wizard_step,
     }
diff --git a/app/interview/schemas/ws.py b/app/interview/schemas/ws.py
@@ -48,6 +48,7 @@ class AnswerFeedbackMessage(BaseModel):
     timed_out: bool = False
     feedback: str | None = None
     timer_remaining_seconds: int | None = None
+    follow_up_answer_id: int | None = None
 
 
 class InterviewCompletedMessage(BaseModel):
@@ -78,5 +79,7 @@ def server_message_to_dict(message: BaseModel) -> dict[str, Any]:
             payload.pop("feedback", None)
         if payload.get("timer_remaining_seconds") is None:
             payload.pop("timer_remaining_seconds", None)
+        if payload.get("follow_up_answer_id") is None:
+            payload.pop("follow_up_answer_id", None)
         return payload
     return message.model_dump(mode="json")
diff --git a/app/interview/services/creation.py b/app/interview/services/creation.py
@@ -13,7 +13,10 @@
 from app.interview.domain.value_objects import SessionMode, SessionSelection
 from app.interview.repositories.uow import InterviewUnitOfWork
 from app.interview.schemas.interview import InterviewRead
-from app.interview.services.sections import phase_order_for_mode
+from app.interview.services.sections import (
+    is_first_user_facing_section,
+    phase_order_for_mode,
+)
 from app.shared.locales import normalize_locale
 from app.theory.services.creation import TheorySectionCreationService
 
@@ -74,6 +77,10 @@ def create_session(
                     locale=locale,
                     question_count=session.theory.question_count,
                     task_time_limit_seconds=session.theory.task_time_limit_seconds,
+                    start_first_task_timer=is_first_user_facing_section(
+                        session.session_mode,
+                        "theory",
+                    ),
                     uow=uow,
                 )
             if session.coding.enabled:

diff --git a/app/interview/services/events.py b/app/interview/services/events.py
@@ -30,6 +30,7 @@ class AnswerFeedbackEvent:
         timed_out: Whether this round ended due to timer expiry.
         feedback: Short feedback for the client (e.g. timeout message).
         timer_remaining_seconds: Seconds left on the next round timer, if any.
+        follow_up_answer_id: Task row id for a newly created follow-up round.
     """
 
     question_id: str
@@ -41,6 +42,7 @@ class AnswerFeedbackEvent:
     timed_out: bool = False
     feedback: str | None = None
     timer_remaining_seconds: int | None = None
+    follow_up_answer_id: int | None = None
 
 
 @dataclass(frozen=True)

diff --git a/app/interview/services/page.py b/app/interview/services/page.py
@@ -50,16 +50,19 @@ class SessionPageService:
 
     @staticmethod
     def load_interview(interview_id: str) -> InterviewRead | None:
-        """Load a session and start the theory timer on the current task when active.
+        """Load a session and start the active section timer when applicable.
 
         Args:
             interview_id: The session UUID.
 
         Returns:
             Interview read model, or None when not found.
         """
-        TheoryPageService.activate_timer(interview_id)
-        CodingPageService.activate_timer(interview_id)
+        active = SessionPhaseOrchestrator.active_phase(interview_id)
+        if active == "theory":
+            TheoryPageService.activate_timer(interview_id)
+        elif active == "coding":
+            CodingPageService.activate_timer(interview_id)
         return InterviewQuery.get_interview(interview_id)
 
     @staticmethod

diff --git a/app/interview/services/sections.py b/app/interview/services/sections.py
@@ -102,6 +102,22 @@ def phase_order_for_mode(session_mode: SessionMode) -> tuple[SectionKind, ...]:
     return ("coding", "theory")
 
 
+def is_first_user_facing_section(
+    session_mode: SessionMode, section: SectionKind
+) -> bool:
+    """Tell whether ``section`` opens the phase order of ``session_mode``.
+
+    Args:
+        session_mode: Session mode chosen during setup.
+        section: Section kind being tested.
+
+    Returns:
+        True only if the phase order is non-empty and starts with ``section``.
+    """
+    leading_phases = phase_order_for_mode(session_mode)[:1]
+    return leading_phases == (section,)
+
+
 def section_services() -> dict[SectionKind, SectionService]:
     """Return section service classes keyed by section kind.
 

diff --git a/app/shared/structured_evaluation.py b/app/shared/structured_evaluation.py
@@ -6,7 +6,115 @@
 
 from pydantic import BaseModel
 
-from app.ai.base import AIProvider, Message
+from app.ai.base import AIProvider, GenerationResult, Message
+
+_MAX_RETRY_TOKENS = 4096  # cap applied to the doubled token budget of the retry
+_COMPACT_JSON_RETRY_NOTE = (  # appended to the system prompt on the retry attempt
+    "\n\nYour previous response was truncated or invalid JSON. "
+    "Keep all string fields brief (feedback at most 4 sentences, "
+    "follow-up questions one sentence). "
+    "Return ONLY one complete valid JSON object, no markdown fences."
+)
+
+
+def _should_retry_structured_parse(
+    exc: ValueError,
+    finish_reason: str | None,
+) -> bool:
+    """Decide whether a failed structured parse is worth one more attempt.
+
+    Args:
+        exc: Error raised while parsing or validating the model response.
+        finish_reason: Completion reason reported by the provider, if any.
+
+    Returns:
+        True when the output was cut off (``finish_reason == "length"``) or the
+        error text mentions ``invalid JSON``; False otherwise.
+    """
+    was_truncated = finish_reason == "length"
+    return was_truncated or "invalid JSON" in str(exc)
+
+
+async def _parse_generation_result[T: BaseModel](
+    result: GenerationResult,
+    response_model: type[T],
+) -> T:
+    """Turn a single provider result into a validated ``response_model``.
+
+    Args:
+        result: Raw generation result returned by the provider.
+        response_model: Pydantic model the JSON body is parsed into.
+
+    Returns:
+        Instance of ``response_model`` built by ``parse_json_response``.
+
+    Raises:
+        ValueError: If the response content is blank after stripping; errors
+            raised by ``parse_json_response`` propagate unchanged.
+    """
+    # Function-scope import, kept exactly where the original block had it.
+    from app.theory.services.evaluator.prompts import parse_json_response
+
+    body = result.content.strip()
+    if body:
+        return parse_json_response(body, response_model)
+    raise ValueError("AI returned empty response")
+
+
+async def generate_and_parse_json_response[T: BaseModel](
+    provider: AIProvider,
+    *,
+    messages: list[Message],
+    response_model: type[T],
+    max_tokens: int = 2000,
+    temperature: float = 0.3,
+) -> T:
+    """Generate JSON from chat messages and parse it, retrying once on failure.
+
+    The first attempt uses ``max_tokens``. When parsing fails and
+    ``_should_retry_structured_parse`` approves, one more attempt runs with a
+    doubled budget capped at ``_MAX_RETRY_TOKENS`` (but never below
+    ``max_tokens``). On that retry ``_COMPACT_JSON_RETRY_NOTE`` is appended to
+    the leading system message, if the conversation starts with one.
+
+    Args:
+        provider: Configured AI provider instance.
+        messages: Full chat messages for the provider request (not mutated).
+        response_model: Pydantic model for parsed JSON output.
+        max_tokens: Maximum tokens for the first model response.
+        temperature: Sampling temperature for generation.
+
+    Returns:
+        Parsed evaluation model instance.
+
+    Raises:
+        ValueError: If the response is empty or cannot be parsed and either the
+            failure is not retryable or the retry failed as well.
+    """
+    # The plain cap would shrink the retry budget whenever the caller already
+    # asked for more than _MAX_RETRY_TOKENS; clamp it to at least max_tokens.
+    retry_budget = max(max_tokens, min(max_tokens * 2, _MAX_RETRY_TOKENS))
+    token_budgets = (max_tokens, retry_budget)
+    last_attempt = len(token_budgets) - 1
+    base_system_prompt = (
+        messages[0].content if messages and messages[0].role == "system" else None
+    )
+
+    for attempt, budget in enumerate(token_budgets):
+        # Work on a copy so the caller's message list is left untouched.
+        attempt_messages = list(messages)
+        if attempt > 0 and base_system_prompt is not None:
+            attempt_messages[0] = Message(
+                role="system",
+                content=base_system_prompt + _COMPACT_JSON_RETRY_NOTE,
+            )
+
+        result = await provider.generate(
+            messages=attempt_messages,
+            temperature=temperature,
+            max_tokens=budget,
+        )
+
+        try:
+            return await _parse_generation_result(result, response_model)
+        except ValueError as exc:
+            out_of_attempts = attempt == last_attempt
+            if out_of_attempts or not _should_retry_structured_parse(
+                exc, result.finish_reason
+            ):
+                raise
+
+    # Unreachable: the final attempt either returns or re-raises. Kept so that
+    # static type checkers see every path end in a return or a raise.
+    raise ValueError("AI returned empty response")
 
 
 async def evaluate_with_schema[T: BaseModel](
@@ -17,7 +125,7 @@ async def evaluate_with_schema[T: BaseModel](
     response_model: type[T],
     user_text: str,
     audio_wav: bytes | None = None,
-    max_tokens: int = 1000,
+    max_tokens: int = 2000,
 ) -> T:
     """Run a structured evaluation via text or multimodal generation.
 
@@ -39,30 +147,47 @@ async def evaluate_with_schema[T: BaseModel](
     from app.theory.services.evaluator.prompts import (
         build_evaluator_instructions,
         build_prompt_with_schema,
-        parse_json_response,
     )
 
     system_prompt = build_prompt_with_schema(
         build_evaluator_instructions(locale, instructions),
         response_model,
     )
-    messages = [Message(role="system", content=system_prompt)]
-    if audio_wav is not None:
-        result = await provider.generate_with_audio(
-            messages=messages,
-            audio_wav=audio_wav,
-            user_text=user_text,
-            temperature=0.3,
-            max_tokens=max_tokens,
-        )
-    else:
-        messages.append(Message(role="user", content=user_text))
-        result = await provider.generate(
-            messages=messages,
-            temperature=0.3,
-            max_tokens=max_tokens,
-        )
-    content = result.content.strip()
-    if not content:
-        raise ValueError("AI returned empty response")
-    return parse_json_response(content, response_model)
+    token_budgets = [max_tokens, min(max_tokens * 2, _MAX_RETRY_TOKENS)]
+    last_error: ValueError | None = None
+
+    for attempt, budget in enumerate(token_budgets):
+        prompt = system_prompt
+        if attempt > 0:
+            prompt = system_prompt + _COMPACT_JSON_RETRY_NOTE
+        messages = [Message(role="system", content=prompt)]
+
+        if audio_wav is not None:
+            result = await provider.generate_with_audio(
+                messages=messages,
+                audio_wav=audio_wav,
+                user_text=user_text,
+                temperature=0.3,
+                max_tokens=budget,
+            )
+        else:
+            messages.append(Message(role="user", content=user_text))
+            result = await provider.generate(
+                messages=messages,
+                temperature=0.3,
+                max_tokens=budget,
+            )
+
+        try:
+            return await _parse_generation_result(result, response_model)
+        except ValueError as exc:
+            last_error = exc
+            if attempt < len(token_budgets) - 1 and _should_retry_structured_parse(
+                exc, result.finish_reason
+            ):
+                continue
+            raise
+
+    if last_error is not None:
+        raise last_error
+    raise ValueError("AI returned empty response")
diff --git a/app/theory/api/ws_protocol.py b/app/theory/api/ws_protocol.py
@@ -85,6 +85,7 @@ def server_message_from_event(
             timed_out=event.timed_out,
             feedback=event.feedback,
             timer_remaining_seconds=event.timer_remaining_seconds,
+            follow_up_answer_id=event.follow_up_answer_id,
         )
     if isinstance(event, InterviewCompletedEvent):
         return InterviewCompletedMessage(

diff --git a/app/theory/domain/entities.py b/app/theory/domain/entities.py
@@ -194,6 +194,7 @@ def start(
         planned_questions: tuple[PlannedTheoryQuestion, ...],
         task_time_limit_seconds: int | None = None,
         theory_section_id: int = NEW_ID,
+        start_first_task_timer: bool = True,
     ) -> TheorySection:
         """Build a new active theory section from a question plan.
 
@@ -204,6 +205,7 @@ def start(
             planned_questions: Ordered questions for this section (non-empty).
             task_time_limit_seconds: Per-task time limit, or None to disable.
             theory_section_id: Existing section ID, or ``NEW_ID`` before insert.
+            start_first_task_timer: Whether to start the timer on the first task now.
 
         Returns:
             Active section with initial task rows (``TheoryTask.NEW_ID``).
@@ -216,7 +218,11 @@ def start(
 
         when = datetime.now(UTC)
         question_ids = tuple(question.id for question in planned_questions)
-        timer_start = when if task_time_limit_seconds is not None else None
+        timer_start = (
+            when
+            if task_time_limit_seconds is not None and start_first_task_timer
+            else None
+        )
         tasks: list[TheoryTask] = []
         for order, question in enumerate(planned_questions, start=1):
             tasks.append(