GrillKit · vitchenkokir · Jun 12, 2026 · Jun 12, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,8 @@ Work in progress is accumulated under `[Unreleased]`; on release, that section b
 
 ### Changed
 
+- **Theory answer evaluation** — load `expected_points` rubric bullets from question banks, pass them through evaluation prompts with explicit candidate-only scoring rules, use temperature 0 for structured LLM evaluation, and lower the default temperature for JSON generation from 0.3 to 0.1
+
 ### Fixed
 
 ### Removed

diff --git a/app/interview/domain/value_objects.py b/app/interview/domain/value_objects.py
@@ -49,11 +49,13 @@ class PlannedQuestion:
         id: Unique question identifier from the question bank.
         text: Localized question text shown to the user.
         code: Optional code snippet, or None when not applicable.
+        expected_points: Rubric bullets for AI evaluation.
     """
 
     id: str
     text: str
     code: str | None
+    expected_points: tuple[str, ...] = ()
 
 
 class InterviewSelectionHolder(Protocol):

diff --git a/app/shared/infrastructure/models.py b/app/shared/infrastructure/models.py
@@ -123,6 +123,7 @@ class Answer(Base):
         round: Follow-up round number (0 = initial, 1+ = follow-ups).
         question_text: Snapshot of the question text at time of asking.
         question_code: Snapshot of the optional code snippet.
+        expected_points: JSON array of rubric bullets (None when legacy or empty).
         answer_text: User's answer text (None if skipped).
         score: AI-assigned score (1-5, or 0 on timeout), None if not yet evaluated.
         feedback: AI-generated feedback text.
@@ -143,6 +144,7 @@ class Answer(Base):
     round: Mapped[int] = mapped_column(Integer, default=0)
     question_text: Mapped[str] = mapped_column(Text)
     question_code: Mapped[str | None] = mapped_column(Text, nullable=True)
+    expected_points: Mapped[str | None] = mapped_column(Text, nullable=True)
     answer_text: Mapped[str | None] = mapped_column(Text, nullable=True)
     score: Mapped[int | None] = mapped_column(Integer, nullable=True)
     feedback: Mapped[str | None] = mapped_column(Text, nullable=True)

diff --git a/app/shared/questions.py b/app/shared/questions.py
@@ -29,6 +29,7 @@ class Question:
         tags: List of topic tags.
         text: The question text.
         code: Optional code snippet (None if not applicable).
+        expected_points: Rubric bullets for AI evaluation.
     """
 
     id: str
@@ -37,6 +38,7 @@ class Question:
     tags: list[str]
     text: str
     code: str | None
+    expected_points: tuple[str, ...] = ()
 
 
 def _resolve_localized_string(
@@ -111,6 +113,10 @@ def load_category(
     questions = []
     for q in data.get("questions", []):
         qid = q["id"]
+        points_raw = q.get("expected_points", [])
+        if not isinstance(points_raw, list):
+            msg = f"Question {qid}: invalid expected_points"
+            raise ValueError(msg)
         questions.append(
             Question(
                 id=qid,
@@ -124,6 +130,7 @@ def load_category(
                     question_id=qid,
                 ),
                 code=q["question"].get("code"),
+                expected_points=tuple(str(point) for point in points_raw),
             )
         )
     return questions

diff --git a/app/shared/structured_evaluation.py b/app/shared/structured_evaluation.py
@@ -65,7 +65,7 @@ async def generate_and_parse_json_response[T: BaseModel](
     messages: list[Message],
     response_model: type[T],
     max_tokens: int = 2000,
-    temperature: float = 0.3,
+    temperature: float = 0.1,
 ) -> T:
     """Generate JSON from chat messages and parse it with retry on truncation.
 
@@ -167,14 +167,14 @@ async def evaluate_with_schema[T: BaseModel](
                 messages=messages,
                 audio_wav=audio_wav,
                 user_text=user_text,
-                temperature=0.3,
+                temperature=0.0,
                 max_tokens=budget,
             )
         else:
             messages.append(Message(role="user", content=user_text))
             result = await provider.generate(
                 messages=messages,
-                temperature=0.3,
+                temperature=0.0,
                 max_tokens=budget,
             )
 

diff --git a/app/theory/domain/entities.py b/app/theory/domain/entities.py
@@ -44,6 +44,7 @@ class TheoryTask:
         round: Follow-up round number (0 = initial).
         question_text: Question text shown to the user.
         question_code: Optional code snippet for the question.
+        expected_points: Rubric bullets for AI evaluation.
         answer_text: User answer text, or None when unanswered.
         score: AI score for the round, or None when not evaluated.
         feedback: AI-generated feedback text, or None.
@@ -68,6 +69,7 @@ class TheoryTask:
     feedback: str | None
     started_at: datetime | None
     created_at: datetime
+    expected_points: tuple[str, ...] = ()
 
     def timer_deadline(self, limit_seconds: int) -> datetime:
         """Compute the absolute deadline for this timed task round.
@@ -240,6 +242,7 @@ def start(
                     feedback=None,
                     started_at=timer_start if order == 1 else None,
                     created_at=when,
+                    expected_points=question.expected_points,
                 )
             )
         return cls(
@@ -452,6 +455,7 @@ def with_follow_up(
             feedback=None,
             started_at=None,
             created_at=created_at,
+            expected_points=base.expected_points,
         )
         return replace(self, tasks=self.tasks + (follow_up,)), follow_up
 

diff --git a/app/theory/domain/value_objects.py b/app/theory/domain/value_objects.py
@@ -15,8 +15,10 @@ class PlannedTheoryQuestion:
         id: Unique question identifier from the question bank.
         text: Localized question text shown to the user.
         code: Optional code snippet, or None when not applicable.
+        expected_points: Rubric bullets for AI evaluation.
     """
 
     id: str
     text: str
     code: str | None
+    expected_points: tuple[str, ...] = ()
diff --git a/app/theory/repositories/mappers.py b/app/theory/repositories/mappers.py
@@ -20,6 +20,37 @@
 from app.theory.schemas.theory import TheorySectionRead, TheoryTaskRead
 
 
+def _expected_points_to_json(points: tuple[str, ...]) -> str | None:
+    """Encode rubric bullets as a compact JSON array for the ORM column.
+
+    Args:
+        points: Rubric bullets carried by the domain task.
+
+    Returns:
+        Compact JSON array text, or None when there are no bullets.
+    """
+    # An empty rubric is persisted as NULL rather than "[]".
+    compact = (",", ":")
+    return json.dumps(list(points), separators=compact) if points else None
+
+
+def _expected_points_from_json(raw: str | None) -> tuple[str, ...]:
+    """Deserialize rubric bullets from an ORM column, tolerating bad data.
+
+    Args:
+        raw: JSON array string, or None when the row stores no rubric.
+
+    Returns:
+        Tuple of bullet strings; empty for None, invalid JSON, or a non-array.
+    """
+    try:
+        data = json.loads(raw) if raw else None
+    except json.JSONDecodeError:
+        # Treat a corrupt value like a wrong-shaped one: fall back to no rubric.
+        data = None
+    return tuple(str(point) for point in data) if isinstance(data, list) else ()
+
+
 def _question_ids_from_tasks(tasks: tuple[DomainTheoryTask, ...]) -> tuple[str, ...]:
     """Derive ordered question IDs from initial task rounds.
 
@@ -59,6 +90,7 @@ def theory_task_from_orm(
         round=answer.round,
         question_text=answer.question_text,
         question_code=answer.question_code,
+        expected_points=_expected_points_from_json(answer.expected_points),
         answer_text=answer.answer_text,
         score=answer.score,
         feedback=answer.feedback,
@@ -91,6 +123,7 @@ def domain_theory_task_to_orm(
         round=task.round,
         question_text=task.question_text,
         question_code=task.question_code,
+        expected_points=_expected_points_to_json(task.expected_points),
         answer_text=task.answer_text,
         score=task.score,
         feedback=task.feedback,

diff --git a/app/theory/services/evaluator/prompts.py b/app/theory/services/evaluator/prompts.py
@@ -43,6 +43,20 @@
 feedback, strengths, or weaknesses; focus on whether the candidate grasped the concepts."""
 
 ANSWER_EVALUATION_INSTRUCTIONS = """You are a technical interviewer evaluating a candidate's answer.
+
+Evaluate ONLY the candidate text under "Candidate answer (evaluate this only):".
+Do NOT treat question text, code blocks in the question, expected rubric points,
+or your own knowledge as something the candidate said.
+
+If the answer is empty, off-topic, only asks the interviewer what to do,
+or does not attempt to explain the topic, score 1 and set follow_up_needed true.
+
+In feedback, quote or paraphrase only what appears under the candidate answer.
+Do not praise code or explanations that are not present in the answer.
+
+Use the expected rubric points as a checklist for what a strong answer should cover,
+but score only what the candidate actually stated.
+
 Assess the answer based on:
 - 5: Excellent — complete understanding, examples, edge cases considered
 - 4: Good — solid understanding with minor omissions
@@ -105,6 +119,22 @@
 }"""
 
 
+def format_expected_rubric(
+    expected_points: tuple[str, ...] | list[str] | None,
+) -> str:
+    """Render rubric bullets as a dash-prefixed list for evaluator prompts.
+
+    Args:
+        expected_points: Rubric bullets for the question; may be empty or None.
+
+    Returns:
+        One ``- bullet`` line per point, or ``(none)`` when there are none.
+    """
+    if expected_points:
+        return "\n".join([f"- {bullet}" for bullet in expected_points])
+    return "(none)"
+
+
 def build_evaluator_instructions(locale: str, task_instructions: str) -> str:
     """Combine locale, substance-focused rules, and task-specific instructions.
 

diff --git a/app/theory/services/evaluator/service.py b/app/theory/services/evaluator/service.py
@@ -23,6 +23,7 @@
     SECTION_EVALUATION_INSTRUCTIONS,
     SESSION_EVALUATION_INSTRUCTIONS,
     build_evaluator_instructions,
+    format_expected_rubric,
     looks_like_json_schema_fragment,
 )
 
@@ -62,6 +63,39 @@ def _format_question(question_text: str, question_code: str | None) -> str:
             return f"{question_text}\n\nCode:\n{question_code}"
         return question_text
 
+    @staticmethod
+    def _format_answer_evaluation_user_text(
+        question_text: str,
+        question_code: str | None,
+        answer_text: str | None = None,
+        expected_points: tuple[str, ...] | None = None,
+    ) -> str:
+        """Build labeled user text for initial answer evaluation prompts.
+
+        The candidate section is always emitted because the evaluator
+        instructions refer to its label; audio-only input gets a placeholder.
+
+        Args:
+            question_text: The question text.
+            question_code: Optional code snippet from the question.
+            answer_text: Candidate answer text, or None for audio-only input.
+            expected_points: Rubric bullets from the question bank.
+
+        Returns:
+            Context, rubric, and candidate sections joined by blank lines.
+        """
+        question = TheoryEvaluatorService._format_question(question_text, question_code)
+        rubric = format_expected_rubric(expected_points)
+        if answer_text is None:
+            # Audio mode: keep the labeled section and point at the recording.
+            answer_text = "(The candidate answered by voice; evaluate the attached audio.)"
+        return (
+            f"Question (for context only, NOT part of the answer):\n{question}\n\n"
+            "Expected rubric points (checklist only, NOT candidate content):\n"
+            f"{rubric}\n\n"
+            f"Candidate answer (evaluate this only):\n{answer_text}"
+        )
+
     @staticmethod
     async def _evaluate_with_schema(
         provider: AIProvider,
@@ -106,6 +140,7 @@ async def evaluate_answer(
         question_text: str,
         answer_text: str,
         question_code: str | None = None,
+        expected_points: tuple[str, ...] | None = None,
         locale: str = DEFAULT_LOCALE,
     ) -> AnswerEvaluation:
         """Evaluate a user's initial answer (round=0).
@@ -115,6 +150,7 @@ async def evaluate_answer(
             question_text: The question text.
             answer_text: The user's answer.
             question_code: Optional code snippet from the question.
+            expected_points: Rubric bullets from the question bank.
             locale: Locale for AI feedback and follow-up questions.
 
         Returns:
@@ -123,8 +159,12 @@ async def evaluate_answer(
         Raises:
             ValueError: If AI response is invalid or connection fails.
         """
-        question = TheoryEvaluatorService._format_question(question_text, question_code)
-        user_text = f"Question:\n{question}\n\nAnswer:\n{answer_text}"
+        user_text = TheoryEvaluatorService._format_answer_evaluation_user_text(
+            question_text=question_text,
+            question_code=question_code,
+            answer_text=answer_text,
+            expected_points=expected_points,
+        )
         return await TheoryEvaluatorService._evaluate_with_schema(
             provider,
             locale=locale,
@@ -139,6 +179,7 @@ async def evaluate_answer_with_audio(
         question_text: str,
         audio_wav: bytes,
         question_code: str | None = None,
+        expected_points: tuple[str, ...] | None = None,
         locale: str = DEFAULT_LOCALE,
     ) -> AnswerEvaluation:
         """Evaluate a user's initial spoken answer (round=0).
@@ -148,6 +189,7 @@ async def evaluate_answer_with_audio(
             question_text: The question text.
             audio_wav: The user's spoken answer as WAV bytes.
             question_code: Optional code snippet from the question.
+            expected_points: Rubric bullets from the question bank.
             locale: Locale for AI feedback and follow-up questions.
 
         Returns:
@@ -156,13 +198,17 @@ async def evaluate_answer_with_audio(
         Raises:
             ValueError: If AI response is invalid or connection fails.
         """
-        question = TheoryEvaluatorService._format_question(question_text, question_code)
+        user_text = TheoryEvaluatorService._format_answer_evaluation_user_text(
+            question_text=question_text,
+            question_code=question_code,
+            expected_points=expected_points,
+        )
         return await TheoryEvaluatorService._evaluate_with_schema(
             provider,
             locale=locale,
             instructions=ANSWER_EVALUATION_INSTRUCTIONS,
             response_model=AnswerEvaluation,
-            user_text=f"Question:\n{question}",
+            user_text=user_text,
             audio_wav=audio_wav,
         )
 
@@ -290,6 +336,7 @@ async def evaluate_submission(
         question_code: str | None,
         initial_question_text: str,
         initial_answer_text: str,
+        expected_points: tuple[str, ...] | None = None,
         answer_text: str | None = None,
         audio_wav: bytes | None = None,
     ) -> tuple[AnswerEvaluation | FollowUpEvaluation, bool, str | None]:
@@ -303,6 +350,7 @@ async def evaluate_submission(
             question_code: Optional code snippet for the question.
             initial_question_text: Original question text (round 0).
             initial_answer_text: User's initial answer text (round 0).
+            expected_points: Rubric bullets from the question bank.
             answer_text: User answer text for text-mode evaluation.
             audio_wav: Spoken answer WAV for multimodal evaluation.
 
@@ -323,6 +371,7 @@ async def evaluate_submission(
                     question_text=question_text,
                     audio_wav=audio_wav,
                     question_code=question_code,
+                    expected_points=expected_points,
                     locale=locale,
                 )
             else:
@@ -331,6 +380,7 @@ async def evaluate_submission(
                     question_text=question_text,
                     answer_text=answer_text or "",
                     question_code=question_code,
+                    expected_points=expected_points,
                     locale=locale,
                 )
         elif audio_wav is not None: