From f41cf56f1d070c1ef3cf3ef4986a9c234fd514fa Mon Sep 17 00:00:00 2001
From: vitchenkokir <vitchenkokir@gmail.com>
Date: Fri, 12 Jun 2026 13:43:25 +0300
Subject: [PATCH] fix model evaluation prompt

---
 CHANGELOG.md                                  |  2 +
 app/interview/domain/value_objects.py         |  2 +
 app/shared/infrastructure/models.py           |  2 +
 app/shared/questions.py                       |  7 +++
 app/shared/structured_evaluation.py           |  6 +-
 app/theory/domain/entities.py                 |  4 ++
 app/theory/domain/value_objects.py            |  2 +
 app/theory/repositories/mappers.py            | 33 +++++++++++
 app/theory/services/evaluator/prompts.py      | 30 ++++++++++
 app/theory/services/evaluator/service.py      | 58 +++++++++++++++++--
 app/theory/services/planning.py               | 14 ++++-
 app/theory/services/submission.py             | 10 ++++
 tests/shared/test_questions.py                | 36 ++++++++++++
 .../repositories/test_theory_section.py       | 53 +++++++++++++++++
 tests/theory/services/test_evaluator.py       | 29 ++++++++++
 15 files changed, 279 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index af26e5f..747577c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,8 @@ Work in progress is accumulated under `[Unreleased]`; on release, that section b
 
 ### Changed
 
+- **Theory answer evaluation** — load `expected_points` rubric bullets from question banks, pass them through evaluation prompts with explicit candidate-only scoring rules, use temperature 0 for schema-based LLM evaluation, and lower the default temperature of JSON response generation from 0.3 to 0.1
+
 ### Fixed
 
 ### Removed
diff --git a/app/interview/domain/value_objects.py b/app/interview/domain/value_objects.py
index 99fb3a2..3dade4d 100644
--- a/app/interview/domain/value_objects.py
+++ b/app/interview/domain/value_objects.py
@@ -49,11 +49,13 @@ class PlannedQuestion:
         id: Unique question identifier from the question bank.
         text: Localized question text shown to the user.
         code: Optional code snippet, or None when not applicable.
+        expected_points: Rubric bullets for AI evaluation.
     """
 
     id: str
     text: str
     code: str | None
+    expected_points: tuple[str, ...] = ()
 
 
 class InterviewSelectionHolder(Protocol):
diff --git a/app/shared/infrastructure/models.py b/app/shared/infrastructure/models.py
index d01dd04..a5225ca 100644
--- a/app/shared/infrastructure/models.py
+++ b/app/shared/infrastructure/models.py
@@ -123,6 +123,7 @@ class Answer(Base):
         round: Follow-up round number (0 = initial, 1+ = follow-ups).
         question_text: Snapshot of the question text at time of asking.
         question_code: Snapshot of the optional code snippet.
+        expected_points: JSON array of rubric bullets (None if empty or legacy row).
         answer_text: User's answer text (None if skipped).
         score: AI-assigned score (1-5, or 0 on timeout), None if not yet evaluated.
         feedback: AI-generated feedback text.
@@ -143,6 +144,7 @@ class Answer(Base):
     round: Mapped[int] = mapped_column(Integer, default=0)
     question_text: Mapped[str] = mapped_column(Text)
     question_code: Mapped[str | None] = mapped_column(Text, nullable=True)
+    expected_points: Mapped[str | None] = mapped_column(Text, nullable=True)
     answer_text: Mapped[str | None] = mapped_column(Text, nullable=True)
     score: Mapped[int | None] = mapped_column(Integer, nullable=True)
     feedback: Mapped[str | None] = mapped_column(Text, nullable=True)
diff --git a/app/shared/questions.py b/app/shared/questions.py
index 0065067..d874899 100644
--- a/app/shared/questions.py
+++ b/app/shared/questions.py
@@ -29,6 +29,7 @@ class Question:
         tags: List of topic tags.
         text: The question text.
         code: Optional code snippet (None if not applicable).
+        expected_points: Rubric bullets for AI evaluation.
     """
 
     id: str
@@ -37,6 +38,7 @@ class Question:
     tags: list[str]
     text: str
     code: str | None
+    expected_points: tuple[str, ...] = ()
 
 
 def _resolve_localized_string(
@@ -111,6 +113,10 @@ def load_category(
     questions = []
     for q in data.get("questions", []):
         qid = q["id"]
+        points_raw = q.get("expected_points", [])
+        if not isinstance(points_raw, list):
+            msg = f"Question {qid}: invalid expected_points"
+            raise ValueError(msg)
         questions.append(
             Question(
                 id=qid,
@@ -124,6 +130,7 @@ def load_category(
                     question_id=qid,
                 ),
                 code=q["question"].get("code"),
+                expected_points=tuple(str(point) for point in points_raw),
             )
         )
     return questions
diff --git a/app/shared/structured_evaluation.py b/app/shared/structured_evaluation.py
index d39fa01..66b7e66 100644
--- a/app/shared/structured_evaluation.py
+++ b/app/shared/structured_evaluation.py
@@ -65,7 +65,7 @@ async def generate_and_parse_json_response[T: BaseModel](
     messages: list[Message],
     response_model: type[T],
     max_tokens: int = 2000,
-    temperature: float = 0.3,
+    temperature: float = 0.1,
 ) -> T:
     """Generate JSON from chat messages and parse it with retry on truncation.
 
@@ -167,14 +167,14 @@ async def evaluate_with_schema[T: BaseModel](
                 messages=messages,
                 audio_wav=audio_wav,
                 user_text=user_text,
-                temperature=0.3,
+                temperature=0.0,
                 max_tokens=budget,
             )
         else:
             messages.append(Message(role="user", content=user_text))
             result = await provider.generate(
                 messages=messages,
-                temperature=0.3,
+                temperature=0.0,
                 max_tokens=budget,
             )
 
diff --git a/app/theory/domain/entities.py b/app/theory/domain/entities.py
index 360e470..7772e75 100644
--- a/app/theory/domain/entities.py
+++ b/app/theory/domain/entities.py
@@ -44,6 +44,7 @@ class TheoryTask:
         round: Follow-up round number (0 = initial).
         question_text: Question text shown to the user.
         question_code: Optional code snippet for the question.
+        expected_points: Rubric bullets for AI evaluation.
         answer_text: User answer text, or None when unanswered.
         score: AI score for the round, or None when not evaluated.
         feedback: AI-generated feedback text, or None.
@@ -68,6 +69,7 @@ class TheoryTask:
     feedback: str | None
     started_at: datetime | None
     created_at: datetime
+    expected_points: tuple[str, ...] = ()
 
     def timer_deadline(self, limit_seconds: int) -> datetime:
         """Compute the absolute deadline for this timed task round.
@@ -240,6 +242,7 @@ def start(
                     feedback=None,
                     started_at=timer_start if order == 1 else None,
                     created_at=when,
+                    expected_points=question.expected_points,
                 )
             )
         return cls(
@@ -452,6 +455,7 @@ def with_follow_up(
             feedback=None,
             started_at=None,
             created_at=created_at,
+            expected_points=base.expected_points,
         )
         return replace(self, tasks=self.tasks + (follow_up,)), follow_up
 
diff --git a/app/theory/domain/value_objects.py b/app/theory/domain/value_objects.py
index a83361b..c15283c 100644
--- a/app/theory/domain/value_objects.py
+++ b/app/theory/domain/value_objects.py
@@ -15,8 +15,10 @@ class PlannedTheoryQuestion:
         id: Unique question identifier from the question bank.
         text: Localized question text shown to the user.
         code: Optional code snippet, or None when not applicable.
+        expected_points: Rubric bullets for AI evaluation.
     """
 
     id: str
     text: str
     code: str | None
+    expected_points: tuple[str, ...] = ()
diff --git a/app/theory/repositories/mappers.py b/app/theory/repositories/mappers.py
index e899c9f..5146b97 100644
--- a/app/theory/repositories/mappers.py
+++ b/app/theory/repositories/mappers.py
@@ -20,6 +20,37 @@
 from app.theory.schemas.theory import TheorySectionRead, TheoryTaskRead
 
 
+def _expected_points_to_json(points: tuple[str, ...]) -> str | None:
+    """Serialize rubric bullets for ORM storage.
+
+    Args:
+        points: Domain rubric tuple.
+
+    Returns:
+        JSON array string, or None when empty.
+    """
+    if not points:
+        return None
+    return json.dumps(list(points), separators=(",", ":"))
+
+
+def _expected_points_from_json(raw: str | None) -> tuple[str, ...]:
+    """Deserialize rubric bullets from an ORM column.
+
+    Args:
+        raw: JSON array string, or None for legacy rows and empty rubrics.
+
+    Returns:
+        Tuple of rubric bullet strings; empty when raw is None or not a JSON array.
+    """
+    if raw is None:
+        return ()
+    data = json.loads(raw)
+    if not isinstance(data, list):
+        return ()
+    return tuple(str(point) for point in data)
+
+
 def _question_ids_from_tasks(tasks: tuple[DomainTheoryTask, ...]) -> tuple[str, ...]:
     """Derive ordered question IDs from initial task rounds.
 
@@ -59,6 +90,7 @@ def theory_task_from_orm(
         round=answer.round,
         question_text=answer.question_text,
         question_code=answer.question_code,
+        expected_points=_expected_points_from_json(answer.expected_points),
         answer_text=answer.answer_text,
         score=answer.score,
         feedback=answer.feedback,
@@ -91,6 +123,7 @@ def domain_theory_task_to_orm(
         round=task.round,
         question_text=task.question_text,
         question_code=task.question_code,
+        expected_points=_expected_points_to_json(task.expected_points),
         answer_text=task.answer_text,
         score=task.score,
         feedback=task.feedback,
diff --git a/app/theory/services/evaluator/prompts.py b/app/theory/services/evaluator/prompts.py
index 6fe511f..f229e70 100644
--- a/app/theory/services/evaluator/prompts.py
+++ b/app/theory/services/evaluator/prompts.py
@@ -43,6 +43,20 @@
 feedback, strengths, or weaknesses; focus on whether the candidate grasped the concepts."""
 
 ANSWER_EVALUATION_INSTRUCTIONS = """You are a technical interviewer evaluating a candidate's answer.
+
+Evaluate ONLY the candidate text under "Candidate answer (evaluate this only):".
+Do NOT treat question text, code blocks in the question, expected rubric points,
+or your own knowledge as something the candidate said.
+
+If the answer is empty, off-topic, only asks the interviewer what to do,
+or does not attempt to explain the topic, score 1 and set follow_up_needed true.
+
+In feedback, quote or paraphrase only what appears under the candidate answer.
+Do not praise code or explanations that are not present in the answer.
+
+Use the expected rubric points as a checklist for what a strong answer should cover,
+but score only what the candidate actually stated.
+
 Assess the answer based on:
 - 5: Excellent — complete understanding, examples, edge cases considered
 - 4: Good — solid understanding with minor omissions
@@ -105,6 +119,22 @@
 }"""
 
 
+def format_expected_rubric(
+    expected_points: tuple[str, ...] | list[str] | None,
+) -> str:
+    """Format rubric bullets for evaluator prompts.
+
+    Args:
+        expected_points: Rubric bullets from the question bank.
+
+    Returns:
+        Bulleted list text, or ``(none)`` when empty.
+    """
+    if not expected_points:
+        return "(none)"
+    return "\n".join(f"- {point}" for point in expected_points)
+
+
 def build_evaluator_instructions(locale: str, task_instructions: str) -> str:
     """Combine locale, substance-focused rules, and task-specific instructions.
 
diff --git a/app/theory/services/evaluator/service.py b/app/theory/services/evaluator/service.py
index 34198fe..b3eec0b 100644
--- a/app/theory/services/evaluator/service.py
+++ b/app/theory/services/evaluator/service.py
@@ -23,6 +23,7 @@
     SECTION_EVALUATION_INSTRUCTIONS,
     SESSION_EVALUATION_INSTRUCTIONS,
     build_evaluator_instructions,
+    format_expected_rubric,
     looks_like_json_schema_fragment,
 )
 
@@ -62,6 +63,39 @@ def _format_question(question_text: str, question_code: str | None) -> str:
             return f"{question_text}\n\nCode:\n{question_code}"
         return question_text
 
+    @staticmethod
+    def _format_answer_evaluation_user_text(
+        question_text: str,
+        question_code: str | None,
+        answer_text: str | None = None,
+        expected_points: tuple[str, ...] | None = None,
+    ) -> str:
+        """Build labeled user text for initial answer evaluation prompts.
+
+        Args:
+            question_text: The question text.
+            question_code: Optional code snippet from the question.
+            answer_text: Candidate answer text, or None for audio-only input.
+            expected_points: Rubric bullets from the question bank.
+
+        Returns:
+            Labeled prompt text separating context from candidate content.
+        """
+        question = TheoryEvaluatorService._format_question(question_text, question_code)
+        rubric = format_expected_rubric(expected_points)
+        parts = [
+            f"Question (for context only, NOT part of the answer):\n{question}",
+            (
+                "Expected rubric points (checklist only, NOT candidate content):\n"
+                f"{rubric}"
+            ),
+        ]
+        if answer_text is not None:
+            parts.append(
+                f"Candidate answer (evaluate this only):\n{answer_text}",
+            )
+        return "\n\n".join(parts)
+
     @staticmethod
     async def _evaluate_with_schema(
         provider: AIProvider,
@@ -106,6 +140,7 @@ async def evaluate_answer(
         question_text: str,
         answer_text: str,
         question_code: str | None = None,
+        expected_points: tuple[str, ...] | None = None,
         locale: str = DEFAULT_LOCALE,
     ) -> AnswerEvaluation:
         """Evaluate a user's initial answer (round=0).
@@ -115,6 +150,7 @@ async def evaluate_answer(
             question_text: The question text.
             answer_text: The user's answer.
             question_code: Optional code snippet from the question.
+            expected_points: Rubric bullets from the question bank.
             locale: Locale for AI feedback and follow-up questions.
 
         Returns:
@@ -123,8 +159,12 @@ async def evaluate_answer(
         Raises:
             ValueError: If AI response is invalid or connection fails.
         """
-        question = TheoryEvaluatorService._format_question(question_text, question_code)
-        user_text = f"Question:\n{question}\n\nAnswer:\n{answer_text}"
+        user_text = TheoryEvaluatorService._format_answer_evaluation_user_text(
+            question_text=question_text,
+            question_code=question_code,
+            answer_text=answer_text,
+            expected_points=expected_points,
+        )
         return await TheoryEvaluatorService._evaluate_with_schema(
             provider,
             locale=locale,
@@ -139,6 +179,7 @@ async def evaluate_answer_with_audio(
         question_text: str,
         audio_wav: bytes,
         question_code: str | None = None,
+        expected_points: tuple[str, ...] | None = None,
         locale: str = DEFAULT_LOCALE,
     ) -> AnswerEvaluation:
         """Evaluate a user's initial spoken answer (round=0).
@@ -148,6 +189,7 @@ async def evaluate_answer_with_audio(
             question_text: The question text.
             audio_wav: The user's spoken answer as WAV bytes.
             question_code: Optional code snippet from the question.
+            expected_points: Rubric bullets from the question bank.
             locale: Locale for AI feedback and follow-up questions.
 
         Returns:
@@ -156,13 +198,17 @@ async def evaluate_answer_with_audio(
         Raises:
             ValueError: If AI response is invalid or connection fails.
         """
-        question = TheoryEvaluatorService._format_question(question_text, question_code)
+        user_text = TheoryEvaluatorService._format_answer_evaluation_user_text(
+            question_text=question_text,
+            question_code=question_code,
+            expected_points=expected_points,
+        )
         return await TheoryEvaluatorService._evaluate_with_schema(
             provider,
             locale=locale,
             instructions=ANSWER_EVALUATION_INSTRUCTIONS,
             response_model=AnswerEvaluation,
-            user_text=f"Question:\n{question}",
+            user_text=user_text,
             audio_wav=audio_wav,
         )
 
@@ -290,6 +336,7 @@ async def evaluate_submission(
         question_code: str | None,
         initial_question_text: str,
         initial_answer_text: str,
+        expected_points: tuple[str, ...] | None = None,
         answer_text: str | None = None,
         audio_wav: bytes | None = None,
     ) -> tuple[AnswerEvaluation | FollowUpEvaluation, bool, str | None]:
@@ -303,6 +350,7 @@ async def evaluate_submission(
             question_code: Optional code snippet for the question.
             initial_question_text: Original question text (round 0).
             initial_answer_text: User's initial answer text (round 0).
+            expected_points: Rubric bullets from the question bank.
             answer_text: User answer text for text-mode evaluation.
             audio_wav: Spoken answer WAV for multimodal evaluation.
 
@@ -323,6 +371,7 @@ async def evaluate_submission(
                     question_text=question_text,
                     audio_wav=audio_wav,
                     question_code=question_code,
+                    expected_points=expected_points,
                     locale=locale,
                 )
             else:
@@ -331,6 +380,7 @@ async def evaluate_submission(
                     question_text=question_text,
                     answer_text=answer_text or "",
                     question_code=question_code,
+                    expected_points=expected_points,
                     locale=locale,
                 )
         elif audio_wav is not None:
diff --git a/app/theory/services/planning.py b/app/theory/services/planning.py
index fe1d7bc..891a51e 100644
--- a/app/theory/services/planning.py
+++ b/app/theory/services/planning.py
@@ -41,7 +41,12 @@ def _to_planned(question: Question) -> PlannedQuestion:
     Returns:
         Domain value object for theory section creation.
     """
-    return PlannedQuestion(id=question.id, text=question.text, code=question.code)
+    return PlannedQuestion(
+        id=question.id,
+        text=question.text,
+        code=question.code,
+        expected_points=question.expected_points,
+    )
 
 
 def validate_selection(selection: InterviewSelection) -> None:
@@ -142,6 +147,11 @@ def build_theory_question_plan(
     track_pools = load_track_pools(selection, locale)
     planned = plan_questions(selection, question_count, track_pools)
     return tuple(
-        PlannedTheoryQuestion(id=question.id, text=question.text, code=question.code)
+        PlannedTheoryQuestion(
+            id=question.id,
+            text=question.text,
+            code=question.code,
+            expected_points=question.expected_points,
+        )
         for question in planned
     )
diff --git a/app/theory/services/submission.py b/app/theory/services/submission.py
index 92d700c..d6dd402 100644
--- a/app/theory/services/submission.py
+++ b/app/theory/services/submission.py
@@ -54,6 +54,7 @@ class TheorySubmissionContext:
         question_code: Optional code snippet for the question.
         initial_question_text: Original question text (round 0).
         initial_answer_text: User's initial answer text (round 0).
+        expected_points: Rubric bullets for AI evaluation.
         locale: Section locale for AI and speech.
         answer_text: Text persisted on the task row (may be empty for audio).
     """
@@ -65,6 +66,7 @@ class TheorySubmissionContext:
     question_code: str | None
     initial_question_text: str
     initial_answer_text: str
+    expected_points: tuple[str, ...]
     locale: str
     answer_text: str
 
@@ -96,6 +98,7 @@ async def _evaluate_last_follow_up_in_background(
     answer_text: str,
     initial_question_text: str,
     initial_answer_text: str,
+    expected_points: tuple[str, ...],
     provider: AIProvider,
     locale: str,
     audio_wav: bytes | None = None,
@@ -111,6 +114,7 @@ async def _evaluate_last_follow_up_in_background(
         answer_text: The user's answer text (transcript when audio was submitted).
         initial_question_text: Original question text (round 0).
         initial_answer_text: User's initial answer text (round 0).
+        expected_points: Rubric bullets for AI evaluation.
         provider: AI provider for evaluation.
         locale: Locale for AI feedback.
         audio_wav: Optional spoken answer WAV for multimodal evaluation.
@@ -125,6 +129,7 @@ async def _evaluate_last_follow_up_in_background(
                 question_code=question_code,
                 initial_question_text=initial_question_text,
                 initial_answer_text=initial_answer_text,
+                expected_points=expected_points,
                 audio_wav=audio_wav,
             )
         else:
@@ -136,6 +141,7 @@ async def _evaluate_last_follow_up_in_background(
                 question_code=question_code,
                 initial_question_text=initial_question_text,
                 initial_answer_text=initial_answer_text,
+                expected_points=expected_points,
                 answer_text=answer_text,
             )
         TheoryEvaluationPersistenceService.persist_evaluation_only(
@@ -230,6 +236,7 @@ async def _open_submission(
                     question_code=saved.question_code,
                     initial_question_text=initial_question_text,
                     initial_answer_text=initial_answer_text,
+                    expected_points=saved.expected_points,
                     locale=updated.locale,
                     answer_text=answer_text,
                 )
@@ -306,6 +313,7 @@ def _schedule_last_follow_up_evaluation(
                 answer_text=ctx.answer_text,
                 initial_question_text=ctx.initial_question_text,
                 initial_answer_text=ctx.initial_answer_text,
+                expected_points=ctx.expected_points,
                 provider=provider,
                 locale=ctx.locale,
                 audio_wav=audio_wav,
@@ -433,6 +441,7 @@ async def stream_answer_submission(
                 question_code=ctx.question_code,
                 initial_question_text=ctx.initial_question_text,
                 initial_answer_text=ctx.initial_answer_text,
+                expected_points=ctx.expected_points,
                 answer_text=ctx.answer_text,
             )
         )
@@ -537,6 +546,7 @@ async def stream_audio_answer_submission(
                 question_code=ctx.question_code,
                 initial_question_text=ctx.initial_question_text,
                 initial_answer_text=ctx.initial_answer_text,
+                expected_points=ctx.expected_points,
                 audio_wav=wav_bytes,
             ),
             name=f"audio-eval-{interview_id}-{ctx.question_id}-r{ctx.round_num}",
diff --git a/tests/shared/test_questions.py b/tests/shared/test_questions.py
index a3503e4..0a6fe16 100644
--- a/tests/shared/test_questions.py
+++ b/tests/shared/test_questions.py
@@ -196,6 +196,42 @@ def test_list_categories_non_existent_track(self, temp_questions_dir):
         categories = list_categories("java", "junior")
         assert categories == []
 
+    def test_load_category_expected_points(self, temp_questions_dir):
+        """Test loading expected_points rubric bullets."""
+        path = temp_questions_dir / "python" / "junior" / "rubric.yaml"
+        _write_category_yaml(
+            path,
+            [
+                {
+                    "id": "rub-001",
+                    "type": "knowledge",
+                    "difficulty": 1,
+                    "expected_points": ["Point A", "Point B"],
+                    "question": {"text": "Explain lists.", "code": None},
+                }
+            ],
+        )
+        questions = load_category("python", "junior", "rubric")
+        assert questions[0].expected_points == ("Point A", "Point B")
+
+    def test_load_category_invalid_expected_points_raises(self, temp_questions_dir):
+        """Invalid expected_points shape raises ValueError."""
+        path = temp_questions_dir / "python" / "junior" / "bad-rubric.yaml"
+        _write_category_yaml(
+            path,
+            [
+                {
+                    "id": "bad-rub-001",
+                    "type": "knowledge",
+                    "difficulty": 1,
+                    "expected_points": "not-a-list",
+                    "question": {"text": "Bad rubric.", "code": None},
+                }
+            ],
+        )
+        with pytest.raises(ValueError, match="bad-rub-001"):
+            load_category("python", "junior", "bad-rubric")
+
     def test_load_category_with_code(self, temp_questions_dir):
         """Test loading a question with a code snippet."""
         path = temp_questions_dir / "python" / "junior" / "with-code.yaml"
diff --git a/tests/theory/repositories/test_theory_section.py b/tests/theory/repositories/test_theory_section.py
index 1623db5..e777fcd 100644
--- a/tests/theory/repositories/test_theory_section.py
+++ b/tests/theory/repositories/test_theory_section.py
@@ -41,6 +41,7 @@ def _sample_planned() -> tuple[PlannedTheoryQuestion, ...]:
             id="py-001",
             text="What is a list?",
             code=None,
+            expected_points=("Mutable sequence", "Ordered"),
         ),
         PlannedTheoryQuestion(
             id="py-002",
@@ -70,6 +71,7 @@ def test_timer_deadline_and_expiry(self) -> None:
             feedback=None,
             started_at=started,
             created_at=started,
+            expected_points=(),
         )
         deadline = task.timer_deadline(120)
         assert deadline == started + timedelta(seconds=120)
@@ -107,6 +109,29 @@ def test_start_defers_timer_when_not_first_task(self) -> None:
         )
         assert section.tasks[0].started_at is None
 
+    def test_start_preserves_expected_points_on_tasks(self) -> None:
+        """Planned rubric bullets are copied onto initial task rows."""
+        section = DomainTheorySection.start(
+            "iv-1",
+            selection=_sample_selection(),
+            locale="en",
+            planned_questions=_sample_planned(),
+        )
+        assert section.tasks[0].expected_points == ("Mutable sequence", "Ordered")
+        assert section.tasks[1].expected_points == ()
+
+    def test_with_follow_up_copies_expected_points(self) -> None:
+        """Follow-up task rows inherit rubric bullets from the base question."""
+        section = DomainTheorySection.start(
+            "iv-1",
+            selection=_sample_selection(),
+            locale="en",
+            planned_questions=_sample_planned(),
+        )
+        updated, follow_up = section.with_follow_up("py-001", "Give an example.")
+        assert follow_up.expected_points == ("Mutable sequence", "Ordered")
+        assert updated.tasks[-1].expected_points == ("Mutable sequence", "Ordered")
+
 
 class TestTheorySectionRepository:
     """Theory section persistence."""
@@ -149,6 +174,34 @@ def test_create_aggregate_persists_tasks(self, isolated_db) -> None:
         assert len(loaded.tasks) == 2
         assert loaded.tasks[0].theory_section_id == loaded.id
 
+    def test_create_aggregate_round_trips_expected_points(self, isolated_db) -> None:
+        """Repository persists rubric bullets on answer rows."""
+        with TheoryUnitOfWork() as uow:
+            uow.session.add(
+                Interview(
+                    id="iv-rubric",
+                    selection_spec='{"sources":[{"track":"python","level":"junior","categories":["basics"]}]}',
+                )
+            )
+            uow.commit()
+
+        section = DomainTheorySection.start(
+            "iv-rubric",
+            selection=_sample_selection(),
+            locale="en",
+            planned_questions=_sample_planned(),
+        )
+        with TheoryUnitOfWork() as uow:
+            uow.theory_sections.create_aggregate(section)
+            uow.commit()
+
+        with TheoryUnitOfWork() as uow:
+            loaded = uow.theory_sections.get_aggregate("iv-rubric")
+
+        assert loaded is not None
+        assert loaded.tasks[0].expected_points == ("Mutable sequence", "Ordered")
+        assert loaded.tasks[1].expected_points == ()
+
 
 @pytest.fixture
 def alembic_engine(tmp_path: Path, monkeypatch):
diff --git a/tests/theory/services/test_evaluator.py b/tests/theory/services/test_evaluator.py
index 1288305..455d023 100644
--- a/tests/theory/services/test_evaluator.py
+++ b/tests/theory/services/test_evaluator.py
@@ -5,10 +5,39 @@
 import pytest
 
 from app.ai.audio_probe import minimal_wav_bytes
+from app.theory.services.evaluator.prompts import format_expected_rubric
 from app.theory.services.evaluator.service import TheoryEvaluatorService
 from tests.fakes import FakeProvider, answer_evaluation_json, follow_up_evaluation_json
 
 
+def test_format_expected_rubric_empty() -> None:
+    """Empty rubric renders as (none)."""
+    assert format_expected_rubric(()) == "(none)"
+    assert format_expected_rubric(None) == "(none)"
+
+
+def test_format_expected_rubric_bullets() -> None:
+    """Rubric bullets render as a markdown list."""
+    text = format_expected_rubric(("First point", "Second point"))
+    assert text == "- First point\n- Second point"
+
+
+def test_format_answer_evaluation_user_text_labels_candidate_content() -> None:
+    """Evaluation prompt separates context from candidate answer."""
+    text = TheoryEvaluatorService._format_answer_evaluation_user_text(
+        question_text="What is a list?",
+        question_code="items = []",
+        answer_text="A mutable sequence.",
+        expected_points=("Ordered", "Mutable"),
+    )
+    assert "Question (for context only, NOT part of the answer):" in text
+    assert "Expected rubric points (checklist only, NOT candidate content):" in text
+    assert "Candidate answer (evaluate this only):" in text
+    assert "A mutable sequence." in text
+    assert "- Ordered" in text
+    assert "items = []" in text
+
+
 @pytest.mark.asyncio
 async def test_evaluate_with_audio_initial_round() -> None:
     """Round 0 audio evaluation uses the answer evaluation schema."""