From f41cf56f1d070c1ef3cf3ef4986a9c234fd514fa Mon Sep 17 00:00:00 2001 From: vitchenkokir Date: Fri, 12 Jun 2026 13:43:25 +0300 Subject: [PATCH] fix model evaluation prompt --- CHANGELOG.md | 2 + app/interview/domain/value_objects.py | 2 + app/shared/infrastructure/models.py | 2 + app/shared/questions.py | 7 +++ app/shared/structured_evaluation.py | 6 +- app/theory/domain/entities.py | 4 ++ app/theory/domain/value_objects.py | 2 + app/theory/repositories/mappers.py | 33 +++++++++++ app/theory/services/evaluator/prompts.py | 30 ++++++++++ app/theory/services/evaluator/service.py | 58 +++++++++++++++++-- app/theory/services/planning.py | 14 ++++- app/theory/services/submission.py | 10 ++++ tests/shared/test_questions.py | 36 ++++++++++++ .../repositories/test_theory_section.py | 53 +++++++++++++++++ tests/theory/services/test_evaluator.py | 29 ++++++++++ 15 files changed, 279 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index af26e5f..747577c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ Work in progress is accumulated under `[Unreleased]`; on release, that section b ### Changed +- **Theory answer evaluation** — load `expected_points` rubric bullets from question banks, pass them through evaluation prompts with explicit candidate-only scoring rules, and use temperature 0 for structured LLM evaluation + ### Fixed ### Removed diff --git a/app/interview/domain/value_objects.py b/app/interview/domain/value_objects.py index 99fb3a2..3dade4d 100644 --- a/app/interview/domain/value_objects.py +++ b/app/interview/domain/value_objects.py @@ -49,11 +49,13 @@ class PlannedQuestion: id: Unique question identifier from the question bank. text: Localized question text shown to the user. code: Optional code snippet, or None when not applicable. + expected_points: Rubric bullets for AI evaluation. """ id: str text: str code: str | None + expected_points: tuple[str, ...] 
= () class InterviewSelectionHolder(Protocol): diff --git a/app/shared/infrastructure/models.py b/app/shared/infrastructure/models.py index d01dd04..a5225ca 100644 --- a/app/shared/infrastructure/models.py +++ b/app/shared/infrastructure/models.py @@ -123,6 +123,7 @@ class Answer(Base): round: Follow-up round number (0 = initial, 1+ = follow-ups). question_text: Snapshot of the question text at time of asking. question_code: Snapshot of the optional code snippet. + expected_points: JSON array of rubric bullets (None for legacy rows). answer_text: User's answer text (None if skipped). score: AI-assigned score (1-5, or 0 on timeout), None if not yet evaluated. feedback: AI-generated feedback text. @@ -143,6 +144,7 @@ class Answer(Base): round: Mapped[int] = mapped_column(Integer, default=0) question_text: Mapped[str] = mapped_column(Text) question_code: Mapped[str | None] = mapped_column(Text, nullable=True) + expected_points: Mapped[str | None] = mapped_column(Text, nullable=True) answer_text: Mapped[str | None] = mapped_column(Text, nullable=True) score: Mapped[int | None] = mapped_column(Integer, nullable=True) feedback: Mapped[str | None] = mapped_column(Text, nullable=True) diff --git a/app/shared/questions.py b/app/shared/questions.py index 0065067..d874899 100644 --- a/app/shared/questions.py +++ b/app/shared/questions.py @@ -29,6 +29,7 @@ class Question: tags: List of topic tags. text: The question text. code: Optional code snippet (None if not applicable). + expected_points: Rubric bullets for AI evaluation. """ id: str @@ -37,6 +38,7 @@ class Question: tags: list[str] text: str code: str | None + expected_points: tuple[str, ...] 
= () def _resolve_localized_string( @@ -111,6 +113,10 @@ def load_category( questions = [] for q in data.get("questions", []): qid = q["id"] + points_raw = q.get("expected_points", []) + if not isinstance(points_raw, list): + msg = f"Question {qid}: invalid expected_points" + raise ValueError(msg) questions.append( Question( id=qid, @@ -124,6 +130,7 @@ def load_category( question_id=qid, ), code=q["question"].get("code"), + expected_points=tuple(str(point) for point in points_raw), ) ) return questions diff --git a/app/shared/structured_evaluation.py b/app/shared/structured_evaluation.py index d39fa01..66b7e66 100644 --- a/app/shared/structured_evaluation.py +++ b/app/shared/structured_evaluation.py @@ -65,7 +65,7 @@ async def generate_and_parse_json_response[T: BaseModel]( messages: list[Message], response_model: type[T], max_tokens: int = 2000, - temperature: float = 0.3, + temperature: float = 0.1, ) -> T: """Generate JSON from chat messages and parse it with retry on truncation. @@ -167,14 +167,14 @@ async def evaluate_with_schema[T: BaseModel]( messages=messages, audio_wav=audio_wav, user_text=user_text, - temperature=0.3, + temperature=0.0, max_tokens=budget, ) else: messages.append(Message(role="user", content=user_text)) result = await provider.generate( messages=messages, - temperature=0.3, + temperature=0.0, max_tokens=budget, ) diff --git a/app/theory/domain/entities.py b/app/theory/domain/entities.py index 360e470..7772e75 100644 --- a/app/theory/domain/entities.py +++ b/app/theory/domain/entities.py @@ -44,6 +44,7 @@ class TheoryTask: round: Follow-up round number (0 = initial). question_text: Question text shown to the user. question_code: Optional code snippet for the question. + expected_points: Rubric bullets for AI evaluation. answer_text: User answer text, or None when unanswered. score: AI score for the round, or None when not evaluated. feedback: AI-generated feedback text, or None. 
@@ -68,6 +69,7 @@ class TheoryTask: feedback: str | None started_at: datetime | None created_at: datetime + expected_points: tuple[str, ...] = () def timer_deadline(self, limit_seconds: int) -> datetime: """Compute the absolute deadline for this timed task round. @@ -240,6 +242,7 @@ def start( feedback=None, started_at=timer_start if order == 1 else None, created_at=when, + expected_points=question.expected_points, ) ) return cls( @@ -452,6 +455,7 @@ def with_follow_up( feedback=None, started_at=None, created_at=created_at, + expected_points=base.expected_points, ) return replace(self, tasks=self.tasks + (follow_up,)), follow_up diff --git a/app/theory/domain/value_objects.py b/app/theory/domain/value_objects.py index a83361b..c15283c 100644 --- a/app/theory/domain/value_objects.py +++ b/app/theory/domain/value_objects.py @@ -15,8 +15,10 @@ class PlannedTheoryQuestion: id: Unique question identifier from the question bank. text: Localized question text shown to the user. code: Optional code snippet, or None when not applicable. + expected_points: Rubric bullets for AI evaluation. """ id: str text: str code: str | None + expected_points: tuple[str, ...] = () diff --git a/app/theory/repositories/mappers.py b/app/theory/repositories/mappers.py index e899c9f..5146b97 100644 --- a/app/theory/repositories/mappers.py +++ b/app/theory/repositories/mappers.py @@ -20,6 +20,37 @@ from app.theory.schemas.theory import TheorySectionRead, TheoryTaskRead +def _expected_points_to_json(points: tuple[str, ...]) -> str | None: + """Serialize rubric bullets for ORM storage. + + Args: + points: Domain rubric tuple. + + Returns: + JSON array string, or None when empty. + """ + if not points: + return None + return json.dumps(list(points), separators=(",", ":")) + + +def _expected_points_from_json(raw: str | None) -> tuple[str, ...]: + """Deserialize rubric bullets from an ORM column. + + Args: + raw: JSON array string or None for legacy rows. 
+ + Returns: + Tuple of rubric bullet strings. + """ + if raw is None: + return () + data = json.loads(raw) + if not isinstance(data, list): + return () + return tuple(str(point) for point in data) + + def _question_ids_from_tasks(tasks: tuple[DomainTheoryTask, ...]) -> tuple[str, ...]: """Derive ordered question IDs from initial task rounds. @@ -59,6 +90,7 @@ def theory_task_from_orm( round=answer.round, question_text=answer.question_text, question_code=answer.question_code, + expected_points=_expected_points_from_json(answer.expected_points), answer_text=answer.answer_text, score=answer.score, feedback=answer.feedback, @@ -91,6 +123,7 @@ def domain_theory_task_to_orm( round=task.round, question_text=task.question_text, question_code=task.question_code, + expected_points=_expected_points_to_json(task.expected_points), answer_text=task.answer_text, score=task.score, feedback=task.feedback, diff --git a/app/theory/services/evaluator/prompts.py b/app/theory/services/evaluator/prompts.py index 6fe511f..f229e70 100644 --- a/app/theory/services/evaluator/prompts.py +++ b/app/theory/services/evaluator/prompts.py @@ -43,6 +43,20 @@ feedback, strengths, or weaknesses; focus on whether the candidate grasped the concepts.""" ANSWER_EVALUATION_INSTRUCTIONS = """You are a technical interviewer evaluating a candidate's answer. + +Evaluate ONLY the candidate text under "Candidate answer (evaluate this only):". +Do NOT treat question text, code blocks in the question, expected rubric points, +or your own knowledge as something the candidate said. + +If the answer is empty, off-topic, only asks the interviewer what to do, +or does not attempt to explain the topic, score 1 and set follow_up_needed true. + +In feedback, quote or paraphrase only what appears under the candidate answer. +Do not praise code or explanations that are not present in the answer. 
+ +Use the expected rubric points as a checklist for what a strong answer should cover, +but score only what the candidate actually stated. + Assess the answer based on: - 5: Excellent — complete understanding, examples, edge cases considered - 4: Good — solid understanding with minor omissions @@ -105,6 +119,22 @@ }""" +def format_expected_rubric( + expected_points: tuple[str, ...] | list[str] | None, +) -> str: + """Format rubric bullets for evaluator prompts. + + Args: + expected_points: Rubric bullets from the question bank. + + Returns: + Bulleted list text, or ``(none)`` when empty. + """ + if not expected_points: + return "(none)" + return "\n".join(f"- {point}" for point in expected_points) + + def build_evaluator_instructions(locale: str, task_instructions: str) -> str: """Combine locale, substance-focused rules, and task-specific instructions. diff --git a/app/theory/services/evaluator/service.py b/app/theory/services/evaluator/service.py index 34198fe..b3eec0b 100644 --- a/app/theory/services/evaluator/service.py +++ b/app/theory/services/evaluator/service.py @@ -23,6 +23,7 @@ SECTION_EVALUATION_INSTRUCTIONS, SESSION_EVALUATION_INSTRUCTIONS, build_evaluator_instructions, + format_expected_rubric, looks_like_json_schema_fragment, ) @@ -62,6 +63,39 @@ def _format_question(question_text: str, question_code: str | None) -> str: return f"{question_text}\n\nCode:\n{question_code}" return question_text + @staticmethod + def _format_answer_evaluation_user_text( + question_text: str, + question_code: str | None, + answer_text: str | None = None, + expected_points: tuple[str, ...] | None = None, + ) -> str: + """Build labeled user text for initial answer evaluation prompts. + + Args: + question_text: The question text. + question_code: Optional code snippet from the question. + answer_text: Candidate answer text, or None for audio-only input. + expected_points: Rubric bullets from the question bank. 
+ + Returns: + Labeled prompt text separating context from candidate content. + """ + question = TheoryEvaluatorService._format_question(question_text, question_code) + rubric = format_expected_rubric(expected_points) + parts = [ + f"Question (for context only, NOT part of the answer):\n{question}", + ( + "Expected rubric points (checklist only, NOT candidate content):\n" + f"{rubric}" + ), + ] + if answer_text is not None: + parts.append( + f"Candidate answer (evaluate this only):\n{answer_text}", + ) + return "\n\n".join(parts) + @staticmethod async def _evaluate_with_schema( provider: AIProvider, @@ -106,6 +140,7 @@ async def evaluate_answer( question_text: str, answer_text: str, question_code: str | None = None, + expected_points: tuple[str, ...] | None = None, locale: str = DEFAULT_LOCALE, ) -> AnswerEvaluation: """Evaluate a user's initial answer (round=0). @@ -115,6 +150,7 @@ async def evaluate_answer( question_text: The question text. answer_text: The user's answer. question_code: Optional code snippet from the question. + expected_points: Rubric bullets from the question bank. locale: Locale for AI feedback and follow-up questions. Returns: @@ -123,8 +159,12 @@ async def evaluate_answer( Raises: ValueError: If AI response is invalid or connection fails. """ - question = TheoryEvaluatorService._format_question(question_text, question_code) - user_text = f"Question:\n{question}\n\nAnswer:\n{answer_text}" + user_text = TheoryEvaluatorService._format_answer_evaluation_user_text( + question_text=question_text, + question_code=question_code, + answer_text=answer_text, + expected_points=expected_points, + ) return await TheoryEvaluatorService._evaluate_with_schema( provider, locale=locale, @@ -139,6 +179,7 @@ async def evaluate_answer_with_audio( question_text: str, audio_wav: bytes, question_code: str | None = None, + expected_points: tuple[str, ...] 
| None = None, locale: str = DEFAULT_LOCALE, ) -> AnswerEvaluation: """Evaluate a user's initial spoken answer (round=0). @@ -148,6 +189,7 @@ async def evaluate_answer_with_audio( question_text: The question text. audio_wav: The user's spoken answer as WAV bytes. question_code: Optional code snippet from the question. + expected_points: Rubric bullets from the question bank. locale: Locale for AI feedback and follow-up questions. Returns: @@ -156,13 +198,17 @@ async def evaluate_answer_with_audio( Raises: ValueError: If AI response is invalid or connection fails. """ - question = TheoryEvaluatorService._format_question(question_text, question_code) + user_text = TheoryEvaluatorService._format_answer_evaluation_user_text( + question_text=question_text, + question_code=question_code, + expected_points=expected_points, + ) return await TheoryEvaluatorService._evaluate_with_schema( provider, locale=locale, instructions=ANSWER_EVALUATION_INSTRUCTIONS, response_model=AnswerEvaluation, - user_text=f"Question:\n{question}", + user_text=user_text, audio_wav=audio_wav, ) @@ -290,6 +336,7 @@ async def evaluate_submission( question_code: str | None, initial_question_text: str, initial_answer_text: str, + expected_points: tuple[str, ...] | None = None, answer_text: str | None = None, audio_wav: bytes | None = None, ) -> tuple[AnswerEvaluation | FollowUpEvaluation, bool, str | None]: @@ -303,6 +350,7 @@ async def evaluate_submission( question_code: Optional code snippet for the question. initial_question_text: Original question text (round 0). initial_answer_text: User's initial answer text (round 0). + expected_points: Rubric bullets from the question bank. answer_text: User answer text for text-mode evaluation. audio_wav: Spoken answer WAV for multimodal evaluation. 
@@ -323,6 +371,7 @@ async def evaluate_submission( question_text=question_text, audio_wav=audio_wav, question_code=question_code, + expected_points=expected_points, locale=locale, ) else: @@ -331,6 +380,7 @@ async def evaluate_submission( question_text=question_text, answer_text=answer_text or "", question_code=question_code, + expected_points=expected_points, locale=locale, ) elif audio_wav is not None: diff --git a/app/theory/services/planning.py b/app/theory/services/planning.py index fe1d7bc..891a51e 100644 --- a/app/theory/services/planning.py +++ b/app/theory/services/planning.py @@ -41,7 +41,12 @@ def _to_planned(question: Question) -> PlannedQuestion: Returns: Domain value object for theory section creation. """ - return PlannedQuestion(id=question.id, text=question.text, code=question.code) + return PlannedQuestion( + id=question.id, + text=question.text, + code=question.code, + expected_points=question.expected_points, + ) def validate_selection(selection: InterviewSelection) -> None: @@ -142,6 +147,11 @@ def build_theory_question_plan( track_pools = load_track_pools(selection, locale) planned = plan_questions(selection, question_count, track_pools) return tuple( - PlannedTheoryQuestion(id=question.id, text=question.text, code=question.code) + PlannedTheoryQuestion( + id=question.id, + text=question.text, + code=question.code, + expected_points=question.expected_points, + ) for question in planned ) diff --git a/app/theory/services/submission.py b/app/theory/services/submission.py index 92d700c..d6dd402 100644 --- a/app/theory/services/submission.py +++ b/app/theory/services/submission.py @@ -54,6 +54,7 @@ class TheorySubmissionContext: question_code: Optional code snippet for the question. initial_question_text: Original question text (round 0). initial_answer_text: User's initial answer text (round 0). + expected_points: Rubric bullets for AI evaluation. locale: Section locale for AI and speech. 
answer_text: Text persisted on the task row (may be empty for audio). """ @@ -65,6 +66,7 @@ class TheorySubmissionContext: question_code: str | None initial_question_text: str initial_answer_text: str + expected_points: tuple[str, ...] locale: str answer_text: str @@ -96,6 +98,7 @@ async def _evaluate_last_follow_up_in_background( answer_text: str, initial_question_text: str, initial_answer_text: str, + expected_points: tuple[str, ...], provider: AIProvider, locale: str, audio_wav: bytes | None = None, @@ -111,6 +114,7 @@ async def _evaluate_last_follow_up_in_background( answer_text: The user's answer text (transcript when audio was submitted). initial_question_text: Original question text (round 0). initial_answer_text: User's initial answer text (round 0). + expected_points: Rubric bullets for AI evaluation. provider: AI provider for evaluation. locale: Locale for AI feedback. audio_wav: Optional spoken answer WAV for multimodal evaluation. @@ -125,6 +129,7 @@ async def _evaluate_last_follow_up_in_background( question_code=question_code, initial_question_text=initial_question_text, initial_answer_text=initial_answer_text, + expected_points=expected_points, audio_wav=audio_wav, ) else: @@ -136,6 +141,7 @@ async def _evaluate_last_follow_up_in_background( question_code=question_code, initial_question_text=initial_question_text, initial_answer_text=initial_answer_text, + expected_points=expected_points, answer_text=answer_text, ) TheoryEvaluationPersistenceService.persist_evaluation_only( @@ -230,6 +236,7 @@ async def _open_submission( question_code=saved.question_code, initial_question_text=initial_question_text, initial_answer_text=initial_answer_text, + expected_points=saved.expected_points, locale=updated.locale, answer_text=answer_text, ) @@ -306,6 +313,7 @@ def _schedule_last_follow_up_evaluation( answer_text=ctx.answer_text, initial_question_text=ctx.initial_question_text, initial_answer_text=ctx.initial_answer_text, + expected_points=ctx.expected_points, 
provider=provider, locale=ctx.locale, audio_wav=audio_wav, @@ -433,6 +441,7 @@ async def stream_answer_submission( question_code=ctx.question_code, initial_question_text=ctx.initial_question_text, initial_answer_text=ctx.initial_answer_text, + expected_points=ctx.expected_points, answer_text=ctx.answer_text, ) ) @@ -537,6 +546,7 @@ async def stream_audio_answer_submission( question_code=ctx.question_code, initial_question_text=ctx.initial_question_text, initial_answer_text=ctx.initial_answer_text, + expected_points=ctx.expected_points, audio_wav=wav_bytes, ), name=f"audio-eval-{interview_id}-{ctx.question_id}-r{ctx.round_num}", diff --git a/tests/shared/test_questions.py b/tests/shared/test_questions.py index a3503e4..0a6fe16 100644 --- a/tests/shared/test_questions.py +++ b/tests/shared/test_questions.py @@ -196,6 +196,42 @@ def test_list_categories_non_existent_track(self, temp_questions_dir): categories = list_categories("java", "junior") assert categories == [] + def test_load_category_expected_points(self, temp_questions_dir): + """Test loading expected_points rubric bullets.""" + path = temp_questions_dir / "python" / "junior" / "rubric.yaml" + _write_category_yaml( + path, + [ + { + "id": "rub-001", + "type": "knowledge", + "difficulty": 1, + "expected_points": ["Point A", "Point B"], + "question": {"text": "Explain lists.", "code": None}, + } + ], + ) + questions = load_category("python", "junior", "rubric") + assert questions[0].expected_points == ("Point A", "Point B") + + def test_load_category_invalid_expected_points_raises(self, temp_questions_dir): + """Invalid expected_points shape raises ValueError.""" + path = temp_questions_dir / "python" / "junior" / "bad-rubric.yaml" + _write_category_yaml( + path, + [ + { + "id": "bad-rub-001", + "type": "knowledge", + "difficulty": 1, + "expected_points": "not-a-list", + "question": {"text": "Bad rubric.", "code": None}, + } + ], + ) + with pytest.raises(ValueError, match="bad-rub-001"): + 
load_category("python", "junior", "bad-rubric") + def test_load_category_with_code(self, temp_questions_dir): """Test loading a question with a code snippet.""" path = temp_questions_dir / "python" / "junior" / "with-code.yaml" diff --git a/tests/theory/repositories/test_theory_section.py b/tests/theory/repositories/test_theory_section.py index 1623db5..e777fcd 100644 --- a/tests/theory/repositories/test_theory_section.py +++ b/tests/theory/repositories/test_theory_section.py @@ -41,6 +41,7 @@ def _sample_planned() -> tuple[PlannedTheoryQuestion, ...]: id="py-001", text="What is a list?", code=None, + expected_points=("Mutable sequence", "Ordered"), ), PlannedTheoryQuestion( id="py-002", @@ -70,6 +71,7 @@ def test_timer_deadline_and_expiry(self) -> None: feedback=None, started_at=started, created_at=started, + expected_points=(), ) deadline = task.timer_deadline(120) assert deadline == started + timedelta(seconds=120) @@ -107,6 +109,29 @@ def test_start_defers_timer_when_not_first_task(self) -> None: ) assert section.tasks[0].started_at is None + def test_start_preserves_expected_points_on_tasks(self) -> None: + """Planned rubric bullets are copied onto initial task rows.""" + section = DomainTheorySection.start( + "iv-1", + selection=_sample_selection(), + locale="en", + planned_questions=_sample_planned(), + ) + assert section.tasks[0].expected_points == ("Mutable sequence", "Ordered") + assert section.tasks[1].expected_points == () + + def test_with_follow_up_copies_expected_points(self) -> None: + """Follow-up task rows inherit rubric bullets from the base question.""" + section = DomainTheorySection.start( + "iv-1", + selection=_sample_selection(), + locale="en", + planned_questions=_sample_planned(), + ) + updated, follow_up = section.with_follow_up("py-001", "Give an example.") + assert follow_up.expected_points == ("Mutable sequence", "Ordered") + assert updated.tasks[-1].expected_points == ("Mutable sequence", "Ordered") + class 
TestTheorySectionRepository: """Theory section persistence.""" @@ -149,6 +174,34 @@ def test_create_aggregate_persists_tasks(self, isolated_db) -> None: assert len(loaded.tasks) == 2 assert loaded.tasks[0].theory_section_id == loaded.id + def test_create_aggregate_round_trips_expected_points(self, isolated_db) -> None: + """Repository persists rubric bullets on answer rows.""" + with TheoryUnitOfWork() as uow: + uow.session.add( + Interview( + id="iv-rubric", + selection_spec='{"sources":[{"track":"python","level":"junior","categories":["basics"]}]}', + ) + ) + uow.commit() + + section = DomainTheorySection.start( + "iv-rubric", + selection=_sample_selection(), + locale="en", + planned_questions=_sample_planned(), + ) + with TheoryUnitOfWork() as uow: + uow.theory_sections.create_aggregate(section) + uow.commit() + + with TheoryUnitOfWork() as uow: + loaded = uow.theory_sections.get_aggregate("iv-rubric") + + assert loaded is not None + assert loaded.tasks[0].expected_points == ("Mutable sequence", "Ordered") + assert loaded.tasks[1].expected_points == () + @pytest.fixture def alembic_engine(tmp_path: Path, monkeypatch): diff --git a/tests/theory/services/test_evaluator.py b/tests/theory/services/test_evaluator.py index 1288305..455d023 100644 --- a/tests/theory/services/test_evaluator.py +++ b/tests/theory/services/test_evaluator.py @@ -5,10 +5,39 @@ import pytest from app.ai.audio_probe import minimal_wav_bytes +from app.theory.services.evaluator.prompts import format_expected_rubric from app.theory.services.evaluator.service import TheoryEvaluatorService from tests.fakes import FakeProvider, answer_evaluation_json, follow_up_evaluation_json +def test_format_expected_rubric_empty() -> None: + """Empty rubric renders as (none).""" + assert format_expected_rubric(()) == "(none)" + assert format_expected_rubric(None) == "(none)" + + +def test_format_expected_rubric_bullets() -> None: + """Rubric bullets render as a markdown list.""" + text = 
format_expected_rubric(("First point", "Second point")) + assert text == "- First point\n- Second point" + + +def test_format_answer_evaluation_user_text_labels_candidate_content() -> None: + """Evaluation prompt separates context from candidate answer.""" + text = TheoryEvaluatorService._format_answer_evaluation_user_text( + question_text="What is a list?", + question_code="items = []", + answer_text="A mutable sequence.", + expected_points=("Ordered", "Mutable"), + ) + assert "Question (for context only, NOT part of the answer):" in text + assert "Expected rubric points (checklist only, NOT candidate content):" in text + assert "Candidate answer (evaluate this only):" in text + assert "A mutable sequence." in text + assert "- Ordered" in text + assert "items = []" in text + + @pytest.mark.asyncio async def test_evaluate_with_audio_initial_round() -> None: """Round 0 audio evaluation uses the answer evaluation schema."""