Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ Work in progress is accumulated under `[Unreleased]`; on release, that section b

### Changed

- **Theory answer evaluation** — load `expected_points` rubric bullets from question banks, pass them through evaluation prompts with explicit candidate-only scoring rules, use temperature 0 for schema-based LLM evaluation, and lower the default temperature for structured JSON generation from 0.3 to 0.1

### Fixed

### Removed
Expand Down
2 changes: 2 additions & 0 deletions app/interview/domain/value_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,13 @@ class PlannedQuestion:
id: Unique question identifier from the question bank.
text: Localized question text shown to the user.
code: Optional code snippet, or None when not applicable.
expected_points: Rubric bullets for AI evaluation.
"""

id: str
text: str
code: str | None
expected_points: tuple[str, ...] = ()


class InterviewSelectionHolder(Protocol):
Expand Down
2 changes: 2 additions & 0 deletions app/shared/infrastructure/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ class Answer(Base):
round: Follow-up round number (0 = initial, 1+ = follow-ups).
question_text: Snapshot of the question text at time of asking.
question_code: Snapshot of the optional code snippet.
expected_points: JSON array of rubric bullets (None for legacy rows).
answer_text: User's answer text (None if skipped).
score: AI-assigned score (1-5, or 0 on timeout), None if not yet evaluated.
feedback: AI-generated feedback text.
Expand All @@ -143,6 +144,7 @@ class Answer(Base):
round: Mapped[int] = mapped_column(Integer, default=0)
question_text: Mapped[str] = mapped_column(Text)
question_code: Mapped[str | None] = mapped_column(Text, nullable=True)
expected_points: Mapped[str | None] = mapped_column(Text, nullable=True)
answer_text: Mapped[str | None] = mapped_column(Text, nullable=True)
score: Mapped[int | None] = mapped_column(Integer, nullable=True)
feedback: Mapped[str | None] = mapped_column(Text, nullable=True)
Expand Down
7 changes: 7 additions & 0 deletions app/shared/questions.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class Question:
tags: List of topic tags.
text: The question text.
code: Optional code snippet (None if not applicable).
expected_points: Rubric bullets for AI evaluation.
"""

id: str
Expand All @@ -37,6 +38,7 @@ class Question:
tags: list[str]
text: str
code: str | None
expected_points: tuple[str, ...] = ()


def _resolve_localized_string(
Expand Down Expand Up @@ -111,6 +113,10 @@ def load_category(
questions = []
for q in data.get("questions", []):
qid = q["id"]
points_raw = q.get("expected_points", [])
if not isinstance(points_raw, list):
msg = f"Question {qid}: invalid expected_points"
raise ValueError(msg)
questions.append(
Question(
id=qid,
Expand All @@ -124,6 +130,7 @@ def load_category(
question_id=qid,
),
code=q["question"].get("code"),
expected_points=tuple(str(point) for point in points_raw),
)
)
return questions
Expand Down
6 changes: 3 additions & 3 deletions app/shared/structured_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ async def generate_and_parse_json_response[T: BaseModel](
messages: list[Message],
response_model: type[T],
max_tokens: int = 2000,
temperature: float = 0.3,
temperature: float = 0.1,
) -> T:
"""Generate JSON from chat messages and parse it with retry on truncation.

Expand Down Expand Up @@ -167,14 +167,14 @@ async def evaluate_with_schema[T: BaseModel](
messages=messages,
audio_wav=audio_wav,
user_text=user_text,
temperature=0.3,
temperature=0.0,
max_tokens=budget,
)
else:
messages.append(Message(role="user", content=user_text))
result = await provider.generate(
messages=messages,
temperature=0.3,
temperature=0.0,
max_tokens=budget,
)

Expand Down
4 changes: 4 additions & 0 deletions app/theory/domain/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class TheoryTask:
round: Follow-up round number (0 = initial).
question_text: Question text shown to the user.
question_code: Optional code snippet for the question.
expected_points: Rubric bullets for AI evaluation.
answer_text: User answer text, or None when unanswered.
score: AI score for the round, or None when not evaluated.
feedback: AI-generated feedback text, or None.
Expand All @@ -68,6 +69,7 @@ class TheoryTask:
feedback: str | None
started_at: datetime | None
created_at: datetime
expected_points: tuple[str, ...] = ()

def timer_deadline(self, limit_seconds: int) -> datetime:
"""Compute the absolute deadline for this timed task round.
Expand Down Expand Up @@ -240,6 +242,7 @@ def start(
feedback=None,
started_at=timer_start if order == 1 else None,
created_at=when,
expected_points=question.expected_points,
)
)
return cls(
Expand Down Expand Up @@ -452,6 +455,7 @@ def with_follow_up(
feedback=None,
started_at=None,
created_at=created_at,
expected_points=base.expected_points,
)
return replace(self, tasks=self.tasks + (follow_up,)), follow_up

Expand Down
2 changes: 2 additions & 0 deletions app/theory/domain/value_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ class PlannedTheoryQuestion:
id: Unique question identifier from the question bank.
text: Localized question text shown to the user.
code: Optional code snippet, or None when not applicable.
expected_points: Rubric bullets for AI evaluation.
"""

id: str
text: str
code: str | None
expected_points: tuple[str, ...] = ()
33 changes: 33 additions & 0 deletions app/theory/repositories/mappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,37 @@
from app.theory.schemas.theory import TheorySectionRead, TheoryTaskRead


def _expected_points_to_json(points: tuple[str, ...]) -> str | None:
"""Serialize rubric bullets for ORM storage.

Args:
points: Domain rubric tuple.

Returns:
JSON array string, or None when empty.
"""
if not points:
return None
return json.dumps(list(points), separators=(",", ":"))


def _expected_points_from_json(raw: str | None) -> tuple[str, ...]:
"""Deserialize rubric bullets from an ORM column.

Args:
raw: JSON array string or None for legacy rows.

Returns:
Tuple of rubric bullet strings.
"""
if raw is None:
return ()
data = json.loads(raw)
if not isinstance(data, list):
return ()
return tuple(str(point) for point in data)


def _question_ids_from_tasks(tasks: tuple[DomainTheoryTask, ...]) -> tuple[str, ...]:
"""Derive ordered question IDs from initial task rounds.

Expand Down Expand Up @@ -59,6 +90,7 @@ def theory_task_from_orm(
round=answer.round,
question_text=answer.question_text,
question_code=answer.question_code,
expected_points=_expected_points_from_json(answer.expected_points),
answer_text=answer.answer_text,
score=answer.score,
feedback=answer.feedback,
Expand Down Expand Up @@ -91,6 +123,7 @@ def domain_theory_task_to_orm(
round=task.round,
question_text=task.question_text,
question_code=task.question_code,
expected_points=_expected_points_to_json(task.expected_points),
answer_text=task.answer_text,
score=task.score,
feedback=task.feedback,
Expand Down
30 changes: 30 additions & 0 deletions app/theory/services/evaluator/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,20 @@
feedback, strengths, or weaknesses; focus on whether the candidate grasped the concepts."""

ANSWER_EVALUATION_INSTRUCTIONS = """You are a technical interviewer evaluating a candidate's answer.

Evaluate ONLY the candidate text under "Candidate answer (evaluate this only):".
Do NOT treat question text, code blocks in the question, expected rubric points,
or your own knowledge as something the candidate said.

If the answer is empty, off-topic, only asks the interviewer what to do,
or does not attempt to explain the topic, score 1 and set follow_up_needed true.

In feedback, quote or paraphrase only what appears under the candidate answer.
Do not praise code or explanations that are not present in the answer.

Use the expected rubric points as a checklist for what a strong answer should cover,
but score only what the candidate actually stated.

Assess the answer based on:
- 5: Excellent — complete understanding, examples, edge cases considered
- 4: Good — solid understanding with minor omissions
Expand Down Expand Up @@ -105,6 +119,22 @@
}"""


def format_expected_rubric(
expected_points: tuple[str, ...] | list[str] | None,
) -> str:
"""Format rubric bullets for evaluator prompts.

Args:
expected_points: Rubric bullets from the question bank.

Returns:
Bulleted list text, or ``(none)`` when empty.
"""
if not expected_points:
return "(none)"
return "\n".join(f"- {point}" for point in expected_points)


def build_evaluator_instructions(locale: str, task_instructions: str) -> str:
"""Combine locale, substance-focused rules, and task-specific instructions.

Expand Down
58 changes: 54 additions & 4 deletions app/theory/services/evaluator/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
SECTION_EVALUATION_INSTRUCTIONS,
SESSION_EVALUATION_INSTRUCTIONS,
build_evaluator_instructions,
format_expected_rubric,
looks_like_json_schema_fragment,
)

Expand Down Expand Up @@ -62,6 +63,39 @@ def _format_question(question_text: str, question_code: str | None) -> str:
return f"{question_text}\n\nCode:\n{question_code}"
return question_text

@staticmethod
def _format_answer_evaluation_user_text(
    question_text: str,
    question_code: str | None,
    answer_text: str | None = None,
    expected_points: tuple[str, ...] | None = None,
) -> str:
    """Compose the labeled user message for an initial-answer evaluation.

    The message is built from labeled sections separated by blank lines:
    the question, the expected rubric points, and (only when text is
    supplied) the candidate answer.

    Args:
        question_text: The question text.
        question_code: Optional code snippet belonging to the question.
        answer_text: Candidate answer text; None omits the answer section.
        expected_points: Rubric bullets from the question bank.

    Returns:
        The sections joined by a blank line, in the order listed above.
    """
    question_label = "Question (for context only, NOT part of the answer):"
    rubric_label = "Expected rubric points (checklist only, NOT candidate content):"
    answer_label = "Candidate answer (evaluate this only):"

    question_body = TheoryEvaluatorService._format_question(question_text, question_code)
    rubric_body = format_expected_rubric(expected_points)

    sections = [
        f"{question_label}\n{question_body}",
        f"{rubric_label}\n{rubric_body}",
    ]
    # NOTE(review): when answer_text is None no "Candidate answer" section is
    # emitted, yet the evaluation instructions refer to that label — confirm
    # the audio path tells the model where the candidate's answer is.
    if answer_text is not None:
        sections.append(f"{answer_label}\n{answer_text}")
    return "\n\n".join(sections)

@staticmethod
async def _evaluate_with_schema(
provider: AIProvider,
Expand Down Expand Up @@ -106,6 +140,7 @@ async def evaluate_answer(
question_text: str,
answer_text: str,
question_code: str | None = None,
expected_points: tuple[str, ...] | None = None,
locale: str = DEFAULT_LOCALE,
) -> AnswerEvaluation:
"""Evaluate a user's initial answer (round=0).
Expand All @@ -115,6 +150,7 @@ async def evaluate_answer(
question_text: The question text.
answer_text: The user's answer.
question_code: Optional code snippet from the question.
expected_points: Rubric bullets from the question bank.
locale: Locale for AI feedback and follow-up questions.

Returns:
Expand All @@ -123,8 +159,12 @@ async def evaluate_answer(
Raises:
ValueError: If AI response is invalid or connection fails.
"""
question = TheoryEvaluatorService._format_question(question_text, question_code)
user_text = f"Question:\n{question}\n\nAnswer:\n{answer_text}"
user_text = TheoryEvaluatorService._format_answer_evaluation_user_text(
question_text=question_text,
question_code=question_code,
answer_text=answer_text,
expected_points=expected_points,
)
return await TheoryEvaluatorService._evaluate_with_schema(
provider,
locale=locale,
Expand All @@ -139,6 +179,7 @@ async def evaluate_answer_with_audio(
question_text: str,
audio_wav: bytes,
question_code: str | None = None,
expected_points: tuple[str, ...] | None = None,
locale: str = DEFAULT_LOCALE,
) -> AnswerEvaluation:
"""Evaluate a user's initial spoken answer (round=0).
Expand All @@ -148,6 +189,7 @@ async def evaluate_answer_with_audio(
question_text: The question text.
audio_wav: The user's spoken answer as WAV bytes.
question_code: Optional code snippet from the question.
expected_points: Rubric bullets from the question bank.
locale: Locale for AI feedback and follow-up questions.

Returns:
Expand All @@ -156,13 +198,17 @@ async def evaluate_answer_with_audio(
Raises:
ValueError: If AI response is invalid or connection fails.
"""
question = TheoryEvaluatorService._format_question(question_text, question_code)
user_text = TheoryEvaluatorService._format_answer_evaluation_user_text(
question_text=question_text,
question_code=question_code,
expected_points=expected_points,
)
return await TheoryEvaluatorService._evaluate_with_schema(
provider,
locale=locale,
instructions=ANSWER_EVALUATION_INSTRUCTIONS,
response_model=AnswerEvaluation,
user_text=f"Question:\n{question}",
user_text=user_text,
audio_wav=audio_wav,
)

Expand Down Expand Up @@ -290,6 +336,7 @@ async def evaluate_submission(
question_code: str | None,
initial_question_text: str,
initial_answer_text: str,
expected_points: tuple[str, ...] | None = None,
answer_text: str | None = None,
audio_wav: bytes | None = None,
) -> tuple[AnswerEvaluation | FollowUpEvaluation, bool, str | None]:
Expand All @@ -303,6 +350,7 @@ async def evaluate_submission(
question_code: Optional code snippet for the question.
initial_question_text: Original question text (round 0).
initial_answer_text: User's initial answer text (round 0).
expected_points: Rubric bullets from the question bank.
answer_text: User answer text for text-mode evaluation.
audio_wav: Spoken answer WAV for multimodal evaluation.

Expand All @@ -323,6 +371,7 @@ async def evaluate_submission(
question_text=question_text,
audio_wav=audio_wav,
question_code=question_code,
expected_points=expected_points,
locale=locale,
)
else:
Expand All @@ -331,6 +380,7 @@ async def evaluate_submission(
question_text=question_text,
answer_text=answer_text or "",
question_code=question_code,
expected_points=expected_points,
locale=locale,
)
elif audio_wav is not None:
Expand Down
Loading
Loading