diff --git a/CHANGELOG.md b/CHANGELOG.md index ce0d635b0..df928213f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,25 @@ ## 0.13.0 (unreleased) +### Shared Metric Contract + +- Added public `MetricInput -> MetricResult` scorer/metric runtime types and `ScorerFunctionMetric`. +- Extended BYOB `@scorer` with typed scorer metadata and `to_metric()` while preserving current dict scorer behavior. +- Added optional `config_schema` support for typed scorer configs while keeping raw dict configs as the default. +- Split typed scorer config binding into strict `bind(config=ConfigModel(...))` and coercive `bind_raw_config(config={...})` paths. + +### Multiple-Choice Loglikelihood + Few-Shot (lm-evaluation-harness parity) + +Demonstrates non-trivial benchmark machinery composing with the shared metric contract **without protocol-type changes**. `MetricInput`, `MetricResult`, `MetricDescriptor`, `MetricOutputSpec` shapes untouched. + +- **`@scorer`-typed `multiple_choice_acc`** in `nemo_evaluator.scoring.multiple_choice`: returns `acc`, `acc_norm`, `acc_greedy`. Reads candidate continuations + per-choice loglikelihoods from `MetricInput.candidate.metadata`. +- **`@scorer`-typed `mcq_letter_extract`**: free-form letter extraction (A-J), returns `correct` (continuous) and `parsed` (boolean). +- **`LogprobRankingSolver`** in `nemo_evaluator.solvers.logprob`: ranks candidate continuations via `/completions` (`max_tokens=0, echo=true, logprobs=1`), parses continuation spans via `text_offset`. Per-choice calls run concurrently behind `max_concurrent_choices`. +- **`@benchmark` extensions**: `choices`, `choices_field` (with dotted-path resolution), `num_fewshot`, `fewshot_split`, `fewshot_template`, `fewshot_separator`, `fewshot_seed`. Few-shot prefix is rendered in `ByobEnvironment.seed()`. +- **`_load_hf` dataset URI parsing**: path-segment configs (`hf://ns/name/config[/split]`) and row filters (`?filter_field=...&filter_value=...`). Required for namespaced multilingual datasets like `CohereForAI/Global-MMLU-Lite/en`. +- **Eval loop forwards solver `scoring_details` to `env.verify` kwargs**: lets a solver push per-row payloads to the scorer. `_metric_input_from_verify` lifts `_mc_*`/`_solver_*` namespaced keys onto `MetricInput.candidate.metadata` rather than `row.data`. +- **`ScorerFunctionMetric.compute_scores` merges `candidate.metadata` into legacy `ScorerInput.metadata`** so legacy `(ScorerInput) -> dict` scorers see solver-emitted payloads. + ### Adapter Proxy (Breaking — replaces LiteLLM) - **LiteLLM removed**: The `litellm` dependency, `proxy` and `proxy-full` extras, and `litellm_settings` config field are all removed. The adapter proxy is now built-in with zero external proxy dependencies. diff --git a/scripts/smoketest_logprob_solver.py b/scripts/smoketest_logprob_solver.py new file mode 100644 index 000000000..8a323c98b --- /dev/null +++ b/scripts/smoketest_logprob_solver.py @@ -0,0 +1,169 @@ +"""End-to-end smoke test for LogprobRankingSolver against a fake /v1/completions. + +Spins up an aiohttp server that returns OpenAI-shape responses with +deterministic logprobs derived from the continuation. This validates: + +1. The HTTP wire format the solver emits (max_tokens=0, echo=true, logprobs=1). +2. The text_offset-based continuation parsing. +3. Concurrent per-choice ranking + argmax selection. +4. End-to-end seed → solve → verify through ByobEnvironment with the + typed multiple_choice_acc scorer. + +Run: + python scripts/smoketest_logprob_solver.py +""" + +from __future__ import annotations + +import asyncio +import hashlib +import math +from typing import Any + +from aiohttp import web + +from nemo_evaluator.environments.custom import BenchmarkDefinition, ByobEnvironment +from nemo_evaluator.scoring.multiple_choice import multiple_choice_acc +from nemo_evaluator.solvers.logprob import LogprobRankingSolver + + +def _deterministic_logprob(continuation: str, *, target: str) -> float: + """Deterministic per-continuation logprob: gold gets the highest score. + + The "model" prefers the gold continuation (mocking a perfect oracle). + Other continuations get logprobs derived from a stable hash so the + ranking is reproducible across runs. + """ + if continuation == target: + return -0.5 + h = int(hashlib.sha256(continuation.encode()).hexdigest()[:8], 16) + return -2.0 - (h % 100) / 25.0 + + +async def fake_completions_handler(request: web.Request, *, gold_per_prompt: dict[str, str]) -> web.Response: + body = await request.json() + prompt = body["prompt"] + # The benchmark prompt is a stable substring of `prompt`; find which + # gold continuation belongs to it. + gold = next((g for stem, g in gold_per_prompt.items() if stem in prompt), "") + + # Identify the continuation: it's whatever came AFTER the longest + # benchmark stem we recognise. For the smoke test we just take the + # last word as the continuation token sequence. + matched_stem = next((stem for stem in gold_per_prompt if stem in prompt), "") + continuation = prompt[len(matched_stem):] if matched_stem else prompt[-10:] + + logprob = _deterministic_logprob(continuation, target=gold) + + # Synthesize a token-level response with text_offset that puts the + # continuation just after `matched_stem`. + ctx_end = len(matched_stem) + tokens = ["", continuation] + token_logprobs = [None, logprob] + text_offset = [0, ctx_end] + top_logprobs = [ + {"": -0.01}, + {continuation: logprob, "_other": logprob - 1.0}, + ] + + return web.json_response( + { + "choices": [ + { + "text": "", + "finish_reason": "length", + "logprobs": { + "tokens": tokens, + "token_logprobs": token_logprobs, + "text_offset": text_offset, + "top_logprobs": top_logprobs, + }, + } + ], + "model": body["model"], + "usage": {"prompt_tokens": len(prompt), "completion_tokens": 0, "total_tokens": len(prompt)}, + } + ) + + +def make_app(gold_per_prompt: dict[str, str]) -> web.Application: + async def handler(request: web.Request) -> web.Response: + return await fake_completions_handler(request, gold_per_prompt=gold_per_prompt) + + app = web.Application() + app.router.add_post("/v1/completions", handler) + return app + + +async def run_smoketest() -> None: + benchmark_rows = [ + # prompt stem, choices, gold idx, gold text + {"q": "Capital of France?", "answer": 2}, + {"q": "Capital of UK?", "answer": 3}, + {"q": "Capital of Germany?", "answer": 0}, + ] + choices = ["Berlin", "Madrid", "Paris", "London"] + gold_per_prompt = {f"Q: {row['q']}\nA: ": choices[row["answer"]] for row in benchmark_rows} + + # Start fake server + app = make_app(gold_per_prompt) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, "127.0.0.1", 11999) + await site.start() + + try: + defn = BenchmarkDefinition( + name="capitals_smoke", + dataset=lambda: benchmark_rows, + prompt="Q: {q}\nA: ", + target_field="answer", + choices=choices, + scorer_fn=multiple_choice_acc, + ) + env = ByobEnvironment(defn) + solver = LogprobRankingSolver( + base_url="http://127.0.0.1:11999/v1", + model="fake-mc-oracle", + ) + + print(f"running {len(benchmark_rows)} rows × {len(choices)} choices each\n") + per_row_results = [] + for idx in range(len(benchmark_rows)): + seed = await env.seed(idx) + solve = await solver.solve(seed) + assert solve.error is None, f"solver error: {solve.error}" + + merged_meta = {**seed.metadata, **solve.scoring_details} + vr = await env.verify(solve.response, seed.expected_answer, **merged_meta) + outputs = vr.scoring_details.get("outputs", {}) + per_row_results.append((benchmark_rows[idx]["q"], solve, vr, outputs)) + + # Summary + print(f"{'question':<22} {'argmax':<10} {'gold':<10} {'acc':<6} {'logprobs'}") + print("─" * 96) + all_correct = True + for (q, solve, vr, outputs), row in zip(per_row_results, benchmark_rows): + argmax = solve.response + gold = choices[row["answer"]] + lps = solve.scoring_details["_mc_choices_logprobs"] + lps_str = " ".join(f"{c}={lp:.2f}" for c, lp in zip(choices, lps)) + acc = outputs.get("acc", 0.0) + if acc != 1.0: + all_correct = False + print(f"{q:<22} {argmax:<10} {gold:<10} {acc:<6} {lps_str}") + + print() + print(f"all correct: {all_correct}") + for q, solve, vr, outputs in per_row_results: + assert outputs.get("acc") == 1.0, f"acc != 1.0 for {q!r}: {outputs}" + assert math.isfinite(solve.scoring_details["_mc_choices_logprobs"][0]) + print("OK: end-to-end seed → solve → verify works through real HTTP") + + await solver.close() + finally: + await runner.cleanup() + + +if __name__ == "__main__": + asyncio.run(run_smoketest()) diff --git a/src/nemo_evaluator/__init__.py b/src/nemo_evaluator/__init__.py index ffba7adcb..21414cd24 100644 --- a/src/nemo_evaluator/__init__.py +++ b/src/nemo_evaluator/__init__.py @@ -16,22 +16,30 @@ __version__ = "0.12.0" +from nemo_evaluator.engine.eval_loop import run_evaluation +from nemo_evaluator.engine.model_client import ModelClient from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult from nemo_evaluator.environments.custom import benchmark, scorer from nemo_evaluator.environments.registry import get_environment, list_environments, load_benchmark_file, register -from nemo_evaluator.engine.eval_loop import run_evaluation -from nemo_evaluator.engine.model_client import ModelClient -from nemo_evaluator.solvers import ( - ChatSolver, - CompletionSolver, - NatSolver, - OpenClawSolver, - Solver, - SolveResult, - VLMSolver, -) from nemo_evaluator.scoring import ( + BooleanValue, + CandidateOutput, + ContinuousScore, + DatasetRow, + DiscreteScore, + Label, + Metric, + MetricDescriptor, + MetricInput, + MetricOutput, + MetricOutputSpec, + MetricResult, + MetricScorerFunction, + ScorerCallable, + ScorerConfig, + ScorerFunctionMetric, ScorerInput, + ScorerReturn, answer_line, code_sandbox, code_sandbox_async, @@ -40,6 +48,18 @@ multichoice_regex, needs_judge, numeric_match, + score_names_from_output_spec, +) +from nemo_evaluator.scoring.multiple_choice import mcq_letter_extract, multiple_choice_acc +from nemo_evaluator.solvers import ( + ChatSolver, + CompletionSolver, + LogprobRankingSolver, + NatSolver, + OpenClawSolver, + Solver, + SolveResult, + VLMSolver, ) __all__ = [ @@ -57,6 +77,7 @@ "Solver", "ChatSolver", "CompletionSolver", + "LogprobRankingSolver", "NatSolver", "OpenClawSolver", "VLMSolver", @@ -65,6 +86,24 @@ "benchmark", "scorer", "ScorerInput", + "Metric", + "BooleanValue", + "DatasetRow", + "CandidateOutput", + "ContinuousScore", + "DiscreteScore", + "Label", + "MetricInput", + "MetricOutput", + "MetricOutputSpec", + "MetricDescriptor", + "MetricResult", + "MetricScorerFunction", + "ScorerCallable", + "ScorerConfig", + "ScorerFunctionMetric", + "ScorerReturn", + "score_names_from_output_spec", # Scoring primitives "exact_match", "multichoice_regex", @@ -73,5 +112,7 @@ "numeric_match", "code_sandbox", "code_sandbox_async", + "mcq_letter_extract", + "multiple_choice_acc", "needs_judge", ] diff --git a/src/nemo_evaluator/engine/eval_loop.py b/src/nemo_evaluator/engine/eval_loop.py index 989238ba3..73fb21eaf 100644 --- a/src/nemo_evaluator/engine/eval_loop.py +++ b/src/nemo_evaluator/engine/eval_loop.py @@ -435,11 +435,18 @@ async def _run_step(idx: int, slot: int, rep: int, seed_result, seed_ms: float): logger.debug("p%d r%d: using pre-computed reward=%.4f", idx, rep, vr.reward) elif not _solve_failed: verify_sandbox = await lifecycle.get_verify_sandbox() + # Forward solver-emitted payload (e.g. logprobs, ranking + # metadata) into verify alongside seed metadata. Keys + # collide rarely in practice; solver payload wins on + # collision because it is the more recent, more + # specific source. + solver_meta = solve_result.scoring_details if solve_result else {} + merged_meta = {**seed_result.metadata, **solver_meta} vr = await env.verify( response_text, seed_result.expected_answer, sandbox=verify_sandbox, - **seed_result.metadata, + **merged_meta, ) break # success — exit retry loop diff --git a/src/nemo_evaluator/environments/custom.py b/src/nemo_evaluator/environments/custom.py index 5d0e8cdd2..8cf60d203 100644 --- a/src/nemo_evaluator/environments/custom.py +++ b/src/nemo_evaluator/environments/custom.py @@ -41,16 +41,34 @@ def score(sample: ScorerInput) -> dict: import json import logging import random +from collections.abc import Mapping from dataclasses import dataclass, field from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable +from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast, overload + +from pydantic import BaseModel if TYPE_CHECKING: from nemo_evaluator.sandbox.base import Sandbox from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult -from nemo_evaluator.sandbox.base import ImageBuildRequest, SandboxSpec from nemo_evaluator.environments.registry import register +from nemo_evaluator.sandbox.base import ImageBuildRequest, SandboxSpec +from nemo_evaluator.scoring.metric import ( + CandidateOutput, + DatasetRow, + Metric, + MetricDescriptor, + MetricInput, + MetricOutputSpec, + MetricResult, + MetricScorerFunction, + ScorerCallable, + ScorerConfig, + ScorerFunctionMetric, + ScorerReturn, + score_names_from_output_spec, +) from nemo_evaluator.scoring.types import ScorerInput logger = logging.getLogger(__name__) @@ -59,10 +77,14 @@ def score(sample: ScorerInput) -> dict: # ── Data types ──────────────────────────────────────────────────────────── +ConfigT = TypeVar("ConfigT", bound=Mapping[str, object] | BaseModel) +ConfigModelT = TypeVar("ConfigModelT", bound=BaseModel) + + @dataclass class BenchmarkDefinition: name: str - dataset: str | Callable[[], list[dict]] + dataset: str | Callable[..., list[dict[str, Any]]] prompt: str target_field: str = "target" endpoint_type: str = "chat" @@ -70,10 +92,22 @@ class BenchmarkDefinition: field_mapping: dict[str, str] | None = None extra: dict[str, Any] = field(default_factory=dict) requirements: list[str] | None = None - scorer_fn: Callable[[ScorerInput], dict] | None = None - prepare_row: Callable[[dict, int, random.Random], dict] | None = None - seed_fn: Callable[[dict, int], SeedResult] | None = None - image_builder_fn: Callable[[list[dict]], ImageBuildRequest] | None = None + scorer_fn: Callable[..., ScorerReturn] | None = None + prepare_row: Callable[[dict[str, Any], int, random.Random], dict[str, Any]] | None = None + seed_fn: Callable[[dict[str, Any], int], SeedResult] | None = None + image_builder_fn: Callable[[list[dict[str, Any]]], ImageBuildRequest] | None = None + # Multiple-choice loglikelihood support (lm-eval-harness parity). + # Either ``choices`` (static, e.g. ["A","B","C","D"]) or ``choices_field`` + # (per-row dataset field, supports dotted paths like ``choices.text``) + # must be set when paired with LogprobRankingSolver. + choices: list[str] | None = None + choices_field: str | None = None + # Few-shot prompting (mirrors lm-eval-harness ``--num_fewshot``). + num_fewshot: int = 0 + fewshot_split: str | None = None + fewshot_template: str | None = None + fewshot_separator: str = "\n\n" + fewshot_seed: int = 42 _BYOB_REGISTRY: dict[str, BenchmarkDefinition] = {} @@ -82,8 +116,11 @@ class BenchmarkDefinition: # ── Dataset loading ─────────────────────────────────────────────────────── -def _load_dataset_from_spec(spec: str | Callable, num_examples: int | None = None) -> list[dict[str, Any]]: - if callable(spec): +def _load_dataset_from_spec( + spec: str | Callable[..., list[dict[str, Any]]], + num_examples: int | None = None, +) -> list[dict[str, Any]]: + if not isinstance(spec, str): import inspect sig = inspect.signature(spec) @@ -124,10 +161,26 @@ def _load_csv(path: Path, delimiter: str = ",") -> list[dict[str, Any]]: def _load_hf(spec: str, num_examples: int | None = None) -> list[dict[str, Any]]: + """Load a HuggingFace dataset. + + Accepts both query-style and path-segment URIs:: + + google/boolq?split=validation # query split + CohereForAI/Global-MMLU-Lite/en?split=test # path-segment config + CohereForAI/Global-MMLU-Lite/en/test # path-segment config + split + boolq?split=validation&config=cfg # explicit config kwarg + + The first two path segments are always treated as ``namespace/name`` so + namespaced datasets work. A third segment is treated as the config when + no ``?config=`` is given; a fourth is treated as the split when no + ``?split=`` is given. Filter kwargs (``filter_field``, + ``filter_value``, optionally suffixed with ``_1``/``_2``/...) drop rows + that don't match. + """ from datasets import load_dataset - parts = spec.split("?") - dataset_name = parts[0] + parts = spec.split("?", 1) + body = parts[0] params: dict[str, str] = {} if len(parts) > 1: for kv in parts[1].split("&"): @@ -135,8 +188,26 @@ def _load_hf(spec: str, num_examples: int | None = None) -> list[dict[str, Any]] k, v = kv.split("=", 1) params[k] = v - split = params.get("split", "test") - config = params.get("config") + segments = [s for s in body.split("/") if s] + if not segments: + raise ValueError(f"Invalid HuggingFace spec (empty): {spec!r}") + + config_from_path: str | None = None + split_from_path: str | None = None + if len(segments) == 1: + dataset_name = segments[0] + elif len(segments) == 2: + dataset_name = "/".join(segments[:2]) + elif len(segments) == 3: + dataset_name = "/".join(segments[:2]) + config_from_path = segments[2] + else: + dataset_name = "/".join(segments[:2]) + config_from_path = segments[2] + split_from_path = segments[3] + + split = params.get("split", split_from_path or "test") + config = params.get("config", config_from_path) if num_examples and "[" not in split: split = f"{split}[:{num_examples}]" @@ -145,10 +216,33 @@ def _load_hf(spec: str, num_examples: int | None = None) -> list[dict[str, Any]] if config: args.append(config) ds = load_dataset(*args, split=split) - return [dict(row) for row in ds] + rows = [dict(row) for row in ds] + + filters = _extract_filters(params) + if filters: + rows = [r for r in rows if all(str(r.get(field)) == value for field, value in filters)] + + return rows -def _format_prompt(template: str, row: dict, field_mapping: dict | None = None) -> str: +def _extract_filters(params: dict[str, str]) -> list[tuple[str, str]]: + """Pull ``filter_field=...&filter_value=...`` pairs (with numeric suffixes).""" + filters: list[tuple[str, str]] = [] + base_field = params.get("filter_field") + base_value = params.get("filter_value") + if base_field and base_value is not None: + filters.append((base_field, base_value)) + for k, v in params.items(): + if not k.startswith("filter_field_"): + continue + suffix = k.removeprefix("filter_field_") + value = params.get(f"filter_value_{suffix}") + if value is not None: + filters.append((v, value)) + return filters + + +def _format_prompt(template: str, row: dict[str, Any], field_mapping: dict[str, str] | None = None) -> str: data = dict(row) if field_mapping: for src, dst in field_mapping.items(): @@ -210,12 +304,21 @@ async def seed(self, idx: int) -> SeedResult: return self._defn.seed_fn(row, idx) prompt = _format_prompt(self._defn.prompt, row, self._defn.field_mapping) + if self._defn.num_fewshot > 0: + prompt = self._fewshot_prefix() + prompt target = str(row.get(self._defn.target_field, "")) meta: dict[str, Any] = {"source": "byob", "benchmark": self._defn.name} for k, v in row.items(): meta[k] = v + # Multiple-choice loglikelihood: surface candidate continuations on + # ``metadata["_mc_choices"]`` so a LogprobRankingSolver (or any + # solver that recognises the convention) can rank them. + choices = _resolve_mc_choices(row, self._defn) + if choices is not None: + meta["_mc_choices"] = choices + messages = [{"role": "user", "content": prompt}] if self._defn.system_prompt: messages.insert(0, {"role": "system", "content": self._defn.system_prompt}) @@ -224,6 +327,32 @@ async def seed(self, idx: int) -> SeedResult: prompt=prompt, expected_answer=target, messages=messages, system=self._defn.system_prompt, metadata=meta ) + def _fewshot_prefix(self) -> str: + defn = self._defn + examples = self._fewshot_examples() + if not examples: + return "" + rendered: list[str] = [] + for ex in examples: + text = _render_fewshot_example(defn, ex) + if text is not None: + rendered.append(text) + if not rendered: + return "" + return defn.fewshot_separator.join(rendered) + defn.fewshot_separator + + def _fewshot_examples(self) -> list[dict[str, Any]]: + defn = self._defn + if defn.num_fewshot <= 0: + return [] + pool = self._dataset[: max(defn.num_fewshot * 4, defn.num_fewshot)] + if not pool: + return [] + rng = random.Random(defn.fewshot_seed) + if len(pool) <= defn.num_fewshot: + return list(pool) + return rng.sample(pool, defn.num_fewshot) + async def verify(self, response: str, expected: str, sandbox: Sandbox | None = None, **meta: Any) -> VerifyResult: if self._defn.scorer_fn is None: correct = response.strip().lower() == expected.strip().lower() @@ -235,27 +364,159 @@ async def verify(self, response: str, expected: str, sandbox: Sandbox | None = N import asyncio + to_metric = getattr(self._defn.scorer_fn, "to_metric", None) + if callable(to_metric): + metric = cast(ScorerFunctionMetric[ScorerConfig], to_metric()).bind_raw_config( + config=self._defn.extra, + sandbox=sandbox, + target=expected, + ) + metric_input = _metric_input_from_verify( + response=response, + metadata=meta, + ) + result = await metric.compute_scores(metric_input) + return _metric_result_to_verify_result( + metric=metric, + result=result, + benchmark_name=self._defn.name, + response=response, + ) + sample = ScorerInput( response=response, target=expected, metadata=meta, config=self._defn.extra, sandbox=sandbox ) - scores = self._defn.scorer_fn(sample) - if asyncio.iscoroutine(scores): - scores = await scores - - reward = float(scores.get("correct", scores.get("reward", next(iter(scores.values()), 0)))) + scores_result = self._defn.scorer_fn(sample) + if asyncio.iscoroutine(scores_result): + scores_result = await scores_result + scores = cast(Mapping[str, object], scores_result) + + reward_value = scores.get("correct", scores.get("reward", next(iter(scores.values()), 0))) + reward = float(reward_value) if isinstance(reward_value, bool | int | float) else 0.0 + extracted = scores.get("extracted") return VerifyResult( reward=reward, - extracted_answer=scores.get("extracted", response.strip()[:200]), - scoring_details={"method": f"byob_{self._defn.name}", **scores}, + extracted_answer=extracted if isinstance(extracted, str) else response.strip()[:200], + scoring_details={"method": f"byob_{self._defn.name}", **dict(scores)}, ) +def _metric_input_from_verify( + *, + response: str, + metadata: dict[str, Any], +) -> MetricInput: + """Build a MetricInput from BYOB verify kwargs. + + Keys prefixed ``_mc_`` (and any other agreed-upon solver-side payload + namespace, e.g. ``_solver_*``) are lifted out of the dataset row and + placed on ``candidate.metadata`` — the slot the contract designates for + inference-time information about the model's output. Dataset-row keys + flow through ``row.data`` unchanged. + """ + candidate_meta: dict[str, object] = {} + row_data: dict[str, object] = {} + for k, v in metadata.items(): + if isinstance(k, str) and (k.startswith("_mc_") or k.startswith("_solver_")): + candidate_meta[k] = v + else: + row_data[k] = v + return MetricInput( + row=DatasetRow(data=row_data), + candidate=CandidateOutput(output_text=response, metadata=candidate_meta), + ) + + +def _resolve_mc_choices(row: dict[str, Any], defn: BenchmarkDefinition) -> list[str] | None: + """Resolve per-row candidate continuations from a benchmark definition. + + Per-row ``choices_field`` (with optional dotted path support such as + ``choices.text``) takes precedence over the static ``choices`` list. + Returns ``None`` when neither source resolves to a non-empty list. + """ + if defn.choices_field: + raw: object = row + for part in defn.choices_field.split("."): + if isinstance(raw, dict) and part in raw: + raw = raw[part] + else: + raw = None + break + if raw is None: + raw = row.get(defn.choices_field) + if isinstance(raw, dict): + raw = raw.get("text") + if isinstance(raw, list) and raw: + return [str(c) for c in raw] + if defn.choices: + return list(defn.choices) + return None + + +def _render_fewshot_example(defn: BenchmarkDefinition, row: dict[str, Any]) -> str | None: + """Render one few-shot example. Returns None on missing-field errors.""" + try: + if defn.fewshot_template: + return _format_prompt(defn.fewshot_template, row, defn.field_mapping) + prompt_part = _format_prompt(defn.prompt, row, defn.field_mapping) + target_part = row.get(defn.target_field, "") + return f"{prompt_part} {target_part}".rstrip() + except KeyError: + return None + + +def _metric_result_to_verify_result( + *, + metric: Metric, + result: MetricResult, + benchmark_name: str, + response: str, +) -> VerifyResult: + outputs = {output.name: output.value for output in result.outputs} + score_names = score_names_from_output_spec(metric.output_spec()) + scores = {name: _score_value(outputs[name]) for name in score_names if name in outputs} + reward_name = _select_reward_score_name(scores=scores, declared=score_names) + extracted = outputs.get("extracted") + + scoring_details: dict[str, Any] = { + "method": f"byob_{benchmark_name}", + "metric_type": metric.type, + "outputs": outputs, + } + for name, value in outputs.items(): + scoring_details.setdefault(name, value) + + return VerifyResult( + reward=scores[reward_name] if reward_name is not None else 0.0, + extracted_answer=extracted if isinstance(extracted, str) else response.strip()[:200], + scoring_details=scoring_details, + ) + + +def _select_reward_score_name(*, scores: dict[str, float], declared: list[str]) -> str | None: + for preferred in ("reward", "correct"): + if preferred in scores: + return preferred + for name in declared: + if name in scores: + return name + return next(iter(scores), None) + + +def _score_value(value: object) -> float: + if isinstance(value, bool): + return 1.0 if value else 0.0 + if isinstance(value, int | float): + return float(value) + raise TypeError(f"Metric score output must be bool, int, or float, got {type(value).__name__}") + + # ── Decorators ──────────────────────────────────────────────────────────── def benchmark( name: str, - dataset: str | Callable, + dataset: str | Callable[..., list[dict[str, Any]]], prompt: str = "", target_field: str = "target", endpoint_type: str = "chat", @@ -263,9 +524,16 @@ def benchmark( field_mapping: dict[str, str] | None = None, extra: dict[str, Any] | None = None, requirements: list[str] | None = None, - prepare_row: Callable | None = None, - seed_fn: Callable | None = None, - **kwargs, + prepare_row: Callable[[dict[str, Any], int, random.Random], dict[str, Any]] | None = None, + seed_fn: Callable[[dict[str, Any], int], SeedResult] | None = None, + choices: list[str] | None = None, + choices_field: str | None = None, + num_fewshot: int = 0, + fewshot_split: str | None = None, + fewshot_template: str | None = None, + fewshot_separator: str = "\n\n", + fewshot_seed: int = 42, + **kwargs: Any, ): """Register a benchmark. Decorate a scorer function.""" defn = BenchmarkDefinition( @@ -280,6 +548,13 @@ def benchmark( requirements=requirements, prepare_row=prepare_row, seed_fn=seed_fn, + choices=choices, + choices_field=choices_field, + num_fewshot=num_fewshot, + fewshot_split=fewshot_split, + fewshot_template=fewshot_template, + fewshot_separator=fewshot_separator, + fewshot_seed=fewshot_seed, ) def decorator(fn): @@ -300,13 +575,133 @@ def __init__(self, num_examples: int | None = None): return decorator -def scorer(fn: Callable[[ScorerInput], dict]) -> Callable[[ScorerInput], dict]: - """Marks a function as a scorer.""" - fn._is_scorer = True # type: ignore[attr-defined] - return fn +@overload +def scorer( + fn: None = None, + *, + metric_type: str, + outputs: list[MetricOutputSpec], + config_schema: type[ConfigModelT], +) -> Callable[[ScorerCallable[ConfigModelT]], MetricScorerFunction[ConfigModelT]]: ... + + +@overload +def scorer( + fn: ScorerCallable[ConfigModelT], + *, + metric_type: str, + outputs: list[MetricOutputSpec], + config_schema: type[ConfigModelT], +) -> MetricScorerFunction[ConfigModelT]: ... + + +@overload +def scorer( + fn: None = None, + *, + metric_type: str, + outputs: list[MetricOutputSpec], + config_schema: None = None, +) -> Callable[[ScorerCallable[ConfigT]], MetricScorerFunction[ConfigT]]: ... + + +@overload +def scorer( + fn: ScorerCallable[ConfigT], + *, + metric_type: str, + outputs: list[MetricOutputSpec], + config_schema: None = None, +) -> MetricScorerFunction[ConfigT]: ... + + +@overload +def scorer(fn: ScorerCallable[ConfigT]) -> ScorerCallable[ConfigT]: ... + + +@overload +def scorer( + fn: None = None, + *, + metric_type: None = None, + outputs: None = None, + config_schema: None = None, +) -> Callable[[ScorerCallable[ConfigT]], ScorerCallable[ConfigT]]: ... + + +def scorer( + fn: Callable[..., ScorerReturn] | None = None, + *, + metric_type: str | None = None, + outputs: list[MetricOutputSpec] | None = None, + config_schema: type[BaseModel] | None = None, +) -> object: + """Marks a function as a scorer. + + Plain ``@scorer`` keeps the current ``ScorerInput -> dict`` behavior. + ``@scorer(metric_type=..., outputs=...)`` exposes ``descriptor`` and + ``to_metric()`` for adapting scorer functions to the shared Metric protocol. + """ + if outputs is None and (metric_type is not None or config_schema is not None): + metric_options = [ + option + for option, value in ( + ("metric_type=...", metric_type), + ("config_schema=...", config_schema), + ) + if value is not None + ] + raise ValueError( + f"@scorer({', '.join(metric_options)}) opts into the Metric contract, but no outputs were declared. " + "Pass outputs=[MetricOutputSpec(...)] so the metric descriptor can declare and validate outputs." + ) + if outputs is not None and metric_type is None: + raise ValueError( + "@scorer(outputs=...) opts into the Metric contract, but no metric_type was declared. " + "Pass metric_type='...' so the metric has a stable identity across refactors." + ) + + def decorate(fn: Callable[..., ScorerReturn]) -> object: + return _decorate_scorer( + cast(ScorerCallable[ScorerConfig], fn), + metric_type=metric_type, + outputs=outputs, + config_schema=config_schema, + ) + + return decorate(fn) if fn is not None else decorate -def image_builder(builder_fn: Callable[[list[dict]], ImageBuildRequest]): +def _decorate_scorer( + fn: ScorerCallable[ConfigT], + *, + metric_type: str | None = None, + outputs: list[MetricOutputSpec] | None = None, + config_schema: type[BaseModel] | None = None, +): + setattr(fn, "_is_scorer", True) + if outputs is None: + return fn + if metric_type is None: + raise ValueError("metric_type is required when outputs are declared") + + descriptor = MetricDescriptor( + type=metric_type, + outputs=outputs, + config_schema=config_schema, + ) + + def to_metric() -> ScorerFunctionMetric[ConfigT]: + return ScorerFunctionMetric( + descriptor=descriptor, + scorer_fn=fn, + ) + + setattr(fn, "descriptor", descriptor) + setattr(fn, "to_metric", to_metric) + return fn + +def image_builder(builder_fn: Callable[[list[dict[str, Any]]], ImageBuildRequest]): """Declare images that need building, stacked with ``@benchmark``. ``builder_fn`` receives the dataset rows and returns an diff --git a/src/nemo_evaluator/scoring/__init__.py b/src/nemo_evaluator/scoring/__init__.py index 3ca193cf7..928b7fd26 100644 --- a/src/nemo_evaluator/scoring/__init__.py +++ b/src/nemo_evaluator/scoring/__init__.py @@ -22,6 +22,8 @@ from typing import Callable +from nemo_evaluator.scoring.code_execution import code_sandbox, code_sandbox_async +from nemo_evaluator.scoring.json_schema import extract_json, validate_json_schema from nemo_evaluator.scoring.judge import ( JudgeScoringConfig, build_judge_prompt, @@ -29,9 +31,27 @@ needs_judge, parse_judge_response, ) -from nemo_evaluator.scoring.json_schema import extract_json, validate_json_schema +from nemo_evaluator.scoring.metric import ( + BooleanValue, + CandidateOutput, + ContinuousScore, + DatasetRow, + DiscreteScore, + Label, + Metric, + MetricDescriptor, + MetricInput, + MetricOutput, + MetricOutputSpec, + MetricResult, + MetricScorerFunction, + ScorerCallable, + ScorerConfig, + ScorerFunctionMetric, + ScorerReturn, + score_names_from_output_spec, +) from nemo_evaluator.scoring.pattern import answer_line, multichoice_regex, numeric_match -from nemo_evaluator.scoring.code_execution import code_sandbox, code_sandbox_async from nemo_evaluator.scoring.text import exact_match, extract_mcq_letter, fuzzy_match from nemo_evaluator.scoring.types import ScorerInput @@ -65,6 +85,24 @@ def list_scorers() -> list[str]: __all__ = [ "ScorerInput", + "Metric", + "BooleanValue", + "DatasetRow", + "ContinuousScore", + "CandidateOutput", + "DiscreteScore", + "Label", + "MetricInput", + "MetricOutput", + "MetricOutputSpec", + "MetricDescriptor", + "MetricResult", + "MetricScorerFunction", + "ScorerCallable", + "ScorerConfig", + "ScorerFunctionMetric", + "ScorerReturn", + "score_names_from_output_spec", "get_scorer", "list_scorers", # Text @@ -87,4 +125,15 @@ def list_scorers() -> list[str]: # JSON "extract_json", "validate_json_schema", + # Multiple-choice (lazy-loaded — depends on @scorer in environments.custom) + "multiple_choice_acc", + "mcq_letter_extract", ] + + +def __getattr__(name: str): + if name in ("multiple_choice_acc", "mcq_letter_extract"): + from nemo_evaluator.scoring import multiple_choice as _mc + + return getattr(_mc, name) + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/src/nemo_evaluator/scoring/metric.py b/src/nemo_evaluator/scoring/metric.py new file mode 100644 index 000000000..4f7dafcac --- /dev/null +++ b/src/nemo_evaluator/scoring/metric.py @@ -0,0 +1,395 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Shared MetricInput -> MetricResult runtime contract.""" + +from __future__ import annotations + +import inspect +from collections.abc import Awaitable, Callable, Mapping +from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast, runtime_checkable + +from pydantic import BaseModel, ConfigDict, Field, RootModel, field_validator + +from nemo_evaluator.scoring.types import ScorerInput + +if TYPE_CHECKING: + from nemo_evaluator.sandbox.base import Sandbox + + +ConfigT = TypeVar("ConfigT", bound=Mapping[str, object] | BaseModel) +SchemaT = TypeVar("SchemaT", bound=BaseModel) + + +class DatasetRow(BaseModel): + """Original benchmark dataset row plus optional stable row identity.""" + + model_config = ConfigDict(extra="forbid") + + row_index: int | None = None + data: dict[str, object] + + +class CandidateOutput(BaseModel): + """Candidate output being scored for one dataset row.""" + + model_config = ConfigDict(extra="forbid") + + output_text: str | None = None + response: object | None = None + trajectory: object | None = None + metadata: dict[str, object] = Field(default_factory=dict) + + +class MetricInput(BaseModel): + """Complete per-row scoring input passed to a metric.""" + + model_config = ConfigDict(extra="forbid") + + row: DatasetRow + candidate: CandidateOutput + + +class ContinuousScore(RootModel[float]): + """Continuous numeric metric value.""" + + +class DiscreteScore(RootModel[int]): + """Discrete numeric metric value.""" + + +class Label(RootModel[str]): + """String label metric value.""" + + +class BooleanValue(RootModel[bool]): + """Boolean metric value.""" + + +class MetricOutputSpec(BaseModel, Generic[SchemaT]): + """Schema for one named value emitted by a metric.""" + + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) + + name: str + description: str | None = None + value_schema: type[SchemaT] + + @field_validator("name") + @classmethod + def _name_must_not_be_empty(cls, value: str) -> str: + if not value: + raise ValueError("metric output name must not be empty") + return value + + @staticmethod + def continuous_score(name: str, description: str | None = None) -> MetricOutputSpec[ContinuousScore]: + return MetricOutputSpec[ContinuousScore](name=name, description=description, value_schema=ContinuousScore) + + @staticmethod + def discrete_score(name: str, description: str | None = None) -> MetricOutputSpec[DiscreteScore]: + return MetricOutputSpec[DiscreteScore](name=name, description=description, value_schema=DiscreteScore) + + @staticmethod + def label(name: str, description: str | None = None) -> MetricOutputSpec[Label]: + return MetricOutputSpec[Label](name=name, description=description, value_schema=Label) + + @staticmethod + def boolean(name: str, description: str | None = None) -> MetricOutputSpec[BooleanValue]: + return MetricOutputSpec[BooleanValue](name=name, description=description, value_schema=BooleanValue) + + @staticmethod + def model( + name: str, + value_schema: type[SchemaT], + description: str | None = None, + ) -> MetricOutputSpec[SchemaT]: + return MetricOutputSpec[SchemaT](name=name, description=description, value_schema=value_schema) + + def coerce_value(self, value: object) -> SchemaT: + """Validate and coerce a raw output value to this spec's declared schema.""" + return self.value_schema.model_validate(value) + + def coerce_output(self, output: MetricOutput) -> SchemaT: + """Validate and coerce a named metric output against this spec.""" + if output.name != self.name: + raise ValueError(f"Expected metric output {self.name!r}, got {output.name!r}") + return self.coerce_value(output.value) + + def value_json_schema(self) -> dict[str, object]: + return self.value_schema.model_json_schema() + + +class MetricDescriptor(BaseModel): + """Metadata needed to materialize a decorated scorer as a Metric.""" + + model_config = ConfigDict(extra="forbid") + + type: str + outputs: list[MetricOutputSpec] = Field(min_length=1) + config_schema: type[BaseModel] | None = None + + @field_validator("type") + @classmethod + def _type_must_not_be_empty(cls, value: str) -> str: + if not value: + raise ValueError("metric type must not be empty") + return value + + + @field_validator("outputs") + @classmethod + def _output_names_must_be_unique( + cls, value: list[MetricOutputSpec] + ) -> list[MetricOutputSpec]: + names = [output.name for output in value] + duplicates = sorted({name for name in names if names.count(name) > 1}) + if duplicates: + raise ValueError(f"duplicate metric output names: {duplicates}") + return value + + +class MetricOutput(BaseModel): + """One named value emitted by a metric.""" + + model_config = ConfigDict(extra="forbid") + + name: str + value: object + + +class MetricResult(BaseModel): + """Structured row-level metric result.""" + + model_config = ConfigDict(extra="forbid") + + outputs: list[MetricOutput] + + +@runtime_checkable +class Metric(Protocol): + """Shared row-scoring primitive.""" + + @property + def type(self) -> str: ... + + def output_spec(self) -> list[MetricOutputSpec]: ... + + async def compute_scores(self, input: MetricInput) -> MetricResult: ... + + +ScorerReturn = Mapping[str, object] | Awaitable[Mapping[str, object]] +ScorerCallable = Callable[[ScorerInput[ConfigT]], ScorerReturn] +ScorerConfig = Mapping[str, object] | BaseModel + + +class MetricScorerFunction(Protocol[ConfigT]): + """Decorated scorer function that can be materialized as a metric.""" + + @property + def descriptor(self) -> MetricDescriptor: ... + + def __call__(self, sample: ScorerInput[ConfigT]) -> ScorerReturn: ... + + def to_metric(self) -> ScorerFunctionMetric[ConfigT]: ... + + +class ScorerFunctionMetric(Generic[ConfigT]): + """Metric adapter for decorator-authored OSS ScorerInput -> dict scorers.""" + + def __init__( + self, + *, + descriptor: MetricDescriptor, + scorer_fn: ScorerCallable[ConfigT], + config: ConfigT | None = None, + sandbox: "Sandbox | None" = None, + target: object | None = None, + target_field: str = "target", + ) -> None: + self._descriptor = descriptor + self._scorer_fn = scorer_fn + self._config: ConfigT | Mapping[str, object] | None = ( + self._validate_config(config) if config is not None else None + ) + self._sandbox = sandbox + self._target = target + self._target_field = target_field + + @property + def type(self) -> str: + return self._descriptor.type + + @property + def config(self) -> dict[str, object]: + config = self._resolve_config() + if isinstance(config, BaseModel): + return config.model_dump() + return dict(config) + + @property + def sandbox(self) -> "Sandbox | None": + return self._sandbox + + @property + def target(self) -> object | None: + return self._target + + @property + def target_field(self) -> str: + return self._target_field + + def bind( + self, + *, + config: ConfigT | None = None, + sandbox: "Sandbox | None" = None, + target: object | None = None, + target_field: str | None = None, + ) -> ScorerFunctionMetric[ConfigT]: + validated_config = self._config if config is None else self._validate_config(config) + return ScorerFunctionMetric( + descriptor=self._descriptor, + scorer_fn=self._scorer_fn, + config=cast(ConfigT, validated_config), + sandbox=self._sandbox if sandbox is None else sandbox, + target=self._target if target is None else target, + target_field=self._target_field if target_field is None else target_field, + ) + + def bind_raw_config( + self, + *, + config: ScorerConfig | None = None, + sandbox: "Sandbox | None" = None, + target: object | None = None, + target_field: str | None = None, + ) -> ScorerFunctionMetric[ConfigT]: + """Bind dict-like runtime config, validating it against ``config_schema`` when present.""" + validated_config = self._config if config is None else self._validate_config(config, coerce=True) + return ScorerFunctionMetric( + descriptor=self._descriptor, + scorer_fn=self._scorer_fn, + config=cast(ConfigT, validated_config), + sandbox=self._sandbox if sandbox is None else sandbox, + target=self._target if target is None else target, + target_field=self._target_field if target_field is None else target_field, + ) + + def output_spec(self) -> list[MetricOutputSpec]: + return self._descriptor.outputs + + async def compute_scores(self, input: MetricInput) -> MetricResult: + # Merge row-level dataset data with per-row candidate metadata so legacy + # ``ScorerInput`` scorers see solver-produced payloads (logprobs, + # trajectories, etc.) alongside dataset fields. Candidate metadata + # wins on key collisions — it is the more specific source. + merged_metadata: dict[str, object] = {**dict(input.row.data), **dict(input.candidate.metadata)} + sample: ScorerInput[ConfigT] = ScorerInput( + response=input.candidate.output_text or "", + target=self._target if self._target is not None else input.row.data.get(self._target_field), + metadata=merged_metadata, + config=cast(ConfigT, self._resolve_config()), + sandbox=self._sandbox, + ) + result = self._scorer_fn(sample) + if inspect.isawaitable(result): + result = await result + if not isinstance(result, Mapping): + raise TypeError(f"scorer_fn must return a mapping, got {type(result).__name__}") + metric_result = MetricResult( + outputs=[MetricOutput(name=name, value=value) for name, value in cast(Mapping[str, object], result).items()] + ) + return validate_metric_result(metric_result, self._descriptor.outputs) + + def _validate_config( + self, config: ConfigT | ScorerConfig, *, coerce: bool = False + ) -> ConfigT | Mapping[str, object]: + schema = self._descriptor.config_schema + if schema is None: + if isinstance(config, BaseModel): + return config.model_dump() + return dict(config) + if isinstance(config, schema): + return cast(ConfigT, config) + if not coerce: + raise TypeError( + f"config must be an instance of {schema.__name__}; " + "use bind_raw_config(...) to validate dict-like runtime config" + ) + payload = config.model_dump() if isinstance(config, BaseModel) else config + return cast(ConfigT, schema.model_validate(payload)) + + def _resolve_config(self) -> ConfigT | Mapping[str, object]: + if self._config is not None: + return self._config + schema = self._descriptor.config_schema + if schema is None: + return {} + return cast(ConfigT, schema.model_validate({})) + + +def validate_metric_result(result: MetricResult, outputs: list[MetricOutputSpec]) -> MetricResult: + """Validate a metric result against its declared outputs.""" + returned_names = [output.name for output in result.outputs] + duplicates = sorted({name for name in returned_names if returned_names.count(name) > 1}) + if duplicates: + raise ValueError(f"Duplicate metric output names: {duplicates}") + + outputs_by_name = {output.name: output for output in outputs} + declared_names = [output.name for output in outputs] + declared = set(declared_names) + returned = set(returned_names) + missing = [name for name in declared_names if name not in returned] + undeclared = [name for name in returned_names if name not in declared] + + if missing: + raise ValueError(f"Missing declared metric outputs: {missing}") + if undeclared: + raise ValueError(f"Undeclared metric outputs: {undeclared}") + for output in result.outputs: + outputs_by_name[output.name].coerce_output(output) + return result + + +def score_names_from_output_spec(outputs: list[MetricOutputSpec]) -> list[str]: + """Return declared numeric score names from metric output specs.""" + return [ + output.name + for output in outputs + if issubclass(output.value_schema, ContinuousScore | DiscreteScore | BooleanValue) + ] + + +__all__ = [ + "BooleanValue", + "CandidateOutput", + "ContinuousScore", + "DatasetRow", + "DiscreteScore", + "Label", + "Metric", + "MetricDescriptor", + "MetricInput", + "MetricOutput", + "MetricOutputSpec", + "MetricResult", + "MetricScorerFunction", + "ScorerCallable", + "ScorerConfig", + "ScorerFunctionMetric", + "ScorerReturn", + "score_names_from_output_spec", + "validate_metric_result", +] diff --git a/src/nemo_evaluator/scoring/multiple_choice.py b/src/nemo_evaluator/scoring/multiple_choice.py new file mode 100644 index 000000000..56c14021b --- /dev/null +++ b/src/nemo_evaluator/scoring/multiple_choice.py @@ -0,0 +1,217 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Multiple-choice loglikelihood scorers (lm-evaluation-harness parity). + +The :func:`multiple_choice_acc` scorer expects a per-row payload populated +by an upstream solver (typically :class:`LogprobRankingSolver`) that already +ran the candidate continuations and computed sum-loglikelihoods. The payload +reaches the scorer via :attr:`ScorerInput.metadata` under three keys: + +- ``_mc_choices``: list of candidate continuation strings. +- ``_mc_choices_logprobs``: list of per-choice sum-loglikelihoods. +- ``_mc_choices_is_greedy``: list of booleans, True iff every continuation + token equals the top-1 logprob token at its position. + +The companion :func:`mcq_letter_extract` scorer scores the text-mode case +where the model produced a free-form response and we extract a single +letter (A-J) before comparing to ground truth. +""" + +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +from nemo_evaluator.environments.custom import scorer +from nemo_evaluator.scoring.metric import MetricOutputSpec + +if TYPE_CHECKING: + from nemo_evaluator.scoring.types import ScorerInput + + +_INT_TO_LETTER = {i: chr(ord("A") + i) for i in range(10)} + + +# Patterns ordered most-specific first; the first match wins. +_MCQ_LETTER_PATTERNS = ( + re.compile(r"\\boxed\{\s*([A-Ja-j])\s*\}"), + re.compile(r"(?:answer\s+is\s*[:\-]?\s*\(?)\s*([A-Ja-j])\b", re.IGNORECASE), + re.compile(r"\boption\s*\(?([A-Ja-j])\)", re.IGNORECASE), + re.compile(r"^\s*\(?([A-Ja-j])[\)\.\:]", re.IGNORECASE), + re.compile(r"\b([A-Ja-j])\b"), +) + + +def _extract_letter(response: str | None) -> str: + """Best-effort extract a single A-J letter from a free-form response.""" + text = (response or "").strip() + if not text: + return "" + if text[0].upper() in "ABCDEFGHIJ" and (len(text) == 1 or not text[1].isalpha()): + return text[0].upper() + for pat in _MCQ_LETTER_PATTERNS: + m = pat.search(text[:200]) + if m: + return m.group(1).upper() + return "" + + +@scorer( + metric_type="multiple_choice_acc", + outputs=[ + MetricOutputSpec.continuous_score( + "acc", + description="1.0 iff argmax of raw choice loglikelihoods matches gold (canonical MMLU metric).", + ), + MetricOutputSpec.continuous_score( + "acc_norm", + description="1.0 iff argmax of length-normalized loglikelihoods matches gold (ARC/BoolQ).", + ), + MetricOutputSpec.continuous_score( + "acc_greedy", + description="1.0 iff the highest-loglikelihood greedy-eligible choice matches gold.", + ), + ], +) +def multiple_choice_acc(sample: ScorerInput) -> dict: + """Score multiple-choice loglikelihood ranking with lm-eval-harness metrics. + + Reads ``_mc_choices``, ``_mc_choices_logprobs``, ``_mc_choices_is_greedy`` + from :attr:`ScorerInput.metadata`. Returns ``{"acc": 0.0, "acc_norm": + 0.0, "acc_greedy": 0.0}`` when the payload is missing or malformed (e.g., + user wired the scorer to a benchmark that didn't go through the + LogprobRankingSolver). + + Resolves the gold target heuristically — accepts integer index, single + letter ``"A"..."Z"``, stringified integer, or verbatim choice text. + Length-normalization uses character length to match lm-eval-harness's + ``acc_norm``. + """ + meta = sample.metadata or {} + choices = meta.get("_mc_choices") or [] + logprobs = meta.get("_mc_choices_logprobs") or [] + is_greedy = meta.get("_mc_choices_is_greedy") or [] + + zero = {"acc": 0.0, "acc_norm": 0.0, "acc_greedy": 0.0} + if not choices or not logprobs or len(choices) != len(logprobs): + return zero + + gold_idx = _resolve_gold_index(sample.target, choices) + if gold_idx is None: + return zero + + raw_argmax = max(range(len(logprobs)), key=lambda i: logprobs[i]) + + norm_scores = [logprobs[i] / max(len(choices[i]), 1) for i in range(len(choices))] + norm_argmax = max(range(len(norm_scores)), key=lambda i: norm_scores[i]) + + greedy_argmax: int | None = None + if is_greedy and any(is_greedy): + greedy_indices = [i for i, g in enumerate(is_greedy) if g] + if greedy_indices: + greedy_argmax = max(greedy_indices, key=lambda i: logprobs[i]) + + return { + "acc": 1.0 if raw_argmax == gold_idx else 0.0, + "acc_norm": 1.0 if norm_argmax == gold_idx else 0.0, + "acc_greedy": 1.0 if greedy_argmax is not None and greedy_argmax == gold_idx else 0.0, + } + + +@scorer( + metric_type="mcq_letter_extract", + outputs=[ + MetricOutputSpec.continuous_score( + "correct", + description="1.0 iff the letter extracted from the response matches the gold letter.", + ), + MetricOutputSpec.boolean( + "parsed", + description="True iff the response yielded a non-empty extracted letter.", + ), + ], +) +def mcq_letter_extract(sample: ScorerInput) -> dict: + """Extract A-J from response and compare to target. + + Handles common response formats: ``"A"``, ``"A)"``, ``"The answer is B"``, + ``"B. Because..."``, ``"(C)"``, ``"Option D"``, ``"\\boxed{E}"``. + + Targets may be a letter (``"A"..."J"``), an integer (``0..9``), the + string form of an integer, or verbatim choice text. When the target is + verbatim text and the row has letter-coded choice columns (``a``/``b``/ + ``c``/``d`` in ``ScorerInput.metadata``), it is matched against those. + """ + predicted = _extract_letter(sample.response) + + raw = sample.target + target_letter = "" + if isinstance(raw, bool): + raw = int(raw) + if isinstance(raw, int): + target_letter = _INT_TO_LETTER.get(raw, "") + elif isinstance(raw, str): + s = raw.strip() + if s.isdigit(): + target_letter = _INT_TO_LETTER.get(int(s), s.upper()) + elif s and s[0].upper() in "ABCDEFGHIJ" and len(s) <= 2: + target_letter = s[0].upper() + else: + for letter, key in zip("ABCD", ("a", "b", "c", "d")): + if (sample.metadata or {}).get(key, "").strip().lower() == s.lower(): + target_letter = letter + break + + return { + "correct": 1.0 if predicted and predicted == target_letter else 0.0, + "parsed": bool(predicted), + } + + +def _resolve_gold_index(target: object, choices: list[str]) -> int | None: + """Map a heterogeneous target value to an index into choices.""" + if target is None: + return None + if isinstance(target, bool): + i = int(target) + return i if i < len(choices) else None + if isinstance(target, int): + return target if 0 <= target < len(choices) else None + if isinstance(target, str): + s = target.strip() + if not s: + return None + if len(s) == 1 and s.upper().isalpha(): + idx = ord(s.upper()) - ord("A") + if 0 <= idx < len(choices): + return idx + if s.lstrip("-").isdigit(): + try: + idx = int(s) + if 0 <= idx < len(choices): + return idx + except ValueError: + pass + for i, c in enumerate(choices): + if c.strip() == s: + return i + s_low = s.lower() + for i, c in enumerate(choices): + if c.strip().lower() == s_low: + return i + return None + + +__all__ = ["multiple_choice_acc", "mcq_letter_extract"] diff --git a/src/nemo_evaluator/scoring/types.py b/src/nemo_evaluator/scoring/types.py index fa2426a5a..fe3a7fa43 100644 --- a/src/nemo_evaluator/scoring/types.py +++ b/src/nemo_evaluator/scoring/types.py @@ -17,14 +17,17 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Generic, TypeVar if TYPE_CHECKING: from nemo_evaluator.sandbox.base import Sandbox +ConfigT = TypeVar("ConfigT") + + @dataclass -class ScorerInput: +class ScorerInput(Generic[ConfigT]): """Input passed to scorer functions. The ``sandbox`` field is available for scorers that need to inspect or @@ -36,5 +39,5 @@ class ScorerInput: response: str target: Any metadata: dict[str, Any] = field(default_factory=dict) - config: dict[str, Any] = field(default_factory=dict) + config: ConfigT = field(default_factory=dict) sandbox: Sandbox | None = None diff --git a/src/nemo_evaluator/solvers/__init__.py b/src/nemo_evaluator/solvers/__init__.py index 549cf3a11..9bb590abd 100644 --- a/src/nemo_evaluator/solvers/__init__.py +++ b/src/nemo_evaluator/solvers/__init__.py @@ -17,6 +17,7 @@ from nemo_evaluator.solvers.base import Solver, SolveResult from nemo_evaluator.solvers.chat import ChatSolver from nemo_evaluator.solvers.completion import CompletionSolver +from nemo_evaluator.solvers.logprob import LogprobRankingSolver from nemo_evaluator.solvers.nat import NatSolver from nemo_evaluator.solvers.openclaw import OpenClawSolver from nemo_evaluator.solvers.vlm import VLMSolver @@ -25,6 +26,7 @@ "ChatSolver", "CompletionSolver", "HarborSolver", + "LogprobRankingSolver", "NatSolver", "OpenClawSolver", "ReActSolver", diff --git a/src/nemo_evaluator/solvers/logprob.py b/src/nemo_evaluator/solvers/logprob.py new file mode 100644 index 000000000..7bd609db4 --- /dev/null +++ b/src/nemo_evaluator/solvers/logprob.py @@ -0,0 +1,232 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""LogprobRankingSolver: score candidate continuations by sum-loglikelihood. + +Mirrors lm-evaluation-harness's ``loglikelihood`` contract for the +``local-completions`` model adapter. For each row the solver: + +1. Reads candidate continuations from ``task.metadata["_mc_choices"]``. +2. POSTs ``/completions`` with ``prompt = task.prompt + continuation``, + ``max_tokens=0``, ``echo=true``, ``logprobs=1`` for each candidate. +3. Locates the continuation token span via OpenAI-compatible + ``logprobs.text_offset`` (first token whose offset is ``>= len(prompt)``). +4. Sums ``token_logprobs`` over the continuation span, computes greedy + eligibility against ``top_logprobs``, returns a ``SolveResult`` whose + ``response`` is the argmax choice and whose ``scoring_details`` carries + the per-choice logprobs/greedy flags for the verify stage. + +The eval loop forwards ``solve_result.scoring_details`` to +:meth:`EvalEnvironment.verify` so the scoring step can place the payload +on ``MetricInput.candidate.metadata`` for the metric to consume. +""" + +from __future__ import annotations + +import asyncio +import logging +import time +from typing import Any + +from nemo_evaluator.environments.base import SeedResult +from nemo_evaluator.observability.types import ModelResponse +from nemo_evaluator.solvers.base import ErrorKind, SolveResult +from nemo_evaluator.solvers.trajectory_util import build_single_turn_atif + +logger = logging.getLogger(__name__) + + +class LogprobRankingSolver: + """Solver that ranks candidate continuations by sum-loglikelihood. + + Required ``task.metadata`` keys: + + - ``_mc_choices``: ``list[str]`` of candidate continuations. + + Optional ``task.metadata`` keys: + + - ``_mc_continuation_separator``: ``str`` inserted between the prompt + and each continuation (default ``""``). lm-eval-harness conventionally + uses ``" "`` for whitespace-sensitive tokenizers. + """ + + def __init__( + self, + base_url: str, + model: str, + api_key: str | None = None, + max_concurrent_choices: int = 8, + ) -> None: + from nemo_evaluator.engine.model_client import ModelClient + + self._model_client = ModelClient( + base_url=base_url.rstrip("/"), + model=model, + api_key=api_key, + ) + self._url = f"{base_url.rstrip('/')}/completions" + self._model = model + self._max_concurrent_choices = max_concurrent_choices + + async def solve(self, task: SeedResult) -> SolveResult: + choices: list[str] = list(task.metadata.get("_mc_choices") or []) + if not choices: + return SolveResult( + response="", + error="LogprobRankingSolver requires task.metadata['_mc_choices'] to be a non-empty list", + error_kind=ErrorKind.GRACEFUL, + ) + + separator = task.metadata.get("_mc_continuation_separator", "") + prompt = task.prompt or "" + full_context = (task.system + "\n" + prompt) if task.system else prompt + + sem = asyncio.Semaphore(self._max_concurrent_choices) + + async def _score_one(choice: str) -> tuple[float, bool]: + async with sem: + return await self._score_continuation(full_context, separator, choice) + + t0 = time.monotonic() + results = await asyncio.gather(*[_score_one(c) for c in choices], return_exceptions=True) + latency_ms = round((time.monotonic() - t0) * 1000, 2) + + logprobs: list[float] = [] + is_greedy: list[bool] = [] + for r in results: + if isinstance(r, BaseException): + return SolveResult( + response="", + error=f"LogprobRankingSolver inference failed: {r}", + error_kind=ErrorKind.INFRA, + ) + ll, greedy = r + logprobs.append(ll) + is_greedy.append(greedy) + + argmax_idx = max(range(len(choices)), key=lambda i: logprobs[i]) + response = choices[argmax_idx] + + model_resp = ModelResponse( + content=response, + model=self._model, + finish_reason="stop", + prompt_tokens=None, + completion_tokens=None, + total_tokens=None, + latency_ms=latency_ms, + raw_response={"choices_logprobs": logprobs, "choices_is_greedy": is_greedy, "n_choices": len(choices)}, + ) + trajectory = build_single_turn_atif(prompt, response, model_name=self._model) + return SolveResult( + response=response, + model_response=model_resp, + trajectory=trajectory, + scoring_details={ + "_mc_choices": choices, + "_mc_choices_logprobs": logprobs, + "_mc_choices_is_greedy": is_greedy, + }, + ) + + async def close(self) -> None: + await self._model_client.close() + + async def _score_continuation(self, context: str, separator: str, continuation: str) -> tuple[float, bool]: + """Score one continuation by sum-loglikelihood. + + Returns ``(sum_logprob, is_greedy)``. ``is_greedy`` is True iff every + continuation token equals the top-1 candidate at its position under + the model's logprob distribution. + """ + full_context = context + separator + full_prompt = full_context + continuation + payload: dict[str, Any] = { + "model": self._model, + "prompt": full_prompt, + "max_tokens": 0, + "temperature": 0.0, + "logprobs": 1, + "echo": True, + } + data = await self._model_client._post_with_retry(self._url, payload) + return _parse_loglikelihood_response(data, full_context) + + +def _parse_loglikelihood_response(body: dict[str, Any], context: str) -> tuple[float, bool]: + """Parse an OpenAI-compatible /completions response into ``(sum_logprob, is_greedy)``. + + Returns ``(-inf, False)`` when the response is missing logprobs or the + continuation span is empty (zero tokens). + """ + try: + choice = body["choices"][0] + logprobs = choice.get("logprobs") or {} + except (KeyError, IndexError, TypeError): + return (float("-inf"), False) + + tokens: list[str] = logprobs.get("tokens") or [] + token_logprobs: list[float | None] = logprobs.get("token_logprobs") or [] + text_offset: list[int | None] = logprobs.get("text_offset") or [] + top_logprobs: list[dict[str, float] | None] = logprobs.get("top_logprobs") or [] + + if not tokens or not token_logprobs: + return (float("-inf"), False) + + ctx_len = len(context) + start_idx: int | None = None + for i, off in enumerate(text_offset): + if off is not None and off >= ctx_len: + start_idx = i + break + + # Fallback: tokenizer merged context-end with continuation-start. Walk + # backwards from the end and pick the first token whose ``text_offset`` + # is < ``ctx_len`` — the token AFTER it is the first that contains + # continuation chars only. This is an approximation; lm-eval-harness + # tokenizes context separately for exact accounting. + if start_idx is None: + for i in range(len(text_offset) - 1, -1, -1): + off = text_offset[i] + if off is not None and off < ctx_len: + start_idx = i + 1 if i + 1 < len(tokens) else i + break + if start_idx is None: + start_idx = max(len(tokens) - 1, 0) + + cont_token_logprobs = [lp for lp in token_logprobs[start_idx:] if lp is not None] + if not cont_token_logprobs: + return (float("-inf"), False) + + sum_logprob = float(sum(cont_token_logprobs)) + + is_greedy = True + for i in range(start_idx, len(tokens)): + top = top_logprobs[i] if i < len(top_logprobs) else None + if not top: + is_greedy = False + break + try: + best_token = max(top.items(), key=lambda kv: kv[1])[0] + except (AttributeError, ValueError): + is_greedy = False + break + if best_token != tokens[i]: + is_greedy = False + break + + return (sum_logprob, is_greedy) + + +__all__ = ["LogprobRankingSolver"] diff --git a/tests/test_environments/test_byob_mc_integration.py b/tests/test_environments/test_byob_mc_integration.py new file mode 100644 index 000000000..3fa328365 --- /dev/null +++ b/tests/test_environments/test_byob_mc_integration.py @@ -0,0 +1,295 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""End-to-end BYOB integration tests for multiple-choice + few-shot. + +These tests exercise the full ``ByobEnvironment.seed → verify`` path with +synthetic datasets and a fake LogprobRankingSolver. No model server is +required — the solver is mocked, the dataset is in-memory, and the eval +loop is sidestepped (we call seed/verify directly to keep the test +hermetic and fast). +""" + +from __future__ import annotations + +import pytest + +from nemo_evaluator.environments.custom import ( + BenchmarkDefinition, + ByobEnvironment, + _metric_input_from_verify, + _resolve_mc_choices, + benchmark, + scorer, +) +from nemo_evaluator.scoring.metric import MetricOutputSpec +from nemo_evaluator.scoring.multiple_choice import multiple_choice_acc + + +# ── _resolve_mc_choices ───────────────────────────────────────────────────── + + +def test_resolve_mc_choices_static_list() -> None: + defn = BenchmarkDefinition( + name="t", dataset=[], prompt="", choices=["A", "B", "C", "D"] + ) + assert _resolve_mc_choices({"answer": 1}, defn) == ["A", "B", "C", "D"] + + +def test_resolve_mc_choices_per_row_field() -> None: + defn = BenchmarkDefinition( + name="t", dataset=[], prompt="", choices_field="options" + ) + assert _resolve_mc_choices({"options": ["X", "Y"]}, defn) == ["X", "Y"] + + +def test_resolve_mc_choices_dotted_path() -> None: + """ARC-style nested choices: row['choices']['text'] = [...].""" + defn = BenchmarkDefinition( + name="t", dataset=[], prompt="", choices_field="choices.text" + ) + row = {"choices": {"text": ["foo", "bar"], "label": ["A", "B"]}} + assert _resolve_mc_choices(row, defn) == ["foo", "bar"] + + +def test_resolve_mc_choices_field_takes_precedence_over_static() -> None: + defn = BenchmarkDefinition( + name="t", + dataset=[], + prompt="", + choices=["fallback1", "fallback2"], + choices_field="options", + ) + assert _resolve_mc_choices({"options": ["x", "y"]}, defn) == ["x", "y"] + + +def test_resolve_mc_choices_falls_back_when_field_missing() -> None: + defn = BenchmarkDefinition( + name="t", + dataset=[], + prompt="", + choices=["fallback1", "fallback2"], + choices_field="missing", + ) + assert _resolve_mc_choices({}, defn) == ["fallback1", "fallback2"] + + +def test_resolve_mc_choices_returns_none_when_neither_set() -> None: + defn = BenchmarkDefinition(name="t", dataset=[], prompt="") + assert _resolve_mc_choices({}, defn) is None + + +# ── _metric_input_from_verify lifts _mc_* into candidate.metadata ─────────── + + +def test_metric_input_from_verify_separates_mc_keys() -> None: + metric_input = _metric_input_from_verify( + response="Paris", + metadata={ + "question": "Capital of France?", + "answer_idx": 2, + "_mc_choices": ["A", "B", "Paris", "London"], + "_mc_choices_logprobs": [-5.1, -4.8, -1.2, -3.7], + "_mc_choices_is_greedy": [False, False, True, False], + "_solver_attempts": 1, + }, + ) + # Dataset row keys go to row.data + assert metric_input.row.data == {"question": "Capital of France?", "answer_idx": 2} + # _mc_* and _solver_* keys go to candidate.metadata + assert metric_input.candidate.metadata == { + "_mc_choices": ["A", "B", "Paris", "London"], + "_mc_choices_logprobs": [-5.1, -4.8, -1.2, -3.7], + "_mc_choices_is_greedy": [False, False, True, False], + "_solver_attempts": 1, + } + assert metric_input.candidate.output_text == "Paris" + + +# ── End-to-end BYOB seed + verify with multiple_choice_acc ────────────────── + + +@pytest.mark.asyncio +async def test_byob_mmlu_style_end_to_end() -> None: + """MMLU-style 4-way multiple-choice with static choices, in-memory dataset.""" + dataset = [ + {"question": "Capital of France?", "answer": 2}, + {"question": "Capital of UK?", "answer": 3}, + ] + defn = BenchmarkDefinition( + name="mmlu_synth", + dataset=lambda: dataset, + prompt="Q: {question}\nA: ", + target_field="answer", + choices=["Berlin", "Madrid", "Paris", "London"], + scorer_fn=multiple_choice_acc, + ) + env = ByobEnvironment(defn) + + # Seed populates _mc_choices in metadata + seed = await env.seed(0) + assert seed.metadata["_mc_choices"] == ["Berlin", "Madrid", "Paris", "London"] + assert seed.expected_answer == "2" + assert seed.prompt == "Q: Capital of France?\nA: " + + # Simulate the LogprobRankingSolver having already ranked the choices — + # answer index 2 ("Paris") wins. + solver_meta = { + "_mc_choices": ["Berlin", "Madrid", "Paris", "London"], + "_mc_choices_logprobs": [-5.1, -4.8, -1.2, -3.7], + "_mc_choices_is_greedy": [False, False, True, False], + } + merged = {**seed.metadata, **solver_meta} + vr = await env.verify("Paris", "2", **merged) + + assert vr.reward == 1.0 # acc=1, the default reward picks first declared "acc" + assert vr.scoring_details["metric_type"] == "multiple_choice_acc" + assert vr.scoring_details["outputs"] == {"acc": 1.0, "acc_norm": 1.0, "acc_greedy": 1.0} + + +@pytest.mark.asyncio +async def test_byob_per_row_choices_arc_style() -> None: + """ARC-style: variable-length per-row choices via dotted path.""" + dataset = [ + { + "question": "What's hot?", + "choices": {"text": ["lava", "ice", "water"], "label": ["A", "B", "C"]}, + "answer": "A", + }, + { + "question": "Smallest planet?", + "choices": {"text": ["Mercury", "Venus"], "label": ["A", "B"]}, + "answer": "A", + }, + ] + defn = BenchmarkDefinition( + name="arc_synth", + dataset=lambda: dataset, + prompt="Q: {question}\nA: ", + target_field="answer", + choices_field="choices.text", + scorer_fn=multiple_choice_acc, + ) + env = ByobEnvironment(defn) + + seed = await env.seed(0) + assert seed.metadata["_mc_choices"] == ["lava", "ice", "water"] + + seed1 = await env.seed(1) + assert seed1.metadata["_mc_choices"] == ["Mercury", "Venus"] + + +@pytest.mark.asyncio +async def test_byob_few_shot_prefix_prepended() -> None: + dataset = [ + {"q": "1+1", "a": "2"}, + {"q": "2+2", "a": "4"}, + {"q": "3+3", "a": "6"}, + ] + defn = BenchmarkDefinition( + name="fewshot_synth", + dataset=lambda: dataset, + prompt="Q: {q} A: ", + target_field="a", + num_fewshot=2, + fewshot_seed=0, + ) + env = ByobEnvironment(defn) + + seed = await env.seed(0) + # Prompt should have 2 few-shot examples + separator + the test prompt + assert seed.prompt.count("Q: ") == 3 # 2 fewshot + 1 test + assert seed.prompt.endswith("Q: 1+1 A: ") + + +@pytest.mark.asyncio +async def test_byob_few_shot_zero_means_no_prefix() -> None: + dataset = [{"q": "1+1", "a": "2"}, {"q": "2+2", "a": "4"}] + defn = BenchmarkDefinition( + name="nofewshot", + dataset=lambda: dataset, + prompt="Q: {q} A: ", + target_field="a", + num_fewshot=0, + ) + env = ByobEnvironment(defn) + seed = await env.seed(0) + assert seed.prompt == "Q: 1+1 A: " + + +@pytest.mark.asyncio +async def test_byob_mc_payload_threads_to_metric_via_verify_meta() -> None: + """End-to-end: seed → fake solver injects logprobs → verify → metric scores. + + This is the integration proof: the solver's output reaches the metric + through ``MetricInput.candidate.metadata`` without protocol changes. + """ + dataset = [ + {"question": "Capital of France?", "answer": 2}, + ] + defn = BenchmarkDefinition( + name="proof", + dataset=lambda: dataset, + prompt="Q: {question}\nA: ", + target_field="answer", + choices=["Berlin", "Madrid", "Paris", "London"], + scorer_fn=multiple_choice_acc, + ) + env = ByobEnvironment(defn) + + seed = await env.seed(0) + # Pretend a LogprobRankingSolver ran and wrote scoring_details + solver_scoring_details = { + "_mc_choices": ["Berlin", "Madrid", "Paris", "London"], + "_mc_choices_logprobs": [-5.1, -4.8, -1.2, -3.7], + "_mc_choices_is_greedy": [False, False, True, False], + } + merged_meta = {**seed.metadata, **solver_scoring_details} + vr = await env.verify("Paris", seed.expected_answer, **merged_meta) + + assert vr.reward == 1.0 + assert vr.scoring_details["outputs"] == { + "acc": 1.0, + "acc_norm": 1.0, + "acc_greedy": 1.0, + } + + +# ── @benchmark decorator integration ──────────────────────────────────────── + + +def test_benchmark_decorator_wires_mc_kwargs_to_definition() -> None: + """Sanity: the new @benchmark kwargs land on BenchmarkDefinition.""" + from nemo_evaluator.environments.custom import _BYOB_REGISTRY + + @benchmark( + name="test_mc_decorator", + dataset=[{"q": "x", "answer": 0}], + prompt="{q}", + target_field="answer", + choices=["a", "b"], + num_fewshot=1, + fewshot_separator="||", + ) + @scorer( + metric_type="test_score", + outputs=[MetricOutputSpec.continuous_score("ok")], + ) + def _score(sample) -> dict: + return {"ok": 1.0} + + defn = _BYOB_REGISTRY["test_mc_decorator"] + assert defn.choices == ["a", "b"] + assert defn.num_fewshot == 1 + assert defn.fewshot_separator == "||" diff --git a/tests/test_environments/test_custom_metric_contract.py b/tests/test_environments/test_custom_metric_contract.py new file mode 100644 index 000000000..0106b6f55 --- /dev/null +++ b/tests/test_environments/test_custom_metric_contract.py @@ -0,0 +1,152 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for BYOB scorer compatibility with the shared metric contract.""" + +from __future__ import annotations + +from typing import cast + +import pytest +from pydantic import BaseModel + +from nemo_evaluator.environments.custom import BenchmarkDefinition, ByobEnvironment, scorer +from nemo_evaluator.scoring import ScorerInput +from nemo_evaluator.scoring.metric import ( + MetricOutputSpec, +) +from nemo_evaluator.sandbox.base import Sandbox + + +def _dataset() -> list[dict[str, object]]: + return [{"question": "2+2", "answer": "4", "category": "math"}] + + +class ThresholdConfig(BaseModel): + threshold: float + + +@pytest.mark.asyncio +async def test_plain_scorer_decorator_keeps_current_dict_path() -> None: + @scorer + def plain_scorer(sample: ScorerInput) -> dict[str, object]: + assert sample.response == "4" + assert sample.target == "4" + assert sample.metadata["category"] == "math" + assert sample.config["tolerance"] == "exact" + return {"correct": True, "extracted": "4", "label": "exact"} + + env = ByobEnvironment( + BenchmarkDefinition( + name="plain_contract", + dataset=_dataset, + prompt="{question}", + target_field="answer", + extra={"tolerance": "exact"}, + scorer_fn=plain_scorer, + ) + ) + + result = await env.verify("4", "4", category="math") + + assert result.reward == 1.0 + assert result.extracted_answer == "4" + assert result.scoring_details == { + "method": "byob_plain_contract", + "correct": True, + "extracted": "4", + "label": "exact", + } + + +@pytest.mark.asyncio +async def test_typed_scorer_runs_as_metric_through_byob_verify() -> None: + outputs = [ + MetricOutputSpec.continuous_score("reward"), + MetricOutputSpec.continuous_score("format"), + MetricOutputSpec.label("judge_label"), + MetricOutputSpec.label("extracted"), + ] + sandbox = cast(Sandbox, object()) + + @scorer(metric_type="tests.typed_byob", outputs=outputs, config_schema=ThresholdConfig) + async def typed_scorer(sample: ScorerInput[ThresholdConfig]) -> dict[str, object]: + assert sample.response == "4" + assert sample.target == "4" + assert "answer" not in sample.metadata + assert sample.metadata["category"] == "math" + assert isinstance(sample.config, ThresholdConfig) + assert sample.config.threshold == 0.75 + assert sample.sandbox is sandbox + return {"reward": sample.config.threshold, "format": 1.0, "judge_label": "partial", "extracted": "4"} + + env = ByobEnvironment( + BenchmarkDefinition( + name="typed_contract", + dataset=_dataset, + prompt="{question}", + target_field="answer", + extra={"threshold": "0.75"}, + scorer_fn=typed_scorer, + ) + ) + + result = await env.verify("4", "4", sandbox=sandbox, category="math") + + assert result.reward == 0.75 + assert result.extracted_answer == "4" + assert result.scoring_details == { + "method": "byob_typed_contract", + "metric_type": "tests.typed_byob", + "outputs": {"reward": 0.75, "format": 1.0, "judge_label": "partial", "extracted": "4"}, + "reward": 0.75, + "format": 1.0, + "judge_label": "partial", + "extracted": "4", + } + + +@pytest.mark.parametrize( + ("score_names", "score_values", "expected_reward"), + [ + (["reward", "correct"], {"reward": 0.2, "correct": 1.0}, 0.2), + (["quality", "correct"], {"quality": 0.2, "correct": 1.0}, 1.0), + (["quality", "format"], {"quality": 0.4, "format": 1.0}, 0.4), + ], +) +@pytest.mark.asyncio +async def test_typed_scorer_reward_selection( + score_names: list[str], score_values: dict[str, float], expected_reward: float +) -> None: + outputs = [MetricOutputSpec.continuous_score(name) for name in score_names] + + @scorer(metric_type=f"tests.reward_selection.{score_names[0]}", outputs=outputs) + def typed_scorer(sample: ScorerInput) -> dict[str, object]: + assert sample.response == "4" + assert sample.target == "4" + return {name: score_values[name] for name in score_names} + + env = ByobEnvironment( + BenchmarkDefinition( + name=f"typed_reward_{score_names[0]}", + dataset=_dataset, + prompt="{question}", + target_field="answer", + scorer_fn=typed_scorer, + ) + ) + + result = await env.verify("4", "4", category="math") + + assert result.reward == expected_reward diff --git a/tests/test_environments/test_dataset_uri_parsing.py b/tests/test_environments/test_dataset_uri_parsing.py new file mode 100644 index 000000000..c6394437e --- /dev/null +++ b/tests/test_environments/test_dataset_uri_parsing.py @@ -0,0 +1,127 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""HuggingFace URI parsing in ``_load_hf``. + +Locks down the path-segment-with-config behavior used by Sovereign +benchmarks (``hf://CohereForAI/Global-MMLU-Lite/en?split=test``) and the +filter-kwarg row dropping. We patch ``datasets.load_dataset`` so the tests +don't actually hit HuggingFace. +""" + +from __future__ import annotations + +from unittest.mock import patch + +import pytest + +from nemo_evaluator.environments.custom import _extract_filters, _load_hf + + +@pytest.fixture +def captured() -> dict: + return {"calls": []} + + +@pytest.fixture +def fake_load_dataset(captured: dict): + def _fake(*args, **kwargs): + captured["calls"].append({"args": args, "kwargs": kwargs}) + return [{"row_id": 1, "text": "hello", "category": "x"}] + + with patch("datasets.load_dataset", side_effect=_fake) as mock: + yield mock + + +def test_query_split_simple(fake_load_dataset, captured: dict) -> None: + _load_hf("google/boolq?split=validation") + call = captured["calls"][0] + assert call["args"] == ("google/boolq",) + assert call["kwargs"]["split"] == "validation" + + +def test_path_segment_config(fake_load_dataset, captured: dict) -> None: + """``hf://ns/name/config?split=test`` — the config is the third path segment.""" + _load_hf("CohereForAI/Global-MMLU-Lite/en?split=test") + call = captured["calls"][0] + assert call["args"] == ("CohereForAI/Global-MMLU-Lite", "en") + assert call["kwargs"]["split"] == "test" + + +def test_path_segment_config_and_split(fake_load_dataset, captured: dict) -> None: + """``hf://ns/name/config/split`` — both from path segments.""" + _load_hf("CohereForAI/Global-MMLU-Lite/en/test") + call = captured["calls"][0] + assert call["args"] == ("CohereForAI/Global-MMLU-Lite", "en") + assert call["kwargs"]["split"] == "test" + + +def test_query_overrides_path_split(fake_load_dataset, captured: dict) -> None: + _load_hf("ns/name/cfg/train?split=test") + call = captured["calls"][0] + assert call["args"] == ("ns/name", "cfg") + assert call["kwargs"]["split"] == "test" + + +def test_query_overrides_path_config(fake_load_dataset, captured: dict) -> None: + _load_hf("ns/name/cfg?config=other&split=train") + call = captured["calls"][0] + assert call["args"] == ("ns/name", "other") + assert call["kwargs"]["split"] == "train" + + +def test_num_examples_appended_to_split_when_no_slice(fake_load_dataset, captured: dict) -> None: + _load_hf("ds?split=test", num_examples=5) + assert captured["calls"][0]["kwargs"]["split"] == "test[:5]" + + +def test_num_examples_respects_existing_slice(fake_load_dataset, captured: dict) -> None: + _load_hf("ds?split=test[:10]", num_examples=5) + assert captured["calls"][0]["kwargs"]["split"] == "test[:10]" + + +def test_load_hf_filters_rows() -> None: + rows = [{"category": "a", "n": 1}, {"category": "b", "n": 2}, {"category": "a", "n": 3}] + with patch("datasets.load_dataset", return_value=rows): + result = _load_hf("ds?split=test&filter_field=category&filter_value=a") + assert [r["n"] for r in result] == [1, 3] + + +def test_load_hf_filters_with_numeric_suffix() -> None: + rows = [ + {"category": "a", "lang": "en"}, + {"category": "a", "lang": "fr"}, + {"category": "b", "lang": "en"}, + ] + with patch("datasets.load_dataset", return_value=rows): + result = _load_hf( + "ds?split=test&filter_field=category&filter_value=a&filter_field_1=lang&filter_value_1=en" + ) + assert len(result) == 1 + assert result[0]["category"] == "a" + assert result[0]["lang"] == "en" + + +def test_extract_filters_handles_no_filter() -> None: + assert _extract_filters({"split": "test"}) == [] + + +def test_extract_filters_pairs_and_suffix() -> None: + params = { + "filter_field": "x", + "filter_value": "1", + "filter_field_2": "y", + "filter_value_2": "2", + } + assert _extract_filters(params) == [("x", "1"), ("y", "2")] diff --git a/tests/test_integration/test_eval_loop_integration.py b/tests/test_integration/test_eval_loop_integration.py index d96098c24..ba82856a8 100644 --- a/tests/test_integration/test_eval_loop_integration.py +++ b/tests/test_integration/test_eval_loop_integration.py @@ -15,11 +15,16 @@ """Integration tests: run_evaluation end-to-end with mock solver.""" import asyncio +from typing import Any from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult +from nemo_evaluator.environments.custom import BenchmarkDefinition, ByobEnvironment, scorer from nemo_evaluator.engine.eval_loop import run_evaluation from nemo_evaluator.observability.types import ModelResponse +from nemo_evaluator.scoring import ScorerInput +from nemo_evaluator.scoring.metric import MetricOutputSpec +from nemo_evaluator.sandbox.base import Sandbox from nemo_evaluator.solvers import SolveResult @@ -38,7 +43,9 @@ async def seed(self, idx): r = self._dataset[idx] return SeedResult(prompt=r["q"], expected_answer=r["a"], metadata={"idx": idx}) - async def verify(self, response, expected, **meta): + async def verify( + self, response: str, expected: str, sandbox: Sandbox | None = None, **meta: Any + ) -> VerifyResult: correct = response.strip() == expected.strip() return VerifyResult( reward=1.0 if correct else 0.0, extracted_answer=response.strip(), scoring_details={"method": "exact"} @@ -118,7 +125,7 @@ async def tracking_close(): closed.append(True) await original_close() - env.close = tracking_close + env.close = tracking_close # type: ignore[method-assign] # ty: ignore[invalid-assignment] solver = _MockSolver() asyncio.run(run_evaluation(env, solver, n_repeats=1)) assert closed, "env.close() was not called" @@ -191,6 +198,51 @@ def test_concurrent_execution(self): results = bundle["_results"] assert len(results) == 6 + def test_typed_byob_metric_result_preserved_in_results(self): + outputs = [ + MetricOutputSpec.continuous_score("reward"), + MetricOutputSpec.continuous_score("format"), + MetricOutputSpec.label("judge_label"), + MetricOutputSpec.label("rationale"), + ] + + @scorer(metric_type="tests.eval_loop_typed", outputs=outputs) + def typed_scorer(sample: ScorerInput) -> dict[str, object]: + matched = sample.response == sample.target + return { + "reward": 0.8 if matched else 0.0, + "format": 1.0, + "judge_label": "pass", + "rationale": "answer matched", + } + + env = ByobEnvironment( + BenchmarkDefinition( + name="typed_eval_loop", + dataset=lambda: [{"question": "1+1", "answer": "2"}], + prompt="{question}", + target_field="answer", + scorer_fn=typed_scorer, + ) + ) + solver = _MockSolver() + + bundle = asyncio.run(run_evaluation(env, solver, n_repeats=1)) + + result = bundle["_results"][0] + assert result["reward"] == 0.8 + assert result["scoring_details"]["reward"] == 0.8 + assert result["scoring_details"]["format"] == 1.0 + assert result["scoring_details"]["outputs"] == { + "reward": 0.8, + "format": 1.0, + "judge_label": "pass", + "rationale": "answer matched", + } + assert result["scoring_details"]["judge_label"] == "pass" + assert result["scoring_details"]["rationale"] == "answer matched" + assert result["scoring_details"]["metric_type"] == "tests.eval_loop_typed" + class _MockSolverWithTrajectory: """Always correct; returns a non-empty trajectory so we can assert it survives resume.""" diff --git a/tests/test_scoring/test_metric_contract.py b/tests/test_scoring/test_metric_contract.py new file mode 100644 index 000000000..5964b170b --- /dev/null +++ b/tests/test_scoring/test_metric_contract.py @@ -0,0 +1,397 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for the shared MetricInput -> MetricResult contract.""" + +from __future__ import annotations + +from typing import cast + +import pytest +from pydantic import BaseModel, ValidationError + +from nemo_evaluator.environments.custom import scorer +from nemo_evaluator.scoring import ScorerInput +from nemo_evaluator.scoring.metric import ( + CandidateOutput, + ContinuousScore, + DatasetRow, + Label, + MetricDescriptor, + MetricInput, + MetricOutput, + MetricOutputSpec, + MetricResult, + MetricScorerFunction, + ScorerFunctionMetric, + score_names_from_output_spec, + validate_metric_result, +) +from nemo_evaluator.sandbox.base import Sandbox + + +class ThresholdConfig(BaseModel): + threshold: float + label: str = "pass" + + +class OtherThresholdConfig(BaseModel): + threshold: float + label: str = "other" + + +class JudgeDetails(BaseModel): + label: str + rationale: str + confidence: float + + +def test_metric_input_groups_row_and_candidate() -> None: + metric_input = MetricInput( + row=DatasetRow(row_index=7, data={"answer": "Paris", "category": "geography"}), + candidate=CandidateOutput(output_text="Paris", metadata={"model": "mock"}), + ) + + assert metric_input.row.row_index == 7 + assert metric_input.candidate.output_text == "Paris" + assert metric_input.row.data["answer"] == "Paris" + assert not hasattr(metric_input, "sandbox") + assert not hasattr(metric_input, "config") + + +def test_metric_output_spec_convenience_constructors_and_json_schema() -> None: + score = MetricOutputSpec.continuous_score("reward", "Reward score") + label = MetricOutputSpec.label("judge_label") + details = MetricOutputSpec.model("judge_details", JudgeDetails) + + assert score.name == "reward" + assert score.description == "Reward score" + assert score.value_schema is ContinuousScore + assert score.value_json_schema()["type"] == "number" + assert label.value_schema is Label + assert details.value_schema is JudgeDetails + schema_properties = cast(dict[str, object], details.value_json_schema()["properties"]) + confidence_schema = cast(dict[str, object], schema_properties["confidence"]) + assert confidence_schema["type"] == "number" + + +def test_metric_output_spec_coerces_values_to_declared_schema() -> None: + reward = MetricOutputSpec.continuous_score("reward") + details = MetricOutputSpec.model("judge_details", JudgeDetails) + + coerced_reward = reward.coerce_output(MetricOutput(name="reward", value=1)) + coerced_details = details.coerce_value( + {"label": "pass", "rationale": "all checks passed", "confidence": 0.9} + ) + + assert isinstance(coerced_reward, ContinuousScore) + assert coerced_reward.root == 1.0 + assert isinstance(coerced_details, JudgeDetails) + assert coerced_details.label == "pass" + + with pytest.raises(ValueError, match="Expected metric output"): + reward.coerce_output(MetricOutput(name="other", value=1)) + + +@pytest.mark.asyncio +async def test_scorer_function_metric_adapts_dict_return_to_metric_outputs() -> None: + outputs = [ + MetricOutputSpec.boolean("correct"), + MetricOutputSpec.continuous_score("reward"), + MetricOutputSpec.discrete_score("attempts"), + MetricOutputSpec.label("extracted"), + MetricOutputSpec.model("judge", JudgeDetails), + ] + descriptor = MetricDescriptor(type="tests.dict_adapter", outputs=outputs) + + def sync_scorer(sample: ScorerInput) -> dict[str, object]: + return { + "correct": True, + "reward": 0.25, + "attempts": 2, + "extracted": "A", + "judge": {"label": "partial", "rationale": "close", "confidence": 0.5}, + } + + metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=sync_scorer) + + result = await metric.compute_scores( + MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(output_text="candidate")) + ) + + assert {output.name: output.value for output in result.outputs} == { + "correct": True, + "reward": 0.25, + "attempts": 2, + "extracted": "A", + "judge": {"label": "partial", "rationale": "close", "confidence": 0.5}, + } + + +def test_validate_metric_result_accepts_declared_outputs() -> None: + outputs = [ + MetricOutputSpec.continuous_score("reward"), + MetricOutputSpec.boolean("correct"), + MetricOutputSpec.label("label"), + ] + result = MetricResult( + outputs=[ + MetricOutput(name="reward", value=True), + MetricOutput(name="correct", value=True), + MetricOutput(name="label", value="yes"), + ] + ) + + validated = validate_metric_result(result, outputs) + + assert validated.outputs == [ + MetricOutput(name="reward", value=True), + MetricOutput(name="correct", value=True), + MetricOutput(name="label", value="yes"), + ] + + +def test_typed_scorer_decorator_exposes_config_schema() -> None: + outputs = [MetricOutputSpec.continuous_score("reward")] + + @scorer(metric_type="tests.threshold", outputs=outputs, config_schema=ThresholdConfig) + def threshold_scorer(sample: ScorerInput[ThresholdConfig]) -> dict[str, object]: + return {"reward": sample.config.threshold >= 0.5} + + typed_scorer: MetricScorerFunction[ThresholdConfig] = threshold_scorer + metric: ScorerFunctionMetric[ThresholdConfig] = typed_scorer.to_metric() + + assert threshold_scorer.descriptor.config_schema is ThresholdConfig + assert metric.type == "tests.threshold" + + +def test_typed_scorer_decorator_requires_metric_type_and_outputs_for_metric_contract_options() -> None: + with pytest.raises(ValueError, match=r"Metric contract.*outputs=\[MetricOutputSpec"): + scorer(metric_type="tests.missing_outputs") # type: ignore[reportArgumentType] # ty: ignore[invalid-argument-type] + + with pytest.raises(ValueError, match=r"Metric contract.*outputs=\[MetricOutputSpec"): + scorer(config_schema=ThresholdConfig) # type: ignore[reportArgumentType] # ty: ignore[invalid-argument-type] + + with pytest.raises(ValueError, match="no metric_type was declared"): + scorer(outputs=[MetricOutputSpec.continuous_score("reward")]) # type: ignore[call-overload] # ty: ignore[invalid-argument-type] + + +@pytest.mark.asyncio +async def test_scorer_function_metric_prefers_bound_target_over_row_field() -> None: + outputs = [MetricOutputSpec.boolean("reward")] + descriptor = MetricDescriptor(type="tests.bound_target", outputs=outputs) + + def sync_scorer(sample: ScorerInput) -> dict[str, object]: + assert sample.target == "expected-from-verify" + assert sample.metadata["answer"] == "answer-from-row" + return {"reward": True} + + metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=sync_scorer).bind( + target="expected-from-verify", + target_field="answer", + ) + + result = await metric.compute_scores( + MetricInput( + row=DatasetRow(data={"answer": "answer-from-row"}), + candidate=CandidateOutput(output_text="candidate"), + ) + ) + + assert result.outputs == [MetricOutput(name="reward", value=True)] + + +@pytest.mark.asyncio +async def test_scorer_function_metric_accepts_typed_config_model() -> None: + outputs = [MetricOutputSpec.continuous_score("reward")] + descriptor = MetricDescriptor(type="tests.typed_config", outputs=outputs, config_schema=ThresholdConfig) + + def sync_scorer(sample: ScorerInput[ThresholdConfig]) -> dict[str, object]: + assert isinstance(sample.config, ThresholdConfig) + assert sample.config.threshold == 0.5 + assert sample.config.label == "typed" + return {"reward": sample.config.threshold} + + metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=sync_scorer).bind( + config=ThresholdConfig(threshold=0.5, label="typed") + ) + + result = await metric.compute_scores( + MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(output_text="candidate")) + ) + + assert result.outputs == [MetricOutput(name="reward", value=0.5)] + + +@pytest.mark.asyncio +async def test_scorer_function_metric_validates_raw_config_against_typed_schema() -> None: + outputs = [MetricOutputSpec.continuous_score("reward")] + descriptor = MetricDescriptor(type="tests.raw_typed_config", outputs=outputs, config_schema=ThresholdConfig) + + def sync_scorer(sample: ScorerInput[ThresholdConfig]) -> dict[str, object]: + assert isinstance(sample.config, ThresholdConfig) + assert sample.config.threshold == 0.75 + assert sample.config.label == "pass" + return {"reward": sample.config.threshold} + + metric = ScorerFunctionMetric( + descriptor=descriptor, + scorer_fn=sync_scorer, + ).bind_raw_config(config={"threshold": "0.75"}) + + result = await metric.compute_scores( + MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(output_text="candidate")) + ) + + assert result.outputs == [MetricOutput(name="reward", value=0.75)] + + +def test_scorer_function_metric_rejects_invalid_typed_config() -> None: + outputs = [MetricOutputSpec.continuous_score("reward")] + descriptor = MetricDescriptor(type="tests.invalid_config", outputs=outputs, config_schema=ThresholdConfig) + metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=lambda sample: {"reward": True}) + + with pytest.raises(ValidationError): + metric.bind_raw_config(config={"threshold": "not-a-number"}) + + +def test_scorer_function_metric_bind_rejects_wrong_config_model_subtype() -> None: + outputs = [MetricOutputSpec.continuous_score("reward")] + descriptor = MetricDescriptor(type="tests.wrong_config_subtype", outputs=outputs, config_schema=ThresholdConfig) + metric = ScorerFunctionMetric( + descriptor=descriptor, + scorer_fn=lambda sample: {"reward": cast(ThresholdConfig, sample.config).threshold}, + ) + + with pytest.raises(TypeError, match="ThresholdConfig"): + metric.bind(config=OtherThresholdConfig(threshold=0.75)) + + +def test_scorer_function_metric_bind_rejects_raw_mapping_for_typed_config() -> None: + outputs = [MetricOutputSpec.continuous_score("reward")] + descriptor = MetricDescriptor(type="tests.raw_config_on_typed_bind", outputs=outputs, config_schema=ThresholdConfig) + metric = ScorerFunctionMetric( + descriptor=descriptor, + scorer_fn=lambda sample: {"reward": cast(ThresholdConfig, sample.config).threshold}, + ) + + with pytest.raises(TypeError, match="bind_raw_config"): + metric.bind(config={"threshold": 0.75}) + + +def test_validate_metric_result_rejects_duplicate_output_names() -> None: + outputs = [MetricOutputSpec.continuous_score("reward")] + result = MetricResult( + outputs=[ + MetricOutput(name="reward", value=1.0), + MetricOutput(name="reward", value=0.0), + ] + ) + + with pytest.raises(ValueError, match="Duplicate metric output"): + validate_metric_result(result, outputs) + + +def test_validate_metric_result_rejects_missing_declared_outputs() -> None: + outputs = [MetricOutputSpec.continuous_score("reward"), MetricOutputSpec.continuous_score("format")] + result = MetricResult(outputs=[MetricOutput(name="reward", value=1.0)]) + + with pytest.raises(ValueError, match="Missing declared metric outputs"): + validate_metric_result(result, outputs) + + +def test_validate_metric_result_rejects_undeclared_outputs() -> None: + outputs = [MetricOutputSpec.continuous_score("reward")] + result = MetricResult( + outputs=[ + MetricOutput(name="reward", value=1.0), + MetricOutput(name="format", value=1.0), + ] + ) + + with pytest.raises(ValueError, match="Undeclared metric outputs"): + validate_metric_result(result, outputs) + + +def test_validate_metric_result_rejects_value_that_does_not_match_schema() -> None: + outputs = [MetricOutputSpec.model("judge_details", JudgeDetails)] + result = MetricResult(outputs=[MetricOutput(name="judge_details", value={"label": "pass"})]) + + with pytest.raises(ValidationError): + validate_metric_result(result, outputs) + + +@pytest.mark.asyncio +async def test_scorer_function_metric_executes_sync_scorers() -> None: + outputs = [MetricOutputSpec.boolean("reward"), MetricOutputSpec.label("label")] + descriptor = MetricDescriptor(type="tests.sync_metric", outputs=outputs) + sandbox = cast(Sandbox, object()) + + def sync_scorer(sample: ScorerInput) -> dict[str, object]: + assert sample.response == "yes" + assert sample.target == "yes" + assert sample.metadata["category"] == "boolean" + assert sample.config == {"mode": "strict"} + assert sample.sandbox is sandbox + return {"reward": True, "label": "matched"} + + metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=sync_scorer).bind( + config={"mode": "strict"}, + sandbox=sandbox, + target_field="answer", + ) + + result = await metric.compute_scores( + MetricInput( + row=DatasetRow(data={"answer": "yes", "category": "boolean"}), + candidate=CandidateOutput(output_text="yes"), + ) + ) + + assert metric.type == "tests.sync_metric" + assert score_names_from_output_spec(metric.output_spec()) == ["reward"] + assert result.outputs == [MetricOutput(name="reward", value=True), MetricOutput(name="label", value="matched")] + + +@pytest.mark.asyncio +async def test_scorer_function_metric_executes_async_scorers() -> None: + outputs = [MetricOutputSpec.continuous_score("reward"), MetricOutputSpec.label("seen")] + descriptor = MetricDescriptor(type="tests.async_metric", outputs=outputs) + + async def async_scorer(sample: ScorerInput) -> dict[str, object]: + return {"reward": 0.5, "seen": sample.response} + + metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=async_scorer) + + result = await metric.compute_scores( + MetricInput(row=DatasetRow(data={"answer": "yes"}), candidate=CandidateOutput(output_text="maybe")) + ) + + assert metric.type == "tests.async_metric" + assert score_names_from_output_spec(metric.output_spec()) == ["reward"] + assert result.outputs == [MetricOutput(name="reward", value=0.5), MetricOutput(name="seen", value="maybe")] + + +def test_typed_scorer_decorator_exposes_descriptor_and_to_metric() -> None: + outputs = [MetricOutputSpec.boolean("truthful"), MetricOutputSpec.label("judge_grade")] + + @scorer(metric_type="truthfulqa", outputs=outputs) + def truthfulqa_scorer(sample: ScorerInput) -> dict[str, object]: + return {"truthful": bool(sample.response), "judge_grade": "C"} + + metric = truthfulqa_scorer.to_metric() + + assert truthfulqa_scorer.descriptor == MetricDescriptor(type="truthfulqa", outputs=outputs) + assert metric.type == "truthfulqa" + assert score_names_from_output_spec(metric.output_spec()) == ["truthful"] diff --git a/tests/test_scoring/test_multiple_choice.py b/tests/test_scoring/test_multiple_choice.py new file mode 100644 index 000000000..fb8679b09 --- /dev/null +++ b/tests/test_scoring/test_multiple_choice.py @@ -0,0 +1,283 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for multiple-choice scorers on the shared metric contract. + +These tests are the proof that Kanishk PR #1's MC functionality fits Sandy's +shared metric contract (#950) **with zero protocol-type changes**. The +scorers expose ``descriptor`` + ``to_metric()`` via ``@scorer(outputs=...)`` +exactly like any other metric, the per-row payload (choices, logprobs) +flows through ``MetricInput.candidate.metadata``, and the runtime invariants +(declared vs returned, value-schema coercion) all hold. +""" + +from __future__ import annotations + +import pytest + +from nemo_evaluator.scoring.metric import ( + BooleanValue, + CandidateOutput, + ContinuousScore, + DatasetRow, + MetricInput, + MetricOutputSpec, +) +from nemo_evaluator.scoring.multiple_choice import ( + _resolve_gold_index, + mcq_letter_extract, + multiple_choice_acc, +) + + +def _mc_input(*, target: object, choices: list[str], logprobs: list[float], is_greedy: list[bool]) -> MetricInput: + """Build a MetricInput shaped the way LogprobRankingSolver populates it.""" + argmax_idx = max(range(len(logprobs)), key=lambda i: logprobs[i]) if logprobs else 0 + return MetricInput( + row=DatasetRow(data={"target": target}), + candidate=CandidateOutput( + output_text=choices[argmax_idx] if choices else "", + metadata={ + "_mc_choices": choices, + "_mc_choices_logprobs": logprobs, + "_mc_choices_is_greedy": is_greedy, + }, + ), + ) + + +# ── Decorator + descriptor surface ────────────────────────────────────────── + + +def test_multiple_choice_acc_exposes_descriptor_with_three_continuous_scores() -> None: + desc = multiple_choice_acc.descriptor + assert desc.type == "multiple_choice_acc" + assert [o.name for o in desc.outputs] == ["acc", "acc_norm", "acc_greedy"] + for output in desc.outputs: + assert output.value_schema is ContinuousScore + + +def test_mcq_letter_extract_exposes_descriptor_with_correct_and_parsed() -> None: + desc = mcq_letter_extract.descriptor + assert desc.type == "mcq_letter_extract" + names = {o.name: o.value_schema for o in desc.outputs} + assert names == {"correct": ContinuousScore, "parsed": BooleanValue} + + +def test_to_metric_materializes_satisfying_metric_protocol() -> None: + metric = multiple_choice_acc.to_metric() + assert metric.type == "multiple_choice_acc" + assert [o.name for o in metric.output_spec()] == ["acc", "acc_norm", "acc_greedy"] + + +# ── multiple_choice_acc semantics ─────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_mc_acc_perfect_match() -> None: + metric = multiple_choice_acc.to_metric() + result = await metric.compute_scores( + _mc_input( + target=2, + choices=["A", "B", "Paris", "London"], + logprobs=[-5.1, -4.8, -1.2, -3.7], + is_greedy=[False, False, True, False], + ) + ) + outputs = {o.name: o.value for o in result.outputs} + assert outputs == {"acc": 1.0, "acc_norm": 1.0, "acc_greedy": 1.0} + + +@pytest.mark.asyncio +async def test_mc_acc_wrong_answer() -> None: + metric = multiple_choice_acc.to_metric() + result = await metric.compute_scores( + _mc_input( + target=0, + choices=["A", "B", "Paris", "London"], + logprobs=[-5.1, -4.8, -1.2, -3.7], + is_greedy=[False, False, True, False], + ) + ) + outputs = {o.name: o.value for o in result.outputs} + assert outputs == {"acc": 0.0, "acc_norm": 0.0, "acc_greedy": 0.0} + + +@pytest.mark.asyncio +async def test_mc_acc_norm_diverges_from_acc_when_choices_differ_in_length() -> None: + """Length normalization can flip argmax: a long choice may have lower raw + logprob (more tokens to multiply through) but a higher per-character one.""" + metric = multiple_choice_acc.to_metric() + result = await metric.compute_scores( + _mc_input( + target=1, + choices=["A", "ABCDEFGHIJ"], + logprobs=[-1.5, -3.0], # raw argmax = 0 + is_greedy=[False, False], + ) + ) + # raw: -1.5 > -3.0 → argmax = 0, so acc = 0 + # norm: -1.5/1 = -1.5 vs -3.0/10 = -0.3 → argmax = 1, so acc_norm = 1 + outputs = {o.name: o.value for o in result.outputs} + assert outputs["acc"] == 0.0 + assert outputs["acc_norm"] == 1.0 + + +@pytest.mark.asyncio +async def test_mc_acc_greedy_zero_when_no_choice_is_greedy() -> None: + metric = multiple_choice_acc.to_metric() + result = await metric.compute_scores( + _mc_input( + target=2, + choices=["A", "B", "Paris", "London"], + logprobs=[-5.1, -4.8, -1.2, -3.7], + is_greedy=[False, False, False, False], + ) + ) + outputs = {o.name: o.value for o in result.outputs} + assert outputs["acc"] == 1.0 + assert outputs["acc_greedy"] == 0.0 + + +@pytest.mark.asyncio +async def test_mc_acc_returns_zeros_when_payload_missing() -> None: + """No choices in metadata → all metrics 0.0 (graceful degrade).""" + metric = multiple_choice_acc.to_metric() + result = await metric.compute_scores( + MetricInput( + row=DatasetRow(data={"target": 1}), + candidate=CandidateOutput(output_text="anything"), + ) + ) + outputs = {o.name: o.value for o in result.outputs} + assert outputs == {"acc": 0.0, "acc_norm": 0.0, "acc_greedy": 0.0} + + +# ── _resolve_gold_index heterogeneous targets ─────────────────────────────── + + +@pytest.mark.parametrize( + ("target", "choices", "expected"), + [ + (2, ["A", "B", "C", "D"], 2), + (-1, ["A", "B"], None), + (5, ["A", "B"], None), + ("C", ["A", "B", "C", "D"], 2), + ("c", ["A", "B", "C", "D"], 2), + ("A", ["A", "B"], 0), + ("Z", ["A", "B"], None), + ("0", ["A", "B"], 0), + ("3", ["A", "B"], None), + ("Paris", ["London", "Paris"], 1), + ("paris", ["London", "Paris"], 1), + ("Berlin", ["London", "Paris"], None), + ("", ["A", "B"], None), + (None, ["A", "B"], None), + ], +) +def test_resolve_gold_index_heterogeneous( + target: object, choices: list[str], expected: int | None +) -> None: + assert _resolve_gold_index(target, choices) == expected + + +# ── End-to-end: integration with shared contract ──────────────────────────── + + +@pytest.mark.asyncio +async def test_candidate_metadata_flows_to_legacy_scorer_through_translator() -> None: + """Proof: ``MetricInput.candidate.metadata`` reaches the legacy scorer. + + Without the merge fix in ``ScorerFunctionMetric.compute_scores``, this + test fails because the translator builds ``ScorerInput.metadata`` from + ``row.data`` only and the scorer reads zeros. + """ + metric = multiple_choice_acc.to_metric() + result = await metric.compute_scores( + MetricInput( + row=DatasetRow(data={"target": 0, "question": "What's hot?"}), + candidate=CandidateOutput( + output_text="lava", + metadata={ + "_mc_choices": ["lava", "ice"], + "_mc_choices_logprobs": [-0.1, -3.0], + "_mc_choices_is_greedy": [True, False], + }, + ), + ) + ) + outputs = {o.name: o.value for o in result.outputs} + assert outputs == {"acc": 1.0, "acc_norm": 1.0, "acc_greedy": 1.0} + + +@pytest.mark.asyncio +async def test_mcq_letter_extract_basic_letter_match() -> None: + metric = mcq_letter_extract.to_metric() + result = await metric.compute_scores( + MetricInput( + row=DatasetRow(data={"target": "B"}), + candidate=CandidateOutput(output_text="The answer is B."), + ) + ) + outputs = {o.name: o.value for o in result.outputs} + assert outputs == {"correct": 1.0, "parsed": True} + + +@pytest.mark.asyncio +async def test_mcq_letter_extract_no_letter_in_response() -> None: + metric = mcq_letter_extract.to_metric() + result = await metric.compute_scores( + MetricInput( + row=DatasetRow(data={"target": "C"}), + candidate=CandidateOutput(output_text="????"), + ) + ) + outputs = {o.name: o.value for o in result.outputs} + assert outputs == {"correct": 0.0, "parsed": False} + + +@pytest.mark.asyncio +async def test_mcq_letter_extract_int_target() -> None: + metric = mcq_letter_extract.to_metric() + result = await metric.compute_scores( + MetricInput( + row=DatasetRow(data={"target": 1}), + candidate=CandidateOutput(output_text="Answer: B"), + ) + ) + outputs = {o.name: o.value for o in result.outputs} + assert outputs == {"correct": 1.0, "parsed": True} + + +@pytest.mark.asyncio +async def test_validate_metric_result_enforces_declared_outputs() -> None: + """Sandy's validate_metric_result rejects extra/missing keys at runtime.""" + from nemo_evaluator.scoring.metric import ( + MetricDescriptor, + MetricOutput, + MetricResult, + validate_metric_result, + ) + + desc = MetricDescriptor( + type="multiple_choice_acc", + outputs=[ + MetricOutputSpec.continuous_score("acc"), + MetricOutputSpec.continuous_score("acc_norm"), + MetricOutputSpec.continuous_score("acc_greedy"), + ], + ) + bad = MetricResult(outputs=[MetricOutput(name="acc", value=1.0)]) + with pytest.raises(ValueError, match="Missing declared metric outputs"): + validate_metric_result(bad, desc.outputs) diff --git a/tests/test_solvers/test_logprob_solver.py b/tests/test_solvers/test_logprob_solver.py new file mode 100644 index 000000000..51ddf5c5b --- /dev/null +++ b/tests/test_solvers/test_logprob_solver.py @@ -0,0 +1,179 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for :class:`LogprobRankingSolver` and the response parser.""" + +from __future__ import annotations + +from unittest.mock import patch + +import pytest + +from nemo_evaluator.environments.base import SeedResult +from nemo_evaluator.solvers.base import ErrorKind +from nemo_evaluator.solvers.logprob import ( + LogprobRankingSolver, + _parse_loglikelihood_response, +) + + +# ── _parse_loglikelihood_response ─────────────────────────────────────────── + + +def _build_logprobs_response( + *, + tokens: list[str], + token_logprobs: list[float | None], + text_offset: list[int | None], + top_logprobs: list[dict[str, float] | None] | None = None, +) -> dict: + """Shape an OpenAI-compatible /completions response body.""" + return { + "choices": [ + { + "logprobs": { + "tokens": tokens, + "token_logprobs": token_logprobs, + "text_offset": text_offset, + "top_logprobs": top_logprobs or [{tok: lp or 0.0} for tok, lp in zip(tokens, token_logprobs)], + } + } + ] + } + + +def test_parse_response_sums_continuation_logprobs() -> None: + body = _build_logprobs_response( + tokens=["The", " cat", " sat"], + token_logprobs=[None, -0.5, -0.3], + text_offset=[0, 3, 7], + ) + sum_lp, _ = _parse_loglikelihood_response(body, context="The") + assert sum_lp == pytest.approx(-0.8) + + +def test_parse_response_returns_minus_inf_when_logprobs_missing() -> None: + body: dict = {"choices": [{"text": "x"}]} # no logprobs field + sum_lp, greedy = _parse_loglikelihood_response(body, context="ctx") + assert sum_lp == float("-inf") + assert greedy is False + + +def test_parse_response_is_greedy_true_when_top1_matches_tokens() -> None: + body = _build_logprobs_response( + tokens=["The", " cat"], + token_logprobs=[None, -0.5], + text_offset=[0, 3], + top_logprobs=[{"The": -0.1, "An": -1.5}, {" cat": -0.5, " dog": -2.0}], + ) + _, greedy = _parse_loglikelihood_response(body, context="The") + assert greedy is True + + +def test_parse_response_is_greedy_false_when_top1_differs() -> None: + body = _build_logprobs_response( + tokens=["The", " cat"], + token_logprobs=[None, -0.5], + text_offset=[0, 3], + top_logprobs=[{"The": -0.1}, {" dog": -0.1, " cat": -0.5}], + ) + _, greedy = _parse_loglikelihood_response(body, context="The") + assert greedy is False + + +def test_parse_response_handles_token_straddling_with_recursive_fallback() -> None: + """If text_offset never crosses ctx_len, walk back from end.""" + # All tokens have offset < 3 (i.e. text_offset never reaches ctx_len), + # falls back to last token containing some continuation. + body = _build_logprobs_response( + tokens=["The", " cat"], + token_logprobs=[None, -0.4], + text_offset=[0, 1], # offsets all < 5 (ctx_len) + ) + sum_lp, _ = _parse_loglikelihood_response(body, context="The c") + # Fallback: only the last token's logprob is summed + assert sum_lp == pytest.approx(-0.4) + + +def test_parse_response_empty_continuation_returns_minus_inf() -> None: + body = _build_logprobs_response( + tokens=["a", "b"], + token_logprobs=[None, None], + text_offset=[0, 1], + ) + sum_lp, _ = _parse_loglikelihood_response(body, context="ab") + assert sum_lp == float("-inf") + + +# ── LogprobRankingSolver.solve ────────────────────────────────────────────── + + +def _seed_with_choices(choices: list[str]) -> SeedResult: + return SeedResult( + prompt="Q: 2+2 = ", + expected_answer="4", + metadata={"_mc_choices": choices, "source": "byob"}, + ) + + +@pytest.mark.asyncio +async def test_solve_ranks_choices_by_sum_logprob() -> None: + # Three choices; the second one wins by raw logprob sum. + responses = [ + _build_logprobs_response(tokens=["Q: 2+2 = ", "3"], token_logprobs=[None, -2.5], text_offset=[0, 9]), + _build_logprobs_response(tokens=["Q: 2+2 = ", "4"], token_logprobs=[None, -0.5], text_offset=[0, 9]), + _build_logprobs_response(tokens=["Q: 2+2 = ", "5"], token_logprobs=[None, -3.0], text_offset=[0, 9]), + ] + seed = _seed_with_choices(["3", "4", "5"]) + response_iter = iter(responses) + + solver = LogprobRankingSolver(base_url="http://fake", model="fake-model") + + async def fake_post(url, payload): + return next(response_iter) + + with patch.object(solver._model_client, "_post_with_retry", side_effect=fake_post): + result = await solver.solve(seed) + + assert result.response == "4" + assert result.error is None + assert result.scoring_details["_mc_choices"] == ["3", "4", "5"] + assert result.scoring_details["_mc_choices_logprobs"] == pytest.approx([-2.5, -0.5, -3.0]) + assert len(result.scoring_details["_mc_choices_is_greedy"]) == 3 + + +@pytest.mark.asyncio +async def test_solve_returns_graceful_error_when_choices_missing() -> None: + seed = SeedResult(prompt="anything", expected_answer="?") + solver = LogprobRankingSolver(base_url="http://fake", model="fake-model") + result = await solver.solve(seed) + assert result.response == "" + assert result.error_kind == ErrorKind.GRACEFUL + assert "_mc_choices" in (result.error or "") + + +@pytest.mark.asyncio +async def test_solve_propagates_inference_failures_as_infra_error() -> None: + seed = _seed_with_choices(["a", "b"]) + solver = LogprobRankingSolver(base_url="http://fake", model="fake-model") + + async def boom(url, payload): + raise RuntimeError("connection refused") + + with patch.object(solver._model_client, "_post_with_retry", side_effect=boom): + result = await solver.solve(seed) + + assert result.response == "" + assert result.error_kind == ErrorKind.INFRA + assert "connection refused" in (result.error or "")