NVIDIA-NeMo · wprazuch · Apr 29, 2026 · May 4, 2026 · May 4, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,25 @@
 
 ## 0.13.0 (unreleased)
 
+### Shared Metric Contract
+
+- Added public `MetricInput -> MetricResult` scorer/metric runtime types and `ScorerFunctionMetric`.
+- Extended BYOB `@scorer` with typed scorer metadata and `to_metric()` while preserving current dict scorer behavior.
+- Added optional `config_schema` support for typed scorer configs while keeping raw dict configs as the default.
+- Split typed scorer config binding into strict `bind(config=ConfigModel(...))` and coercive `bind_raw_config(config={...})` paths.
+
+### Multiple-Choice Loglikelihood + Few-Shot (lm-evaluation-harness parity)
+
+Demonstrates that non-trivial benchmark machinery composes with the shared metric contract **without protocol-type changes**: the shapes of `MetricInput`, `MetricResult`, `MetricDescriptor`, and `MetricOutputSpec` are untouched.
+
+- **`@scorer`-typed `multiple_choice_acc`** in `nemo_evaluator.scoring.multiple_choice`: returns `acc`, `acc_norm`, `acc_greedy`. Reads candidate continuations + per-choice loglikelihoods from `MetricInput.candidate.metadata`.
+- **`@scorer`-typed `mcq_letter_extract`**: free-form letter extraction (A-J), returns `correct` (continuous) and `parsed` (boolean).
+- **`LogprobRankingSolver`** in `nemo_evaluator.solvers.logprob`: ranks candidate continuations via `/completions` (`max_tokens=0, echo=true, logprobs=1`), parses continuation spans via `text_offset`. Per-choice calls run concurrently behind `max_concurrent_choices`.
+- **`@benchmark` extensions**: `choices`, `choices_field` (with dotted-path resolution), `num_fewshot`, `fewshot_split`, `fewshot_template`, `fewshot_separator`, `fewshot_seed`. Few-shot prefix is rendered in `ByobEnvironment.seed()`.
+- **`_load_hf` dataset URI parsing**: path-segment configs (`hf://ns/name/config[/split]`) and row filters (`?filter_field=...&filter_value=...`). Required for namespaced multilingual datasets like `CohereForAI/Global-MMLU-Lite/en`.
+- **Eval loop forwards solver `scoring_details` to `env.verify` kwargs**: lets a solver push per-row payloads to the scorer. `_metric_input_from_verify` lifts `_mc_*`/`_solver_*` namespaced keys onto `MetricInput.candidate.metadata` rather than `row.data`.
+- **`ScorerFunctionMetric.compute_scores` merges `candidate.metadata` into legacy `ScorerInput.metadata`** so legacy `(ScorerInput) -> dict` scorers see solver-emitted payloads.
+
 ### Adapter Proxy (Breaking — replaces LiteLLM)
 
 - **LiteLLM removed**: The `litellm` dependency, `proxy` and `proxy-full` extras, and `litellm_settings` config field are all removed. The adapter proxy is now built-in with zero external proxy dependencies.

diff --git a/scripts/smoketest_logprob_solver.py b/scripts/smoketest_logprob_solver.py
@@ -0,0 +1,169 @@
+"""End-to-end smoke test for LogprobRankingSolver against a fake /v1/completions.
+
+Spins up an aiohttp server that returns OpenAI-shape responses with
+deterministic logprobs derived from the continuation. This validates:
+
+1. The HTTP wire format the solver emits (max_tokens=0, echo=true, logprobs=1).
+2. The text_offset-based continuation parsing.
+3. Concurrent per-choice ranking + argmax selection.
+4. End-to-end seed → solve → verify through ByobEnvironment with the
+   typed multiple_choice_acc scorer.
+
+Run:
+    python scripts/smoketest_logprob_solver.py
+"""
+
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import math
+from typing import Any
+
+from aiohttp import web
+
+from nemo_evaluator.environments.custom import BenchmarkDefinition, ByobEnvironment
+from nemo_evaluator.scoring.multiple_choice import multiple_choice_acc
+from nemo_evaluator.solvers.logprob import LogprobRankingSolver
+
+
+def _deterministic_logprob(continuation: str, *, target: str) -> float:
+    """Return a reproducible fake logprob for ``continuation``.
+
+    The gold ``target`` scores -0.5. Any other continuation is mapped via a
+    SHA-256 bucket (0-99) to a value of -2.0 or lower, so gold ranks first.
+    """
+    if continuation != target:
+        digest_prefix = hashlib.sha256(continuation.encode()).hexdigest()[:8]
+        bucket = int(digest_prefix, 16) % 100
+        return -2.0 - bucket / 25.0
+    return -0.5
+
+
+async def fake_completions_handler(request: web.Request, *, gold_per_prompt: dict[str, str]) -> web.Response:
+    """Answer a fake ``/v1/completions`` call with a deterministic logprob.
+
+    ``gold_per_prompt`` maps each prompt stem to its gold continuation. The
+    text after the first known stem found in the prompt is scored, and the
+    reply lists two tokens (``<ctx>`` and that text) with ``text_offset``
+    pointing at where the scored text starts.
+    """
+    body = await request.json()
+    prompt = body["prompt"]
+
+    # A single scan yields the stem together with its gold continuation.
+    stem, gold = next(((s, g) for s, g in gold_per_prompt.items() if s in prompt), ("", ""))
+    # Split on the stem instead of slicing by its length: the match is a
+    # substring test, so the stem need not start at offset 0. With no known
+    # stem, fall back to the last 10 characters of the prompt.
+    continuation = prompt.partition(stem)[2] if stem else prompt[-10:]
+    logprob = _deterministic_logprob(continuation, target=gold)
+
+    # The continuation is a suffix of the prompt in both branches, so this
+    # is the offset where it begins.
+    ctx_end = len(prompt) - len(continuation)
+    logprobs = {
+        "tokens": ["<ctx>", continuation],
+        "token_logprobs": [None, logprob],
+        "text_offset": [0, ctx_end],
+        "top_logprobs": [
+            {"<ctx>": -0.01},
+            {continuation: logprob, "_other": logprob - 1.0},
+        ],
+    }
+    return web.json_response(
+        {
+            "choices": [
+                {
+                    "text": "",
+                    "finish_reason": "length",
+                    "logprobs": logprobs,
+                }
+            ],
+            "model": body["model"],
+            "usage": {"prompt_tokens": len(prompt), "completion_tokens": 0, "total_tokens": len(prompt)},
+        }
+    )
+
+
+def make_app(gold_per_prompt: dict[str, str]) -> web.Application:
+    """Create the fake server app, serving POST ``/v1/completions``."""
+    async def _serve(request: web.Request) -> web.Response:
+        return await fake_completions_handler(request, gold_per_prompt=gold_per_prompt)
+    application = web.Application()
+    application.router.add_post("/v1/completions", _serve)
+    return application
+
+
+async def run_smoketest() -> None:
+    """Drive seed → solve → verify for three rows against the fake server.
+
+    Raises AssertionError when the solver errors or a row's ``acc`` is not 1.0.
+    """
+    # Each row: the question text and the gold index into `choices`.
+    benchmark_rows = [
+        {"q": "Capital of France?", "answer": 2},
+        {"q": "Capital of UK?", "answer": 3},
+        {"q": "Capital of Germany?", "answer": 0},
+    ]
+    choices = ["Berlin", "Madrid", "Paris", "London"]
+    gold_per_prompt = {f"Q: {row['q']}\nA: ": choices[row["answer"]] for row in benchmark_rows}
+
+    # Start fake server
+    runner = web.AppRunner(make_app(gold_per_prompt))
+    await runner.setup()
+    solver = None  # assigned inside `try` so `finally` can tell whether to close it
+    try:
+        await web.TCPSite(runner, "127.0.0.1", 11999).start()
+        defn = BenchmarkDefinition(
+            name="capitals_smoke",
+            dataset=lambda: benchmark_rows,
+            prompt="Q: {q}\nA: ",
+            target_field="answer",
+            choices=choices,
+            scorer_fn=multiple_choice_acc,
+        )
+        env = ByobEnvironment(defn)
+        solver = LogprobRankingSolver(
+            base_url="http://127.0.0.1:11999/v1",
+            model="fake-mc-oracle",
+        )
+
+        print(f"running {len(benchmark_rows)} rows × {len(choices)} choices each\n")
+        per_row_results = []
+        for idx, row in enumerate(benchmark_rows):
+            seed = await env.seed(idx)
+            solve = await solver.solve(seed)
+            assert solve.error is None, f"solver error: {solve.error}"
+            merged_meta = {**seed.metadata, **solve.scoring_details}
+            vr = await env.verify(solve.response, seed.expected_answer, **merged_meta)
+            per_row_results.append((row, solve, vr.scoring_details.get("outputs", {})))
+
+        # Summary
+        print(f"{'question':<22} {'argmax':<10} {'gold':<10} {'acc':<6} {'logprobs'}")
+        print("─" * 96)
+        all_correct = True
+        for row, solve, outputs in per_row_results:
+            lps = solve.scoring_details["_mc_choices_logprobs"]
+            lps_str = "  ".join(f"{c}={lp:.2f}" for c, lp in zip(choices, lps))
+            acc = outputs.get("acc", 0.0)
+            all_correct = all_correct and acc == 1.0
+            print(f"{row['q']:<22} {solve.response:<10} {choices[row['answer']]:<10} {acc:<6} {lps_str}")
+
+        print()
+        print(f"all correct: {all_correct}")
+        for row, solve, outputs in per_row_results:
+            assert outputs.get("acc") == 1.0, f"acc != 1.0 for {row['q']!r}: {outputs}"
+            assert math.isfinite(solve.scoring_details["_mc_choices_logprobs"][0])
+        print("OK: end-to-end seed → solve → verify works through real HTTP")
+    finally:
+        # Close the solver even when an assertion fails; always stop the server.
+        try:
+            if solver is not None:
+                await solver.close()
+        finally:
+            await runner.cleanup()
+
+
+if __name__ == "__main__":
+    asyncio.run(run_smoketest())  # script entry point: run the smoke test to completion
diff --git a/src/nemo_evaluator/__init__.py b/src/nemo_evaluator/__init__.py
@@ -16,22 +16,30 @@
 
 __version__ = "0.12.0"
 
+from nemo_evaluator.engine.eval_loop import run_evaluation
+from nemo_evaluator.engine.model_client import ModelClient
 from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult
 from nemo_evaluator.environments.custom import benchmark, scorer
 from nemo_evaluator.environments.registry import get_environment, list_environments, load_benchmark_file, register
-from nemo_evaluator.engine.eval_loop import run_evaluation
-from nemo_evaluator.engine.model_client import ModelClient
-from nemo_evaluator.solvers import (
-    ChatSolver,
-    CompletionSolver,
-    NatSolver,
-    OpenClawSolver,
-    Solver,
-    SolveResult,
-    VLMSolver,
-)
 from nemo_evaluator.scoring import (
+    BooleanValue,
+    CandidateOutput,
+    ContinuousScore,
+    DatasetRow,
+    DiscreteScore,
+    Label,
+    Metric,
+    MetricDescriptor,
+    MetricInput,
+    MetricOutput,
+    MetricOutputSpec,
+    MetricResult,
+    MetricScorerFunction,
+    ScorerCallable,
+    ScorerConfig,
+    ScorerFunctionMetric,
     ScorerInput,
+    ScorerReturn,
     answer_line,
     code_sandbox,
     code_sandbox_async,
@@ -40,6 +48,18 @@
     multichoice_regex,
     needs_judge,
     numeric_match,
+    score_names_from_output_spec,
+)
+from nemo_evaluator.scoring.multiple_choice import mcq_letter_extract, multiple_choice_acc
+from nemo_evaluator.solvers import (
+    ChatSolver,
+    CompletionSolver,
+    LogprobRankingSolver,
+    NatSolver,
+    OpenClawSolver,
+    Solver,
+    SolveResult,
+    VLMSolver,
 )
 
 __all__ = [
@@ -57,6 +77,7 @@
     "Solver",
     "ChatSolver",
     "CompletionSolver",
+    "LogprobRankingSolver",
     "NatSolver",
     "OpenClawSolver",
     "VLMSolver",
@@ -65,6 +86,24 @@
     "benchmark",
     "scorer",
     "ScorerInput",
+    "Metric",
+    "BooleanValue",
+    "DatasetRow",
+    "CandidateOutput",
+    "ContinuousScore",
+    "DiscreteScore",
+    "Label",
+    "MetricInput",
+    "MetricOutput",
+    "MetricOutputSpec",
+    "MetricDescriptor",
+    "MetricResult",
+    "MetricScorerFunction",
+    "ScorerCallable",
+    "ScorerConfig",
+    "ScorerFunctionMetric",
+    "ScorerReturn",
+    "score_names_from_output_spec",
     # Scoring primitives
     "exact_match",
     "multichoice_regex",
@@ -73,5 +112,7 @@
     "numeric_match",
     "code_sandbox",
     "code_sandbox_async",
+    "mcq_letter_extract",
+    "multiple_choice_acc",
     "needs_judge",
 ]
diff --git a/src/nemo_evaluator/engine/eval_loop.py b/src/nemo_evaluator/engine/eval_loop.py
@@ -435,11 +435,18 @@ async def _run_step(idx: int, slot: int, rep: int, seed_result, seed_ms: float):
                         logger.debug("p%d r%d: using pre-computed reward=%.4f", idx, rep, vr.reward)
                     elif not _solve_failed:
                         verify_sandbox = await lifecycle.get_verify_sandbox()
+                        # Forward solver-emitted payload (e.g. logprobs, ranking
+                        # metadata) into verify alongside seed metadata. Keys
+                        # collide rarely in practice; solver payload wins on
+                        # collision because it is the more recent, more
+                        # specific source.
+                        solver_meta = solve_result.scoring_details if solve_result else {}
+                        merged_meta = {**seed_result.metadata, **solver_meta}
                         vr = await env.verify(
                             response_text,
                             seed_result.expected_answer,
                             sandbox=verify_sandbox,
-                            **seed_result.metadata,
+                            **merged_meta,
                         )
                     break  # success — exit retry loop