diff --git a/CHANGELOG.md b/CHANGELOG.md
index ce0d635b0..df928213f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,25 @@
 
 ## 0.13.0 (unreleased)
 
+### Shared Metric Contract
+
+- Added public `MetricInput -> MetricResult` scorer/metric runtime types and `ScorerFunctionMetric`.
+- Extended BYOB `@scorer` with typed scorer metadata and `to_metric()` while preserving current dict scorer behavior.
+- Added optional `config_schema` support for typed scorer configs while keeping raw dict configs as the default.
+- Split typed scorer config binding into strict `bind(config=ConfigModel(...))` and coercive `bind_raw_config(config={...})` paths.
+
+### Multiple-Choice Loglikelihood + Few-Shot (lm-evaluation-harness parity)
+
+Demonstrates non-trivial benchmark machinery composing with the shared metric contract **without protocol-type changes**. `MetricInput`, `MetricResult`, `MetricDescriptor`, `MetricOutputSpec` shapes untouched.
+
+- **`@scorer`-typed `multiple_choice_acc`** in `nemo_evaluator.scoring.multiple_choice`: returns `acc`, `acc_norm`, `acc_greedy`. Reads candidate continuations + per-choice loglikelihoods from `MetricInput.candidate.metadata`.
+- **`@scorer`-typed `mcq_letter_extract`**: free-form letter extraction (A-J), returns `correct` (continuous) and `parsed` (boolean).
+- **`LogprobRankingSolver`** in `nemo_evaluator.solvers.logprob`: ranks candidate continuations via `/completions` (`max_tokens=0, echo=true, logprobs=1`), parses continuation spans via `text_offset`. Per-choice calls run concurrently behind `max_concurrent_choices`.
+- **`@benchmark` extensions**: `choices`, `choices_field` (with dotted-path resolution), `num_fewshot`, `fewshot_split`, `fewshot_template`, `fewshot_separator`, `fewshot_seed`. Few-shot prefix is rendered in `ByobEnvironment.seed()`.
+- **`_load_hf` dataset URI parsing**: path-segment configs (`hf://ns/name/config[/split]`) and row filters (`?filter_field=...&filter_value=...`). Required for namespaced multilingual datasets like `CohereForAI/Global-MMLU-Lite/en`.
+- **Eval loop forwards solver `scoring_details` to `env.verify` kwargs**: lets a solver push per-row payloads to the scorer. `_metric_input_from_verify` lifts `_mc_*`/`_solver_*` namespaced keys onto `MetricInput.candidate.metadata` rather than `row.data`.
+- **`ScorerFunctionMetric.compute_scores` merges `candidate.metadata` into legacy `ScorerInput.metadata`** so legacy `(ScorerInput) -> dict` scorers see solver-emitted payloads.
+
 ### Adapter Proxy (Breaking — replaces LiteLLM)
 
 - **LiteLLM removed**: The `litellm` dependency, `proxy` and `proxy-full` extras, and `litellm_settings` config field are all removed. The adapter proxy is now built-in with zero external proxy dependencies.
diff --git a/scripts/smoketest_logprob_solver.py b/scripts/smoketest_logprob_solver.py
new file mode 100644
index 000000000..8a323c98b
--- /dev/null
+++ b/scripts/smoketest_logprob_solver.py
@@ -0,0 +1,169 @@
+"""End-to-end smoke test for LogprobRankingSolver against a fake /v1/completions.
+
+Spins up an aiohttp server that returns OpenAI-shape responses with
+deterministic logprobs derived from the continuation. This validates:
+
+1. The HTTP wire format the solver emits (max_tokens=0, echo=true, logprobs=1).
+2. The text_offset-based continuation parsing.
+3. Concurrent per-choice ranking + argmax selection.
+4. End-to-end seed → solve → verify through ByobEnvironment with the
+   typed multiple_choice_acc scorer.
+
+Run:
+    python scripts/smoketest_logprob_solver.py
+"""
+
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import math
+from typing import Any
+
+from aiohttp import web
+
+from nemo_evaluator.environments.custom import BenchmarkDefinition, ByobEnvironment
+from nemo_evaluator.scoring.multiple_choice import multiple_choice_acc
+from nemo_evaluator.solvers.logprob import LogprobRankingSolver
+
+
+def _deterministic_logprob(continuation: str, *, target: str) -> float:
+    """Deterministic per-continuation logprob: gold gets the highest score.
+
+    The "model" prefers the gold continuation (mocking a perfect oracle).
+    Other continuations get logprobs derived from a stable hash so the
+    ranking is reproducible across runs.
+    """
+    if continuation == target:
+        return -0.5
+    h = int(hashlib.sha256(continuation.encode()).hexdigest()[:8], 16)
+    return -2.0 - (h % 100) / 25.0
+
+
+async def fake_completions_handler(request: web.Request, *, gold_per_prompt: dict[str, str]) -> web.Response:
+    body = await request.json()
+    prompt = body["prompt"]
+    # The benchmark prompt is a stable substring of `prompt`; find which
+    # gold continuation belongs to it.
+    gold = next((g for stem, g in gold_per_prompt.items() if stem in prompt), "")
+
+    # Identify the continuation: it's whatever came AFTER the longest
+    # benchmark stem we recognise. For the smoke test we just take the
+    # last word as the continuation token sequence.
+    matched_stem = next((stem for stem in gold_per_prompt if stem in prompt), "")
+    continuation = prompt[len(matched_stem):] if matched_stem else prompt[-10:]
+
+    logprob = _deterministic_logprob(continuation, target=gold)
+
+    # Synthesize a token-level response with text_offset that puts the
+    # continuation just after `matched_stem`.
+    ctx_end = len(matched_stem)
+    tokens = ["<ctx>", continuation]
+    token_logprobs = [None, logprob]
+    text_offset = [0, ctx_end]
+    top_logprobs = [
+        {"<ctx>": -0.01},
+        {continuation: logprob, "_other": logprob - 1.0},
+    ]
+
+    return web.json_response(
+        {
+            "choices": [
+                {
+                    "text": "",
+                    "finish_reason": "length",
+                    "logprobs": {
+                        "tokens": tokens,
+                        "token_logprobs": token_logprobs,
+                        "text_offset": text_offset,
+                        "top_logprobs": top_logprobs,
+                    },
+                }
+            ],
+            "model": body["model"],
+            "usage": {"prompt_tokens": len(prompt), "completion_tokens": 0, "total_tokens": len(prompt)},
+        }
+    )
+
+
+def make_app(gold_per_prompt: dict[str, str]) -> web.Application:
+    async def handler(request: web.Request) -> web.Response:
+        return await fake_completions_handler(request, gold_per_prompt=gold_per_prompt)
+
+    app = web.Application()
+    app.router.add_post("/v1/completions", handler)
+    return app
+
+
+async def run_smoketest() -> None:
+    benchmark_rows = [
+        # prompt stem, choices, gold idx, gold text
+        {"q": "Capital of France?", "answer": 2},
+        {"q": "Capital of UK?", "answer": 3},
+        {"q": "Capital of Germany?", "answer": 0},
+    ]
+    choices = ["Berlin", "Madrid", "Paris", "London"]
+    gold_per_prompt = {f"Q: {row['q']}\nA: ": choices[row["answer"]] for row in benchmark_rows}
+
+    # Start fake server
+    app = make_app(gold_per_prompt)
+    runner = web.AppRunner(app)
+    await runner.setup()
+    site = web.TCPSite(runner, "127.0.0.1", 11999)
+    await site.start()
+
+    try:
+        defn = BenchmarkDefinition(
+            name="capitals_smoke",
+            dataset=lambda: benchmark_rows,
+            prompt="Q: {q}\nA: ",
+            target_field="answer",
+            choices=choices,
+            scorer_fn=multiple_choice_acc,
+        )
+        env = ByobEnvironment(defn)
+        solver = LogprobRankingSolver(
+            base_url="http://127.0.0.1:11999/v1",
+            model="fake-mc-oracle",
+        )
+
+        print(f"running {len(benchmark_rows)} rows × {len(choices)} choices each\n")
+        per_row_results = []
+        for idx in range(len(benchmark_rows)):
+            seed = await env.seed(idx)
+            solve = await solver.solve(seed)
+            assert solve.error is None, f"solver error: {solve.error}"
+
+            merged_meta = {**seed.metadata, **solve.scoring_details}
+            vr = await env.verify(solve.response, seed.expected_answer, **merged_meta)
+            outputs = vr.scoring_details.get("outputs", {})
+            per_row_results.append((benchmark_rows[idx]["q"], solve, vr, outputs))
+
+        # Summary
+        print(f"{'question':<22} {'argmax':<10} {'gold':<10} {'acc':<6} {'logprobs'}")
+        print("─" * 96)
+        all_correct = True
+        for (q, solve, vr, outputs), row in zip(per_row_results, benchmark_rows):
+            argmax = solve.response
+            gold = choices[row["answer"]]
+            lps = solve.scoring_details["_mc_choices_logprobs"]
+            lps_str = "  ".join(f"{c}={lp:.2f}" for c, lp in zip(choices, lps))
+            acc = outputs.get("acc", 0.0)
+            if acc != 1.0:
+                all_correct = False
+            print(f"{q:<22} {argmax:<10} {gold:<10} {acc:<6} {lps_str}")
+
+        print()
+        print(f"all correct: {all_correct}")
+        for q, solve, vr, outputs in per_row_results:
+            assert outputs.get("acc") == 1.0, f"acc != 1.0 for {q!r}: {outputs}"
+            assert math.isfinite(solve.scoring_details["_mc_choices_logprobs"][0])
+        print("OK: end-to-end seed → solve → verify works through real HTTP")
+
+        await solver.close()
+    finally:
+        await runner.cleanup()
+
+
+if __name__ == "__main__":
+    asyncio.run(run_smoketest())
diff --git a/src/nemo_evaluator/__init__.py b/src/nemo_evaluator/__init__.py
index ffba7adcb..21414cd24 100644
--- a/src/nemo_evaluator/__init__.py
+++ b/src/nemo_evaluator/__init__.py
@@ -16,22 +16,30 @@
 
 __version__ = "0.12.0"
 
+from nemo_evaluator.engine.eval_loop import run_evaluation
+from nemo_evaluator.engine.model_client import ModelClient
 from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult
 from nemo_evaluator.environments.custom import benchmark, scorer
 from nemo_evaluator.environments.registry import get_environment, list_environments, load_benchmark_file, register
-from nemo_evaluator.engine.eval_loop import run_evaluation
-from nemo_evaluator.engine.model_client import ModelClient
-from nemo_evaluator.solvers import (
-    ChatSolver,
-    CompletionSolver,
-    NatSolver,
-    OpenClawSolver,
-    Solver,
-    SolveResult,
-    VLMSolver,
-)
 from nemo_evaluator.scoring import (
+    BooleanValue,
+    CandidateOutput,
+    ContinuousScore,
+    DatasetRow,
+    DiscreteScore,
+    Label,
+    Metric,
+    MetricDescriptor,
+    MetricInput,
+    MetricOutput,
+    MetricOutputSpec,
+    MetricResult,
+    MetricScorerFunction,
+    ScorerCallable,
+    ScorerConfig,
+    ScorerFunctionMetric,
     ScorerInput,
+    ScorerReturn,
     answer_line,
     code_sandbox,
     code_sandbox_async,
@@ -40,6 +48,18 @@
     multichoice_regex,
     needs_judge,
     numeric_match,
+    score_names_from_output_spec,
+)
+from nemo_evaluator.scoring.multiple_choice import mcq_letter_extract, multiple_choice_acc
+from nemo_evaluator.solvers import (
+    ChatSolver,
+    CompletionSolver,
+    LogprobRankingSolver,
+    NatSolver,
+    OpenClawSolver,
+    Solver,
+    SolveResult,
+    VLMSolver,
 )
 
 __all__ = [
@@ -57,6 +77,7 @@
     "Solver",
     "ChatSolver",
     "CompletionSolver",
+    "LogprobRankingSolver",
     "NatSolver",
     "OpenClawSolver",
     "VLMSolver",
@@ -65,6 +86,24 @@
     "benchmark",
     "scorer",
     "ScorerInput",
+    "Metric",
+    "BooleanValue",
+    "DatasetRow",
+    "CandidateOutput",
+    "ContinuousScore",
+    "DiscreteScore",
+    "Label",
+    "MetricInput",
+    "MetricOutput",
+    "MetricOutputSpec",
+    "MetricDescriptor",
+    "MetricResult",
+    "MetricScorerFunction",
+    "ScorerCallable",
+    "ScorerConfig",
+    "ScorerFunctionMetric",
+    "ScorerReturn",
+    "score_names_from_output_spec",
     # Scoring primitives
     "exact_match",
     "multichoice_regex",
@@ -73,5 +112,7 @@
     "numeric_match",
     "code_sandbox",
     "code_sandbox_async",
+    "mcq_letter_extract",
+    "multiple_choice_acc",
     "needs_judge",
 ]
diff --git a/src/nemo_evaluator/engine/eval_loop.py b/src/nemo_evaluator/engine/eval_loop.py
index 989238ba3..73fb21eaf 100644
--- a/src/nemo_evaluator/engine/eval_loop.py
+++ b/src/nemo_evaluator/engine/eval_loop.py
@@ -435,11 +435,18 @@ async def _run_step(idx: int, slot: int, rep: int, seed_result, seed_ms: float):
                         logger.debug("p%d r%d: using pre-computed reward=%.4f", idx, rep, vr.reward)
                     elif not _solve_failed:
                         verify_sandbox = await lifecycle.get_verify_sandbox()
+                        # Forward solver-emitted payload (e.g. logprobs, ranking
+                        # metadata) into verify alongside seed metadata. Keys
+                        # collide rarely in practice; solver payload wins on
+                        # collision because it is the more recent, more
+                        # specific source.
+                        solver_meta = solve_result.scoring_details if solve_result else {}
+                        merged_meta = {**seed_result.metadata, **solver_meta}
                         vr = await env.verify(
                             response_text,
                             seed_result.expected_answer,
                             sandbox=verify_sandbox,
-                            **seed_result.metadata,
+                            **merged_meta,
                         )
                     break  # success — exit retry loop
 
diff --git a/src/nemo_evaluator/environments/custom.py b/src/nemo_evaluator/environments/custom.py
index 5d0e8cdd2..8cf60d203 100644
--- a/src/nemo_evaluator/environments/custom.py
+++ b/src/nemo_evaluator/environments/custom.py
@@ -41,16 +41,34 @@ def score(sample: ScorerInput) -> dict:
 import json
 import logging
 import random
+from collections.abc import Mapping
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable
+from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast, overload
+
+from pydantic import BaseModel
 
 if TYPE_CHECKING:
     from nemo_evaluator.sandbox.base import Sandbox
 
 from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult
-from nemo_evaluator.sandbox.base import ImageBuildRequest, SandboxSpec
 from nemo_evaluator.environments.registry import register
+from nemo_evaluator.sandbox.base import ImageBuildRequest, SandboxSpec
+from nemo_evaluator.scoring.metric import (
+    CandidateOutput,
+    DatasetRow,
+    Metric,
+    MetricDescriptor,
+    MetricInput,
+    MetricOutputSpec,
+    MetricResult,
+    MetricScorerFunction,
+    ScorerCallable,
+    ScorerConfig,
+    ScorerFunctionMetric,
+    ScorerReturn,
+    score_names_from_output_spec,
+)
 from nemo_evaluator.scoring.types import ScorerInput
 
 logger = logging.getLogger(__name__)
@@ -59,10 +77,14 @@ def score(sample: ScorerInput) -> dict:
 # ── Data types ────────────────────────────────────────────────────────────
 
 
+ConfigT = TypeVar("ConfigT", bound=Mapping[str, object] | BaseModel)
+ConfigModelT = TypeVar("ConfigModelT", bound=BaseModel)
+
+
 @dataclass
 class BenchmarkDefinition:
     name: str
-    dataset: str | Callable[[], list[dict]]
+    dataset: str | Callable[..., list[dict[str, Any]]]
     prompt: str
     target_field: str = "target"
     endpoint_type: str = "chat"
@@ -70,10 +92,22 @@ class BenchmarkDefinition:
     field_mapping: dict[str, str] | None = None
     extra: dict[str, Any] = field(default_factory=dict)
     requirements: list[str] | None = None
-    scorer_fn: Callable[[ScorerInput], dict] | None = None
-    prepare_row: Callable[[dict, int, random.Random], dict] | None = None
-    seed_fn: Callable[[dict, int], SeedResult] | None = None
-    image_builder_fn: Callable[[list[dict]], ImageBuildRequest] | None = None
+    scorer_fn: Callable[..., ScorerReturn] | None = None
+    prepare_row: Callable[[dict[str, Any], int, random.Random], dict[str, Any]] | None = None
+    seed_fn: Callable[[dict[str, Any], int], SeedResult] | None = None
+    image_builder_fn: Callable[[list[dict[str, Any]]], ImageBuildRequest] | None = None
+    # Multiple-choice loglikelihood support (lm-eval-harness parity).
+    # Either ``choices`` (static, e.g. ["A","B","C","D"]) or ``choices_field``
+    # (per-row dataset field, supports dotted paths like ``choices.text``)
+    # must be set when paired with LogprobRankingSolver.
+    choices: list[str] | None = None
+    choices_field: str | None = None
+    # Few-shot prompting (mirrors lm-eval-harness ``--num_fewshot``).
+    num_fewshot: int = 0
+    fewshot_split: str | None = None
+    fewshot_template: str | None = None
+    fewshot_separator: str = "\n\n"
+    fewshot_seed: int = 42
 
 
 _BYOB_REGISTRY: dict[str, BenchmarkDefinition] = {}
@@ -82,8 +116,11 @@ class BenchmarkDefinition:
 # ── Dataset loading ───────────────────────────────────────────────────────
 
 
-def _load_dataset_from_spec(spec: str | Callable, num_examples: int | None = None) -> list[dict[str, Any]]:
-    if callable(spec):
+def _load_dataset_from_spec(
+    spec: str | Callable[..., list[dict[str, Any]]],
+    num_examples: int | None = None,
+) -> list[dict[str, Any]]:
+    if not isinstance(spec, str):
         import inspect
 
         sig = inspect.signature(spec)
@@ -124,10 +161,26 @@ def _load_csv(path: Path, delimiter: str = ",") -> list[dict[str, Any]]:
 
 
 def _load_hf(spec: str, num_examples: int | None = None) -> list[dict[str, Any]]:
+    """Load a HuggingFace dataset.
+
+    Accepts both query-style and path-segment URIs::
+
+        google/boolq?split=validation                  # query split
+        CohereForAI/Global-MMLU-Lite/en?split=test     # path-segment config
+        CohereForAI/Global-MMLU-Lite/en/test           # path-segment config + split
+        boolq?split=validation&config=cfg              # explicit config kwarg
+
+    The first two path segments are always treated as ``namespace/name`` so
+    namespaced datasets work. A third segment is treated as the config when
+    no ``?config=`` is given; a fourth is treated as the split when no
+    ``?split=`` is given. Filter kwargs (``filter_field``,
+    ``filter_value``, optionally suffixed with ``_1``/``_2``/...) drop rows
+    that don't match.
+    """
     from datasets import load_dataset
 
-    parts = spec.split("?")
-    dataset_name = parts[0]
+    parts = spec.split("?", 1)
+    body = parts[0]
     params: dict[str, str] = {}
     if len(parts) > 1:
         for kv in parts[1].split("&"):
@@ -135,8 +188,26 @@ def _load_hf(spec: str, num_examples: int | None = None) -> list[dict[str, Any]]
                 k, v = kv.split("=", 1)
                 params[k] = v
 
-    split = params.get("split", "test")
-    config = params.get("config")
+    segments = [s for s in body.split("/") if s]
+    if not segments:
+        raise ValueError(f"Invalid HuggingFace spec (empty): {spec!r}")
+
+    config_from_path: str | None = None
+    split_from_path: str | None = None
+    if len(segments) == 1:
+        dataset_name = segments[0]
+    elif len(segments) == 2:
+        dataset_name = "/".join(segments[:2])
+    elif len(segments) == 3:
+        dataset_name = "/".join(segments[:2])
+        config_from_path = segments[2]
+    else:
+        dataset_name = "/".join(segments[:2])
+        config_from_path = segments[2]
+        split_from_path = segments[3]
+
+    split = params.get("split", split_from_path or "test")
+    config = params.get("config", config_from_path)
 
     if num_examples and "[" not in split:
         split = f"{split}[:{num_examples}]"
@@ -145,10 +216,33 @@ def _load_hf(spec: str, num_examples: int | None = None) -> list[dict[str, Any]]
     if config:
         args.append(config)
     ds = load_dataset(*args, split=split)
-    return [dict(row) for row in ds]
+    rows = [dict(row) for row in ds]
+
+    filters = _extract_filters(params)
+    if filters:
+        rows = [r for r in rows if all(str(r.get(field)) == value for field, value in filters)]
+
+    return rows
 
 
-def _format_prompt(template: str, row: dict, field_mapping: dict | None = None) -> str:
+def _extract_filters(params: dict[str, str]) -> list[tuple[str, str]]:
+    """Pull ``filter_field=...&filter_value=...`` pairs (with numeric suffixes)."""
+    filters: list[tuple[str, str]] = []
+    base_field = params.get("filter_field")
+    base_value = params.get("filter_value")
+    if base_field and base_value is not None:
+        filters.append((base_field, base_value))
+    for k, v in params.items():
+        if not k.startswith("filter_field_"):
+            continue
+        suffix = k.removeprefix("filter_field_")
+        value = params.get(f"filter_value_{suffix}")
+        if value is not None:
+            filters.append((v, value))
+    return filters
+
+
+def _format_prompt(template: str, row: dict[str, Any], field_mapping: dict[str, str] | None = None) -> str:
     data = dict(row)
     if field_mapping:
         for src, dst in field_mapping.items():
@@ -210,12 +304,21 @@ async def seed(self, idx: int) -> SeedResult:
             return self._defn.seed_fn(row, idx)
 
         prompt = _format_prompt(self._defn.prompt, row, self._defn.field_mapping)
+        if self._defn.num_fewshot > 0:
+            prompt = self._fewshot_prefix() + prompt
         target = str(row.get(self._defn.target_field, ""))
 
         meta: dict[str, Any] = {"source": "byob", "benchmark": self._defn.name}
         for k, v in row.items():
             meta[k] = v
 
+        # Multiple-choice loglikelihood: surface candidate continuations on
+        # ``metadata["_mc_choices"]`` so a LogprobRankingSolver (or any
+        # solver that recognises the convention) can rank them.
+        choices = _resolve_mc_choices(row, self._defn)
+        if choices is not None:
+            meta["_mc_choices"] = choices
+
         messages = [{"role": "user", "content": prompt}]
         if self._defn.system_prompt:
             messages.insert(0, {"role": "system", "content": self._defn.system_prompt})
@@ -224,6 +327,32 @@ async def seed(self, idx: int) -> SeedResult:
             prompt=prompt, expected_answer=target, messages=messages, system=self._defn.system_prompt, metadata=meta
         )
 
+    def _fewshot_prefix(self) -> str:
+        defn = self._defn
+        examples = self._fewshot_examples()
+        if not examples:
+            return ""
+        rendered: list[str] = []
+        for ex in examples:
+            text = _render_fewshot_example(defn, ex)
+            if text is not None:
+                rendered.append(text)
+        if not rendered:
+            return ""
+        return defn.fewshot_separator.join(rendered) + defn.fewshot_separator
+
+    def _fewshot_examples(self) -> list[dict[str, Any]]:
+        defn = self._defn
+        if defn.num_fewshot <= 0:
+            return []
+        pool = self._dataset[: max(defn.num_fewshot * 4, defn.num_fewshot)]
+        if not pool:
+            return []
+        rng = random.Random(defn.fewshot_seed)
+        if len(pool) <= defn.num_fewshot:
+            return list(pool)
+        return rng.sample(pool, defn.num_fewshot)
+
     async def verify(self, response: str, expected: str, sandbox: Sandbox | None = None, **meta: Any) -> VerifyResult:
         if self._defn.scorer_fn is None:
             correct = response.strip().lower() == expected.strip().lower()
@@ -235,27 +364,159 @@ async def verify(self, response: str, expected: str, sandbox: Sandbox | None = N
 
         import asyncio
 
+        to_metric = getattr(self._defn.scorer_fn, "to_metric", None)
+        if callable(to_metric):
+            metric = cast(ScorerFunctionMetric[ScorerConfig], to_metric()).bind_raw_config(
+                config=self._defn.extra,
+                sandbox=sandbox,
+                target=expected,
+            )
+            metric_input = _metric_input_from_verify(
+                response=response,
+                metadata=meta,
+            )
+            result = await metric.compute_scores(metric_input)
+            return _metric_result_to_verify_result(
+                metric=metric,
+                result=result,
+                benchmark_name=self._defn.name,
+                response=response,
+            )
+
         sample = ScorerInput(
             response=response, target=expected, metadata=meta, config=self._defn.extra, sandbox=sandbox
         )
-        scores = self._defn.scorer_fn(sample)
-        if asyncio.iscoroutine(scores):
-            scores = await scores
-
-        reward = float(scores.get("correct", scores.get("reward", next(iter(scores.values()), 0))))
+        scores_result = self._defn.scorer_fn(sample)
+        if asyncio.iscoroutine(scores_result):
+            scores_result = await scores_result
+        scores = cast(Mapping[str, object], scores_result)
+
+        reward_value = scores.get("correct", scores.get("reward", next(iter(scores.values()), 0)))
+        reward = float(reward_value) if isinstance(reward_value, bool | int | float) else 0.0
+        extracted = scores.get("extracted")
         return VerifyResult(
             reward=reward,
-            extracted_answer=scores.get("extracted", response.strip()[:200]),
-            scoring_details={"method": f"byob_{self._defn.name}", **scores},
+            extracted_answer=extracted if isinstance(extracted, str) else response.strip()[:200],
+            scoring_details={"method": f"byob_{self._defn.name}", **dict(scores)},
         )
 
 
+def _metric_input_from_verify(
+    *,
+    response: str,
+    metadata: dict[str, Any],
+) -> MetricInput:
+    """Build a MetricInput from BYOB verify kwargs.
+
+    Keys prefixed ``_mc_`` (and any other agreed-upon solver-side payload
+    namespace, e.g. ``_solver_*``) are lifted out of the dataset row and
+    placed on ``candidate.metadata`` — the slot the contract designates for
+    inference-time information about the model's output. Dataset-row keys
+    flow through ``row.data`` unchanged.
+    """
+    candidate_meta: dict[str, object] = {}
+    row_data: dict[str, object] = {}
+    for k, v in metadata.items():
+        if isinstance(k, str) and (k.startswith("_mc_") or k.startswith("_solver_")):
+            candidate_meta[k] = v
+        else:
+            row_data[k] = v
+    return MetricInput(
+        row=DatasetRow(data=row_data),
+        candidate=CandidateOutput(output_text=response, metadata=candidate_meta),
+    )
+
+
+def _resolve_mc_choices(row: dict[str, Any], defn: BenchmarkDefinition) -> list[str] | None:
+    """Resolve per-row candidate continuations from a benchmark definition.
+
+    Per-row ``choices_field`` (with optional dotted path support such as
+    ``choices.text``) takes precedence over the static ``choices`` list.
+    Returns ``None`` when neither source resolves to a non-empty list.
+    """
+    if defn.choices_field:
+        raw: object = row
+        for part in defn.choices_field.split("."):
+            if isinstance(raw, dict) and part in raw:
+                raw = raw[part]
+            else:
+                raw = None
+                break
+        if raw is None:
+            raw = row.get(defn.choices_field)
+        if isinstance(raw, dict):
+            raw = raw.get("text")
+        if isinstance(raw, list) and raw:
+            return [str(c) for c in raw]
+    if defn.choices:
+        return list(defn.choices)
+    return None
+
+
+def _render_fewshot_example(defn: BenchmarkDefinition, row: dict[str, Any]) -> str | None:
+    """Render one few-shot example. Returns None on missing-field errors."""
+    try:
+        if defn.fewshot_template:
+            return _format_prompt(defn.fewshot_template, row, defn.field_mapping)
+        prompt_part = _format_prompt(defn.prompt, row, defn.field_mapping)
+        target_part = row.get(defn.target_field, "")
+        return f"{prompt_part} {target_part}".rstrip()
+    except KeyError:
+        return None
+
+
+def _metric_result_to_verify_result(
+    *,
+    metric: Metric,
+    result: MetricResult,
+    benchmark_name: str,
+    response: str,
+) -> VerifyResult:
+    outputs = {output.name: output.value for output in result.outputs}
+    score_names = score_names_from_output_spec(metric.output_spec())
+    scores = {name: _score_value(outputs[name]) for name in score_names if name in outputs}
+    reward_name = _select_reward_score_name(scores=scores, declared=score_names)
+    extracted = outputs.get("extracted")
+
+    scoring_details: dict[str, Any] = {
+        "method": f"byob_{benchmark_name}",
+        "metric_type": metric.type,
+        "outputs": outputs,
+    }
+    for name, value in outputs.items():
+        scoring_details.setdefault(name, value)
+
+    return VerifyResult(
+        reward=scores[reward_name] if reward_name is not None else 0.0,
+        extracted_answer=extracted if isinstance(extracted, str) else response.strip()[:200],
+        scoring_details=scoring_details,
+    )
+
+
+def _select_reward_score_name(*, scores: dict[str, float], declared: list[str]) -> str | None:
+    for preferred in ("reward", "correct"):
+        if preferred in scores:
+            return preferred
+    for name in declared:
+        if name in scores:
+            return name
+    return next(iter(scores), None)
+
+
+def _score_value(value: object) -> float:
+    if isinstance(value, bool):
+        return 1.0 if value else 0.0
+    if isinstance(value, int | float):
+        return float(value)
+    raise TypeError(f"Metric score output must be bool, int, or float, got {type(value).__name__}")
+
+
 # ── Decorators ────────────────────────────────────────────────────────────
 
 
 def benchmark(
     name: str,
-    dataset: str | Callable,
+    dataset: str | Callable[..., list[dict[str, Any]]],
     prompt: str = "",
     target_field: str = "target",
     endpoint_type: str = "chat",
@@ -263,9 +524,16 @@ def benchmark(
     field_mapping: dict[str, str] | None = None,
     extra: dict[str, Any] | None = None,
     requirements: list[str] | None = None,
-    prepare_row: Callable | None = None,
-    seed_fn: Callable | None = None,
-    **kwargs,
+    prepare_row: Callable[[dict[str, Any], int, random.Random], dict[str, Any]] | None = None,
+    seed_fn: Callable[[dict[str, Any], int], SeedResult] | None = None,
+    choices: list[str] | None = None,
+    choices_field: str | None = None,
+    num_fewshot: int = 0,
+    fewshot_split: str | None = None,
+    fewshot_template: str | None = None,
+    fewshot_separator: str = "\n\n",
+    fewshot_seed: int = 42,
+    **kwargs: Any,
 ):
     """Register a benchmark. Decorate a scorer function."""
     defn = BenchmarkDefinition(
@@ -280,6 +548,13 @@ def benchmark(
         requirements=requirements,
         prepare_row=prepare_row,
         seed_fn=seed_fn,
+        choices=choices,
+        choices_field=choices_field,
+        num_fewshot=num_fewshot,
+        fewshot_split=fewshot_split,
+        fewshot_template=fewshot_template,
+        fewshot_separator=fewshot_separator,
+        fewshot_seed=fewshot_seed,
     )
 
     def decorator(fn):
@@ -300,13 +575,133 @@ def __init__(self, num_examples: int | None = None):
     return decorator
 
 
-def scorer(fn: Callable[[ScorerInput], dict]) -> Callable[[ScorerInput], dict]:
-    """Marks a function as a scorer."""
-    fn._is_scorer = True  # type: ignore[attr-defined]
-    return fn
+@overload
+def scorer(
+    fn: None = None,
+    *,
+    metric_type: str,
+    outputs: list[MetricOutputSpec],
+    config_schema: type[ConfigModelT],
+) -> Callable[[ScorerCallable[ConfigModelT]], MetricScorerFunction[ConfigModelT]]: ...
+
+
+@overload
+def scorer(
+    fn: ScorerCallable[ConfigModelT],
+    *,
+    metric_type: str,
+    outputs: list[MetricOutputSpec],
+    config_schema: type[ConfigModelT],
+) -> MetricScorerFunction[ConfigModelT]: ...
+
+
+@overload
+def scorer(
+    fn: None = None,
+    *,
+    metric_type: str,
+    outputs: list[MetricOutputSpec],
+    config_schema: None = None,
+) -> Callable[[ScorerCallable[ConfigT]], MetricScorerFunction[ConfigT]]: ...
+
+
+@overload
+def scorer(
+    fn: ScorerCallable[ConfigT],
+    *,
+    metric_type: str,
+    outputs: list[MetricOutputSpec],
+    config_schema: None = None,
+) -> MetricScorerFunction[ConfigT]: ...
+
+
+@overload
+def scorer(fn: ScorerCallable[ConfigT]) -> ScorerCallable[ConfigT]: ...
+
+
+@overload
+def scorer(
+    fn: None = None,
+    *,
+    metric_type: None = None,
+    outputs: None = None,
+    config_schema: None = None,
+) -> Callable[[ScorerCallable[ConfigT]], ScorerCallable[ConfigT]]: ...
+
+
+def scorer(
+    fn: Callable[..., ScorerReturn] | None = None,
+    *,
+    metric_type: str | None = None,
+    outputs: list[MetricOutputSpec] | None = None,
+    config_schema: type[BaseModel] | None = None,
+) -> object:
+    """Marks a function as a scorer.
+
+    Plain ``@scorer`` keeps the current ``ScorerInput -> dict`` behavior.
+    ``@scorer(metric_type=..., outputs=...)`` exposes ``descriptor`` and
+    ``to_metric()`` for adapting scorer functions to the shared Metric protocol.
+    """
+    if outputs is None and (metric_type is not None or config_schema is not None):
+        metric_options = [
+            option
+            for option, value in (
+                ("metric_type=...", metric_type),
+                ("config_schema=...", config_schema),
+            )
+            if value is not None
+        ]
+        raise ValueError(
+            f"@scorer({', '.join(metric_options)}) opts into the Metric contract, but no outputs were declared. "
+            "Pass outputs=[MetricOutputSpec(...)] so the metric descriptor can declare and validate outputs."
+        )
+    if outputs is not None and metric_type is None:
+        raise ValueError(
+            "@scorer(outputs=...) opts into the Metric contract, but no metric_type was declared. "
+            "Pass metric_type='...' so the metric has a stable identity across refactors."
+        )
+
+    def decorate(fn: Callable[..., ScorerReturn]) -> object:
+        return _decorate_scorer(
+            cast(ScorerCallable[ScorerConfig], fn),
+            metric_type=metric_type,
+            outputs=outputs,
+            config_schema=config_schema,
+        )
+
+    return decorate(fn) if fn is not None else decorate
 
 
-def image_builder(builder_fn: Callable[[list[dict]], ImageBuildRequest]):
+def _decorate_scorer(
+    fn: ScorerCallable[ConfigT],
+    *,
+    metric_type: str | None = None,
+    outputs: list[MetricOutputSpec] | None = None,
+    config_schema: type[BaseModel] | None = None,
+):
+    setattr(fn, "_is_scorer", True)
+    if outputs is None:
+        return fn
+    if metric_type is None:
+        raise ValueError("metric_type is required when outputs are declared")
+
+    descriptor = MetricDescriptor(
+        type=metric_type,
+        outputs=outputs,
+        config_schema=config_schema,
+    )
+
+    def to_metric() -> ScorerFunctionMetric[ConfigT]:
+        return ScorerFunctionMetric(
+            descriptor=descriptor,
+            scorer_fn=fn,
+        )
+
+    setattr(fn, "descriptor", descriptor)
+    setattr(fn, "to_metric", to_metric)
+    return fn
+
+def image_builder(builder_fn: Callable[[list[dict[str, Any]]], ImageBuildRequest]):
     """Declare images that need building, stacked with ``@benchmark``.
 
     ``builder_fn`` receives the dataset rows and returns an
diff --git a/src/nemo_evaluator/scoring/__init__.py b/src/nemo_evaluator/scoring/__init__.py
index 3ca193cf7..928b7fd26 100644
--- a/src/nemo_evaluator/scoring/__init__.py
+++ b/src/nemo_evaluator/scoring/__init__.py
@@ -22,6 +22,8 @@
 
 from typing import Callable
 
+from nemo_evaluator.scoring.code_execution import code_sandbox, code_sandbox_async
+from nemo_evaluator.scoring.json_schema import extract_json, validate_json_schema
 from nemo_evaluator.scoring.judge import (
     JudgeScoringConfig,
     build_judge_prompt,
@@ -29,9 +31,27 @@
     needs_judge,
     parse_judge_response,
 )
-from nemo_evaluator.scoring.json_schema import extract_json, validate_json_schema
+from nemo_evaluator.scoring.metric import (
+    BooleanValue,
+    CandidateOutput,
+    ContinuousScore,
+    DatasetRow,
+    DiscreteScore,
+    Label,
+    Metric,
+    MetricDescriptor,
+    MetricInput,
+    MetricOutput,
+    MetricOutputSpec,
+    MetricResult,
+    MetricScorerFunction,
+    ScorerCallable,
+    ScorerConfig,
+    ScorerFunctionMetric,
+    ScorerReturn,
+    score_names_from_output_spec,
+)
 from nemo_evaluator.scoring.pattern import answer_line, multichoice_regex, numeric_match
-from nemo_evaluator.scoring.code_execution import code_sandbox, code_sandbox_async
 from nemo_evaluator.scoring.text import exact_match, extract_mcq_letter, fuzzy_match
 from nemo_evaluator.scoring.types import ScorerInput
 
@@ -65,6 +85,24 @@ def list_scorers() -> list[str]:
 
 __all__ = [
     "ScorerInput",
+    "Metric",
+    "BooleanValue",
+    "DatasetRow",
+    "ContinuousScore",
+    "CandidateOutput",
+    "DiscreteScore",
+    "Label",
+    "MetricInput",
+    "MetricOutput",
+    "MetricOutputSpec",
+    "MetricDescriptor",
+    "MetricResult",
+    "MetricScorerFunction",
+    "ScorerCallable",
+    "ScorerConfig",
+    "ScorerFunctionMetric",
+    "ScorerReturn",
+    "score_names_from_output_spec",
     "get_scorer",
     "list_scorers",
     # Text
@@ -87,4 +125,15 @@ def list_scorers() -> list[str]:
     # JSON
     "extract_json",
     "validate_json_schema",
+    # Multiple-choice (lazy-loaded — depends on @scorer in environments.custom)
+    "multiple_choice_acc",
+    "mcq_letter_extract",
 ]
+
+
+def __getattr__(name: str):
+    if name in ("multiple_choice_acc", "mcq_letter_extract"):
+        from nemo_evaluator.scoring import multiple_choice as _mc
+
+        return getattr(_mc, name)
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/src/nemo_evaluator/scoring/metric.py b/src/nemo_evaluator/scoring/metric.py
new file mode 100644
index 000000000..4f7dafcac
--- /dev/null
+++ b/src/nemo_evaluator/scoring/metric.py
@@ -0,0 +1,395 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Shared MetricInput -> MetricResult runtime contract."""
+
+from __future__ import annotations
+
+import inspect
+from collections.abc import Awaitable, Callable, Mapping
+from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast, runtime_checkable
+
+from pydantic import BaseModel, ConfigDict, Field, RootModel, field_validator
+
+from nemo_evaluator.scoring.types import ScorerInput
+
+if TYPE_CHECKING:
+    from nemo_evaluator.sandbox.base import Sandbox
+
+
+ConfigT = TypeVar("ConfigT", bound=Mapping[str, object] | BaseModel)
+SchemaT = TypeVar("SchemaT", bound=BaseModel)
+
+
+class DatasetRow(BaseModel):
+    """Original benchmark dataset row plus optional stable row identity."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    row_index: int | None = None
+    data: dict[str, object]
+
+
+class CandidateOutput(BaseModel):
+    """Candidate output being scored for one dataset row."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    output_text: str | None = None
+    response: object | None = None
+    trajectory: object | None = None
+    metadata: dict[str, object] = Field(default_factory=dict)
+
+
+class MetricInput(BaseModel):
+    """Complete per-row scoring input passed to a metric."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    row: DatasetRow
+    candidate: CandidateOutput
+
+
+class ContinuousScore(RootModel[float]):
+    """Continuous numeric metric value."""
+
+
+class DiscreteScore(RootModel[int]):
+    """Discrete numeric metric value."""
+
+
+class Label(RootModel[str]):
+    """String label metric value."""
+
+
+class BooleanValue(RootModel[bool]):
+    """Boolean metric value."""
+
+
+class MetricOutputSpec(BaseModel, Generic[SchemaT]):
+    """Schema for one named value emitted by a metric."""
+
+    model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
+
+    name: str
+    description: str | None = None
+    value_schema: type[SchemaT]
+
+    @field_validator("name")
+    @classmethod
+    def _name_must_not_be_empty(cls, value: str) -> str:
+        if not value:
+            raise ValueError("metric output name must not be empty")
+        return value
+
+    @staticmethod
+    def continuous_score(name: str, description: str | None = None) -> MetricOutputSpec[ContinuousScore]:
+        return MetricOutputSpec[ContinuousScore](name=name, description=description, value_schema=ContinuousScore)
+
+    @staticmethod
+    def discrete_score(name: str, description: str | None = None) -> MetricOutputSpec[DiscreteScore]:
+        return MetricOutputSpec[DiscreteScore](name=name, description=description, value_schema=DiscreteScore)
+
+    @staticmethod
+    def label(name: str, description: str | None = None) -> MetricOutputSpec[Label]:
+        return MetricOutputSpec[Label](name=name, description=description, value_schema=Label)
+
+    @staticmethod
+    def boolean(name: str, description: str | None = None) -> MetricOutputSpec[BooleanValue]:
+        return MetricOutputSpec[BooleanValue](name=name, description=description, value_schema=BooleanValue)
+
+    @staticmethod
+    def model(
+        name: str,
+        value_schema: type[SchemaT],
+        description: str | None = None,
+    ) -> MetricOutputSpec[SchemaT]:
+        return MetricOutputSpec[SchemaT](name=name, description=description, value_schema=value_schema)
+
+    def coerce_value(self, value: object) -> SchemaT:
+        """Validate and coerce a raw output value to this spec's declared schema."""
+        return self.value_schema.model_validate(value)
+
+    def coerce_output(self, output: MetricOutput) -> SchemaT:
+        """Validate and coerce a named metric output against this spec."""
+        if output.name != self.name:
+            raise ValueError(f"Expected metric output {self.name!r}, got {output.name!r}")
+        return self.coerce_value(output.value)
+
+    def value_json_schema(self) -> dict[str, object]:
+        return self.value_schema.model_json_schema()
+
+
+class MetricDescriptor(BaseModel):
+    """Metadata needed to materialize a decorated scorer as a Metric."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    type: str
+    outputs: list[MetricOutputSpec] = Field(min_length=1)
+    config_schema: type[BaseModel] | None = None
+
+    @field_validator("type")
+    @classmethod
+    def _type_must_not_be_empty(cls, value: str) -> str:
+        if not value:
+            raise ValueError("metric type must not be empty")
+        return value
+
+
+    @field_validator("outputs")
+    @classmethod
+    def _output_names_must_be_unique(
+        cls, value: list[MetricOutputSpec]
+    ) -> list[MetricOutputSpec]:
+        names = [output.name for output in value]
+        duplicates = sorted({name for name in names if names.count(name) > 1})
+        if duplicates:
+            raise ValueError(f"duplicate metric output names: {duplicates}")
+        return value
+
+
+class MetricOutput(BaseModel):
+    """One named value emitted by a metric."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    name: str
+    value: object
+
+
+class MetricResult(BaseModel):
+    """Structured row-level metric result."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    outputs: list[MetricOutput]
+
+
+@runtime_checkable
+class Metric(Protocol):
+    """Shared row-scoring primitive."""
+
+    @property
+    def type(self) -> str: ...
+
+    def output_spec(self) -> list[MetricOutputSpec]: ...
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult: ...
+
+
+ScorerReturn = Mapping[str, object] | Awaitable[Mapping[str, object]]
+ScorerCallable = Callable[[ScorerInput[ConfigT]], ScorerReturn]
+ScorerConfig = Mapping[str, object] | BaseModel
+
+
+class MetricScorerFunction(Protocol[ConfigT]):
+    """Decorated scorer function that can be materialized as a metric."""
+
+    @property
+    def descriptor(self) -> MetricDescriptor: ...
+
+    def __call__(self, sample: ScorerInput[ConfigT]) -> ScorerReturn: ...
+
+    def to_metric(self) -> ScorerFunctionMetric[ConfigT]: ...
+
+
+class ScorerFunctionMetric(Generic[ConfigT]):
+    """Metric adapter for decorator-authored OSS ScorerInput -> dict scorers."""
+
+    def __init__(
+        self,
+        *,
+        descriptor: MetricDescriptor,
+        scorer_fn: ScorerCallable[ConfigT],
+        config: ConfigT | None = None,
+        sandbox: "Sandbox | None" = None,
+        target: object | None = None,
+        target_field: str = "target",
+    ) -> None:
+        self._descriptor = descriptor
+        self._scorer_fn = scorer_fn
+        self._config: ConfigT | Mapping[str, object] | None = (
+            self._validate_config(config) if config is not None else None
+        )
+        self._sandbox = sandbox
+        self._target = target
+        self._target_field = target_field
+
+    @property
+    def type(self) -> str:
+        return self._descriptor.type
+
+    @property
+    def config(self) -> dict[str, object]:
+        config = self._resolve_config()
+        if isinstance(config, BaseModel):
+            return config.model_dump()
+        return dict(config)
+
+    @property
+    def sandbox(self) -> "Sandbox | None":
+        return self._sandbox
+
+    @property
+    def target(self) -> object | None:
+        return self._target
+
+    @property
+    def target_field(self) -> str:
+        return self._target_field
+
+    def bind(
+        self,
+        *,
+        config: ConfigT | None = None,
+        sandbox: "Sandbox | None" = None,
+        target: object | None = None,
+        target_field: str | None = None,
+    ) -> ScorerFunctionMetric[ConfigT]:
+        validated_config = self._config if config is None else self._validate_config(config)
+        return ScorerFunctionMetric(
+            descriptor=self._descriptor,
+            scorer_fn=self._scorer_fn,
+            config=cast(ConfigT, validated_config),
+            sandbox=self._sandbox if sandbox is None else sandbox,
+            target=self._target if target is None else target,
+            target_field=self._target_field if target_field is None else target_field,
+        )
+
+    def bind_raw_config(
+        self,
+        *,
+        config: ScorerConfig | None = None,
+        sandbox: "Sandbox | None" = None,
+        target: object | None = None,
+        target_field: str | None = None,
+    ) -> ScorerFunctionMetric[ConfigT]:
+        """Bind dict-like runtime config, validating it against ``config_schema`` when present."""
+        validated_config = self._config if config is None else self._validate_config(config, coerce=True)
+        return ScorerFunctionMetric(
+            descriptor=self._descriptor,
+            scorer_fn=self._scorer_fn,
+            config=cast(ConfigT, validated_config),
+            sandbox=self._sandbox if sandbox is None else sandbox,
+            target=self._target if target is None else target,
+            target_field=self._target_field if target_field is None else target_field,
+        )
+
+    def output_spec(self) -> list[MetricOutputSpec]:
+        return self._descriptor.outputs
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult:
+        # Merge row-level dataset data with per-row candidate metadata so legacy
+        # ``ScorerInput`` scorers see solver-produced payloads (logprobs,
+        # trajectories, etc.) alongside dataset fields. Candidate metadata
+        # wins on key collisions — it is the more specific source.
+        merged_metadata: dict[str, object] = {**dict(input.row.data), **dict(input.candidate.metadata)}
+        sample: ScorerInput[ConfigT] = ScorerInput(
+            response=input.candidate.output_text or "",
+            target=self._target if self._target is not None else input.row.data.get(self._target_field),
+            metadata=merged_metadata,
+            config=cast(ConfigT, self._resolve_config()),
+            sandbox=self._sandbox,
+        )
+        result = self._scorer_fn(sample)
+        if inspect.isawaitable(result):
+            result = await result
+        if not isinstance(result, Mapping):
+            raise TypeError(f"scorer_fn must return a mapping, got {type(result).__name__}")
+        metric_result = MetricResult(
+            outputs=[MetricOutput(name=name, value=value) for name, value in cast(Mapping[str, object], result).items()]
+        )
+        return validate_metric_result(metric_result, self._descriptor.outputs)
+
+    def _validate_config(
+        self, config: ConfigT | ScorerConfig, *, coerce: bool = False
+    ) -> ConfigT | Mapping[str, object]:
+        schema = self._descriptor.config_schema
+        if schema is None:
+            if isinstance(config, BaseModel):
+                return config.model_dump()
+            return dict(config)
+        if isinstance(config, schema):
+            return cast(ConfigT, config)
+        if not coerce:
+            raise TypeError(
+                f"config must be an instance of {schema.__name__}; "
+                "use bind_raw_config(...) to validate dict-like runtime config"
+            )
+        payload = config.model_dump() if isinstance(config, BaseModel) else config
+        return cast(ConfigT, schema.model_validate(payload))
+
+    def _resolve_config(self) -> ConfigT | Mapping[str, object]:
+        if self._config is not None:
+            return self._config
+        schema = self._descriptor.config_schema
+        if schema is None:
+            return {}
+        return cast(ConfigT, schema.model_validate({}))
+
+
+def validate_metric_result(result: MetricResult, outputs: list[MetricOutputSpec]) -> MetricResult:
+    """Validate a metric result against its declared outputs."""
+    returned_names = [output.name for output in result.outputs]
+    duplicates = sorted({name for name in returned_names if returned_names.count(name) > 1})
+    if duplicates:
+        raise ValueError(f"Duplicate metric output names: {duplicates}")
+
+    outputs_by_name = {output.name: output for output in outputs}
+    declared_names = [output.name for output in outputs]
+    declared = set(declared_names)
+    returned = set(returned_names)
+    missing = [name for name in declared_names if name not in returned]
+    undeclared = [name for name in returned_names if name not in declared]
+
+    if missing:
+        raise ValueError(f"Missing declared metric outputs: {missing}")
+    if undeclared:
+        raise ValueError(f"Undeclared metric outputs: {undeclared}")
+    for output in result.outputs:
+        outputs_by_name[output.name].coerce_output(output)
+    return result
+
+
+def score_names_from_output_spec(outputs: list[MetricOutputSpec]) -> list[str]:
+    """Return declared numeric score names from metric output specs."""
+    return [
+        output.name
+        for output in outputs
+        if issubclass(output.value_schema, ContinuousScore | DiscreteScore | BooleanValue)
+    ]
+
+
+__all__ = [
+    "BooleanValue",
+    "CandidateOutput",
+    "ContinuousScore",
+    "DatasetRow",
+    "DiscreteScore",
+    "Label",
+    "Metric",
+    "MetricDescriptor",
+    "MetricInput",
+    "MetricOutput",
+    "MetricOutputSpec",
+    "MetricResult",
+    "MetricScorerFunction",
+    "ScorerCallable",
+    "ScorerConfig",
+    "ScorerFunctionMetric",
+    "ScorerReturn",
+    "score_names_from_output_spec",
+    "validate_metric_result",
+]
diff --git a/src/nemo_evaluator/scoring/multiple_choice.py b/src/nemo_evaluator/scoring/multiple_choice.py
new file mode 100644
index 000000000..56c14021b
--- /dev/null
+++ b/src/nemo_evaluator/scoring/multiple_choice.py
@@ -0,0 +1,217 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Multiple-choice loglikelihood scorers (lm-evaluation-harness parity).
+
+The :func:`multiple_choice_acc` scorer expects a per-row payload populated
+by an upstream solver (typically :class:`LogprobRankingSolver`) that already
+ran the candidate continuations and computed sum-loglikelihoods. The payload
+reaches the scorer via :attr:`ScorerInput.metadata` under three keys:
+
+- ``_mc_choices``: list of candidate continuation strings.
+- ``_mc_choices_logprobs``: list of per-choice sum-loglikelihoods.
+- ``_mc_choices_is_greedy``: list of booleans, True iff every continuation
+  token equals the top-1 logprob token at its position.
+
+The companion :func:`mcq_letter_extract` scorer scores the text-mode case
+where the model produced a free-form response and we extract a single
+letter (A-J) before comparing to ground truth.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import TYPE_CHECKING
+
+from nemo_evaluator.environments.custom import scorer
+from nemo_evaluator.scoring.metric import MetricOutputSpec
+
+if TYPE_CHECKING:
+    from nemo_evaluator.scoring.types import ScorerInput
+
+
+_INT_TO_LETTER = {i: chr(ord("A") + i) for i in range(10)}
+
+
+# Patterns ordered most-specific first; the first match wins.
+_MCQ_LETTER_PATTERNS = (
+    re.compile(r"\\boxed\{\s*([A-Ja-j])\s*\}"),
+    re.compile(r"(?:answer\s+is\s*[:\-]?\s*\(?)\s*([A-Ja-j])\b", re.IGNORECASE),
+    re.compile(r"\boption\s*\(?([A-Ja-j])\)", re.IGNORECASE),
+    re.compile(r"^\s*\(?([A-Ja-j])[\)\.\:]", re.IGNORECASE),
+    re.compile(r"\b([A-Ja-j])\b"),
+)
+
+
+def _extract_letter(response: str | None) -> str:
+    """Best-effort extract a single A-J letter from a free-form response."""
+    text = (response or "").strip()
+    if not text:
+        return ""
+    if text[0].upper() in "ABCDEFGHIJ" and (len(text) == 1 or not text[1].isalpha()):
+        return text[0].upper()
+    for pat in _MCQ_LETTER_PATTERNS:
+        m = pat.search(text[:200])
+        if m:
+            return m.group(1).upper()
+    return ""
+
+
+@scorer(
+    metric_type="multiple_choice_acc",
+    outputs=[
+        MetricOutputSpec.continuous_score(
+            "acc",
+            description="1.0 iff argmax of raw choice loglikelihoods matches gold (canonical MMLU metric).",
+        ),
+        MetricOutputSpec.continuous_score(
+            "acc_norm",
+            description="1.0 iff argmax of length-normalized loglikelihoods matches gold (ARC/BoolQ).",
+        ),
+        MetricOutputSpec.continuous_score(
+            "acc_greedy",
+            description="1.0 iff the highest-loglikelihood greedy-eligible choice matches gold.",
+        ),
+    ],
+)
+def multiple_choice_acc(sample: ScorerInput) -> dict:
+    """Score multiple-choice loglikelihood ranking with lm-eval-harness metrics.
+
+    Reads ``_mc_choices``, ``_mc_choices_logprobs``, ``_mc_choices_is_greedy``
+    from :attr:`ScorerInput.metadata`. Returns ``{"acc": 0.0, "acc_norm":
+    0.0, "acc_greedy": 0.0}`` when the payload is missing or malformed (e.g.,
+    user wired the scorer to a benchmark that didn't go through the
+    LogprobRankingSolver).
+
+    Resolves the gold target heuristically — accepts integer index, single
+    letter ``"A"..."Z"``, stringified integer, or verbatim choice text.
+    Length-normalization uses character length to match lm-eval-harness's
+    ``acc_norm``.
+    """
+    meta = sample.metadata or {}
+    choices = meta.get("_mc_choices") or []
+    logprobs = meta.get("_mc_choices_logprobs") or []
+    is_greedy = meta.get("_mc_choices_is_greedy") or []
+
+    zero = {"acc": 0.0, "acc_norm": 0.0, "acc_greedy": 0.0}
+    if not choices or not logprobs or len(choices) != len(logprobs):
+        return zero
+
+    gold_idx = _resolve_gold_index(sample.target, choices)
+    if gold_idx is None:
+        return zero
+
+    raw_argmax = max(range(len(logprobs)), key=lambda i: logprobs[i])
+
+    norm_scores = [logprobs[i] / max(len(choices[i]), 1) for i in range(len(choices))]
+    norm_argmax = max(range(len(norm_scores)), key=lambda i: norm_scores[i])
+
+    greedy_argmax: int | None = None
+    if is_greedy and any(is_greedy):
+        greedy_indices = [i for i, g in enumerate(is_greedy) if g]
+        if greedy_indices:
+            greedy_argmax = max(greedy_indices, key=lambda i: logprobs[i])
+
+    return {
+        "acc": 1.0 if raw_argmax == gold_idx else 0.0,
+        "acc_norm": 1.0 if norm_argmax == gold_idx else 0.0,
+        "acc_greedy": 1.0 if greedy_argmax is not None and greedy_argmax == gold_idx else 0.0,
+    }
+
+
+@scorer(
+    metric_type="mcq_letter_extract",
+    outputs=[
+        MetricOutputSpec.continuous_score(
+            "correct",
+            description="1.0 iff the letter extracted from the response matches the gold letter.",
+        ),
+        MetricOutputSpec.boolean(
+            "parsed",
+            description="True iff the response yielded a non-empty extracted letter.",
+        ),
+    ],
+)
+def mcq_letter_extract(sample: ScorerInput) -> dict:
+    """Extract A-J from response and compare to target.
+
+    Handles common response formats: ``"A"``, ``"A)"``, ``"The answer is B"``,
+    ``"B. Because..."``, ``"(C)"``, ``"Option D"``, ``"\\boxed{E}"``.
+
+    Targets may be a letter (``"A"..."J"``), an integer (``0..9``), the
+    string form of an integer, or verbatim choice text. When the target is
+    verbatim text and the row has letter-coded choice columns (``a``/``b``/
+    ``c``/``d`` in ``ScorerInput.metadata``), it is matched against those.
+    """
+    predicted = _extract_letter(sample.response)
+
+    raw = sample.target
+    target_letter = ""
+    if isinstance(raw, bool):
+        raw = int(raw)
+    if isinstance(raw, int):
+        target_letter = _INT_TO_LETTER.get(raw, "")
+    elif isinstance(raw, str):
+        s = raw.strip()
+        if s.isdigit():
+            target_letter = _INT_TO_LETTER.get(int(s), s.upper())
+        elif s and s[0].upper() in "ABCDEFGHIJ" and len(s) <= 2:
+            target_letter = s[0].upper()
+        else:
+            for letter, key in zip("ABCD", ("a", "b", "c", "d")):
+                if (sample.metadata or {}).get(key, "").strip().lower() == s.lower():
+                    target_letter = letter
+                    break
+
+    return {
+        "correct": 1.0 if predicted and predicted == target_letter else 0.0,
+        "parsed": bool(predicted),
+    }
+
+
+def _resolve_gold_index(target: object, choices: list[str]) -> int | None:
+    """Map a heterogeneous target value to an index into choices."""
+    if target is None:
+        return None
+    if isinstance(target, bool):
+        i = int(target)
+        return i if i < len(choices) else None
+    if isinstance(target, int):
+        return target if 0 <= target < len(choices) else None
+    if isinstance(target, str):
+        s = target.strip()
+        if not s:
+            return None
+        if len(s) == 1 and s.upper().isalpha():
+            idx = ord(s.upper()) - ord("A")
+            if 0 <= idx < len(choices):
+                return idx
+        if s.lstrip("-").isdigit():
+            try:
+                idx = int(s)
+                if 0 <= idx < len(choices):
+                    return idx
+            except ValueError:
+                pass
+        for i, c in enumerate(choices):
+            if c.strip() == s:
+                return i
+        s_low = s.lower()
+        for i, c in enumerate(choices):
+            if c.strip().lower() == s_low:
+                return i
+    return None
+
+
+__all__ = ["multiple_choice_acc", "mcq_letter_extract"]
diff --git a/src/nemo_evaluator/scoring/types.py b/src/nemo_evaluator/scoring/types.py
index fa2426a5a..fe3a7fa43 100644
--- a/src/nemo_evaluator/scoring/types.py
+++ b/src/nemo_evaluator/scoring/types.py
@@ -17,14 +17,17 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Generic, TypeVar
 
 if TYPE_CHECKING:
     from nemo_evaluator.sandbox.base import Sandbox
 
 
+ConfigT = TypeVar("ConfigT")
+
+
 @dataclass
-class ScorerInput:
+class ScorerInput(Generic[ConfigT]):
     """Input passed to scorer functions.
 
     The ``sandbox`` field is available for scorers that need to inspect or
@@ -36,5 +39,5 @@ class ScorerInput:
     response: str
     target: Any
     metadata: dict[str, Any] = field(default_factory=dict)
-    config: dict[str, Any] = field(default_factory=dict)
+    config: ConfigT = field(default_factory=dict)
     sandbox: Sandbox | None = None
diff --git a/src/nemo_evaluator/solvers/__init__.py b/src/nemo_evaluator/solvers/__init__.py
index 549cf3a11..9bb590abd 100644
--- a/src/nemo_evaluator/solvers/__init__.py
+++ b/src/nemo_evaluator/solvers/__init__.py
@@ -17,6 +17,7 @@
 from nemo_evaluator.solvers.base import Solver, SolveResult
 from nemo_evaluator.solvers.chat import ChatSolver
 from nemo_evaluator.solvers.completion import CompletionSolver
+from nemo_evaluator.solvers.logprob import LogprobRankingSolver
 from nemo_evaluator.solvers.nat import NatSolver
 from nemo_evaluator.solvers.openclaw import OpenClawSolver
 from nemo_evaluator.solvers.vlm import VLMSolver
@@ -25,6 +26,7 @@
     "ChatSolver",
     "CompletionSolver",
     "HarborSolver",
+    "LogprobRankingSolver",
     "NatSolver",
     "OpenClawSolver",
     "ReActSolver",
diff --git a/src/nemo_evaluator/solvers/logprob.py b/src/nemo_evaluator/solvers/logprob.py
new file mode 100644
index 000000000..7bd609db4
--- /dev/null
+++ b/src/nemo_evaluator/solvers/logprob.py
@@ -0,0 +1,232 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LogprobRankingSolver: score candidate continuations by sum-loglikelihood.
+
+Mirrors lm-evaluation-harness's ``loglikelihood`` contract for the
+``local-completions`` model adapter. For each row the solver:
+
+1. Reads candidate continuations from ``task.metadata["_mc_choices"]``.
+2. POSTs ``/completions`` with ``prompt = task.prompt + continuation``,
+   ``max_tokens=0``, ``echo=true``, ``logprobs=1`` for each candidate.
+3. Locates the continuation token span via OpenAI-compatible
+   ``logprobs.text_offset`` (first token whose offset is ``>= len(prompt)``).
+4. Sums ``token_logprobs`` over the continuation span, computes greedy
+   eligibility against ``top_logprobs``, returns a ``SolveResult`` whose
+   ``response`` is the argmax choice and whose ``scoring_details`` carries
+   the per-choice logprobs/greedy flags for the verify stage.
+
+The eval loop forwards ``solve_result.scoring_details`` to
+:meth:`EvalEnvironment.verify` so the scoring step can place the payload
+on ``MetricInput.candidate.metadata`` for the metric to consume.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import time
+from typing import Any
+
+from nemo_evaluator.environments.base import SeedResult
+from nemo_evaluator.observability.types import ModelResponse
+from nemo_evaluator.solvers.base import ErrorKind, SolveResult
+from nemo_evaluator.solvers.trajectory_util import build_single_turn_atif
+
+logger = logging.getLogger(__name__)
+
+
+class LogprobRankingSolver:
+    """Solver that ranks candidate continuations by sum-loglikelihood.
+
+    Required ``task.metadata`` keys:
+
+    - ``_mc_choices``: ``list[str]`` of candidate continuations.
+
+    Optional ``task.metadata`` keys:
+
+    - ``_mc_continuation_separator``: ``str`` inserted between the prompt
+      and each continuation (default ``""``). lm-eval-harness conventionally
+      uses ``" "`` for whitespace-sensitive tokenizers.
+    """
+
+    def __init__(
+        self,
+        base_url: str,
+        model: str,
+        api_key: str | None = None,
+        max_concurrent_choices: int = 8,
+    ) -> None:
+        from nemo_evaluator.engine.model_client import ModelClient
+
+        self._model_client = ModelClient(
+            base_url=base_url.rstrip("/"),
+            model=model,
+            api_key=api_key,
+        )
+        self._url = f"{base_url.rstrip('/')}/completions"
+        self._model = model
+        self._max_concurrent_choices = max_concurrent_choices
+
+    async def solve(self, task: SeedResult) -> SolveResult:
+        choices: list[str] = list(task.metadata.get("_mc_choices") or [])
+        if not choices:
+            return SolveResult(
+                response="",
+                error="LogprobRankingSolver requires task.metadata['_mc_choices'] to be a non-empty list",
+                error_kind=ErrorKind.GRACEFUL,
+            )
+
+        separator = task.metadata.get("_mc_continuation_separator", "")
+        prompt = task.prompt or ""
+        full_context = (task.system + "\n" + prompt) if task.system else prompt
+
+        sem = asyncio.Semaphore(self._max_concurrent_choices)
+
+        async def _score_one(choice: str) -> tuple[float, bool]:
+            async with sem:
+                return await self._score_continuation(full_context, separator, choice)
+
+        t0 = time.monotonic()
+        results = await asyncio.gather(*[_score_one(c) for c in choices], return_exceptions=True)
+        latency_ms = round((time.monotonic() - t0) * 1000, 2)
+
+        logprobs: list[float] = []
+        is_greedy: list[bool] = []
+        for r in results:
+            if isinstance(r, BaseException):
+                return SolveResult(
+                    response="",
+                    error=f"LogprobRankingSolver inference failed: {r}",
+                    error_kind=ErrorKind.INFRA,
+                )
+            ll, greedy = r
+            logprobs.append(ll)
+            is_greedy.append(greedy)
+
+        argmax_idx = max(range(len(choices)), key=lambda i: logprobs[i])
+        response = choices[argmax_idx]
+
+        model_resp = ModelResponse(
+            content=response,
+            model=self._model,
+            finish_reason="stop",
+            prompt_tokens=None,
+            completion_tokens=None,
+            total_tokens=None,
+            latency_ms=latency_ms,
+            raw_response={"choices_logprobs": logprobs, "choices_is_greedy": is_greedy, "n_choices": len(choices)},
+        )
+        trajectory = build_single_turn_atif(prompt, response, model_name=self._model)
+        return SolveResult(
+            response=response,
+            model_response=model_resp,
+            trajectory=trajectory,
+            scoring_details={
+                "_mc_choices": choices,
+                "_mc_choices_logprobs": logprobs,
+                "_mc_choices_is_greedy": is_greedy,
+            },
+        )
+
+    async def close(self) -> None:
+        await self._model_client.close()
+
+    async def _score_continuation(self, context: str, separator: str, continuation: str) -> tuple[float, bool]:
+        """Score one continuation by sum-loglikelihood.
+
+        Returns ``(sum_logprob, is_greedy)``. ``is_greedy`` is True iff every
+        continuation token equals the top-1 candidate at its position under
+        the model's logprob distribution.
+        """
+        full_context = context + separator
+        full_prompt = full_context + continuation
+        payload: dict[str, Any] = {
+            "model": self._model,
+            "prompt": full_prompt,
+            "max_tokens": 0,
+            "temperature": 0.0,
+            "logprobs": 1,
+            "echo": True,
+        }
+        data = await self._model_client._post_with_retry(self._url, payload)
+        return _parse_loglikelihood_response(data, full_context)
+
+
+def _parse_loglikelihood_response(body: dict[str, Any], context: str) -> tuple[float, bool]:
+    """Parse an OpenAI-compatible /completions response into ``(sum_logprob, is_greedy)``.
+
+    Returns ``(-inf, False)`` when the response is missing logprobs or the
+    continuation span is empty (zero tokens).
+    """
+    try:
+        choice = body["choices"][0]
+        logprobs = choice.get("logprobs") or {}
+    except (KeyError, IndexError, TypeError):
+        return (float("-inf"), False)
+
+    tokens: list[str] = logprobs.get("tokens") or []
+    token_logprobs: list[float | None] = logprobs.get("token_logprobs") or []
+    text_offset: list[int | None] = logprobs.get("text_offset") or []
+    top_logprobs: list[dict[str, float] | None] = logprobs.get("top_logprobs") or []
+
+    if not tokens or not token_logprobs:
+        return (float("-inf"), False)
+
+    ctx_len = len(context)
+    start_idx: int | None = None
+    for i, off in enumerate(text_offset):
+        if off is not None and off >= ctx_len:
+            start_idx = i
+            break
+
+    # Fallback: tokenizer merged context-end with continuation-start. Walk
+    # backwards from the end and pick the first token whose ``text_offset``
+    # is < ``ctx_len`` — the token AFTER it is the first that contains
+    # continuation chars only. This is an approximation; lm-eval-harness
+    # tokenizes context separately for exact accounting.
+    if start_idx is None:
+        for i in range(len(text_offset) - 1, -1, -1):
+            off = text_offset[i]
+            if off is not None and off < ctx_len:
+                start_idx = i + 1 if i + 1 < len(tokens) else i
+                break
+        if start_idx is None:
+            start_idx = max(len(tokens) - 1, 0)
+
+    cont_token_logprobs = [lp for lp in token_logprobs[start_idx:] if lp is not None]
+    if not cont_token_logprobs:
+        return (float("-inf"), False)
+
+    sum_logprob = float(sum(cont_token_logprobs))
+
+    is_greedy = True
+    for i in range(start_idx, len(tokens)):
+        top = top_logprobs[i] if i < len(top_logprobs) else None
+        if not top:
+            is_greedy = False
+            break
+        try:
+            best_token = max(top.items(), key=lambda kv: kv[1])[0]
+        except (AttributeError, ValueError):
+            is_greedy = False
+            break
+        if best_token != tokens[i]:
+            is_greedy = False
+            break
+
+    return (sum_logprob, is_greedy)
+
+
+__all__ = ["LogprobRankingSolver"]
diff --git a/tests/test_environments/test_byob_mc_integration.py b/tests/test_environments/test_byob_mc_integration.py
new file mode 100644
index 000000000..3fa328365
--- /dev/null
+++ b/tests/test_environments/test_byob_mc_integration.py
@@ -0,0 +1,295 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""End-to-end BYOB integration tests for multiple-choice + few-shot.
+
+These tests exercise the full ``ByobEnvironment.seed → verify`` path with
+synthetic datasets and a fake LogprobRankingSolver. No model server is
+required — the solver is mocked, the dataset is in-memory, and the eval
+loop is sidestepped (we call seed/verify directly to keep the test
+hermetic and fast).
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from nemo_evaluator.environments.custom import (
+    BenchmarkDefinition,
+    ByobEnvironment,
+    _metric_input_from_verify,
+    _resolve_mc_choices,
+    benchmark,
+    scorer,
+)
+from nemo_evaluator.scoring.metric import MetricOutputSpec
+from nemo_evaluator.scoring.multiple_choice import multiple_choice_acc
+
+
+# ── _resolve_mc_choices ─────────────────────────────────────────────────────
+
+
+def test_resolve_mc_choices_static_list() -> None:
+    defn = BenchmarkDefinition(
+        name="t", dataset=[], prompt="", choices=["A", "B", "C", "D"]
+    )
+    assert _resolve_mc_choices({"answer": 1}, defn) == ["A", "B", "C", "D"]
+
+
+def test_resolve_mc_choices_per_row_field() -> None:
+    defn = BenchmarkDefinition(
+        name="t", dataset=[], prompt="", choices_field="options"
+    )
+    assert _resolve_mc_choices({"options": ["X", "Y"]}, defn) == ["X", "Y"]
+
+
+def test_resolve_mc_choices_dotted_path() -> None:
+    """ARC-style nested choices: row['choices']['text'] = [...]."""
+    defn = BenchmarkDefinition(
+        name="t", dataset=[], prompt="", choices_field="choices.text"
+    )
+    row = {"choices": {"text": ["foo", "bar"], "label": ["A", "B"]}}
+    assert _resolve_mc_choices(row, defn) == ["foo", "bar"]
+
+
+def test_resolve_mc_choices_field_takes_precedence_over_static() -> None:
+    defn = BenchmarkDefinition(
+        name="t",
+        dataset=[],
+        prompt="",
+        choices=["fallback1", "fallback2"],
+        choices_field="options",
+    )
+    assert _resolve_mc_choices({"options": ["x", "y"]}, defn) == ["x", "y"]
+
+
+def test_resolve_mc_choices_falls_back_when_field_missing() -> None:
+    defn = BenchmarkDefinition(
+        name="t",
+        dataset=[],
+        prompt="",
+        choices=["fallback1", "fallback2"],
+        choices_field="missing",
+    )
+    assert _resolve_mc_choices({}, defn) == ["fallback1", "fallback2"]
+
+
+def test_resolve_mc_choices_returns_none_when_neither_set() -> None:
+    defn = BenchmarkDefinition(name="t", dataset=[], prompt="")
+    assert _resolve_mc_choices({}, defn) is None
+
+
+# ── _metric_input_from_verify lifts _mc_* into candidate.metadata ───────────
+
+
+def test_metric_input_from_verify_separates_mc_keys() -> None:
+    metric_input = _metric_input_from_verify(
+        response="Paris",
+        metadata={
+            "question": "Capital of France?",
+            "answer_idx": 2,
+            "_mc_choices": ["A", "B", "Paris", "London"],
+            "_mc_choices_logprobs": [-5.1, -4.8, -1.2, -3.7],
+            "_mc_choices_is_greedy": [False, False, True, False],
+            "_solver_attempts": 1,
+        },
+    )
+    # Dataset row keys go to row.data
+    assert metric_input.row.data == {"question": "Capital of France?", "answer_idx": 2}
+    # _mc_* and _solver_* keys go to candidate.metadata
+    assert metric_input.candidate.metadata == {
+        "_mc_choices": ["A", "B", "Paris", "London"],
+        "_mc_choices_logprobs": [-5.1, -4.8, -1.2, -3.7],
+        "_mc_choices_is_greedy": [False, False, True, False],
+        "_solver_attempts": 1,
+    }
+    assert metric_input.candidate.output_text == "Paris"
+
+
+# ── End-to-end BYOB seed + verify with multiple_choice_acc ──────────────────
+
+
+@pytest.mark.asyncio
+async def test_byob_mmlu_style_end_to_end() -> None:
+    """MMLU-style 4-way multiple-choice with static choices, in-memory dataset."""
+    dataset = [
+        {"question": "Capital of France?", "answer": 2},
+        {"question": "Capital of UK?", "answer": 3},
+    ]
+    defn = BenchmarkDefinition(
+        name="mmlu_synth",
+        dataset=lambda: dataset,
+        prompt="Q: {question}\nA: ",
+        target_field="answer",
+        choices=["Berlin", "Madrid", "Paris", "London"],
+        scorer_fn=multiple_choice_acc,
+    )
+    env = ByobEnvironment(defn)
+
+    # Seed populates _mc_choices in metadata
+    seed = await env.seed(0)
+    assert seed.metadata["_mc_choices"] == ["Berlin", "Madrid", "Paris", "London"]
+    assert seed.expected_answer == "2"
+    assert seed.prompt == "Q: Capital of France?\nA: "
+
+    # Simulate the LogprobRankingSolver having already ranked the choices —
+    # answer index 2 ("Paris") wins.
+    solver_meta = {
+        "_mc_choices": ["Berlin", "Madrid", "Paris", "London"],
+        "_mc_choices_logprobs": [-5.1, -4.8, -1.2, -3.7],
+        "_mc_choices_is_greedy": [False, False, True, False],
+    }
+    merged = {**seed.metadata, **solver_meta}
+    vr = await env.verify("Paris", "2", **merged)
+
+    assert vr.reward == 1.0  # acc=1, the default reward picks first declared "acc"
+    assert vr.scoring_details["metric_type"] == "multiple_choice_acc"
+    assert vr.scoring_details["outputs"] == {"acc": 1.0, "acc_norm": 1.0, "acc_greedy": 1.0}
+
+
+@pytest.mark.asyncio
+async def test_byob_per_row_choices_arc_style() -> None:
+    """ARC-style: variable-length per-row choices via dotted path."""
+    dataset = [
+        {
+            "question": "What's hot?",
+            "choices": {"text": ["lava", "ice", "water"], "label": ["A", "B", "C"]},
+            "answer": "A",
+        },
+        {
+            "question": "Smallest planet?",
+            "choices": {"text": ["Mercury", "Venus"], "label": ["A", "B"]},
+            "answer": "A",
+        },
+    ]
+    defn = BenchmarkDefinition(
+        name="arc_synth",
+        dataset=lambda: dataset,
+        prompt="Q: {question}\nA: ",
+        target_field="answer",
+        choices_field="choices.text",
+        scorer_fn=multiple_choice_acc,
+    )
+    env = ByobEnvironment(defn)
+
+    seed = await env.seed(0)
+    assert seed.metadata["_mc_choices"] == ["lava", "ice", "water"]
+
+    seed1 = await env.seed(1)
+    assert seed1.metadata["_mc_choices"] == ["Mercury", "Venus"]
+
+
+@pytest.mark.asyncio
+async def test_byob_few_shot_prefix_prepended() -> None:
+    dataset = [
+        {"q": "1+1", "a": "2"},
+        {"q": "2+2", "a": "4"},
+        {"q": "3+3", "a": "6"},
+    ]
+    defn = BenchmarkDefinition(
+        name="fewshot_synth",
+        dataset=lambda: dataset,
+        prompt="Q: {q} A: ",
+        target_field="a",
+        num_fewshot=2,
+        fewshot_seed=0,
+    )
+    env = ByobEnvironment(defn)
+
+    seed = await env.seed(0)
+    # Prompt should have 2 few-shot examples + separator + the test prompt
+    assert seed.prompt.count("Q: ") == 3  # 2 fewshot + 1 test
+    assert seed.prompt.endswith("Q: 1+1 A: ")
+
+
+@pytest.mark.asyncio
+async def test_byob_few_shot_zero_means_no_prefix() -> None:
+    dataset = [{"q": "1+1", "a": "2"}, {"q": "2+2", "a": "4"}]
+    defn = BenchmarkDefinition(
+        name="nofewshot",
+        dataset=lambda: dataset,
+        prompt="Q: {q} A: ",
+        target_field="a",
+        num_fewshot=0,
+    )
+    env = ByobEnvironment(defn)
+    seed = await env.seed(0)
+    assert seed.prompt == "Q: 1+1 A: "
+
+
+@pytest.mark.asyncio
+async def test_byob_mc_payload_threads_to_metric_via_verify_meta() -> None:
+    """End-to-end: seed → fake solver injects logprobs → verify → metric scores.
+
+    This is the integration proof: the solver's output reaches the metric
+    through ``MetricInput.candidate.metadata`` without protocol changes.
+    """
+    dataset = [
+        {"question": "Capital of France?", "answer": 2},
+    ]
+    defn = BenchmarkDefinition(
+        name="proof",
+        dataset=lambda: dataset,
+        prompt="Q: {question}\nA: ",
+        target_field="answer",
+        choices=["Berlin", "Madrid", "Paris", "London"],
+        scorer_fn=multiple_choice_acc,
+    )
+    env = ByobEnvironment(defn)
+
+    seed = await env.seed(0)
+    # Pretend a LogprobRankingSolver ran and wrote scoring_details
+    solver_scoring_details = {
+        "_mc_choices": ["Berlin", "Madrid", "Paris", "London"],
+        "_mc_choices_logprobs": [-5.1, -4.8, -1.2, -3.7],
+        "_mc_choices_is_greedy": [False, False, True, False],
+    }
+    merged_meta = {**seed.metadata, **solver_scoring_details}
+    vr = await env.verify("Paris", seed.expected_answer, **merged_meta)
+
+    assert vr.reward == 1.0
+    assert vr.scoring_details["outputs"] == {
+        "acc": 1.0,
+        "acc_norm": 1.0,
+        "acc_greedy": 1.0,
+    }
+
+
+# ── @benchmark decorator integration ────────────────────────────────────────
+
+
+def test_benchmark_decorator_wires_mc_kwargs_to_definition() -> None:
+    """Sanity: the new @benchmark kwargs land on BenchmarkDefinition."""
+    from nemo_evaluator.environments.custom import _BYOB_REGISTRY
+
+    @benchmark(
+        name="test_mc_decorator",
+        dataset=[{"q": "x", "answer": 0}],
+        prompt="{q}",
+        target_field="answer",
+        choices=["a", "b"],
+        num_fewshot=1,
+        fewshot_separator="||",
+    )
+    @scorer(
+        metric_type="test_score",
+        outputs=[MetricOutputSpec.continuous_score("ok")],
+    )
+    def _score(sample) -> dict:
+        return {"ok": 1.0}
+
+    defn = _BYOB_REGISTRY["test_mc_decorator"]
+    assert defn.choices == ["a", "b"]
+    assert defn.num_fewshot == 1
+    assert defn.fewshot_separator == "||"
diff --git a/tests/test_environments/test_custom_metric_contract.py b/tests/test_environments/test_custom_metric_contract.py
new file mode 100644
index 000000000..0106b6f55
--- /dev/null
+++ b/tests/test_environments/test_custom_metric_contract.py
@@ -0,0 +1,152 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for BYOB scorer compatibility with the shared metric contract."""
+
+from __future__ import annotations
+
+from typing import cast
+
+import pytest
+from pydantic import BaseModel
+
+from nemo_evaluator.environments.custom import BenchmarkDefinition, ByobEnvironment, scorer
+from nemo_evaluator.scoring import ScorerInput
+from nemo_evaluator.scoring.metric import (
+    MetricOutputSpec,
+)
+from nemo_evaluator.sandbox.base import Sandbox
+
+
+def _dataset() -> list[dict[str, object]]:
+    return [{"question": "2+2", "answer": "4", "category": "math"}]
+
+
+class ThresholdConfig(BaseModel):
+    threshold: float
+
+
+@pytest.mark.asyncio
+async def test_plain_scorer_decorator_keeps_current_dict_path() -> None:
+    @scorer
+    def plain_scorer(sample: ScorerInput) -> dict[str, object]:
+        assert sample.response == "4"
+        assert sample.target == "4"
+        assert sample.metadata["category"] == "math"
+        assert sample.config["tolerance"] == "exact"
+        return {"correct": True, "extracted": "4", "label": "exact"}
+
+    env = ByobEnvironment(
+        BenchmarkDefinition(
+            name="plain_contract",
+            dataset=_dataset,
+            prompt="{question}",
+            target_field="answer",
+            extra={"tolerance": "exact"},
+            scorer_fn=plain_scorer,
+        )
+    )
+
+    result = await env.verify("4", "4", category="math")
+
+    assert result.reward == 1.0
+    assert result.extracted_answer == "4"
+    assert result.scoring_details == {
+        "method": "byob_plain_contract",
+        "correct": True,
+        "extracted": "4",
+        "label": "exact",
+    }
+
+
+@pytest.mark.asyncio
+async def test_typed_scorer_runs_as_metric_through_byob_verify() -> None:
+    outputs = [
+        MetricOutputSpec.continuous_score("reward"),
+        MetricOutputSpec.continuous_score("format"),
+        MetricOutputSpec.label("judge_label"),
+        MetricOutputSpec.label("extracted"),
+    ]
+    sandbox = cast(Sandbox, object())
+
+    @scorer(metric_type="tests.typed_byob", outputs=outputs, config_schema=ThresholdConfig)
+    async def typed_scorer(sample: ScorerInput[ThresholdConfig]) -> dict[str, object]:
+        assert sample.response == "4"
+        assert sample.target == "4"
+        assert "answer" not in sample.metadata
+        assert sample.metadata["category"] == "math"
+        assert isinstance(sample.config, ThresholdConfig)
+        assert sample.config.threshold == 0.75
+        assert sample.sandbox is sandbox
+        return {"reward": sample.config.threshold, "format": 1.0, "judge_label": "partial", "extracted": "4"}
+
+    env = ByobEnvironment(
+        BenchmarkDefinition(
+            name="typed_contract",
+            dataset=_dataset,
+            prompt="{question}",
+            target_field="answer",
+            extra={"threshold": "0.75"},
+            scorer_fn=typed_scorer,
+        )
+    )
+
+    result = await env.verify("4", "4", sandbox=sandbox, category="math")
+
+    assert result.reward == 0.75
+    assert result.extracted_answer == "4"
+    assert result.scoring_details == {
+        "method": "byob_typed_contract",
+        "metric_type": "tests.typed_byob",
+        "outputs": {"reward": 0.75, "format": 1.0, "judge_label": "partial", "extracted": "4"},
+        "reward": 0.75,
+        "format": 1.0,
+        "judge_label": "partial",
+        "extracted": "4",
+    }
+
+
+@pytest.mark.parametrize(
+    ("score_names", "score_values", "expected_reward"),
+    [
+        (["reward", "correct"], {"reward": 0.2, "correct": 1.0}, 0.2),
+        (["quality", "correct"], {"quality": 0.2, "correct": 1.0}, 1.0),
+        (["quality", "format"], {"quality": 0.4, "format": 1.0}, 0.4),
+    ],
+)
+@pytest.mark.asyncio
+async def test_typed_scorer_reward_selection(
+    score_names: list[str], score_values: dict[str, float], expected_reward: float
+) -> None:
+    outputs = [MetricOutputSpec.continuous_score(name) for name in score_names]
+
+    @scorer(metric_type=f"tests.reward_selection.{score_names[0]}", outputs=outputs)
+    def typed_scorer(sample: ScorerInput) -> dict[str, object]:
+        assert sample.response == "4"
+        assert sample.target == "4"
+        return {name: score_values[name] for name in score_names}
+
+    env = ByobEnvironment(
+        BenchmarkDefinition(
+            name=f"typed_reward_{score_names[0]}",
+            dataset=_dataset,
+            prompt="{question}",
+            target_field="answer",
+            scorer_fn=typed_scorer,
+        )
+    )
+
+    result = await env.verify("4", "4", category="math")
+
+    assert result.reward == expected_reward
diff --git a/tests/test_environments/test_dataset_uri_parsing.py b/tests/test_environments/test_dataset_uri_parsing.py
new file mode 100644
index 000000000..c6394437e
--- /dev/null
+++ b/tests/test_environments/test_dataset_uri_parsing.py
@@ -0,0 +1,127 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""HuggingFace URI parsing in ``_load_hf``.
+
+Locks down the path-segment-with-config behavior used by Sovereign
+benchmarks (``hf://CohereForAI/Global-MMLU-Lite/en?split=test``) and the
+filter-kwarg row dropping. We patch ``datasets.load_dataset`` so the tests
+don't actually hit HuggingFace.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import patch
+
+import pytest
+
+from nemo_evaluator.environments.custom import _extract_filters, _load_hf
+
+
+@pytest.fixture
+def captured() -> dict:
+    return {"calls": []}
+
+
+@pytest.fixture
+def fake_load_dataset(captured: dict):
+    def _fake(*args, **kwargs):
+        captured["calls"].append({"args": args, "kwargs": kwargs})
+        return [{"row_id": 1, "text": "hello", "category": "x"}]
+
+    with patch("datasets.load_dataset", side_effect=_fake) as mock:
+        yield mock
+
+
+def test_query_split_simple(fake_load_dataset, captured: dict) -> None:
+    _load_hf("google/boolq?split=validation")
+    call = captured["calls"][0]
+    assert call["args"] == ("google/boolq",)
+    assert call["kwargs"]["split"] == "validation"
+
+
+def test_path_segment_config(fake_load_dataset, captured: dict) -> None:
+    """``hf://ns/name/config?split=test`` — the config is the third path segment."""
+    _load_hf("CohereForAI/Global-MMLU-Lite/en?split=test")
+    call = captured["calls"][0]
+    assert call["args"] == ("CohereForAI/Global-MMLU-Lite", "en")
+    assert call["kwargs"]["split"] == "test"
+
+
+def test_path_segment_config_and_split(fake_load_dataset, captured: dict) -> None:
+    """``hf://ns/name/config/split`` — both from path segments."""
+    _load_hf("CohereForAI/Global-MMLU-Lite/en/test")
+    call = captured["calls"][0]
+    assert call["args"] == ("CohereForAI/Global-MMLU-Lite", "en")
+    assert call["kwargs"]["split"] == "test"
+
+
+def test_query_overrides_path_split(fake_load_dataset, captured: dict) -> None:
+    _load_hf("ns/name/cfg/train?split=test")
+    call = captured["calls"][0]
+    assert call["args"] == ("ns/name", "cfg")
+    assert call["kwargs"]["split"] == "test"
+
+
+def test_query_overrides_path_config(fake_load_dataset, captured: dict) -> None:
+    _load_hf("ns/name/cfg?config=other&split=train")
+    call = captured["calls"][0]
+    assert call["args"] == ("ns/name", "other")
+    assert call["kwargs"]["split"] == "train"
+
+
+def test_num_examples_appended_to_split_when_no_slice(fake_load_dataset, captured: dict) -> None:
+    _load_hf("ds?split=test", num_examples=5)
+    assert captured["calls"][0]["kwargs"]["split"] == "test[:5]"
+
+
+def test_num_examples_respects_existing_slice(fake_load_dataset, captured: dict) -> None:
+    _load_hf("ds?split=test[:10]", num_examples=5)
+    assert captured["calls"][0]["kwargs"]["split"] == "test[:10]"
+
+
+def test_load_hf_filters_rows() -> None:
+    rows = [{"category": "a", "n": 1}, {"category": "b", "n": 2}, {"category": "a", "n": 3}]
+    with patch("datasets.load_dataset", return_value=rows):
+        result = _load_hf("ds?split=test&filter_field=category&filter_value=a")
+    assert [r["n"] for r in result] == [1, 3]
+
+
+def test_load_hf_filters_with_numeric_suffix() -> None:
+    rows = [
+        {"category": "a", "lang": "en"},
+        {"category": "a", "lang": "fr"},
+        {"category": "b", "lang": "en"},
+    ]
+    with patch("datasets.load_dataset", return_value=rows):
+        result = _load_hf(
+            "ds?split=test&filter_field=category&filter_value=a&filter_field_1=lang&filter_value_1=en"
+        )
+    assert len(result) == 1
+    assert result[0]["category"] == "a"
+    assert result[0]["lang"] == "en"
+
+
+def test_extract_filters_handles_no_filter() -> None:
+    assert _extract_filters({"split": "test"}) == []
+
+
+def test_extract_filters_pairs_and_suffix() -> None:
+    params = {
+        "filter_field": "x",
+        "filter_value": "1",
+        "filter_field_2": "y",
+        "filter_value_2": "2",
+    }
+    assert _extract_filters(params) == [("x", "1"), ("y", "2")]
diff --git a/tests/test_integration/test_eval_loop_integration.py b/tests/test_integration/test_eval_loop_integration.py
index d96098c24..ba82856a8 100644
--- a/tests/test_integration/test_eval_loop_integration.py
+++ b/tests/test_integration/test_eval_loop_integration.py
@@ -15,11 +15,16 @@
 """Integration tests: run_evaluation end-to-end with mock solver."""
 
 import asyncio
+from typing import Any
 
 
 from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult
+from nemo_evaluator.environments.custom import BenchmarkDefinition, ByobEnvironment, scorer
 from nemo_evaluator.engine.eval_loop import run_evaluation
 from nemo_evaluator.observability.types import ModelResponse
+from nemo_evaluator.scoring import ScorerInput
+from nemo_evaluator.scoring.metric import MetricOutputSpec
+from nemo_evaluator.sandbox.base import Sandbox
 from nemo_evaluator.solvers import SolveResult
 
 
@@ -38,7 +43,9 @@ async def seed(self, idx):
         r = self._dataset[idx]
         return SeedResult(prompt=r["q"], expected_answer=r["a"], metadata={"idx": idx})
 
-    async def verify(self, response, expected, **meta):
+    async def verify(
+        self, response: str, expected: str, sandbox: Sandbox | None = None, **meta: Any
+    ) -> VerifyResult:
         correct = response.strip() == expected.strip()
         return VerifyResult(
             reward=1.0 if correct else 0.0, extracted_answer=response.strip(), scoring_details={"method": "exact"}
@@ -118,7 +125,7 @@ async def tracking_close():
             closed.append(True)
             await original_close()
 
-        env.close = tracking_close
+        env.close = tracking_close  # type: ignore[method-assign]  # ty: ignore[invalid-assignment]
         solver = _MockSolver()
         asyncio.run(run_evaluation(env, solver, n_repeats=1))
         assert closed, "env.close() was not called"
@@ -191,6 +198,51 @@ def test_concurrent_execution(self):
         results = bundle["_results"]
         assert len(results) == 6
 
+    def test_typed_byob_metric_result_preserved_in_results(self):
+        outputs = [
+            MetricOutputSpec.continuous_score("reward"),
+            MetricOutputSpec.continuous_score("format"),
+            MetricOutputSpec.label("judge_label"),
+            MetricOutputSpec.label("rationale"),
+        ]
+
+        @scorer(metric_type="tests.eval_loop_typed", outputs=outputs)
+        def typed_scorer(sample: ScorerInput) -> dict[str, object]:
+            matched = sample.response == sample.target
+            return {
+                "reward": 0.8 if matched else 0.0,
+                "format": 1.0,
+                "judge_label": "pass",
+                "rationale": "answer matched",
+            }
+
+        env = ByobEnvironment(
+            BenchmarkDefinition(
+                name="typed_eval_loop",
+                dataset=lambda: [{"question": "1+1", "answer": "2"}],
+                prompt="{question}",
+                target_field="answer",
+                scorer_fn=typed_scorer,
+            )
+        )
+        solver = _MockSolver()
+
+        bundle = asyncio.run(run_evaluation(env, solver, n_repeats=1))
+
+        result = bundle["_results"][0]
+        assert result["reward"] == 0.8
+        assert result["scoring_details"]["reward"] == 0.8
+        assert result["scoring_details"]["format"] == 1.0
+        assert result["scoring_details"]["outputs"] == {
+            "reward": 0.8,
+            "format": 1.0,
+            "judge_label": "pass",
+            "rationale": "answer matched",
+        }
+        assert result["scoring_details"]["judge_label"] == "pass"
+        assert result["scoring_details"]["rationale"] == "answer matched"
+        assert result["scoring_details"]["metric_type"] == "tests.eval_loop_typed"
+
 
 class _MockSolverWithTrajectory:
     """Always correct; returns a non-empty trajectory so we can assert it survives resume."""
diff --git a/tests/test_scoring/test_metric_contract.py b/tests/test_scoring/test_metric_contract.py
new file mode 100644
index 000000000..5964b170b
--- /dev/null
+++ b/tests/test_scoring/test_metric_contract.py
@@ -0,0 +1,397 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for the shared MetricInput -> MetricResult contract."""
+
+from __future__ import annotations
+
+from typing import cast
+
+import pytest
+from pydantic import BaseModel, ValidationError
+
+from nemo_evaluator.environments.custom import scorer
+from nemo_evaluator.scoring import ScorerInput
+from nemo_evaluator.scoring.metric import (
+    CandidateOutput,
+    ContinuousScore,
+    DatasetRow,
+    Label,
+    MetricDescriptor,
+    MetricInput,
+    MetricOutput,
+    MetricOutputSpec,
+    MetricResult,
+    MetricScorerFunction,
+    ScorerFunctionMetric,
+    score_names_from_output_spec,
+    validate_metric_result,
+)
+from nemo_evaluator.sandbox.base import Sandbox
+
+
+class ThresholdConfig(BaseModel):
+    threshold: float
+    label: str = "pass"
+
+
+class OtherThresholdConfig(BaseModel):
+    threshold: float
+    label: str = "other"
+
+
+class JudgeDetails(BaseModel):
+    label: str
+    rationale: str
+    confidence: float
+
+
+def test_metric_input_groups_row_and_candidate() -> None:
+    metric_input = MetricInput(
+        row=DatasetRow(row_index=7, data={"answer": "Paris", "category": "geography"}),
+        candidate=CandidateOutput(output_text="Paris", metadata={"model": "mock"}),
+    )
+
+    assert metric_input.row.row_index == 7
+    assert metric_input.candidate.output_text == "Paris"
+    assert metric_input.row.data["answer"] == "Paris"
+    assert not hasattr(metric_input, "sandbox")
+    assert not hasattr(metric_input, "config")
+
+
+def test_metric_output_spec_convenience_constructors_and_json_schema() -> None:
+    score = MetricOutputSpec.continuous_score("reward", "Reward score")
+    label = MetricOutputSpec.label("judge_label")
+    details = MetricOutputSpec.model("judge_details", JudgeDetails)
+
+    assert score.name == "reward"
+    assert score.description == "Reward score"
+    assert score.value_schema is ContinuousScore
+    assert score.value_json_schema()["type"] == "number"
+    assert label.value_schema is Label
+    assert details.value_schema is JudgeDetails
+    schema_properties = cast(dict[str, object], details.value_json_schema()["properties"])
+    confidence_schema = cast(dict[str, object], schema_properties["confidence"])
+    assert confidence_schema["type"] == "number"
+
+
+def test_metric_output_spec_coerces_values_to_declared_schema() -> None:
+    reward = MetricOutputSpec.continuous_score("reward")
+    details = MetricOutputSpec.model("judge_details", JudgeDetails)
+
+    coerced_reward = reward.coerce_output(MetricOutput(name="reward", value=1))
+    coerced_details = details.coerce_value(
+        {"label": "pass", "rationale": "all checks passed", "confidence": 0.9}
+    )
+
+    assert isinstance(coerced_reward, ContinuousScore)
+    assert coerced_reward.root == 1.0
+    assert isinstance(coerced_details, JudgeDetails)
+    assert coerced_details.label == "pass"
+
+    with pytest.raises(ValueError, match="Expected metric output"):
+        reward.coerce_output(MetricOutput(name="other", value=1))
+
+
+@pytest.mark.asyncio
+async def test_scorer_function_metric_adapts_dict_return_to_metric_outputs() -> None:
+    outputs = [
+        MetricOutputSpec.boolean("correct"),
+        MetricOutputSpec.continuous_score("reward"),
+        MetricOutputSpec.discrete_score("attempts"),
+        MetricOutputSpec.label("extracted"),
+        MetricOutputSpec.model("judge", JudgeDetails),
+    ]
+    descriptor = MetricDescriptor(type="tests.dict_adapter", outputs=outputs)
+
+    def sync_scorer(sample: ScorerInput) -> dict[str, object]:
+        return {
+            "correct": True,
+            "reward": 0.25,
+            "attempts": 2,
+            "extracted": "A",
+            "judge": {"label": "partial", "rationale": "close", "confidence": 0.5},
+        }
+
+    metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=sync_scorer)
+
+    result = await metric.compute_scores(
+        MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(output_text="candidate"))
+    )
+
+    assert {output.name: output.value for output in result.outputs} == {
+        "correct": True,
+        "reward": 0.25,
+        "attempts": 2,
+        "extracted": "A",
+        "judge": {"label": "partial", "rationale": "close", "confidence": 0.5},
+    }
+
+
+def test_validate_metric_result_accepts_declared_outputs() -> None:
+    outputs = [
+        MetricOutputSpec.continuous_score("reward"),
+        MetricOutputSpec.boolean("correct"),
+        MetricOutputSpec.label("label"),
+    ]
+    result = MetricResult(
+        outputs=[
+            MetricOutput(name="reward", value=True),
+            MetricOutput(name="correct", value=True),
+            MetricOutput(name="label", value="yes"),
+        ]
+    )
+
+    validated = validate_metric_result(result, outputs)
+
+    assert validated.outputs == [
+        MetricOutput(name="reward", value=True),
+        MetricOutput(name="correct", value=True),
+        MetricOutput(name="label", value="yes"),
+    ]
+
+
+def test_typed_scorer_decorator_exposes_config_schema() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward")]
+
+    @scorer(metric_type="tests.threshold", outputs=outputs, config_schema=ThresholdConfig)
+    def threshold_scorer(sample: ScorerInput[ThresholdConfig]) -> dict[str, object]:
+        return {"reward": sample.config.threshold >= 0.5}
+
+    typed_scorer: MetricScorerFunction[ThresholdConfig] = threshold_scorer
+    metric: ScorerFunctionMetric[ThresholdConfig] = typed_scorer.to_metric()
+
+    assert threshold_scorer.descriptor.config_schema is ThresholdConfig
+    assert metric.type == "tests.threshold"
+
+
+def test_typed_scorer_decorator_requires_metric_type_and_outputs_for_metric_contract_options() -> None:
+    with pytest.raises(ValueError, match=r"Metric contract.*outputs=\[MetricOutputSpec"):
+        scorer(metric_type="tests.missing_outputs")  # type: ignore[reportArgumentType]  # ty: ignore[invalid-argument-type]
+
+    with pytest.raises(ValueError, match=r"Metric contract.*outputs=\[MetricOutputSpec"):
+        scorer(config_schema=ThresholdConfig)  # type: ignore[reportArgumentType]  # ty: ignore[invalid-argument-type]
+
+    with pytest.raises(ValueError, match="no metric_type was declared"):
+        scorer(outputs=[MetricOutputSpec.continuous_score("reward")])  # type: ignore[call-overload]  # ty: ignore[invalid-argument-type]
+
+
+@pytest.mark.asyncio
+async def test_scorer_function_metric_prefers_bound_target_over_row_field() -> None:
+    outputs = [MetricOutputSpec.boolean("reward")]
+    descriptor = MetricDescriptor(type="tests.bound_target", outputs=outputs)
+
+    def sync_scorer(sample: ScorerInput) -> dict[str, object]:
+        assert sample.target == "expected-from-verify"
+        assert sample.metadata["answer"] == "answer-from-row"
+        return {"reward": True}
+
+    metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=sync_scorer).bind(
+        target="expected-from-verify",
+        target_field="answer",
+    )
+
+    result = await metric.compute_scores(
+        MetricInput(
+            row=DatasetRow(data={"answer": "answer-from-row"}),
+            candidate=CandidateOutput(output_text="candidate"),
+        )
+    )
+
+    assert result.outputs == [MetricOutput(name="reward", value=True)]
+
+
+@pytest.mark.asyncio
+async def test_scorer_function_metric_accepts_typed_config_model() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward")]
+    descriptor = MetricDescriptor(type="tests.typed_config", outputs=outputs, config_schema=ThresholdConfig)
+
+    def sync_scorer(sample: ScorerInput[ThresholdConfig]) -> dict[str, object]:
+        assert isinstance(sample.config, ThresholdConfig)
+        assert sample.config.threshold == 0.5
+        assert sample.config.label == "typed"
+        return {"reward": sample.config.threshold}
+
+    metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=sync_scorer).bind(
+        config=ThresholdConfig(threshold=0.5, label="typed")
+    )
+
+    result = await metric.compute_scores(
+        MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(output_text="candidate"))
+    )
+
+    assert result.outputs == [MetricOutput(name="reward", value=0.5)]
+
+
+@pytest.mark.asyncio
+async def test_scorer_function_metric_validates_raw_config_against_typed_schema() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward")]
+    descriptor = MetricDescriptor(type="tests.raw_typed_config", outputs=outputs, config_schema=ThresholdConfig)
+
+    def sync_scorer(sample: ScorerInput[ThresholdConfig]) -> dict[str, object]:
+        assert isinstance(sample.config, ThresholdConfig)
+        assert sample.config.threshold == 0.75
+        assert sample.config.label == "pass"
+        return {"reward": sample.config.threshold}
+
+    metric = ScorerFunctionMetric(
+        descriptor=descriptor,
+        scorer_fn=sync_scorer,
+    ).bind_raw_config(config={"threshold": "0.75"})
+
+    result = await metric.compute_scores(
+        MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(output_text="candidate"))
+    )
+
+    assert result.outputs == [MetricOutput(name="reward", value=0.75)]
+
+
+def test_scorer_function_metric_rejects_invalid_typed_config() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward")]
+    descriptor = MetricDescriptor(type="tests.invalid_config", outputs=outputs, config_schema=ThresholdConfig)
+    metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=lambda sample: {"reward": True})
+
+    with pytest.raises(ValidationError):
+        metric.bind_raw_config(config={"threshold": "not-a-number"})
+
+
+def test_scorer_function_metric_bind_rejects_wrong_config_model_subtype() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward")]
+    descriptor = MetricDescriptor(type="tests.wrong_config_subtype", outputs=outputs, config_schema=ThresholdConfig)
+    metric = ScorerFunctionMetric(
+        descriptor=descriptor,
+        scorer_fn=lambda sample: {"reward": cast(ThresholdConfig, sample.config).threshold},
+    )
+
+    with pytest.raises(TypeError, match="ThresholdConfig"):
+        metric.bind(config=OtherThresholdConfig(threshold=0.75))
+
+
+def test_scorer_function_metric_bind_rejects_raw_mapping_for_typed_config() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward")]
+    descriptor = MetricDescriptor(type="tests.raw_config_on_typed_bind", outputs=outputs, config_schema=ThresholdConfig)
+    metric = ScorerFunctionMetric(
+        descriptor=descriptor,
+        scorer_fn=lambda sample: {"reward": cast(ThresholdConfig, sample.config).threshold},
+    )
+
+    with pytest.raises(TypeError, match="bind_raw_config"):
+        metric.bind(config={"threshold": 0.75})
+
+
+def test_validate_metric_result_rejects_duplicate_output_names() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward")]
+    result = MetricResult(
+        outputs=[
+            MetricOutput(name="reward", value=1.0),
+            MetricOutput(name="reward", value=0.0),
+        ]
+    )
+
+    with pytest.raises(ValueError, match="Duplicate metric output"):
+        validate_metric_result(result, outputs)
+
+
+def test_validate_metric_result_rejects_missing_declared_outputs() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward"), MetricOutputSpec.continuous_score("format")]
+    result = MetricResult(outputs=[MetricOutput(name="reward", value=1.0)])
+
+    with pytest.raises(ValueError, match="Missing declared metric outputs"):
+        validate_metric_result(result, outputs)
+
+
+def test_validate_metric_result_rejects_undeclared_outputs() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward")]
+    result = MetricResult(
+        outputs=[
+            MetricOutput(name="reward", value=1.0),
+            MetricOutput(name="format", value=1.0),
+        ]
+    )
+
+    with pytest.raises(ValueError, match="Undeclared metric outputs"):
+        validate_metric_result(result, outputs)
+
+
+def test_validate_metric_result_rejects_value_that_does_not_match_schema() -> None:
+    outputs = [MetricOutputSpec.model("judge_details", JudgeDetails)]
+    result = MetricResult(outputs=[MetricOutput(name="judge_details", value={"label": "pass"})])
+
+    with pytest.raises(ValidationError):
+        validate_metric_result(result, outputs)
+
+
+@pytest.mark.asyncio
+async def test_scorer_function_metric_executes_sync_scorers() -> None:
+    outputs = [MetricOutputSpec.boolean("reward"), MetricOutputSpec.label("label")]
+    descriptor = MetricDescriptor(type="tests.sync_metric", outputs=outputs)
+    sandbox = cast(Sandbox, object())
+
+    def sync_scorer(sample: ScorerInput) -> dict[str, object]:
+        assert sample.response == "yes"
+        assert sample.target == "yes"
+        assert sample.metadata["category"] == "boolean"
+        assert sample.config == {"mode": "strict"}
+        assert sample.sandbox is sandbox
+        return {"reward": True, "label": "matched"}
+
+    metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=sync_scorer).bind(
+        config={"mode": "strict"},
+        sandbox=sandbox,
+        target_field="answer",
+    )
+
+    result = await metric.compute_scores(
+        MetricInput(
+            row=DatasetRow(data={"answer": "yes", "category": "boolean"}),
+            candidate=CandidateOutput(output_text="yes"),
+        )
+    )
+
+    assert metric.type == "tests.sync_metric"
+    assert score_names_from_output_spec(metric.output_spec()) == ["reward"]
+    assert result.outputs == [MetricOutput(name="reward", value=True), MetricOutput(name="label", value="matched")]
+
+
+@pytest.mark.asyncio
+async def test_scorer_function_metric_executes_async_scorers() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward"), MetricOutputSpec.label("seen")]
+    descriptor = MetricDescriptor(type="tests.async_metric", outputs=outputs)
+
+    async def async_scorer(sample: ScorerInput) -> dict[str, object]:
+        return {"reward": 0.5, "seen": sample.response}
+
+    metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=async_scorer)
+
+    result = await metric.compute_scores(
+        MetricInput(row=DatasetRow(data={"answer": "yes"}), candidate=CandidateOutput(output_text="maybe"))
+    )
+
+    assert metric.type == "tests.async_metric"
+    assert score_names_from_output_spec(metric.output_spec()) == ["reward"]
+    assert result.outputs == [MetricOutput(name="reward", value=0.5), MetricOutput(name="seen", value="maybe")]
+
+
+def test_typed_scorer_decorator_exposes_descriptor_and_to_metric() -> None:
+    outputs = [MetricOutputSpec.boolean("truthful"), MetricOutputSpec.label("judge_grade")]
+
+    @scorer(metric_type="truthfulqa", outputs=outputs)
+    def truthfulqa_scorer(sample: ScorerInput) -> dict[str, object]:
+        return {"truthful": bool(sample.response), "judge_grade": "C"}
+
+    metric = truthfulqa_scorer.to_metric()
+
+    assert truthfulqa_scorer.descriptor == MetricDescriptor(type="truthfulqa", outputs=outputs)
+    assert metric.type == "truthfulqa"
+    assert score_names_from_output_spec(metric.output_spec()) == ["truthful"]
diff --git a/tests/test_scoring/test_multiple_choice.py b/tests/test_scoring/test_multiple_choice.py
new file mode 100644
index 000000000..fb8679b09
--- /dev/null
+++ b/tests/test_scoring/test_multiple_choice.py
@@ -0,0 +1,283 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for multiple-choice scorers on the shared metric contract.
+
+These tests are the proof that Kanishk PR #1's MC functionality fits Sandy's
+shared metric contract (#950) **with zero protocol-type changes**. The
+scorers expose ``descriptor`` + ``to_metric()`` via ``@scorer(outputs=...)``
+exactly like any other metric, the per-row payload (choices, logprobs)
+flows through ``MetricInput.candidate.metadata``, and the runtime invariants
+(declared vs returned, value-schema coercion) all hold.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from nemo_evaluator.scoring.metric import (
+    BooleanValue,
+    CandidateOutput,
+    ContinuousScore,
+    DatasetRow,
+    MetricInput,
+    MetricOutputSpec,
+)
+from nemo_evaluator.scoring.multiple_choice import (
+    _resolve_gold_index,
+    mcq_letter_extract,
+    multiple_choice_acc,
+)
+
+
+def _mc_input(*, target: object, choices: list[str], logprobs: list[float], is_greedy: list[bool]) -> MetricInput:
+    """Build a MetricInput shaped the way LogprobRankingSolver populates it."""
+    argmax_idx = max(range(len(logprobs)), key=lambda i: logprobs[i]) if logprobs else 0
+    return MetricInput(
+        row=DatasetRow(data={"target": target}),
+        candidate=CandidateOutput(
+            output_text=choices[argmax_idx] if choices else "",
+            metadata={
+                "_mc_choices": choices,
+                "_mc_choices_logprobs": logprobs,
+                "_mc_choices_is_greedy": is_greedy,
+            },
+        ),
+    )
+
+
+# ── Decorator + descriptor surface ──────────────────────────────────────────
+
+
+def test_multiple_choice_acc_exposes_descriptor_with_three_continuous_scores() -> None:
+    desc = multiple_choice_acc.descriptor
+    assert desc.type == "multiple_choice_acc"
+    assert [o.name for o in desc.outputs] == ["acc", "acc_norm", "acc_greedy"]
+    for output in desc.outputs:
+        assert output.value_schema is ContinuousScore
+
+
+def test_mcq_letter_extract_exposes_descriptor_with_correct_and_parsed() -> None:
+    desc = mcq_letter_extract.descriptor
+    assert desc.type == "mcq_letter_extract"
+    names = {o.name: o.value_schema for o in desc.outputs}
+    assert names == {"correct": ContinuousScore, "parsed": BooleanValue}
+
+
+def test_to_metric_materializes_satisfying_metric_protocol() -> None:
+    metric = multiple_choice_acc.to_metric()
+    assert metric.type == "multiple_choice_acc"
+    assert [o.name for o in metric.output_spec()] == ["acc", "acc_norm", "acc_greedy"]
+
+
+# ── multiple_choice_acc semantics ───────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_mc_acc_perfect_match() -> None:
+    metric = multiple_choice_acc.to_metric()
+    result = await metric.compute_scores(
+        _mc_input(
+            target=2,
+            choices=["A", "B", "Paris", "London"],
+            logprobs=[-5.1, -4.8, -1.2, -3.7],
+            is_greedy=[False, False, True, False],
+        )
+    )
+    outputs = {o.name: o.value for o in result.outputs}
+    assert outputs == {"acc": 1.0, "acc_norm": 1.0, "acc_greedy": 1.0}
+
+
+@pytest.mark.asyncio
+async def test_mc_acc_wrong_answer() -> None:
+    metric = multiple_choice_acc.to_metric()
+    result = await metric.compute_scores(
+        _mc_input(
+            target=0,
+            choices=["A", "B", "Paris", "London"],
+            logprobs=[-5.1, -4.8, -1.2, -3.7],
+            is_greedy=[False, False, True, False],
+        )
+    )
+    outputs = {o.name: o.value for o in result.outputs}
+    assert outputs == {"acc": 0.0, "acc_norm": 0.0, "acc_greedy": 0.0}
+
+
+@pytest.mark.asyncio
+async def test_mc_acc_norm_diverges_from_acc_when_choices_differ_in_length() -> None:
+    """Length normalization can flip argmax: a long choice may have lower raw
+    logprob (more tokens to multiply through) but a higher per-character one."""
+    metric = multiple_choice_acc.to_metric()
+    result = await metric.compute_scores(
+        _mc_input(
+            target=1,
+            choices=["A", "ABCDEFGHIJ"],
+            logprobs=[-1.5, -3.0],  # raw argmax = 0
+            is_greedy=[False, False],
+        )
+    )
+    # raw: -1.5 > -3.0 → argmax = 0, so acc = 0
+    # norm: -1.5/1 = -1.5 vs -3.0/10 = -0.3 → argmax = 1, so acc_norm = 1
+    outputs = {o.name: o.value for o in result.outputs}
+    assert outputs["acc"] == 0.0
+    assert outputs["acc_norm"] == 1.0
+
+
+@pytest.mark.asyncio
+async def test_mc_acc_greedy_zero_when_no_choice_is_greedy() -> None:
+    metric = multiple_choice_acc.to_metric()
+    result = await metric.compute_scores(
+        _mc_input(
+            target=2,
+            choices=["A", "B", "Paris", "London"],
+            logprobs=[-5.1, -4.8, -1.2, -3.7],
+            is_greedy=[False, False, False, False],
+        )
+    )
+    outputs = {o.name: o.value for o in result.outputs}
+    assert outputs["acc"] == 1.0
+    assert outputs["acc_greedy"] == 0.0
+
+
+@pytest.mark.asyncio
+async def test_mc_acc_returns_zeros_when_payload_missing() -> None:
+    """No choices in metadata → all metrics 0.0 (graceful degrade)."""
+    metric = multiple_choice_acc.to_metric()
+    result = await metric.compute_scores(
+        MetricInput(
+            row=DatasetRow(data={"target": 1}),
+            candidate=CandidateOutput(output_text="anything"),
+        )
+    )
+    outputs = {o.name: o.value for o in result.outputs}
+    assert outputs == {"acc": 0.0, "acc_norm": 0.0, "acc_greedy": 0.0}
+
+
+# ── _resolve_gold_index heterogeneous targets ───────────────────────────────
+
+
+@pytest.mark.parametrize(
+    ("target", "choices", "expected"),
+    [
+        (2, ["A", "B", "C", "D"], 2),
+        (-1, ["A", "B"], None),
+        (5, ["A", "B"], None),
+        ("C", ["A", "B", "C", "D"], 2),
+        ("c", ["A", "B", "C", "D"], 2),
+        ("A", ["A", "B"], 0),
+        ("Z", ["A", "B"], None),
+        ("0", ["A", "B"], 0),
+        ("3", ["A", "B"], None),
+        ("Paris", ["London", "Paris"], 1),
+        ("paris", ["London", "Paris"], 1),
+        ("Berlin", ["London", "Paris"], None),
+        ("", ["A", "B"], None),
+        (None, ["A", "B"], None),
+    ],
+)
+def test_resolve_gold_index_heterogeneous(
+    target: object, choices: list[str], expected: int | None
+) -> None:
+    assert _resolve_gold_index(target, choices) == expected
+
+
+# ── End-to-end: integration with shared contract ────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_candidate_metadata_flows_to_legacy_scorer_through_translator() -> None:
+    """Proof: ``MetricInput.candidate.metadata`` reaches the legacy scorer.
+
+    Without the merge fix in ``ScorerFunctionMetric.compute_scores``, this
+    test fails because the translator builds ``ScorerInput.metadata`` from
+    ``row.data`` only and the scorer reads zeros.
+    """
+    metric = multiple_choice_acc.to_metric()
+    result = await metric.compute_scores(
+        MetricInput(
+            row=DatasetRow(data={"target": 0, "question": "What's hot?"}),
+            candidate=CandidateOutput(
+                output_text="lava",
+                metadata={
+                    "_mc_choices": ["lava", "ice"],
+                    "_mc_choices_logprobs": [-0.1, -3.0],
+                    "_mc_choices_is_greedy": [True, False],
+                },
+            ),
+        )
+    )
+    outputs = {o.name: o.value for o in result.outputs}
+    assert outputs == {"acc": 1.0, "acc_norm": 1.0, "acc_greedy": 1.0}
+
+
+@pytest.mark.asyncio
+async def test_mcq_letter_extract_basic_letter_match() -> None:
+    metric = mcq_letter_extract.to_metric()
+    result = await metric.compute_scores(
+        MetricInput(
+            row=DatasetRow(data={"target": "B"}),
+            candidate=CandidateOutput(output_text="The answer is B."),
+        )
+    )
+    outputs = {o.name: o.value for o in result.outputs}
+    assert outputs == {"correct": 1.0, "parsed": True}
+
+
+@pytest.mark.asyncio
+async def test_mcq_letter_extract_no_letter_in_response() -> None:
+    metric = mcq_letter_extract.to_metric()
+    result = await metric.compute_scores(
+        MetricInput(
+            row=DatasetRow(data={"target": "C"}),
+            candidate=CandidateOutput(output_text="????"),
+        )
+    )
+    outputs = {o.name: o.value for o in result.outputs}
+    assert outputs == {"correct": 0.0, "parsed": False}
+
+
+@pytest.mark.asyncio
+async def test_mcq_letter_extract_int_target() -> None:
+    metric = mcq_letter_extract.to_metric()
+    result = await metric.compute_scores(
+        MetricInput(
+            row=DatasetRow(data={"target": 1}),
+            candidate=CandidateOutput(output_text="Answer: B"),
+        )
+    )
+    outputs = {o.name: o.value for o in result.outputs}
+    assert outputs == {"correct": 1.0, "parsed": True}
+
+
+@pytest.mark.asyncio
+async def test_validate_metric_result_enforces_declared_outputs() -> None:
+    """Sandy's validate_metric_result rejects extra/missing keys at runtime."""
+    from nemo_evaluator.scoring.metric import (
+        MetricDescriptor,
+        MetricOutput,
+        MetricResult,
+        validate_metric_result,
+    )
+
+    desc = MetricDescriptor(
+        type="multiple_choice_acc",
+        outputs=[
+            MetricOutputSpec.continuous_score("acc"),
+            MetricOutputSpec.continuous_score("acc_norm"),
+            MetricOutputSpec.continuous_score("acc_greedy"),
+        ],
+    )
+    bad = MetricResult(outputs=[MetricOutput(name="acc", value=1.0)])
+    with pytest.raises(ValueError, match="Missing declared metric outputs"):
+        validate_metric_result(bad, desc.outputs)
diff --git a/tests/test_solvers/test_logprob_solver.py b/tests/test_solvers/test_logprob_solver.py
new file mode 100644
index 000000000..51ddf5c5b
--- /dev/null
+++ b/tests/test_solvers/test_logprob_solver.py
@@ -0,0 +1,179 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for :class:`LogprobRankingSolver` and the response parser."""
+
+from __future__ import annotations
+
+from unittest.mock import patch
+
+import pytest
+
+from nemo_evaluator.environments.base import SeedResult
+from nemo_evaluator.solvers.base import ErrorKind
+from nemo_evaluator.solvers.logprob import (
+    LogprobRankingSolver,
+    _parse_loglikelihood_response,
+)
+
+
+# ── _parse_loglikelihood_response ───────────────────────────────────────────
+
+
+def _build_logprobs_response(
+    *,
+    tokens: list[str],
+    token_logprobs: list[float | None],
+    text_offset: list[int | None],
+    top_logprobs: list[dict[str, float] | None] | None = None,
+) -> dict:
+    """Shape an OpenAI-compatible /completions response body."""
+    return {
+        "choices": [
+            {
+                "logprobs": {
+                    "tokens": tokens,
+                    "token_logprobs": token_logprobs,
+                    "text_offset": text_offset,
+                    "top_logprobs": top_logprobs or [{tok: lp or 0.0} for tok, lp in zip(tokens, token_logprobs)],
+                }
+            }
+        ]
+    }
+
+
+def test_parse_response_sums_continuation_logprobs() -> None:
+    body = _build_logprobs_response(
+        tokens=["The", " cat", " sat"],
+        token_logprobs=[None, -0.5, -0.3],
+        text_offset=[0, 3, 7],
+    )
+    sum_lp, _ = _parse_loglikelihood_response(body, context="The")
+    assert sum_lp == pytest.approx(-0.8)
+
+
+def test_parse_response_returns_minus_inf_when_logprobs_missing() -> None:
+    body: dict = {"choices": [{"text": "x"}]}  # no logprobs field
+    sum_lp, greedy = _parse_loglikelihood_response(body, context="ctx")
+    assert sum_lp == float("-inf")
+    assert greedy is False
+
+
+def test_parse_response_is_greedy_true_when_top1_matches_tokens() -> None:
+    body = _build_logprobs_response(
+        tokens=["The", " cat"],
+        token_logprobs=[None, -0.5],
+        text_offset=[0, 3],
+        top_logprobs=[{"The": -0.1, "An": -1.5}, {" cat": -0.5, " dog": -2.0}],
+    )
+    _, greedy = _parse_loglikelihood_response(body, context="The")
+    assert greedy is True
+
+
+def test_parse_response_is_greedy_false_when_top1_differs() -> None:
+    body = _build_logprobs_response(
+        tokens=["The", " cat"],
+        token_logprobs=[None, -0.5],
+        text_offset=[0, 3],
+        top_logprobs=[{"The": -0.1}, {" dog": -0.1, " cat": -0.5}],
+    )
+    _, greedy = _parse_loglikelihood_response(body, context="The")
+    assert greedy is False
+
+
+def test_parse_response_handles_token_straddling_with_recursive_fallback() -> None:
+    """If text_offset never crosses ctx_len, walk back from end."""
+    # All tokens have offset < 3 (i.e. text_offset never reaches ctx_len),
+    # falls back to last token containing some continuation.
+    body = _build_logprobs_response(
+        tokens=["The", " cat"],
+        token_logprobs=[None, -0.4],
+        text_offset=[0, 1],  # offsets all < 5 (ctx_len)
+    )
+    sum_lp, _ = _parse_loglikelihood_response(body, context="The c")
+    # Fallback: only the last token's logprob is summed
+    assert sum_lp == pytest.approx(-0.4)
+
+
+def test_parse_response_empty_continuation_returns_minus_inf() -> None:
+    body = _build_logprobs_response(
+        tokens=["a", "b"],
+        token_logprobs=[None, None],
+        text_offset=[0, 1],
+    )
+    sum_lp, _ = _parse_loglikelihood_response(body, context="ab")
+    assert sum_lp == float("-inf")
+
+
+# ── LogprobRankingSolver.solve ──────────────────────────────────────────────
+
+
+def _seed_with_choices(choices: list[str]) -> SeedResult:
+    return SeedResult(
+        prompt="Q: 2+2 = ",
+        expected_answer="4",
+        metadata={"_mc_choices": choices, "source": "byob"},
+    )
+
+
+@pytest.mark.asyncio
+async def test_solve_ranks_choices_by_sum_logprob() -> None:
+    # Three choices; the second one wins by raw logprob sum.
+    responses = [
+        _build_logprobs_response(tokens=["Q: 2+2 = ", "3"], token_logprobs=[None, -2.5], text_offset=[0, 9]),
+        _build_logprobs_response(tokens=["Q: 2+2 = ", "4"], token_logprobs=[None, -0.5], text_offset=[0, 9]),
+        _build_logprobs_response(tokens=["Q: 2+2 = ", "5"], token_logprobs=[None, -3.0], text_offset=[0, 9]),
+    ]
+    seed = _seed_with_choices(["3", "4", "5"])
+    response_iter = iter(responses)
+
+    solver = LogprobRankingSolver(base_url="http://fake", model="fake-model")
+
+    async def fake_post(url, payload):
+        return next(response_iter)
+
+    with patch.object(solver._model_client, "_post_with_retry", side_effect=fake_post):
+        result = await solver.solve(seed)
+
+    assert result.response == "4"
+    assert result.error is None
+    assert result.scoring_details["_mc_choices"] == ["3", "4", "5"]
+    assert result.scoring_details["_mc_choices_logprobs"] == pytest.approx([-2.5, -0.5, -3.0])
+    assert len(result.scoring_details["_mc_choices_is_greedy"]) == 3
+
+
+@pytest.mark.asyncio
+async def test_solve_returns_graceful_error_when_choices_missing() -> None:
+    seed = SeedResult(prompt="anything", expected_answer="?")
+    solver = LogprobRankingSolver(base_url="http://fake", model="fake-model")
+    result = await solver.solve(seed)
+    assert result.response == ""
+    assert result.error_kind == ErrorKind.GRACEFUL
+    assert "_mc_choices" in (result.error or "")
+
+
+@pytest.mark.asyncio
+async def test_solve_propagates_inference_failures_as_infra_error() -> None:
+    seed = _seed_with_choices(["a", "b"])
+    solver = LogprobRankingSolver(base_url="http://fake", model="fake-model")
+
+    async def boom(url, payload):
+        raise RuntimeError("connection refused")
+
+    with patch.object(solver._model_client, "_post_with_retry", side_effect=boom):
+        result = await solver.solve(seed)
+
+    assert result.response == ""
+    assert result.error_kind == ErrorKind.INFRA
+    assert "connection refused" in (result.error or "")