Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,25 @@

## 0.13.0 (unreleased)

### Shared Metric Contract

- Added public `MetricInput -> MetricResult` scorer/metric runtime types and `ScorerFunctionMetric`.
- Extended BYOB `@scorer` with typed scorer metadata and `to_metric()` while preserving current dict scorer behavior.
- Added optional `config_schema` support for typed scorer configs while keeping raw dict configs as the default.
- Split typed scorer config binding into strict `bind(config=ConfigModel(...))` and coercive `bind_raw_config(config={...})` paths.

### Multiple-Choice Loglikelihood + Few-Shot (lm-evaluation-harness parity)

Demonstrates that non-trivial benchmark machinery composes with the shared metric contract **without protocol-type changes**: the `MetricInput`, `MetricResult`, `MetricDescriptor`, and `MetricOutputSpec` shapes are untouched.

- **`@scorer`-typed `multiple_choice_acc`** in `nemo_evaluator.scoring.multiple_choice`: returns `acc`, `acc_norm`, `acc_greedy`. Reads candidate continuations + per-choice loglikelihoods from `MetricInput.candidate.metadata`.
- **`@scorer`-typed `mcq_letter_extract`**: free-form letter extraction (A-J), returns `correct` (continuous) and `parsed` (boolean).
- **`LogprobRankingSolver`** in `nemo_evaluator.solvers.logprob`: ranks candidate continuations via `/completions` (`max_tokens=0, echo=true, logprobs=1`), parses continuation spans via `text_offset`. Per-choice calls run concurrently behind `max_concurrent_choices`.
- **`@benchmark` extensions**: `choices`, `choices_field` (with dotted-path resolution), `num_fewshot`, `fewshot_split`, `fewshot_template`, `fewshot_separator`, `fewshot_seed`. Few-shot prefix is rendered in `ByobEnvironment.seed()`.
- **`_load_hf` dataset URI parsing**: path-segment configs (`hf://ns/name/config[/split]`) and row filters (`?filter_field=...&filter_value=...`). Required for namespaced multilingual datasets like `CohereForAI/Global-MMLU-Lite/en`.
- **Eval loop forwards solver `scoring_details` to `env.verify` kwargs**: lets a solver push per-row payloads to the scorer. `_metric_input_from_verify` lifts `_mc_*`/`_solver_*` namespaced keys onto `MetricInput.candidate.metadata` rather than `row.data`.
- **`ScorerFunctionMetric.compute_scores` merges `candidate.metadata` into legacy `ScorerInput.metadata`** so legacy `(ScorerInput) -> dict` scorers see solver-emitted payloads.

### Adapter Proxy (Breaking — replaces LiteLLM)

- **LiteLLM removed**: The `litellm` dependency, `proxy` and `proxy-full` extras, and `litellm_settings` config field are all removed. The adapter proxy is now built-in with zero external proxy dependencies.
Expand Down
169 changes: 169 additions & 0 deletions scripts/smoketest_logprob_solver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
"""End-to-end smoke test for LogprobRankingSolver against a fake /v1/completions.

Spins up an aiohttp server that returns OpenAI-shape responses with
deterministic logprobs derived from the continuation. This validates:

1. The HTTP wire format the solver emits (max_tokens=0, echo=true, logprobs=1).
2. The text_offset-based continuation parsing.
3. Concurrent per-choice ranking + argmax selection.
4. End-to-end seed → solve → verify through ByobEnvironment with the
typed multiple_choice_acc scorer.

Run:
python scripts/smoketest_logprob_solver.py
"""

from __future__ import annotations

import asyncio
import hashlib
import math
from typing import Any

from aiohttp import web

from nemo_evaluator.environments.custom import BenchmarkDefinition, ByobEnvironment
from nemo_evaluator.scoring.multiple_choice import multiple_choice_acc
from nemo_evaluator.solvers.logprob import LogprobRankingSolver


def _deterministic_logprob(continuation: str, *, target: str) -> float:
    """Return a reproducible fake logprob for ``continuation``.

    The fake model acts as a perfect oracle: a continuation equal to
    ``target`` scores -0.5, which is higher than anything else can get.
    Every other continuation scores between -2.0 and -5.96, derived from
    the first eight hex digits of its SHA-256 digest so the ranking is the
    same on every run.
    """
    if continuation != target:
        digest_prefix = hashlib.sha256(continuation.encode()).hexdigest()[:8]
        bucket = int(digest_prefix, 16) % 100
        return -2.0 - bucket / 25.0
    return -0.5


async def fake_completions_handler(request: web.Request, *, gold_per_prompt: dict[str, str]) -> web.Response:
    """Answer one fake ``/v1/completions`` request with synthetic logprobs.

    The JSON body must carry ``prompt`` and ``model``. The prompt is split
    into a context and a continuation by locating a known benchmark stem
    (a key of ``gold_per_prompt``) inside it; the continuation is whatever
    follows that stem. If no stem is found, the last 10 characters of the
    prompt are treated as the continuation and nothing is considered gold.

    The response mimics the OpenAI completions shape with exactly two
    tokens: a ``<ctx>`` placeholder at offset 0 (logprob ``None``) and the
    continuation at the offset where it starts inside ``prompt``.

    Args:
        request: Incoming POST request with a JSON body.
        gold_per_prompt: Maps each benchmark prompt stem to its gold
            continuation text.

    Returns:
        A JSON response with ``choices``, ``model`` and ``usage``.
    """
    body = await request.json()
    prompt = body["prompt"]

    # Pick the longest stem contained in the prompt so that a stem which
    # happens to be a substring of another one cannot shadow it. `max`
    # keeps the first candidate on ties, i.e. dict insertion order.
    matched_stem = max(
        (stem for stem in gold_per_prompt if stem in prompt),
        key=len,
        default="",
    )

    if matched_stem:
        # The stem is found by substring search, so it need not sit at the
        # start of the prompt (e.g. when a prefix precedes it). Anchor on
        # its last occurrence: the scored continuation is the prompt tail.
        ctx_end = prompt.rfind(matched_stem) + len(matched_stem)
        gold = gold_per_prompt[matched_stem]
    else:
        # Unknown prompt: fall back to the last 10 characters and keep the
        # reported offset consistent with where that slice really starts.
        ctx_end = max(len(prompt) - 10, 0)
        gold = ""
    continuation = prompt[ctx_end:]

    logprob = _deterministic_logprob(continuation, target=gold)

    # Two-token response: everything before `ctx_end` is one opaque context
    # token, the continuation is a single token starting at `ctx_end`.
    tokens = ["<ctx>", continuation]
    token_logprobs = [None, logprob]
    text_offset = [0, ctx_end]
    top_logprobs = [
        {"<ctx>": -0.01},
        {continuation: logprob, "_other": logprob - 1.0},
    ]

    return web.json_response(
        {
            "choices": [
                {
                    "text": "",
                    "finish_reason": "length",
                    "logprobs": {
                        "tokens": tokens,
                        "token_logprobs": token_logprobs,
                        "text_offset": text_offset,
                        "top_logprobs": top_logprobs,
                    },
                }
            ],
            "model": body["model"],
            # Character counts stand in for token counts in this fake.
            "usage": {"prompt_tokens": len(prompt), "completion_tokens": 0, "total_tokens": len(prompt)},
        }
    )


def make_app(gold_per_prompt: dict[str, str]) -> web.Application:
    """Create the fake completions server application.

    The returned application exposes a single route, ``POST
    /v1/completions``, which delegates to ``fake_completions_handler`` with
    ``gold_per_prompt`` bound.
    """

    async def _completions(request: web.Request) -> web.Response:
        return await fake_completions_handler(request, gold_per_prompt=gold_per_prompt)

    application = web.Application()
    application.add_routes([web.post("/v1/completions", _completions)])
    return application


async def run_smoketest() -> None:
    """Run the end-to-end seed → solve → verify smoke test.

    Starts the fake completions server on ``127.0.0.1:11999``, builds a
    three-row "capitals" benchmark scored by ``multiple_choice_acc``, runs
    every row through ``LogprobRankingSolver``, prints a per-row summary and
    asserts that every row reports ``acc == 1.0``.

    The solver and the server are released even when a step or an assertion
    fails.

    Raises:
        AssertionError: If the solver reports an error, a row's ``acc`` is
            not 1.0, or the first choice logprob of a row is not finite.
    """
    benchmark_rows = [
        # Each row: question text and the index of its gold answer in `choices`.
        {"q": "Capital of France?", "answer": 2},
        {"q": "Capital of UK?", "answer": 3},
        {"q": "Capital of Germany?", "answer": 0},
    ]
    choices = ["Berlin", "Madrid", "Paris", "London"]
    gold_per_prompt = {f"Q: {row['q']}\nA: ": choices[row["answer"]] for row in benchmark_rows}

    # Start fake server
    app = make_app(gold_per_prompt)
    runner = web.AppRunner(app)
    await runner.setup()

    try:
        # Binding happens inside the try so a failure to listen (e.g. the
        # port is already taken) still cleans the runner up.
        site = web.TCPSite(runner, "127.0.0.1", 11999)
        await site.start()

        defn = BenchmarkDefinition(
            name="capitals_smoke",
            dataset=lambda: benchmark_rows,
            prompt="Q: {q}\nA: ",
            target_field="answer",
            choices=choices,
            scorer_fn=multiple_choice_acc,
        )
        env = ByobEnvironment(defn)
        solver = LogprobRankingSolver(
            base_url="http://127.0.0.1:11999/v1",
            model="fake-mc-oracle",
        )

        try:
            print(f"running {len(benchmark_rows)} rows × {len(choices)} choices each\n")
            per_row_results = []
            for idx, row in enumerate(benchmark_rows):
                seed = await env.seed(idx)
                solve = await solver.solve(seed)
                assert solve.error is None, f"solver error: {solve.error}"

                # Same merge order as the eval loop: solver payload wins
                # over seed metadata on key collisions.
                merged_meta = {**seed.metadata, **solve.scoring_details}
                vr = await env.verify(solve.response, seed.expected_answer, **merged_meta)
                outputs = vr.scoring_details.get("outputs", {})
                per_row_results.append((row["q"], solve, vr, outputs))

            # Summary
            print(f"{'question':<22} {'argmax':<10} {'gold':<10} {'acc':<6} {'logprobs'}")
            print("─" * 96)
            all_correct = True
            for (q, solve, _vr, outputs), row in zip(per_row_results, benchmark_rows):
                argmax = solve.response
                gold = choices[row["answer"]]
                lps = solve.scoring_details["_mc_choices_logprobs"]
                lps_str = " ".join(f"{c}={lp:.2f}" for c, lp in zip(choices, lps))
                acc = outputs.get("acc", 0.0)
                if acc != 1.0:
                    all_correct = False
                print(f"{q:<22} {argmax:<10} {gold:<10} {acc:<6} {lps_str}")

            print()
            print(f"all correct: {all_correct}")
            for q, solve, _vr, outputs in per_row_results:
                assert outputs.get("acc") == 1.0, f"acc != 1.0 for {q!r}: {outputs}"
                assert math.isfinite(solve.scoring_details["_mc_choices_logprobs"][0])
            print("OK: end-to-end seed → solve → verify works through real HTTP")
        finally:
            # Close the solver even when a row or an assertion fails.
            await solver.close()
    finally:
        await runner.cleanup()


if __name__ == "__main__":
    # Script entry point: drive the async smoke test to completion.
    asyncio.run(run_smoketest())
63 changes: 52 additions & 11 deletions src/nemo_evaluator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,30 @@

__version__ = "0.12.0"

from nemo_evaluator.engine.eval_loop import run_evaluation
from nemo_evaluator.engine.model_client import ModelClient
from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult
from nemo_evaluator.environments.custom import benchmark, scorer
from nemo_evaluator.environments.registry import get_environment, list_environments, load_benchmark_file, register
from nemo_evaluator.engine.eval_loop import run_evaluation
from nemo_evaluator.engine.model_client import ModelClient
from nemo_evaluator.solvers import (
ChatSolver,
CompletionSolver,
NatSolver,
OpenClawSolver,
Solver,
SolveResult,
VLMSolver,
)
from nemo_evaluator.scoring import (
BooleanValue,
CandidateOutput,
ContinuousScore,
DatasetRow,
DiscreteScore,
Label,
Metric,
MetricDescriptor,
MetricInput,
MetricOutput,
MetricOutputSpec,
MetricResult,
MetricScorerFunction,
ScorerCallable,
ScorerConfig,
ScorerFunctionMetric,
ScorerInput,
ScorerReturn,
answer_line,
code_sandbox,
code_sandbox_async,
Expand All @@ -40,6 +48,18 @@
multichoice_regex,
needs_judge,
numeric_match,
score_names_from_output_spec,
)
from nemo_evaluator.scoring.multiple_choice import mcq_letter_extract, multiple_choice_acc
from nemo_evaluator.solvers import (
ChatSolver,
CompletionSolver,
LogprobRankingSolver,
NatSolver,
OpenClawSolver,
Solver,
SolveResult,
VLMSolver,
)

__all__ = [
Expand All @@ -57,6 +77,7 @@
"Solver",
"ChatSolver",
"CompletionSolver",
"LogprobRankingSolver",
"NatSolver",
"OpenClawSolver",
"VLMSolver",
Expand All @@ -65,6 +86,24 @@
"benchmark",
"scorer",
"ScorerInput",
"Metric",
"BooleanValue",
"DatasetRow",
"CandidateOutput",
"ContinuousScore",
"DiscreteScore",
"Label",
"MetricInput",
"MetricOutput",
"MetricOutputSpec",
"MetricDescriptor",
"MetricResult",
"MetricScorerFunction",
"ScorerCallable",
"ScorerConfig",
"ScorerFunctionMetric",
"ScorerReturn",
"score_names_from_output_spec",
# Scoring primitives
"exact_match",
"multichoice_regex",
Expand All @@ -73,5 +112,7 @@
"numeric_match",
"code_sandbox",
"code_sandbox_async",
"mcq_letter_extract",
"multiple_choice_acc",
"needs_judge",
]
9 changes: 8 additions & 1 deletion src/nemo_evaluator/engine/eval_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,11 +435,18 @@ async def _run_step(idx: int, slot: int, rep: int, seed_result, seed_ms: float):
logger.debug("p%d r%d: using pre-computed reward=%.4f", idx, rep, vr.reward)
elif not _solve_failed:
verify_sandbox = await lifecycle.get_verify_sandbox()
# Forward solver-emitted payload (e.g. logprobs, ranking
# metadata) into verify alongside seed metadata. Keys
# collide rarely in practice; solver payload wins on
# collision because it is the more recent, more
# specific source.
solver_meta = solve_result.scoring_details if solve_result else {}
merged_meta = {**seed_result.metadata, **solver_meta}
vr = await env.verify(
response_text,
seed_result.expected_answer,
sandbox=verify_sandbox,
**seed_result.metadata,
**merged_meta,
)
break # success — exit retry loop

Expand Down
Loading
Loading