From 3de932c68f81803a9d4f3b27524dd53a59f45439 Mon Sep 17 00:00:00 2001
From: krisztianfekete
Date: Thu, 26 Mar 2026 12:34:21 +0100
Subject: [PATCH] add bertscore eval

Add a BERTScore evaluator that scores each invocation's final response
against a reference text using contextual token embeddings and greedy
cosine-similarity matching (precision, recall, F1). The reference comes
from the required `expected` config key; `model_name` and `metric` are
optional. Also extend scripts/validate_evaluator.py to validate
requirements.txt for Python evaluators and to skip the smoke run when an
evaluator declares dependencies, since those may not be installed in the
validation environment.
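An illustrative config entry (the surrounding `evaluators:` list shape is
hypothetical; the evaluator itself only reads `expected`, `metric`, and
`model_name`):

    evaluators:
      - name: bertscore
        config:
          expected: "reference answer"
          metric: "f1"
          model_name: "distilbert-base-uncased"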
---
 evaluators/bertscore/bertscore.py     | 99 ++++++++++++++++++++++++++++
 evaluators/bertscore/evaluator.yaml   |  6 ++
 evaluators/bertscore/requirements.txt |  3 +
 scripts/validate_evaluator.py         | 14 +++
 4 files changed, 122 insertions(+)
 create mode 100644 evaluators/bertscore/bertscore.py
 create mode 100644 evaluators/bertscore/evaluator.yaml
 create mode 100644 evaluators/bertscore/requirements.txt

diff --git a/evaluators/bertscore/bertscore.py b/evaluators/bertscore/bertscore.py
new file mode 100644
index 0000000..ebce14d
--- /dev/null
+++ b/evaluators/bertscore/bertscore.py
@@ -0,0 +1,99 @@
+"""BERTScore semantic similarity evaluator.
+
+Computes BERTScore (precision, recall, F1) between each invocation's response
+and a reference text using contextual embeddings and cosine similarity.
+
+Config:
+    expected (str): Required. Reference text to compare against. If omitted, returns NOT_EVALUATED.
+    model_name (str, default "distilbert-base-uncased"): HuggingFace model for embeddings.
+    metric (str, default "f1"): Primary score component: "precision", "recall", or "f1".
+
+Usage:
+    config:
+      expected: "reference answer"
+      metric: "f1"
+"""
+
+from __future__ import annotations
+
+import sys
+
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
+
+_VALID_METRICS = ("precision", "recall", "f1")
+
+
+def _compute_bertscore(
+    candidate: str,
+    reference: str,
+    tokenizer: AutoTokenizer,
+    model: AutoModel,
+    device: torch.device,
+) -> dict[str, float]:
+    """Compute BERTScore precision, recall, and F1 for a candidate-reference pair."""
+    cand = tokenizer(candidate, return_tensors="pt", padding=True, truncation=True).to(device)
+    ref = tokenizer(reference, return_tensors="pt", padding=True, truncation=True).to(device)
+
+    with torch.no_grad():
+        cand_emb = torch.nn.functional.normalize(model(**cand).last_hidden_state, dim=-1)
+        ref_emb = torch.nn.functional.normalize(model(**ref).last_hidden_state, dim=-1)
+
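+    # The embeddings were L2-normalized above, so these batched dot products
+    # are cosine similarities between every candidate/reference token pair.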
+    sim = torch.bmm(cand_emb, ref_emb.transpose(1, 2))
+
+    # Greedy matching per the BERTScore recipe: precision averages each
+    # candidate token's best match, recall each reference token's best match.
+    precision = sim.max(dim=2)[0].mean().item()
+    recall = sim.max(dim=1)[0].mean().item()
+    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
+
+    return {"precision": precision, "recall": recall, "f1": f1}
+
+
+@evaluator
+def bertscore(input: EvalInput) -> EvalResult:
+    expected = input.config.get("expected")
+    if expected is None:
+        n = len(input.invocations)
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "missing config: expected"},
+        )
+
+    metric = input.config.get("metric", "f1")
+    if metric not in _VALID_METRICS:
+        print(f"WARNING: invalid metric '{metric}', using 'f1'", file=sys.stderr)
+        metric = "f1"
+
+    model_name = input.config.get("model_name", "distilbert-base-uncased")
+    reference = str(expected)
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModel.from_pretrained(model_name).to(device)
+    model.eval()
+
+    scores: list[float] = []
+    details_rows: list[dict] = []
+
+    for inv in input.invocations:
+        result = _compute_bertscore(inv.final_response or "", reference, tokenizer, model, device)
+        scores.append(result[metric])
+        details_rows.append({"invocation_id": inv.invocation_id, **result})
+
+    overall = sum(scores) / len(scores) if scores else 0.0
+    return EvalResult(
+        score=overall,
+        per_invocation_scores=scores,
+        details={"metric": metric, "model_name": model_name, "per_invocation": details_rows},
+    )
+
+
+if __name__ == "__main__":
+    bertscore.run()
diff --git a/evaluators/bertscore/evaluator.yaml b/evaluators/bertscore/evaluator.yaml
new file mode 100644
index 0000000..96231d8
--- /dev/null
+++ b/evaluators/bertscore/evaluator.yaml
@@ -0,0 +1,6 @@
+name: bertscore
+description: Semantic similarity scoring using BERTScore (precision, recall, F1) between response and reference text
+language: python
+entrypoint: bertscore.py
+tags: [semantic, similarity, bert, nlp]
+author: agentevals-dev
diff --git a/evaluators/bertscore/requirements.txt b/evaluators/bertscore/requirements.txt
new file mode 100644
index 0000000..99dd060
--- /dev/null
+++ b/evaluators/bertscore/requirements.txt
@@ -0,0 +1,3 @@
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch
+transformers
diff --git a/scripts/validate_evaluator.py b/scripts/validate_evaluator.py
index 13379e3..60508ee 100644
--- a/scripts/validate_evaluator.py
+++ b/scripts/validate_evaluator.py
@@ -108,6 +108,15 @@ def validate_syntax(evaluator_dir: Path, manifest: dict) -> bool:
             return False
         _ok("Imports, decorator, and explicit run() present")
 
+        req_file = evaluator_dir / "requirements.txt"
+        if req_file.exists():
+            lines = req_file.read_text().strip().splitlines()
+            valid_lines = [ln.strip() for ln in lines if ln.strip() and not ln.strip().startswith("#")]
+            if not valid_lines:
+                _fail("requirements.txt exists but contains no dependencies")
+                return False
+            _ok(f"requirements.txt found ({len(valid_lines)} requirement lines)")
+
     elif language in ("javascript", "typescript"):
         ext = Path(entrypoint).suffix
         expected = LANGUAGE_EXTENSIONS.get(language, set())
@@ -124,6 +133,11 @@ def validate_smoke_run(evaluator_dir: Path, manifest: dict) -> bool:
     """Run the evaluator with synthetic input and validate the output."""
+    req_file = evaluator_dir / "requirements.txt"
+    if req_file.exists() and req_file.read_text().strip():
+        _ok("Skipping smoke run: evaluator has requirements.txt (deps may not be installed)")
+        return True
+
     language = manifest.get("language", "python")
     entrypoint = manifest["entrypoint"]
     entry_path = evaluator_dir / entrypoint