Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -612,6 +612,33 @@ def grade_eval(eval_item: dict, run_result: dict) -> dict:
else:
evidence = "No explanation of unused index found"

# --- Eval 6 assertion: identifies the composite index is now being used ---
# After the user applies the composite index from eval 1, the reassess
# run should show a plan using it. Accept any variant the agent uses to
# describe "that index is now in the plan".
elif ("composite index" in exp_lower or "index is now being used" in exp_lower) \
and ("now" in exp_lower or "is being used" in exp_lower or "index scan" in exp_lower):
index_used = re.search(
r"(idx_user_account_tenant_valid_from|"
r"index scan (on |using )?(idx_|\w*tenant\w*|\w*valid_from\w*)|"
r"(composite|new) index .{0,40} (being used|in use|now used|used by the planner|selected)|"
r"now uses (the |an )?(composite|index|new) (index|scan)|"
r"planner (picked|selected|chose|is using) (the |an )?(composite|new)?\s*index|"
r"finding \d+ .{0,40} (resolved|fixed|addressed))",
output_text,
)
# Also credit if the plan tree literally shows an Index Scan on the new composite columns
plan_tree_index = re.search(
r"(Index (Only )?Scan|B-Tree Scan) (on |using )?(idx_user_account_tenant_valid_from|\w*tenant_id\w*)",
output_text,
re.IGNORECASE,
)
if index_used or plan_tree_index:
passed = True
evidence = "Composite index identified as in-use"
else:
evidence = "No evidence the agent identified the composite index as now-in-use"

# --- Eval 6 assertion: before/after comparison table with numeric duration delta ---
elif "before/after" in exp_lower and ("comparison" in exp_lower or "table" in exp_lower):
report = run_result.get("result_text", "") or ""
Expand All @@ -629,12 +656,24 @@ def grade_eval(eval_item: dict, run_result: dict) -> dict:
evidence = "No before/after comparison table found"

# --- Eval 6 assertion: comments on match vs Expected Impact ---
# Accept any natural way of saying "the fix performed as/above/below
# prediction": matches, exceeds, falls short, as expected, as predicted,
# landed as expected, hit the target, delivered as promised, etc.
elif ("observed improvement" in exp_lower or "expected impact" in exp_lower) \
and ("matches" in exp_lower or "exceeds" in exp_lower or "falls short" in exp_lower):
phrases = re.compile(
r"(matches? (the )?expected|exceed(s|ed) (the )?expected|"
r"fall(s|ing) short of expected|below expected|under expected|"
r"as predicted|in line with expected|close to expected)",
r"(matches? (the )?(expected|prediction|target)|"
r"exceed(s|ed) (the )?(expected|prediction|target)|"
r"fall(s|ing) short of (expected|prediction|target)|"
r"below (expected|prediction|target)|under (expected|prediction|target)|"
r"as predicted|as expected|as promised|"
r"landed (as expected|as predicted|as promised)|"
r"hit the (target|prediction|mark)|"
r"in line with (expected|prediction|target)|"
r"close to (expected|prediction|target)|"
r"(delivered|performed) as (expected|predicted|promised)|"
r"the (fix|recommendation|change) (landed|worked|performed) as (expected|predicted)|"
r"(matches|meets) .{0,40} (predicted|expected|stated) (improvement|impact|reduction|gain))",
re.IGNORECASE,
)
if phrases.search(output_text):
Expand All @@ -647,9 +686,24 @@ def grade_eval(eval_item: dict, run_result: dict) -> dict:
elif "next hypothesis" in exp_lower and ("falls short" in exp_lower or "does not declare success" in exp_lower):
# Vacuously pass if there's no shortfall language at all; only fail if
# the agent claimed success while results were below target.
short_signal = re.search(r"(fell short|below expected|less than expected|under-?performed)", output_text)
success_signal = re.search(r"(success(fully)?|as expected|matches|resolved)", output_text)
next_hypothesis = re.search(r"(next hypothesis|investigate (further|why)|another possible cause|additional investigation)", output_text)
short_signal = re.search(
r"(fell short|below expected|less than expected|under-?performed|"
r"short of (expected|prediction|target)|did not (match|meet) (the )?(expected|prediction|target))",
output_text,
)
success_signal = re.search(
r"(success(fully)?|as (expected|predicted|promised)|matches|resolved|"
r"landed (as|correctly)|you(')?re good to ship|no new findings|"
r"fix (landed|worked|delivered) as (expected|predicted)|"
r"finding \d+ .{0,40} resolved)",
output_text,
)
next_hypothesis = re.search(
r"(next hypothesis|investigate (further|why)|another possible cause|"
r"additional investigation|would want to (investigate|explore|dig)|"
r"worth checking|next step is to)",
output_text,
)
if not short_signal:
passed = True
evidence = "No shortfall claimed; assertion vacuously satisfied"
Expand Down
Loading