Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions assayer/judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

import json
import re
import sys
from dataclasses import dataclass, field

import click

from assayer.models import ModelResult
from assayer.runner import _make_provider

Expand Down Expand Up @@ -72,7 +73,7 @@ async def run_judge(
) -> JudgeResult | None:
valid = [r for r in results if not r.error and r.output]
if len(valid) < 2:
print("Judge skipped: fewer than 2 successful results.", file=sys.stderr)
click.echo("Judge skipped: fewer than 2 successful results.", err=True)
return None

judge_prompt = _build_prompt(prompt, valid, criteria)
Expand All @@ -81,15 +82,15 @@ async def run_judge(
try:
result = await provider.run(judge_prompt)
except Exception as exc:
print(f"Judge call failed: {exc}", file=sys.stderr)
click.echo(f"Judge call failed: {exc}", err=True)
return None

if result.error:
print(f"Judge call failed: {result.error}", file=sys.stderr)
click.echo(f"Judge call failed: {result.error}", err=True)
return None

try:
return _parse_response(result.output)
except Exception as exc:
print(f"Judge response could not be parsed: {exc}", file=sys.stderr)
click.echo(f"Judge response could not be parsed: {exc}", err=True)
return None
25 changes: 25 additions & 0 deletions assayer/renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from assayer.judge import JudgeResult
from assayer.models import ModelResult
from assayer.scorer import readability_stats

console = Console()

Expand Down Expand Up @@ -51,6 +52,7 @@ def render_run(

if similarity is not None:
render_similarity_matrix(results, similarity)
render_readability(results)

if judge_result is not None:
render_judge(judge_result)
Expand Down Expand Up @@ -83,6 +85,29 @@ def render_similarity_matrix(
console.print(table)


def render_readability(results: list[ModelResult]) -> None:
    """Print a "Readability" table with one row per usable model result.

    A result is usable when it has no error and a non-empty output. Each
    row shows the model name followed by the word count, sentence count
    and average sentence length reported by ``readability_stats`` for that
    output. Nothing is printed when no result is usable.
    """
    usable = [res for res in results if not (res.error or not res.output)]
    if not usable:
        return

    readability_table = Table(title="Readability", show_header=True)
    readability_table.add_column("Model", style="bold")
    # All numeric columns share the same right-justified layout.
    for heading in ("Words", "Sentences", "Avg sentence length"):
        readability_table.add_column(heading, justify="right")

    for res in usable:
        model_name = res.model
        metrics = readability_stats(res.output)
        # Counts are rendered as whole numbers; the average keeps one decimal.
        word_cell = str(int(metrics["word_count"]))
        sentence_cell = str(int(metrics["sentence_count"]))
        average_cell = f"{metrics['avg_sentence_length']:.1f}"
        readability_table.add_row(
            model_name, word_cell, sentence_cell, average_cell
        )

    console.print(readability_table)


def render_judge(judge_result: JudgeResult) -> None:
console.print(Rule("[bold]Judge[/bold]"))
console.print(f"[bold green]Winner:[/bold green] {judge_result.winner}")
Expand Down
Loading