diff --git a/assayer/judge.py b/assayer/judge.py index c141cc1..d7c8162 100644 --- a/assayer/judge.py +++ b/assayer/judge.py @@ -2,9 +2,10 @@ import json import re -import sys from dataclasses import dataclass, field +import click + from assayer.models import ModelResult from assayer.runner import _make_provider @@ -72,7 +73,7 @@ async def run_judge( ) -> JudgeResult | None: valid = [r for r in results if not r.error and r.output] if len(valid) < 2: - print("Judge skipped: fewer than 2 successful results.", file=sys.stderr) + click.echo("Judge skipped: fewer than 2 successful results.", err=True) return None judge_prompt = _build_prompt(prompt, valid, criteria) @@ -81,15 +82,15 @@ async def run_judge( try: result = await provider.run(judge_prompt) except Exception as exc: - print(f"Judge call failed: {exc}", file=sys.stderr) + click.echo(f"Judge call failed: {exc}", err=True) return None if result.error: - print(f"Judge call failed: {result.error}", file=sys.stderr) + click.echo(f"Judge call failed: {result.error}", err=True) return None try: return _parse_response(result.output) except Exception as exc: - print(f"Judge response could not be parsed: {exc}", file=sys.stderr) + click.echo(f"Judge response could not be parsed: {exc}", err=True) return None diff --git a/assayer/renderer.py b/assayer/renderer.py index 90a3755..e12f900 100644 --- a/assayer/renderer.py +++ b/assayer/renderer.py @@ -7,6 +7,7 @@ from assayer.judge import JudgeResult from assayer.models import ModelResult +from assayer.scorer import readability_stats console = Console() @@ -51,6 +52,7 @@ def render_run( if similarity is not None: render_similarity_matrix(results, similarity) + render_readability(results) if judge_result is not None: render_judge(judge_result) @@ -83,6 +85,29 @@ def render_similarity_matrix( console.print(table) +def render_readability(results: list[ModelResult]) -> None: + valid = [r for r in results if not r.error and r.output] + if not valid: + return + + table = Table(title="Readability", show_header=True) + table.add_column("Model", style="bold") + table.add_column("Words", justify="right") + table.add_column("Sentences", justify="right") + table.add_column("Avg sentence length", justify="right") + + for r in valid: + stats = readability_stats(r.output) + table.add_row( + r.model, + str(int(stats["word_count"])), + str(int(stats["sentence_count"])), + f"{stats['avg_sentence_length']:.1f}", + ) + + console.print(table) + + def render_judge(judge_result: JudgeResult) -> None: console.print(Rule("[bold]Judge[/bold]")) console.print(f"[bold green]Winner:[/bold green] {judge_result.winner}")