diff --git a/.gitignore b/.gitignore index 4a5e200..75a384a 100644 --- a/.gitignore +++ b/.gitignore @@ -22,7 +22,7 @@ env/ # Project specific .env -.skate/config.json +.assayer/config.json results.json results.csv .DS_Store diff --git a/README.md b/README.md index b2774a3..2c1b0e0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# assayer +# Assayer -Send a prompt to multiple language models in parallel and compare their outputs in the terminal. Useful for evaluating which model handles a given task better, measuring semantic similarity between responses, or running an LLM-as-judge evaluation — without leaving the shell. +Send a prompt to multiple language models in parallel and compare their outputs in the terminal. Useful for evaluating which model handles a given task better, measuring semantic similarity between responses, or running an LLM-as-judge evaluation - without leaving the shell. ## Installation @@ -16,13 +16,11 @@ pip install "assayer[score]" Python 3.11 or newer is required. -> **Contributing?** See [CONTRIBUTING.md](CONTRIBUTING.md) for setup, code style, and PR guidelines. - ## Supported Providers - **OpenAI**: All GPT models. -- **Anthropic**: Claude 4.5 models (Opus, Sonnet, Haiku). -- **Google Gemini**: 1.5 Pro and Flash models. +- **Anthropic**: Claude models (Opus 4.7, Sonnet 4.6, Haiku 4.5, and earlier). +- **Google Gemini**: Gemini 2.x and 3.x models. - **Ollama**: Local models running on your machine. ## Configuration @@ -57,11 +55,11 @@ assayer run "Explain recursion in one sentence." --models gpt-4o,claude-haiku-4- ## Commands -### run +### `run` ```bash assayer run "prompt" --models gpt-4o,claude-sonnet-4-5 -assayer run --prompt-file prompt.txt --models gpt-4o,ollama/llama3 +assayer run --prompt-file prompt.txt --models gpt-4o,ollama/llama3.2 assayer run "prompt" --models gpt-4o,claude-sonnet-4-5 --score assayer run "prompt" --models gpt-4o,claude-sonnet-4-5 --judge gpt-4o --judge-criteria "clarity,brevity" assayer run "prompt" --models gpt-4o,claude-sonnet-4-5 --output results.json @@ -81,8 +79,9 @@ assayer run "prompt with {var}" --models gpt-4o --var key=value | `--judge` | Model to use as judge | | `--judge-criteria` | Comma-separated criteria for the judge | | `--output` | Save results to `.json` or `.csv` | +| `--timeout` | Per-model timeout in seconds (default: 30) | -### models +### `models` ```bash assayer models list # list all supported model identifiers @@ -90,7 +89,7 @@ assayer models check # check which API keys are configured assayer models check ollama # check if Ollama is running and list local models ``` -### config +### `config` ```bash assayer config set OPENAI_API_KEY sk-... @@ -154,3 +153,11 @@ If the judge call fails, a warning is printed to stderr and the run continues no ## Export `--output results.json` saves full results as JSON. `--output results.csv` saves as CSV. The file format is determined by the extension. + +## Contributing + +Contributions are welcome. See [CONTRIBUTING.md](CONTRIBUTING.md) for setup instructions, code style, and the PR process. + +## License + +MIT - see [LICENSE](LICENSE) for details. diff --git a/examples/basic.sh b/examples/basic.sh index 46edd56..fc3bd10 100644 --- a/examples/basic.sh +++ b/examples/basic.sh @@ -5,7 +5,7 @@ assayer run "Explain recursion in one sentence." \ --models gpt-4o-mini,claude-haiku-4-5-20251001 # Use a prompt file -assayer run --prompt-file prompt.txt --models gpt-4o,gemini-1.5-flash +assayer run --prompt-file prompt.txt --models gpt-4o,gemini-2.0-flash # Template variables assayer run "Translate '{text}' to French." \ @@ -20,4 +20,4 @@ assayer run "Write a haiku about autumn." \ # Include a local Ollama model assayer run "What is the capital of France?" \ - --models gpt-4o-mini,ollama/llama3 + --models gpt-4o-mini,ollama/llama3.2 diff --git a/examples/with_judge.sh b/examples/with_judge.sh index 6cbe925..e18ff29 100644 --- a/examples/with_judge.sh +++ b/examples/with_judge.sh @@ -7,7 +7,7 @@ assayer run "Explain the difference between TCP and UDP." \ # Judge with explicit evaluation criteria assayer run "Write a product description for wireless headphones." \ - --models gpt-4o,claude-sonnet-4-5,gemini-1.5-pro \ + --models gpt-4o,claude-sonnet-4-5,gemini-2.5-flash \ --judge claude-sonnet-4-5 \ --judge-criteria "clarity,persuasiveness,brevity" diff --git a/examples/with_scoring.sh b/examples/with_scoring.sh index 2282dc6..0875673 100644 --- a/examples/with_scoring.sh +++ b/examples/with_scoring.sh @@ -2,7 +2,7 @@ # Show similarity matrix between model outputs assayer run "Describe the water cycle in two sentences." \ - --models gpt-4o,claude-sonnet-4-5,gemini-1.5-flash \ + --models gpt-4o,claude-sonnet-4-5,gemini-2.0-flash \ --score # Combine scoring with export