Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,7 @@ cmd/waza/ CLI entrypoint and command definitions
tokens/ Token counting subcommand
internal/
config/ Configuration with functional options
execution/ AgentEngine interface (mock, copilot)
execution/ AgentEngine interface (mock, copilot, codex)
graders/ Validator registry and built-in graders
metrics/ Scoring metrics
models/ Data structures (EvalSpec, TestCase, EvaluationOutcome)
Expand All @@ -893,8 +893,9 @@ config:
max_attempts: 3 # Retry failed graders up to 3 times (default: 1, no retries)
timeout_seconds: 300
parallel: false
executor: mock # or copilot-sdk
executor: mock # or copilot-sdk, codex
model: claude-sonnet-4-20250514
model_reasoning_effort: high # codex only; none, minimal, low, medium, high, xhigh
group_by: model # Group results by model (or other dimension)
instruction_files:
- .github/instructions/project.instructions.md
Expand Down Expand Up @@ -1250,6 +1251,8 @@ jobs:
| **Go Version** | 1.26 or higher |
| **Executor** | Use `mock` executor for CI (no API keys needed) |
| **Copilot Auth** | Required for the default `copilot-sdk` route; set `GITHUB_TOKEN` in CI. Custom providers can be configured with `COPILOT_BASE_URL` or `COPILOT_PROVIDER_BASE_URL` instead. |
| **Codex Auth** | Only required for `codex` executor: uses the local Codex CLI config/auth from `~/.codex` |
| **Codex Reasoning** | Optional `model_reasoning_effort` uses Codex's config key; common values are `none`, `minimal`, `low`, `medium`, `high`, `xhigh` |
| **Exit Codes** | 0=success, 1=test failure, 2=config error |

#### Expected Skill Structure
Expand Down
5 changes: 5 additions & 0 deletions cmd/waza/cmd_init.go
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,7 @@ func initCommandE(cmd *cobra.Command, args []string, noSkill bool, flagSkillsDir
Description("Choose how evals are executed").
Options(
huh.NewOption("Copilot SDK — real model execution", "copilot-sdk"),
huh.NewOption("Codex — use ~/.codex config/auth", "codex"),
huh.NewOption("Mock — fast iteration, no API calls", "mock"),
).
Value(&engine),
Expand Down Expand Up @@ -368,6 +369,10 @@ func initCommandE(cmd *cobra.Command, args []string, noSkill bool, flagSkillsDir
if err := modelForm.Run(); err != nil {
model = projectconfig.DefaultModel
}
} else if engine == "codex" {
// Let Codex read the default model from ~/.codex/config.toml unless
// the eval later sets config.model or the user passes --model.
model = ""
}

pathsForm := huh.NewForm(
Expand Down
68 changes: 67 additions & 1 deletion cmd/waza/cmd_run.go
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,9 @@ func runCommandForSpec(cmd *cobra.Command, sp skillSpecPath, defaultSkills []str
if err != nil {
return nil, fmt.Errorf("failed to load spec: %w", err)
}
if cfg, cfgErr := projectconfig.Load(filepath.Dir(specPath)); cfgErr == nil {
applyProjectDefaultsToEvalSpec(spec, cfg)
}

// CLI flags override spec config
if parallel {
Expand Down Expand Up @@ -594,9 +597,51 @@ func runCommandForSpec(cmd *cobra.Command, sp skillSpecPath, defaultSkills []str
return allResults, nil
}

// applyProjectDefaultsToEvalSpec fills engine, model, and reasoning-effort
// settings on spec from the project configuration's Defaults section.
//
// It is a no-op when either spec or cfg is nil. Values are written to
// spec.Config in place; nothing is returned.
//
// NOTE(review): a spec value equal to the built-in projectconfig default is
// treated the same as an unset value when the project config chooses something
// else, so it gets overridden by the project default.
func applyProjectDefaultsToEvalSpec(spec *models.EvalSpec, cfg *projectconfig.ProjectConfig) {
if spec == nil || cfg == nil {
return
}

// An empty project-level engine falls back to the built-in default engine.
defaultEngine := cfg.Defaults.Engine
if defaultEngine == "" {
defaultEngine = projectconfig.DefaultEngine
}

// The spec's engine counts as "defaulted" when it is empty, or when it equals
// the built-in default while the project config selects a different engine.
engineWasDefault := spec.Config.EngineType == "" ||
(spec.Config.EngineType == projectconfig.DefaultEngine && defaultEngine != projectconfig.DefaultEngine)
if engineWasDefault {
spec.Config.EngineType = defaultEngine
}

// The spec's model counts as "defaulted" when it is empty, or when it equals
// the built-in default model and either the project picks a different model or
// the engine was itself just replaced above. cfg.Defaults.Model is used as-is,
// so an empty project model leaves ModelID empty.
defaultModel := cfg.Defaults.Model
modelWasDefault := spec.Config.ModelID == "" ||
(spec.Config.ModelID == projectconfig.DefaultModel &&
(defaultModel != projectconfig.DefaultModel || engineWasDefault))
if modelWasDefault {
Comment on lines +609 to +620
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Left as advisory for the PR author to address — the override behavior is intentional per the PR description. Tracking as a follow-up improvement.

spec.Config.ModelID = defaultModel
}
// Reasoning effort is only filled in when the spec did not set one.
if spec.Config.ModelReasoningEffort == "" {
spec.Config.ModelReasoningEffort = cfg.Defaults.ModelReasoningEffort
}
}

// displayModel returns the model label to show for a run configuration.
//
// An explicitly configured ModelID is returned unchanged. When no model is set
// and the engine type is "codex", a placeholder label is returned instead of
// an empty string. Any other engine with no model yields "".
func displayModel(cfg models.Config) string {
	switch {
	case cfg.ModelID != "":
		return cfg.ModelID
	case cfg.EngineType == "codex":
		return "default (Codex config)"
	default:
		return ""
	}
}

// runSingleModel executes a benchmark for one model and returns the outcome.
// It prints the per-model summary and saves output for single-model runs.
func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string, defaultSkills []string) (*models.EvaluationOutcome, error) {
if err := validateEngineFeatureSupport(spec, specPath); err != nil {
return nil, err
}

// Get spec directory for resolving relative paths
specDir := filepath.Dir(specPath)
if !filepath.IsAbs(specDir) {
Expand Down Expand Up @@ -663,6 +708,8 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
engine = execution.NewCopilotEngineBuilder(spec.Config.ModelID, &execution.CopilotEngineBuilderOptions{
NewCopilotClient: newCopilotClientFn, // if nil, uses the real function, otherwise overridable for tests.
}).Build()
case "codex":
engine = execution.NewCodexEngine(spec.Config.ModelID)
default:
return nil, fmt.Errorf("unknown engine type: %s", spec.Config.EngineType)
}
Expand Down Expand Up @@ -764,7 +811,7 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
fmt.Printf("Running benchmark: %s\n", spec.Name)
fmt.Printf("Skill: %s\n", spec.SkillName)
fmt.Printf("Engine: %s\n", spec.Config.EngineType)
fmt.Printf("Model: %s\n", spec.Config.ModelID)
fmt.Printf("Model: %s\n", displayModel(spec.Config))
if spec.Config.JudgeModel != "" {
fmt.Printf("Judge Model: %s\n", spec.Config.JudgeModel)
}
Expand Down Expand Up @@ -936,6 +983,25 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
return outcome, nil
}

// validateEngineFeatureSupport rejects eval specs that rely on features the
// selected executor cannot provide.
//
// Only the "codex" engine is checked; a nil spec or any other engine type
// returns nil. For codex, an error is returned when any grader has kind
// models.GraderKindSkillInvocation, or when trigger.Discover finds a trigger
// spec in the directory containing specPath.
func validateEngineFeatureSupport(spec *models.EvalSpec, specPath string) error {
if spec == nil || spec.Config.EngineType != "codex" {
return nil
}
for _, grader := range spec.Graders {
Comment thread
spboyer marked this conversation as resolved.
if grader.Kind == models.GraderKindSkillInvocation {
return fmt.Errorf("grader %q uses skill_invocation, which is not supported by the codex executor because Codex CLI does not emit skill invocation telemetry", grader.Identifier)
}
}
// Reject codex when trigger tests are present — trigger tests use skill invocation
// telemetry that the Codex CLI cannot emit, producing misleading per-prompt errors
// instead of a clear upfront configuration failure.
specDir := filepath.Dir(specPath)
// A discovery error is not reported here: the check only fails when Discover
// succeeds and returns a non-nil trigger spec.
if triggerSpec, err := trigger.Discover(specDir); err == nil && triggerSpec != nil {
return fmt.Errorf("trigger tests are not supported by the codex executor; remove trigger.yaml or switch to a non-codex executor")
}
return nil
}

// printModelComparison renders a comparison table for multi-model runs.
func printModelComparison(results []modelResult) {
slices.SortFunc(results, func(a, b modelResult) int {
Expand Down
Loading