microsoft · davidhonig · May 12, 2026 · Jun 5, 2026 · spboyer · Jun 5, 2026
@@ -869,7 +869,7 @@ cmd/waza/              CLI entrypoint and command definitions
   tokens/              Token counting subcommand
 internal/
   config/              Configuration with functional options
-  execution/           AgentEngine interface (mock, copilot)
+  execution/           AgentEngine interface (mock, copilot, codex)
   graders/             Validator registry and built-in graders
   metrics/             Scoring metrics
   models/              Data structures (EvalSpec, TestCase, EvaluationOutcome)
@@ -893,8 +893,9 @@ config:
   max_attempts: 3          # Retry failed graders up to 3 times (default: 1, no retries)
   timeout_seconds: 300
   parallel: false
-  executor: mock          # or copilot-sdk
+  executor: mock          # or copilot-sdk, codex
   model: claude-sonnet-4-20250514
+  model_reasoning_effort: high  # codex only; none, minimal, low, medium, high, xhigh
   group_by: model          # Group results by model (or other dimension)
   instruction_files:
     - .github/instructions/project.instructions.md
@@ -1250,6 +1251,8 @@ jobs:
 | **Go Version** | 1.26 or higher |
 | **Executor** | Use `mock` executor for CI (no API keys needed) |
 | **Copilot Auth** | Required for the default `copilot-sdk` route; set `GITHUB_TOKEN` in CI. Custom providers can be configured with `COPILOT_BASE_URL` or `COPILOT_PROVIDER_BASE_URL` instead. |
+| **Codex Auth** | Only required for the `codex` executor, which uses the local Codex CLI config/auth from `~/.codex` |
+| **Codex Reasoning** | Optional `model_reasoning_effort` uses Codex's config key; common values are `none`, `minimal`, `low`, `medium`, `high`, `xhigh` |
 | **Exit Codes** | 0=success, 1=test failure, 2=config error |
 
 #### Expected Skill Structure

@@ -324,6 +324,7 @@ func initCommandE(cmd *cobra.Command, args []string, noSkill bool, flagSkillsDir
 						Description("Choose how evals are executed").
 						Options(
 							huh.NewOption("Copilot SDK — real model execution", "copilot-sdk"),
+							huh.NewOption("Codex — use ~/.codex config/auth", "codex"),
 							huh.NewOption("Mock — fast iteration, no API calls", "mock"),
 						).
 						Value(&engine),
@@ -368,6 +369,10 @@ func initCommandE(cmd *cobra.Command, args []string, noSkill bool, flagSkillsDir
 				if err := modelForm.Run(); err != nil {
 					model = projectconfig.DefaultModel
 				}
+			} else if engine == "codex" {
+				// Let Codex read the default model from ~/.codex/config.toml unless
+				// the eval later sets config.model or the user passes --model.
+				model = ""
 			}
 
 			pathsForm := huh.NewForm(

@@ -473,6 +473,9 @@ func runCommandForSpec(cmd *cobra.Command, sp skillSpecPath, defaultSkills []str
 	if err != nil {
 		return nil, fmt.Errorf("failed to load spec: %w", err)
 	}
+	if cfg, cfgErr := projectconfig.Load(filepath.Dir(specPath)); cfgErr == nil {
+		applyProjectDefaultsToEvalSpec(spec, cfg)
+	}
 
 	// CLI flags override spec config
 	if parallel {
@@ -594,9 +597,51 @@ func runCommandForSpec(cmd *cobra.Command, sp skillSpecPath, defaultSkills []str
 	return allResults, nil
 }
 
+// applyProjectDefaultsToEvalSpec copies engine, model, and reasoning-effort
+// defaults from the project configuration cfg onto spec, mutating spec in
+// place. It is a no-op when either argument is nil.
+//
+// A spec value is treated as "not chosen" when it is empty, or when it still
+// equals the built-in default while the project configuration asks for
+// something different. The model is additionally replaced when it equals the
+// built-in default model and the engine itself was just taken from the project.
+func applyProjectDefaultsToEvalSpec(spec *models.EvalSpec, cfg *projectconfig.ProjectConfig) {
+	if spec == nil || cfg == nil {
+		return
+	}
+
+	// An unset project engine falls back to the built-in default engine.
+	projectEngine := cfg.Defaults.Engine
+	if projectEngine == "" {
+		projectEngine = projectconfig.DefaultEngine
+	}
+
+	// Decide whether the spec's engine should yield to the project's engine.
+	currentEngine := spec.Config.EngineType
+	useProjectEngine := currentEngine == ""
+	if !useProjectEngine && currentEngine == projectconfig.DefaultEngine {
+		useProjectEngine = projectEngine != projectconfig.DefaultEngine
+	}
+	if useProjectEngine {
+		spec.Config.EngineType = projectEngine
+	}
+
+	// Same decision for the model; the project model may be empty, in which
+	// case the spec's model is cleared.
+	projectModel := cfg.Defaults.Model
+	currentModel := spec.Config.ModelID
+	useProjectModel := currentModel == ""
+	if !useProjectModel && currentModel == projectconfig.DefaultModel {
+		useProjectModel = useProjectEngine || projectModel != projectconfig.DefaultModel
+	}
+	if useProjectModel {
+		spec.Config.ModelID = projectModel
+	}
+
+	// Reasoning effort is only filled in when the spec leaves it empty.
+	if spec.Config.ModelReasoningEffort == "" {
+		spec.Config.ModelReasoningEffort = cfg.Defaults.ModelReasoningEffort
+	}
+}
+
+// displayModel returns the model label to print for a run. An explicit model
+// ID is returned unchanged; with no model ID, the codex engine is reported as
+// using the default from the Codex config, and any other engine yields "".
+func displayModel(cfg models.Config) string {
+	switch {
+	case cfg.ModelID != "":
+		return cfg.ModelID
+	case cfg.EngineType == "codex":
+		return "default (Codex config)"
+	default:
+		return ""
+	}
+}
+
 // runSingleModel executes a benchmark for one model and returns the outcome.
 // It prints the per-model summary and saves output for single-model runs.
 func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string, defaultSkills []string) (*models.EvaluationOutcome, error) {
+	if err := validateEngineFeatureSupport(spec, specPath); err != nil {
+		return nil, err
+	}
+
 	// Get spec directory for resolving relative paths
 	specDir := filepath.Dir(specPath)
 	if !filepath.IsAbs(specDir) {
@@ -663,6 +708,8 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
 		engine = execution.NewCopilotEngineBuilder(spec.Config.ModelID, &execution.CopilotEngineBuilderOptions{
 			NewCopilotClient: newCopilotClientFn, // if nil, uses the real function, otherwise overridable for tests.
 		}).Build()
+	case "codex":
+		engine = execution.NewCodexEngine(spec.Config.ModelID)
 	default:
 		return nil, fmt.Errorf("unknown engine type: %s", spec.Config.EngineType)
 	}
@@ -764,7 +811,7 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
 	fmt.Printf("Running benchmark: %s\n", spec.Name)
 	fmt.Printf("Skill: %s\n", spec.SkillName)
 	fmt.Printf("Engine: %s\n", spec.Config.EngineType)
-	fmt.Printf("Model: %s\n", spec.Config.ModelID)
+	fmt.Printf("Model: %s\n", displayModel(spec.Config))
 	if spec.Config.JudgeModel != "" {
 		fmt.Printf("Judge Model: %s\n", spec.Config.JudgeModel)
 	}
@@ -936,6 +983,25 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
 	return outcome, nil
 }
 
+// validateEngineFeatureSupport reports an error when spec selects the codex
+// executor together with a feature that executor cannot serve. It returns nil
+// for a nil spec and for every engine other than codex.
+//
+// Two features are rejected: graders of kind skill_invocation, and trigger
+// tests discovered in the directory containing specPath. A failure while
+// discovering trigger tests is ignored and treated as "no trigger tests".
+func validateEngineFeatureSupport(spec *models.EvalSpec, specPath string) error {
+	if spec == nil {
+		return nil
+	}
+	if spec.Config.EngineType != "codex" {
+		return nil
+	}
+
+	for _, g := range spec.Graders {
+		if g.Kind != models.GraderKindSkillInvocation {
+			continue
+		}
+		return fmt.Errorf("grader %q uses skill_invocation, which is not supported by the codex executor because Codex CLI does not emit skill invocation telemetry", g.Identifier)
+	}
+
+	// Trigger tests depend on skill invocation telemetry that the Codex CLI
+	// cannot emit. Failing here gives one clear configuration error up front
+	// rather than misleading per-prompt errors later.
+	discovered, discoverErr := trigger.Discover(filepath.Dir(specPath))
+	if discoverErr != nil || discovered == nil {
+		return nil
+	}
+	return fmt.Errorf("trigger tests are not supported by the codex executor; remove trigger.yaml or switch to a non-codex executor")
+}
+
 // printModelComparison renders a comparison table for multi-model runs.
 func printModelComparison(results []modelResult) {
 	slices.SortFunc(results, func(a, b modelResult) int {