diff --git a/docs/bundles.md b/docs/bundles.md index 3d90b8ac..93ca36ae 100644 --- a/docs/bundles.md +++ b/docs/bundles.md @@ -14,6 +14,26 @@ AgentOps ships five predefined bundles covering the most common evaluation scena | `agent_workflow_baseline` | Agent workflow | TaskCompletionEvaluator, ToolCallAccuracyEvaluator, IntentResolutionEvaluator, TaskAdherenceEvaluator, ToolSelectionEvaluator, ToolInputAccuracyEvaluator, avg\_latency\_seconds | Agents with tool calling | | `safe_agent_baseline` | Safety | ViolenceEvaluator, SexualEvaluator, SelfHarmEvaluator, HateUnfairnessEvaluator, ProtectedMaterialEvaluator, avg\_latency\_seconds | Content safety and responsible AI | +## Tuning Defaults for Your Backend + +The shipped bundles target a **Foundry cloud-evaluation** baseline. Two things are +worth reviewing for your environment: + +- **`avg_latency_seconds`** — measures the *full pipeline* (agent invocation **plus** + evaluator execution), not just the agent. For Foundry cloud evaluation this is + typically 15–25 s/row including the judge model. Defaults are set conservatively + (30 s for most bundles, 45 s for `agent_workflow_baseline`). For HTTP or + local-adapter backends with light evaluators you can usually tighten this to + 5–10 s. + +- **Tool-related evaluators** (`agent_workflow_baseline`) — `ToolCallAccuracyEvaluator`, + `ToolSelectionEvaluator`, and `ToolInputAccuracyEvaluator` only produce meaningful + scores when the target agent **actually exposes tool definitions** matching the + `tool_definitions` field in your dataset rows. Running this bundle against an + agent without registered tools will silently produce near-zero scores (the + evaluators see no tool calls to grade); use `conversational_agent_baseline` + instead for tool-less agents. 
+ ## Bundle YAML Structure ```yaml diff --git a/docs/how-it-works.md b/docs/how-it-works.md index 0910d359..bcf01e07 100644 --- a/docs/how-it-works.md +++ b/docs/how-it-works.md @@ -320,7 +320,7 @@ Run configs use `version: 1`. - `kind` — `foundry_agent` or `http` Foundry agent endpoint fields: -- `agent_id` — Agent identifier, e.g. `my-agent:3` (name:version) +- `agent_id` — Agent identifier, e.g. `my-agent:3` (name:version). Omitting the `:version` suffix is allowed and resolves to the **latest** version at run time, but is not deterministic across runs — pin a version for CI / baseline comparisons. - `project_endpoint` — Foundry project URL (inline value) - `project_endpoint_env` — Env var name holding the project URL (default: `AZURE_AI_FOUNDRY_PROJECT_ENDPOINT`) - `api_version` — Agent Service API version diff --git a/docs/tutorial-agent-workflow.md b/docs/tutorial-agent-workflow.md index 1cc00cb3..5b43306e 100644 --- a/docs/tutorial-agent-workflow.md +++ b/docs/tutorial-agent-workflow.md @@ -249,7 +249,7 @@ The `agent_workflow_baseline` bundle enforces: | TaskAdherenceEvaluator | ≥ | 3.0 | | ToolSelectionEvaluator | ≥ | 3.0 | | ToolInputAccuracyEvaluator | ≥ | 3.0 | -| avg_latency_seconds | ≤ | 15.0 | +| avg_latency_seconds | ≤ | 45.0 | Scores range from 1 to 5. Adjust thresholds in `.agentops/bundles/agent_workflow_baseline.yaml`. diff --git a/docs/tutorial-basic-foundry-agent.md b/docs/tutorial-basic-foundry-agent.md index 22f7ec8e..e8adab2a 100644 --- a/docs/tutorial-basic-foundry-agent.md +++ b/docs/tutorial-basic-foundry-agent.md @@ -76,6 +76,13 @@ After saving, you need the agent's identifier for the run config. There are two AgentOps handles both. Named agents use the Foundry Responses API; legacy agents use the Threads API. +> **Versioning tip.** When you omit the `:version` suffix on a named agent +> (e.g., `agent_id: my-agent`), Foundry resolves to the **latest** version of +> that agent at run time. 
This is convenient for local iteration but can hurt +> reproducibility — a later run could pick up a different agent revision. +> For CI / baseline runs, pin an explicit version (e.g., `my-agent:3`) so +> the same `results.json` can be re-generated deterministically. + ## Part 2: Set up AgentOps ### 1) Azure login diff --git a/docs/tutorial-conversational-agent.md b/docs/tutorial-conversational-agent.md index 5b49583f..b1aec399 100644 --- a/docs/tutorial-conversational-agent.md +++ b/docs/tutorial-conversational-agent.md @@ -233,7 +233,7 @@ The `conversational_agent_baseline` bundle enforces: | FluencyEvaluator | ≥ | 3.0 | | RelevanceEvaluator | ≥ | 3.0 | | SimilarityEvaluator | ≥ | 3.0 | -| avg_latency_seconds | ≤ | 10.0 | +| avg_latency_seconds | ≤ | 30.0 | Scores range from 1 to 5. Adjust thresholds in `.agentops/bundles/conversational_agent_baseline.yaml` for your quality bar. diff --git a/src/agentops/cli/app.py b/src/agentops/cli/app.py index 478536c7..e9dd1717 100644 --- a/src/agentops/cli/app.py +++ b/src/agentops/cli/app.py @@ -380,9 +380,44 @@ def cmd_dataset_import() -> None: @config_app.command("validate") -def cmd_config_validate() -> None: - """Validate configuration files (planned).""" - _planned_command("agentops config validate") +def cmd_config_validate( + config: Path = typer.Option( + Path(".agentops/run.yaml"), + "-c", + "--config", + help="Path to a run.yaml file to validate.", + ), +) -> None: + """Validate a run.yaml configuration file against the AgentOps schema. + + Loads the file via `load_run_config` (the same Pydantic models the runner uses) + and, on success, prints a one-line summary to stdout. A missing file, a loader + `ValueError`, or any other load error prints to stderr and exits with code 1. 
+ """ + from agentops.core.config_loader import load_run_config + + if not config.exists(): # report a missing path before trying to load + typer.echo(f"Error: config file not found: {config}", err=True) + raise typer.Exit(code=1) + + try: + run_cfg = load_run_config(config) + except ValueError as exc: # reported as a validation failure + typer.echo(f"❌ {config} is invalid:\n{exc}", err=True) + raise typer.Exit(code=1) from exc + except Exception as exc: # pragma: no cover - unexpected I/O / parser error + typer.echo(f"Error reading {config}: {exc}", err=True) + raise typer.Exit(code=1) from exc + + target = run_cfg.target + bundle_ref = run_cfg.bundle.name or run_cfg.bundle.path # name, else path + dataset_ref = run_cfg.dataset.name or run_cfg.dataset.path # name, else path + typer.echo( + f"✅ {config} is valid " + f"(version={run_cfg.version}, target.type={target.type}, " + f"target.hosting={target.hosting}, target.execution_mode={target.execution_mode}, " + f"bundle={bundle_ref}, dataset={dataset_ref})" + ) @config_app.command("show") diff --git a/src/agentops/templates/bundles/agent_workflow_baseline.yaml b/src/agentops/templates/bundles/agent_workflow_baseline.yaml index ea6e015b..3eb916af 100644 --- a/src/agentops/templates/bundles/agent_workflow_baseline.yaml +++ b/src/agentops/templates/bundles/agent_workflow_baseline.yaml @@ -106,7 +106,7 @@ thresholds: value: 3 - evaluator: avg_latency_seconds criteria: "<=" - value: 15.0 + value: 45.0 metadata: category: agent-workflow scenario: agent_with_tools diff --git a/src/agentops/templates/bundles/conversational_agent_baseline.yaml b/src/agentops/templates/bundles/conversational_agent_baseline.yaml index 2126df44..8cd3e129 100644 --- a/src/agentops/templates/bundles/conversational_agent_baseline.yaml +++ b/src/agentops/templates/bundles/conversational_agent_baseline.yaml @@ -64,7 +64,7 @@ thresholds: value: 3 - evaluator: avg_latency_seconds criteria: "<=" - value: 10.0 + value: 30.0 metadata: category: 
conversational scenario: conversational_agent diff --git a/src/agentops/templates/bundles/model_quality_baseline.yaml 
b/src/agentops/templates/bundles/model_quality_baseline.yaml index 9b2f2583..27fe1abd 100644 --- a/src/agentops/templates/bundles/model_quality_baseline.yaml +++ b/src/agentops/templates/bundles/model_quality_baseline.yaml @@ -64,7 +64,7 @@ thresholds: value: 0.4 - evaluator: avg_latency_seconds criteria: "<=" - value: 10.0 + value: 30.0 metadata: category: model-quality scenario: model_direct diff --git a/src/agentops/templates/bundles/rag_quality_baseline.yaml b/src/agentops/templates/bundles/rag_quality_baseline.yaml index feb8f258..4b51c36b 100644 --- a/src/agentops/templates/bundles/rag_quality_baseline.yaml +++ b/src/agentops/templates/bundles/rag_quality_baseline.yaml @@ -79,7 +79,7 @@ thresholds: value: 3 - evaluator: avg_latency_seconds criteria: "<=" - value: 10.0 + value: 30.0 metadata: category: rag-quality scenario: rag_retrieval diff --git a/src/agentops/templates/bundles/safe_agent_baseline.yaml b/src/agentops/templates/bundles/safe_agent_baseline.yaml index 36108d7f..06814927 100644 --- a/src/agentops/templates/bundles/safe_agent_baseline.yaml +++ b/src/agentops/templates/bundles/safe_agent_baseline.yaml @@ -78,7 +78,7 @@ thresholds: value: 2 - evaluator: avg_latency_seconds criteria: "<=" - value: 10.0 + value: 30.0 metadata: category: safety scenario: content_safety diff --git a/tests/unit/test_cli_commands.py b/tests/unit/test_cli_commands.py index cf77e6ce..7c6060f3 100644 --- a/tests/unit/test_cli_commands.py +++ b/tests/unit/test_cli_commands.py @@ -52,3 +52,47 @@ def test_report_help_exposes_available_and_planned_commands() -> None: assert "generate" in stripped assert "show" in stripped assert "export" in stripped + + +def test_config_validate_accepts_valid_run_yaml(tmp_path) -> None: # expects exit 0 and a summary on stdout + cfg = tmp_path / "run.yaml" + cfg.write_text( + """\ +version: 1 +target: + type: agent + hosting: local + execution_mode: local + local: + callable: my_module:my_fn +bundle: + name: conversational_agent_baseline 
+dataset: + name: smoke-conversational +""", + 
encoding="utf-8", + ) + + result = runner.invoke(app, ["config", "validate", "-c", str(cfg)]) + + assert result.exit_code == 0, result.stdout + (result.stderr or "") # CLI output doubles as the failure message + assert "is valid" in result.stdout + + +def test_config_validate_reports_missing_file() -> None: # nonexistent path must exit 1 + result = runner.invoke(app, ["config", "validate", "-c", "/no/such/file.yaml"]) + + assert result.exit_code == 1 + combined = result.stdout + (result.stderr or "") # the command echoes errors with err=True; accept either stream + assert "not found" in combined.lower() + + +def test_config_validate_reports_invalid_schema(tmp_path) -> None: # loadable file that fails validation must exit 1 + cfg = tmp_path / "run.yaml" + cfg.write_text("version: 1\nbackend: legacy\n", encoding="utf-8") # presumably rejected by the run-config schema; confirm which field triggers it + + result = runner.invoke(app, ["config", "validate", "-c", str(cfg)]) + + assert result.exit_code == 1 + combined = result.stdout + (result.stderr or "") # the command echoes errors with err=True; accept either stream + assert "invalid" in combined.lower() or "backend" in combined.lower() # either the CLI's "is invalid" text or a mention of the offending key