diff --git a/docs/bundles.md b/docs/bundles.md index 93ca36ae..3d90b8ac 100644 --- a/docs/bundles.md +++ b/docs/bundles.md @@ -14,26 +14,6 @@ AgentOps ships five predefined bundles covering the most common evaluation scena | `agent_workflow_baseline` | Agent workflow | TaskCompletionEvaluator, ToolCallAccuracyEvaluator, IntentResolutionEvaluator, TaskAdherenceEvaluator, ToolSelectionEvaluator, ToolInputAccuracyEvaluator, avg\_latency\_seconds | Agents with tool calling | | `safe_agent_baseline` | Safety | ViolenceEvaluator, SexualEvaluator, SelfHarmEvaluator, HateUnfairnessEvaluator, ProtectedMaterialEvaluator, avg\_latency\_seconds | Content safety and responsible AI | -## Tuning Defaults for Your Backend - -The shipped bundles target a **Foundry cloud-evaluation** baseline. Two thresholds are -worth reviewing for your environment: - -- **`avg_latency_seconds`** — measures the *full pipeline* (agent invocation **plus** - evaluator execution), not just the agent. For Foundry cloud evaluation this is - typically 15–25 s/row including the judge model. Defaults are set conservatively - (30 s for most bundles, 45 s for `agent_workflow_baseline`). For HTTP or - local-adapter backends with light evaluators you can usually tighten this to - 5–10 s. - -- **Tool-related evaluators** (`agent_workflow_baseline`) — `ToolCallAccuracyEvaluator`, - `ToolSelectionEvaluator`, and `ToolInputAccuracyEvaluator` only produce meaningful - scores when the target agent **actually exposes tool definitions** matching the - `tool_definitions` field in your dataset rows. Running this bundle against an - agent without registered tools will silently produce near-zero scores (the - evaluators see no tool calls to grade); use `conversational_agent_baseline` - instead for tool-less agents. - ## Bundle YAML Structure ```yaml diff --git a/docs/how-it-works.md b/docs/how-it-works.md index bcf01e07..0910d359 100644 --- a/docs/how-it-works.md +++ b/docs/how-it-works.md @@ -320,7 +320,7 @@ Run configs use `version: 1`. - `kind` — `foundry_agent` or `http` Foundry agent endpoint fields: -- `agent_id` — Agent identifier, e.g. `my-agent:3` (name:version). Omitting the `:version` suffix is allowed and resolves to the **latest** version at run time, but is not deterministic across runs — pin a version for CI / baseline comparisons. +- `agent_id` — Agent identifier, e.g. `my-agent:3` (name:version) - `project_endpoint` — Foundry project URL (inline value) - `project_endpoint_env` — Env var name holding the project URL (default: `AZURE_AI_FOUNDRY_PROJECT_ENDPOINT`) - `api_version` — Agent Service API version diff --git a/docs/tutorial-agent-workflow.md b/docs/tutorial-agent-workflow.md index 5b43306e..1cc00cb3 100644 --- a/docs/tutorial-agent-workflow.md +++ b/docs/tutorial-agent-workflow.md @@ -249,7 +249,7 @@ The `agent_workflow_baseline` bundle enforces: | TaskAdherenceEvaluator | ≥ | 3.0 | | ToolSelectionEvaluator | ≥ | 3.0 | | ToolInputAccuracyEvaluator | ≥ | 3.0 | -| avg_latency_seconds | ≤ | 45.0 | +| avg_latency_seconds | ≤ | 15.0 | Scores range from 1 to 5. Adjust thresholds in `.agentops/bundles/agent_workflow_baseline.yaml`. diff --git a/docs/tutorial-basic-foundry-agent.md b/docs/tutorial-basic-foundry-agent.md index e8adab2a..22f7ec8e 100644 --- a/docs/tutorial-basic-foundry-agent.md +++ b/docs/tutorial-basic-foundry-agent.md @@ -76,13 +76,6 @@ After saving, you need the agent's identifier for the run config. There are two AgentOps handles both. Named agents use the Foundry Responses API; legacy agents use the Threads API. -> **Versioning tip.** When you omit the `:version` suffix on a named agent -> (e.g., `agent_id: my-agent`), Foundry resolves to the **latest** version of -> that agent at run time. This is convenient for local iteration but can hurt -> reproducibility — a later run could pick up a different agent revision. -> For CI / baseline runs, pin an explicit version (e.g., `my-agent:3`) so -> the same `results.json` can be re-generated deterministically. - ## Part 2: Set up AgentOps ### 1) Azure login diff --git a/docs/tutorial-conversational-agent.md b/docs/tutorial-conversational-agent.md index b1aec399..5b49583f 100644 --- a/docs/tutorial-conversational-agent.md +++ b/docs/tutorial-conversational-agent.md @@ -233,7 +233,7 @@ The `conversational_agent_baseline` bundle enforces: | FluencyEvaluator | ≥ | 3.0 | | RelevanceEvaluator | ≥ | 3.0 | | SimilarityEvaluator | ≥ | 3.0 | -| avg_latency_seconds | ≤ | 30.0 | +| avg_latency_seconds | ≤ | 10.0 | Scores range from 1 to 5. Adjust thresholds in `.agentops/bundles/conversational_agent_baseline.yaml` for your quality bar. diff --git a/src/agentops/cli/app.py b/src/agentops/cli/app.py index e9dd1717..478536c7 100644 --- a/src/agentops/cli/app.py +++ b/src/agentops/cli/app.py @@ -380,44 +380,9 @@ def cmd_dataset_import() -> None: @config_app.command("validate") -def cmd_config_validate( - config: Path = typer.Option( - Path(".agentops/run.yaml"), - "-c", - "--config", - help="Path to a run.yaml file to validate.", - ), -) -> None: - """Validate a run.yaml configuration file against the AgentOps schema. - - Loads the file with the same Pydantic models the runner uses, prints a - summary on success, and prints a clear error message and exits with - code 1 on validation failure. - """ - from agentops.core.config_loader import load_run_config - - if not config.exists(): - typer.echo(f"Error: config file not found: {config}", err=True) - raise typer.Exit(code=1) - - try: - run_cfg = load_run_config(config) - except ValueError as exc: - typer.echo(f"❌ {config} is invalid:\n{exc}", err=True) - raise typer.Exit(code=1) from exc - except Exception as exc: # pragma: no cover - unexpected I/O / parser error - typer.echo(f"Error reading {config}: {exc}", err=True) - raise typer.Exit(code=1) from exc - - target = run_cfg.target - bundle_ref = run_cfg.bundle.name or run_cfg.bundle.path - dataset_ref = run_cfg.dataset.name or run_cfg.dataset.path - typer.echo( - f"✅ {config} is valid " - f"(version={run_cfg.version}, target.type={target.type}, " - f"target.hosting={target.hosting}, target.execution_mode={target.execution_mode}, " - f"bundle={bundle_ref}, dataset={dataset_ref})" - ) +def cmd_config_validate() -> None: + """Validate configuration files (planned).""" + _planned_command("agentops config validate") @config_app.command("show") diff --git a/src/agentops/templates/bundles/agent_workflow_baseline.yaml b/src/agentops/templates/bundles/agent_workflow_baseline.yaml index 3eb916af..ea6e015b 100644 --- a/src/agentops/templates/bundles/agent_workflow_baseline.yaml +++ b/src/agentops/templates/bundles/agent_workflow_baseline.yaml @@ -106,7 +106,7 @@ thresholds: value: 3 - evaluator: avg_latency_seconds criteria: "<=" - value: 45.0 + value: 15.0 metadata: category: agent-workflow scenario: agent_with_tools diff --git a/src/agentops/templates/bundles/conversational_agent_baseline.yaml b/src/agentops/templates/bundles/conversational_agent_baseline.yaml index 8cd3e129..2126df44 100644 --- a/src/agentops/templates/bundles/conversational_agent_baseline.yaml +++ b/src/agentops/templates/bundles/conversational_agent_baseline.yaml @@ -64,7 +64,7 @@ thresholds: value: 3 - evaluator: avg_latency_seconds criteria: "<=" - value: 30.0 + value: 10.0 metadata: category: conversational scenario: conversational_agent diff --git a/src/agentops/templates/bundles/model_quality_baseline.yaml b/src/agentops/templates/bundles/model_quality_baseline.yaml index 27fe1abd..9b2f2583 100644 --- a/src/agentops/templates/bundles/model_quality_baseline.yaml +++ b/src/agentops/templates/bundles/model_quality_baseline.yaml @@ -64,7 +64,7 @@ thresholds: value: 0.4 - evaluator: avg_latency_seconds criteria: "<=" - value: 30.0 + value: 10.0 metadata: category: model-quality scenario: model_direct diff --git a/src/agentops/templates/bundles/rag_quality_baseline.yaml b/src/agentops/templates/bundles/rag_quality_baseline.yaml index 4b51c36b..feb8f258 100644 --- a/src/agentops/templates/bundles/rag_quality_baseline.yaml +++ b/src/agentops/templates/bundles/rag_quality_baseline.yaml @@ -79,7 +79,7 @@ thresholds: value: 3 - evaluator: avg_latency_seconds criteria: "<=" - value: 30.0 + value: 10.0 metadata: category: rag-quality scenario: rag_retrieval diff --git a/src/agentops/templates/bundles/safe_agent_baseline.yaml b/src/agentops/templates/bundles/safe_agent_baseline.yaml index 06814927..36108d7f 100644 --- a/src/agentops/templates/bundles/safe_agent_baseline.yaml +++ b/src/agentops/templates/bundles/safe_agent_baseline.yaml @@ -78,7 +78,7 @@ thresholds: value: 2 - evaluator: avg_latency_seconds criteria: "<=" - value: 30.0 + value: 10.0 metadata: category: safety scenario: content_safety diff --git a/tests/unit/test_cli_commands.py b/tests/unit/test_cli_commands.py index 7c6060f3..cf77e6ce 100644 --- a/tests/unit/test_cli_commands.py +++ b/tests/unit/test_cli_commands.py @@ -52,47 +52,3 @@ def test_report_help_exposes_available_and_planned_commands() -> None: assert "generate" in stripped assert "show" in stripped assert "export" in stripped - - -def test_config_validate_accepts_valid_run_yaml(tmp_path) -> None: - cfg = tmp_path / "run.yaml" - cfg.write_text( - """\ -version: 1 -target: - type: agent - hosting: local - execution_mode: local - local: - callable: my_module:my_fn -bundle: - name: conversational_agent_baseline -dataset: - name: smoke-conversational -""", - encoding="utf-8", - ) - - result = runner.invoke(app, ["config", "validate", "-c", str(cfg)]) - - assert result.exit_code == 0, result.stdout + (result.stderr or "") - assert "is valid" in result.stdout - - -def test_config_validate_reports_missing_file() -> None: - result = runner.invoke(app, ["config", "validate", "-c", "/no/such/file.yaml"]) - - assert result.exit_code == 1 - combined = result.stdout + (result.stderr or "") - assert "not found" in combined.lower() - - -def test_config_validate_reports_invalid_schema(tmp_path) -> None: - cfg = tmp_path / "run.yaml" - cfg.write_text("version: 1\nbackend: legacy\n", encoding="utf-8") - - result = runner.invoke(app, ["config", "validate", "-c", str(cfg)]) - - assert result.exit_code == 1 - combined = result.stdout + (result.stderr or "") - assert "invalid" in combined.lower() or "backend" in combined.lower()