Azure · Dongbumlee · Apr 29, 2026 · Apr 29, 2026
diff --git a/docs/bundles.md b/docs/bundles.md
@@ -14,6 +14,26 @@ AgentOps ships five predefined bundles covering the most common evaluation scena
 | `agent_workflow_baseline` | Agent workflow | TaskCompletionEvaluator, ToolCallAccuracyEvaluator, IntentResolutionEvaluator, TaskAdherenceEvaluator, ToolSelectionEvaluator, ToolInputAccuracyEvaluator, avg\_latency\_seconds | Agents with tool calling |
 | `safe_agent_baseline` | Safety | ViolenceEvaluator, SexualEvaluator, SelfHarmEvaluator, HateUnfairnessEvaluator, ProtectedMaterialEvaluator, avg\_latency\_seconds | Content safety and responsible AI |
 
+## Tuning Defaults for Your Backend
+
+The shipped bundles target a **Foundry cloud-evaluation** baseline. Two defaults are
+worth reviewing for your environment:
+
+- **`avg_latency_seconds`** — measures the *full pipeline* (agent invocation **plus**
+  evaluator execution), not just the agent. For Foundry cloud evaluation this is
+  typically 15–25 s/row including the judge model. Defaults leave headroom above that
+  (30 s for most bundles, 45 s for `agent_workflow_baseline`). For HTTP or
+  local-adapter backends with light evaluators you can usually tighten this to
+  5–10 s.
+
+- **Tool-related evaluators** (`agent_workflow_baseline`) — `ToolCallAccuracyEvaluator`,
+  `ToolSelectionEvaluator`, and `ToolInputAccuracyEvaluator` only produce meaningful
+  scores when the target agent **actually exposes tool definitions** matching the
+  `tool_definitions` field in your dataset rows. Running this bundle against an
+  agent without registered tools will silently produce near-zero scores (the
+  evaluators see no tool calls to grade); use `conversational_agent_baseline`
+  instead for tool-less agents.
+
 ## Bundle YAML Structure
 
 ```yaml

diff --git a/docs/how-it-works.md b/docs/how-it-works.md
@@ -320,7 +320,7 @@ Run configs use `version: 1`.
 - `kind` — `foundry_agent` or `http`
 
 Foundry agent endpoint fields:
-- `agent_id` — Agent identifier, e.g. `my-agent:3` (name:version)
+- `agent_id` — Agent identifier, e.g. `my-agent:3` (name:version). The `:version` suffix is optional; without it the agent resolves to the **latest** version at run time, which can differ between runs — pin a version for CI / baseline comparisons.
 - `project_endpoint` — Foundry project URL (inline value)
 - `project_endpoint_env` — Env var name holding the project URL (default: `AZURE_AI_FOUNDRY_PROJECT_ENDPOINT`)
 - `api_version` — Agent Service API version

diff --git a/docs/tutorial-agent-workflow.md b/docs/tutorial-agent-workflow.md
@@ -249,7 +249,7 @@ The `agent_workflow_baseline` bundle enforces:
 | TaskAdherenceEvaluator | ≥ | 3.0 |
 | ToolSelectionEvaluator | ≥ | 3.0 |
 | ToolInputAccuracyEvaluator | ≥ | 3.0 |
-| avg_latency_seconds | ≤ | 15.0 |
+| avg_latency_seconds | ≤ | 45.0 |
 
 Scores range from 1 to 5. Adjust thresholds in `.agentops/bundles/agent_workflow_baseline.yaml`.
 

diff --git a/docs/tutorial-basic-foundry-agent.md b/docs/tutorial-basic-foundry-agent.md
@@ -76,6 +76,13 @@ After saving, you need the agent's identifier for the run config. There are two
 
 AgentOps handles both. Named agents use the Foundry Responses API; legacy agents use the Threads API.
 
+> **Versioning tip.** When you omit the `:version` suffix on a named agent
+> (e.g., `agent_id: my-agent`), Foundry resolves to the **latest** version of
+> that agent at run time. This is convenient for local iteration but can hurt
+> reproducibility — a later run could pick up a different agent revision.
+> For CI / baseline runs, pin an explicit version (e.g., `my-agent:3`) so
+> every `results.json` comes from the same agent revision and stays comparable.
+
 ## Part 2: Set up AgentOps
 
 ### 1) Azure login

diff --git a/docs/tutorial-conversational-agent.md b/docs/tutorial-conversational-agent.md
@@ -233,7 +233,7 @@ The `conversational_agent_baseline` bundle enforces:
 | FluencyEvaluator | ≥ | 3.0 |
 | RelevanceEvaluator | ≥ | 3.0 |
 | SimilarityEvaluator | ≥ | 3.0 |
-| avg_latency_seconds | ≤ | 10.0 |
+| avg_latency_seconds | ≤ | 30.0 |
 
 Scores range from 1 to 5. Adjust thresholds in `.agentops/bundles/conversational_agent_baseline.yaml` for your quality bar.
 

diff --git a/src/agentops/cli/app.py b/src/agentops/cli/app.py
@@ -380,9 +380,44 @@ def cmd_dataset_import() -> None:
 
 
 @config_app.command("validate")
-def cmd_config_validate() -> None:
-    """Validate configuration files (planned)."""
-    _planned_command("agentops config validate")
+def cmd_config_validate(
+    config: Path = typer.Option(
+        Path(".agentops/run.yaml"),
+        "-c",
+        "--config",
+        help="Path to a run.yaml file to validate.",
+    ),
+) -> None:
+    """Check that a run.yaml file loads cleanly through ``load_run_config``.
+
+    Echoes a one-line summary of the loaded config on success. A missing
+    file, a ``ValueError`` from the loader, or any other loader exception is
+    reported on stderr and the command exits with code 1.
+    """
+    from agentops.core.config_loader import load_run_config
+
+    def _abort(message: str) -> typer.Exit:
+        typer.echo(message, err=True)
+        return typer.Exit(code=1)
+
+    if not config.exists():
+        raise _abort(f"Error: config file not found: {config}")
+    try:
+        loaded = load_run_config(config)
+    except ValueError as error:
+        raise _abort(f"❌ {config} is invalid:\n{error}") from error
+    except Exception as error:  # pragma: no cover - unexpected I/O / parser error
+        raise _abort(f"Error reading {config}: {error}") from error
+
+    tgt = loaded.target
+    bundle_label = loaded.bundle.name or loaded.bundle.path
+    dataset_label = loaded.dataset.name or loaded.dataset.path
+    typer.echo(
+        f"✅ {config} is valid "
+        f"(version={loaded.version}, target.type={tgt.type}, "
+        f"target.hosting={tgt.hosting}, target.execution_mode={tgt.execution_mode}, "
+        f"bundle={bundle_label}, dataset={dataset_label})"
+    )
 
 
 @config_app.command("show")

diff --git a/src/agentops/templates/bundles/agent_workflow_baseline.yaml b/src/agentops/templates/bundles/agent_workflow_baseline.yaml
@@ -106,7 +106,7 @@ thresholds:
     value: 3
   - evaluator: avg_latency_seconds
     criteria: "<="
-    value: 15.0
+    value: 45.0
 metadata:
   category: agent-workflow
   scenario: agent_with_tools

diff --git a/src/agentops/templates/bundles/conversational_agent_baseline.yaml b/src/agentops/templates/bundles/conversational_agent_baseline.yaml
@@ -64,7 +64,7 @@ thresholds:
     value: 3
   - evaluator: avg_latency_seconds
     criteria: "<="
-    value: 10.0
+    value: 30.0
 metadata:
   category: conversational
   scenario: conversational_agent

diff --git a/src/agentops/templates/bundles/model_quality_baseline.yaml b/src/agentops/templates/bundles/model_quality_baseline.yaml
@@ -64,7 +64,7 @@ thresholds:
     value: 0.4
   - evaluator: avg_latency_seconds
     criteria: "<="
-    value: 10.0
+    value: 30.0
 metadata:
   category: model-quality
   scenario: model_direct

diff --git a/src/agentops/templates/bundles/rag_quality_baseline.yaml b/src/agentops/templates/bundles/rag_quality_baseline.yaml
@@ -79,7 +79,7 @@ thresholds:
     value: 3
   - evaluator: avg_latency_seconds
     criteria: "<="
-    value: 10.0
+    value: 30.0
 metadata:
   category: rag-quality
   scenario: rag_retrieval

diff --git a/src/agentops/templates/bundles/safe_agent_baseline.yaml b/src/agentops/templates/bundles/safe_agent_baseline.yaml
@@ -78,7 +78,7 @@ thresholds:
     value: 2
   - evaluator: avg_latency_seconds
     criteria: "<="
-    value: 10.0
+    value: 30.0
 metadata:
   category: safety
   scenario: content_safety

diff --git a/tests/unit/test_cli_commands.py b/tests/unit/test_cli_commands.py
@@ -52,3 +52,47 @@ def test_report_help_exposes_available_and_planned_commands() -> None:
     assert "generate" in stripped
     assert "show" in stripped
     assert "export" in stripped
+
+
+def test_config_validate_accepts_valid_run_yaml(tmp_path) -> None:
+    """A minimal local-target run.yaml validates with exit code 0."""
+    run_yaml = tmp_path / "run.yaml"
+    run_yaml.write_text(
+        """\
+version: 1
+target:
+  type: agent
+  hosting: local
+  execution_mode: local
+  local:
+    callable: my_module:my_fn
+bundle:
+  name: conversational_agent_baseline
+dataset:
+  name: smoke-conversational
+""",
+        encoding="utf-8",
+    )
+    outcome = runner.invoke(app, ["config", "validate", "-c", str(run_yaml)])
+
+    assert outcome.exit_code == 0, outcome.stdout + (outcome.stderr or "")
+    assert "is valid" in outcome.stdout
+
+
+def test_config_validate_reports_missing_file() -> None:
+    """A nonexistent config path exits with code 1 and a "not found" message."""
+    outcome = runner.invoke(app, ["config", "validate", "-c", "/no/such/file.yaml"])
+    assert outcome.exit_code == 1
+    captured = (outcome.stdout + (outcome.stderr or "")).lower()
+    assert "not found" in captured
+
+
+def test_config_validate_reports_invalid_schema(tmp_path) -> None:
+    """A run.yaml holding only `version` and a `backend` key exits with code 1."""
+    run_yaml = tmp_path / "run.yaml"
+    run_yaml.write_text("version: 1\nbackend: legacy\n", encoding="utf-8")
+
+    outcome = runner.invoke(app, ["config", "validate", "-c", str(run_yaml)])
+    assert outcome.exit_code == 1
+    captured = (outcome.stdout + (outcome.stderr or "")).lower()
+    assert any(marker in captured for marker in ("invalid", "backend"))