diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b0bddb9..fa616783 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,16 @@ All notable changes to this project will be documented in this file. This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres to [Semantic Versioning](https://semver.org/). +## [0.1.8] - 2026-04-22 + +### Added +- **Pre-flight checks for `agentops eval run`** — detects common issues (missing `azure-identity` or `azure-ai-evaluation` packages, missing env vars for AI-assisted/safety evaluators, Azure credential failures, unreachable endpoints) *before* backend execution. All detectable issues are reported at once with actionable error messages and `pip install` hints. +- **`--dry-run` / `-n` flag on `eval run`** — runs pre-flight checks without executing the evaluation. Exits 0 if all checks pass, 1 otherwise. Useful for CI gating and fast feedback. +- **Credential warm-up in pre-flight** — acquires and caches the MSAL token once during pre-flight so subsequent evaluator calls don't each cold-start `az.cmd`. + +### Changed +- **Azure CLI credential timeout raised to 30s** — all `DefaultAzureCredential` instantiation sites (`eval_engine.py`, `foundry_backend.py`) now pass `process_timeout=30`. Default (10s) is insufficient for Windows `az.cmd` cold starts and was causing intermittent `AzureCliCredential: Failed to invoke the Azure CLI` errors. + ## [0.1.7] - 2026-04-21 ### Added diff --git a/src/agentops/backends/eval_engine.py b/src/agentops/backends/eval_engine.py index 544bef3e..a0a14b52 100644 --- a/src/agentops/backends/eval_engine.py +++ b/src/agentops/backends/eval_engine.py @@ -425,7 +425,12 @@ def _default_credential() -> Any: ) from exc try: - return DefaultAzureCredential(exclude_developer_cli_credential=True) + # process_timeout=30 accommodates slow Azure CLI cold starts + # (notably on Windows where az.cmd can take >10s to initialize). 
+ return DefaultAzureCredential( + exclude_developer_cli_credential=True, + process_timeout=30, + ) except Exception as exc: raise RuntimeError(_CREDENTIAL_HELP_MESSAGE) from exc diff --git a/src/agentops/backends/foundry_backend.py b/src/agentops/backends/foundry_backend.py index 1ceead82..eb95206f 100644 --- a/src/agentops/backends/foundry_backend.py +++ b/src/agentops/backends/foundry_backend.py @@ -79,7 +79,12 @@ def _acquire_token(scope: str) -> str: ) from exc try: - credential = DefaultAzureCredential(exclude_developer_cli_credential=True) + # process_timeout=30 accommodates slow Azure CLI cold starts + # (notably on Windows where az.cmd can take >10s to initialize). + credential = DefaultAzureCredential( + exclude_developer_cli_credential=True, + process_timeout=30, + ) token = credential.get_token(scope) return token.token except Exception as exc: @@ -431,7 +436,10 @@ def _invoke_model_direct(self, settings: FoundrySettings, prompt: str) -> str: "Install with: pip install 'azure-ai-projects>=2.0.1' azure-identity openai" ) from exc - credential = DefaultAzureCredential(exclude_developer_cli_credential=True) + credential = DefaultAzureCredential( + exclude_developer_cli_credential=True, + process_timeout=30, + ) project_client = AIProjectClient( endpoint=settings.project_endpoint, credential=credential, diff --git a/src/agentops/cli/app.py b/src/agentops/cli/app.py index 693ffa0f..e8c00f8c 100644 --- a/src/agentops/cli/app.py +++ b/src/agentops/cli/app.py @@ -204,6 +204,15 @@ def cmd_eval_run( report_format: Annotated[ str, typer.Option("--format", "-f", help="Report format: md, html, or all.") ] = "md", + dry_run: Annotated[ + bool, + typer.Option( + "--dry-run", + "-n", + help="Run pre-flight checks (SDK imports, env vars, credentials, " + "endpoint reachability) without executing the evaluation.", + ), + ] = False, ) -> None: """Run an evaluation defined in a run.yaml file.""" from agentops.services.runner import run_evaluation @@ -213,19 +222,27 @@ def 
cmd_eval_run( raise typer.Exit(code=1) log.debug( - "cmd_eval_run called config=%s output=%s format=%s", + "cmd_eval_run called config=%s output=%s format=%s dry_run=%s", config, output, report_format, + dry_run, ) try: run_result = run_evaluation( - config_path=config, output_override=output, report_format=report_format + config_path=config, + output_override=output, + report_format=report_format, + dry_run=dry_run, ) except Exception as exc: typer.echo(f"Error: evaluation failed: {exc}", err=True) raise typer.Exit(code=1) from exc + if dry_run: + typer.echo("Pre-flight checks passed.") + return + typer.echo(f"Evaluation output directory: {run_result.output_dir}") typer.echo(f"results.json: {run_result.results_path}") typer.echo(f"report: {run_result.report_path}") diff --git a/src/agentops/services/preflight.py b/src/agentops/services/preflight.py new file mode 100644 index 00000000..7cf13ea6 --- /dev/null +++ b/src/agentops/services/preflight.py @@ -0,0 +1,273 @@ +"""Pre-flight checks for evaluation runs. + +Validates the environment and configuration *before* backend execution so that +common issues (missing SDKs, env vars, unreachable endpoints, credential +failures) surface fast with actionable error messages rather than deep within +the evaluation pipeline. +""" + +from __future__ import annotations + +import importlib +import logging +import os +from dataclasses import dataclass, field +from urllib import error as urllib_error +from urllib import request as urllib_request + +from agentops.core.models import BundleConfig, RunConfig + +logger = logging.getLogger(__name__) + + +# Evaluator class names that need AZURE_OPENAI_ENDPOINT + AZURE_OPENAI_DEPLOYMENT. +# Sourced from _AI_ASSISTED_EVALUATORS in eval_engine.py. 
_AI_ASSISTED_EVALUATOR_CLASSES = frozenset(
    {
        "GroundednessEvaluator",
        "RelevanceEvaluator",
        "CoherenceEvaluator",
        "FluencyEvaluator",
        "SimilarityEvaluator",
        "RetrievalEvaluator",
        "ResponseCompletenessEvaluator",
        "QAEvaluator",
        "IntentResolutionEvaluator",
        "TaskAdherenceEvaluator",
        "ToolCallAccuracyEvaluator",
        "TaskCompletionEvaluator",
        "TaskNavigationEfficiencyEvaluator",
        "ToolSelectionEvaluator",
        "ToolInputAccuracyEvaluator",
        "ToolOutputUtilizationEvaluator",
        "ToolCallSuccessEvaluator",
    }
)

# Safety evaluators that need AZURE_AI_FOUNDRY_PROJECT_ENDPOINT.
_SAFETY_EVALUATOR_CLASSES = frozenset(
    {
        "ViolenceEvaluator",
        "SexualEvaluator",
        "SelfHarmEvaluator",
        "HateUnfairnessEvaluator",
        "ContentSafetyEvaluator",
        "ProtectedMaterialEvaluator",
        "CodeVulnerabilityEvaluator",
        "UngroundedAttributesEvaluator",
        "IndirectAttackEvaluator",
        "GroundednessProEvaluator",
    }
)

# Local-only evaluators that don't need Azure at all.
_LOCAL_ONLY_EVALUATORS = frozenset(
    {
        "exact_match",
        "latency_seconds",
        "avg_latency_seconds",
    }
)

# (importable module, pip distribution name) pairs probed by _check_sdk_imports.
_REQUIRED_AZURE_PACKAGES = (
    ("azure.identity", "azure-identity"),
    ("azure.ai.evaluation", "azure-ai-evaluation"),
)


@dataclass
class PreflightReport:
    """Result of running pre-flight checks.

    ``errors`` holds blocking problems and ``warnings`` holds non-blocking
    notes; the report is ``ok`` as long as ``errors`` is empty.
    """

    errors: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)

    @property
    def ok(self) -> bool:
        """Return True when no errors were recorded (warnings are allowed)."""
        return not self.errors

    def format(self) -> str:
        """Render errors (numbered) and warnings (bulleted) as text.

        Returns an empty string when there is nothing to report.
        """
        lines: list[str] = []
        if self.errors:
            lines.append("Pre-flight checks failed:")
            for i, err in enumerate(self.errors, 1):
                lines.append(f" {i}. {err}")
        if self.warnings:
            lines.append("Pre-flight warnings:")
            for warn in self.warnings:
                lines.append(f" - {warn}")
        return "\n".join(lines)


def _evaluator_class_name(ev) -> str:
    """Return the evaluator class name used for classification.

    Prefers ``config["init"]["class_name"]`` and falls back to the evaluator's
    own ``name``. A missing *or null* ``init`` entry is treated as empty so a
    config with ``init: null`` does not raise.
    """
    init_cfg = ev.config.get("init") or {}
    return init_cfg.get("class_name") or ev.name


def _needs_ai_assisted_evaluator(bundle: BundleConfig) -> bool:
    """Return True if any enabled evaluator is an AI-assisted evaluator."""
    return any(
        ev.enabled and _evaluator_class_name(ev) in _AI_ASSISTED_EVALUATOR_CLASSES
        for ev in bundle.evaluators
    )


def _needs_safety_evaluator(bundle: BundleConfig) -> bool:
    """Return True if any enabled evaluator is a safety evaluator."""
    return any(
        ev.enabled and _evaluator_class_name(ev) in _SAFETY_EVALUATOR_CLASSES
        for ev in bundle.evaluators
    )


def _needs_azure_sdk(bundle: BundleConfig) -> bool:
    """Return True if any enabled evaluator requires the Azure SDKs.

    An enabled evaluator needs Azure when its source is ``"foundry"`` or its
    name is not one of the known local-only evaluators.
    """
    return any(
        ev.enabled
        and (ev.source == "foundry" or ev.name not in _LOCAL_ONLY_EVALUATORS)
        for ev in bundle.evaluators
    )


def _check_sdk_imports(report: PreflightReport, bundle: BundleConfig) -> None:
    """Record an error for each required Azure package that cannot be imported.

    Does nothing when the bundle has no evaluator that needs Azure.
    """
    if not _needs_azure_sdk(bundle):
        return

    for module_name, package in _REQUIRED_AZURE_PACKAGES:
        try:
            importlib.import_module(module_name)
        except ImportError:
            report.errors.append(
                f"Missing dependency '{package}'. "
                f"Install with: pip install {package}"
            )


def _check_env_vars(
    report: PreflightReport,
    bundle: BundleConfig,
    run_config: RunConfig,
) -> None:
    """Record errors for environment variables the enabled evaluators need.

    AI-assisted evaluators need AZURE_OPENAI_ENDPOINT and
    AZURE_OPENAI_DEPLOYMENT; safety evaluators need
    AZURE_AI_FOUNDRY_PROJECT_ENDPOINT.
    """
    if _needs_ai_assisted_evaluator(bundle):
        # Only the deployment has a fallback (the target endpoint's model).
        # The endpoint URL isn't directly usable as AZURE_OPENAI_ENDPOINT, so
        # that variable must always be set explicitly.
        endpoint = run_config.target.endpoint
        fallback_deployment = endpoint.model if endpoint is not None else None

        missing: list[str] = []
        if not os.getenv("AZURE_OPENAI_ENDPOINT"):
            missing.append("AZURE_OPENAI_ENDPOINT")
        if not os.getenv("AZURE_OPENAI_DEPLOYMENT") and not fallback_deployment:
            missing.append("AZURE_OPENAI_DEPLOYMENT")

        if missing:
            report.errors.append(
                f"Missing required environment variable(s) for AI-assisted evaluators: "
                f"{', '.join(missing)}. "
                "Set them to your Azure OpenAI endpoint and model deployment name."
            )

    if _needs_safety_evaluator(bundle):
        if not os.getenv("AZURE_AI_FOUNDRY_PROJECT_ENDPOINT"):
            report.errors.append(
                "Missing required environment variable 'AZURE_AI_FOUNDRY_PROJECT_ENDPOINT' "
                "for safety evaluators. Set it to your Foundry project endpoint URL."
            )


def _check_credentials(report: PreflightReport, bundle: BundleConfig) -> None:
    """Try to acquire an Azure token and record an error if that fails.

    Skipped when the bundle needs no Azure SDK, or when a missing
    'azure-identity' dependency has already been reported.
    """
    if not _needs_azure_sdk(bundle):
        return

    # Skip if SDK imports already failed — no point trying to authenticate.
    if any("azure-identity" in e for e in report.errors):
        return

    try:
        from azure.identity import DefaultAzureCredential
    except ImportError:
        return  # already reported

    try:
        # process_timeout=30 accommodates slow Azure CLI cold starts
        # (notably on Windows where az.cmd can take >10s to initialize).
        credential = DefaultAzureCredential(
            exclude_developer_cli_credential=True,
            process_timeout=30,
        )
        # Warm up the token cache. This also catches credential failures
        # early, before any evaluator tries to authenticate.
        credential.get_token("https://cognitiveservices.azure.com/.default")
    except Exception as exc:  # noqa: BLE001 — surface any credential error
        report.errors.append(
            f"Azure credential check failed: {exc}. "
            "Run 'az login' or configure AZURE_CLIENT_ID/AZURE_TENANT_ID/"
            "AZURE_CLIENT_SECRET for service-principal auth. "
            "See https://aka.ms/azsdk/python/identity/defaultazurecredential/troubleshoot"
        )


def _check_endpoint_reachable(
    report: PreflightReport, run_config: RunConfig
) -> None:
    """Probe the remote target endpoint with an HTTP HEAD request.

    Only runs for ``execution_mode == "remote"`` targets whose URL can be
    resolved from the config or the referenced environment variable.
    Connection-level failures are errors; HTTP 5xx responses and unexpected
    probe failures are warnings; other HTTP error statuses are ignored.
    """
    if run_config.target.execution_mode != "remote":
        return

    endpoint = run_config.target.endpoint
    if endpoint is None:
        return

    url: str | None = None
    if endpoint.kind == "http":
        url = endpoint.url
        if not url and endpoint.url_env:
            url = os.getenv(endpoint.url_env)
    elif endpoint.kind == "foundry_agent":
        url = endpoint.project_endpoint
        if not url and endpoint.project_endpoint_env:
            url = os.getenv(endpoint.project_endpoint_env)

    if not url:
        return  # endpoint resolution will fail later with a clearer message

    try:
        req = urllib_request.Request(url, method="HEAD")
        # 10s is enough for a HEAD probe; longer hints at a real problem.
        # The response is opened in a ``with`` block so the underlying
        # connection is closed right away instead of being left to the GC.
        with urllib_request.urlopen(req, timeout=10):  # noqa: S310 — scheme validated by config
            pass
    except urllib_error.HTTPError as exc:
        # 4xx/5xx still means the endpoint is reachable; only unreachable
        # hosts or connection errors are preflight failures.
        if exc.code >= 500:
            report.warnings.append(
                f"Endpoint reachability: {url} returned HTTP {exc.code}."
            )
    except urllib_error.URLError as exc:
        report.errors.append(
            f"Endpoint unreachable: {url} ({exc.reason}). "
            "Check the URL, network connectivity, and DNS resolution."
        )
    except Exception as exc:  # noqa: BLE001
        report.warnings.append(f"Endpoint reachability check skipped: {exc}")


def run_preflight_checks(
    run_config: RunConfig, bundle_config: BundleConfig
) -> PreflightReport:
    """Run all pre-flight checks and return a collected report.

    Checks run in order but do not short-circuit — all detectable issues are
    reported at once so the user can fix everything in a single pass.
    """
    report = PreflightReport()
    _check_sdk_imports(report, bundle_config)
    _check_env_vars(report, bundle_config, run_config)
    _check_credentials(report, bundle_config)
    _check_endpoint_reachable(report, run_config)
    return report
+ return EvalRunServiceResult( + output_dir=run_config_path.parent, + results_path=run_config_path.parent, + report_path=run_config_path.parent, + exit_code=0, + ) + output_dir = ( output_override.resolve() if output_override is not None diff --git a/tests/unit/test_preflight.py b/tests/unit/test_preflight.py new file mode 100644 index 00000000..67f7d6cd --- /dev/null +++ b/tests/unit/test_preflight.py @@ -0,0 +1,265 @@ +"""Tests for pre-flight checks.""" + +from __future__ import annotations + +import os +from unittest import mock + +import pytest + +from agentops.core.models import ( + BundleConfig, + BundleRef, + DatasetRef, + LocalAdapterConfig, + RunConfig, + TargetConfig, + TargetEndpointConfig, +) +from agentops.services.preflight import ( + PreflightReport, + _needs_ai_assisted_evaluator, + _needs_azure_sdk, + _needs_safety_evaluator, + run_preflight_checks, +) + + +def _local_run_config() -> RunConfig: + return RunConfig( + version=1, + target=TargetConfig( + type="model", + hosting="local", + execution_mode="local", + local=LocalAdapterConfig(adapter="python fake.py"), + ), + bundle=BundleRef(name="fake"), + dataset=DatasetRef(name="fake"), + ) + + +def _remote_http_run_config(url: str = "https://example.invalid/agent") -> RunConfig: + return RunConfig( + version=1, + target=TargetConfig( + type="agent", + hosting="containerapps", + execution_mode="remote", + endpoint=TargetEndpointConfig(kind="http", url=url), + ), + bundle=BundleRef(name="fake"), + dataset=DatasetRef(name="fake"), + ) + + +def _local_bundle() -> BundleConfig: + return BundleConfig( + version=1, + name="local_only", + evaluators=[ + {"name": "exact_match", "source": "local", "enabled": True}, + ], + ) + + +def _ai_assisted_bundle() -> BundleConfig: + return BundleConfig( + version=1, + name="rag_quality", + evaluators=[ + {"name": "RelevanceEvaluator", "source": "foundry", "enabled": True}, + ], + ) + + +def _safety_bundle() -> BundleConfig: + return BundleConfig( + version=1, + 
name="safety", + evaluators=[ + {"name": "ViolenceEvaluator", "source": "foundry", "enabled": True}, + ], + ) + + +def test_preflight_report_ok_when_empty() -> None: + report = PreflightReport() + assert report.ok is True + + +def test_preflight_report_format_lists_errors() -> None: + report = PreflightReport(errors=["error 1", "error 2"]) + text = report.format() + assert "Pre-flight checks failed" in text + assert "1. error 1" in text + assert "2. error 2" in text + + +def test_needs_azure_sdk_false_for_local_only() -> None: + assert _needs_azure_sdk(_local_bundle()) is False + + +def test_needs_azure_sdk_true_for_foundry() -> None: + assert _needs_azure_sdk(_ai_assisted_bundle()) is True + + +def test_needs_ai_assisted_true_for_ai_class() -> None: + assert _needs_ai_assisted_evaluator(_ai_assisted_bundle()) is True + + +def test_needs_ai_assisted_false_for_safety() -> None: + assert _needs_ai_assisted_evaluator(_safety_bundle()) is False + + +def test_needs_safety_true_for_safety_class() -> None: + assert _needs_safety_evaluator(_safety_bundle()) is True + + +def test_preflight_local_only_skips_azure_checks(monkeypatch: pytest.MonkeyPatch) -> None: + # No Azure env vars set, local-only bundle -> no errors. 
+ for var in ( + "AZURE_OPENAI_ENDPOINT", + "AZURE_OPENAI_DEPLOYMENT", + "AZURE_AI_FOUNDRY_PROJECT_ENDPOINT", + ): + monkeypatch.delenv(var, raising=False) + + report = run_preflight_checks(_local_run_config(), _local_bundle()) + assert report.ok is True + + +def test_preflight_missing_env_vars_reported(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("AZURE_OPENAI_ENDPOINT", raising=False) + monkeypatch.delenv("AZURE_OPENAI_DEPLOYMENT", raising=False) + + with mock.patch( + "agentops.services.preflight.importlib.import_module" + ) as imp, mock.patch("agentops.services.preflight._check_credentials"): + imp.return_value = mock.Mock() + report = run_preflight_checks(_local_run_config(), _ai_assisted_bundle()) + + assert report.ok is False + combined = " ".join(report.errors) + assert "AZURE_OPENAI_ENDPOINT" in combined + assert "AZURE_OPENAI_DEPLOYMENT" in combined + + +def test_preflight_missing_foundry_project_endpoint( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("AZURE_AI_FOUNDRY_PROJECT_ENDPOINT", raising=False) + + with mock.patch( + "agentops.services.preflight.importlib.import_module" + ) as imp, mock.patch("agentops.services.preflight._check_credentials"): + imp.return_value = mock.Mock() + report = run_preflight_checks(_local_run_config(), _safety_bundle()) + + assert report.ok is False + assert any( + "AZURE_AI_FOUNDRY_PROJECT_ENDPOINT" in e for e in report.errors + ) + + +def test_preflight_missing_sdk_reports_install_hint( + monkeypatch: pytest.MonkeyPatch, +) -> None: + import importlib as _importlib + + real_import = _importlib.import_module + + def fake_import(name: str) -> object: + if name in ("azure.identity", "azure.ai.evaluation"): + raise ImportError("no module") + return real_import(name) + + monkeypatch.setenv("AZURE_OPENAI_ENDPOINT", "https://example.openai.azure.com/") + monkeypatch.setenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini") + + with mock.patch( + 
"agentops.services.preflight.importlib.import_module", side_effect=fake_import + ), mock.patch("agentops.services.preflight._check_credentials"): + report = run_preflight_checks(_local_run_config(), _ai_assisted_bundle()) + + assert report.ok is False + combined = " ".join(report.errors) + assert "azure-identity" in combined + assert "azure-ai-evaluation" in combined + assert "pip install" in combined + + +def test_preflight_endpoint_unreachable_reports_error( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv("AZURE_OPENAI_ENDPOINT", "https://example.openai.azure.com/") + monkeypatch.setenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini") + + from urllib import error as urllib_error + + with mock.patch( + "agentops.services.preflight.importlib.import_module" + ) as imp, mock.patch( + "agentops.services.preflight._check_credentials" + ), mock.patch( + "agentops.services.preflight.urllib_request.urlopen", + side_effect=urllib_error.URLError("Name or service not known"), + ): + imp.return_value = mock.Mock() + report = run_preflight_checks( + _remote_http_run_config(), _ai_assisted_bundle() + ) + + assert report.ok is False + assert any("unreachable" in e.lower() for e in report.errors) + + +def test_preflight_collects_multiple_errors(monkeypatch: pytest.MonkeyPatch) -> None: + """Preflight should surface all detectable issues at once.""" + import importlib as _importlib + + real_import = _importlib.import_module + + monkeypatch.delenv("AZURE_OPENAI_ENDPOINT", raising=False) + monkeypatch.delenv("AZURE_OPENAI_DEPLOYMENT", raising=False) + + def fake_import(name: str) -> object: + if name in ("azure.identity", "azure.ai.evaluation"): + raise ImportError("no module") + return real_import(name) + + with mock.patch( + "agentops.services.preflight.importlib.import_module", side_effect=fake_import + ): + report = run_preflight_checks(_local_run_config(), _ai_assisted_bundle()) + + # Should report both missing SDK and missing env vars. 
+ assert len(report.errors) >= 2 + + +def test_preflight_http_endpoint_ok_on_success(monkeypatch: pytest.MonkeyPatch) -> None: + import importlib as _importlib + + real_import = _importlib.import_module + + def fake_import(name: str) -> object: + if name in ("azure.identity", "azure.ai.evaluation"): + return mock.MagicMock() + return real_import(name) + + monkeypatch.setenv("AZURE_OPENAI_ENDPOINT", "https://example.openai.azure.com/") + monkeypatch.setenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini") + + with mock.patch( + "agentops.services.preflight.importlib.import_module", side_effect=fake_import + ), mock.patch( + "agentops.services.preflight._check_credentials" + ), mock.patch( + "agentops.services.preflight.urllib_request.urlopen" + ) as urlopen: + urlopen.return_value = mock.MagicMock() + report = run_preflight_checks( + _remote_http_run_config(), _ai_assisted_bundle() + ) + + assert report.ok is True