diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 0f04006c..91a05b3f 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -254,3 +254,12 @@ When generating or modifying code: - The `core/` package must remain free of Azure imports and I/O - Follow the request flow: CLI → Services → Backends → Core (never skip layers) - If a change is user-visible, add an entry to `CHANGELOG.md` under `[Unreleased]` (Keep a Changelog format) + +### OTLP Telemetry + +- `utils/telemetry.py` provides optional OTLP trace emission for evaluation runs +- Activated by `AGENTOPS_OTLP_ENDPOINT` env var — zero overhead when unset +- All OpenTelemetry imports must be **lazy** (inside functions in `utils/telemetry.py`) +- `opentelemetry-sdk` is an optional runtime dependency — not declared in `pyproject.toml` +- Span schema: CICD semconv (`cicd.pipeline.*`) for pipeline structure, GenAI semconv (`gen_ai.*`) for agent calls, `agentops.eval.*` for evaluator scores +- When adding new spans, follow the three-layer pattern in `telemetry.py` diff --git a/AGENTS.md b/AGENTS.md index 6b6bbdfb..73521af5 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -123,7 +123,8 @@ src/ │ ├── utils/ │ ├── yaml.py # YAML IO and interpolation helpers - │ └── logging.py # Logging setup + │ ├── logging.py # Logging setup + │ └── telemetry.py # Optional OTLP tracing (lazy imports) │ └── templates/ ├── config.yaml # Seed workspace config @@ -368,6 +369,7 @@ Important environment variables: - `AZURE_OPENAI_DEPLOYMENT` - `AZURE_AI_MODEL_DEPLOYMENT_NAME` - `AZURE_OPENAI_API_VERSION` +- `AGENTOPS_OTLP_ENDPOINT` — OTLP collector URL for evaluation tracing (opt-in, e.g. `http://localhost:4318`) Recommended default behavior: - Keep Foundry cloud mode as the default path @@ -377,6 +379,32 @@ Recommended default behavior: --- +## OTLP Telemetry + +AgentOps can optionally emit OpenTelemetry (OTLP) traces during evaluation runs. Set `AGENTOPS_OTLP_ENDPOINT` to enable. 
+ +```bash +# Enable tracing (e.g. AI Toolkit collector, Azure Monitor, Jaeger) +export AGENTOPS_OTLP_ENDPOINT=http://localhost:4318 +agentops eval run +``` + +Span schema uses three OTel semantic convention layers: + +| Layer | Namespace | Purpose | +|---|---|---| +| CICD | `cicd.pipeline.*` | Eval run as pipeline, items as tasks | +| GenAI | `gen_ai.*` | Agent/model invocation (future Layer 2) | +| AgentOps | `agentops.eval.*` | Evaluator scores, thresholds, pass/fail | + +Design rules: +- All OpenTelemetry imports are **lazy** (inside `utils/telemetry.py` functions) +- Zero overhead when `AGENTOPS_OTLP_ENDPOINT` is unset +- Graceful no-op when `opentelemetry-sdk` is not installed +- `opentelemetry-sdk` and `opentelemetry-exporter-otlp-proto-http` are optional runtime dependencies (not in `pyproject.toml`) + +--- + ## Architectural Constraints ### Code Organization diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a26980b..d0ad2ae5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,11 @@ This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres ## [Unreleased] ### Added +- Add optional OTLP tracing for evaluation runs — set `AGENTOPS_OTLP_ENDPOINT` to emit OpenTelemetry spans. + - Three-layer schema: CICD semconv (pipeline run/task), GenAI semconv (agent invocation), and `agentops.eval.*` (evaluator scores/thresholds). + - Per-row item spans with evaluator child spans showing score, threshold, and pass/fail. + - Zero overhead when `AGENTOPS_OTLP_ENDPOINT` is unset; graceful no-op when `opentelemetry-sdk` is not installed. + - Compatible with AI Toolkit (localhost:4318), Azure Monitor, Jaeger, Grafana Tempo, and any OTLP-compatible collector. - Implement `agentops eval compare --runs ,` for baseline comparison of evaluation runs. - Produces `comparison.json` (structured metric deltas, threshold flips, item-level changes) and `comparison.md` (human-readable report). 
- Exits with code `0` (no regressions), `2` (regressions detected), or `1` (error). diff --git a/src/agentops/services/runner.py b/src/agentops/services/runner.py index 37731aee..ac3258c0 100644 --- a/src/agentops/services/runner.py +++ b/src/agentops/services/runner.py @@ -30,6 +30,15 @@ ) from agentops.core.reporter import generate_report_html, generate_report_markdown from agentops.services.foundry_evals import publish_foundry_evaluation +from agentops.utils.telemetry import ( + eval_item_span, + eval_run_span, + init_tracing, + record_evaluator_span, + set_eval_item_result, + set_eval_run_result, + shutdown as shutdown_tracing, +) @dataclass(frozen=True) @@ -366,8 +375,72 @@ def _append_run_metric(name: str, value: float) -> None: return run_metrics +def _emit_item_spans( + *, + item_evaluations: List[ItemEvaluationResult], + row_metrics: List[RowMetricsResult], + bundle_config, +) -> None: + """Emit OTLP spans for each evaluated item with evaluator child spans.""" + from agentops.utils.telemetry import is_enabled + + if not is_enabled(): + return + + # Build lookup: row_index → {metric_name: value} + row_values_by_index: Dict[int, Dict[str, float]] = {} + for row in row_metrics: + row_values_by_index[row.row_index] = {m.name: m.value for m in row.metrics} + + # Build lookup: evaluator_name → (source, threshold_value, criteria) + evaluator_info: Dict[str, tuple] = {} + for ev in bundle_config.evaluators: + if not ev.enabled: + continue + threshold_value = None + criteria = None + for thr in bundle_config.thresholds: + if thr.evaluator == ev.name: + threshold_value = thr.value + criteria = thr.criteria + break + evaluator_info[ev.name] = (ev.source, threshold_value, criteria) + + for item in item_evaluations: + with eval_item_span(row_index=item.row_index) as item_span: + set_eval_item_result(item_span, passed=item.passed_all) + + # Emit evaluator child spans + row_scores = row_values_by_index.get(item.row_index, {}) + for thr_result in item.thresholds: + 
ev_name = thr_result.evaluator + source, threshold_val, criteria = evaluator_info.get( + ev_name, ("local", None, None) + ) + score = row_scores.get(ev_name, 0.0) + + import re + + builtin = ev_name.strip() + if builtin.endswith("Evaluator"): + builtin = builtin[:-9] + builtin = re.sub(r"(? EvalRunServiceResult: run_config_path = ( config_path.resolve() if config_path is not None else _default_run_config_path() @@ -381,6 +454,47 @@ def run_evaluation( bundle_config = load_bundle_config(bundle_path) dataset_config = load_dataset_config(dataset_path) + # Initialise OTLP tracing (no-op when AGENTOPS_OTLP_ENDPOINT is unset) + init_tracing() + + target = (run_config.backend.target or "agent").strip().lower() + + with eval_run_span( + bundle_name=bundle_config.name, + dataset_name=dataset_config.name, + backend_type=run_config.backend.type, + target=target, + model=run_config.backend.model, + agent_id=run_config.backend.agent_id, + ) as run_span: + result = _run_evaluation_inner( + run_config=run_config, + run_config_path=run_config_path, + bundle_config=bundle_config, + bundle_path=bundle_path, + dataset_config=dataset_config, + dataset_path=dataset_path, + output_override=output_override, + report_format=report_format, + run_span=run_span, + ) + + shutdown_tracing() + return result + + +def _run_evaluation_inner( + *, + run_config, + run_config_path: Path, + bundle_config, + bundle_path: Path, + dataset_config, + dataset_path: Path, + output_override: Path | None, + report_format: str, + run_span, +) -> EvalRunServiceResult: output_dir = ( output_override.resolve() if output_override is not None @@ -425,6 +539,13 @@ def run_evaluation( item_evaluations = _evaluate_item_thresholds(bundle_config.thresholds, row_metrics) + # Emit OTLP spans for each evaluated item (no-op when tracing is disabled) + _emit_item_spans( + item_evaluations=item_evaluations, + row_metrics=row_metrics, + bundle_config=bundle_config, + ) + if bundle_config.thresholds and not row_metrics: raise 
ValueError( "Item-level threshold evaluation requires backend 'row_metrics'" @@ -512,9 +633,7 @@ def run_evaluation( report_path = md_path if report_format in ("html", "all"): html_path = output_dir / "report.html" - html_path.write_text( - generate_report_html(normalized_result), encoding="utf-8" - ) + html_path.write_text(generate_report_html(normalized_result), encoding="utf-8") report_path = html_path if report_format == "all": report_path = md_path @@ -523,6 +642,15 @@ def run_evaluation( _sync_latest_output(output_dir, latest_dir) exit_code = 0 if summary.overall_passed else 2 + + # Set final result on the root OTLP span + set_eval_run_result( + run_span, + passed=summary.overall_passed, + items_total=len(item_evaluations), + items_passed=sum(1 for item in item_evaluations if item.passed_all), + ) + return EvalRunServiceResult( output_dir=output_dir, results_path=results_path, diff --git a/src/agentops/utils/telemetry.py b/src/agentops/utils/telemetry.py new file mode 100644 index 00000000..f0f79c5a --- /dev/null +++ b/src/agentops/utils/telemetry.py @@ -0,0 +1,291 @@ +"""Optional OpenTelemetry instrumentation for AgentOps evaluation runs. + +All OpenTelemetry imports are **lazy** — they only happen when tracing is +enabled via the ``AGENTOPS_OTLP_ENDPOINT`` environment variable. When the +variable is unset, every public function in this module is a no-op. 
# ---------------------------------------------------------------------------
# Lazy globals — initialised on first call to ``init_tracing()``
# ---------------------------------------------------------------------------
_tracer: Any = None
_tracing_enabled: bool = False
# The SDK TracerProvider created by ``init_tracing()``.  Kept so that
# ``shutdown()`` flushes exactly the provider this module owns instead of
# whatever provider happens to be registered globally.
_provider: Any = None
# ``opentelemetry.trace.set_tracer_provider`` only honours its first call per
# process; remember that we already tried so re-initialisation stays quiet.
_global_provider_registered: bool = False

# Path of the OTLP/HTTP trace ingestion route below the collector base URL.
_TRACES_PATH = "/v1/traces"


def _traces_url(endpoint: str) -> str:
    """Return the OTLP/HTTP traces URL for a collector *endpoint*.

    Accepts the collector base URL with or without trailing slashes, and also
    a URL that already points at ``/v1/traces``; in every case the result ends
    with exactly one ``/v1/traces`` suffix.
    """
    base = endpoint.strip().rstrip("/")
    if base.endswith(_TRACES_PATH):
        return base
    return base + _TRACES_PATH


def is_enabled() -> bool:
    """Return True when OTLP tracing has been initialised."""
    return _tracing_enabled


def init_tracing() -> None:
    """Initialise the OTLP exporter if ``AGENTOPS_OTLP_ENDPOINT`` is set.

    Does nothing when tracing is already enabled, when the environment
    variable is unset/blank, or when the OpenTelemetry packages cannot be
    imported.  After ``shutdown()`` a new call builds a fresh provider.
    """
    global _tracer, _tracing_enabled, _provider  # noqa: PLW0603
    global _global_provider_registered  # noqa: PLW0603

    if _tracing_enabled:
        return

    endpoint = (os.getenv("AGENTOPS_OTLP_ENDPOINT") or "").strip()
    if not endpoint:
        return

    try:
        from opentelemetry import trace
        from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
            OTLPSpanExporter,
        )
        from opentelemetry.sdk.resources import Resource
        from opentelemetry.sdk.trace import TracerProvider
        from opentelemetry.sdk.trace.export import BatchSpanProcessor

        import agentops

        resource = Resource(
            attributes={
                "service.name": "agentops",
                "service.version": getattr(agentops, "__version__", "0.0.0"),
            }
        )

        provider = TracerProvider(resource=resource)
        exporter = OTLPSpanExporter(endpoint=_traces_url(endpoint))
        provider.add_span_processor(BatchSpanProcessor(exporter))

        if not _global_provider_registered:
            trace.set_tracer_provider(provider)
            _global_provider_registered = True

        # Take the tracer from our own provider rather than the global one:
        # the global registration is ignored if a provider was set earlier,
        # and spans must still reach the configured endpoint in that case.
        _provider = provider
        _tracer = provider.get_tracer("agentops")
        _tracing_enabled = True
    except ImportError:
        # opentelemetry not installed — tracing stays disabled
        pass


def shutdown() -> None:
    """Flush and shut down the tracer provider created by ``init_tracing()``.

    Tracing is marked disabled afterwards so later calls into this module are
    no-ops instead of writing to a provider that no longer exports.
    """
    global _tracer, _tracing_enabled, _provider  # noqa: PLW0603

    if not _tracing_enabled:
        return

    provider = _provider
    _provider = None
    _tracer = None
    _tracing_enabled = False

    if provider is None:
        return
    try:
        provider.shutdown()
    except Exception:  # noqa: BLE001
        # Best effort: a failing export must never fail the evaluation run.
        pass


# ---------------------------------------------------------------------------
# Span context managers
# ---------------------------------------------------------------------------


@contextmanager
def eval_run_span(
    *,
    bundle_name: str,
    dataset_name: str,
    backend_type: str,
    target: str,
    model: Optional[str] = None,
    agent_id: Optional[str] = None,
) -> Generator[Optional[Any], None, None]:
    """Root span for an evaluation run (CICD pipeline run).

    Yields the span, or ``None`` when tracing is disabled.  An exception
    raised inside the ``with`` body is recorded on the span, the span status
    is set to ERROR, and the exception is re-raised.
    """
    if not _tracing_enabled or _tracer is None:
        yield None
        return

    from opentelemetry.trace import SpanKind, StatusCode

    with _tracer.start_as_current_span(
        f"RUN {bundle_name}",
        kind=SpanKind.SERVER,
        # Exceptions are recorded explicitly below; leaving the automatic
        # handling on would attach a second, duplicate exception event.
        record_exception=False,
        set_status_on_exception=False,
    ) as span:
        # CICD semconv
        span.set_attribute("cicd.pipeline.name", bundle_name)
        span.set_attribute("cicd.pipeline.action.name", "RUN")

        # AgentOps evaluation attributes
        span.set_attribute("agentops.eval.dataset", dataset_name)
        span.set_attribute("agentops.eval.backend", backend_type)
        span.set_attribute("agentops.eval.target", target)
        if model:
            span.set_attribute("agentops.eval.model", model)
        if agent_id:
            span.set_attribute("agentops.eval.agent_id", agent_id)

        try:
            yield span
        except Exception as exc:
            span.set_status(StatusCode.ERROR, str(exc))
            span.record_exception(exc)
            raise


def set_eval_run_result(
    span: Any,
    *,
    passed: bool,
    items_total: int,
    items_passed: int,
) -> None:
    """Set final result attributes on the root eval run span.

    No-op when *span* is ``None``.  The pass rate attribute is only written
    when at least one item was evaluated.
    """
    if span is None:
        return

    from opentelemetry.trace import StatusCode

    span.set_attribute("cicd.pipeline.result", "success" if passed else "failure")
    span.set_attribute("agentops.eval.items_total", items_total)
    span.set_attribute("agentops.eval.items_passed", items_passed)
    if items_total > 0:
        span.set_attribute("agentops.eval.pass_rate", items_passed / items_total)

    if passed:
        span.set_status(StatusCode.OK)
    else:
        span.set_status(StatusCode.ERROR, "Threshold failure")


@contextmanager
def eval_item_span(
    *,
    row_index: int,
    input_text: Optional[str] = None,
    expected_text: Optional[str] = None,
) -> Generator[Optional[Any], None, None]:
    """Span for a single evaluation item (CICD task run).

    Yields the span, or ``None`` when tracing is disabled.
    """
    if not _tracing_enabled or _tracer is None:
        yield None
        return

    from opentelemetry.trace import SpanKind

    with _tracer.start_as_current_span(
        f"eval_item {row_index}",
        kind=SpanKind.INTERNAL,
    ) as span:
        # CICD task attributes
        span.set_attribute("cicd.pipeline.task.name", "eval_item")
        span.set_attribute("cicd.pipeline.task.run.id", str(row_index))

        # AgentOps item attributes
        span.set_attribute("agentops.eval.item.index", row_index)
        if input_text:
            span.set_attribute("agentops.eval.item.input", input_text)
        if expected_text:
            span.set_attribute("agentops.eval.item.expected", expected_text)

        yield span


def set_eval_item_result(span: Any, *, passed: bool) -> None:
    """Set final result on an eval item span (no-op when *span* is ``None``)."""
    if span is None:
        return
    span.set_attribute(
        "cicd.pipeline.task.run.result", "success" if passed else "failure"
    )
    span.set_attribute("agentops.eval.item.passed", passed)


@contextmanager
def agent_invoke_span(
    *,
    target: str,
    model: Optional[str] = None,
    agent_id: Optional[str] = None,
    agent_name: Optional[str] = None,
    agent_version: Optional[str] = None,
    provider: str = "azure.ai.inference",
) -> Generator[Optional[Any], None, None]:
    """Span for agent/model invocation (GenAI semconv).

    The operation is ``invoke_agent`` when *target* is ``"agent"`` and
    ``chat`` otherwise.  Yields the span, or ``None`` when tracing is
    disabled.
    """
    if not _tracing_enabled or _tracer is None:
        yield None
        return

    from opentelemetry.trace import SpanKind

    operation = "invoke_agent" if target == "agent" else "chat"
    span_name = f"{operation} {agent_name or model or 'unknown'}"

    with _tracer.start_as_current_span(
        span_name,
        kind=SpanKind.CLIENT,
    ) as span:
        # GenAI semconv
        span.set_attribute("gen_ai.operation.name", operation)
        span.set_attribute("gen_ai.provider.name", provider)
        if model:
            span.set_attribute("gen_ai.request.model", model)
        if agent_id:
            span.set_attribute("gen_ai.agent.id", agent_id)
        if agent_name:
            span.set_attribute("gen_ai.agent.name", agent_name)
        if agent_version:
            span.set_attribute("gen_ai.agent.version", agent_version)

        yield span


def set_agent_invoke_result(
    span: Any,
    *,
    response_model: Optional[str] = None,
    input_tokens: Optional[int] = None,
    output_tokens: Optional[int] = None,
) -> None:
    """Set GenAI response attributes on an agent invoke span.

    No-op when *span* is ``None``.  Token counts are written whenever they are
    provided, including ``0``.
    """
    if span is None:
        return
    if response_model:
        span.set_attribute("gen_ai.response.model", response_model)
    if input_tokens is not None:
        span.set_attribute("gen_ai.usage.input_tokens", input_tokens)
    if output_tokens is not None:
        span.set_attribute("gen_ai.usage.output_tokens", output_tokens)


def record_evaluator_span(
    *,
    evaluator_name: str,
    builtin_name: str,
    source: str,
    score: float,
    threshold: Optional[float] = None,
    criteria: Optional[str] = None,
    passed: Optional[bool] = None,
) -> None:
    """Create a child span for a single evaluator result.

    The span is opened and closed immediately; it only carries attributes.
    Optional attributes are omitted when their argument is ``None``.
    """
    if not _tracing_enabled or _tracer is None:
        return

    from opentelemetry.trace import SpanKind

    with _tracer.start_as_current_span(
        f"evaluator {builtin_name}",
        kind=SpanKind.INTERNAL,
    ) as span:
        span.set_attribute("agentops.eval.evaluator.name", evaluator_name)
        span.set_attribute("agentops.eval.evaluator.builtin", builtin_name)
        span.set_attribute("agentops.eval.evaluator.source", source)
        span.set_attribute("agentops.eval.evaluator.score", score)
        if threshold is not None:
            span.set_attribute("agentops.eval.evaluator.threshold", threshold)
        if criteria is not None:
            span.set_attribute("agentops.eval.evaluator.criteria", criteria)
        if passed is not None:
            span.set_attribute("agentops.eval.evaluator.passed", passed)
class TestSpanAttributesWhenEnabled:
    """Test that span context managers set correct attributes when tracing is enabled.

    These tests require opentelemetry to be installed because the code paths
    import SpanKind/StatusCode when tracing is enabled.  The availability
    check runs per test in ``setup_method``: calling ``pytest.importorskip``
    in the class body would execute at module import time and skip every test
    in this file, including the ones that need no OpenTelemetry at all.
    """

    def setup_method(self) -> None:
        """Mock the tracing module to simulate enabled state."""
        import agentops.utils.telemetry as tel

        # Skip before touching module state so a skipped test never leaves
        # tracing switched on for the tests that follow.
        pytest.importorskip("opentelemetry")

        self.mock_span = MagicMock()
        self.mock_span.__enter__ = MagicMock(return_value=self.mock_span)
        self.mock_span.__exit__ = MagicMock(return_value=False)

        self.mock_tracer = MagicMock()
        self.mock_tracer.start_as_current_span.return_value = self.mock_span

        tel._tracing_enabled = True
        tel._tracer = self.mock_tracer

    def teardown_method(self) -> None:
        """Restore the disabled default so other tests see a clean module."""
        import agentops.utils.telemetry as tel

        tel._tracing_enabled = False
        tel._tracer = None

    def _recorded_attributes(self) -> dict[str, object]:
        """Return ``{name: value}`` for every ``set_attribute`` call on the mock span."""
        return {
            call.args[0]: call.args[1]
            for call in self.mock_span.set_attribute.call_args_list
        }

    def test_eval_run_span_sets_cicd_attributes(self) -> None:
        with eval_run_span(
            bundle_name="model_direct",
            dataset_name="smoke",
            backend_type="foundry",
            target="model",
            model="gpt-4.1",
        ) as span:
            assert span is self.mock_span

        # Verify CICD semconv attributes
        calls = self._recorded_attributes()
        assert calls["cicd.pipeline.name"] == "model_direct"
        assert calls["cicd.pipeline.action.name"] == "RUN"
        assert calls["agentops.eval.dataset"] == "smoke"
        assert calls["agentops.eval.backend"] == "foundry"
        assert calls["agentops.eval.target"] == "model"
        assert calls["agentops.eval.model"] == "gpt-4.1"

    def test_eval_run_span_sets_agent_id(self) -> None:
        with eval_run_span(
            bundle_name="agent_test",
            dataset_name="smoke",
            backend_type="foundry",
            target="agent",
            agent_id="my-agent:3",
        ):
            pass

        calls = self._recorded_attributes()
        assert calls["agentops.eval.agent_id"] == "my-agent:3"
        assert calls["agentops.eval.target"] == "agent"

    def test_eval_item_span_sets_task_attributes(self) -> None:
        with eval_item_span(
            row_index=3,
            input_text="What is 2+2?",
            expected_text="4",
        ) as span:
            assert span is self.mock_span

        calls = self._recorded_attributes()
        assert calls["cicd.pipeline.task.name"] == "eval_item"
        assert calls["cicd.pipeline.task.run.id"] == "3"
        assert calls["agentops.eval.item.index"] == 3
        assert calls["agentops.eval.item.input"] == "What is 2+2?"
        assert calls["agentops.eval.item.expected"] == "4"

    def test_set_eval_run_result_pass(self) -> None:
        set_eval_run_result(
            self.mock_span,
            passed=True,
            items_total=5,
            items_passed=5,
        )

        calls = self._recorded_attributes()
        assert calls["cicd.pipeline.result"] == "success"
        assert calls["agentops.eval.items_total"] == 5
        assert calls["agentops.eval.items_passed"] == 5
        assert calls["agentops.eval.pass_rate"] == pytest.approx(1.0)

    def test_set_eval_run_result_fail(self) -> None:
        set_eval_run_result(
            self.mock_span,
            passed=False,
            items_total=5,
            items_passed=3,
        )

        calls = self._recorded_attributes()
        assert calls["cicd.pipeline.result"] == "failure"
        assert calls["agentops.eval.items_passed"] == 3
        # Float ratio: compare approximately rather than bit-for-bit.
        assert calls["agentops.eval.pass_rate"] == pytest.approx(0.6)

    def test_set_eval_item_result(self) -> None:
        set_eval_item_result(self.mock_span, passed=False)

        calls = self._recorded_attributes()
        assert calls["cicd.pipeline.task.run.result"] == "failure"
        assert calls["agentops.eval.item.passed"] is False

    def test_record_evaluator_span(self) -> None:
        # Safe to import here: setup_method already skipped if it is missing.
        from opentelemetry.trace import SpanKind

        record_evaluator_span(
            evaluator_name="SimilarityEvaluator",
            builtin_name="similarity",
            source="foundry",
            score=4.0,
            threshold=3.0,
            criteria=">=",
            passed=True,
        )

        # Verify a child span was created
        self.mock_tracer.start_as_current_span.assert_called_with(
            "evaluator similarity",
            kind=SpanKind.INTERNAL,
        )

        calls = self._recorded_attributes()
        assert calls["agentops.eval.evaluator.name"] == "SimilarityEvaluator"
        assert calls["agentops.eval.evaluator.builtin"] == "similarity"
        assert calls["agentops.eval.evaluator.source"] == "foundry"
        assert calls["agentops.eval.evaluator.score"] == 4.0
        assert calls["agentops.eval.evaluator.threshold"] == 3.0
        assert calls["agentops.eval.evaluator.criteria"] == ">="
        assert calls["agentops.eval.evaluator.passed"] is True

    def test_eval_run_span_name(self) -> None:
        with eval_run_span(
            bundle_name="my_bundle",
            dataset_name="smoke",
            backend_type="foundry",
            target="model",
        ):
            pass

        self.mock_tracer.start_as_current_span.assert_called_once()
        span_name = self.mock_tracer.start_as_current_span.call_args.args[0]
        assert span_name == "RUN my_bundle"