Azure · Dongbumlee · Apr 7, 2026 · Apr 3, 2026 · Apr 3, 2026 · Apr 7, 2026
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
@@ -254,3 +254,12 @@ When generating or modifying code:
 - The `core/` package must remain free of Azure imports and I/O
 - Follow the request flow: CLI → Services → Backends → Core (never skip layers)
 - If a change is user-visible, add an entry to `CHANGELOG.md` under `[Unreleased]` (Keep a Changelog format)
+
+### OTLP Telemetry
+
+- `utils/telemetry.py` provides optional OTLP trace emission for evaluation runs
+- Activated by `AGENTOPS_OTLP_ENDPOINT` env var — zero overhead when unset
+- All OpenTelemetry imports must be **lazy** (inside functions in `utils/telemetry.py`)
+- `opentelemetry-sdk` and `opentelemetry-exporter-otlp-proto-http` are optional runtime dependencies — not declared in `pyproject.toml`
+- Span schema: CICD semconv (`cicd.pipeline.*`) for pipeline structure, GenAI semconv (`gen_ai.*`) for agent calls, `agentops.eval.*` for evaluator scores
+- When adding new spans, follow the three-layer pattern in `utils/telemetry.py`
diff --git a/AGENTS.md b/AGENTS.md
@@ -123,7 +123,8 @@ src/
     │
     ├── utils/
     │   ├── yaml.py                    # YAML IO and interpolation helpers
-    │   └── logging.py                 # Logging setup
+    │   ├── logging.py                 # Logging setup
+    │   └── telemetry.py               # Optional OTLP tracing (lazy imports)
     │
     └── templates/
         ├── config.yaml                # Seed workspace config
@@ -368,6 +369,7 @@ Important environment variables:
 - `AZURE_OPENAI_DEPLOYMENT`
 - `AZURE_AI_MODEL_DEPLOYMENT_NAME`
 - `AZURE_OPENAI_API_VERSION`
+- `AGENTOPS_OTLP_ENDPOINT` — OTLP collector URL for evaluation tracing (opt-in, e.g. `http://localhost:4318`)
 
 Recommended default behavior:
 - Keep Foundry cloud mode as the default path
@@ -377,6 +379,32 @@ Recommended default behavior:
 
 ---
 
+## OTLP Telemetry
+
+AgentOps can optionally emit OpenTelemetry traces over OTLP (the OpenTelemetry Protocol) during evaluation runs. Set `AGENTOPS_OTLP_ENDPOINT` to the collector URL to enable.
+
+```bash
+# Enable tracing (e.g. AI Toolkit collector, Azure Monitor, Jaeger)
+export AGENTOPS_OTLP_ENDPOINT=http://localhost:4318
+agentops eval run
+```
+
+Span schema uses three OTel semantic convention layers:
+
+| Layer | Namespace | Purpose |
+|---|---|---|
+| CICD | `cicd.pipeline.*` | Eval run as pipeline, items as tasks |
+| GenAI | `gen_ai.*` | Agent/model invocation (future Layer 2) |
+| AgentOps | `agentops.eval.*` | Evaluator scores, thresholds, pass/fail |
+
+Design rules:
+- All OpenTelemetry imports are **lazy** (inside `utils/telemetry.py` functions)
+- Zero overhead when `AGENTOPS_OTLP_ENDPOINT` is unset
+- Graceful no-op when `opentelemetry-sdk` is not installed
+- `opentelemetry-sdk` and `opentelemetry-exporter-otlp-proto-http` are optional runtime dependencies (not in `pyproject.toml`)
+
+---
+
 ## Architectural Constraints
 
 ### Code Organization

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,11 @@ This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres
 ## [Unreleased]
 
 ### Added
+- Add optional OTLP tracing for evaluation runs — set `AGENTOPS_OTLP_ENDPOINT` to emit OpenTelemetry spans.
+  - Three-layer schema: CICD semconv (pipeline run/task), GenAI semconv (agent invocation), and `agentops.eval.*` (evaluator scores/thresholds).
+  - Per-row item spans with evaluator child spans showing score, threshold, and pass/fail.
+  - Zero overhead when `AGENTOPS_OTLP_ENDPOINT` is unset; graceful no-op when `opentelemetry-sdk` is not installed.
+  - Compatible with AI Toolkit (localhost:4318), Azure Monitor, Jaeger, Grafana Tempo, and any OTLP-compatible collector.
 - Implement `agentops eval compare --runs <baseline>,<current>` for baseline comparison of evaluation runs.
   - Produces `comparison.json` (structured metric deltas, threshold flips, item-level changes) and `comparison.md` (human-readable report).
   - Exits with code `0` (no regressions), `2` (regressions detected), or `1` (error).

diff --git a/src/agentops/services/runner.py b/src/agentops/services/runner.py
@@ -30,6 +30,15 @@
 )
 from agentops.core.reporter import generate_report_html, generate_report_markdown
 from agentops.services.foundry_evals import publish_foundry_evaluation
+from agentops.utils.telemetry import (
+    eval_item_span,
+    eval_run_span,
+    init_tracing,
+    record_evaluator_span,
+    set_eval_item_result,
+    set_eval_run_result,
+    shutdown as shutdown_tracing,
+)
 
 
 @dataclass(frozen=True)
@@ -366,8 +375,72 @@ def _append_run_metric(name: str, value: float) -> None:
     return run_metrics
 
 
+def _emit_item_spans(
+    *,
+    item_evaluations: List[ItemEvaluationResult],
+    row_metrics: List[RowMetricsResult],
+    bundle_config,
+) -> None:
+    """Emit one OTLP span per evaluated item, with a child span per threshold result.
+
+    Returns immediately when tracing is disabled. Evaluator metadata and the
+    snake_case builtin names are resolved once, outside the per-item loop.
+    """
+    import re  # function-scope import, hoisted out of the per-threshold loop
+
+    from agentops.utils.telemetry import is_enabled
+
+    if not is_enabled():
+        return
+
+    # Lookup: row_index -> {metric_name: value}
+    row_values_by_index: Dict[int, Dict[str, float]] = {
+        row.row_index: {m.name: m.value for m in row.metrics} for row in row_metrics
+    }
+
+    # Lookup: evaluator name -> (value, criteria) of its first declared threshold
+    first_threshold: Dict[str, tuple] = {}
+    for thr in bundle_config.thresholds or ():
+        first_threshold.setdefault(thr.evaluator, (thr.value, thr.criteria))
+
+    # Lookup: enabled evaluator name -> (source, threshold_value, criteria)
+    evaluator_info: Dict[str, tuple] = {
+        ev.name: (ev.source, *first_threshold.get(ev.name, (None, None)))
+        for ev in bundle_config.evaluators
+        if ev.enabled
+    }
+
+    fallback = ("local", None, None)  # used for evaluators missing from the bundle
+    camel_boundary = re.compile(r"(?<!^)(?=[A-Z])")
+    builtin_names: Dict[str, str] = {}  # evaluator name -> snake_case builtin name
+
+    for item in item_evaluations:
+        with eval_item_span(row_index=item.row_index) as item_span:
+            set_eval_item_result(item_span, passed=item.passed_all)
+            row_scores = row_values_by_index.get(item.row_index, {})
+            for thr_result in item.thresholds:
+                ev_name = thr_result.evaluator
+                if ev_name not in builtin_names:
+                    stem = ev_name.strip()
+                    if stem.endswith("Evaluator"):
+                        stem = stem[: -len("Evaluator")]
+                    builtin_names[ev_name] = camel_boundary.sub("_", stem).lower()
+                source, threshold_val, criteria = evaluator_info.get(ev_name, fallback)
+                record_evaluator_span(
+                    evaluator_name=ev_name,
+                    builtin_name=builtin_names[ev_name],
+                    source=source,
+                    score=row_scores.get(ev_name, 0.0),  # absent score -> 0.0
+                    threshold=threshold_val,
+                    criteria=criteria,
+                    passed=thr_result.passed,
+                )
+
+
 def run_evaluation(
-    config_path: Path | None = None, output_override: Path | None = None, report_format: str = "md",
+    config_path: Path | None = None,
+    output_override: Path | None = None,
+    report_format: str = "md",
 ) -> EvalRunServiceResult:
     run_config_path = (
         config_path.resolve() if config_path is not None else _default_run_config_path()
@@ -381,6 +454,47 @@ def run_evaluation(
     bundle_config = load_bundle_config(bundle_path)
     dataset_config = load_dataset_config(dataset_path)
 
+    # Initialise OTLP tracing (no-op when AGENTOPS_OTLP_ENDPOINT is unset)
+    init_tracing()
+
+    target = (run_config.backend.target or "agent").strip().lower()
+
+    with eval_run_span(
+        bundle_name=bundle_config.name,
+        dataset_name=dataset_config.name,
+        backend_type=run_config.backend.type,
+        target=target,
+        model=run_config.backend.model,
+        agent_id=run_config.backend.agent_id,
+    ) as run_span:
+        result = _run_evaluation_inner(
+            run_config=run_config,
+            run_config_path=run_config_path,
+            bundle_config=bundle_config,
+            bundle_path=bundle_path,
+            dataset_config=dataset_config,
+            dataset_path=dataset_path,
+            output_override=output_override,
+            report_format=report_format,
+            run_span=run_span,
+        )
+
+    shutdown_tracing()
+    return result
+
+
+def _run_evaluation_inner(
+    *,
+    run_config,
+    run_config_path: Path,
+    bundle_config,
+    bundle_path: Path,
+    dataset_config,
+    dataset_path: Path,
+    output_override: Path | None,
+    report_format: str,
+    run_span,
+) -> EvalRunServiceResult:
     output_dir = (
         output_override.resolve()
         if output_override is not None
@@ -425,6 +539,13 @@ def run_evaluation(
 
     item_evaluations = _evaluate_item_thresholds(bundle_config.thresholds, row_metrics)
 
+    # Emit OTLP spans for each evaluated item (no-op when tracing is disabled)
+    _emit_item_spans(
+        item_evaluations=item_evaluations,
+        row_metrics=row_metrics,
+        bundle_config=bundle_config,
+    )
+
     if bundle_config.thresholds and not row_metrics:
         raise ValueError(
             "Item-level threshold evaluation requires backend 'row_metrics'"
@@ -512,9 +633,7 @@ def run_evaluation(
         report_path = md_path
     if report_format in ("html", "all"):
         html_path = output_dir / "report.html"
-        html_path.write_text(
-            generate_report_html(normalized_result), encoding="utf-8"
-        )
+        html_path.write_text(generate_report_html(normalized_result), encoding="utf-8")
         report_path = html_path
     if report_format == "all":
         report_path = md_path
@@ -523,6 +642,15 @@ def run_evaluation(
     _sync_latest_output(output_dir, latest_dir)
 
     exit_code = 0 if summary.overall_passed else 2
+
+    # Set final result on the root OTLP span
+    set_eval_run_result(
+        run_span,
+        passed=summary.overall_passed,
+        items_total=len(item_evaluations),
+        items_passed=sum(1 for item in item_evaluations if item.passed_all),
+    )
+
     return EvalRunServiceResult(
         output_dir=output_dir,
         results_path=results_path,