diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index 0f04006c..91a05b3f 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -254,3 +254,12 @@ When generating or modifying code:
 - The `core/` package must remain free of Azure imports and I/O
 - Follow the request flow: CLI → Services → Backends → Core (never skip layers)
 - If a change is user-visible, add an entry to `CHANGELOG.md` under `[Unreleased]` (Keep a Changelog format)
+
+### OTLP Telemetry
+
+- `utils/telemetry.py` provides optional OTLP trace emission for evaluation runs
+- Activated by `AGENTOPS_OTLP_ENDPOINT` env var — zero overhead when unset
+- All OpenTelemetry imports must be **lazy** (inside functions in `utils/telemetry.py`)
+- `opentelemetry-sdk` and `opentelemetry-exporter-otlp-proto-http` are optional runtime dependencies — not declared in `pyproject.toml`
+- Span schema: CICD semconv (`cicd.pipeline.*`) for pipeline structure, GenAI semconv (`gen_ai.*`) for agent calls, `agentops.eval.*` for evaluator scores
+- When adding new spans, follow the three-layer pattern in `telemetry.py`
diff --git a/AGENTS.md b/AGENTS.md
index 6b6bbdfb..73521af5 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -123,7 +123,8 @@ src/
     │
     ├── utils/
     │   ├── yaml.py                    # YAML IO and interpolation helpers
-    │   └── logging.py                 # Logging setup
+    │   ├── logging.py                 # Logging setup
+    │   └── telemetry.py               # Optional OTLP tracing (lazy imports)
     │
     └── templates/
         ├── config.yaml                # Seed workspace config
@@ -368,6 +369,7 @@ Important environment variables:
 - `AZURE_OPENAI_DEPLOYMENT`
 - `AZURE_AI_MODEL_DEPLOYMENT_NAME`
 - `AZURE_OPENAI_API_VERSION`
+- `AGENTOPS_OTLP_ENDPOINT` — OTLP collector URL for evaluation tracing (opt-in, e.g. `http://localhost:4318`)
 
 Recommended default behavior:
 - Keep Foundry cloud mode as the default path
@@ -377,6 +379,32 @@ Recommended default behavior:
 
 ---
 
+## OTLP Telemetry
+
+AgentOps can optionally emit OpenTelemetry (OTLP) traces during evaluation runs. Set `AGENTOPS_OTLP_ENDPOINT` to enable.
+
+```bash
+# Enable tracing (e.g. AI Toolkit collector, Azure Monitor, Jaeger)
+export AGENTOPS_OTLP_ENDPOINT=http://localhost:4318
+agentops eval run
+```
+
+Span schema uses three OTel semantic convention layers:
+
+| Layer | Namespace | Purpose |
+|---|---|---|
+| CICD | `cicd.pipeline.*` | Eval run as pipeline, items as tasks |
+| GenAI | `gen_ai.*` | Agent/model invocation (future Layer 2) |
+| AgentOps | `agentops.eval.*` | Evaluator scores, thresholds, pass/fail |
+
+Design rules:
+- All OpenTelemetry imports are **lazy** (inside `utils/telemetry.py` functions)
+- Zero overhead when `AGENTOPS_OTLP_ENDPOINT` is unset
+- Graceful no-op when the OpenTelemetry packages are not installed
+- `opentelemetry-sdk` and `opentelemetry-exporter-otlp-proto-http` are optional runtime dependencies (not in `pyproject.toml`)
+
+---
+
 ## Architectural Constraints
 
 ### Code Organization
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1a26980b..d0ad2ae5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,11 @@ This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres
 ## [Unreleased]
 
 ### Added
+- Add optional OTLP tracing for evaluation runs — set `AGENTOPS_OTLP_ENDPOINT` to emit OpenTelemetry spans.
+  - Three-layer schema: CICD semconv (pipeline run/task), GenAI semconv (agent invocation), and `agentops.eval.*` (evaluator scores/thresholds).
+  - Per-row item spans with evaluator child spans showing score, threshold, and pass/fail.
+  - Zero overhead when `AGENTOPS_OTLP_ENDPOINT` is unset; graceful no-op when the OpenTelemetry packages (`opentelemetry-sdk`, `opentelemetry-exporter-otlp-proto-http`) are not installed.
+  - Compatible with AI Toolkit (localhost:4318), Azure Monitor, Jaeger, Grafana Tempo, and any OTLP-compatible collector.
 - Implement `agentops eval compare --runs <baseline>,<current>` for baseline comparison of evaluation runs.
   - Produces `comparison.json` (structured metric deltas, threshold flips, item-level changes) and `comparison.md` (human-readable report).
   - Exits with code `0` (no regressions), `2` (regressions detected), or `1` (error).
diff --git a/src/agentops/services/runner.py b/src/agentops/services/runner.py
index 37731aee..ac3258c0 100644
--- a/src/agentops/services/runner.py
+++ b/src/agentops/services/runner.py
@@ -30,6 +30,15 @@
 )
 from agentops.core.reporter import generate_report_html, generate_report_markdown
 from agentops.services.foundry_evals import publish_foundry_evaluation
+from agentops.utils.telemetry import (
+    eval_item_span,
+    eval_run_span,
+    init_tracing,
+    record_evaluator_span,
+    set_eval_item_result,
+    set_eval_run_result,
+    shutdown as shutdown_tracing,
+)
 
 
 @dataclass(frozen=True)
@@ -366,8 +375,72 @@ def _append_run_metric(name: str, value: float) -> None:
     return run_metrics
 
 
+def _emit_item_spans(
+    *,
+    item_evaluations: List[ItemEvaluationResult],
+    row_metrics: List[RowMetricsResult],
+    bundle_config,
+) -> None:
+    """Emit one OTLP span per evaluated item, with evaluator child spans.
+
+    No-op when tracing is disabled; a missing per-row score is reported as 0.0.
+    """
+    import re
+
+    from agentops.utils.telemetry import is_enabled
+
+    if not is_enabled():
+        return
+
+    # Lookup: row_index -> {metric_name: value}
+    row_values_by_index: Dict[int, Dict[str, float]] = {
+        row.row_index: {m.name: m.value for m in row.metrics} for row in row_metrics
+    }
+
+    # Lookup: evaluator_name -> (source, threshold_value, criteria), using the
+    # first threshold declared for each enabled evaluator.
+    thresholds = bundle_config.thresholds
+    evaluator_info: Dict[str, tuple] = {}
+    for ev in bundle_config.evaluators:
+        if not ev.enabled:
+            continue
+        thr = next((t for t in thresholds if t.evaluator == ev.name), None)
+        limit, rule = (None, None) if thr is None else (thr.value, thr.criteria)
+        evaluator_info[ev.name] = (ev.source, limit, rule)
+
+    # Span-name stem per evaluator ("FooBarEvaluator" -> "foo_bar"), built once.
+    camel_boundary = re.compile(r"(?<!^)(?=[A-Z])")
+    builtin_names: Dict[str, str] = {}
+
+    for item in item_evaluations:
+        with eval_item_span(row_index=item.row_index) as item_span:
+            set_eval_item_result(item_span, passed=item.passed_all)
+            row_scores = row_values_by_index.get(item.row_index, {})
+            for thr_result in item.thresholds:
+                ev_name = thr_result.evaluator
+                if ev_name not in builtin_names:
+                    stem = ev_name.strip()
+                    if stem.endswith("Evaluator"):
+                        stem = stem[: -len("Evaluator")]
+                    builtin_names[ev_name] = camel_boundary.sub("_", stem).lower()
+                source, threshold_val, criteria = evaluator_info.get(
+                    ev_name, ("local", None, None)
+                )
+                record_evaluator_span(
+                    evaluator_name=ev_name,
+                    builtin_name=builtin_names[ev_name],
+                    source=source,
+                    score=row_scores.get(ev_name, 0.0),
+                    threshold=threshold_val,
+                    criteria=criteria,
+                    passed=thr_result.passed,
+                )
+
+
 def run_evaluation(
-    config_path: Path | None = None, output_override: Path | None = None, report_format: str = "md",
+    config_path: Path | None = None,
+    output_override: Path | None = None,
+    report_format: str = "md",
 ) -> EvalRunServiceResult:
     run_config_path = (
         config_path.resolve() if config_path is not None else _default_run_config_path()
@@ -381,6 +454,47 @@ def run_evaluation(
     bundle_config = load_bundle_config(bundle_path)
     dataset_config = load_dataset_config(dataset_path)
 
+    # Initialise OTLP tracing (no-op when AGENTOPS_OTLP_ENDPOINT is unset)
+    init_tracing()
+
+    target = (run_config.backend.target or "agent").strip().lower()
+
+    with eval_run_span(
+        bundle_name=bundle_config.name,
+        dataset_name=dataset_config.name,
+        backend_type=run_config.backend.type,
+        target=target,
+        model=run_config.backend.model,
+        agent_id=run_config.backend.agent_id,
+    ) as run_span:
+        result = _run_evaluation_inner(
+            run_config=run_config,
+            run_config_path=run_config_path,
+            bundle_config=bundle_config,
+            bundle_path=bundle_path,
+            dataset_config=dataset_config,
+            dataset_path=dataset_path,
+            output_override=output_override,
+            report_format=report_format,
+            run_span=run_span,
+        )
+
+    shutdown_tracing()
+    return result
+
+
+def _run_evaluation_inner(
+    *,
+    run_config,
+    run_config_path: Path,
+    bundle_config,
+    bundle_path: Path,
+    dataset_config,
+    dataset_path: Path,
+    output_override: Path | None,
+    report_format: str,
+    run_span,
+) -> EvalRunServiceResult:
     output_dir = (
         output_override.resolve()
         if output_override is not None
@@ -425,6 +539,13 @@ def run_evaluation(
 
     item_evaluations = _evaluate_item_thresholds(bundle_config.thresholds, row_metrics)
 
+    # Emit OTLP spans for each evaluated item (no-op when tracing is disabled)
+    _emit_item_spans(
+        item_evaluations=item_evaluations,
+        row_metrics=row_metrics,
+        bundle_config=bundle_config,
+    )
+
     if bundle_config.thresholds and not row_metrics:
         raise ValueError(
             "Item-level threshold evaluation requires backend 'row_metrics'"
@@ -512,9 +633,7 @@ def run_evaluation(
         report_path = md_path
     if report_format in ("html", "all"):
         html_path = output_dir / "report.html"
-        html_path.write_text(
-            generate_report_html(normalized_result), encoding="utf-8"
-        )
+        html_path.write_text(generate_report_html(normalized_result), encoding="utf-8")
         report_path = html_path
     if report_format == "all":
         report_path = md_path
@@ -523,6 +642,15 @@ def run_evaluation(
     _sync_latest_output(output_dir, latest_dir)
 
     exit_code = 0 if summary.overall_passed else 2
+
+    # Set final result on the root OTLP span
+    set_eval_run_result(
+        run_span,
+        passed=summary.overall_passed,
+        items_total=len(item_evaluations),
+        items_passed=sum(1 for item in item_evaluations if item.passed_all),
+    )
+
     return EvalRunServiceResult(
         output_dir=output_dir,
         results_path=results_path,
diff --git a/src/agentops/utils/telemetry.py b/src/agentops/utils/telemetry.py
new file mode 100644
index 00000000..f0f79c5a
--- /dev/null
+++ b/src/agentops/utils/telemetry.py
@@ -0,0 +1,291 @@
+"""Optional OpenTelemetry instrumentation for AgentOps evaluation runs.
+
+All OpenTelemetry imports are **lazy** — they only happen when tracing is
+enabled via the ``AGENTOPS_OTLP_ENDPOINT`` environment variable.  When the
+variable is unset, every public function in this module is a no-op.
+
+Schema design follows three OTel semantic convention layers:
+https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/
+
+* **CICD** (``cicd.pipeline.*``)  — the eval run as a pipeline
+* **GenAI** (``gen_ai.*``)        — the agent/model invocation
+* **AgentOps** (``agentops.eval.*``) — evaluation-specific (score, threshold)
+"""
+
+from __future__ import annotations
+
+import os
+from contextlib import contextmanager
+from typing import Any, Generator, Optional
+
+# ---------------------------------------------------------------------------
+# Lazy globals — initialised on first call to ``init_tracing()``
+# ---------------------------------------------------------------------------
+_tracer: Any = None
+_tracing_enabled: bool = False
+
+
+def is_enabled() -> bool:
+    """Return True once ``init_tracing()`` has successfully enabled OTLP tracing."""
+    return _tracing_enabled
+
+
+def init_tracing() -> None:
+    """Initialise the OTLP exporter if ``AGENTOPS_OTLP_ENDPOINT`` is set.
+
+    Safe to call repeatedly: calls after the first successful one do nothing.
+    """
+    global _tracer, _tracing_enabled  # noqa: PLW0603
+
+    if _tracing_enabled:
+        return
+    # Tolerate surrounding blanks and a trailing slash in the configured URL.
+    endpoint = os.getenv("AGENTOPS_OTLP_ENDPOINT", "").strip().rstrip("/")
+    if not endpoint:
+        return
+
+    try:
+        from opentelemetry import trace
+        from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
+            OTLPSpanExporter,
+        )
+        from opentelemetry.sdk.resources import Resource
+        from opentelemetry.sdk.trace import TracerProvider
+        from opentelemetry.sdk.trace.export import BatchSpanProcessor
+
+        import agentops
+
+        resource = Resource(
+            attributes={
+                "service.name": "agentops",
+                "service.version": getattr(agentops, "__version__", "0.0.0"),
+            }
+        )
+
+        provider = TracerProvider(resource=resource)
+        exporter = OTLPSpanExporter(endpoint=f"{endpoint}/v1/traces")
+        provider.add_span_processor(BatchSpanProcessor(exporter))
+        trace.set_tracer_provider(provider)
+
+        _tracer = trace.get_tracer("agentops")
+        _tracing_enabled = True
+    except ImportError:
+        # opentelemetry not installed — tracing stays disabled
+        pass
+
+
+def shutdown() -> None:
+    """Flush pending spans and shut down the tracer provider (best effort)."""
+    if not _tracing_enabled:
+        return
+    import logging
+
+    try:
+        from opentelemetry import trace
+
+        getattr(trace.get_tracer_provider(), "shutdown", lambda: None)()
+    except Exception:  # noqa: BLE001 - telemetry must never fail the run
+        logging.getLogger(__name__).debug("OTLP shutdown failed", exc_info=True)
+
+
+# ---------------------------------------------------------------------------
+# Span context managers
+# ---------------------------------------------------------------------------
+
+
+@contextmanager
+def eval_run_span(
+    *,
+    bundle_name: str,
+    dataset_name: str,
+    backend_type: str,
+    target: str,
+    model: Optional[str] = None,
+    agent_id: Optional[str] = None,
+) -> Generator[Optional[Any], None, None]:
+    """Open the root span for an evaluation run (CICD pipeline run).
+
+    Yields the span, or ``None`` when tracing is disabled.
+    """
+    if not _tracing_enabled or _tracer is None:
+        yield None
+        return
+
+    from opentelemetry.trace import SpanKind
+
+    # The tracer's context manager records a raised exception and sets ERROR
+    # status itself; a manual handler on top would duplicate the exception event.
+    with _tracer.start_as_current_span(
+        f"RUN {bundle_name}",
+        kind=SpanKind.SERVER,
+        record_exception=True,
+        set_status_on_exception=True,
+    ) as span:
+        # CICD semconv
+        span.set_attribute("cicd.pipeline.name", bundle_name)
+        span.set_attribute("cicd.pipeline.action.name", "RUN")
+        # AgentOps evaluation attributes
+        span.set_attribute("agentops.eval.dataset", dataset_name)
+        span.set_attribute("agentops.eval.backend", backend_type)
+        span.set_attribute("agentops.eval.target", target)
+        if model:
+            span.set_attribute("agentops.eval.model", model)
+        if agent_id:
+            span.set_attribute("agentops.eval.agent_id", agent_id)
+        yield span
+
+
+def set_eval_run_result(
+    span: Any,
+    *,
+    passed: bool,
+    items_total: int,
+    items_passed: int,
+) -> None:
+    """Record the final outcome of an evaluation run on its root span.
+
+    No-op when *span* is ``None``; the pass rate is skipped for empty runs.
+    """
+    if span is None:
+        return
+
+    from opentelemetry.trace import StatusCode
+
+    span.set_attribute("cicd.pipeline.result", "success" if passed else "failure")
+    span.set_attribute("agentops.eval.items_total", items_total)
+    span.set_attribute("agentops.eval.items_passed", items_passed)
+    if items_total > 0:
+        span.set_attribute("agentops.eval.pass_rate", items_passed / items_total)
+    status = (StatusCode.OK,) if passed else (StatusCode.ERROR, "Threshold failure")
+    span.set_status(*status)
+
+
+@contextmanager
+def eval_item_span(
+    *,
+    row_index: int,
+    input_text: Optional[str] = None,
+    expected_text: Optional[str] = None,
+) -> Generator[Optional[Any], None, None]:
+    """Open a span for one evaluation item; yields ``None`` when tracing is off."""
+    if not _tracing_enabled or _tracer is None:
+        yield None
+        return
+
+    from opentelemetry.trace import SpanKind
+
+    attributes = {
+        # CICD task attributes
+        "cicd.pipeline.task.name": "eval_item",
+        "cicd.pipeline.task.run.id": str(row_index),
+        # AgentOps item attributes
+        "agentops.eval.item.index": row_index,
+    }
+    if input_text:
+        attributes["agentops.eval.item.input"] = input_text
+    if expected_text:
+        attributes["agentops.eval.item.expected"] = expected_text
+    span_name = f"eval_item {row_index}"
+    with _tracer.start_as_current_span(span_name, kind=SpanKind.INTERNAL) as span:
+        for key, value in attributes.items():
+            span.set_attribute(key, value)
+        yield span
+
+
+def set_eval_item_result(span: Any, *, passed: bool) -> None:
+    """Record the pass/fail outcome of one evaluation item on *span*."""
+    # ``span`` is None when tracing is disabled, so there is nothing to update.
+    if span is not None:
+        outcome = "success" if passed else "failure"
+        # CICD semconv result first, then the AgentOps boolean form of it.
+        span.set_attribute("cicd.pipeline.task.run.result", outcome)
+        span.set_attribute("agentops.eval.item.passed", passed)
+
+
+@contextmanager
+def agent_invoke_span(
+    *,
+    target: str,
+    model: Optional[str] = None,
+    agent_id: Optional[str] = None,
+    agent_name: Optional[str] = None,
+    agent_version: Optional[str] = None,
+    provider: str = "azure.ai.inference",
+) -> Generator[Optional[Any], None, None]:
+    """Open a GenAI-semconv span around an agent or model invocation.
+
+    Yields the span, or ``None`` when tracing is disabled.
+    """
+    if not _tracing_enabled or _tracer is None:
+        yield None
+        return
+
+    from opentelemetry.trace import SpanKind
+
+    operation = "invoke_agent" if target == "agent" else "chat"
+    optional_attributes = (
+        ("gen_ai.request.model", model),
+        ("gen_ai.agent.id", agent_id),
+        ("gen_ai.agent.name", agent_name),
+        ("gen_ai.agent.version", agent_version),
+    )
+    with _tracer.start_as_current_span(
+        f"{operation} {agent_name or model or 'unknown'}",
+        kind=SpanKind.CLIENT,
+    ) as span:
+        span.set_attribute("gen_ai.operation.name", operation)
+        span.set_attribute("gen_ai.provider.name", provider)
+        for key, value in optional_attributes:
+            if value:
+                span.set_attribute(key, value)
+        yield span
+
+
+def set_agent_invoke_result(
+    span: Any,
+    *,
+    response_model: Optional[str] = None,
+    input_tokens: Optional[int] = None,
+    output_tokens: Optional[int] = None,
+) -> None:
+    """Attach GenAI response attributes to *span*; no-op when *span* is ``None``."""
+    if span is not None:
+        if response_model:
+            span.set_attribute("gen_ai.response.model", response_model)
+        # Token counts use ``is not None`` so that a legitimate 0 is still recorded.
+        if input_tokens is not None:
+            span.set_attribute("gen_ai.usage.input_tokens", input_tokens)
+        if output_tokens is not None:
+            span.set_attribute("gen_ai.usage.output_tokens", output_tokens)
+
+
+def record_evaluator_span(
+    *,
+    evaluator_name: str,
+    builtin_name: str,
+    source: str,
+    score: float,
+    threshold: Optional[float] = None,
+    criteria: Optional[str] = None,
+    passed: Optional[bool] = None,
+) -> None:
+    """Emit a child span for one evaluator result; no-op when tracing is off."""
+    if not _tracing_enabled or _tracer is None:
+        return
+
+    from opentelemetry.trace import SpanKind
+
+    attributes = (
+        ("agentops.eval.evaluator.name", evaluator_name, True),
+        ("agentops.eval.evaluator.builtin", builtin_name, True),
+        ("agentops.eval.evaluator.source", source, True),
+        ("agentops.eval.evaluator.score", score, True),
+        ("agentops.eval.evaluator.threshold", threshold, False),
+        ("agentops.eval.evaluator.criteria", criteria, False),
+        ("agentops.eval.evaluator.passed", passed, False),
+    )
+    span_name = f"evaluator {builtin_name}"
+    with _tracer.start_as_current_span(span_name, kind=SpanKind.INTERNAL) as span:
+        for key, value, always_set in attributes:
+            if always_set or value is not None:
+                span.set_attribute(key, value)
diff --git a/tests/unit/test_telemetry.py b/tests/unit/test_telemetry.py
new file mode 100644
index 00000000..cec0bd22
--- /dev/null
+++ b/tests/unit/test_telemetry.py
@@ -0,0 +1,267 @@
+"""Tests for OTLP telemetry instrumentation."""
+
+from __future__ import annotations
+
+import os
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from agentops.utils.telemetry import (
+    eval_item_span,
+    eval_run_span,
+    init_tracing,
+    is_enabled,
+    record_evaluator_span,
+    set_eval_item_result,
+    set_eval_run_result,
+)
+
+
+class TestTracingDisabledByDefault:
+    """With the module state reset to disabled, every public helper is a no-op."""
+
+    def setup_method(self) -> None:
+        import agentops.utils.telemetry as tel
+
+        tel._tracing_enabled = False
+        tel._tracer = None
+
+    def test_is_enabled_returns_false(self) -> None:
+        assert is_enabled() is False
+
+    def test_eval_run_span_yields_none(self) -> None:
+        with eval_run_span(
+            bundle_name="test",
+            dataset_name="test",
+            backend_type="foundry",
+            target="model",
+        ) as span:
+            assert span is None
+
+    def test_eval_item_span_yields_none(self) -> None:
+        with eval_item_span(row_index=1) as span:
+            assert span is None
+
+    def test_set_eval_run_result_noop(self) -> None:
+        # A None span (what the disabled context managers yield) must not raise
+        set_eval_run_result(None, passed=True, items_total=5, items_passed=5)
+
+    def test_set_eval_item_result_noop(self) -> None:
+        set_eval_item_result(None, passed=True)
+
+    def test_record_evaluator_span_noop(self) -> None:
+        # Must return without raising while tracing is disabled
+        record_evaluator_span(
+            evaluator_name="SimilarityEvaluator",
+            builtin_name="similarity",
+            source="foundry",
+            score=4.0,
+            threshold=3.0,
+            criteria=">=",
+            passed=True,
+        )
+
+
+class TestInitTracingWithoutEndpoint:
+    def test_no_init_without_env_var(self) -> None:
+        # Run against a copy of the environment without AGENTOPS_OTLP_ENDPOINT
+        env = os.environ.copy()
+        env.pop("AGENTOPS_OTLP_ENDPOINT", None)
+        with patch.dict(os.environ, env, clear=True):
+            # Reset module state so init_tracing() does not return early
+            import agentops.utils.telemetry as tel
+
+            tel._tracing_enabled = False
+            tel._tracer = None
+
+            init_tracing()
+            assert is_enabled() is False
+
+
+class TestInitTracingWithoutOtelInstalled:
+    def test_graceful_when_otel_missing(self) -> None:
+        import agentops.utils.telemetry as tel
+
+        tel._tracing_enabled = False
+        tel._tracer = None
+
+        with patch.dict(
+            os.environ, {"AGENTOPS_OTLP_ENDPOINT": "http://localhost:4318"}
+        ):
+            # A None entry in sys.modules makes importing opentelemetry fail
+            with patch.dict("sys.modules", {"opentelemetry": None}):
+                init_tracing()
+                assert is_enabled() is False
+
+
+class TestSpanAttributesWhenEnabled:
+    """Test that span context managers set correct attributes when tracing is enabled.
+
+    These tests require opentelemetry to be installed because the code paths
+    import SpanKind/StatusCode when tracing is enabled; they skip otherwise.
+    """
+
+    # NOTE: skip per test in setup_method; importorskip here would skip the module.
+
+    def setup_method(self) -> None:
+        """Skip without opentelemetry; otherwise mock an enabled tracer."""
+        pytest.importorskip("opentelemetry")
+        import agentops.utils.telemetry as tel
+        self.mock_span = MagicMock()
+        self.mock_span.__enter__ = MagicMock(return_value=self.mock_span)
+        self.mock_span.__exit__ = MagicMock(return_value=False)
+
+        self.mock_tracer = MagicMock()
+        self.mock_tracer.start_as_current_span.return_value = self.mock_span
+
+        tel._tracing_enabled = True
+        tel._tracer = self.mock_tracer
+
+    def teardown_method(self) -> None:
+        import agentops.utils.telemetry as tel
+
+        tel._tracing_enabled = False
+        tel._tracer = None
+
+    def test_eval_run_span_sets_cicd_attributes(self) -> None:
+        with eval_run_span(
+            bundle_name="model_direct",
+            dataset_name="smoke",
+            backend_type="foundry",
+            target="model",
+            model="gpt-4.1",
+        ) as span:
+            assert span is self.mock_span
+
+        # Verify CICD semconv attributes
+        calls = {
+            call.args[0]: call.args[1]
+            for call in self.mock_span.set_attribute.call_args_list
+        }
+        assert calls["cicd.pipeline.name"] == "model_direct"
+        assert calls["cicd.pipeline.action.name"] == "RUN"
+        assert calls["agentops.eval.dataset"] == "smoke"
+        assert calls["agentops.eval.backend"] == "foundry"
+        assert calls["agentops.eval.target"] == "model"
+        assert calls["agentops.eval.model"] == "gpt-4.1"
+
+    def test_eval_run_span_sets_agent_id(self) -> None:
+        with eval_run_span(
+            bundle_name="agent_test",
+            dataset_name="smoke",
+            backend_type="foundry",
+            target="agent",
+            agent_id="my-agent:3",
+        ):
+            pass
+
+        calls = {
+            call.args[0]: call.args[1]
+            for call in self.mock_span.set_attribute.call_args_list
+        }
+        assert calls["agentops.eval.agent_id"] == "my-agent:3"
+        assert calls["agentops.eval.target"] == "agent"
+
+    def test_eval_item_span_sets_task_attributes(self) -> None:
+        with eval_item_span(
+            row_index=3,
+            input_text="What is 2+2?",
+            expected_text="4",
+        ) as span:
+            assert span is self.mock_span
+
+        calls = {
+            call.args[0]: call.args[1]
+            for call in self.mock_span.set_attribute.call_args_list
+        }
+        assert calls["cicd.pipeline.task.name"] == "eval_item"
+        assert calls["cicd.pipeline.task.run.id"] == "3"
+        assert calls["agentops.eval.item.index"] == 3
+        assert calls["agentops.eval.item.input"] == "What is 2+2?"
+        assert calls["agentops.eval.item.expected"] == "4"
+
+    def test_set_eval_run_result_pass(self) -> None:
+        set_eval_run_result(
+            self.mock_span,
+            passed=True,
+            items_total=5,
+            items_passed=5,
+        )
+
+        calls = {
+            call.args[0]: call.args[1]
+            for call in self.mock_span.set_attribute.call_args_list
+        }
+        assert calls["cicd.pipeline.result"] == "success"
+        assert calls["agentops.eval.items_total"] == 5
+        assert calls["agentops.eval.items_passed"] == 5
+        assert calls["agentops.eval.pass_rate"] == 1.0
+
+    def test_set_eval_run_result_fail(self) -> None:
+        set_eval_run_result(
+            self.mock_span,
+            passed=False,
+            items_total=5,
+            items_passed=3,
+        )
+
+        calls = {
+            call.args[0]: call.args[1]
+            for call in self.mock_span.set_attribute.call_args_list
+        }
+        assert calls["cicd.pipeline.result"] == "failure"
+        assert calls["agentops.eval.items_passed"] == 3
+        assert calls["agentops.eval.pass_rate"] == 0.6
+
+    def test_set_eval_item_result(self) -> None:
+        set_eval_item_result(self.mock_span, passed=False)
+
+        calls = {
+            call.args[0]: call.args[1]
+            for call in self.mock_span.set_attribute.call_args_list
+        }
+        assert calls["cicd.pipeline.task.run.result"] == "failure"
+        assert calls["agentops.eval.item.passed"] is False
+
+    def test_record_evaluator_span(self) -> None:
+        record_evaluator_span(
+            evaluator_name="SimilarityEvaluator",
+            builtin_name="similarity",
+            source="foundry",
+            score=4.0,
+            threshold=3.0,
+            criteria=">=",
+            passed=True,
+        )
+
+        # Verify a child span was created
+        self.mock_tracer.start_as_current_span.assert_called_with(
+            "evaluator similarity",
+            kind=pytest.importorskip("opentelemetry.trace").SpanKind.INTERNAL,
+        )
+
+        calls = {
+            call.args[0]: call.args[1]
+            for call in self.mock_span.set_attribute.call_args_list
+        }
+        assert calls["agentops.eval.evaluator.name"] == "SimilarityEvaluator"
+        assert calls["agentops.eval.evaluator.builtin"] == "similarity"
+        assert calls["agentops.eval.evaluator.source"] == "foundry"
+        assert calls["agentops.eval.evaluator.score"] == 4.0
+        assert calls["agentops.eval.evaluator.threshold"] == 3.0
+        assert calls["agentops.eval.evaluator.criteria"] == ">="
+        assert calls["agentops.eval.evaluator.passed"] is True
+
+    def test_eval_run_span_name(self) -> None:
+        with eval_run_span(
+            bundle_name="my_bundle",
+            dataset_name="smoke",
+            backend_type="foundry",
+            target="model",
+        ):
+            pass
+
+        self.mock_tracer.start_as_current_span.assert_called_once()
+        span_name = self.mock_tracer.start_as_current_span.call_args.args[0]
+        assert span_name == "RUN my_bundle"