Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/copilot-instructions.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,3 +254,12 @@ When generating or modifying code:
- The `core/` package must remain free of Azure imports and I/O
- Follow the request flow: CLI → Services → Backends → Core (never skip layers)
- If a change is user-visible, add an entry to `CHANGELOG.md` under `[Unreleased]` (Keep a Changelog format)

### OTLP Telemetry

- `utils/telemetry.py` provides optional OTLP trace emission for evaluation runs
- Activated by `AGENTOPS_OTLP_ENDPOINT` env var — zero overhead when unset
- All OpenTelemetry imports must be **lazy** (inside functions in `utils/telemetry.py`)
- `opentelemetry-sdk` and `opentelemetry-exporter-otlp-proto-http` are optional runtime dependencies — not declared in `pyproject.toml`
- Span schema: CICD semconv (`cicd.pipeline.*`) for pipeline structure, GenAI semconv (`gen_ai.*`) for agent calls, `agentops.eval.*` for evaluator scores
- When adding new spans, follow the three-layer pattern in `telemetry.py`
30 changes: 29 additions & 1 deletion AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,8 @@ src/
├── utils/
│ ├── yaml.py # YAML IO and interpolation helpers
│ └── logging.py # Logging setup
│ ├── logging.py # Logging setup
│ └── telemetry.py # Optional OTLP tracing (lazy imports)
└── templates/
├── config.yaml # Seed workspace config
Expand Down Expand Up @@ -368,6 +369,7 @@ Important environment variables:
- `AZURE_OPENAI_DEPLOYMENT`
- `AZURE_AI_MODEL_DEPLOYMENT_NAME`
- `AZURE_OPENAI_API_VERSION`
- `AGENTOPS_OTLP_ENDPOINT` — OTLP collector URL for evaluation tracing (opt-in, e.g. `http://localhost:4318`)

Recommended default behavior:
- Keep Foundry cloud mode as the default path
Expand All @@ -377,6 +379,32 @@ Recommended default behavior:

---

## OTLP Telemetry

AgentOps can optionally emit OpenTelemetry traces over OTLP during evaluation runs. Set the `AGENTOPS_OTLP_ENDPOINT` environment variable to your collector URL to enable it.

```bash
# Enable tracing (e.g. AI Toolkit collector, Azure Monitor, Jaeger)
export AGENTOPS_OTLP_ENDPOINT=http://localhost:4318
agentops eval run
```

Span schema uses three OTel semantic convention layers:

| Layer | Namespace | Purpose |
|---|---|---|
| CICD | `cicd.pipeline.*` | Eval run as pipeline, items as tasks |
| GenAI | `gen_ai.*` | Agent/model invocation (future Layer 2) |
| AgentOps | `agentops.eval.*` | Evaluator scores, thresholds, pass/fail |

Design rules:
- All OpenTelemetry imports are **lazy** (inside `utils/telemetry.py` functions)
- Zero overhead when `AGENTOPS_OTLP_ENDPOINT` is unset
- Graceful no-op when `opentelemetry-sdk` is not installed
- `opentelemetry-sdk` and `opentelemetry-exporter-otlp-proto-http` are optional runtime dependencies (not in `pyproject.toml`)

---

## Architectural Constraints

### Code Organization
Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres
## [Unreleased]

### Added
- Add optional OTLP tracing for evaluation runs — set `AGENTOPS_OTLP_ENDPOINT` to emit OpenTelemetry spans.
- Three-layer schema: CICD semconv (pipeline run/task), GenAI semconv (agent invocation), and `agentops.eval.*` (evaluator scores/thresholds).
- Per-row item spans with evaluator child spans showing score, threshold, and pass/fail.
- Zero overhead when `AGENTOPS_OTLP_ENDPOINT` is unset; graceful no-op when `opentelemetry-sdk` is not installed.
- Compatible with AI Toolkit (localhost:4318), Azure Monitor, Jaeger, Grafana Tempo, and any OTLP-compatible collector.
- Implement `agentops eval compare --runs <baseline>,<current>` for baseline comparison of evaluation runs.
- Produces `comparison.json` (structured metric deltas, threshold flips, item-level changes) and `comparison.md` (human-readable report).
- Exits with code `0` (no regressions), `2` (regressions detected), or `1` (error).
Expand Down
136 changes: 132 additions & 4 deletions src/agentops/services/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@
)
from agentops.core.reporter import generate_report_html, generate_report_markdown
from agentops.services.foundry_evals import publish_foundry_evaluation
from agentops.utils.telemetry import (
eval_item_span,
eval_run_span,
init_tracing,
record_evaluator_span,
set_eval_item_result,
set_eval_run_result,
shutdown as shutdown_tracing,
)


@dataclass(frozen=True)
Expand Down Expand Up @@ -366,8 +375,72 @@ def _append_run_metric(name: str, value: float) -> None:
return run_metrics


def _emit_item_spans(
    *,
    item_evaluations: List[ItemEvaluationResult],
    row_metrics: List[RowMetricsResult],
    bundle_config,
) -> None:
    """Emit one OTLP span per evaluated item, with a child span per evaluator.

    Returns immediately when ``is_enabled()`` from ``agentops.utils.telemetry``
    reports that tracing is off.

    Args:
        item_evaluations: Per-item threshold results; each entry supplies
            ``row_index``, ``passed_all`` and ``thresholds``.
        row_metrics: Per-row metric values, used to look up the score that
            is attached to each evaluator span.
        bundle_config: Bundle configuration; its ``evaluators`` and
            ``thresholds`` provide the source, threshold value and criteria
            reported for each evaluator.
    """
    from agentops.utils.telemetry import is_enabled

    if not is_enabled():
        return

    # Imported lazily so the disabled path above stays as cheap as possible.
    import re

    # row_index -> {metric_name: value}
    row_values_by_index: Dict[int, Dict[str, float]] = {
        row.row_index: {m.name: m.value for m in row.metrics} for row in row_metrics
    }

    # evaluator name -> first threshold declared for it. ``setdefault`` keeps
    # the first match, mirroring a linear scan that stops at the first hit.
    first_threshold_by_evaluator: Dict[str, object] = {}
    for thr in bundle_config.thresholds:
        first_threshold_by_evaluator.setdefault(thr.evaluator, thr)

    # evaluator name -> (source, threshold_value, criteria), enabled only.
    evaluator_info: Dict[str, tuple] = {}
    for ev in bundle_config.evaluators:
        if not ev.enabled:
            continue
        matched = first_threshold_by_evaluator.get(ev.name)
        evaluator_info[ev.name] = (
            ev.source,
            matched.value if matched is not None else None,
            matched.criteria if matched is not None else None,
        )

    # The snake_case "builtin" name depends only on the evaluator name, so it
    # is computed once per distinct name instead of once per item/threshold.
    camel_boundary = re.compile(r"(?<!^)(?=[A-Z])")
    evaluator_suffix = "Evaluator"
    builtin_names: Dict[str, str] = {}

    def builtin_name_for(ev_name: str) -> str:
        """Return the memoised snake_case name with any 'Evaluator' suffix removed."""
        cached = builtin_names.get(ev_name)
        if cached is None:
            stem = ev_name.strip()
            if stem.endswith(evaluator_suffix):
                stem = stem[: -len(evaluator_suffix)]
            cached = camel_boundary.sub("_", stem).lower()
            builtin_names[ev_name] = cached
        return cached

    for item in item_evaluations:
        with eval_item_span(row_index=item.row_index) as item_span:
            set_eval_item_result(item_span, passed=item.passed_all)

            # Child spans: one per threshold result recorded for this item.
            row_scores = row_values_by_index.get(item.row_index, {})
            for thr_result in item.thresholds:
                ev_name = thr_result.evaluator
                # Evaluators missing from the bundle (or disabled) are
                # reported as "local" with no threshold metadata.
                source, threshold_val, criteria = evaluator_info.get(
                    ev_name, ("local", None, None)
                )
                # NOTE(review): a row without a metric for this evaluator is
                # reported with score 0.0, which is indistinguishable from a
                # genuine zero score — confirm this is intended.
                score = row_scores.get(ev_name, 0.0)

                record_evaluator_span(
                    evaluator_name=ev_name,
                    builtin_name=builtin_name_for(ev_name),
                    source=source,
                    score=score,
                    threshold=threshold_val,
                    criteria=criteria,
                    passed=thr_result.passed,
                )


def run_evaluation(
config_path: Path | None = None, output_override: Path | None = None, report_format: str = "md",
config_path: Path | None = None,
output_override: Path | None = None,
report_format: str = "md",
) -> EvalRunServiceResult:
run_config_path = (
config_path.resolve() if config_path is not None else _default_run_config_path()
Expand All @@ -381,6 +454,47 @@ def run_evaluation(
bundle_config = load_bundle_config(bundle_path)
dataset_config = load_dataset_config(dataset_path)

# Initialise OTLP tracing (no-op when AGENTOPS_OTLP_ENDPOINT is unset)
init_tracing()

target = (run_config.backend.target or "agent").strip().lower()

with eval_run_span(
bundle_name=bundle_config.name,
dataset_name=dataset_config.name,
backend_type=run_config.backend.type,
target=target,
model=run_config.backend.model,
agent_id=run_config.backend.agent_id,
) as run_span:
result = _run_evaluation_inner(
run_config=run_config,
run_config_path=run_config_path,
bundle_config=bundle_config,
bundle_path=bundle_path,
dataset_config=dataset_config,
dataset_path=dataset_path,
output_override=output_override,
report_format=report_format,
run_span=run_span,
)

shutdown_tracing()
return result


def _run_evaluation_inner(
*,
run_config,
run_config_path: Path,
bundle_config,
bundle_path: Path,
dataset_config,
dataset_path: Path,
output_override: Path | None,
report_format: str,
run_span,
) -> EvalRunServiceResult:
output_dir = (
output_override.resolve()
if output_override is not None
Expand Down Expand Up @@ -425,6 +539,13 @@ def run_evaluation(

item_evaluations = _evaluate_item_thresholds(bundle_config.thresholds, row_metrics)

# Emit OTLP spans for each evaluated item (no-op when tracing is disabled)
_emit_item_spans(
item_evaluations=item_evaluations,
row_metrics=row_metrics,
bundle_config=bundle_config,
)

if bundle_config.thresholds and not row_metrics:
raise ValueError(
"Item-level threshold evaluation requires backend 'row_metrics'"
Expand Down Expand Up @@ -512,9 +633,7 @@ def run_evaluation(
report_path = md_path
if report_format in ("html", "all"):
html_path = output_dir / "report.html"
html_path.write_text(
generate_report_html(normalized_result), encoding="utf-8"
)
html_path.write_text(generate_report_html(normalized_result), encoding="utf-8")
report_path = html_path
if report_format == "all":
report_path = md_path
Expand All @@ -523,6 +642,15 @@ def run_evaluation(
_sync_latest_output(output_dir, latest_dir)

exit_code = 0 if summary.overall_passed else 2

# Set final result on the root OTLP span
set_eval_run_result(
run_span,
passed=summary.overall_passed,
items_total=len(item_evaluations),
items_passed=sum(1 for item in item_evaluations if item.passed_all),
)

return EvalRunServiceResult(
output_dir=output_dir,
results_path=results_path,
Expand Down
Loading
Loading