Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,23 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
  with:
    fetch-depth: 0  # Full history so `git describe` below can find release tags

- name: Sync VSIX version from git tag
  run: |
    # Stamp plugins/agentops/package.json with a version derived from git:
    # the tag's own version when HEAD is tagged, otherwise the next patch.
    # --match restricts the lookup to v<digit>... tags so an unrelated tag
    # (e.g. "latest") is never parsed as a version.
    LAST_TAG=$(git describe --tags --abbrev=0 --match 'v[0-9]*' 2>/dev/null \
      || echo "v0.0.0")
    LAST_VERSION=${LAST_TAG#v}
    # Fail early with a readable message rather than writing a malformed
    # version for tags that are not plain MAJOR.MINOR.PATCH
    # (e.g. v1.2 or v1.2.3-rc.1).
    if [[ ! "$LAST_VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
      echo "::error::Tag '$LAST_TAG' is not in vMAJOR.MINOR.PATCH form"
      exit 1
    fi
    IFS='.' read -r MAJOR MINOR PATCH <<< "$LAST_VERSION"
    if git describe --tags --exact-match --match 'v[0-9]*' HEAD >/dev/null 2>&1; then
      BASE_VERSION="$LAST_VERSION"
    else
      # 10# forces base 10 so a zero-padded patch ("08") is not read as octal.
      BASE_VERSION="$MAJOR.$MINOR.$((10#$PATCH + 1))"
    fi
    jq --arg v "$BASE_VERSION" '.version = $v' \
      plugins/agentops/package.json > plugins/agentops/package.json.tmp
    mv plugins/agentops/package.json.tmp plugins/agentops/package.json
    echo "VSIX version set to $BASE_VERSION (from tag $LAST_TAG)"

- name: Set up Node.js
uses: actions/setup-node@v4
Expand Down Expand Up @@ -224,6 +241,23 @@ jobs:
environment: staging
steps:
- uses: actions/checkout@v4
  with:
    fetch-depth: 0  # Full history so `git describe` below can find release tags

- name: Sync VSIX version from git tag
  run: |
    # Stamp plugins/agentops/package.json with a version derived from git:
    # the tag's own version when HEAD is tagged, otherwise the next patch.
    # --match restricts the lookup to v<digit>... tags so an unrelated tag
    # (e.g. "latest") is never parsed as a version.
    LAST_TAG=$(git describe --tags --abbrev=0 --match 'v[0-9]*' 2>/dev/null \
      || echo "v0.0.0")
    LAST_VERSION=${LAST_TAG#v}
    # Fail early with a readable message rather than writing a malformed
    # version for tags that are not plain MAJOR.MINOR.PATCH
    # (e.g. v1.2 or v1.2.3-rc.1).
    if [[ ! "$LAST_VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
      echo "::error::Tag '$LAST_TAG' is not in vMAJOR.MINOR.PATCH form"
      exit 1
    fi
    IFS='.' read -r MAJOR MINOR PATCH <<< "$LAST_VERSION"
    if git describe --tags --exact-match --match 'v[0-9]*' HEAD >/dev/null 2>&1; then
      BASE_VERSION="$LAST_VERSION"
    else
      # 10# forces base 10 so a zero-padded patch ("08") is not read as octal.
      BASE_VERSION="$MAJOR.$MINOR.$((10#$PATCH + 1))"
    fi
    jq --arg v "$BASE_VERSION" '.version = $v' \
      plugins/agentops/package.json > plugins/agentops/package.json.tmp
    mv plugins/agentops/package.json.tmp plugins/agentops/package.json
    echo "VSIX version set to $BASE_VERSION (from tag $LAST_TAG)"

- name: Set up Node.js
uses: actions/setup-node@v4
Expand Down
17 changes: 17 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,23 @@ jobs:
environment: release # same approval gate as PyPI
steps:
- uses: actions/checkout@v4
  with:
    fetch-depth: 0  # Full history so `git describe` below can find release tags

- name: Sync VSIX version from git tag
  run: |
    # Stamp plugins/agentops/package.json with a version derived from git:
    # the tag's own version when HEAD is tagged, otherwise the next patch.
    # --match restricts the lookup to v<digit>... tags so an unrelated tag
    # (e.g. "latest") is never parsed as a version.
    LAST_TAG=$(git describe --tags --abbrev=0 --match 'v[0-9]*' 2>/dev/null \
      || echo "v0.0.0")
    LAST_VERSION=${LAST_TAG#v}
    # Fail early with a readable message rather than writing a malformed
    # version for tags that are not plain MAJOR.MINOR.PATCH
    # (e.g. v1.2 or v1.2.3-rc.1).
    if [[ ! "$LAST_VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
      echo "::error::Tag '$LAST_TAG' is not in vMAJOR.MINOR.PATCH form"
      exit 1
    fi
    IFS='.' read -r MAJOR MINOR PATCH <<< "$LAST_VERSION"
    if git describe --tags --exact-match --match 'v[0-9]*' HEAD >/dev/null 2>&1; then
      BASE_VERSION="$LAST_VERSION"
    else
      # 10# forces base 10 so a zero-padded patch ("08") is not read as octal.
      BASE_VERSION="$MAJOR.$MINOR.$((10#$PATCH + 1))"
    fi
    jq --arg v "$BASE_VERSION" '.version = $v' \
      plugins/agentops/package.json > plugins/agentops/package.json.tmp
    mv plugins/agentops/package.json.tmp plugins/agentops/package.json
    echo "VSIX version set to $BASE_VERSION (from tag $LAST_TAG)"

- name: Set up Node.js
uses: actions/setup-node@v4
Expand Down
17 changes: 17 additions & 0 deletions .github/workflows/staging.yml
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,23 @@ jobs:
environment: staging
steps:
- uses: actions/checkout@v4
  with:
    fetch-depth: 0  # Full history so `git describe` below can find release tags

- name: Sync VSIX version from git tag
  run: |
    # Stamp plugins/agentops/package.json with a version derived from git:
    # the tag's own version when HEAD is tagged, otherwise the next patch.
    # --match restricts the lookup to v<digit>... tags so an unrelated tag
    # (e.g. "latest") is never parsed as a version.
    LAST_TAG=$(git describe --tags --abbrev=0 --match 'v[0-9]*' 2>/dev/null \
      || echo "v0.0.0")
    LAST_VERSION=${LAST_TAG#v}
    # Fail early with a readable message rather than writing a malformed
    # version for tags that are not plain MAJOR.MINOR.PATCH
    # (e.g. v1.2 or v1.2.3-rc.1).
    if [[ ! "$LAST_VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
      echo "::error::Tag '$LAST_TAG' is not in vMAJOR.MINOR.PATCH form"
      exit 1
    fi
    IFS='.' read -r MAJOR MINOR PATCH <<< "$LAST_VERSION"
    if git describe --tags --exact-match --match 'v[0-9]*' HEAD >/dev/null 2>&1; then
      BASE_VERSION="$LAST_VERSION"
    else
      # 10# forces base 10 so a zero-padded patch ("08") is not read as octal.
      BASE_VERSION="$MAJOR.$MINOR.$((10#$PATCH + 1))"
    fi
    jq --arg v "$BASE_VERSION" '.version = $v' \
      plugins/agentops/package.json > plugins/agentops/package.json.tmp
    mv plugins/agentops/package.json.tmp plugins/agentops/package.json
    echo "VSIX version set to $BASE_VERSION (from tag $LAST_TAG)"

- name: Set up Node.js
uses: actions/setup-node@v4
Expand Down
42 changes: 29 additions & 13 deletions plugins/agentops/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,10 @@ Copilot agent skills for running standardized evaluation workflows with
| **Browse & Inspect** | List and inspect evaluation runs, view per-row scores, browse run history |
| **Dataset Management** | Validate, describe, and import datasets for evaluation workflows |

## Prerequisites

Install the AgentOps CLI in your project's virtual environment:

```bash
pip install agentops-toolkit
```

## Installation

Install from the
[VS Code Marketplace](https://marketplace.visualstudio.com/items?itemName=PUBLISHER_ID.agentops-skills)
[VS Code Marketplace](https://marketplace.visualstudio.com/items?itemName=AgentOpsToolkit.agentops-toolkit)
or search **"AgentOps Skills"** in the VS Code Extensions view.

A **pre-release** channel is available for early access to new skills and updates —
Expand All @@ -34,13 +26,37 @@ enable it from the extension's Marketplace page or the Extensions view.
## Usage

Open **Copilot Chat** in VS Code and describe what you want to do.
The skills are invoked automatically when your request matches their domain:
The skills are invoked automatically when your request matches their domain.

**Set up a workspace**

```
> Initialize an agentops workspace for my Foundry agent project
> Create a RAG evaluation bundle with groundedness and similarity
```

**Run and compare evaluations**

```
> Run the default evaluation against my agent
> Benchmark gpt-4o vs gpt-4o-mini using the smoke dataset
> Compare the last two evaluation runs and summarize the differences
```

**Investigate results**

```
> Initialize an agentops workspace for my project
> Run the default evaluation
> Compare run abc123 with run def456
> Which rows failed the groundedness threshold?
> Show me the worst-scoring items from the latest run
> Why did similarity drop between run abc123 and run def456?
```

**Browse and manage**

```
> List all evaluation runs
> Show details for the latest run
> Validate my dataset before running an eval
```

## Links
Expand Down
15 changes: 11 additions & 4 deletions src/agentops/backends/foundry_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,8 @@ def _azure_openai_model_config(
"Missing: " + ", ".join(missing)
)

assert endpoint is not None
assert deployment is not None
model_config: Dict[str, str] = {
"azure_endpoint": endpoint,
"azure_deployment": deployment,
Expand Down Expand Up @@ -903,6 +905,7 @@ def _read_settings(self, context: BackendRunContext) -> FoundrySettings:
# Model-direct: use cognitive services scope
token_scope = "https://cognitiveservices.azure.com/.default"
else:
assert agent_id is not None
token_scope = _preferred_scope_for_agent_id(agent_id)
logger.info("Acquiring token via DefaultAzureCredential…")
agent_token = _acquire_token(token_scope)
Expand Down Expand Up @@ -1025,6 +1028,7 @@ def _invoke_agent_reference(
"Authorization": f"Bearer {settings.agent_token}",
}

assert settings.agent_id is not None
agent_name, agent_version = (settings.agent_id, None)
if ":" in settings.agent_id:
split_name, split_version = settings.agent_id.split(":", 1)
Expand Down Expand Up @@ -1055,6 +1059,7 @@ def _invoke_agent_reference(
def _invoke_agent_service(
self, settings: FoundrySettings, prompt: str, timeout_seconds: int | None
) -> str:
assert settings.agent_id is not None
if not settings.agent_id.startswith("asst_"):
return self._invoke_agent_reference(settings, prompt, timeout_seconds)

Expand Down Expand Up @@ -1161,6 +1166,7 @@ def _invoke_model_direct(self, settings: FoundrySettings, prompt: str) -> str:
)
openai_client = project_client.get_openai_client()

assert settings.model is not None
response = openai_client.chat.completions.create(
model=settings.model,
messages=[{"role": "user", "content": prompt}],
Expand Down Expand Up @@ -1381,6 +1387,7 @@ def _evals_get(path: str, extra_params: str = "") -> Dict[str, Any]:
)
else:
# Agent target
assert settings.agent_id is not None
agent_name, agent_version = _parse_agent_name_version(settings.agent_id)
target: Dict[str, Any] = {
"type": "azure_ai_agent",
Expand Down Expand Up @@ -1500,7 +1507,7 @@ def _evals_get(path: str, extra_params: str = "") -> Dict[str, Any]:
if isinstance(sample, dict):
prediction = _normalize_text(sample.get("output_text", ""))

row_metric_entries: List[Dict[str, float]] = []
row_metric_entries: List[Dict[str, Any]] = []
for result in item.get("results", []) or []:
metric_name = result.get("name", "") if isinstance(result, dict) else ""
metric_score = (
Expand Down Expand Up @@ -1586,7 +1593,7 @@ def _evals_get(path: str, extra_params: str = "") -> Dict[str, Any]:
total = len(output_items)

# --- Aggregate metrics ----------------------------------------------
metrics_entries: List[Dict[str, float]] = []
metrics_entries: List[Dict[str, Any]] = []
for name in enabled_evaluator_order:
values = evaluator_aggregate_values.get(name, [])
if values:
Expand Down Expand Up @@ -1748,7 +1755,7 @@ def _record_row_metrics(
prediction_normalized = _normalize_text(prediction_text)
total += 1

row_metric_entries: List[Dict[str, float]] = []
row_metric_entries: List[Dict[str, Any]] = []

for runtime in foundry_evaluator_runtimes:
score = _run_foundry_evaluator(
Expand Down Expand Up @@ -1912,7 +1919,7 @@ def _record_row_metrics(
else 0.0
)

metrics_entries: List[Dict[str, float]] = []
metrics_entries: List[Dict[str, Any]] = []
for evaluator_name in enabled_evaluator_order:
values = evaluator_aggregate_values.get(evaluator_name, [])
if values:
Expand Down
4 changes: 2 additions & 2 deletions src/agentops/core/config_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from pathlib import Path
from typing import Type, TypeVar

from pydantic import ValidationError
from pydantic import BaseModel, ValidationError

from agentops.core.models import (
BundleConfig,
Expand All @@ -15,7 +15,7 @@
)
from agentops.utils.yaml import load_yaml

TModel = TypeVar("TModel")
TModel = TypeVar("TModel", bound=BaseModel)


def _load_model(path: Path, model_cls: Type[TModel], label: str) -> TModel:
Expand Down
12 changes: 6 additions & 6 deletions src/agentops/core/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,20 +525,20 @@ def generate_comparison_html(result: ComparisonResult) -> str:

# Pre-compute per-evaluator row pass rates
eval_row_rates: dict[str, list[tuple[int, int]]] = {}
for tr in result.threshold_rows:
for thr in result.threshold_rows:
rates = []
for run_idx in range(len(result.runs)):
total = 0
passed = 0
for ir in result.item_rows:
scores_list = ir.scores.get(tr.evaluator, [])
scores_list = ir.scores.get(thr.evaluator, [])
score = scores_list[run_idx] if run_idx < len(scores_list) else None
if score is not None:
total += 1
if _check_threshold(score, tr.criteria, tr.target):
if _check_threshold(score, thr.criteria, thr.target):
passed += 1
rates.append((passed, total))
eval_row_rates[tr.evaluator] = rates
eval_row_rates[thr.evaluator] = rates

parts: list[str] = []

Expand Down Expand Up @@ -707,9 +707,9 @@ def generate_comparison_html(result: ComparisonResult) -> str:
parts.append(
"<table><thead><tr><th>Parameter</th><th>Value</th></tr></thead><tbody>"
)
for k, v in cond.fixed.items():
for key, val in cond.fixed.items():
parts.append(
f"<tr><td>{_html_escape(k)}</td><td>{_html_escape(v)}</td></tr>"
f"<tr><td>{_html_escape(key)}</td><td>{_html_escape(val)}</td></tr>"
)
parts.append("</tbody></table>")

Expand Down
5 changes: 2 additions & 3 deletions src/agentops/services/browse.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,9 +288,8 @@ def show_run(run_id: str, directory: Path = Path(".")) -> RunDetail:
data = json.loads(results_file.read_text(encoding="utf-8"))
result = RunResult.model_validate(data)

report_path = run_dir / "report.md"
if not report_path.exists():
report_path = None
_rp = run_dir / "report.md"
report_path: Path | None = _rp if _rp.exists() else None

foundry_url = None
if result.artifacts and result.artifacts.foundry_eval_studio_url:
Expand Down
26 changes: 16 additions & 10 deletions src/agentops/services/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,13 @@
ComparisonResult,
ComparisonSummary,
ComparisonThresholdRow,
ComparisonType,
Criteria,
Direction,
ItemEvaluationResult,
RunReference,
RunResult,
ThresholdEvaluationResult,
)


Expand Down Expand Up @@ -120,7 +125,7 @@ def _lower_is_better_metrics(*results: RunResult) -> frozenset[str]:
return frozenset(names)


def _compute_metric_direction(delta: float, lower_is_better: bool) -> str:
def _compute_metric_direction(delta: float, lower_is_better: bool) -> Direction:
if delta == 0:
return "unchanged"
if lower_is_better:
Expand Down Expand Up @@ -153,6 +158,7 @@ def _detect_conditions(refs: List[RunReference]) -> ComparisonConditions:
varying.append(key)

# Determine comparison type
ctype: ComparisonType
if "dataset" not in varying and "agent" in varying:
ctype = "agent"
elif "dataset" not in varying and "model" in varying:
Expand Down Expand Up @@ -198,7 +204,7 @@ def compare_runs(
values: List[float] = []
deltas: List[Optional[float]] = []
delta_percents: List[Optional[float]] = []
directions: List[str] = []
directions: List[Direction] = []
baseline_val: Optional[float] = None

for i, r in enumerate(results):
Expand Down Expand Up @@ -254,11 +260,11 @@ def compare_runs(
)

# Build threshold rows
all_thresholds: List[tuple[str, str]] = []
seen_thresholds: set[tuple[str, str]] = set()
all_thresholds: List[tuple[str, Criteria]] = []
seen_thresholds: set[tuple[str, Criteria]] = set()
for r in results:
for t in r.thresholds:
key = (t.evaluator, t.criteria)
for th in r.thresholds:
key = (th.evaluator, th.criteria)
if key not in seen_thresholds:
all_thresholds.append(key)
seen_thresholds.add(key)
Expand All @@ -269,7 +275,7 @@ def compare_runs(
target_val: str | None = None
for r in results:
t_map = {(t.evaluator, t.criteria): t for t in r.thresholds}
t = t_map.get((evaluator, criteria))
t: ThresholdEvaluationResult | None = t_map.get((evaluator, criteria))
passed_list.append(t.passed if t else False)
if t and target_val is None:
target_val = t.expected
Expand All @@ -285,8 +291,8 @@ def compare_runs(
# Build item rows
all_row_indices: set[int] = set()
for r in results:
for item in r.item_evaluations:
all_row_indices.add(item.row_index)
for ie in r.item_evaluations:
all_row_indices.add(ie.row_index)

# Collect evaluator names that have thresholds (for row-level display)
threshold_evaluator_names = [tr.evaluator for tr in threshold_rows]
Expand All @@ -300,7 +306,7 @@ def compare_runs(
}
for r in results:
item_map = {item.row_index: item for item in r.item_evaluations}
item = item_map.get(idx)
item: ItemEvaluationResult | None = item_map.get(idx)
passed_list.append(item.passed_all if item else False)
# Extract row-level metric scores
row_metrics_map = {row.row_index: row for row in r.row_metrics}
Expand Down
Loading
Loading