From 75d4524beb0158c1e361ab29f880befc04d9acb1 Mon Sep 17 00:00:00 2001 From: Dongbumlee Date: Mon, 13 Apr 2026 16:11:57 -0700 Subject: [PATCH 1/5] ci(vsix): sync VSIX version from git tags in all pipelines Derive package.json version at CI time from the latest git tag using git describe + jq. Mimics setuptools-scm patch-increment behavior: - On exact tag (release): use tag version directly (e.g. v0.2.0 -> 0.2.0) - Off tag (develop/PR): increment patch (e.g. v0.1.0 + commits -> 0.1.1) Applied to all 4 VSIX jobs: - ci.yml: build-vsix, publish-vsix-dev - staging.yml: publish-vsix-prerelease - release.yml: publish-vsix Also adds fetch-depth: 0 to checkout steps so git describe has access to the full tag history. --- .github/workflows/ci.yml | 34 ++++++++++++++++++++++++++++++++++ .github/workflows/release.yml | 17 +++++++++++++++++ .github/workflows/staging.yml | 17 +++++++++++++++++ 3 files changed, 68 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 83ecfc25..b2536916 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -192,6 +192,23 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 # Full history for version derivation + + - name: Sync VSIX version from git tag + run: | + LAST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0") + LAST_VERSION=${LAST_TAG#v} + IFS='.' read -r MAJOR MINOR PATCH <<< "$LAST_VERSION" + if git describe --tags --exact-match HEAD >/dev/null 2>&1; then + BASE_VERSION="$LAST_VERSION" + else + BASE_VERSION="$MAJOR.$MINOR.$((PATCH + 1))" + fi + jq --arg v "$BASE_VERSION" '.version = $v' \ + plugins/agentops/package.json > plugins/agentops/package.json.tmp + mv plugins/agentops/package.json.tmp plugins/agentops/package.json + echo "VSIX version set to $BASE_VERSION (from tag $LAST_TAG)" - name: Set up Node.js uses: actions/setup-node@v4 @@ -224,6 +241,23 @@ jobs: environment: staging steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 # Full history for version derivation + + - name: Sync VSIX version from git tag + run: | + LAST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0") + LAST_VERSION=${LAST_TAG#v} + IFS='.' read -r MAJOR MINOR PATCH <<< "$LAST_VERSION" + if git describe --tags --exact-match HEAD >/dev/null 2>&1; then + BASE_VERSION="$LAST_VERSION" + else + BASE_VERSION="$MAJOR.$MINOR.$((PATCH + 1))" + fi + jq --arg v "$BASE_VERSION" '.version = $v' \ + plugins/agentops/package.json > plugins/agentops/package.json.tmp + mv plugins/agentops/package.json.tmp plugins/agentops/package.json + echo "VSIX version set to $BASE_VERSION (from tag $LAST_TAG)" - name: Set up Node.js uses: actions/setup-node@v4 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index aeb9fcc0..9100c6e7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -147,6 +147,23 @@ jobs: environment: release # same approval gate as PyPI steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 # Full history for version derivation + + - name: Sync VSIX version from git tag + run: | + LAST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0") + LAST_VERSION=${LAST_TAG#v} + IFS='.' read -r MAJOR MINOR PATCH <<< "$LAST_VERSION" + if git describe --tags --exact-match HEAD >/dev/null 2>&1; then + BASE_VERSION="$LAST_VERSION" + else + BASE_VERSION="$MAJOR.$MINOR.$((PATCH + 1))" + fi + jq --arg v "$BASE_VERSION" '.version = $v' \ + plugins/agentops/package.json > plugins/agentops/package.json.tmp + mv plugins/agentops/package.json.tmp plugins/agentops/package.json + echo "VSIX version set to $BASE_VERSION (from tag $LAST_TAG)" - name: Set up Node.js uses: actions/setup-node@v4 diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 41292f21..2ceb08ea 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -124,6 +124,23 @@ jobs: environment: staging steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 # Full history for version derivation + + - name: Sync VSIX version from git tag + run: | + LAST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0") + LAST_VERSION=${LAST_TAG#v} + IFS='.' read -r MAJOR MINOR PATCH <<< "$LAST_VERSION" + if git describe --tags --exact-match HEAD >/dev/null 2>&1; then + BASE_VERSION="$LAST_VERSION" + else + BASE_VERSION="$MAJOR.$MINOR.$((PATCH + 1))" + fi + jq --arg v "$BASE_VERSION" '.version = $v' \ + plugins/agentops/package.json > plugins/agentops/package.json.tmp + mv plugins/agentops/package.json.tmp plugins/agentops/package.json + echo "VSIX version set to $BASE_VERSION (from tag $LAST_TAG)" - name: Set up Node.js uses: actions/setup-node@v4 From 17f13c1b37d822e112e61324bfc3d3a8ec0b695f Mon Sep 17 00:00:00 2001 From: Dongbumlee Date: Mon, 13 Apr 2026 16:13:28 -0700 Subject: [PATCH 2/5] fix(vsix): update Marketplace link placeholder in README --- plugins/agentops/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/agentops/README.md b/plugins/agentops/README.md index b8c6c5ce..4077bbe3 100644 --- a/plugins/agentops/README.md +++ b/plugins/agentops/README.md @@ -25,7 +25,7 @@ pip install agentops-toolkit ## Installation Install from the -[VS Code Marketplace](https://marketplace.visualstudio.com/items?itemName=PUBLISHER_ID.agentops-skills) +[VS Code Marketplace](https://marketplace.visualstudio.com/items?itemName=AgentOpsToolkit.agentops-toolkit) or search **"AgentOps Skills"** in the VS Code Extensions view. A **pre-release** channel is available for early access to new skills and updates — From 68f646d2f2b7b9b53ecf7d9d2b256c3076bd4fd1 Mon Sep 17 00:00:00 2001 From: Dongbumlee Date: Mon, 13 Apr 2026 16:19:14 -0700 Subject: [PATCH 3/5] =?UTF-8?q?docs(vsix):=20improve=20README=20=E2=80=94?= =?UTF-8?q?=20remove=20misleading=20Prerequisites,=20expand=20Usage=20exam?= =?UTF-8?q?ples?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- plugins/agentops/README.md | 43 +++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/plugins/agentops/README.md b/plugins/agentops/README.md index 4077bbe3..b036f89a 100644 --- a/plugins/agentops/README.md +++ b/plugins/agentops/README.md @@ -14,14 +14,6 @@ Copilot agent skills for running standardized evaluation workflows with | **Browse & Inspect** | List and inspect evaluation runs, view per-row scores, browse run history | | **Dataset Management** | Validate, describe, and import datasets for evaluation workflows | -## Prerequisites - -Install the AgentOps CLI in your project's virtual environment: - -```bash -pip install agentops-toolkit -``` - ## Installation Install from the @@ -34,15 +26,42 @@ enable it from the extension's Marketplace page or the Extensions view. ## Usage Open **Copilot Chat** in VS Code and describe what you want to do. -The skills are invoked automatically when your request matches their domain: +The skills are invoked automatically when your request matches their domain. + +**Set up a workspace** + +``` +> Initialize an agentops workspace for my Foundry agent project +> Create a RAG evaluation bundle with groundedness and similarity +``` + +**Run and compare evaluations** + +``` +> Run the default evaluation against my agent +> Benchmark gpt-4o vs gpt-4o-mini using the smoke dataset +> Compare the last two evaluation runs and summarize the differences +``` + +**Investigate results** ``` -> Initialize an agentops workspace for my project -> Run the default evaluation -> Compare run abc123 with run def456 > Which rows failed the groundedness threshold? +> Show me the worst-scoring items from the latest run +> Why did similarity drop between run abc123 and run def456? ``` +**Browse and manage** + +``` +> List all evaluation runs +> Show details for the latest run +> Validate my dataset before running an eval +``` + +> **Note:** To run evaluations, install the AgentOps CLI in your project's virtual +> environment: `pip install agentops-toolkit` + ## Links - [AgentOps Toolkit](https://github.com/Azure/agentops) — CLI and documentation From 01acc555c3d25fcf8436e6bcfc01b10420be80bd Mon Sep 17 00:00:00 2001 From: Dongbumlee Date: Mon, 13 Apr 2026 16:23:46 -0700 Subject: [PATCH 4/5] =?UTF-8?q?docs(vsix):=20remove=20CLI=20install=20note?= =?UTF-8?q?=20=E2=80=94=20skills=20handle=20setup=20automatically?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- plugins/agentops/README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/plugins/agentops/README.md b/plugins/agentops/README.md index b036f89a..6e118171 100644 --- a/plugins/agentops/README.md +++ b/plugins/agentops/README.md @@ -59,9 +59,6 @@ The skills are invoked automatically when your request matches their domain. > Validate my dataset before running an eval ``` -> **Note:** To run evaluations, install the AgentOps CLI in your project's virtual -> environment: `pip install agentops-toolkit` - ## Links - [AgentOps Toolkit](https://github.com/Azure/agentops) — CLI and documentation From 7636e1196045623c5b33cd123112ed38b7c526eb Mon Sep 17 00:00:00 2001 From: Dongbumlee Date: Mon, 13 Apr 2026 17:25:03 -0700 Subject: [PATCH 5/5] fix: resolve all mypy type errors across 6 source files - foundry_backend.py: assert narrowing for Optional[str], Dict type widening - config_loader.py: added BaseModel import and TypeVar bound - reporter.py: removed conflicting annotations, renamed shadowed loop vars - browse.py: split Path | None annotation into separate assignment - comparison.py: fixed _compute_metric_direction return type, renamed loop vars - runner.py: added imports, Pydantic model constructors --- src/agentops/backends/foundry_backend.py | 15 ++++++++++---- src/agentops/core/config_loader.py | 4 ++-- src/agentops/core/reporter.py | 12 +++++------ src/agentops/services/browse.py | 5 ++--- src/agentops/services/comparison.py | 26 +++++++++++++++--------- src/agentops/services/runner.py | 25 +++++++++++++---------- 6 files changed, 51 insertions(+), 36 deletions(-) diff --git a/src/agentops/backends/foundry_backend.py b/src/agentops/backends/foundry_backend.py index 3850bea8..84b2d0bd 100644 --- a/src/agentops/backends/foundry_backend.py +++ b/src/agentops/backends/foundry_backend.py @@ -310,6 +310,8 @@ def _azure_openai_model_config( "Missing: " + ", ".join(missing) ) + assert endpoint is not None + assert deployment is not None model_config: Dict[str, str] = { "azure_endpoint": endpoint, "azure_deployment": deployment, @@ -903,6 +905,7 @@ def _read_settings(self, context: BackendRunContext) -> FoundrySettings: # Model-direct: use cognitive services scope token_scope = "https://cognitiveservices.azure.com/.default" else: + assert agent_id is not None token_scope = _preferred_scope_for_agent_id(agent_id) logger.info("Acquiring token via DefaultAzureCredential…") agent_token = _acquire_token(token_scope) @@ -1025,6 +1028,7 @@ def _invoke_agent_reference( "Authorization": f"Bearer {settings.agent_token}", } + assert settings.agent_id is not None agent_name, agent_version = (settings.agent_id, None) if ":" in settings.agent_id: split_name, split_version = settings.agent_id.split(":", 1) @@ -1055,6 +1059,7 @@ def _invoke_agent_reference( def _invoke_agent_service( self, settings: FoundrySettings, prompt: str, timeout_seconds: int | None ) -> str: + assert settings.agent_id is not None if not settings.agent_id.startswith("asst_"): return self._invoke_agent_reference(settings, prompt, timeout_seconds) @@ -1161,6 +1166,7 @@ def _invoke_model_direct(self, settings: FoundrySettings, prompt: str) -> str: ) openai_client = project_client.get_openai_client() + assert settings.model is not None response = openai_client.chat.completions.create( model=settings.model, messages=[{"role": "user", "content": prompt}], @@ -1381,6 +1387,7 @@ def _evals_get(path: str, extra_params: str = "") -> Dict[str, Any]: ) else: # Agent target + assert settings.agent_id is not None agent_name, agent_version = _parse_agent_name_version(settings.agent_id) target: Dict[str, Any] = { "type": "azure_ai_agent", @@ -1500,7 +1507,7 @@ def _evals_get(path: str, extra_params: str = "") -> Dict[str, Any]: if isinstance(sample, dict): prediction = _normalize_text(sample.get("output_text", "")) - row_metric_entries: List[Dict[str, float]] = [] + row_metric_entries: List[Dict[str, Any]] = [] for result in item.get("results", []) or []: metric_name = result.get("name", "") if isinstance(result, dict) else "" metric_score = ( @@ -1586,7 +1593,7 @@ def _evals_get(path: str, extra_params: str = "") -> Dict[str, Any]: total = len(output_items) # --- Aggregate metrics ---------------------------------------------- - metrics_entries: List[Dict[str, float]] = [] + metrics_entries: List[Dict[str, Any]] = [] for name in enabled_evaluator_order: values = evaluator_aggregate_values.get(name, []) if values: @@ -1748,7 +1755,7 @@ def _record_row_metrics( prediction_normalized = _normalize_text(prediction_text) total += 1 - row_metric_entries: List[Dict[str, float]] = [] + row_metric_entries: List[Dict[str, Any]] = [] for runtime in foundry_evaluator_runtimes: score = _run_foundry_evaluator( @@ -1912,7 +1919,7 @@ def _record_row_metrics( else 0.0 ) - metrics_entries: List[Dict[str, float]] = [] + metrics_entries: List[Dict[str, Any]] = [] for evaluator_name in enabled_evaluator_order: values = evaluator_aggregate_values.get(evaluator_name, []) if values: diff --git a/src/agentops/core/config_loader.py b/src/agentops/core/config_loader.py index 13c50001..45a18251 100644 --- a/src/agentops/core/config_loader.py +++ b/src/agentops/core/config_loader.py @@ -5,7 +5,7 @@ from pathlib import Path from typing import Type, TypeVar -from pydantic import ValidationError +from pydantic import BaseModel, ValidationError from agentops.core.models import ( BundleConfig, @@ -15,7 +15,7 @@ ) from agentops.utils.yaml import load_yaml -TModel = TypeVar("TModel") +TModel = TypeVar("TModel", bound=BaseModel) def _load_model(path: Path, model_cls: Type[TModel], label: str) -> TModel: diff --git a/src/agentops/core/reporter.py b/src/agentops/core/reporter.py index 82080751..625b9380 100644 --- a/src/agentops/core/reporter.py +++ b/src/agentops/core/reporter.py @@ -525,20 +525,20 @@ def generate_comparison_html(result: ComparisonResult) -> str: # Pre-compute per-evaluator row pass rates eval_row_rates: dict[str, list[tuple[int, int]]] = {} - for tr in result.threshold_rows: + for thr in result.threshold_rows: rates = [] for run_idx in range(len(result.runs)): total = 0 passed = 0 for ir in result.item_rows: - scores_list = ir.scores.get(tr.evaluator, []) + scores_list = ir.scores.get(thr.evaluator, []) score = scores_list[run_idx] if run_idx < len(scores_list) else None if score is not None: total += 1 - if _check_threshold(score, tr.criteria, tr.target): + if _check_threshold(score, thr.criteria, thr.target): passed += 1 rates.append((passed, total)) - eval_row_rates[tr.evaluator] = rates + eval_row_rates[thr.evaluator] = rates parts: list[str] = [] @@ -707,9 +707,9 @@ def generate_comparison_html(result: ComparisonResult) -> str: parts.append( "" ) - for k, v in cond.fixed.items(): + for key, val in cond.fixed.items(): parts.append( - f"" + f"" ) parts.append("
ParameterValue
{_html_escape(k)}{_html_escape(v)}
{_html_escape(key)}{_html_escape(val)}
") diff --git a/src/agentops/services/browse.py b/src/agentops/services/browse.py index 93f777d9..e562b9fc 100644 --- a/src/agentops/services/browse.py +++ b/src/agentops/services/browse.py @@ -288,9 +288,8 @@ def show_run(run_id: str, directory: Path = Path(".")) -> RunDetail: data = json.loads(results_file.read_text(encoding="utf-8")) result = RunResult.model_validate(data) - report_path = run_dir / "report.md" - if not report_path.exists(): - report_path = None + _rp = run_dir / "report.md" + report_path: Path | None = _rp if _rp.exists() else None foundry_url = None if result.artifacts and result.artifacts.foundry_eval_studio_url: diff --git a/src/agentops/services/comparison.py b/src/agentops/services/comparison.py index 0e3f8b05..ae6ebf3f 100644 --- a/src/agentops/services/comparison.py +++ b/src/agentops/services/comparison.py @@ -14,8 +14,13 @@ ComparisonResult, ComparisonSummary, ComparisonThresholdRow, + ComparisonType, + Criteria, + Direction, + ItemEvaluationResult, RunReference, RunResult, + ThresholdEvaluationResult, ) @@ -120,7 +125,7 @@ def _lower_is_better_metrics(*results: RunResult) -> frozenset[str]: return frozenset(names) -def _compute_metric_direction(delta: float, lower_is_better: bool) -> str: +def _compute_metric_direction(delta: float, lower_is_better: bool) -> Direction: if delta == 0: return "unchanged" if lower_is_better: @@ -153,6 +158,7 @@ def _detect_conditions(refs: List[RunReference]) -> ComparisonConditions: varying.append(key) # Determine comparison type + ctype: ComparisonType if "dataset" not in varying and "agent" in varying: ctype = "agent" elif "dataset" not in varying and "model" in varying: @@ -198,7 +204,7 @@ def compare_runs( values: List[float] = [] deltas: List[Optional[float]] = [] delta_percents: List[Optional[float]] = [] - directions: List[str] = [] + directions: List[Direction] = [] baseline_val: Optional[float] = None for i, r in enumerate(results): @@ -254,11 +260,11 @@ def compare_runs( ) # Build threshold rows - all_thresholds: List[tuple[str, str]] = [] - seen_thresholds: set[tuple[str, str]] = set() + all_thresholds: List[tuple[str, Criteria]] = [] + seen_thresholds: set[tuple[str, Criteria]] = set() for r in results: - for t in r.thresholds: - key = (t.evaluator, t.criteria) + for th in r.thresholds: + key = (th.evaluator, th.criteria) if key not in seen_thresholds: all_thresholds.append(key) seen_thresholds.add(key) @@ -269,7 +275,7 @@ def compare_runs( target_val: str | None = None for r in results: t_map = {(t.evaluator, t.criteria): t for t in r.thresholds} - t = t_map.get((evaluator, criteria)) + t: ThresholdEvaluationResult | None = t_map.get((evaluator, criteria)) passed_list.append(t.passed if t else False) if t and target_val is None: target_val = t.expected @@ -285,8 +291,8 @@ def compare_runs( # Build item rows all_row_indices: set[int] = set() for r in results: - for item in r.item_evaluations: - all_row_indices.add(item.row_index) + for ie in r.item_evaluations: + all_row_indices.add(ie.row_index) # Collect evaluator names that have thresholds (for row-level display) threshold_evaluator_names = [tr.evaluator for tr in threshold_rows] @@ -300,7 +306,7 @@ def compare_runs( } for r in results: item_map = {item.row_index: item for item in r.item_evaluations} - item = item_map.get(idx) + item: ItemEvaluationResult | None = item_map.get(idx) passed_list.append(item.passed_all if item else False) # Extract row-level metric scores row_metrics_map = {row.row_index: row for row in r.row_metrics} diff --git a/src/agentops/services/runner.py b/src/agentops/services/runner.py index ae0d5a4a..724b93c1 100644 --- a/src/agentops/services/runner.py +++ b/src/agentops/services/runner.py @@ -19,6 +19,9 @@ ) from agentops.core.models import ( Artifacts, + BundleInfo, + DatasetInfo, + ExecutionInfo, ItemEvaluationResult, ItemThresholdEvaluationResult, MetricResult, @@ -506,7 +509,7 @@ def _run_evaluation_inner( output_dir.mkdir(parents=True, exist_ok=True) if run_config.backend.type == "subprocess": - backend = SubprocessBackend() + backend: SubprocessBackend | FoundryBackend = SubprocessBackend() elif run_config.backend.type == "foundry": backend = FoundryBackend() else: @@ -597,16 +600,16 @@ def _run_evaluation_inner( normalized_result = RunResult( version=1, status="completed", - bundle={"name": bundle_config.name, "path": bundle_path}, - dataset={"name": dataset_config.name, "path": dataset_path}, - execution={ - "backend": backend_result.backend, - "command": backend_result.command, - "started_at": backend_result.started_at, - "finished_at": backend_result.finished_at, - "duration_seconds": backend_result.duration_seconds, - "exit_code": backend_result.exit_code, - }, + bundle=BundleInfo(name=bundle_config.name, path=bundle_path), + dataset=DatasetInfo(name=dataset_config.name, path=dataset_path), + execution=ExecutionInfo( + backend=backend_result.backend, + command=backend_result.command, + started_at=backend_result.started_at, + finished_at=backend_result.finished_at, + duration_seconds=backend_result.duration_seconds, + exit_code=backend_result.exit_code, + ), metrics=metrics, row_metrics=row_metrics, item_evaluations=item_evaluations,