Azure · Dongbumlee · Apr 14, 2026 · Apr 13, 2026 · Apr 13, 2026 · Apr 13, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -192,6 +192,23 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # Full history for version derivation
+
+      - name: Sync VSIX version from git tag
+        run: |
+          # Marketplace needs numeric MAJOR.MINOR.PATCH; tag suffixes (rc1, -beta.1) are dropped.
+          LAST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0")
+          IFS='.' read -r MAJOR MINOR PATCH <<< "${LAST_TAG#v}"
+          PATCH=${PATCH%%[!0-9]*}  # keep leading digits only, so the arithmetic below cannot fail
+          if ! git describe --tags --exact-match HEAD >/dev/null 2>&1; then
+            PATCH=$((10#${PATCH:-0} + 1))  # untagged commit: next patch; 10# avoids octal parsing
+          fi
+          BASE_VERSION="${MAJOR:-0}.${MINOR:-0}.${PATCH:-0}"
+          PKG=plugins/agentops/package.json
+          jq --arg v "$BASE_VERSION" '.version = $v' "$PKG" > "$PKG.tmp"
+          mv "$PKG.tmp" "$PKG"
+          echo "VSIX version set to $BASE_VERSION (from tag $LAST_TAG)"
 
       - name: Set up Node.js
         uses: actions/setup-node@v4
@@ -224,6 +241,23 @@ jobs:
     environment: staging
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # Full history for version derivation
+
+      - name: Sync VSIX version from git tag
+        run: |
+          # Marketplace needs numeric MAJOR.MINOR.PATCH; tag suffixes (rc1, -beta.1) are dropped.
+          LAST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0")
+          IFS='.' read -r MAJOR MINOR PATCH <<< "${LAST_TAG#v}"
+          PATCH=${PATCH%%[!0-9]*}  # keep leading digits only, so the arithmetic below cannot fail
+          if ! git describe --tags --exact-match HEAD >/dev/null 2>&1; then
+            PATCH=$((10#${PATCH:-0} + 1))  # untagged commit: next patch; 10# avoids octal parsing
+          fi
+          BASE_VERSION="${MAJOR:-0}.${MINOR:-0}.${PATCH:-0}"
+          PKG=plugins/agentops/package.json
+          jq --arg v "$BASE_VERSION" '.version = $v' "$PKG" > "$PKG.tmp"
+          mv "$PKG.tmp" "$PKG"
+          echo "VSIX version set to $BASE_VERSION (from tag $LAST_TAG)"
 
       - name: Set up Node.js
         uses: actions/setup-node@v4

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -147,6 +147,23 @@ jobs:
     environment: release  # same approval gate as PyPI
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # Full history for version derivation
+
+      - name: Sync VSIX version from git tag
+        run: |
+          # Marketplace needs numeric MAJOR.MINOR.PATCH; tag suffixes (rc1, -beta.1) are dropped.
+          LAST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0")
+          IFS='.' read -r MAJOR MINOR PATCH <<< "${LAST_TAG#v}"
+          PATCH=${PATCH%%[!0-9]*}  # keep leading digits only, so the arithmetic below cannot fail
+          if ! git describe --tags --exact-match HEAD >/dev/null 2>&1; then
+            PATCH=$((10#${PATCH:-0} + 1))  # untagged commit: next patch; 10# avoids octal parsing
+          fi
+          BASE_VERSION="${MAJOR:-0}.${MINOR:-0}.${PATCH:-0}"
+          PKG=plugins/agentops/package.json
+          jq --arg v "$BASE_VERSION" '.version = $v' "$PKG" > "$PKG.tmp"
+          mv "$PKG.tmp" "$PKG"
+          echo "VSIX version set to $BASE_VERSION (from tag $LAST_TAG)"
 
       - name: Set up Node.js
         uses: actions/setup-node@v4

diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml
@@ -124,6 +124,23 @@ jobs:
     environment: staging
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # Full history for version derivation
+
+      - name: Sync VSIX version from git tag
+        run: |
+          # Marketplace needs numeric MAJOR.MINOR.PATCH; tag suffixes (rc1, -beta.1) are dropped.
+          LAST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0")
+          IFS='.' read -r MAJOR MINOR PATCH <<< "${LAST_TAG#v}"
+          PATCH=${PATCH%%[!0-9]*}  # keep leading digits only, so the arithmetic below cannot fail
+          if ! git describe --tags --exact-match HEAD >/dev/null 2>&1; then
+            PATCH=$((10#${PATCH:-0} + 1))  # untagged commit: next patch; 10# avoids octal parsing
+          fi
+          BASE_VERSION="${MAJOR:-0}.${MINOR:-0}.${PATCH:-0}"
+          PKG=plugins/agentops/package.json
+          jq --arg v "$BASE_VERSION" '.version = $v' "$PKG" > "$PKG.tmp"
+          mv "$PKG.tmp" "$PKG"
+          echo "VSIX version set to $BASE_VERSION (from tag $LAST_TAG)"
 
       - name: Set up Node.js
         uses: actions/setup-node@v4

diff --git a/plugins/agentops/README.md b/plugins/agentops/README.md
@@ -14,18 +14,10 @@ Copilot agent skills for running standardized evaluation workflows with
 | **Browse & Inspect** | List and inspect evaluation runs, view per-row scores, browse run history |
 | **Dataset Management** | Validate, describe, and import datasets for evaluation workflows |
 
-## Prerequisites
-
-Install the AgentOps CLI in your project's virtual environment:
-
-```bash
-pip install agentops-toolkit
-```
-
 ## Installation
 
 Install from the
-[VS Code Marketplace](https://marketplace.visualstudio.com/items?itemName=PUBLISHER_ID.agentops-skills)
+[VS Code Marketplace](https://marketplace.visualstudio.com/items?itemName=AgentOpsToolkit.agentops-toolkit)
 or search **"AgentOps Skills"** in the VS Code Extensions view.
 
 A **pre-release** channel is available for early access to new skills and updates —
@@ -34,13 +26,37 @@ enable it from the extension's Marketplace page or the Extensions view.
 ## Usage
 
 Open **Copilot Chat** in VS Code and describe what you want to do.
-The skills are invoked automatically when your request matches their domain:
+The skills are invoked automatically when your request matches their domain.
+
+**Set up a workspace**
+
+```
+> Initialize an agentops workspace for my Foundry agent project
+> Create a RAG evaluation bundle with groundedness and similarity
+```
+
+**Run and compare evaluations**
+
+```
+> Run the default evaluation against my agent
+> Benchmark gpt-4o vs gpt-4o-mini using the smoke dataset
+> Compare the last two evaluation runs and summarize the differences
+```
+
+**Investigate results**
 
 ```
-> Initialize an agentops workspace for my project
-> Run the default evaluation
-> Compare run abc123 with run def456
 > Which rows failed the groundedness threshold?
+> Show me the worst-scoring items from the latest run
+> Why did similarity drop between run abc123 and run def456?
+```
+
+**Browse and manage**
+
+```
+> List all evaluation runs
+> Show details for the latest run
+> Validate my dataset before running an eval
 ```
 
 ## Links

diff --git a/src/agentops/backends/foundry_backend.py b/src/agentops/backends/foundry_backend.py
@@ -310,6 +310,8 @@ def _azure_openai_model_config(
             "Missing: " + ", ".join(missing)
         )
 
+    assert endpoint is not None
+    assert deployment is not None
     model_config: Dict[str, str] = {
         "azure_endpoint": endpoint,
         "azure_deployment": deployment,
@@ -903,6 +905,7 @@ def _read_settings(self, context: BackendRunContext) -> FoundrySettings:
             # Model-direct: use cognitive services scope
             token_scope = "https://cognitiveservices.azure.com/.default"
         else:
+            assert agent_id is not None
             token_scope = _preferred_scope_for_agent_id(agent_id)
         logger.info("Acquiring token via DefaultAzureCredential…")
         agent_token = _acquire_token(token_scope)
@@ -1025,6 +1028,7 @@ def _invoke_agent_reference(
             "Authorization": f"Bearer {settings.agent_token}",
         }
 
+        assert settings.agent_id is not None
         agent_name, agent_version = (settings.agent_id, None)
         if ":" in settings.agent_id:
             split_name, split_version = settings.agent_id.split(":", 1)
@@ -1055,6 +1059,7 @@ def _invoke_agent_reference(
     def _invoke_agent_service(
         self, settings: FoundrySettings, prompt: str, timeout_seconds: int | None
     ) -> str:
+        assert settings.agent_id is not None
         if not settings.agent_id.startswith("asst_"):
             return self._invoke_agent_reference(settings, prompt, timeout_seconds)
 
@@ -1161,6 +1166,7 @@ def _invoke_model_direct(self, settings: FoundrySettings, prompt: str) -> str:
         )
         openai_client = project_client.get_openai_client()
 
+        assert settings.model is not None
         response = openai_client.chat.completions.create(
             model=settings.model,
             messages=[{"role": "user", "content": prompt}],
@@ -1381,6 +1387,7 @@ def _evals_get(path: str, extra_params: str = "") -> Dict[str, Any]:
             )
         else:
             # Agent target
+            assert settings.agent_id is not None
             agent_name, agent_version = _parse_agent_name_version(settings.agent_id)
             target: Dict[str, Any] = {
                 "type": "azure_ai_agent",
@@ -1500,7 +1507,7 @@ def _evals_get(path: str, extra_params: str = "") -> Dict[str, Any]:
             if isinstance(sample, dict):
                 prediction = _normalize_text(sample.get("output_text", ""))
 
-            row_metric_entries: List[Dict[str, float]] = []
+            row_metric_entries: List[Dict[str, Any]] = []
             for result in item.get("results", []) or []:
                 metric_name = result.get("name", "") if isinstance(result, dict) else ""
                 metric_score = (
@@ -1586,7 +1593,7 @@ def _evals_get(path: str, extra_params: str = "") -> Dict[str, Any]:
         total = len(output_items)
 
         # --- Aggregate metrics ----------------------------------------------
-        metrics_entries: List[Dict[str, float]] = []
+        metrics_entries: List[Dict[str, Any]] = []
         for name in enabled_evaluator_order:
             values = evaluator_aggregate_values.get(name, [])
             if values:
@@ -1748,7 +1755,7 @@ def _record_row_metrics(
             prediction_normalized = _normalize_text(prediction_text)
             total += 1
 
-            row_metric_entries: List[Dict[str, float]] = []
+            row_metric_entries: List[Dict[str, Any]] = []
 
             for runtime in foundry_evaluator_runtimes:
                 score = _run_foundry_evaluator(
@@ -1912,7 +1919,7 @@ def _record_row_metrics(
             else 0.0
         )
 
-        metrics_entries: List[Dict[str, float]] = []
+        metrics_entries: List[Dict[str, Any]] = []
         for evaluator_name in enabled_evaluator_order:
             values = evaluator_aggregate_values.get(evaluator_name, [])
             if values:

diff --git a/src/agentops/core/config_loader.py b/src/agentops/core/config_loader.py
@@ -5,7 +5,7 @@
 from pathlib import Path
 from typing import Type, TypeVar
 
-from pydantic import ValidationError
+from pydantic import BaseModel, ValidationError
 
 from agentops.core.models import (
     BundleConfig,
@@ -15,7 +15,7 @@
 )
 from agentops.utils.yaml import load_yaml
 
-TModel = TypeVar("TModel")
+TModel = TypeVar("TModel", bound=BaseModel)
 
 
 def _load_model(path: Path, model_cls: Type[TModel], label: str) -> TModel:

diff --git a/src/agentops/core/reporter.py b/src/agentops/core/reporter.py
@@ -525,20 +525,20 @@ def generate_comparison_html(result: ComparisonResult) -> str:
 
     # Pre-compute per-evaluator row pass rates
     eval_row_rates: dict[str, list[tuple[int, int]]] = {}
-    for tr in result.threshold_rows:
+    for thr in result.threshold_rows:
         rates = []
         for run_idx in range(len(result.runs)):
             total = 0
             passed = 0
             for ir in result.item_rows:
-                scores_list = ir.scores.get(tr.evaluator, [])
+                scores_list = ir.scores.get(thr.evaluator, [])
                 score = scores_list[run_idx] if run_idx < len(scores_list) else None
                 if score is not None:
                     total += 1
-                    if _check_threshold(score, tr.criteria, tr.target):
+                    if _check_threshold(score, thr.criteria, thr.target):
                         passed += 1
             rates.append((passed, total))
-        eval_row_rates[tr.evaluator] = rates
+        eval_row_rates[thr.evaluator] = rates
 
     parts: list[str] = []
 
@@ -707,9 +707,9 @@ def generate_comparison_html(result: ComparisonResult) -> str:
         parts.append(
             "<table><thead><tr><th>Parameter</th><th>Value</th></tr></thead><tbody>"
         )
-        for k, v in cond.fixed.items():
+        for key, val in cond.fixed.items():
             parts.append(
-                f"<tr><td>{_html_escape(k)}</td><td>{_html_escape(v)}</td></tr>"
+                f"<tr><td>{_html_escape(key)}</td><td>{_html_escape(val)}</td></tr>"
             )
         parts.append("</tbody></table>")
 

diff --git a/src/agentops/services/browse.py b/src/agentops/services/browse.py
@@ -288,9 +288,8 @@ def show_run(run_id: str, directory: Path = Path(".")) -> RunDetail:
     data = json.loads(results_file.read_text(encoding="utf-8"))
     result = RunResult.model_validate(data)
 
-    report_path = run_dir / "report.md"
-    if not report_path.exists():
-        report_path = None
+    _rp = run_dir / "report.md"
+    report_path: Path | None = _rp if _rp.exists() else None
 
     foundry_url = None
     if result.artifacts and result.artifacts.foundry_eval_studio_url:

diff --git a/src/agentops/services/comparison.py b/src/agentops/services/comparison.py
@@ -14,8 +14,13 @@
     ComparisonResult,
     ComparisonSummary,
     ComparisonThresholdRow,
+    ComparisonType,
+    Criteria,
+    Direction,
+    ItemEvaluationResult,
     RunReference,
     RunResult,
+    ThresholdEvaluationResult,
 )
 
 
@@ -120,7 +125,7 @@ def _lower_is_better_metrics(*results: RunResult) -> frozenset[str]:
     return frozenset(names)
 
 
-def _compute_metric_direction(delta: float, lower_is_better: bool) -> str:
+def _compute_metric_direction(delta: float, lower_is_better: bool) -> Direction:
     if delta == 0:
         return "unchanged"
     if lower_is_better:
@@ -153,6 +158,7 @@ def _detect_conditions(refs: List[RunReference]) -> ComparisonConditions:
             varying.append(key)
 
     # Determine comparison type
+    ctype: ComparisonType
     if "dataset" not in varying and "agent" in varying:
         ctype = "agent"
     elif "dataset" not in varying and "model" in varying:
@@ -198,7 +204,7 @@ def compare_runs(
         values: List[float] = []
         deltas: List[Optional[float]] = []
         delta_percents: List[Optional[float]] = []
-        directions: List[str] = []
+        directions: List[Direction] = []
         baseline_val: Optional[float] = None
 
         for i, r in enumerate(results):
@@ -254,11 +260,11 @@ def compare_runs(
         )
 
     # Build threshold rows
-    all_thresholds: List[tuple[str, str]] = []
-    seen_thresholds: set[tuple[str, str]] = set()
+    all_thresholds: List[tuple[str, Criteria]] = []
+    seen_thresholds: set[tuple[str, Criteria]] = set()
     for r in results:
-        for t in r.thresholds:
-            key = (t.evaluator, t.criteria)
+        for th in r.thresholds:
+            key = (th.evaluator, th.criteria)
             if key not in seen_thresholds:
                 all_thresholds.append(key)
                 seen_thresholds.add(key)
@@ -269,7 +275,7 @@ def compare_runs(
         target_val: str | None = None
         for r in results:
             t_map = {(t.evaluator, t.criteria): t for t in r.thresholds}
-            t = t_map.get((evaluator, criteria))
+            t: ThresholdEvaluationResult | None = t_map.get((evaluator, criteria))
             passed_list.append(t.passed if t else False)
             if t and target_val is None:
                 target_val = t.expected
@@ -285,8 +291,8 @@ def compare_runs(
     # Build item rows
     all_row_indices: set[int] = set()
     for r in results:
-        for item in r.item_evaluations:
-            all_row_indices.add(item.row_index)
+        for ie in r.item_evaluations:
+            all_row_indices.add(ie.row_index)
 
     # Collect evaluator names that have thresholds (for row-level display)
     threshold_evaluator_names = [tr.evaluator for tr in threshold_rows]
@@ -300,7 +306,7 @@ def compare_runs(
         }
         for r in results:
             item_map = {item.row_index: item for item in r.item_evaluations}
-            item = item_map.get(idx)
+            item: ItemEvaluationResult | None = item_map.get(idx)
             passed_list.append(item.passed_all if item else False)
             # Extract row-level metric scores
             row_metrics_map = {row.row_index: row for row in r.row_metrics}