diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 83ecfc25..b2536916 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -192,6 +192,23 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 # Full history for version derivation + + - name: Sync VSIX version from git tag + run: | + LAST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0") + LAST_VERSION=${LAST_TAG#v} + IFS='.' read -r MAJOR MINOR PATCH <<< "$LAST_VERSION" + if git describe --tags --exact-match HEAD >/dev/null 2>&1; then + BASE_VERSION="$LAST_VERSION" + else + BASE_VERSION="$MAJOR.$MINOR.$((PATCH + 1))" + fi + jq --arg v "$BASE_VERSION" '.version = $v' \ + plugins/agentops/package.json > plugins/agentops/package.json.tmp + mv plugins/agentops/package.json.tmp plugins/agentops/package.json + echo "VSIX version set to $BASE_VERSION (from tag $LAST_TAG)" - name: Set up Node.js uses: actions/setup-node@v4 @@ -224,6 +241,23 @@ jobs: environment: staging steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 # Full history for version derivation + + - name: Sync VSIX version from git tag + run: | + LAST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0") + LAST_VERSION=${LAST_TAG#v} + IFS='.' read -r MAJOR MINOR PATCH <<< "$LAST_VERSION" + if git describe --tags --exact-match HEAD >/dev/null 2>&1; then + BASE_VERSION="$LAST_VERSION" + else + BASE_VERSION="$MAJOR.$MINOR.$((PATCH + 1))" + fi + jq --arg v "$BASE_VERSION" '.version = $v' \ + plugins/agentops/package.json > plugins/agentops/package.json.tmp + mv plugins/agentops/package.json.tmp plugins/agentops/package.json + echo "VSIX version set to $BASE_VERSION (from tag $LAST_TAG)" - name: Set up Node.js uses: actions/setup-node@v4 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index aeb9fcc0..9100c6e7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -147,6 +147,23 @@ jobs: environment: release # same approval gate as PyPI steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 # Full history for version derivation + + - name: Sync VSIX version from git tag + run: | + LAST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0") + LAST_VERSION=${LAST_TAG#v} + IFS='.' read -r MAJOR MINOR PATCH <<< "$LAST_VERSION" + if git describe --tags --exact-match HEAD >/dev/null 2>&1; then + BASE_VERSION="$LAST_VERSION" + else + BASE_VERSION="$MAJOR.$MINOR.$((PATCH + 1))" + fi + jq --arg v "$BASE_VERSION" '.version = $v' \ + plugins/agentops/package.json > plugins/agentops/package.json.tmp + mv plugins/agentops/package.json.tmp plugins/agentops/package.json + echo "VSIX version set to $BASE_VERSION (from tag $LAST_TAG)" - name: Set up Node.js uses: actions/setup-node@v4 diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 41292f21..2ceb08ea 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -124,6 +124,23 @@ jobs: environment: staging steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 # Full history for version derivation + + - name: Sync VSIX version from git tag + run: | + LAST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0") + LAST_VERSION=${LAST_TAG#v} + IFS='.' read -r MAJOR MINOR PATCH <<< "$LAST_VERSION" + if git describe --tags --exact-match HEAD >/dev/null 2>&1; then + BASE_VERSION="$LAST_VERSION" + else + BASE_VERSION="$MAJOR.$MINOR.$((PATCH + 1))" + fi + jq --arg v "$BASE_VERSION" '.version = $v' \ + plugins/agentops/package.json > plugins/agentops/package.json.tmp + mv plugins/agentops/package.json.tmp plugins/agentops/package.json + echo "VSIX version set to $BASE_VERSION (from tag $LAST_TAG)" - name: Set up Node.js uses: actions/setup-node@v4 diff --git a/plugins/agentops/README.md b/plugins/agentops/README.md index b8c6c5ce..6e118171 100644 --- a/plugins/agentops/README.md +++ b/plugins/agentops/README.md @@ -14,18 +14,10 @@ Copilot agent skills for running standardized evaluation workflows with | **Browse & Inspect** | List and inspect evaluation runs, view per-row scores, browse run history | | **Dataset Management** | Validate, describe, and import datasets for evaluation workflows | -## Prerequisites - -Install the AgentOps CLI in your project's virtual environment: - -```bash -pip install agentops-toolkit -``` - ## Installation Install from the -[VS Code Marketplace](https://marketplace.visualstudio.com/items?itemName=PUBLISHER_ID.agentops-skills) +[VS Code Marketplace](https://marketplace.visualstudio.com/items?itemName=AgentOpsToolkit.agentops-toolkit) or search **"AgentOps Skills"** in the VS Code Extensions view. A **pre-release** channel is available for early access to new skills and updates — @@ -34,13 +26,37 @@ enable it from the extension's Marketplace page or the Extensions view. ## Usage Open **Copilot Chat** in VS Code and describe what you want to do. -The skills are invoked automatically when your request matches their domain: +The skills are invoked automatically when your request matches their domain. + +**Set up a workspace** + +``` +> Initialize an agentops workspace for my Foundry agent project +> Create a RAG evaluation bundle with groundedness and similarity +``` + +**Run and compare evaluations** + +``` +> Run the default evaluation against my agent +> Benchmark gpt-4o vs gpt-4o-mini using the smoke dataset +> Compare the last two evaluation runs and summarize the differences +``` + +**Investigate results** ``` -> Initialize an agentops workspace for my project -> Run the default evaluation -> Compare run abc123 with run def456 > Which rows failed the groundedness threshold? +> Show me the worst-scoring items from the latest run +> Why did similarity drop between run abc123 and run def456? +``` + +**Browse and manage** + +``` +> List all evaluation runs +> Show details for the latest run +> Validate my dataset before running an eval ``` ## Links diff --git a/src/agentops/backends/foundry_backend.py b/src/agentops/backends/foundry_backend.py index 3850bea8..84b2d0bd 100644 --- a/src/agentops/backends/foundry_backend.py +++ b/src/agentops/backends/foundry_backend.py @@ -310,6 +310,8 @@ def _azure_openai_model_config( "Missing: " + ", ".join(missing) ) + assert endpoint is not None + assert deployment is not None model_config: Dict[str, str] = { "azure_endpoint": endpoint, "azure_deployment": deployment, @@ -903,6 +905,7 @@ def _read_settings(self, context: BackendRunContext) -> FoundrySettings: # Model-direct: use cognitive services scope token_scope = "https://cognitiveservices.azure.com/.default" else: + assert agent_id is not None token_scope = _preferred_scope_for_agent_id(agent_id) logger.info("Acquiring token via DefaultAzureCredential…") agent_token = _acquire_token(token_scope) @@ -1025,6 +1028,7 @@ def _invoke_agent_reference( "Authorization": f"Bearer {settings.agent_token}", } + assert settings.agent_id is not None agent_name, agent_version = (settings.agent_id, None) if ":" in settings.agent_id: split_name, split_version = settings.agent_id.split(":", 1) @@ -1055,6 +1059,7 @@ def _invoke_agent_reference( def _invoke_agent_service( self, settings: FoundrySettings, prompt: str, timeout_seconds: int | None ) -> str: + assert settings.agent_id is not None if not settings.agent_id.startswith("asst_"): return self._invoke_agent_reference(settings, prompt, timeout_seconds) @@ -1161,6 +1166,7 @@ def _invoke_model_direct(self, settings: FoundrySettings, prompt: str) -> str: ) openai_client = project_client.get_openai_client() + assert settings.model is not None response = openai_client.chat.completions.create( model=settings.model, messages=[{"role": "user", "content": prompt}], @@ -1381,6 +1387,7 @@ def _evals_get(path: str, extra_params: str = "") -> Dict[str, Any]: ) else: # Agent target + assert settings.agent_id is not None agent_name, agent_version = _parse_agent_name_version(settings.agent_id) target: Dict[str, Any] = { "type": "azure_ai_agent", @@ -1500,7 +1507,7 @@ def _evals_get(path: str, extra_params: str = "") -> Dict[str, Any]: if isinstance(sample, dict): prediction = _normalize_text(sample.get("output_text", "")) - row_metric_entries: List[Dict[str, float]] = [] + row_metric_entries: List[Dict[str, Any]] = [] for result in item.get("results", []) or []: metric_name = result.get("name", "") if isinstance(result, dict) else "" metric_score = ( @@ -1586,7 +1593,7 @@ def _evals_get(path: str, extra_params: str = "") -> Dict[str, Any]: total = len(output_items) # --- Aggregate metrics ---------------------------------------------- - metrics_entries: List[Dict[str, float]] = [] + metrics_entries: List[Dict[str, Any]] = [] for name in enabled_evaluator_order: values = evaluator_aggregate_values.get(name, []) if values: @@ -1748,7 +1755,7 @@ def _record_row_metrics( prediction_normalized = _normalize_text(prediction_text) total += 1 - row_metric_entries: List[Dict[str, float]] = [] + row_metric_entries: List[Dict[str, Any]] = [] for runtime in foundry_evaluator_runtimes: score = _run_foundry_evaluator( @@ -1912,7 +1919,7 @@ def _record_row_metrics( else 0.0 ) - metrics_entries: List[Dict[str, float]] = [] + metrics_entries: List[Dict[str, Any]] = [] for evaluator_name in enabled_evaluator_order: values = evaluator_aggregate_values.get(evaluator_name, []) if values: diff --git a/src/agentops/core/config_loader.py b/src/agentops/core/config_loader.py index 13c50001..45a18251 100644 --- a/src/agentops/core/config_loader.py +++ b/src/agentops/core/config_loader.py @@ -5,7 +5,7 @@ from pathlib import Path from typing import Type, TypeVar -from pydantic import ValidationError +from pydantic import BaseModel, ValidationError from agentops.core.models import ( BundleConfig, @@ -15,7 +15,7 @@ ) from agentops.utils.yaml import load_yaml -TModel = TypeVar("TModel") +TModel = TypeVar("TModel", bound=BaseModel) def _load_model(path: Path, model_cls: Type[TModel], label: str) -> TModel: diff --git a/src/agentops/core/reporter.py b/src/agentops/core/reporter.py index 82080751..625b9380 100644 --- a/src/agentops/core/reporter.py +++ b/src/agentops/core/reporter.py @@ -525,20 +525,20 @@ def generate_comparison_html(result: ComparisonResult) -> str: # Pre-compute per-evaluator row pass rates eval_row_rates: dict[str, list[tuple[int, int]]] = {} - for tr in result.threshold_rows: + for thr in result.threshold_rows: rates = [] for run_idx in range(len(result.runs)): total = 0 passed = 0 for ir in result.item_rows: - scores_list = ir.scores.get(tr.evaluator, []) + scores_list = ir.scores.get(thr.evaluator, []) score = scores_list[run_idx] if run_idx < len(scores_list) else None if score is not None: total += 1 - if _check_threshold(score, tr.criteria, tr.target): + if _check_threshold(score, thr.criteria, thr.target): passed += 1 rates.append((passed, total)) - eval_row_rates[tr.evaluator] = rates + eval_row_rates[thr.evaluator] = rates parts: list[str] = [] @@ -707,9 +707,9 @@ def generate_comparison_html(result: ComparisonResult) -> str: parts.append( "
| Parameter | Value |
|---|---|
| {_html_escape(k)} | {_html_escape(v)} |
| {_html_escape(key)} | {_html_escape(val)} |