diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml index 479673c5..31d74036 100644 --- a/.github/workflows/_build.yml +++ b/.github/workflows/_build.yml @@ -1,16 +1,19 @@ # AgentOps Toolkit — Reusable Build Workflow # # Workflows: -# 1. ci.yml — Lint + test on every push/PR; publish dev builds to TestPyPI on develop -# 2. _build.yml — Reusable build (test + package), called by staging and release -# 3. staging.yml — Staging: release/* branch → TestPyPI → verify -# 4. release.yml — Production: v* tag → TestPyPI → verify → PyPI → GitHub Release +# 1. ci.yml — Lint + test on every push/PR; build VSIX validation +# 2. _build.yml — Reusable Python build (test + package), called by staging and release +# 3. staging.yml — Staging: release/* → TestPyPI + VSIX pre-release +# 4. release.yml — Production: v* tag → PyPI + VSIX stable + GitHub Release # 5. cut-release.yml — Manual dispatch: create release branch + PR from develop # # Called by staging.yml and release.yml via workflow_call. -# Runs tests, builds the package (version via setuptools-scm), and uploads +# Runs tests, builds the Python package (version via setuptools-scm), and uploads # the dist/ artifacts for downstream jobs. # +# Note: VSIX packaging is handled directly in ci/staging/release workflows +# (requires Node.js + @vscode/vsce), not in this Python-focused reusable build. +# # Usage in caller workflows: # jobs: # build: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fad45d2a..56d6683b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,8 +3,8 @@ # Workflows: # 1. ci.yml — Lint + test on every push/PR; publish dev builds to TestPyPI on develop # 2. _build.yml — Reusable build (test + package), called by staging and release -# 3. staging.yml — Staging: release/* branch → TestPyPI → verify -# 4. release.yml — Production: v* tag → TestPyPI → verify → PyPI → GitHub Release +# 3. 
staging.yml — Staging: release/* branch → TestPyPI → verify; VSIX pre-release → Marketplace +# 4. release.yml — Production: v* tag → TestPyPI → verify → PyPI → GitHub Release; VSIX stable → Marketplace # 5. cut-release.yml — Manual dispatch: create release branch + PR from develop name: CI @@ -186,3 +186,26 @@ jobs: echo "- TestPyPI: https://test.pypi.org/project/agentops-toolkit/${{ steps.version.outputs.version }}/" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" echo "Install: \`pip install agentops-toolkit==${{ steps.version.outputs.version }} --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/\`" >> "$GITHUB_STEP_SUMMARY" + + # Validate that the VSIX extension packages correctly + build-vsix: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: "20" + + - name: Install vsce + run: npm install -g @vscode/vsce + + - name: Package VSIX (dry run) + working-directory: plugins/agentops + run: vsce package -o agentops-skills.vsix + + - name: Show VSIX info + run: | + ls -la plugins/agentops/*.vsix + echo "✅ VSIX packaging validated" diff --git a/.github/workflows/cut-release.yml b/.github/workflows/cut-release.yml index 9d2cbc4a..11c2cf89 100644 --- a/.github/workflows/cut-release.yml +++ b/.github/workflows/cut-release.yml @@ -1,14 +1,15 @@ # AgentOps Toolkit — Cut Release # # Workflows: -# 1. ci.yml — Lint + test on every push/PR +# 1. ci.yml — Lint + test on every push/PR; VSIX build validation # 2. _build.yml — Reusable build (test + package), called by staging and release -# 3. staging.yml — Staging: release/* branch → TestPyPI → verify -# 4. release.yml — Production: v* tag → TestPyPI → verify → PyPI → GitHub Release +# 3. staging.yml — Staging: release/* → TestPyPI → verify; VSIX pre-release → Marketplace +# 4. release.yml — Production: v* tag → TestPyPI → verify → PyPI → GH Release; VSIX stable → Marketplace # 5. 
cut-release.yml — Manual dispatch: create release branch + PR from develop # # One-click release branch creation. Triggered manually from the Actions tab. -# Creates a release branch from develop, updates CHANGELOG.md, and opens a PR to main. +# Creates a release branch from develop, updates CHANGELOG.md, syncs the +# VS Code extension version in package.json, and opens a PR to main. # The branch push then triggers staging.yml automatically. # # Usage: @@ -72,6 +73,13 @@ jobs: # Replace [Unreleased] with versioned section, add fresh Unreleased above sed -i "s/## \[Unreleased\]/## [Unreleased]\n\n## [${{ env.version }}] - $DATE/" CHANGELOG.md + - name: Sync VS Code extension version + run: | + jq --arg v "${{ env.version }}" '.version = $v' \ + plugins/agentops/package.json > plugins/agentops/package.json.tmp + mv plugins/agentops/package.json.tmp plugins/agentops/package.json + echo "VSIX version set to ${{ env.version }}" + - name: Configure git run: | git config user.name "github-actions[bot]" @@ -79,7 +87,7 @@ jobs: - name: Commit and push run: | - git add CHANGELOG.md + git add CHANGELOG.md plugins/agentops/package.json git commit -m "chore: prepare release ${{ env.version }}" git push origin "release/v${{ env.version }}" @@ -98,22 +106,24 @@ jobs: ### What happened - Branch \`release/v${{ env.version }}\` created from \`develop\` - \`CHANGELOG.md\` updated: \`[Unreleased]\` → \`[${{ env.version }}]\` - - Staging pipeline triggered automatically (build → TestPyPI → verify) + - \`plugins/agentops/package.json\` version synced to \`${{ env.version }}\` + - Staging pipeline triggered automatically (build → TestPyPI + VSIX pre-release → verify) ### Next steps 1. Wait for the **Staging** pipeline to pass 2. Review and approve this PR 3. Merge to \`main\` 4. Tag and push: \`git tag v${{ env.version }} && git push origin v${{ env.version }}\` - 5. Approve the PyPI publish in the **Release** workflow + 5. 
Approve the PyPI publish and VSIX stable publish in the **Release** workflow 6. Sync develop: \`git checkout develop && git merge main && git push origin develop\` ### Checklist - - [ ] Staging pipeline passes (build + TestPyPI + verify) + - [ ] Staging pipeline passes (build + TestPyPI + VSIX pre-release + verify) - [ ] CHANGELOG entries reviewed - [ ] PR approved and merged to main - [ ] Tag \`v${{ env.version }}\` pushed - [ ] PyPI publish approved + - [ ] VSIX stable publish approved - [ ] develop synced from main" - name: Summary @@ -122,6 +132,7 @@ jobs: echo "" >> "$GITHUB_STEP_SUMMARY" echo "- Branch: \`release/v${{ env.version }}\`" >> "$GITHUB_STEP_SUMMARY" echo "- CHANGELOG updated with version **${{ env.version }}**" >> "$GITHUB_STEP_SUMMARY" + echo "- VS Code extension version synced to **${{ env.version }}**" >> "$GITHUB_STEP_SUMMARY" echo "- PR opened: \`release/v${{ env.version }}\` → \`main\`" >> "$GITHUB_STEP_SUMMARY" echo "- Staging pipeline triggered automatically" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 402ac960..aeb9fcc0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,33 +1,38 @@ -# AgentOps Toolkit — Production Release +# AgentOps Toolkit — Production Release (PyPI + VSIX Stable) # # Workflows: -# 1. ci.yml — Lint + test on every push/PR +# 1. ci.yml — Lint + test on every push/PR; VSIX build validation # 2. _build.yml — Reusable build (test + package), called by staging and release -# 3. staging.yml — Staging: release/* branch → TestPyPI → verify -# 4. release.yml — Production: v* tag → TestPyPI → verify → PyPI → GitHub Release +# 3. staging.yml — Staging: release/* → TestPyPI → verify; VSIX pre-release → Marketplace +# 4. release.yml — Production: v* tag → TestPyPI → verify → PyPI → GH Release; VSIX stable → Marketplace +# 5. 
cut-release.yml — Manual dispatch: create release branch + PR from develop # # Triggered by v* tag pushes (e.g. v0.2.0). # Calls the reusable _build.yml, then publishes to TestPyPI for final # verification, then to PyPI (requires 'release' environment approval), -# and finally creates a GitHub Release. +# publishes the VS Code extension as stable to the Marketplace, +# and finally creates a GitHub Release (with both dist and VSIX attached). # # Versioning: # Uses setuptools-scm — version is derived from the git tag automatically. # Tagged commit v0.2.0 → version 0.2.0. No manual version in pyproject.toml. +# VSIX version is managed in plugins/agentops/package.json (synced by cut-release). # # Required GitHub secrets (in respective environments): # TEST_PYPI_TOKEN — TestPyPI API token (environment: staging) # PYPI_TOKEN — PyPI API token (environment: release) +# VSCE_PAT — VS Code Marketplace PAT (environment: release) # # Required GitHub environments: # staging — for TestPyPI publish (optional approval) -# release — for PyPI publish (requires approval from designated reviewers) +# release — for PyPI + VSIX publish (requires approval from designated reviewers) # # Setup: # 1. https://test.pypi.org/manage/account/token/ → Create TEST_PYPI_TOKEN # 2. https://pypi.org/manage/account/token/ → Create PYPI_TOKEN -# 3. GitHub repo → Settings → Secrets → Actions → Add secrets to environments -# 4. GitHub repo → Settings → Environments → Create "release" with required reviewers +# 3. https://dev.azure.com/ → PAT with Marketplace scope → Create VSCE_PAT +# 4. GitHub repo → Settings → Secrets → Actions → Add secrets to environments +# 5. 
GitHub repo → Settings → Environments → Create "release" with required reviewers name: Release @@ -133,26 +138,64 @@ jobs: password: ${{ secrets.PYPI_TOKEN }} verbose: true - # Create GitHub Release with built artifacts + # ── VSIX Stable Publish ────────────────────────────────────────────── + # Publish the VS Code extension as a stable release to the Marketplace. + # Runs in parallel with the TestPyPI→PyPI flow (only needs source checkout). + publish-vsix: + needs: build # gate on successful lint + test + runs-on: ubuntu-latest + environment: release # same approval gate as PyPI + steps: + - uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: "20" + + - name: Install vsce + run: npm install -g @vscode/vsce + + - name: Package VSIX + working-directory: plugins/agentops + run: vsce package -o agentops-skills.vsix + + - name: Publish stable to VS Code Marketplace + working-directory: plugins/agentops + run: vsce publish --packagePath agentops-skills.vsix -p "${{ secrets.VSCE_PAT }}" + + - name: Upload VSIX artifact + uses: actions/upload-artifact@v4 + with: + name: vsix + path: plugins/agentops/agentops-skills.vsix + + # Create GitHub Release with built artifacts (Python dist + VSIX) github-release: - needs: publish-pypi + needs: [publish-pypi, publish-vsix] runs-on: ubuntu-latest permissions: contents: write steps: - uses: actions/checkout@v4 - - name: Download build artifacts + - name: Download Python dist artifacts uses: actions/download-artifact@v4 with: name: dist path: dist/ + - name: Download VSIX artifact + uses: actions/download-artifact@v4 + with: + name: vsix + path: vsix/ + - name: Create GitHub Release env: GH_TOKEN: ${{ github.token }} run: | - gh release create "${{ github.ref_name }}" dist/* \ + gh release create "${{ github.ref_name }}" dist/* vsix/* \ --repo "${{ github.repository }}" \ --title "${{ github.ref_name }}" \ --generate-notes diff --git a/.github/workflows/staging.yml 
b/.github/workflows/staging.yml index 2a9987ae..5ea32e45 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -1,32 +1,36 @@ -# AgentOps Toolkit — Staging (TestPyPI) +# AgentOps Toolkit — Staging (TestPyPI + VSIX Pre-release) # # Workflows: -# 1. ci.yml — Lint + test on every push/PR +# 1. ci.yml — Lint + test on every push/PR; VSIX build validation # 2. _build.yml — Reusable build (test + package), called by staging and release -# 3. staging.yml — Staging: release/* branch → TestPyPI → verify -# 4. release.yml — Production: v* tag → TestPyPI → verify → PyPI → GitHub Release +# 3. staging.yml — Staging: release/* → TestPyPI → verify; VSIX pre-release → Marketplace +# 4. release.yml — Production: v* tag → TestPyPI → verify → PyPI → GH Release; VSIX stable → Marketplace +# 5. cut-release.yml — Manual dispatch: create release branch + PR from develop # # Triggered by pushes to release/* branches. -# Calls the reusable _build.yml, publishes to TestPyPI, and verifies the -# package installs correctly with a CLI smoke test. -# -# This workflow lets you iterate on a release branch and validate the -# built package before tagging for production. +# Calls the reusable _build.yml, publishes to TestPyPI, verifies the +# package installs correctly with a CLI smoke test, and publishes the +# VS Code extension as a pre-release to the Marketplace. # # Branch flow: # develop → release/v0.2.0 → push → this workflow # → build → TestPyPI → verify install → ✅ ready to merge and tag +# → VSIX pre-release → Marketplace (early access channel) # # Versioning: # Uses setuptools-scm — on a release branch 5 commits after the last tag, # the version will be something like 0.2.0.dev5 (PEP 440 pre-release). +# VSIX version is managed in plugins/agentops/package.json. # # Required GitHub secrets (environment: staging): # TEST_PYPI_TOKEN — TestPyPI API token +# VSCE_PAT — VS Code Marketplace Personal Access Token # # Setup: # 1. 
https://test.pypi.org/manage/account/token/ → Create TEST_PYPI_TOKEN # 2. GitHub repo → Settings → Secrets → Actions → Add to staging environment +# 3. https://dev.azure.com/ → PAT with Marketplace scope → Create VSCE_PAT +# 4. Add VSCE_PAT to staging environment name: Staging @@ -110,3 +114,35 @@ jobs: test -f .agentops/config.yaml test -f .agentops/run.yaml echo "✅ agentops init succeeded" + + # ── VSIX Pre-release ───────────────────────────────────────────────── + # Publish the VS Code extension as a pre-release to the Marketplace. + # Runs in parallel with the TestPyPI flow (only needs source checkout). + publish-vsix-prerelease: + needs: build # gate on successful lint + test + runs-on: ubuntu-latest + environment: staging + steps: + - uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: "20" + + - name: Install vsce + run: npm install -g @vscode/vsce + + - name: Package VSIX (pre-release) + working-directory: plugins/agentops + run: vsce package --pre-release -o agentops-skills.vsix + + - name: Publish pre-release to VS Code Marketplace + working-directory: plugins/agentops + run: vsce publish --pre-release --packagePath agentops-skills.vsix -p "${{ secrets.VSCE_PAT }}" + + - name: Show VSIX info + working-directory: plugins/agentops + run: | + ls -lh agentops-skills.vsix + echo "✅ VSIX pre-release published to Marketplace" diff --git a/plugins/agentops/.vscodeignore b/plugins/agentops/.vscodeignore new file mode 100644 index 00000000..1b470091 --- /dev/null +++ b/plugins/agentops/.vscodeignore @@ -0,0 +1,11 @@ +.git +node_modules +*.vsix + +# Keep these (explicit include after exclude) +!README.md +!CHANGELOG.md +!skills/**/SKILL.md +!package.json +!LICENSE +!icon.png diff --git a/plugins/agentops/CHANGELOG.md b/plugins/agentops/CHANGELOG.md new file mode 100644 index 00000000..7b0fb885 --- /dev/null +++ b/plugins/agentops/CHANGELOG.md @@ -0,0 +1,20 @@ +# Changelog + +All notable changes to the 
**AgentOps Skills for GitHub Copilot** extension +will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/). + +## [Unreleased] + +## [0.1.0] - 2025-07-08 + +### Added + +- Initial pre-release with six Copilot agent skills: + - **Workspace Setup** — initialize `.agentops/`, create configs, manage bundles and datasets + - **Run Evals** — execute evaluations, multi-model benchmarks, N-run comparisons + - **Investigate Regression** — compare runs, analyze row-level scores, root-cause regressions + - **Observability & Triage** — OTLP tracing setup, interpret evaluation outputs + - **Browse & Inspect** — list/inspect runs, view per-row scores, browse history + - **Dataset Management** — validate, describe, and import datasets diff --git a/plugins/agentops/LICENSE b/plugins/agentops/LICENSE new file mode 100644 index 00000000..22aed37e --- /dev/null +++ b/plugins/agentops/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) Microsoft Corporation. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/plugins/agentops/README.md b/plugins/agentops/README.md new file mode 100644 index 00000000..b8c6c5ce --- /dev/null +++ b/plugins/agentops/README.md @@ -0,0 +1,50 @@ +# AgentOps Skills for GitHub Copilot + +Copilot agent skills for running standardized evaluation workflows with +[AgentOps Toolkit](https://github.com/Azure/agentops) and Microsoft Foundry agents. + +## Skills + +| Skill | What it does | +|---|---| +| **Workspace Setup** | Initialize an `.agentops/` workspace, create configs, manage bundles and datasets | +| **Run Evals** | Execute evaluations, multi-model benchmarks, N-run comparisons, and generate reports | +| **Investigate Regression** | Compare runs, analyze row-level scores, and identify root causes of regressions | +| **Observability & Triage** | Set up OTLP tracing, interpret evaluation outputs, triage failed runs | +| **Browse & Inspect** | List and inspect evaluation runs, view per-row scores, browse run history | +| **Dataset Management** | Validate, describe, and import datasets for evaluation workflows | + +## Prerequisites + +Install the AgentOps CLI in your project's virtual environment: + +```bash +pip install agentops-toolkit +``` + +## Installation + +Install from the +[VS Code Marketplace](https://marketplace.visualstudio.com/items?itemName=AgentOpsToolkit.agentops-toolkit) +or search **"AgentOps Skills"** in the VS Code Extensions view. + +A **pre-release** channel is available for early access to new skills and updates — +enable it from the extension's Marketplace page or the Extensions view. + +## Usage + +Open **Copilot Chat** in VS Code and describe what you want to do. 
+The skills are invoked automatically when your request matches their domain: + +``` +> Initialize an agentops workspace for my project +> Run the default evaluation +> Compare run abc123 with run def456 +> Which rows failed the groundedness threshold? +``` + +## Links + +- [AgentOps Toolkit](https://github.com/Azure/agentops) — CLI and documentation +- [Tutorial: Basic Foundry Agent](https://github.com/Azure/agentops/blob/main/docs/tutorial-basic-foundry-agent.md) +- [How It Works](https://github.com/Azure/agentops/blob/main/docs/how-it-works.md) diff --git a/plugins/agentops/package.json b/plugins/agentops/package.json new file mode 100644 index 00000000..849a5b3d --- /dev/null +++ b/plugins/agentops/package.json @@ -0,0 +1,59 @@ +{ + "name": "agentops-toolkit", + "displayName": "AgentOps Skills for GitHub Copilot", + "description": "Copilot agent skills for running standardized evaluation workflows with AgentOps Toolkit and Microsoft Foundry agents.", + "version": "0.1.0", + "publisher": "AgentOpsToolkit", + "license": "MIT", + "repository": { + "type": "git", + "url": "https://github.com/Azure/agentops" + }, + "bugs": { + "url": "https://github.com/Azure/agentops/issues" + }, + "engines": { + "vscode": "^1.99.0" + }, + "categories": [ + "AI", + "Other" + ], + "keywords": [ + "agentops", + "evaluation", + "foundry", + "copilot", + "agent-skills", + "ai-evaluation" + ], + "contributes": { + "chatSkills": [ + { + "path": "./skills/agentops-workspace-setup/SKILL.md" + }, + { + "path": "./skills/agentops-run-evals/SKILL.md" + }, + { + "path": "./skills/agentops-investigate-regression/SKILL.md" + }, + { + "path": "./skills/agentops-observability-triage/SKILL.md" + }, + { + "path": "./skills/agentops-browse-inspect/SKILL.md" + }, + { + "path": "./skills/agentops-dataset-management/SKILL.md" + } + ] + }, + "scripts": { + "vscode:prepublish": "echo 'Declarative extension — no build step required'", + "package": "vsce package", + "package:prerelease": "vsce package 
--pre-release", + "publish": "vsce publish", + "publish:prerelease": "vsce publish --pre-release" + } +} \ No newline at end of file diff --git a/plugins/agentops/skills/agentops-browse-inspect/SKILL.md b/plugins/agentops/skills/agentops-browse-inspect/SKILL.md new file mode 100644 index 00000000..1e16363e --- /dev/null +++ b/plugins/agentops/skills/agentops-browse-inspect/SKILL.md @@ -0,0 +1,205 @@ +--- +name: agentops-browse-inspect +description: Browse evaluation bundles, inspect past runs, and explore evaluation history in an AgentOps workspace. Trigger when users ask to list bundles, show bundle details, list past runs, show run results, view run entries, inspect evaluation history, check what evaluators are configured, list available models, or list agents in a Foundry project. Common phrases include "list bundles", "show bundle", "what bundles", "list runs", "show run", "view run", "run history", "past evaluations", "inspect run", "what evaluators", "browse evaluations", "check thresholds", "list models", "what models", "list agents", "what agents", "available models". Install agentops-toolkit via pip. Commands are agentops bundle list, agentops bundle show, agentops run list, agentops run show, agentops run view, agentops model list, and agentops agent list. +--- + +# AgentOps Browse and Inspect + +> **Prerequisite:** Install the AgentOps CLI with `pip install agentops-toolkit`. + +## Before You Start + +**IMPORTANT:** Before running any commands below, verify the workspace exists: + +1. Check if `.agentops/` directory exists in the project root. +2. If it does **not** exist, run `agentops init` first (see the **agentops-workspace-setup** skill). +3. Do not proceed until `.agentops/` is confirmed. + +## Purpose + +Browse evaluation bundles and inspect past evaluation runs in an AgentOps workspace. 
Useful for exploring available evaluators, reviewing run history, understanding evaluation configurations, and discovering Foundry resources like models and agents. + +## When to Use + +- User asks what bundles or evaluators are available. +- User wants to see details of a specific bundle (evaluators, thresholds). +- User asks about past evaluation runs or run history. +- User wants to inspect results of a specific run. +- User asks which runs passed or failed thresholds. +- User wants to find the Foundry portal link for a run. +- User asks what models are available in the Foundry project. +- User asks what agents are deployed in the Foundry project. + +## Available Commands + +```bash +agentops bundle list [--dir <path>] # List evaluation bundles +agentops bundle show <name> [--dir <path>] # Show bundle details +agentops run list [--dir <path>] # List past evaluation runs +agentops run show <run-id> [--dir <path>] # Show run summary +agentops run view <run-id> [--entry N] # Deep-inspect run (planned) +agentops model list # List models in Foundry project (planned) +agentops agent list # List agents in Foundry project (planned) +``` + +### Key Flags + +| Command | Flag | Description | +|---|---|---| +| `bundle list` | `--dir` | Workspace directory (default: current directory) | +| `bundle show` | `<name>` | Bundle name or filename without `.yaml` | +| `run list` | `--dir` | Workspace directory (default: current directory) | +| `run show` | `<run-id>` | Run ID (timestamp folder name or `latest`) | +| `run view` | `--entry N` | Row/entry index for deep inspection (planned) | +| `model list` | — | List chat-capable models (planned) | +| `agent list` | — | List agents in Foundry project (planned) | + +## Recommended Workflow + +### Explore Available Bundles + +List all bundles in the workspace: + +```bash +agentops bundle list +``` + +Output shows each bundle's name, description, enabled evaluators, and threshold count: + +``` +Bundles in .agentops/bundles: + + model_direct_baseline + Baseline evaluation for model-direct targets + 
evaluators: SimilarityEvaluator, avg_latency_seconds + thresholds: 2 + + rag_retrieval_baseline + Baseline evaluation for RAG retrieval + evaluators: GroundednessEvaluator, SimilarityEvaluator, avg_latency_seconds + thresholds: 3 +``` + +### Inspect a Bundle + +View full details of a specific bundle including evaluator settings and threshold definitions: + +```bash +agentops bundle show model_direct_baseline +``` + +Output: + +``` +Bundle: model_direct_baseline +Path: .agentops/bundles/model_direct_baseline.yaml + +Evaluators: + SimilarityEvaluator (source=foundry, enabled) + avg_latency_seconds (source=local, enabled) + +Thresholds: + SimilarityEvaluator >= 0.7 + avg_latency_seconds <= 5.0 +``` + +### Browse Run History + +List past evaluation runs sorted by most recent first: + +```bash +agentops run list +``` + +Output: + +``` +Runs in .agentops/results: + + 20250610-143022 PASS bundle=model_direct_baseline dataset=smoke-model-direct duration=42.3s + 20250609-091500 FAIL bundle=rag_retrieval_baseline dataset=smoke-rag duration=58.1s +``` + +### Inspect a Specific Run + +Show the full summary of a run by its ID or use `latest`: + +```bash +agentops run show latest +agentops run show 20250610-143022 +``` + +Output includes: +- Run status (PASS/FAIL) +- Bundle and dataset used +- Backend type +- Start time and duration +- Items passed/failed counts +- Metric scores +- Threshold results with actual vs expected values +- Foundry portal URL (if cloud evaluation was used) + +### Deep-Inspect a Run Entry (Planned) + +The `run view` command will allow inspecting individual evaluation entries: + +```bash +agentops run view 20250610-143022 --entry 3 +``` + +This command is planned for a future release. + +## Foundry Resource Discovery (Planned) + +These commands are planned for a future release: + +### List Models + +```bash +agentops model list +``` + +Will list chat-capable model deployments available in the Foundry project. 
Useful for choosing which model to target in a `run.yaml` when using `target: model`. + +### List Agents + +```bash +agentops agent list +``` + +Will list agents deployed in the Foundry project. Useful for discovering agent IDs (e.g., `my-agent:3`) to target in a `run.yaml` when using `target: agent`. + +When users ask about available models or agents, mention that these commands are planned and suggest checking the Foundry portal or using `az` CLI as a workaround. + +## Common Patterns + +### Check if a bundle meets your needs + +```bash +agentops bundle show rag_retrieval_baseline +``` + +Review the evaluators list to confirm the right metrics are being measured, then check thresholds to ensure quality gates match your requirements. + +### Find which runs failed and why + +```bash +agentops run list # Find runs with FAIL status +agentops run show <run-id> # Check threshold results +``` + +Look at the Thresholds section in the run output — it shows which specific evaluators failed with actual vs expected values. + +### Compare with latest run + +```bash +agentops run show latest # Current baseline +agentops eval compare --runs latest,<run-id> # Side-by-side (from agentops-run-evals skill) +``` + +## Exit Codes + +| Code | Meaning | +|---|---| +| `0` | Command succeeded | +| `1` | Runtime or configuration error (e.g., workspace not found, bundle not found) | diff --git a/plugins/agentops/skills/agentops-dataset-management/SKILL.md b/plugins/agentops/skills/agentops-dataset-management/SKILL.md new file mode 100644 index 00000000..7eedffb0 --- /dev/null +++ b/plugins/agentops/skills/agentops-dataset-management/SKILL.md @@ -0,0 +1,222 @@ +--- +name: agentops-dataset-management +description: Guide users through creating, validating, and managing evaluation datasets for AgentOps. 
Trigger when users ask about dataset format, creating datasets, JSONL rows, dataset YAML config, dataset fields, validating datasets, describing datasets, importing datasets, input/expected/context fields, or dataset schema mapping. Common phrases include "create dataset", "validate dataset", "dataset format", "JSONL format", "dataset schema", "import dataset", "dataset fields", "input field", "expected field", "context field", "describe dataset", "dataset rows", "dataset YAML", "add evaluation data". Install agentops-toolkit via pip. Commands are agentops dataset validate, agentops dataset describe, and agentops dataset import. +--- + +# AgentOps Dataset Management + +> **Prerequisite:** Install the AgentOps CLI with `pip install agentops-toolkit`. + +## Before You Start + +**IMPORTANT:** Before running any commands below, verify the workspace exists: + +1. Check if `.agentops/` directory exists in the project root. +2. If it does **not** exist, run `agentops init` first (see the **agentops-workspace-setup** skill). +3. Do not proceed until `.agentops/` is confirmed. + +## Purpose + +Guide users through creating, formatting, and managing evaluation datasets used by AgentOps evaluations. Covers the two-file dataset structure (YAML config + JSONL rows), field mapping for different evaluation scenarios, and dataset management commands. + +## When to Use + +- User wants to create a new evaluation dataset. +- User asks about dataset format or JSONL structure. +- User needs to understand field mapping (input, expected, context). +- User wants to validate a dataset before running an evaluation. +- User asks how to import data into AgentOps format. +- User wants to understand what fields different evaluators require. 
+ +## Available Commands + +```bash +agentops dataset validate # Validate dataset config (planned) +agentops dataset describe # Describe dataset structure (planned) +agentops dataset import # Import external data (planned) +``` + +> These commands are planned for a future release. This skill guides you through manual dataset creation and formatting. + +## Dataset Structure + +AgentOps uses a **two-file structure** for datasets: + +1. **Dataset YAML config** — metadata, schema mapping, and path to JSONL rows +2. **Dataset JSONL file** — one JSON object per line containing evaluation data + +### File Layout + +``` +.agentops/ +├── datasets/ +│ ├── smoke-model-direct.yaml # Dataset config +│ ├── smoke-rag.yaml +│ └── smoke-agent-tools.yaml +└── data/ + ├── smoke-model-direct.jsonl # Dataset rows + ├── smoke-rag.jsonl + └── smoke-agent-tools.jsonl +``` + +## Dataset YAML Config + +The dataset YAML config defines metadata, the source JSONL path, and field mapping. + +### Model-Direct Dataset + +```yaml +version: 1 +name: smoke-model-direct +description: Smoke test for model-direct evaluation +source: + type: file + path: ../data/smoke-model-direct.jsonl +format: + type: jsonl + input_field: input + expected_field: expected +``` + +### RAG Dataset + +```yaml +version: 1 +name: smoke-rag +description: Smoke test for RAG evaluation +source: + type: file + path: ../data/smoke-rag.jsonl +format: + type: jsonl + input_field: input + expected_field: expected + context_field: context +``` + +### Agent with Tools Dataset + +```yaml +version: 1 +name: smoke-agent-tools +description: Smoke test for agent with tools evaluation +source: + type: file + path: ../data/smoke-agent-tools.jsonl +format: + type: jsonl + input_field: input + expected_field: expected +``` + +### Key Fields + +| Field | Required | Description | +|---|---|---| +| `version` | Yes | Schema version (currently `1`) | +| `name` | Yes | Dataset identifier | +| `description` | No | Human-readable description | +| 
`source.type` | Yes | Source type (`file`) | +| `source.path` | Yes | Relative path to JSONL file (relative to dataset YAML location) | +| `format.type` | Yes | Row format (`jsonl`) | +| `format.input_field` | Yes | Field name for evaluation input/query | +| `format.expected_field` | No | Field name for expected/ground truth answer | +| `format.context_field` | No | Field name for retrieval context (RAG scenarios) | + +## JSONL Row Format + +Each line in the JSONL file is a JSON object representing one evaluation item. + +### Model-Direct Rows + +```jsonl +{"input": "What is the capital of France?", "expected": "Paris"} +{"input": "Explain photosynthesis briefly.", "expected": "Photosynthesis converts sunlight into chemical energy in plants."} +``` + +### RAG Rows + +```jsonl +{"input": "What are the return policy terms?", "expected": "30-day return window with receipt.", "context": "Our return policy allows returns within 30 days of purchase with a valid receipt."} +{"input": "What is the shipping time?", "expected": "3-5 business days.", "context": "Standard shipping takes 3-5 business days for domestic orders."} +``` + +### Agent with Tools Rows + +```jsonl +{"input": "Book a meeting for tomorrow at 2pm", "expected": "Meeting booked for tomorrow at 2:00 PM"} +{"input": "What is the weather in Seattle?", "expected": "Current weather conditions in Seattle"} +``` + +## Creating a New Dataset + +### Step 1: Create the JSONL Data File + +Create a new file in `.agentops/data/`: + +```bash +touch .agentops/data/my-custom-dataset.jsonl # Example: create a custom evaluation dataset +``` + +Write one JSON object per line. 
Each object must include at minimum the field specified by `input_field`: + +```jsonl +{"input": "Your test query", "expected": "Expected response"} +``` + +### Step 2: Create the Dataset YAML Config + +Create a new file in `.agentops/datasets/`: + +```yaml +version: 1 +name: my-custom-dataset +description: Custom evaluation dataset for my agent +source: + type: file + path: ../data/my-custom-dataset.jsonl +format: + type: jsonl + input_field: input + expected_field: expected +``` + +### Step 3: Reference in run.yaml + +Update your run configuration to use the new dataset: + +```yaml +dataset: + path: datasets/my-custom-dataset.yaml +``` + +## Field Requirements by Evaluator Type + +Different evaluators require different fields in the dataset: + +| Evaluator Category | Required Fields | Optional Fields | +|---|---|---| +| Similarity (SimilarityEvaluator) | `input`, `expected` | — | +| Groundedness (GroundednessEvaluator) | `input`, `context` | `expected` | +| RAG evaluators (RelevanceEvaluator, etc.) | `input`, `context` | `expected` | +| Tool evaluators (ToolCallAccuracyEvaluator) | `input` | `expected`, `tool_definitions` | +| Task completion (TaskCompletionEvaluator) | `input`, `expected` | — | +| Latency (avg_latency_seconds) | `input` | — | + +## Validation Checklist + +Before running an evaluation, verify: + +1. **JSONL format** — Each line is valid JSON, no trailing commas. +2. **Required fields** — Every row has the `input_field` defined in the YAML config. +3. **Expected fields** — Rows include `expected` if the bundle uses similarity or task-completion evaluators. +4. **Context fields** — Rows include `context` if the bundle uses groundedness or RAG evaluators. +5. **Path reference** — The `source.path` in dataset YAML correctly points to the JSONL file. +6. **Encoding** — Files are UTF-8 encoded. + +## Troubleshooting + +- **"Dataset file not found"** — Check that `source.path` in the YAML config is correct relative to the dataset YAML file location. 
+- **"Missing required field"** — Ensure every JSONL row contains the field specified by `format.input_field`. +- **"Invalid JSON"** — Check JSONL file for syntax errors. Each line must be valid JSON. +- **Evaluator returns null scores** — The dataset may be missing fields that the evaluator requires (e.g., `context` for groundedness). diff --git a/plugins/agentops/skills/agentops-investigate-regression/SKILL.md b/plugins/agentops/skills/agentops-investigate-regression/SKILL.md index 32f05a59..abb2d6b6 100644 --- a/plugins/agentops/skills/agentops-investigate-regression/SKILL.md +++ b/plugins/agentops/skills/agentops-investigate-regression/SKILL.md @@ -7,6 +7,14 @@ description: Help users investigate evaluation regressions in AgentOps by compar > **Prerequisite:** Install the AgentOps CLI with `pip install agentops-toolkit`. +## Before You Start + +**IMPORTANT:** Before running any commands below, verify the workspace exists: + +1. Check if `.agentops/` directory exists in the project root. +2. If it does **not** exist, run `agentops init` first (see the **agentops-workspace-setup** skill). +3. Do not proceed until `.agentops/` is confirmed. + ## Purpose Guide users through regression investigation using N-run comparison, row-level score analysis, and structured root cause identification. diff --git a/plugins/agentops/skills/agentops-observability-triage/SKILL.md b/plugins/agentops/skills/agentops-observability-triage/SKILL.md index 451d13dc..a1e5481a 100644 --- a/plugins/agentops/skills/agentops-observability-triage/SKILL.md +++ b/plugins/agentops/skills/agentops-observability-triage/SKILL.md @@ -7,6 +7,14 @@ description: Guide users on observability and triage workflows for AgentOps eval > **Prerequisite:** Install the AgentOps CLI with `pip install agentops-toolkit`. +## Before You Start + +**IMPORTANT:** Before running any commands below, verify the workspace exists: + +1. Check if `.agentops/` directory exists in the project root. +2. 
If it does **not** exist, run `agentops init` first (see the **agentops-workspace-setup** skill). +3. Do not proceed until `.agentops/` is confirmed. + ## Purpose Provide practical observability guidance using current reporting artifacts. Frame tracing/monitoring as planned future features while showing what's available today — including HTML reports with visual indicators and N-run comparison dashboards. diff --git a/plugins/agentops/skills/agentops-run-evals/SKILL.md b/plugins/agentops/skills/agentops-run-evals/SKILL.md index 64340e93..9c9c1f82 100644 --- a/plugins/agentops/skills/agentops-run-evals/SKILL.md +++ b/plugins/agentops/skills/agentops-run-evals/SKILL.md @@ -7,6 +7,14 @@ description: Guide users through running AgentOps evaluations end to end — sin > **Prerequisite:** Install the AgentOps CLI with `pip install agentops-toolkit`. +## Before You Start + +**IMPORTANT:** Before running any commands below, verify the workspace exists: + +1. Check if `.agentops/` directory exists in the project root. +2. If it does **not** exist, run `agentops init` first (see the **agentops-workspace-setup** skill). +3. Do not proceed until `.agentops/` is confirmed. + ## Purpose Guide users through the full AgentOps evaluation workflow — workspace setup, running evaluations, comparing N runs, benchmarking models/agents, and interpreting reports. @@ -26,6 +34,8 @@ agentops init [--path <dir>] # Scaffold workspace agentops eval run [-c <config>] [-f md|html|all] # Run evaluation agentops report [--in <results.json>] [-f md|html|all] # Regenerate report agentops eval compare --runs <id1>,<id2>[,<id3>,...] 
[-f md|html|all] # Compare N runs +agentops report show # View reports in table format (planned) +agentops report export # Export reports as JSON/Markdown/CSV (planned) ``` ### Key flags @@ -124,9 +134,18 @@ az login # local development # CI/CD: set AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET ``` +### Report Inspection (Planned) + +These commands are planned for a future release: + +- `agentops report show` — view reports interactively in table format +- `agentops report export` — export reports in JSON, Markdown, or CSV formats + +When users ask about viewing or exporting reports, mention that these commands are planned and recommend using `agentops report --in <results.json>` to regenerate reports in the meantime. + ## Guardrails - Do not invent commands or flags beyond documented CLI behavior. -- Planned commands (`run list`, `bundle show`, `trace init`, `monitor`) are NOT implemented — state they are planned. +- Planned commands (`run list`, `bundle show`, `trace init`, `monitor`, `report show`, `report export`) are NOT implemented — state they are planned. - The `--format` flag accepts only `md`, `html`, or `all`. - When comparing runs with different datasets, row-level comparison is not meaningful — the report handles this automatically. diff --git a/plugins/agentops/skills/agentops-workspace-setup/SKILL.md b/plugins/agentops/skills/agentops-workspace-setup/SKILL.md new file mode 100644 index 00000000..a5fb1b8f --- /dev/null +++ b/plugins/agentops/skills/agentops-workspace-setup/SKILL.md @@ -0,0 +1,194 @@ +--- +name: agentops-workspace-setup +description: Guide users through initializing an AgentOps workspace, configuring CI/CD pipelines, and managing workspace settings. Trigger when users ask to initialize agentops, scaffold workspace, generate CI/CD workflow, set up GitHub Actions, configure agentops, validate config, show config, customize workspace paths, or set up evaluation pipelines. 
Common phrases include "initialize agentops", "set up workspace", "config cicd", "CI/CD pipeline", "GitHub Actions", "generate workflow", "configure agentops", "workspace setup", "config.yaml", "config validate", "config show". Install agentops-toolkit via pip. Commands are agentops init, agentops config cicd, agentops config validate, and agentops config show. +--- + +# AgentOps Workspace Setup + +> **Prerequisite:** Install the AgentOps CLI with `pip install agentops-toolkit`. + +## Purpose + +Guide users through initializing an AgentOps evaluation workspace, configuring CI/CD pipelines with GitHub Actions, and managing workspace configuration. + +## When to Use + +- User wants to start using AgentOps in a new project. +- User asks how to set up the `.agentops/` directory. +- User wants to generate a GitHub Actions workflow for evaluation. +- User asks about CI/CD integration for AgentOps evaluations. +- User wants to inspect or validate workspace configuration. +- User asks about workspace directory structure or config.yaml. + +## Available Commands + +```bash +agentops init [--path <dir>] [--force] # Scaffold .agentops/ workspace +agentops config cicd [--force] [--dir <dir>] # Generate GitHub Actions workflow +agentops config validate # Validate workspace config (planned) +agentops config show # Show resolved config (planned) +``` + +### Key Flags + +| Command | Flag | Description | +|---|---|---| +| `init` | `--path / --dir` | Target project directory (default: current directory) | +| `init` | `--force` | Overwrite existing files | +| `config cicd` | `--force` | Overwrite existing workflow file | +| `config cicd` | `--dir` | Project root directory (default: current directory) | + +## Recommended Workflow + +### Initialize a New Workspace + +1. Navigate to your project root. +2. Run `agentops init` to scaffold the `.agentops/` directory. +3. Review the generated files and customize as needed. 
+ +```bash +cd my-project +agentops init +``` + +This creates: + +``` +.agentops/ +├── config.yaml # Workspace defaults +├── run.yaml # Default run configuration +├── run-rag.yaml # RAG evaluation run config +├── run-agent.yaml # Agent evaluation run config +├── .gitignore # Git exclusions for results +├── bundles/ +│ ├── model_direct_baseline.yaml +│ ├── rag_retrieval_baseline.yaml +│ └── agent_tools_baseline.yaml +├── datasets/ +│ ├── smoke-model-direct.yaml +│ ├── smoke-rag.yaml +│ └── smoke-agent-tools.yaml +├── data/ +│ ├── smoke-model-direct.jsonl +│ ├── smoke-rag.jsonl +│ └── smoke-agent-tools.jsonl +└── results/ # Created on first run +``` + +Use `--force` to re-scaffold and overwrite existing files: + +```bash +agentops init --force +``` + +### Configure run.yaml + +Edit `.agentops/run.yaml` to point to your bundle, dataset, and backend: + +```yaml +version: 1 +bundle: + path: bundles/model_direct_baseline.yaml +dataset: + path: datasets/smoke-model-direct.yaml +backend: + type: foundry + target: model + model: gpt-4o-mini + project_endpoint_env: AZURE_AI_FOUNDRY_PROJECT_ENDPOINT + timeout_seconds: 1800 +output: + write_report: true +``` + +Backend type options: +- `type: foundry` — Microsoft Foundry Agent Service (default) +- `type: subprocess` — Custom subprocess pipeline + +Foundry target options: +- `target: agent` — Evaluate a Foundry agent (requires `agent_id`) +- `target: model` — Evaluate a model deployment directly (requires `model`) + +### Set Up CI/CD with GitHub Actions + +1. Generate the workflow file: + +```bash +agentops config cicd +``` + +This creates `.github/workflows/agentops-eval.yml`. + +2. 
Configure GitHub repository settings: + +**Repository variables** (Settings → Secrets and variables → Actions → Variables): + +| Variable | Value | +|---|---| +| `AZURE_CLIENT_ID` | Application (client) ID | +| `AZURE_TENANT_ID` | Directory (tenant) ID | +| `AZURE_SUBSCRIPTION_ID` | Subscription ID | + +**Repository secret** (Settings → Secrets and variables → Actions → Secrets): + +| Secret | Value | +|---|---| +| `AZURE_AI_FOUNDRY_PROJECT_ENDPOINT` | Foundry project endpoint URL | + +3. The workflow uses **Workload Identity Federation (OIDC)** — no client secrets to rotate. + +4. Triggers: + - `pull_request` — Runs on PRs targeting `main` or `develop` + - `workflow_dispatch` — Manual runs from the Actions tab + +5. Push a PR to trigger the evaluation automatically. + +### Regenerate the workflow file + +Use `--force` to overwrite an existing workflow: + +```bash +agentops config cicd --force +``` + +## Exit Codes + +| Code | Meaning | +|---|---| +| `0` | Command succeeded | +| `1` | Runtime or configuration error | + +## Workspace Config Reference + +The `.agentops/config.yaml` file controls workspace-level defaults: + +```yaml +paths: + bundles_dir: bundles + datasets_dir: datasets + data_dir: data + results_dir: results +defaults: + backend: foundry + timeout_seconds: 1800 +report: + generate_markdown: true +``` + +## CI/CD Artifacts + +The generated workflow uploads these artifacts as `agentops-eval-results`: + +| File | Description | +|---|---| +| `results.json` | Machine-readable evaluation results | +| `report.md` | Human-readable Markdown summary | +| `cloud_evaluation.json` | Foundry portal link (cloud mode only) | +| `backend_metrics.json` | Raw backend scores per row | + +## Troubleshooting + +- **"No .agentops workspace found"** — Run `agentops init` first. +- **Workflow file already exists** — Use `agentops config cicd --force` to overwrite. +- **OIDC authentication fails** — Ensure federated credentials match your repo and branch pattern. 
+- **Missing environment variables** — Set `AZURE_AI_FOUNDRY_PROJECT_ENDPOINT` as a repository secret. diff --git a/src/agentops/cli/app.py b/src/agentops/cli/app.py index 8441c116..97625839 100644 --- a/src/agentops/cli/app.py +++ b/src/agentops/cli/app.py @@ -102,16 +102,6 @@ def cmd_agent_list() -> None: DEFAULT_REPORT_INPUT = Path(".agentops/results/latest/results.json") -def _planned_command(command_name: str) -> None: - typer.echo( - "This command is planned but not implemented in this release:\n" - f" {command_name}\n" - "Please use the currently available commands" - " (`init`, `eval run`, `eval compare`, `report`, `config cicd`) for now." - ) - raise typer.Exit(code=1) - - # --------------------------------------------------------------------------- # Global callback — configures logging before any command runs # --------------------------------------------------------------------------- diff --git a/src/agentops/services/comparison.py b/src/agentops/services/comparison.py index 388fecf2..0e3f8b05 100644 --- a/src/agentops/services/comparison.py +++ b/src/agentops/services/comparison.py @@ -1,4 +1,5 @@ """Comparison service for evaluating baseline vs current run results.""" + from __future__ import annotations import json @@ -53,7 +54,9 @@ def _resolve_run_path(run_id: str, workspace_dir: Path | None = None) -> Path: return results_in_dir.resolve() results_base = workspace_dir or (Path.cwd() / ".agentops") - results_dir = results_base / "results" if results_base.name != "results" else results_base + results_dir = ( + results_base / "results" if results_base.name != "results" else results_base + ) run_dir = results_dir / run_id results_file = run_dir / "results.json" if results_file.is_file(): @@ -228,7 +231,8 @@ def compare_runs( # Best run: for lower-is-better pick min, otherwise pick max valid_vals = [ - (i, v) for i, v in enumerate(values) + (i, v) + for i, v in enumerate(values) if any(m.name == name for m in results[i].metrics) ] best_idx: 
Optional[int] = None @@ -238,14 +242,16 @@ def compare_runs( else: best_idx = max(valid_vals, key=lambda x: x[1])[0] - metric_rows.append(ComparisonMetricRow( - name=name, - values=values, - deltas=deltas, - delta_percents=delta_percents, - directions=directions, - best_run_index=best_idx, - )) + metric_rows.append( + ComparisonMetricRow( + name=name, + values=values, + deltas=deltas, + delta_percents=delta_percents, + directions=directions, + best_run_index=best_idx, + ) + ) # Build threshold rows all_thresholds: List[tuple[str, str]] = [] @@ -267,12 +273,14 @@ def compare_runs( passed_list.append(t.passed if t else False) if t and target_val is None: target_val = t.expected - threshold_rows.append(ComparisonThresholdRow( - evaluator=evaluator, - criteria=criteria, - target=target_val, - passed=passed_list, - )) + threshold_rows.append( + ComparisonThresholdRow( + evaluator=evaluator, + criteria=criteria, + target=target_val, + passed=passed_list, + ) + ) # Build item rows all_row_indices: set[int] = set() @@ -287,7 +295,9 @@ def compare_runs( for idx in sorted(all_row_indices): passed_list = [] # Per-evaluator scores for this row across all runs - scores: Dict[str, List[Optional[float]]] = {name: [] for name in threshold_evaluator_names} + scores: Dict[str, List[Optional[float]]] = { + name: [] for name in threshold_evaluator_names + } for r in results: item_map = {item.row_index: item for item in r.item_evaluations} item = item_map.get(idx) @@ -301,7 +311,9 @@ def compare_runs( scores[name].append(val_map.get(name)) else: scores[name].append(None) - item_rows.append(ComparisonItemRow(row_index=idx, passed_all=passed_list, scores=scores)) + item_rows.append( + ComparisonItemRow(row_index=idx, passed_all=passed_list, scores=scores) + ) # Summary: regression = a run whose status flipped from PASS to FAIL, # or a threshold that was met by baseline but missed by this run. 
@@ -345,7 +357,10 @@ def run_comparison( report_format: str = "md", ) -> ComparisonServiceResult: """Resolve run IDs, compare, and write comparison outputs.""" - from agentops.core.reporter import generate_comparison_html, generate_comparison_markdown + from agentops.core.reporter import ( + generate_comparison_html, + generate_comparison_markdown, + ) paths = [_resolve_run_path(rid) for rid in run_ids] result = compare_runs(run_paths=paths, run_ids=run_ids) diff --git a/tests/unit/test_cli_commands.py b/tests/unit/test_cli_commands.py index 4676f846..595af980 100644 --- a/tests/unit/test_cli_commands.py +++ b/tests/unit/test_cli_commands.py @@ -24,7 +24,10 @@ def test_eval_compare_rejects_wrong_run_count() -> None: result = runner.invoke(app, ["eval", "compare", "--runs", "only_one"]) assert result.exit_code == 1 - assert "at least two" in result.stdout.lower() or "at least two" in (result.stderr or "").lower() + assert ( + "at least two" in result.stdout.lower() + or "at least two" in (result.stderr or "").lower() + ) def test_trace_init_is_planned_stub() -> None: