From 5e1f4b2f610c23e94b89f1d1c2c777ebd27c3971 Mon Sep 17 00:00:00 2001 From: Paulo Lacerda Date: Fri, 8 May 2026 09:57:02 -0300 Subject: [PATCH] Harden AgentOps tutorials and skills docs Align shipped skills with the installer, clarify Watchdog vs coding-agent skills, and rewrite the HTTP, tool workflow, and Copilot skills tutorials around real Azure Container Apps evals and PR-first CI guidance. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/ci-github-actions.md | 497 +++++++++--------- docs/tutorial-agent-workflow.md | 315 ++++++++--- docs/tutorial-copilot-skills.md | 369 ++++++------- docs/tutorial-http-agent.md | 416 +++++++++------ plugins/agentops/README.md | 171 +++--- .../skills/agentops-workflow/SKILL.md | 33 +- src/agentops/services/skills.py | 3 + .../skills/agentops-workflow/SKILL.md | 33 +- tests/unit/test_skills.py | 12 +- 9 files changed, 1102 insertions(+), 747 deletions(-) diff --git a/docs/ci-github-actions.md b/docs/ci-github-actions.md index 88dc91a9..85d98fa2 100644 --- a/docs/ci-github-actions.md +++ b/docs/ci-github-actions.md @@ -1,245 +1,252 @@ -# AgentOps GenAIOps GitFlow on GitHub Actions - -This guide shows how to wire AgentOps into a complete GenAIOps CI/CD -pipeline on GitHub Actions, mapped to a classic GitFlow branching model -with three deployment environments (`dev`, `qa`, `production`). - -`agentops workflow generate` ships **four** ready-to-use templates that -form the full scaffold: - -| File | Trigger | GitHub Environment | Purpose | -|---|---|---|---| -| `agentops-pr.yml` | PRs to `develop`, `release/**`, `main` | (none) | Eval gate. Fails the PR if thresholds drop. Comments report on PR. 
| -| `agentops-deploy-dev.yml` | push to `develop` | `dev` | Eval → build → deploy DEV | -| `agentops-deploy-qa.yml` | push to `release/**` | `qa` | Eval → build → deploy QA | -| `agentops-deploy-prod.yml` | push to `main` | `production` | Safety eval → build → deploy PROD (gated by required reviewers) | - -## GitFlow assumed - -```mermaid -flowchart LR - feat["feature/*"] -->|PR| prGate1{{"agentops-pr.yml<br/>(gate)"}} - prGate1 -->|merge| dev["develop"] - dev --> deployDev["agentops-deploy-dev.yml"] - deployDev --> DEV(["DEV"]) - - rel["release/*"] -->|push| deployQa["agentops-deploy-qa.yml"] - deployQa --> QA(["QA"]) - - rel -->|PR| prGate2{{"agentops-pr.yml<br/>(gate)"}} - prGate2 -->|merge| main["main"] - main --> deployProd["agentops-deploy-prod.yml"] - deployProd --> PROD(["PROD<br/>(required reviewers)"]) - - classDef gate fill:#fff3cd,stroke:#856404,color:#000; - classDef env fill:#d1ecf1,stroke:#0c5460,color:#000; - class prGate1,prGate2 gate; - class DEV,QA,PROD env; -``` - -If you are on trunk-based development, generate only the templates you -need: `agentops workflow generate --kinds pr,dev,prod`. - -## Quick start - -```bash -# 1. Make sure your eval works locally first. -agentops eval run - -# 2. Generate the four workflows. -agentops workflow generate - -# 3. Configure GitHub (see sections below): -# - OIDC repo variables -# - dev / qa / production environments -# - branch protection on develop and main -# - fill in Build / Deploy placeholders - -# 4. Commit and push. -``` - -## Configuration walkthrough - -### 1. Repository variables (OIDC) - -In Settings → Secrets and variables → Actions → **Variables**, add: - -| Variable | Purpose | -|---|---| -| `AZURE_CLIENT_ID` | App registration / managed identity used for federated login | -| `AZURE_TENANT_ID` | Azure AD tenant | -| `AZURE_SUBSCRIPTION_ID` | Target subscription | -| `AZURE_AI_FOUNDRY_PROJECT_ENDPOINT` | Foundry project URL (used by the eval step) | - -Then on the Azure side, configure Workload Identity Federation -(federated credentials) on the app registration so it can be assumed -from GitHub Actions runs. See -[Microsoft's WIF docs](https://learn.microsoft.com/azure/active-directory/workload-identities/workload-identity-federation-create-trust?pivots=identity-wif-apps-methods-azp). - -### 2. GitHub Environments - -In Settings → Environments, create three: - -#### `dev` -- Usually no protection rules. -- Override env-specific variables here (e.g. dev resource group, dev - ACA app name). - -#### `qa` -- Optional: restrict deployment branches to `release/**`. -- Override env-specific variables for QA infra. - -#### `production` -- **Required reviewers**: at least one. Deploys to PROD pause until - approved. -- Optional: **Wait timer** for an extra cool-down.
-- Optional: **Deployment branches**: restrict to `main`. -- Override env-specific variables for production infra. - -Environment-level variables override repo-level ones automatically -when the workflow's `environment:` matches. - -### 3. Fill in Build and Deploy - -Each `agentops-deploy-*.yml` ships with `Build (placeholder)` and -`Deploy (placeholder)` steps. The DEV template lists commented example -snippets for the most common patterns. Copy the relevant one into all -three deploy templates. - -#### Container Apps - -```yaml -# Build -- name: Build image - run: | - az acr build \ - --registry "${{ vars.ACR_NAME }}" \ - --image "myapp:${{ github.sha }}" \ - . - -# Deploy -- name: Deploy to ACA - run: | - az containerapp update \ - --name "${{ vars.ACA_APP_NAME }}" \ - --resource-group "${{ vars.AZURE_RESOURCE_GROUP }}" \ - --image "${{ vars.ACR_NAME }}.azurecr.io/myapp:${{ github.sha }}" -``` - -#### App Service - -```yaml -# Build -- uses: actions/setup-python@v5 - with: { python-version: "3.11" } -- run: pip install -r requirements.txt -t ./dist -- run: cp -r src ./dist/ - -# Deploy -- uses: azure/webapps-deploy@v3 - with: - app-name: ${{ vars.WEBAPP_NAME }} - package: ./dist -``` - -#### Foundry hosted agent - -```yaml -# Build is typically empty: hosted agents are configured, not packaged. - -# Deploy: publish a new agent version with whatever your project uses -# to manage Foundry agents (project-specific tooling). -``` - -#### azd-managed app - -```yaml -# Build -- uses: Azure/setup-azd@v2 -- run: azd package --no-prompt - -# Deploy -- run: azd deploy --no-prompt - env: - AZURE_ENV_NAME: dev # or qa / prod -``` - -### 4. Branch protection - -In Settings → Branches, add a rule for **both `develop` and `main`**: - -- ✅ Require a pull request before merging. -- ✅ Require status checks to pass: select - **`AgentOps PR / Eval (PR gate)`**. -- (Optional) Require linear history. - -This makes the AgentOps eval a hard merge requirement. 
- -## Exit codes - -The eval step uses the AgentOps exit code contract to gate deploys: - -| Exit code | Meaning | Job result | -|---|---|---| -| `0` | Eval ran, all thresholds passed | ✅ pass | -| `2` | Eval ran, one or more thresholds failed | ❌ fail (deploy never runs) | -| `1` | Runtime / config error | ❌ fail | - -## Artifacts - -Each workflow uploads (always — even on failure): - -- `results.json` — machine-readable, versioned -- `report.md` — human-readable -- `cloud_evaluation.json` — present when using Foundry cloud evaluation; - contains a deep link to the New Foundry Experience Evaluations page - -Artifact names per workflow: - -| Workflow | Artifact name | -|---|---| -| `agentops-pr.yml` | `agentops-pr-results` | -| `agentops-deploy-dev.yml` | `agentops-dev-results` | -| `agentops-deploy-qa.yml` | `agentops-qa-results` | -| `agentops-deploy-prod.yml` | `agentops-prod-results` | - -## CLI reference - -```bash -agentops workflow generate # all four templates (default) -agentops workflow generate --kinds pr,dev,prod # subset (trunk-based) -agentops workflow generate --force # overwrite existing files -agentops workflow generate --dir # different repo root -``` - -| Flag | Description | Default | -|---|---|---| -| `--kinds` | Comma-separated subset of `pr,dev,qa,prod` | all four | -| `--force` | Overwrite existing workflow files | `false` | -| `--dir` | Repository root | `.` | - -## Customisation tips - -- **Tighten thresholds for QA / PROD** - copy `agentops.yaml` to - `agentops-qa.yaml` / `agentops-prod.yaml` and tighten the - `thresholds:` block. Update the `inputs.config` default in the - matching workflow file. -- **Scheduled runs** — add a `schedule:` entry in `agentops-pr.yml` (or - a new file) to evaluate against `main` nightly. -- **Matrix per scenario** - if you have multiple AgentOps config files, extend - the eval job with `strategy.matrix.config:` and reference - `${{ matrix.config }}` in the eval step. 
-- **Regression baseline** - wire deploy templates to download the - previous run's `results.json` artifact and call - `agentops eval run --baseline `. - -## Migration from the older 3-template layout - -If your repository still has `agentops-eval.yml`, `agentops-eval-ci.yml`, -or `agentops-eval-cd.yml` from a prior version of AgentOps: - -1. Delete the three old files. -2. Run `agentops workflow generate`. -3. Re-add Build / Deploy commands you had customised. -4. Update branch-protection status checks to point at the new - `AgentOps PR` job. +# AgentOps GenAIOps GitFlow on GitHub Actions + +This guide shows how to wire AgentOps into a complete GenAIOps CI/CD +pipeline on GitHub Actions, mapped to a classic GitFlow branching model +with three deployment environments (`dev`, `qa`, `production`). + +`agentops workflow generate --kinds pr` is the safe first step for a new +repository: it creates only the PR eval gate. Generate the full DEV/QA/PROD +deploy scaffold after GitHub Environments, Azure OIDC, and real +build/deploy commands are configured. + +The full scaffold ships four templates: + +| File | Trigger | GitHub Environment | Purpose | +|---|---|---|---| +| `agentops-pr.yml` | PRs to `develop`, `release/**`, `main` | `dev` | Eval gate. Fails the PR if thresholds drop. Comments report on PR. | +| `agentops-deploy-dev.yml` | push to `develop` | `dev` | Eval → build → deploy DEV | +| `agentops-deploy-qa.yml` | push to `release/**` | `qa` | Eval → build → deploy QA | +| `agentops-deploy-prod.yml` | push to `main` | `production` | Safety eval → build → deploy PROD (gated by required reviewers) | + +## GitFlow assumed + +```mermaid +flowchart LR + feat["feature/*"] -->|PR| prGate1{{"agentops-pr.yml<br/>(gate)"}} + prGate1 -->|merge| dev["develop"] + dev --> deployDev["agentops-deploy-dev.yml"] + deployDev --> DEV(["DEV"]) + + rel["release/*"] -->|push| deployQa["agentops-deploy-qa.yml"] + deployQa --> QA(["QA"]) + + rel -->|PR| prGate2{{"agentops-pr.yml<br/>(gate)"}} + prGate2 -->|merge| main["main"] + main --> deployProd["agentops-deploy-prod.yml"] + deployProd --> PROD(["PROD<br/>(required reviewers)"]) + + classDef gate fill:#fff3cd,stroke:#856404,color:#000; + classDef env fill:#d1ecf1,stroke:#0c5460,color:#000; + class prGate1,prGate2 gate; + class DEV,QA,PROD env; +``` + +If you are on trunk-based development, generate only the templates you +need: `agentops workflow generate --kinds pr,dev,prod`. + +## Quick start + +```bash +# 1. Make sure your eval works locally first. +agentops eval run + +# 2. Generate the PR gate first. +agentops workflow generate --kinds pr + +# 3. Configure GitHub (see sections below): +# - OIDC repo variables +# - dev environment +# - branch protection on develop and main + +# 4. Commit and push the PR gate. + +# 5. Only after deploy wiring is real, generate the full scaffold: +agentops workflow generate --kinds pr,dev,qa,prod --force +``` + +## Configuration walkthrough + +### 1. Repository variables (OIDC) + +In Settings → Secrets and variables → Actions → **Variables**, add: + +| Variable | Purpose | +|---|---| +| `AZURE_CLIENT_ID` | App registration / managed identity used for federated login | +| `AZURE_TENANT_ID` | Azure AD tenant | +| `AZURE_SUBSCRIPTION_ID` | Target subscription | +| `AZURE_AI_FOUNDRY_PROJECT_ENDPOINT` | Foundry project URL (used by the eval step) | + +Then on the Azure side, configure Workload Identity Federation +(federated credentials) on the app registration so it can be assumed +from GitHub Actions runs. See +[Microsoft's WIF docs](https://learn.microsoft.com/azure/active-directory/workload-identities/workload-identity-federation-create-trust?pivots=identity-wif-apps-methods-azp). + +### 2. GitHub Environments + +In Settings → Environments, create three: + +#### `dev` +- Usually no protection rules. +- Override env-specific variables here (e.g. dev resource group, dev + ACA app name). + +#### `qa` +- Optional: restrict deployment branches to `release/**`. +- Override env-specific variables for QA infra. + +#### `production` +- **Required reviewers**: at least one.
Deploys to PROD pause until + approved. +- Optional: **Wait timer** for an extra cool-down. +- Optional: **Deployment branches**: restrict to `main`. +- Override env-specific variables for production infra. + +Environment-level variables override repo-level ones automatically +when the workflow's `environment:` matches. + +### 3. Fill in Build and Deploy + +Each `agentops-deploy-*.yml` ships with `Build (placeholder)` and +`Deploy (placeholder)` steps. The DEV template lists commented example +snippets for the most common patterns. Copy the relevant one into all +three deploy templates. + +#### Container Apps + +```yaml +# Build +- name: Build image + run: | + az acr build \ + --registry "${{ vars.ACR_NAME }}" \ + --image "myapp:${{ github.sha }}" \ + . + +# Deploy +- name: Deploy to ACA + run: | + az containerapp update \ + --name "${{ vars.ACA_APP_NAME }}" \ + --resource-group "${{ vars.AZURE_RESOURCE_GROUP }}" \ + --image "${{ vars.ACR_NAME }}.azurecr.io/myapp:${{ github.sha }}" +``` + +#### App Service + +```yaml +# Build +- uses: actions/setup-python@v5 + with: { python-version: "3.11" } +- run: pip install -r requirements.txt -t ./dist +- run: cp -r src ./dist/ + +# Deploy +- uses: azure/webapps-deploy@v3 + with: + app-name: ${{ vars.WEBAPP_NAME }} + package: ./dist +``` + +#### Foundry hosted agent + +```yaml +# Build is typically empty: hosted agents are configured, not packaged. + +# Deploy: publish a new agent version with whatever your project uses +# to manage Foundry agents (project-specific tooling). +``` + +#### azd-managed app + +```yaml +# Build +- uses: Azure/setup-azd@v2 +- run: azd package --no-prompt + +# Deploy +- run: azd deploy --no-prompt + env: + AZURE_ENV_NAME: dev # or qa / prod +``` + +### 4. Branch protection + +In Settings → Branches, add a rule for **both `develop` and `main`**: + +- ✅ Require a pull request before merging. +- ✅ Require status checks to pass: select + **`AgentOps PR / Eval (PR gate)`**. 
+- (Optional) Require linear history. + +This makes the AgentOps eval a hard merge requirement. + +## Exit codes + +The eval step uses the AgentOps exit code contract to gate deploys: + +| Exit code | Meaning | Job result | +|---|---|---| +| `0` | Eval ran, all thresholds passed | ✅ pass | +| `2` | Eval ran, one or more thresholds failed | ❌ fail (deploy never runs) | +| `1` | Runtime / config error | ❌ fail | + +## Artifacts + +Each workflow uploads (always — even on failure): + +- `results.json` — machine-readable, versioned +- `report.md` — human-readable +- `cloud_evaluation.json` — present when using Foundry cloud evaluation; + contains a deep link to the New Foundry Experience Evaluations page + +Artifact names per workflow: + +| Workflow | Artifact name | +|---|---| +| `agentops-pr.yml` | `agentops-pr-results` | +| `agentops-deploy-dev.yml` | `agentops-dev-results` | +| `agentops-deploy-qa.yml` | `agentops-qa-results` | +| `agentops-deploy-prod.yml` | `agentops-prod-results` | + +## CLI reference + +```bash +agentops workflow generate --kinds pr # safe first PR gate +agentops workflow generate # all four templates (default) +agentops workflow generate --kinds pr,dev,prod # subset (trunk-based) +agentops workflow generate --force # overwrite existing files +agentops workflow generate --dir # different repo root +``` + +| Flag | Description | Default | +|---|---|---| +| `--kinds` | Comma-separated subset of `pr,dev,qa,prod` | all four | +| `--force` | Overwrite existing workflow files | `false` | +| `--dir` | Repository root | `.` | + +## Customisation tips + +- **Tighten thresholds for QA / PROD** - copy `agentops.yaml` to + `agentops-qa.yaml` / `agentops-prod.yaml` and tighten the + `thresholds:` block. Update the `inputs.config` default in the + matching workflow file. +- **Scheduled runs** — add a `schedule:` entry in `agentops-pr.yml` (or + a new file) to evaluate against `main` nightly. 
+- **Matrix per scenario** - if you have multiple AgentOps config files, extend + the eval job with `strategy.matrix.config:` and reference + `${{ matrix.config }}` in the eval step. +- **Regression baseline** - wire deploy templates to download the + previous run's `results.json` artifact and call + `agentops eval run --baseline `. + +## Migration from the older 3-template layout + +If your repository still has `agentops-eval.yml`, `agentops-eval-ci.yml`, +or `agentops-eval-cd.yml` from a prior version of AgentOps: + +1. Delete the three old files. +2. Run `agentops workflow generate`. +3. Re-add Build / Deploy commands you had customised. +4. Update branch-protection status checks to point at the new + `AgentOps PR` job. diff --git a/docs/tutorial-agent-workflow.md b/docs/tutorial-agent-workflow.md index 3cf86d6d..c9aba5e9 100644 --- a/docs/tutorial-agent-workflow.md +++ b/docs/tutorial-agent-workflow.md @@ -1,110 +1,291 @@ -# Tutorial — agent workflow with tool calling +# Tutorial: Build and evaluate a real tool-calling agent -Evaluate an agent that calls **tools** (function calls / actions). -AgentOps grades both the **final natural-language answer** *and* the -**tool selection / arguments** the agent chose along the way. +This tutorial is the tool-calling companion to the HTTP tutorial. You +will build an agent that chooses between support tools, deploy it to +Azure Container Apps, evaluate both the final answer and the tool trace, +and add a CI gate. -## Required dataset shape +Use this tutorial when you care about questions such as: -What turns a regular dataset into a tool-calling dataset is one or -both of these row fields: +- Did the agent call the right tool? +- Did it pass the right arguments? +- Did it avoid tools when the user only said hello? +- Did tool quality regress in a pull request? 
-| Field | What it is | +## How AgentOps grades tool workflows + +AgentOps uses normal answer-quality metrics plus tool-specific metrics +when the dataset includes `tool_calls` or `tool_definitions`. + +| Dataset field | Purpose | |---|---| -| `tool_definitions` | The tools the agent has access to (OpenAI tool-call schema). | -| `tool_calls` | The expected tool calls (name + arguments). | +| `tool_definitions` | Tool catalogue available to the agent. Include it on every JSONL row so each row is self-contained. | +| `tool_calls` | Expected tool trace: tool name, call id, and arguments. | +| `input` | User message sent to the agent. | +| `expected` | Reference final answer. | -When AgentOps sees `tool_calls` (or `tool_definitions`) in the -dataset rows, it auto-selects the **agent workflow** evaluators: -TaskCompletion, ToolCallAccuracy, IntentResolution, TaskAdherence, -plus the conversational baseline metrics that apply to the target -(Coherence, Fluency, latency, and any explicitly configured text metric). +For HTTP agents, the response also needs a field that contains the +actual tool trace. In this tutorial that field is `tool_calls`. -## 1. Bootstrap +## 1. Create the support-agent project -```bash -pip install "agentops-toolkit @ git+https://github.com/Azure/agentops.git@develop" -agentops init -export AZURE_AI_FOUNDRY_PROJECT_ENDPOINT="https://.services.ai.azure.com/api/projects/" +```powershell +mkdir support-tools-agent +Set-Location support-tools-agent + +python -m venv .venv +.\.venv\Scripts\Activate.ps1 +python -m pip install -U pip +python -m pip install "agentops-toolkit[foundry,agent] @ git+https://github.com/Azure/agentops.git@develop" ``` -## 2. 
Edit `agentops.yaml` +Create the same FastAPI tool-calling agent used by the HTTP tutorial: -For a Foundry prompt agent that already has tools registered: +```powershell +@' +from __future__ import annotations -```yaml -version: 1 -agent: "weather-bot:2" -dataset: .agentops/data/tools.jsonl +from fastapi import FastAPI +from pydantic import BaseModel + + +app = FastAPI(title="AgentOps Support Tools Agent") + + +class ChatRequest(BaseModel): + message: str + + +def lookup_order(order_id: str) -> dict[str, str]: + status = { + "ORD-12345": "in transit and expected to arrive tomorrow", + "ORD-99001": "shipped yesterday and is waiting for carrier pickup", + }.get(order_id, "not found") + return {"order_id": order_id, "status": status} + + +def refund_order(order_id: str, reason: str) -> dict[str, str]: + return {"order_id": order_id, "status": "refund_started", "reason": reason} + + +@app.get("/health") +def health() -> dict[str, str]: + return {"status": "ok"} + + +@app.post("/chat") +def chat(request: ChatRequest) -> dict[str, object]: + message = request.message + + if "ORD-12345" in message or "ORD-99001" in message: + order_id = "ORD-12345" if "ORD-12345" in message else "ORD-99001" + result = lookup_order(order_id) + return { + "text": f"Order {order_id} is {result['status']}.", + "tool_calls": [ + { + "type": "tool_call", + "tool_call_id": "lookup_1", + "name": "lookup_order", + "arguments": {"order_id": order_id}, + } + ], + } + + if "refund" in message.lower() and "ORD-77821" in message: + result = refund_order("ORD-77821", "arrived broken") + return { + "text": "I started a refund for ORD-77821 because it arrived broken.", + "tool_calls": [ + { + "type": "tool_call", + "tool_call_id": "refund_1", + "name": "refund_order", + "arguments": { + "order_id": result["order_id"], + "reason": result["reason"], + }, + } + ], + } + + return { + "text": "Hello! 
I can help with order status, refunds, or connecting you to support.", + "tool_calls": [], + } +'@ | Set-Content app.py -Encoding utf8 + +@' +fastapi==0.115.14 +uvicorn[standard]==0.35.0 +pydantic==2.11.9 +'@ | Set-Content requirements.txt -Encoding utf8 + +@' +FROM python:3.11-slim + +WORKDIR /app +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt +COPY app.py . + +EXPOSE 8000 +CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"] +'@ | Set-Content Dockerfile -Encoding utf8 ``` -For an HTTP-deployed agent that returns tool calls in its response -body: +The implementation is simple enough to inspect but still has the core +production contract: request JSON, business tools, final answer, and +structured tool trace. -```yaml +## 2. Deploy the agent to Azure + +```powershell +az login + +$env:AZURE_LOCATION = "eastus2" +$env:AZURE_RESOURCE_GROUP = "rg-agentops-tools-tutorial" +$env:ACA_NAME = "agentops-tools-$((Get-Date).ToString('MMddHHmm'))" + +az group create ` + --name $env:AZURE_RESOURCE_GROUP ` + --location $env:AZURE_LOCATION + +az containerapp up ` + --name $env:ACA_NAME ` + --resource-group $env:AZURE_RESOURCE_GROUP ` + --location $env:AZURE_LOCATION ` + --source . ` + --target-port 8000 ` + --ingress external + +$fqdn = az containerapp show ` + --name $env:ACA_NAME ` + --resource-group $env:AZURE_RESOURCE_GROUP ` + --query properties.configuration.ingress.fqdn ` + -o tsv + +$agentUrl = "https://$fqdn/chat" +Invoke-RestMethod -Uri "https://$fqdn/health" +``` + +This step matters because a tool workflow eval should exercise the same +HTTP boundary your production clients use, not a local-only shortcut. + +## 3. Initialize AgentOps + +```powershell +agentops init + +$env:AZURE_AI_FOUNDRY_PROJECT_ENDPOINT = "https://.services.ai.azure.com/api/projects/" +$env:AZURE_OPENAI_ENDPOINT = "https://.openai.azure.com" +$env:AZURE_OPENAI_DEPLOYMENT = "gpt-4o-mini" +$env:AZURE_AI_MODEL_DEPLOYMENT_NAME = "gpt-4o-mini" +``` + +## 4. 
Write `agentops.yaml` + +```powershell +@" version: 1 -agent: "https://aca-weather-bot.example.com/" -dataset: .agentops/data/tools.jsonl +agent: "$agentUrl" +dataset: .agentops/data/support-tools.jsonl request_field: message response_field: text tool_calls_field: tool_calls + +thresholds: + coherence: ">=3" + fluency: ">=3" + tool_call_accuracy: ">=0.8" + intent_resolution: ">=3" + task_adherence: ">=0.6" + avg_latency_seconds: "<=30" +"@ | Set-Content agentops.yaml -Encoding utf8 ``` -`tool_calls_field` tells AgentOps where in the response JSON to find -the structured tool calls (dot-path notation supported). +Why each threshold exists: + +| Threshold | What it protects | +|---|---| +| `coherence`, `fluency` | The final answer remains readable. | +| `tool_call_accuracy` | The tool name and arguments match the expected trace. | +| `intent_resolution` | The agent understood the user's task. | +| `task_adherence` | The agent did not drift away from the requested action. | +| `avg_latency_seconds` | The deployed endpoint stays responsive. | -## 3. Dataset shape (`tools.jsonl`) +## 5. 
Create the dataset -```jsonl -{"id":"1","input":"What's the weather in Paris, France?","expected":"Calls get_weather with location='Paris, France'.","tool_calls":[{"type":"function_call","name":"get_weather","arguments":{"location":"Paris, France"}}]} -{"id":"2","input":"How is the weather in Tokyo, Japan?","expected":"Calls get_weather with location='Tokyo, Japan'.","tool_calls":[{"type":"function_call","name":"get_weather","arguments":{"location":"Tokyo, Japan"}}]} +```powershell +New-Item -ItemType Directory -Force .agentops/data | Out-Null +@' +{"input":"Where is my order ORD-12345?","expected":"Order ORD-12345 is in transit and expected to arrive tomorrow.","tool_definitions":[{"type":"function","name":"lookup_order","description":"Look up an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"}},"required":["order_id"]}},{"type":"function","name":"refund_order","description":"Refund an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"},"reason":{"type":"string"}},"required":["order_id","reason"]}}],"tool_calls":[{"type":"tool_call","tool_call_id":"lookup_1","name":"lookup_order","arguments":{"order_id":"ORD-12345"}}]} +{"input":"I want a refund for ORD-77821, it arrived broken.","expected":"A refund is started for ORD-77821 because it arrived broken.","tool_definitions":[{"type":"function","name":"lookup_order","description":"Look up an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"}},"required":["order_id"]}},{"type":"function","name":"refund_order","description":"Refund an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"},"reason":{"type":"string"}},"required":["order_id","reason"]}}],"tool_calls":[{"type":"tool_call","tool_call_id":"refund_1","name":"refund_order","arguments":{"order_id":"ORD-77821","reason":"arrived broken"}}]} +{"input":"Hi there!","expected":"The assistant replies with a clear greeting and offers support options 
without calling a tool.","tool_definitions":[{"type":"function","name":"lookup_order","description":"Look up an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"}},"required":["order_id"]}},{"type":"function","name":"refund_order","description":"Refund an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"},"reason":{"type":"string"}},"required":["order_id","reason"]}}],"tool_calls":[]} +'@ | Set-Content .agentops/data/support-tools.jsonl -Encoding utf8 ``` -Include `tool_definitions` when you evaluate tool-call accuracy. The -evaluator needs the schema of every tool the agent should know about; -repeat the catalogue on each JSONL row so every row is self-contained. +The third row is as important as the first two. It asserts that greeting +messages should not call a business tool. -## 4. Run +## 6. Run and inspect the eval -```bash +```powershell agentops eval run +code .agentops/results/latest/report.md ``` -The report's per-row block shows: +The report should include: -- The agent's final text response -- The structured tool calls the agent emitted -- ToolCallAccuracy / IntentResolution / TaskAdherence scores +- Aggregate metric values. +- Threshold pass/fail status. +- Per-row tool traces. +- The latency of calls to the deployed Container App. -## 5. CI gate +If the tool-call metrics fail, inspect the row in the report before +changing thresholds. Usually the bug is an incorrect tool name, missing +argument, or response mapping mismatch. -In a PR check, fail when tool quality regresses. After your first -run, diff every subsequent run against it: +## 7. Add a PR gate -```bash -agentops eval run --baseline .agentops/results/latest/results.json +```powershell +agentops workflow generate --kinds pr --force ``` -AgentOps loads the baseline into memory before refreshing `latest/`, -so `latest/results.json` is shorthand for "the run before this one". 
-For CI, commit a stable baseline file (see -[tutorial-baseline-comparison.md](tutorial-baseline-comparison.md)). +Use PR-only first. Generate DEV/QA/PROD deploy workflows only after you +have configured GitHub Environments, OIDC federated credentials, and real +build/deploy commands. Otherwise a push to `main` will create a red +workflow that proves nothing about agent quality. -## Build a real tool-calling agent +Configure the `dev` environment variables and OIDC credential as shown in +[tutorial-http-agent.md](tutorial-http-agent.md#8-add-a-pr-evaluation-gate). -The repo's E2E test deploys a real Microsoft Agent Framework agent -(FastAPI on Container Apps) with a `get_weather` tool. See: +## 8. Run Watchdog -- `infra/e2e/agent-app/app.py` — minimal Agent Framework + FastAPI app -- `infra/e2e/perrun.bicep` — per-run ACA deployment -- `scripts/e2e_data/tools.jsonl` — the dataset used to grade it +```powershell +agentops agent analyze --severity-fail critical +code .agentops/agent/report.md +``` + +Watchdog reads `.agentops/results/*/results.json` and looks for quality, +latency, error, and safety findings. If you configure +`.agentops/agent.yaml` with an Application Insights resource id, it also +queries Azure Monitor. The coding-agent skill `agentops-agent` is just a +guided way to run these commands; it is not the runtime analyzer itself. + +## 9. Expand the scenario -That same setup is what `tutorial-http-agent.md` walks through. +After this tutorial passes, make the dataset closer to production: -## See also +- Add a row for an unknown order and expect a safe escalation. +- Add a refund row without an order id and expect no `refund_order` call. +- Add negative rows where the user asks for unrelated help. +- Save one passing `results.json` as a baseline and compare future runs + with `agentops eval run --baseline `. 
-- [tutorial-conversational-agent.md](tutorial-conversational-agent.md) — same shape, no tools -- [tutorial-http-agent.md](tutorial-http-agent.md) — deploying an HTTP agent -- [tutorial-rag.md](tutorial-rag.md) — RAG instead of tools -- [foundry-evaluation-sdk-built-in-evaluators.md](foundry-evaluation-sdk-built-in-evaluators.md) — full evaluator reference +## Cleanup + +```powershell +az group delete --name $env:AZURE_RESOURCE_GROUP --yes --no-wait +``` diff --git a/docs/tutorial-copilot-skills.md b/docs/tutorial-copilot-skills.md index 7816c44c..6773c488 100644 --- a/docs/tutorial-copilot-skills.md +++ b/docs/tutorial-copilot-skills.md @@ -1,174 +1,159 @@ -# Tutorial — Copilot-assisted AgentOps workflow +# Tutorial: Copilot skills for AgentOps -This tutorial shows how to use the AgentOps coding-agent skills as a -guided development workflow. Instead of memorizing the AgentOps schema, -you let Copilot inspect the project, generate the config and dataset, run -the eval, explain the report, and create the CI/CD workflow. +AgentOps skills are instructions for a coding agent. They help GitHub +Copilot, Copilot CLI, Cursor, or Claude Code inspect your repository and +produce the right AgentOps files and commands. -The tutorial is still fully executable without guessing: each Copilot -prompt is followed by the concrete file or command you should expect. +They are not the same thing as the AgentOps Watchdog runtime: -## What you will build +| Concept | Where it lives | What it does | +|---|---|---| +| Coding-agent skills | `.github/skills/` or `.claude/commands/` | Guide Copilot to create config, datasets, workflows, evals, reports, and Watchdog commands. | +| Watchdog runtime | `agentops agent analyze` / `agentops agent serve` | Reads real eval history, Azure Monitor telemetry, and Foundry metadata to produce findings. | +| `agentops-agent` skill | Installed skill file | The Copilot-facing workflow for invoking Watchdog. It does not invent findings. 
| -- A small HTTP support agent that answers three customer-service - questions. -- Installed AgentOps skills under `.github/skills/`. -- A flat `agentops.yaml` generated from project context. -- A JSONL dataset generated for the agent's behavior. -- One passing local evaluation and a readable `report.md`. -- GitHub Actions workflow files generated from the skill-guided flow. +## Implemented skills -## Prerequisites +The current CLI installs these skills: -- Python 3.11 or later. -- GitHub Copilot Chat or Copilot CLI with repository context. -- Azure CLI login and a judge-model deployment for AI-assisted evaluators. - -```powershell -python -m venv .venv -.\.venv\Scripts\Activate.ps1 -python -m pip install -U pip -python -m pip install "agentops-toolkit @ git+https://github.com/Azure/agentops.git@develop" - -az login -$env:AZURE_AI_FOUNDRY_PROJECT_ENDPOINT = "https://.services.ai.azure.com/api/projects/" -$env:AZURE_OPENAI_ENDPOINT = "https://.openai.azure.com" -$env:AZURE_OPENAI_DEPLOYMENT = "gpt-4o-mini" -``` - -> If you are testing unreleased AgentOps changes locally, install from -> your checkout instead: -> -> ```powershell -> python -m pip install -e "C:\path\to\agentops[foundry,agent]" -> ``` - -## 1. Create the sample agent - -Create `support_agent.py`: +| Skill | Responsibility | +|---|---| +| `agentops-config` | Generate or update flat `agentops.yaml` from project context. | +| `agentops-dataset` | Create realistic JSONL evaluation rows. | +| `agentops-eval` | Run evals, handle exit codes, and compare against baselines. | +| `agentops-report` | Explain `results.json` and `report.md`. | +| `agentops-workflow` | Generate supported GitHub Actions workflows and explain required GitHub/Azure wiring. | +| `agentops-agent` | Run and interpret Watchdog (`agentops agent analyze` / `serve`). 
| -```python -from http.server import BaseHTTPRequestHandler, HTTPServer -import json +There are no shipped `agentops-monitor`, `agentops-trace`, or +`agentops-regression` skills in this implementation. Monitoring, +tracing, and regression analysis belong to the Watchdog runtime and +reports until dedicated skills are implemented. +## Installation options -RESPONSES = { - "Where is my order ORD-12345?": "Order ORD-12345 is in transit and expected to arrive tomorrow.", - "Can I return a damaged headset from ORD-77821?": "Yes. Start a return for ORD-77821 and choose damaged item as the reason.", - "How do I contact a human support agent?": "I can connect you to a human support agent for account or order issues.", -} +### Option 1: VS Code extension +Install **AgentOps Skills** from the VS Code Marketplace. Use this when +you want Copilot in VS Code to discover the packaged skills through the +extension/plugin experience. -class Handler(BaseHTTPRequestHandler): - def do_POST(self): - length = int(self.headers.get("content-length", "0")) - body = json.loads(self.rfile.read(length)) - message = body.get("message", "") - text = RESPONSES.get(message, "I can help with order status, returns, and support escalation.") +### Option 2: CLI install into a repository - payload = json.dumps({"text": text}).encode("utf-8") - self.send_response(200) - self.send_header("content-type", "application/json") - self.send_header("content-length", str(len(payload))) - self.end_headers() - self.wfile.write(payload) +Use this when you want the skills checked into a repo: +```powershell +python -m venv .venv +.\.venv\Scripts\Activate.ps1 +python -m pip install -U pip +python -m pip install "agentops-toolkit[foundry,agent] @ git+https://github.com/Azure/agentops.git@develop" -HTTPServer(("127.0.0.1", 8790), Handler).serve_forever() +agentops skills install --platform copilot --force ``` -Start it in a second terminal: +Expected files: -```powershell -.\.venv\Scripts\Activate.ps1 -python 
support_agent.py +```text +.github/copilot-instructions.md +.github/skills/agentops-config/SKILL.md +.github/skills/agentops-dataset/SKILL.md +.github/skills/agentops-eval/SKILL.md +.github/skills/agentops-report/SKILL.md +.github/skills/agentops-workflow/SKILL.md +.github/skills/agentops-agent/SKILL.md ``` -## 2. Initialize AgentOps and install skills +Use `--platform claude` for `.claude/commands/*.md`, `--platform cursor` +for Cursor rules, or omit `--platform` and let AgentOps auto-detect the +repo. -```powershell -agentops init -agentops skills install --platform copilot --force -``` +## Scenario: use Copilot to set up AgentOps for a real HTTP agent -You should now have: +This scenario assumes you already built and deployed the Azure Container +Apps support agent from [tutorial-http-agent.md](tutorial-http-agent.md). +That tutorial gives you a URL like: ```text -agentops.yaml -.agentops/data/smoke.jsonl -.github/skills/ - agentops-config/SKILL.md - agentops-dataset/SKILL.md - agentops-eval/SKILL.md - agentops-report/SKILL.md - agentops-workflow/SKILL.md +https://..azurecontainerapps.io/chat ``` -The skills are workflow instructions for Copilot. They tell Copilot how -to inspect the workspace, which AgentOps files to create, which commands -are valid, and when to ask for missing values instead of inventing them. +Set local evaluator variables: + +```powershell +$env:AZURE_AI_FOUNDRY_PROJECT_ENDPOINT = "https://.services.ai.azure.com/api/projects/" +$env:AZURE_OPENAI_ENDPOINT = "https://.openai.azure.com" +$env:AZURE_OPENAI_DEPLOYMENT = "gpt-4o-mini" +$env:AZURE_AI_MODEL_DEPLOYMENT_NAME = "gpt-4o-mini" +``` -## 3. Ask Copilot to configure AgentOps +## 1. Ask Copilot to configure AgentOps -In Copilot Chat, ask: +Prompt Copilot: ```text -Use the agentops-config skill. Inspect this project and create an -AgentOps config for the local HTTP support agent on port 8790. +Use the agentops-config skill. 
Inspect this repository and create an +AgentOps config for the deployed HTTP support agent. The agent URL is +https://..azurecontainerapps.io/chat. The request +field is message, the final answer is in text, and returned tool calls +are in tool_calls. ``` Expected `agentops.yaml`: ```yaml version: 1 -agent: "http://127.0.0.1:8790/" -dataset: .agentops/data/support-agent.jsonl +agent: "https://..azurecontainerapps.io/chat" +dataset: .agentops/data/support-tools.jsonl request_field: message response_field: text +tool_calls_field: tool_calls thresholds: coherence: ">=3" fluency: ">=3" - similarity: ">=3" - avg_latency_seconds: "<=2" + tool_call_accuracy: ">=0.8" + intent_resolution: ">=3" + task_adherence: ">=0.6" + avg_latency_seconds: "<=30" ``` -Why this is the right config: +What Copilot should explain: -- `agent` is the local HTTP endpoint. -- `request_field` matches `body.get("message")` in `support_agent.py`. -- `response_field` matches the returned JSON key `{ "text": ... }`. -- The thresholds are intentionally simple for the first smoke gate. +- `agent` is the deployed URL, not a local loopback address. +- `request_field` matches the HTTP request body the agent expects. +- `response_field` and `tool_calls_field` match the JSON response. +- Tool thresholds are included because the endpoint returns tool traces. -## 4. Ask Copilot to generate the dataset +## 2. Ask Copilot to create the dataset -In Copilot Chat, ask: +Prompt Copilot: ```text -Use the agentops-dataset skill. Generate a small deterministic JSONL -dataset for the support agent behavior in support_agent.py. +Use the agentops-dataset skill. Generate a deterministic JSONL dataset +for this support agent. Include order lookup, refund, and no-tool greeting +rows. Include tool_definitions and expected tool_calls on every row. 
``` -Expected `.agentops/data/support-agent.jsonl`: +Expected file: `.agentops/data/support-tools.jsonl` ```jsonl -{"input":"Where is my order ORD-12345?","expected":"Order ORD-12345 is in transit and expected to arrive tomorrow."} -{"input":"Can I return a damaged headset from ORD-77821?","expected":"The customer can start a return for ORD-77821 and choose damaged item as the reason."} -{"input":"How do I contact a human support agent?","expected":"The assistant can connect the customer to a human support agent for account or order issues."} +{"input":"Where is my order ORD-12345?","expected":"Order ORD-12345 is in transit and expected to arrive tomorrow.","tool_definitions":[{"type":"function","name":"lookup_order","description":"Look up an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"}},"required":["order_id"]}},{"type":"function","name":"refund_order","description":"Refund an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"},"reason":{"type":"string"}},"required":["order_id","reason"]}}],"tool_calls":[{"type":"tool_call","tool_call_id":"lookup_1","name":"lookup_order","arguments":{"order_id":"ORD-12345"}}]} +{"input":"I want a refund for ORD-77821, it arrived broken.","expected":"A refund is started for ORD-77821 because it arrived broken.","tool_definitions":[{"type":"function","name":"lookup_order","description":"Look up an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"}},"required":["order_id"]}},{"type":"function","name":"refund_order","description":"Refund an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"},"reason":{"type":"string"}},"required":["order_id","reason"]}}],"tool_calls":[{"type":"tool_call","tool_call_id":"refund_1","name":"refund_order","arguments":{"order_id":"ORD-77821","reason":"arrived broken"}}]} +{"input":"Hi there!","expected":"The assistant replies with a clear greeting and offers support options without 
calling a tool.","tool_definitions":[{"type":"function","name":"lookup_order","description":"Look up an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"}},"required":["order_id"]}},{"type":"function","name":"refund_order","description":"Refund an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"},"reason":{"type":"string"}},"required":["order_id","reason"]}}],"tool_calls":[]} ``` -The dataset uses exact intents that the sample app implements. That makes -the first run a configuration smoke test: if it fails, you likely have a -field mapping, endpoint, auth, or environment problem rather than a -prompt-quality problem. +The no-tool greeting row prevents a common regression: agents that call a +business action even when the user only greets them. -## 5. Ask Copilot to run the eval +## 3. Ask Copilot to run the eval -In Copilot Chat, ask: +Prompt Copilot: ```text -Use the agentops-eval skill. Run the evaluation and explain any failure. +Use the agentops-eval skill. Run the AgentOps evaluation. If it fails, +explain whether the failure is config, endpoint reachability, auth, tool +trace mismatch, or threshold quality. ``` Expected command: @@ -180,113 +165,133 @@ agentops eval run Expected outputs: ```text -.agentops/results//results.json -.agentops/results//report.md .agentops/results/latest/results.json .agentops/results/latest/report.md ``` -Exit code `0` means the config, dataset, HTTP agent, and thresholds all -worked. Exit code `2` means the run completed but one or more thresholds -failed. Exit code `1` means a runtime/configuration error. +Exit codes: -## 6. Ask Copilot to interpret the report +| Code | Meaning | +|---|---| +| `0` | Eval succeeded and thresholds passed. | +| `2` | Eval succeeded but one or more thresholds failed. | +| `1` | Runtime or configuration error. | -In Copilot Chat, ask: +## 4. Ask Copilot to explain the report + +Prompt Copilot: ```text -Use the agentops-report skill. 
Read the latest report and summarize the -strongest rows, weakest rows, and next improvement. +Use the agentops-report skill. Read .agentops/results/latest/report.md +and summarize the verdict, weakest metric, weakest row, and next code or +dataset change. ``` -A useful answer should not just say "pass" or "fail". It should point to: +A useful answer should cite concrete report evidence. It should not just +say "passed". For tool agents, it should mention: -- the threshold table in `.agentops/results/latest/report.md`; -- the lowest-scoring row or metric; -- whether latency is agent runtime or evaluator overhead; -- a concrete next change, such as improving an answer or tightening a - threshold after repeated passing runs. +- whether text-quality thresholds passed; +- whether `tool_call_accuracy` passed; +- which row had the weakest intent or adherence score; +- whether latency looks like an endpoint problem. -## 7. Ask Copilot to add the PR gate +## 5. Ask Copilot to add CI -In Copilot Chat, ask: +Prompt Copilot: ```text -Use the agentops-workflow skill. Generate the GitHub Actions workflow -files and tell me which GitHub environment variables are required. +Use the agentops-workflow skill. Add a GitHub Actions PR gate only, then +tell me which GitHub environment variables and Azure federated credential +are required before I push it. ``` Expected command: ```powershell -agentops workflow generate +agentops workflow generate --kinds pr --force +``` + +Why PR-only? The full `agentops workflow generate` scaffold includes +DEV/QA/PROD deploy workflows. Those are correct for a real release +pipeline, but they must not be pushed until GitHub Environments, Azure +OIDC, and build/deploy placeholders are configured. Otherwise the first +push to `main` will create a red deploy workflow before the tutorial +teaches anything useful. 
+ +Configure the `dev` environment variables: + +```powershell +$repo = "<owner>/<repo>" + +gh api -X PUT "repos/$repo/environments/dev" | Out-Null +gh variable set AZURE_CLIENT_ID --repo $repo --env dev --body "<client-id>" +gh variable set AZURE_TENANT_ID --repo $repo --env dev --body "<tenant-id>" +gh variable set AZURE_SUBSCRIPTION_ID --repo $repo --env dev --body "<subscription-id>" +gh variable set AZURE_AI_FOUNDRY_PROJECT_ENDPOINT --repo $repo --env dev --body $env:AZURE_AI_FOUNDRY_PROJECT_ENDPOINT +gh variable set AZURE_OPENAI_ENDPOINT --repo $repo --env dev --body $env:AZURE_OPENAI_ENDPOINT +gh variable set AZURE_OPENAI_DEPLOYMENT --repo $repo --env dev --body $env:AZURE_OPENAI_DEPLOYMENT ``` -Expected workflow files: +Add an Azure federated credential with subject: ```text -.github/workflows/agentops-pr.yml -.github/workflows/agentops-deploy-dev.yml -.github/workflows/agentops-deploy-qa.yml -.github/workflows/agentops-deploy-prod.yml +repo:<owner>/<repo>:environment:dev ``` -For this HTTP tutorial, the PR gate needs the same evaluator-model values -you used locally: +Open a PR and wait for `AgentOps PR` to pass before merging. -| GitHub variable | Purpose | -|---|---| -| `AZURE_CLIENT_ID` | OIDC identity used by `azure/login`. | -| `AZURE_TENANT_ID` | Tenant for the OIDC login. | -| `AZURE_SUBSCRIPTION_ID` | Azure subscription for the login. | -| `AZURE_AI_FOUNDRY_PROJECT_ENDPOINT` | Foundry project used by AI-assisted evaluators. | -| `AZURE_OPENAI_ENDPOINT` | Azure OpenAI endpoint for the judge model. | -| `AZURE_OPENAI_DEPLOYMENT` | Judge model deployment, for example `gpt-4o-mini`. | +## 6. Ask Copilot to run Watchdog -If your HTTP agent is remote and protected, also add the token variable -referenced by `auth_header_env`. +Prompt Copilot: -Because this tutorial starts the sample agent on `127.0.0.1`, GitHub -Actions must start that process before `agentops eval run`.
For this -sample repo, add this step between **Install AgentOps Toolkit** and -**Run AgentOps eval** in `agentops-pr.yml`: +```text +Use the agentops-agent skill. Run Watchdog against this repository's +AgentOps results and summarize the top findings. Do not invent telemetry; +if a source is skipped, say why. +``` -```yaml - - name: Start local tutorial agent - run: | - python support_agent.py & - sleep 2 +Expected command: + +```powershell +agentops agent analyze --severity-fail critical ``` -For a deployed ACA/AKS/App Service endpoint, skip that step and point -`agent:` at the public or private URL your runner can reach. +Expected output: -## 8. Push the tutorial repo +```text +.agentops/agent/report.md +``` -```powershell -git init -b main -git add . -git commit -m "feat: add Copilot-assisted AgentOps eval" -gh repo create "agentops-copilot-skills-" --public --source=. --push +The Watchdog report should list: + +- sources that ran, such as results history; +- sources that skipped, such as Azure Monitor if `.agentops/agent.yaml` + has no Application Insights resource id; +- findings sorted by severity; +- recommendations generated from the analyzer, not from Copilot guesses. + +## 7. When to generate the full CI/CD scaffold + +After the PR gate is green and you have real deployment commands, ask: + +```text +Use the agentops-workflow skill. Generate the full dev/qa/prod workflow +scaffold and wire it to this repository's actual Azure Container Apps +build and deploy commands. ``` -The first PR against `main` or `develop` will run `agentops-pr.yml`. -When it finishes, open the workflow artifact or PR comment to view the -same `report.md` you inspected locally. 
+Expected command: -## What Copilot should have learned +```powershell +agentops workflow generate --kinds pr,dev,qa,prod --force +``` -The skills keep Copilot inside the AgentOps contract: +Before pushing those files, verify: -- `agentops-config` creates a flat `agentops.yaml`, not legacy - `run.yaml` / bundle / dataset config files. -- `agentops-dataset` creates rows tailored to the app instead of generic - trivia. -- `agentops-eval` runs `agentops eval run` and respects exit codes. -- `agentops-report` turns metrics into actionable insights. -- `agentops-workflow` generates the standard GitFlow workflow scaffold - without inventing unsupported flags or commands. +- GitHub Environments `dev`, `qa`, and `production` exist. +- Production has required reviewers. +- Azure federated credentials exist for every workflow subject. +- Build and deploy placeholders are replaced with real commands. -That is the intended AgentOps development loop: Copilot accelerates the -file creation and interpretation, while AgentOps supplies the repeatable -evaluation contract. +That is the difference between a useful CI/CD tutorial and a red Action +that only proves the repo was not configured. diff --git a/docs/tutorial-http-agent.md b/docs/tutorial-http-agent.md index e9fa7f14..4397c6a7 100644 --- a/docs/tutorial-http-agent.md +++ b/docs/tutorial-http-agent.md @@ -1,128 +1,214 @@ -# Tutorial: HTTP Agent Evaluation +# Tutorial: HTTP agent on Azure Container Apps -This tutorial shows how to evaluate an agent that is exposed as an -HTTP/JSON endpoint. That endpoint can be a local development server, -Azure Container Apps, AKS, App Service, FastAPI, Express, Microsoft Agent -Framework, LangGraph, or any service that accepts a prompt and returns a -text response. +This tutorial builds a real HTTP tool-calling agent, deploys it to Azure +Container Apps, evaluates it with AgentOps, adds a GitHub Actions PR gate, +and runs the Watchdog analyzer over the produced eval history. 
-AgentOps treats HTTP agents the same way it treats Foundry agents after -the call succeeds: it loads JSONL rows, POSTs one row at a time, extracts -the answer, runs evaluators, and writes `results.json` plus `report.md`. +The important idea is that AgentOps does not care which framework hosts +your agent. For HTTP targets it needs only: -## What you will build +1. A URL to call. +2. The JSON field that receives the user message. +3. The JSON field that contains the final response. +4. Optionally, the JSON field that contains structured tool calls. -- A tiny local HTTP agent so you can run the tutorial without deploying - anything. -- A flat `agentops.yaml` that points to the HTTP URL. -- A JSONL dataset with deterministic support-style questions. -- One `agentops eval run` producing a passing report. +## What you will build -Use the same pattern later by changing only the `agent:` URL and field -mapping for your real deployed agent. +- A FastAPI customer-support agent with two real Python tools: + `lookup_order` and `refund_order`. +- A Docker image deployed to Azure Container Apps. +- An `agentops.yaml` pointing to the public Container Apps URL. +- A JSONL dataset that checks both final answers and tool-call behavior. +- A passing local eval, a PR workflow, and a Watchdog report. 
## Prerequisites ```powershell +az login +gh auth login + python -m venv .venv .\.venv\Scripts\Activate.ps1 python -m pip install -U pip -python -m pip install "agentops-toolkit @ git+https://github.com/Azure/agentops.git@develop" -``` - -If you use AI-assisted evaluators such as Similarity or Fluency, also set -the judge model and sign in to Azure: +python -m pip install "agentops-toolkit[foundry,agent] @ git+https://github.com/Azure/agentops.git@develop" -```powershell -az login $env:AZURE_AI_FOUNDRY_PROJECT_ENDPOINT = "https://.services.ai.azure.com/api/projects/" $env:AZURE_OPENAI_ENDPOINT = "https://.openai.azure.com" $env:AZURE_OPENAI_DEPLOYMENT = "gpt-4o-mini" +$env:AZURE_AI_MODEL_DEPLOYMENT_NAME = "gpt-4o-mini" ``` -## 1. Create a local HTTP agent +AgentOps is installed from the `develop` branch in this tutorial because +the 1.0 tutorial surface is still being tested before the PyPI release. + +## 1. Create the HTTP agent -Create `http_agent.py`: +Create `app.py`: ```python -from http.server import BaseHTTPRequestHandler, HTTPServer -import json - - -ANSWERS = { - "Where is my order ORD-12345?": { - "text": "Order ORD-12345 is in transit and expected to arrive tomorrow.", - "tool_calls": [{"type": "tool_call", "tool_call_id": "c1", "name": "lookup_order", "arguments": {"order_id": "ORD-12345"}}], - }, - "I want a refund for ORD-77821, it arrived broken.": { - "text": "I started a refund for ORD-77821 because it arrived broken.", - "tool_calls": [{"type": "tool_call", "tool_call_id": "c2", "name": "refund_order", "arguments": {"order_id": "ORD-77821", "reason": "arrived broken"}}], - }, - "Hi there!": { - "text": "Hello! 
I can help with order status, refunds, or connecting you to a human support agent.", +from __future__ import annotations + +from fastapi import FastAPI +from pydantic import BaseModel + + +app = FastAPI(title="AgentOps Support Tools Agent") + + +class ChatRequest(BaseModel): + message: str + + +def lookup_order(order_id: str) -> dict[str, str]: + status = { + "ORD-12345": "in transit and expected to arrive tomorrow", + "ORD-99001": "shipped yesterday and is waiting for carrier pickup", + }.get(order_id, "not found") + return {"order_id": order_id, "status": status} + + +def refund_order(order_id: str, reason: str) -> dict[str, str]: + return {"order_id": order_id, "status": "refund_started", "reason": reason} + + +@app.get("/health") +def health() -> dict[str, str]: + return {"status": "ok"} + + +@app.post("/chat") +def chat(request: ChatRequest) -> dict[str, object]: + message = request.message + + if "ORD-12345" in message or "ORD-99001" in message: + order_id = "ORD-12345" if "ORD-12345" in message else "ORD-99001" + result = lookup_order(order_id) + return { + "text": f"Order {order_id} is {result['status']}.", + "tool_calls": [ + { + "type": "tool_call", + "tool_call_id": "lookup_1", + "name": "lookup_order", + "arguments": {"order_id": order_id}, + } + ], + } + + if "refund" in message.lower() and "ORD-77821" in message: + result = refund_order("ORD-77821", "arrived broken") + return { + "text": "I started a refund for ORD-77821 because it arrived broken.", + "tool_calls": [ + { + "type": "tool_call", + "tool_call_id": "refund_1", + "name": "refund_order", + "arguments": { + "order_id": result["order_id"], + "reason": result["reason"], + }, + } + ], + } + + return { + "text": "Hello! I can help with order status, refunds, or connecting you to support.", "tool_calls": [], - }, -} + } +``` +This is intentionally small but not fake: the agent has a real request +contract, real tool functions, and returns the structured tool trace that +AgentOps can grade. 
+ +Create `requirements.txt`: + +```text +fastapi==0.115.14 +uvicorn[standard]==0.35.0 +pydantic==2.11.9 +``` -class Handler(BaseHTTPRequestHandler): - def do_POST(self): - length = int(self.headers.get("content-length", "0")) - body = json.loads(self.rfile.read(length)) - message = body.get("message", "") - response = ANSWERS.get(message, {"text": "I do not know yet.", "tool_calls": []}) +Create `Dockerfile`: - payload = json.dumps(response).encode("utf-8") - self.send_response(200) - self.send_header("content-type", "application/json") - self.send_header("content-length", str(len(payload))) - self.end_headers() - self.wfile.write(payload) +```dockerfile +FROM python:3.11-slim +WORKDIR /app +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt +COPY app.py . -HTTPServer(("127.0.0.1", 8787), Handler).serve_forever() +EXPOSE 8000 +CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"] ``` -Start it in a second terminal: +## 2. Deploy to Azure Container Apps + +Choose names once: ```powershell -.\.venv\Scripts\Activate.ps1 -python http_agent.py +$env:AZURE_LOCATION = "eastus2" +$env:AZURE_RESOURCE_GROUP = "rg-agentops-http-tutorial" +$env:ACA_NAME = "agentops-http-agent-$((Get-Date).ToString('MMddHHmm'))" ``` -Why this local server? It lets you prove the AgentOps HTTP contract before -you involve Container Apps, auth, networking, or deployment variables. -When this passes locally, a remote HTTP target is just a URL swap. +Deploy the container: -## 2. Initialize AgentOps +```powershell +az group create ` + --name $env:AZURE_RESOURCE_GROUP ` + --location $env:AZURE_LOCATION + +az containerapp up ` + --name $env:ACA_NAME ` + --resource-group $env:AZURE_RESOURCE_GROUP ` + --location $env:AZURE_LOCATION ` + --source . 
` + --target-port 8000 ` + --ingress external + +$fqdn = az containerapp show ` + --name $env:ACA_NAME ` + --resource-group $env:AZURE_RESOURCE_GROUP ` + --query properties.configuration.ingress.fqdn ` + -o tsv + +$agentUrl = "https://$fqdn/chat" +$agentUrl +``` -Back in your first terminal: +Smoke-test the deployed service: ```powershell -agentops init +Invoke-RestMethod -Uri "https://$fqdn/health" +Invoke-RestMethod ` + -Uri $agentUrl ` + -Method Post ` + -ContentType "application/json" ` + -Body '{"message":"I want a refund for ORD-77821, it arrived broken."}' ``` -This creates: +## 3. Initialize AgentOps -```text -agentops.yaml -.agentops/ - data/smoke.jsonl - results/ -.github/skills/ +```powershell +agentops init ``` -AgentOps 1.0 uses one flat config file at the project root. You do not -need legacy `run-http.yaml`, bundle YAML, or dataset YAML files. +This creates `.agentops/`, a starter `agentops.yaml`, and coding-agent +skills under `.github/skills/`. The skills are guidance for Copilot or +another coding agent; they are not the Watchdog runtime. -## 3. Configure the HTTP endpoint +## 4. Configure the HTTP eval -Replace `agentops.yaml` with: +Replace `agentops.yaml`: -```yaml +```powershell +@" version: 1 -agent: "http://127.0.0.1:8787/" -dataset: .agentops/data/http-support.jsonl +agent: "$agentUrl" +dataset: .agentops/data/http-support-tools.jsonl request_field: message response_field: text @@ -133,120 +219,156 @@ thresholds: fluency: ">=3" tool_call_accuracy: ">=0.8" intent_resolution: ">=3" - task_adherence: ">=0.8" - avg_latency_seconds: "<=2" + task_adherence: ">=0.6" + avg_latency_seconds: "<=30" +"@ | Set-Content agentops.yaml -Encoding utf8 ``` -The HTTP field mapping controls the JSON protocol: +The field mapping is the HTTP contract: | Config field | Meaning | |---|---| | `request_field: message` | AgentOps sends `{"message": ""}`. | -| `response_field: text` | AgentOps reads the final answer from `response.text`. 
Dot paths such as `output.text` are supported. | -| `tool_calls_field: tool_calls` | AgentOps reads structured tool calls from `response.tool_calls` so tool metrics can run. | - -For a deployed endpoint that requires a Bearer token, add: - -```yaml -auth_header_env: AGENT_TOKEN -``` +| `response_field: text` | AgentOps reads the natural-language answer from `response.text`. | +| `tool_calls_field: tool_calls` | AgentOps reads the structured tool trace from `response.tool_calls`. | -Then set `$env:AGENT_TOKEN` before running the eval. +If your real endpoint is protected, add `auth_header_env: AGENT_TOKEN` +and set that environment variable before running the eval. -## 4. Create the dataset +## 5. Create the tool-calling dataset -Create `.agentops/data/http-support.jsonl`: +Create `.agentops/data/http-support-tools.jsonl`: -```jsonl -{"input":"Where is my order ORD-12345?","expected":"Order ORD-12345 is in transit and expected to arrive tomorrow.","tool_definitions":[{"type":"function","name":"lookup_order","description":"Look up an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"}},"required":["order_id"]}},{"type":"function","name":"refund_order","description":"Refund an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"},"reason":{"type":"string"}},"required":["order_id","reason"]}}],"tool_calls":[{"type":"tool_call","tool_call_id":"c1","name":"lookup_order","arguments":{"order_id":"ORD-12345"}}]} -{"input":"I want a refund for ORD-77821, it arrived broken.","expected":"A refund is started for ORD-77821 because it arrived broken.","tool_definitions":[{"type":"function","name":"lookup_order","description":"Look up an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"}},"required":["order_id"]}},{"type":"function","name":"refund_order","description":"Refund an 
order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"},"reason":{"type":"string"}},"required":["order_id","reason"]}}],"tool_calls":[{"type":"tool_call","tool_call_id":"c2","name":"refund_order","arguments":{"order_id":"ORD-77821","reason":"arrived broken"}}]} +```powershell +New-Item -ItemType Directory -Force .agentops/data | Out-Null +@' +{"input":"Where is my order ORD-12345?","expected":"Order ORD-12345 is in transit and expected to arrive tomorrow.","tool_definitions":[{"type":"function","name":"lookup_order","description":"Look up an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"}},"required":["order_id"]}},{"type":"function","name":"refund_order","description":"Refund an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"},"reason":{"type":"string"}},"required":["order_id","reason"]}}],"tool_calls":[{"type":"tool_call","tool_call_id":"lookup_1","name":"lookup_order","arguments":{"order_id":"ORD-12345"}}]} +{"input":"I want a refund for ORD-77821, it arrived broken.","expected":"A refund is started for ORD-77821 because it arrived broken.","tool_definitions":[{"type":"function","name":"lookup_order","description":"Look up an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"}},"required":["order_id"]}},{"type":"function","name":"refund_order","description":"Refund an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"},"reason":{"type":"string"}},"required":["order_id","reason"]}}],"tool_calls":[{"type":"tool_call","tool_call_id":"refund_1","name":"refund_order","arguments":{"order_id":"ORD-77821","reason":"arrived broken"}}]} {"input":"Hi there!","expected":"The assistant replies with a clear greeting and offers support options without calling a tool.","tool_definitions":[{"type":"function","name":"lookup_order","description":"Look up an 
order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"}},"required":["order_id"]}},{"type":"function","name":"refund_order","description":"Refund an order.","parameters":{"type":"object","properties":{"order_id":{"type":"string"},"reason":{"type":"string"}},"required":["order_id","reason"]}}],"tool_calls":[]} +'@ | Set-Content .agentops/data/http-support-tools.jsonl -Encoding utf8 ``` -Each row has: - -- `input` — what AgentOps sends to the HTTP service. -- `expected` — the reference answer for text-quality metrics. -- `tool_calls` — the expected structured tool behavior. Omit this field - if your HTTP endpoint does not expose tool calls. -- `tool_definitions` — the function-tool schema available to the agent. - Tool-call accuracy evaluators need this catalogue on each row. +Each row is self-contained. The expected `tool_calls` define what the +agent should do, and `tool_definitions` define the tool catalogue the +evaluator uses to judge selection and arguments. -## 5. Run the evaluation +## 6. Run the eval ```powershell agentops eval run ``` -The CLI should print a passing threshold summary and write: +Expected outputs: ```text .agentops/results//results.json .agentops/results//report.md -.agentops/results/latest/ +.agentops/results/latest/results.json +.agentops/results/latest/report.md ``` -Open the Markdown report: +Open the report: ```powershell code .agentops/results/latest/report.md ``` -The report shows the aggregate metrics, threshold table, and per-row -details. For the first two rows, the per-row section should include the -tool calls returned by the HTTP server. +The report should show text-quality metrics plus tool metrics such as +`tool_call_accuracy`, `intent_resolution`, and `task_adherence`. -## 6. Point it at a real service +## 7. 
Add a PR evaluation gate -When you deploy the agent, keep the dataset and thresholds but change the -URL and field mapping: - -```yaml -version: 1 -agent: "https://your-agent.region.azurecontainerapps.io/chat" -dataset: .agentops/data/http-support.jsonl +For a tutorial or a new repo, generate only the PR gate until Azure OIDC +and deploy placeholders are configured. This avoids the common mistake of +pushing DEV/QA/PROD deploy workflows that immediately fail on `main`. -request_field: message -response_field: output.text -tool_calls_field: output.tool_calls -auth_header_env: AGENT_TOKEN +```powershell +agentops workflow generate --kinds pr --force ``` -Run the same command: +Configure the `dev` GitHub Environment variables used by +`.github/workflows/agentops-pr.yml`: ```powershell -agentops eval run +$repo = "<owner>/<repo>" + +gh api -X PUT "repos/$repo/environments/dev" | Out-Null +gh variable set AZURE_CLIENT_ID --repo $repo --env dev --body "<client-id>" +gh variable set AZURE_TENANT_ID --repo $repo --env dev --body "<tenant-id>" +gh variable set AZURE_SUBSCRIPTION_ID --repo $repo --env dev --body "<subscription-id>" +gh variable set AZURE_AI_FOUNDRY_PROJECT_ENDPOINT --repo $repo --env dev --body $env:AZURE_AI_FOUNDRY_PROJECT_ENDPOINT +gh variable set AZURE_OPENAI_ENDPOINT --repo $repo --env dev --body $env:AZURE_OPENAI_ENDPOINT +gh variable set AZURE_OPENAI_DEPLOYMENT --repo $repo --env dev --body $env:AZURE_OPENAI_DEPLOYMENT ``` -If the local server passed but the remote service fails, the issue is -usually deployment reachability, auth, or a response-field mismatch rather -than evaluator logic. +On the Azure app registration, add a federated credential for: -## Troubleshooting +```text +repo:<owner>/<repo>:environment:dev +``` -| Symptom | What to check | -|---|---| -| `connection refused` | The server is not running or the URL/port is wrong. | -| `Response field 'text' not found` | Update `response_field` to match your JSON response shape.
| -| `tool_call_accuracy` is missing | Add `tool_calls_field` and make sure the response includes structured tool calls. | -| AI evaluator auth error | Run `az login` and set the Azure OpenAI / Foundry environment variables. | +Then open a PR and verify the `AgentOps PR` workflow is green before +merging. -## Exit codes +## 8. Run the Watchdog analyzer -| Code | Meaning | -|---|---| -| `0` | Evaluation succeeded and all thresholds passed. | -| `2` | Evaluation succeeded but at least one threshold failed. | -| `1` | Runtime or configuration error. | +Watchdog is a runtime analyzer, not a coding-agent skill. The +`agentops-agent` skill only tells Copilot how to call it. -## CI/CD integration +Start with results-history analysis: -After the local run passes, generate workflow files with: +```powershell +agentops agent analyze --severity-fail critical +code .agentops/agent/report.md +``` + +If you also want Azure Monitor data, create an Application Insights +resource and point `.agentops/agent.yaml` at it: ```powershell -agentops workflow generate +$appInsightsName = "$env:ACA_NAME-ai" +$appInsightsId = az monitor app-insights component create ` + --app $appInsightsName ` + --location $env:AZURE_LOCATION ` + --resource-group $env:AZURE_RESOURCE_GROUP ` + --application-type web ` + --query id ` + -o tsv + +@" +version: 1 +sources: + results_history: + enabled: true + path: .agentops/results + lookback_runs: 10 + azure_monitor: + enabled: true + app_insights_resource_id: $appInsightsId + foundry_control: + enabled: true + project_endpoint_env: AZURE_AI_FOUNDRY_PROJECT_ENDPOINT +"@ | Set-Content .agentops/agent.yaml -Encoding utf8 + +agentops agent analyze --severity-fail critical ``` -The generated PR workflow uses the same `agentops eval run` exit codes to -gate pull requests. See [ci-github-actions.md](ci-github-actions.md) for -the GitHub environment and OIDC setup. +The Watchdog report lists which sources ran and which were skipped. 
Do +not treat skipped telemetry sources as success; wire them before relying +on production-health conclusions. + +## Troubleshooting + +| Symptom | What to check | +|---|---| +| `connection refused` | You are still pointing at a local URL or the container app has no external ingress. | +| `Response field 'text' not found` | Update `response_field` to match the JSON response. | +| Tool metrics are missing | Add `tool_calls_field` and return structured tool calls from the endpoint. | +| GitHub Action fails in `azure/login` | Create the GitHub `dev` environment variables and the Azure federated credential before pushing the workflow. | +| AI evaluator auth fails | Confirm OIDC role assignments or run `az login` locally. | + +## Cleanup + +```powershell +az group delete --name $env:AZURE_RESOURCE_GROUP --yes --no-wait +``` diff --git a/plugins/agentops/README.md b/plugins/agentops/README.md index 10032811..adf5c444 100644 --- a/plugins/agentops/README.md +++ b/plugins/agentops/README.md @@ -1,85 +1,86 @@ -# AgentOps Skills for GitHub Copilot - -Copilot agent skills for running standardized evaluation workflows with -[AgentOps Toolkit](https://github.com/Azure/agentops) and Microsoft Foundry agents. 
- -## Skills - -| Skill | What it does | -|---|---| -| **agentops-eval** | Run evaluations end to end — single runs, multi-model benchmarks, and N-run comparisons | -| **agentops-config** | Infer the evaluation scenario from your codebase and generate `run.yaml` | -| **agentops-dataset** | Generate evaluation datasets (JSONL + YAML config) tailored to the project | -| **agentops-report** | Interpret evaluation reports, explain scores, and regenerate `report.md` | -| **agentops-regression** | Investigate regressions — compare runs, analyze per-row scores, identify root causes | -| **agentops-workflow** | Generate CI/CD pipelines (GitHub Actions) with PR gating and post-merge evaluation | -| **agentops-trace** | Set up OTLP tracing for evaluation runs | -| **agentops-monitor** | Guidance on monitoring evaluation quality over time | - -## Installation - -### VS Code Extension Marketplace - -Install from the -[VS Code Marketplace](https://marketplace.visualstudio.com/items?itemName=AgentOpsToolkit.agentops-toolkit) -or search **"AgentOps Skills"** in the VS Code Extensions view. - -### Agent Plugin Marketplace - -The AgentOps plugin is also available through the cross-tool **Agent Plugin -Marketplace**, which works with VS Code Copilot, Copilot CLI, and Claude Code. - -**VS Code** — add this to your `.vscode/settings.json`: - -```json -{ - "chat.plugins.extraKnownMarketplaces": ["Azure/agentops"], - "chat.plugins.enabledPlugins": ["agentops-toolkit"] -} -``` - -**Claude Code** — register the marketplace: - -```bash -claude plugin marketplace add Azure/agentops -``` - -## Usage - -Open **Copilot Chat** in VS Code and describe what you want to do. -Skills are invoked automatically when your request matches their domain. 
- -### Configure and run an evaluation - -``` -> Set up an evaluation for my Foundry agent -> Generate a dataset for my RAG pipeline -> Run the default evaluation against my agent -``` - -### Benchmark and compare - -``` -> Benchmark gpt-4o vs gpt-4o-mini using the smoke dataset -> Compare the last two runs and tell me what changed -``` - -### Understand results - -``` -> Explain the scores in my latest report -> Which rows failed the groundedness threshold? -> Why did similarity drop between these two runs? -``` - -### Automate with CI/CD - -``` -> Generate a GitHub Actions workflow that gates PRs on evaluation quality -``` - -## Links - -- [AgentOps Toolkit](https://github.com/Azure/agentops) — CLI and documentation -- [Tutorial: Basic Foundry Agent](https://github.com/Azure/agentops/blob/main/docs/tutorial-basic-foundry-agent.md) -- [How It Works](https://github.com/Azure/agentops/blob/main/docs/how-it-works.md) +# AgentOps Skills for Coding Agents + +This extension packages the same AgentOps skills that the CLI installs +with `agentops skills install`. Skills are **instructions for a coding +agent** such as GitHub Copilot, Copilot CLI, Cursor, or Claude Code. They +help the coding agent create config, datasets, reports, and workflows in +your repository. + +They are different from the **AgentOps Watchdog runtime agent**: + +- **Skills** live in `.github/skills/` or `.claude/commands/` and guide a + coding assistant. +- **Watchdog** is the runtime CLI/server behind `agentops agent analyze` + and `agentops agent serve`; it reads real eval history, Azure Monitor + telemetry, and Foundry metadata. +- The `agentops-agent` skill is only the coding-agent front door to that + Watchdog runtime. It does not fabricate findings. + +## Implemented skills + +| Skill | What it does | +|---|---| +| **agentops-config** | Inspect the workspace and generate or update flat `agentops.yaml`. | +| **agentops-dataset** | Generate realistic JSONL evaluation rows grounded in the app. 
| +| **agentops-eval** | Run `agentops eval run`, handle exit codes, and compare with `--baseline`. | +| **agentops-report** | Explain `results.json` / `report.md` and suggest concrete next actions. | +| **agentops-workflow** | Generate the supported GitHub Actions CI/CD scaffold and explain required GitHub/Azure wiring. | +| **agentops-agent** | Run and interpret the Watchdog runtime (`agentops agent analyze` / `serve`). | + +There are no shipped `agentops-monitor`, `agentops-trace`, or +`agentops-regression` skills in the current implementation. + +## Installation options + +### Option 1: VS Code extension + +Install from the +[VS Code Marketplace](https://marketplace.visualstudio.com/items?itemName=AgentOpsToolkit.agentops-toolkit) +or search **AgentOps Skills** in the VS Code Extensions view. + +Use this when you want Copilot in VS Code to discover the packaged +skills from the extension/plugin. + +### Option 2: CLI install into a repository + +Run this from the repository where you want skills checked in: + +```bash +python -m pip install "agentops-toolkit @ git+https://github.com/Azure/agentops.git@develop" +agentops skills install --platform copilot --force +``` + +This writes: + +```text +.github/copilot-instructions.md +.github/skills/agentops-config/SKILL.md +.github/skills/agentops-dataset/SKILL.md +.github/skills/agentops-eval/SKILL.md +.github/skills/agentops-report/SKILL.md +.github/skills/agentops-workflow/SKILL.md +.github/skills/agentops-agent/SKILL.md +``` + +Use `--platform claude` for `.claude/commands/*.md`, or omit +`--platform` and let AgentOps auto-detect the coding agent setup. + +## Usage + +Open Copilot Chat or your coding-agent CLI in the project and ask for the +workflow you need: + +```text +Set up AgentOps evaluation for this app. +Generate an evaluation dataset for the support-agent tools. +Run the eval and explain the failing rows. +Generate the GitHub Actions AgentOps workflow and tell me what Azure/GitHub variables it needs. 
+Run the AgentOps watchdog and summarize production latency findings. +``` + +## Links + +- [AgentOps Toolkit](https://github.com/Azure/agentops) +- [Copilot skills tutorial](https://github.com/Azure/agentops/blob/main/docs/tutorial-copilot-skills.md) +- [Watchdog tutorial](https://github.com/Azure/agentops/blob/main/docs/tutorial-agent-watchdog.md) +- [How it works](https://github.com/Azure/agentops/blob/main/docs/how-it-works.md) diff --git a/plugins/agentops/skills/agentops-workflow/SKILL.md b/plugins/agentops/skills/agentops-workflow/SKILL.md index d8e569f7..f17cc351 100644 --- a/plugins/agentops/skills/agentops-workflow/SKILL.md +++ b/plugins/agentops/skills/agentops-workflow/SKILL.md @@ -9,9 +9,11 @@ Help the user wire AgentOps into a real GenAIOps GitFlow CI/CD setup with three environments (`dev`, `qa`, `production`) and an automatic eval gate on every change. -This skill produces four workflow files via `agentops workflow generate` -and then walks the user through the GitHub-side configuration (OIDC, -environments, branch protection, deploy step). +For a new repository or tutorial, start with the PR gate only: +`agentops workflow generate --kinds pr`. Generate DEV/QA/PROD deploy +workflows only after GitHub Environments, Azure OIDC, and real +build/deploy commands are configured. This avoids creating failing +deploy Actions on the first push to `main`. ## Branch model assumed @@ -40,14 +42,27 @@ and have them generate `--kinds pr,dev,prod`. 
## Step 1 — Generate the workflows ```bash -agentops workflow generate +agentops workflow generate --kinds pr ``` -This writes **four** files into `.github/workflows/`: +This writes the safe first workflow into `.github/workflows/`: | File | Trigger | Environment | |---|---|---| -| `agentops-pr.yml` | PRs to `develop`, `release/**`, `main` | (none) | +| `agentops-pr.yml` | PRs to `develop`, `release/**`, `main` plus manual dispatch | `dev` | + +After OIDC, environments, and real build/deploy commands are ready, expand +to the full scaffold: + +```bash +agentops workflow generate --kinds pr,dev,qa,prod --force +``` + +The full scaffold writes: + +| File | Trigger | Environment | +|---|---|---| +| `agentops-pr.yml` | PRs to `develop`, `release/**`, `main` plus manual dispatch | `dev` | | `agentops-deploy-dev.yml` | push to `develop` | `dev` | | `agentops-deploy-qa.yml` | push to `release/**` | `qa` | | `agentops-deploy-prod.yml` | push to `main` | `production` | @@ -55,8 +70,8 @@ This writes **four** files into `.github/workflows/`: Useful flags: - `--force` — overwrite existing workflow files. -- `--kinds pr,dev,qa,prod` — generate a subset (e.g. `--kinds pr,dev,prod` - for trunk-based teams). +- `--kinds pr,dev,qa,prod` — generate a subset. Prefer `--kinds pr` + until deploy environments are configured. - `--dir <path>` — non-default repo root. ## Step 2 — Configure GitHub Environments @@ -144,6 +159,8 @@ Common follow-ups: - Do **not** invent CLI flags. The supported `workflow generate` flags are `--force`, `--dir`, `--kinds`. +- Do **not** push DEV/QA/PROD deploy workflows with placeholder + Build/Deploy steps or missing OIDC variables; generate PR-only first. - Do **not** create parallel workflow files. Prefer editing the generated ones.
- Do **not** auto-fill Build/Deploy with steps you can't justify from diff --git a/src/agentops/services/skills.py b/src/agentops/services/skills.py index cc904fa7..47088d93 100644 --- a/src/agentops/services/skills.py +++ b/src/agentops/services/skills.py @@ -21,6 +21,7 @@ "skills/agentops-dataset/SKILL.md", "skills/agentops-report/SKILL.md", "skills/agentops-workflow/SKILL.md", + "skills/agentops-agent/SKILL.md", ) _PLATFORM_CONFIGS: Dict[str, Dict[str, str]] = { @@ -61,6 +62,7 @@ | Generate evaluation datasets | `.github/skills/agentops-dataset/SKILL.md` | "create dataset", "generate test data", "JSONL" | | Interpret and regenerate reports | `.github/skills/agentops-report/SKILL.md` | "report", "results", "explain scores" | | CI/CD workflow setup | `.github/skills/agentops-workflow/SKILL.md` | "CI", "workflow", "pipeline", "GitHub Actions" | +| Watchdog analysis | `.github/skills/agentops-agent/SKILL.md` | "watchdog", "agent analyze", "production health", "latency spikes" | {_COPILOT_MARKER_END}""" _CURSOR_MDC = """\ @@ -80,6 +82,7 @@ | Generate evaluation datasets | `.github/skills/agentops-dataset/SKILL.md` | | Interpret and regenerate reports | `.github/skills/agentops-report/SKILL.md` | | CI/CD workflow setup | `.github/skills/agentops-workflow/SKILL.md` | +| Watchdog analysis | `.github/skills/agentops-agent/SKILL.md` | """ diff --git a/src/agentops/templates/skills/agentops-workflow/SKILL.md b/src/agentops/templates/skills/agentops-workflow/SKILL.md index d8e569f7..f17cc351 100644 --- a/src/agentops/templates/skills/agentops-workflow/SKILL.md +++ b/src/agentops/templates/skills/agentops-workflow/SKILL.md @@ -9,9 +9,11 @@ Help the user wire AgentOps into a real GenAIOps GitFlow CI/CD setup with three environments (`dev`, `qa`, `production`) and an automatic eval gate on every change. 
-This skill produces four workflow files via `agentops workflow generate` -and then walks the user through the GitHub-side configuration (OIDC, -environments, branch protection, deploy step). +For a new repository or tutorial, start with the PR gate only: +`agentops workflow generate --kinds pr`. Generate DEV/QA/PROD deploy +workflows only after GitHub Environments, Azure OIDC, and real +build/deploy commands are configured. This avoids creating failing +deploy Actions on the first push to `main`. ## Branch model assumed @@ -40,14 +42,27 @@ and have them generate `--kinds pr,dev,prod`. ## Step 1 — Generate the workflows ```bash -agentops workflow generate +agentops workflow generate --kinds pr ``` -This writes **four** files into `.github/workflows/`: +This writes the safe first workflow into `.github/workflows/`: | File | Trigger | Environment | |---|---|---| -| `agentops-pr.yml` | PRs to `develop`, `release/**`, `main` | (none) | +| `agentops-pr.yml` | PRs to `develop`, `release/**`, `main` plus manual dispatch | `dev` | + +After OIDC, environments, and real build/deploy commands are ready, expand +to the full scaffold: + +```bash +agentops workflow generate --kinds pr,dev,qa,prod --force +``` + +The full scaffold writes: + +| File | Trigger | Environment | +|---|---|---| +| `agentops-pr.yml` | PRs to `develop`, `release/**`, `main` plus manual dispatch | `dev` | | `agentops-deploy-dev.yml` | push to `develop` | `dev` | | `agentops-deploy-qa.yml` | push to `release/**` | `qa` | | `agentops-deploy-prod.yml` | push to `main` | `production` | @@ -55,8 +70,8 @@ This writes **four** files into `.github/workflows/`: Useful flags: - `--force` — overwrite existing workflow files. -- `--kinds pr,dev,qa,prod` — generate a subset (e.g. `--kinds pr,dev,prod` - for trunk-based teams). +- `--kinds pr,dev,qa,prod` — generate a subset. Prefer `--kinds pr` + until deploy environments are configured. - `--dir <path>` — non-default repo root.
## Step 2 — Configure GitHub Environments @@ -144,6 +159,8 @@ Common follow-ups: - Do **not** invent CLI flags. The supported `workflow generate` flags are `--force`, `--dir`, `--kinds`. +- Do **not** push DEV/QA/PROD deploy workflows with placeholder + Build/Deploy steps or missing OIDC variables; generate PR-only first. - Do **not** create parallel workflow files. Prefer editing the generated ones. - Do **not** auto-fill Build/Deploy with steps you can't justify from diff --git a/tests/unit/test_skills.py b/tests/unit/test_skills.py index 1b2fae8e..34217e1c 100644 --- a/tests/unit/test_skills.py +++ b/tests/unit/test_skills.py @@ -29,6 +29,7 @@ ".github/skills/agentops-dataset/SKILL.md", ".github/skills/agentops-report/SKILL.md", ".github/skills/agentops-workflow/SKILL.md", + ".github/skills/agentops-agent/SKILL.md", ] _CLAUDE_SKILL_PATHS = [ @@ -37,6 +38,7 @@ ".claude/commands/agentops-dataset.md", ".claude/commands/agentops-report.md", ".claude/commands/agentops-workflow.md", + ".claude/commands/agentops-agent.md", ] @@ -87,7 +89,7 @@ def test_install_creates_copilot_files(tmp_path: Path) -> None: result = install_skills(directory=tmp_path, platforms=["copilot"]) assert result.platforms == ["copilot"] - assert len(result.created_files) == 5 + assert len(result.created_files) == 6 assert len(result.skipped_files) == 0 for rel in _COPILOT_SKILL_PATHS: @@ -114,7 +116,7 @@ def test_install_creates_claude_files(tmp_path: Path) -> None: result = install_skills(directory=tmp_path, platforms=["claude"]) assert result.platforms == ["claude"] - assert len(result.created_files) == 5 + assert len(result.created_files) == 6 for rel in _CLAUDE_SKILL_PATHS: skill_file = tmp_path / rel @@ -137,7 +139,7 @@ def test_claude_files_strip_frontmatter(tmp_path: Path) -> None: def test_install_multi_platform(tmp_path: Path) -> None: result = install_skills(directory=tmp_path, platforms=["copilot", "claude"]) - assert len(result.created_files) == 10 # 5 per platform + assert 
len(result.created_files) == 12 # 6 per platform assert result.platforms == ["copilot", "claude"] @@ -154,7 +156,7 @@ def test_install_skips_existing(tmp_path: Path) -> None: result = install_skills(directory=tmp_path, platforms=["copilot"], force=False) - assert len(result.skipped_files) == 5 + assert len(result.skipped_files) == 6 assert len(result.created_files) == 0 assert skill.read_text(encoding="utf-8") == "custom content" @@ -167,7 +169,7 @@ def test_install_overwrites_with_force(tmp_path: Path) -> None: result = install_skills(directory=tmp_path, platforms=["copilot"], force=True) - assert len(result.overwritten_files) == 5 + assert len(result.overwritten_files) == 6 content = skill.read_text(encoding="utf-8") assert content != "custom content" assert "AgentOps" in content