From 58ccdfbe880994700e26cd0f9d32f341d06faf5d Mon Sep 17 00:00:00 2001 From: Paulo Lacerda Date: Tue, 21 Apr 2026 10:22:39 -0300 Subject: [PATCH] feat: improve eval workflow and add skills sync (#87) - Add sync-skills scripts (bash + PowerShell) and CI test to enforce single source of truth for skills between src/agentops/templates/skills/ and plugins/agentops/skills/ (closes #87) - Add cross-platform subprocess pattern in agentops-eval and agentops-dataset skills (shutil.which + shell detection) - Genericize auth carrythrough: AGENT_AUTH_HEADER/AGENT_AUTH_TOKEN env vars in callable_adapter.py template and agentops-eval skill - Add azd environment validation step in agentops-eval and agentops-config skills - Add optional unit test generation question and guidance section in agentops-eval skill - Enhance smoke test diagnostics with empty response, format mismatch, UUID prefix, and HTML error detection - Update CONTRIBUTING.md with skills single-source-of-truth rule - Sync plugins/agentops/skills/ from canonical src/ templates --- CHANGELOG.md | 13 ++ CONTRIBUTING.md | 15 ++ .../agentops/skills/agentops-config/SKILL.md | 12 ++ .../agentops/skills/agentops-dataset/SKILL.md | 11 +- .../agentops/skills/agentops-eval/SKILL.md | 150 ++++++++++++++++-- .../agentops/skills/agentops-trace/SKILL.md | 103 +++--------- scripts/sync-skills.ps1 | 38 +++++ scripts/sync-skills.sh | 37 +++++ src/agentops/templates/callable_adapter.py | 13 +- .../templates/skills/agentops-config/SKILL.md | 12 ++ .../skills/agentops-dataset/SKILL.md | 11 +- .../templates/skills/agentops-eval/SKILL.md | 150 ++++++++++++++++-- tests/unit/test_skills_sync.py | 81 ++++++++++ 13 files changed, 529 insertions(+), 117 deletions(-) create mode 100644 scripts/sync-skills.ps1 create mode 100644 scripts/sync-skills.sh create mode 100644 tests/unit/test_skills_sync.py diff --git a/CHANGELOG.md b/CHANGELOG.md index ab8bb60b..3b0bddb9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,19 @@ All notable changes 
to this project will be documented in this file. This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres to [Semantic Versioning](https://semver.org/). +## [0.1.7] - 2026-04-21 + +### Added +- **Single source of truth for skills (closes #87)** — `src/agentops/templates/skills/` is now the canonical location. Added `scripts/sync-skills.sh` and `scripts/sync-skills.ps1` to propagate changes to `plugins/agentops/skills/`. CI test `test_skills_sync.py` fails if the two directories diverge. +- **Optional unit test generation** — `agentops-eval` skill (Step 1) now offers to generate unit tests for agent code when no existing tests are detected. Generates `pytest` + `unittest.mock` tests covering endpoint handlers, response parsing, and error handling. Opt-in only — skips silently if tests already exist or user declines. + +### Changed +- **Cross-platform subprocess handling in generated scripts** — `agentops-eval` and `agentops-dataset` skills now instruct generated `rag_context.py` scripts to use `shutil.which()` + `shell=(sys.platform == "win32")` when calling external CLIs, preventing `FileNotFoundError` on Windows. +- **Auth detection carrythrough to callable adapter** — `agentops-eval` skill Step 5.5 now explicitly wires the auth pattern detected in Step 2 into the adapter using generic `AGENT_AUTH_HEADER` and `AGENT_AUTH_TOKEN` env vars. Updated `callable_adapter.py` template to use the same generic auth mechanism. Prevents 401 errors on first smoke test. +- **azd environment validation** — `agentops-eval` (Step 4) and `agentops-config` (Step 3) skills now validate azd environments before trusting `.azure/<env>/.env` values: checks `azd env list`, verifies resource group exists via `az group exists`, and warns on stale environments. 
+- **Enhanced smoke test diagnostics** — `agentops-eval` skill Step 6 smoke test now checks for empty responses, response length, response format mismatches (JSON vs SSE), unexpected prefixes (UUIDs), and HTML error pages. Expanded troubleshooting table with specific remediation steps. +- **Updated CONTRIBUTING.md** — added single-source-of-truth rule for skills and sync script instructions. + ## [0.1.6] - 2026-04-15 ### Changed diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 107867af..b2f559bd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -81,6 +81,21 @@ These rules are critical to maintaining the project's architecture. PRs that vio | Add a new CLI command | `cli/app.py` (keep it thin — delegate to `services/`) | | Add a new workflow/service | `services/` (new file) | | Add starter templates | `templates/` + update `pyproject.toml` package-data | +| Edit a coding agent skill | `src/agentops/templates/skills/<skill-name>/SKILL.md` (single source of truth) → run `scripts/sync-skills.sh` or `.ps1` to propagate to `plugins/agentops/skills/` | + +### Skills: Single Source of Truth + +`src/agentops/templates/skills/` is the **canonical location** for all SKILL.md files. The VS Code extension directory `plugins/agentops/skills/` is a copy. After editing any skill file, run the sync script: + +```bash +# Linux / macOS +bash scripts/sync-skills.sh + +# Windows +.\scripts\sync-skills.ps1 +``` + +CI will fail if the two directories diverge (`test_skills_sync.py`). ### Testing Expectations diff --git a/plugins/agentops/skills/agentops-config/SKILL.md b/plugins/agentops/skills/agentops-config/SKILL.md index ba3f1ce8..06845854 100644 --- a/plugins/agentops/skills/agentops-config/SKILL.md +++ b/plugins/agentops/skills/agentops-config/SKILL.md @@ -76,6 +76,18 @@ Search these locations **in order** — stop as soon as each value is found: 3. `.azure/<env>/.env` files (azd environments) — also read `AZURE_RESOURCE_GROUP`, `AZURE_SUBSCRIPTION_ID` 4. 
`.azure/config.json` for `defaultEnvironment` to pick the right env folder +### Validate azd environment (if using `.azure/<env>/.env`) + +Before trusting values from `.azure/<env>/.env`, verify the environment is still valid: + +1. **Check the environment is current** — run `azd env list` and confirm the selected environment appears. If multiple environments exist, list them and ask the user which to use. +2. **Verify the resource group exists**: + ```bash + az group exists --name $RG --subscription $SUB + ``` + If this returns `false`, warn: *"Resource group '$RG' no longer exists. Your azd environment may be outdated."* +3. **If validation fails**, ask the user for correct values or to select a different environment. + If values are **not found** in any file, run Azure CLI discovery: ```bash # 1. Confirm auth and get subscription diff --git a/plugins/agentops/skills/agentops-dataset/SKILL.md b/plugins/agentops/skills/agentops-dataset/SKILL.md index faa1a0e5..602b334e 100644 --- a/plugins/agentops/skills/agentops-dataset/SKILL.md +++ b/plugins/agentops/skills/agentops-dataset/SKILL.md @@ -90,8 +90,17 @@ If the scenario is **RAG** and the generated JSONL has no `context` field: 2. **Build a retrieval script** at `.agentops/rag_context.py` (**never** in `src/`) that: - Reads the project's own retrieval config (env vars, endpoint, index name) from whatever the project uses - For each row in the JSONL, queries the retrieval backend with `row["input"]` and writes the result into `row["context"]` - - Uses only stdlib (`urllib.request`, `json`, `os`) — no third-party dependencies + - Uses only stdlib (`urllib.request`, `json`, `os`, `subprocess`, `sys`, `shutil`) — no third-party dependencies - Accepts the JSONL file path as a CLI argument: `python .agentops/rag_context.py .agentops/data/data.jsonl` + - **Must be cross-platform** (Windows + Linux/macOS) — when calling external CLIs (e.g. 
`az`), use: + ```python + import shutil, subprocess, sys + def _run_cli(args: list[str], **kwargs) -> subprocess.CompletedProcess: + exe = shutil.which(args[0]) + if exe is None: + raise FileNotFoundError(f"'{args[0]}' not found in PATH.") + return subprocess.run([exe] + args[1:], **kwargs, shell=(sys.platform == "win32")) + ``` 3. Verify: each JSONL row now has a `context` field. 4. Update dataset YAML to include `context_field: context` under `format:`. diff --git a/plugins/agentops/skills/agentops-eval/SKILL.md b/plugins/agentops/skills/agentops-eval/SKILL.md index 547ee415..2463a46c 100644 --- a/plugins/agentops/skills/agentops-eval/SKILL.md +++ b/plugins/agentops/skills/agentops-eval/SKILL.md @@ -40,6 +40,8 @@ Analyze the codebase holistically to understand the agent's **primary purpose**: 5. **Responsible AI (optional)**: Ask *"Do you also want to include safety evaluators (violence, hate/unfairness, self-harm, protected material)? These can be added alongside your main bundle."* If yes, add the safety evaluators from `safe_agent_baseline` to the selected bundle. +6. **Unit tests (optional)**: Only ask this if **all** of the following are true: (a) the codebase has testable agent code in Python, JavaScript, or TypeScript (endpoint handlers, tool definitions, orchestration logic), (b) no existing test directory or test files are detected (e.g., `tests/`, `test_*.py`, `*_test.py`, `*.test.ts`, `*.test.js`, `__tests__/`). If both conditions are met, ask: *"Would you also like me to generate unit tests for your agent code? (e.g., mocked HTTP calls, response parsing, error handling)"*. If the user declines or if conditions are not met, skip silently. See the **Unit Test Generation** section at the end of this skill for details. + ## Step 2 — Detect endpoint type | Search for | `endpoint.kind` | `hosting` | `execution_mode` | @@ -117,8 +119,29 @@ If the scenario is **RAG** and the dataset has no `context` field: 2. 
**Build a retrieval script** at `.agentops/rag_context.py` (**never** in `src/`) that: - Reads the project's own retrieval config (env vars, endpoint, index name) from whatever the project uses - For each row in the JSONL, queries the retrieval backend with `row["input"]` and writes the result into `row["context"]` - - Uses only stdlib (`urllib.request`, `json`, `os`) — no third-party dependencies + - Uses only stdlib (`urllib.request`, `json`, `os`, `subprocess`, `sys`, `shutil`) — no third-party dependencies - Accepts the JSONL file path as a CLI argument: `python .agentops/rag_context.py .agentops/data/data.jsonl` + - **Must be cross-platform** (Windows + Linux/macOS) — when calling external CLIs (e.g. `az`), use the following pattern: + ```python + import shutil + import subprocess + import sys + + def _run_cli(args: list[str], **kwargs) -> subprocess.CompletedProcess: + """Run an external CLI command, cross-platform.""" + exe = shutil.which(args[0]) + if exe is None: + raise FileNotFoundError( + f"'{args[0]}' not found in PATH. " + "Make sure it is installed and available." + ) + return subprocess.run( + [exe] + args[1:], + **kwargs, + shell=(sys.platform == "win32"), + ) + ``` + - This avoids `FileNotFoundError` on Windows where `subprocess.run(["az", ...])` fails without `shell=True` 3. Update dataset YAML to include `context_field: context` under `format:`. 4. Now `rag_quality_baseline` with GroundednessEvaluator and RetrievalEvaluator can be used. @@ -134,6 +157,18 @@ Search these locations in order — stop as soon as each value is found: 3. `.azure//.env` (azd environments) — also read `AZURE_RESOURCE_GROUP`, `AZURE_SUBSCRIPTION_ID` 4. `.azure/config.json` for `defaultEnvironment` to pick the right env folder +### Validate azd environment (if using `.azure//.env`) + +Before trusting values from `.azure//.env`, verify the environment is still valid: + +1. 
**Check if the environment is current** — run `azd env list` and confirm the selected environment appears in the output. If multiple environments exist, list them and ask the user which one to use. +2. **Verify the resource group exists** — after reading `AZURE_RESOURCE_GROUP` and `AZURE_SUBSCRIPTION_ID` from the env file, run: + ```bash + az group exists --name $RG --subscription $SUB + ``` + If this returns `false`, the environment is stale (resources were deleted). Warn the user: *"The resource group '$RG' no longer exists. Your azd environment may be outdated. Please re-run `azd up` or provide current Azure values."* +3. **If validation fails**, do not silently proceed with stale values — ask the user for correct values or to select a different environment. + If values are **not found** in files, use Azure CLI to discover them: ```bash @@ -282,13 +317,17 @@ import os import urllib.request ENDPOINT = os.environ["AGENT_HTTP_URL"] -AUTH_TOKEN = os.environ.get("APP_API_TOKEN", "") +# Auth: set AGENT_AUTH_HEADER and AGENT_AUTH_TOKEN env vars if your endpoint requires auth. 
+# Example: AGENT_AUTH_HEADER=dapr-api-token AGENT_AUTH_TOKEN=dev-token +# AGENT_AUTH_HEADER=X-API-KEY AGENT_AUTH_TOKEN=my-key +AUTH_HEADER = os.environ.get("AGENT_AUTH_HEADER", "") +AUTH_TOKEN = os.environ.get("AGENT_AUTH_TOKEN", "") def run_evaluation(input_text: str, context: dict) -> dict: body = json.dumps({"message": input_text}).encode() headers = {"Content-Type": "application/json"} - if AUTH_TOKEN: - headers["dapr-api-token"] = AUTH_TOKEN + if AUTH_HEADER and AUTH_TOKEN: + headers[AUTH_HEADER] = AUTH_TOKEN req = urllib.request.Request(ENDPOINT, data=body, headers=headers, method="POST") with urllib.request.urlopen(req, timeout=120) as resp: data = json.loads(resp.read()) @@ -302,13 +341,15 @@ import os import urllib.request ENDPOINT = os.environ["AGENT_HTTP_URL"] -AUTH_TOKEN = os.environ.get("APP_API_TOKEN", "") +# Auth: set AGENT_AUTH_HEADER and AGENT_AUTH_TOKEN env vars if your endpoint requires auth. +AUTH_HEADER = os.environ.get("AGENT_AUTH_HEADER", "") +AUTH_TOKEN = os.environ.get("AGENT_AUTH_TOKEN", "") def run_evaluation(input_text: str, context: dict) -> dict: body = json.dumps({"message": input_text}).encode() headers = {"Content-Type": "application/json"} - if AUTH_TOKEN: - headers["dapr-api-token"] = AUTH_TOKEN + if AUTH_HEADER and AUTH_TOKEN: + headers[AUTH_HEADER] = AUTH_TOKEN req = urllib.request.Request(ENDPOINT, data=body, headers=headers, method="POST") chunks = [] try: @@ -340,11 +381,20 @@ def run_evaluation(input_text: str, context: dict) -> dict: ``` Customize the adapter: -- **Dapr auth** (`dapr-api-token` / `APP_API_TOKEN` found in code or `.env`) → keep the auth lines above. -- **API key** (`X-API-KEY` / `api_key` / `API_KEY` found in code or `.env`) → change header to `headers["X-API-KEY"] = AUTH_TOKEN` and env var to `API_KEY`. -- **Bearer token** (`Authorization: Bearer` found in code) → recommend using `http` backend with `auth_header_env` instead of callable. -- **No auth found** → remove the `AUTH_TOKEN` lines entirely. 
+- **Apply the auth pattern detected in Step 2.** Use the table below to wire the correct header and env var into the adapter: + +| Auth detected in Step 2 | Adapter env var | Header line in adapter | +|---|---|---| +| `dapr-api-token` / `APP_API_TOKEN` | `AGENT_AUTH_TOKEN` (tell user to set it to their Dapr token) | `headers["dapr-api-token"] = AUTH_TOKEN` | +| `X-API-KEY` / `api_key` / `API_KEY` | `AGENT_AUTH_TOKEN` (tell user to set it to their API key) | `headers["X-API-KEY"] = AUTH_TOKEN` | +| `Authorization: Bearer` | Recommend HTTP backend with `auth_header_env` instead of callable adapter | N/A | +| No auth detected | Remove `AUTH_TOKEN` and auth header lines entirely | N/A | + + **Important**: Do NOT generate the adapter with auth lines commented out or using hardcoded tokens. If auth was detected, the adapter must include the correct header from the start — otherwise the smoke test will fail with 401. + - **Choose the right template:** If the agent code uses `yield`, `StreamingResponse`, `EventSourceResponse`, or `text/event-stream` content type, use the **SSE/streaming adapter** template. Otherwise use the **standard JSON adapter**. +- **Customize the request field:** If the agent expects a different key than `"message"` (e.g. `"ask"`, `"question"`, `"input"`), change the `json.dumps({"message": ...})` line to match. +- **Customize the response extraction:** If the agent returns a different key than `"text"` or `"response"`, update the `.get()` call accordingly. 
### Context sanitization (RAG scenarios) @@ -424,17 +474,29 @@ import sys; sys.path.insert(0, '.agentops') from callable_adapter import run_evaluation result = run_evaluation('hello', {}) assert 'response' in result, f'Missing response key: {result}' -assert not result['response'].startswith('ERROR:'), f'Adapter error: {result[\"response\"]}' +resp = result['response'] +assert not resp.startswith('ERROR:'), f'Adapter error: {resp}' +assert len(resp.strip()) > 0, 'Empty response — check endpoint and request format' print('Smoke test PASSED') -print('Response preview:', result['response'][:120]) +print(f'Response length: {len(resp)} chars') +print('Response preview:', resp[:200]) " ``` If the smoke test fails: - **Connection refused** → the agent endpoint is not running. Start it first. -- **401 Unauthorized** → auth token is missing or wrong. Check the env var. -- **400/422** → the request body format doesn't match the endpoint. Check `request_field`. +- **401 Unauthorized** → auth token is missing or wrong. Check `AGENT_AUTH_HEADER` and `AGENT_AUTH_TOKEN` env vars. +- **400/422** → the request body format doesn't match the endpoint. Check the `json.dumps({"message": ...})` field name in the adapter — the endpoint may expect a different key (e.g. `"ask"`, `"question"`, `"input"`). - **Response starts with `ERROR:`** → the adapter caught an exception. Read the error message. +- **Empty response** → the endpoint returned successfully but the adapter extracted no text. Check `response_field` / `.get()` key in the adapter. +- **Response contains unexpected prefix** (UUID, metadata, HTML) → add a post-processing step to the adapter to strip it. Common pattern: `re.sub(r'^[0-9a-f-]{36}\s*', '', response_text)` for UUID prefixes. + +### Smoke test response format verification + +After the basic smoke test passes, verify the response format matches expectations: +1. If the response contains HTML tags (``, `
`, etc.) but the adapter expects plain text → the endpoint may be returning an error page, not agent output. +2. If the response is very short (< 10 chars) for a conversational prompt like "hello" → warn the user: *"Response seems unusually short. Verify the endpoint is returning the full agent response."* +3. If the response starts with `data:` or contains SSE markers but the adapter uses the standard JSON template → switch to the SSE/streaming adapter template. Do NOT proceed to Step 7 until the smoke test passes. @@ -482,7 +544,7 @@ agentops report generate [--in results.json] # Regenerate report - **NEVER** fabricate `agent_id`, model names, or endpoint URLs. - **NEVER** edit `.agentops/` template files (`run-callable.yaml`, `run-http-rag.yaml`, etc.) — always update `.agentops/run.yaml`. - **NEVER** use dotted import paths like `.agentops.callable_adapter` — they fail. -- **NEVER** create files outside `.agentops/` — all generated artifacts (adapters, datasets, configs, scripts) belong in `.agentops/`. +- **NEVER** create files outside `.agentops/` — all generated artifacts (adapters, datasets, configs, scripts) belong in `.agentops/`. Exception: unit tests go in the project's existing test directory. - **NEVER** try `az login` automatically — ask the user to authenticate. - **NEVER** use `requests` or `httpx` in callable adapters — use only stdlib (`urllib.request`, `json`, `os`). - If a bundle uses SDK-version-dependent evaluators, verify availability before running (Step 4.5). Don't block on this — if verification is hard, proceed and fix on failure. @@ -490,3 +552,59 @@ agentops report generate [--in results.json] # Regenerate report - Use generic file names: `dataset.yaml`, `data.jsonl` — not project-specific prefixes. - Use plain language in questions — not technical jargon ("callable adapter", "SSE", "POST"). - Always run pre-flight (Step 6) before executing. Fix all issues first. 
+ +## Unit Test Generation (Optional) + +This section is only executed if the user accepted the unit test offer in Step 1. + +### When to generate + +- The codebase has Python, JavaScript, or TypeScript agent code with testable logic (endpoint handlers, tool definitions, response parsing, orchestration). +- No existing test files or test directories were detected. + +### What to generate + +Create tests in the project's conventional test directory (e.g. `tests/test_agent.py` for Python, `__tests__/agent.test.ts` for TypeScript). Use only standard testing libraries — no extra dependencies. + +**For Python agents**, generate `pytest` tests using `unittest.mock`: + +1. **Endpoint handler test** — mock the HTTP framework (FastAPI `TestClient`, Flask `test_client`) and verify the handler returns expected response format. +2. **Response parsing test** — if the agent has response parsing logic (JSON extraction, SSE chunk assembly, UUID stripping), test it with known inputs/outputs. +3. **Error handling test** — verify the agent handles timeouts, 4xx/5xx from downstream services, and malformed inputs gracefully. +4. **Tool schema test** (if applicable) — if the agent defines tools with schemas, validate the schema structure is correct (required fields, types). + +**Template pattern** (adapt to the detected code): +```python +"""Unit tests for agent endpoint — generated by AgentOps.""" +import json +from unittest.mock import MagicMock, patch + +import pytest + + +class TestAgentEndpoint: + """Tests for the agent's HTTP endpoint handler.""" + + def test_returns_valid_response_format(self): + # Mock the downstream model/service call + # Call the endpoint handler directly + # Assert response has expected keys and types + ... + + def test_handles_empty_input(self): + # Verify the agent handles empty or whitespace-only input + ... 
+ + def test_handles_downstream_timeout(self): + # Mock the downstream call to raise a timeout + # Assert the agent returns an error response (not a crash) + ... +``` + +### Rules for generated tests + +- Tests must run **without** Azure credentials or live services — all external calls must be mocked. +- Do not generate tests that duplicate what AgentOps evaluations already cover (response quality, groundedness, coherence). +- Focus on **functional correctness**: does the code do what it's supposed to do? +- Place tests in the project's existing test directory structure, not in `.agentops/`. +- If the project uses a specific test runner or framework (detected via `pyproject.toml`, `package.json`, `conftest.py`), follow its conventions. diff --git a/plugins/agentops/skills/agentops-trace/SKILL.md b/plugins/agentops/skills/agentops-trace/SKILL.md index 04aec69c..33435e9e 100644 --- a/plugins/agentops/skills/agentops-trace/SKILL.md +++ b/plugins/agentops/skills/agentops-trace/SKILL.md @@ -1,105 +1,44 @@ --- name: agentops-trace -description: Guidance on tracing for AgentOps evaluations. Trigger when users ask about tracing agent execution, setting up telemetry, or inspecting spans. Common phrases include "tracing", "trace setup", "distributed tracing", "span", "telemetry", "trace evaluation", "trace agent", "OTLP", "Jaeger", "Azure Monitor traces". Install agentops-toolkit via pip. +description: Guidance on tracing for AgentOps evaluations. Trigger when users ask about tracing agent execution, setting up telemetry, or inspecting spans. Common phrases include "tracing", "trace init", "trace setup", "distributed tracing", "span", "telemetry", "trace evaluation", "trace agent". Install agentops-toolkit via pip. --- # AgentOps Trace ## Purpose -Provide guidance on the built-in OpenTelemetry (OTel) tracing that is automatically emitted during every `agentops eval run`. +Provide guidance on tracing agent execution. 
The `agentops trace` command is **planned but not yet implemented**. ## Before You Start 1. **AgentOps installed?** Check if `agentops` CLI is available. If not: `pip install agentops-toolkit`. 2. **Workspace exists?** Check for `.agentops/`. If missing: `agentops init`. +3. **Foundry endpoint configured?** Search for `AZURE_AI_FOUNDRY_PROJECT_ENDPOINT` in environment variables, `.env`, `.env.local`. If not found, ask the user for the endpoint URL and instruct them to set it. -## How It Works +## Status -Tracing is **opt-in** and controlled by a single environment variable: +🚧 **Not yet implemented.** The CLI stub exists but has no runtime behavior. -| Variable | Required? | Default | Description | -|---|---|---|---| -| `AGENTOPS_OTLP_ENDPOINT` | No | *(unset — tracing disabled)* | Base URL of the OTLP/HTTP collector. AgentOps appends `/v1/traces`. | +## Current Alternatives -When set, every `agentops eval run` emits a full trace tree: +Until `agentops trace` is available, use these tools directly: -``` -RUN (root span — the whole evaluation) -├── eval_item 0 (one per dataset row) -│ ├── evaluator builtin.similarity (one per evaluator score) -│ └── evaluator builtin.coherence -├── eval_item 1 -│ └── ... -└── (final attributes: pass_rate, items_total, items_passed) -``` - -When unset, tracing is fully disabled — zero overhead, no OTel packages imported. - -## Quick Start — Local Jaeger - -```bash -# 1. Start Jaeger (OTLP on 4318, UI on 16686) -docker run -d --name jaeger -p 16686:16686 -p 4318:4318 jaegertracing/jaeger:latest - -# 2. Install OTel packages -pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp-proto-http - -# 3. Enable tracing -export AGENTOPS_OTLP_ENDPOINT=http://localhost:4318 # Linux/macOS -$env:AGENTOPS_OTLP_ENDPOINT = "http://localhost:4318" # PowerShell - -# 4. Run evaluation -agentops eval run - -# 5. 
View traces at http://localhost:16686 → service "agentops" -``` - -## Semantic Conventions - -Spans follow three OTel semantic convention layers: - -- **CICD** (`cicd.pipeline.*`) — models the evaluation run as a CI/CD pipeline -- **GenAI** (`gen_ai.*`) — follows the OTel GenAI spec for agent/model invocation -- **AgentOps** (`agentops.eval.*`) — evaluation-specific: scores, thresholds, pass/fail - -Full attribute reference: `docs/telemetry.md`. - -## Production Backends - -Any OTLP-compatible backend works: - -| Backend | Setup | +| Tool | Use case | |---|---| -| Azure Monitor / App Insights | Use OTel Collector with Azure Monitor exporter, or native OTLP ingestion | -| Grafana Tempo | Point `AGENTOPS_OTLP_ENDPOINT` at the Tempo OTLP receiver | -| Datadog | Use the Datadog OTLP ingest endpoint | - -## Agent Execution Tracing - -Agent execution tracing (tool calls, LLM calls, retrieval steps) is **already provided by Foundry and the Agent Framework** — AgentOps does not reimplement it. The skill can help users **verify that tracing is properly configured** in their agent code. - -### What to check - -When a user asks about agent tracing, inspect their codebase for: - -1. **Foundry agents** — Tracing is automatic. Verify the agent is deployed and visible in the Foundry portal → Agent → Traces tab. -2. **Agent Framework SDK** — Check that `AIProjectClient` or `AgentsClient` is configured with the correct project endpoint. Traces flow to Azure Monitor automatically. -3. 
**Custom agents (HTTP/local)** — Look for OTel instrumentation in the agent code: - - `opentelemetry` imports and `TracerProvider` setup - - `APPLICATIONINSIGHTS_CONNECTION_STRING` or `OTEL_EXPORTER_OTLP_ENDPOINT` env vars - - If missing, guide the user to add OTel SDK setup to their agent entrypoint +| Azure Monitor / Application Insights | Production tracing for Foundry agents | +| OpenTelemetry SDK | Custom span instrumentation | +| Foundry portal | Built-in agent execution traces | +| `results.json` row metrics | Per-row latency via `avg_latency_seconds` | -### Verification steps +## What Will Be Available -1. Check for tracing env vars: `APPLICATIONINSIGHTS_CONNECTION_STRING`, `OTEL_EXPORTER_OTLP_ENDPOINT`, `AZURE_AI_FOUNDRY_PROJECT_ENDPOINT` -2. Search agent code for `TracerProvider`, `trace.get_tracer`, or `configure_azure_monitor` -3. If nothing is found, suggest adding Azure Monitor OpenTelemetry: `pip install azure-monitor-opentelemetry` + `configure_azure_monitor()` -4. Point user to Foundry portal or Azure Monitor to confirm traces are flowing +When implemented, `agentops trace init` will: +- Configure OpenTelemetry export for AgentOps evaluation runs +- Capture per-row agent execution spans +- Link traces to evaluation results for debugging -## Rules +## Guardrails -- Tracing is built-in and emitted automatically when `AGENTOPS_OTLP_ENDPOINT` is set. -- Do not suggest `agentops trace init` — that command has been removed. -- For latency analysis without OTel, point users to `avg_latency_seconds` in evaluation bundles. -- Agent execution tracing is handled by Foundry/Agent Framework natively — the skill helps verify it is configured, not reimplement it. +- Do not pretend tracing features exist — clearly state they are planned. +- For latency analysis, point users to `avg_latency_seconds` in evaluation bundles. +- For production tracing, recommend Azure Monitor or OpenTelemetry directly. 
diff --git a/scripts/sync-skills.ps1 b/scripts/sync-skills.ps1 new file mode 100644 index 00000000..50888399 --- /dev/null +++ b/scripts/sync-skills.ps1 @@ -0,0 +1,38 @@ +# sync-skills.ps1 — Copy skills from the single source of truth +# (src/agentops/templates/skills/) to the VS Code extension +# (plugins/agentops/skills/). +# +# Run this after editing any SKILL.md in src/agentops/templates/skills/. +# CI will fail if the two directories diverge. + +$ErrorActionPreference = "Stop" + +$RepoRoot = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$SrcDir = Join-Path $RepoRoot "src" "agentops" "templates" "skills" +$DestDir = Join-Path $RepoRoot "plugins" "agentops" "skills" + +if (-not (Test-Path $SrcDir)) { + Write-Error "Source directory not found: $SrcDir" + exit 1 +} + +$synced = 0 +foreach ($skillDir in Get-ChildItem -Path $SrcDir -Directory) { + $srcFile = Join-Path $skillDir.FullName "SKILL.md" + if (-not (Test-Path $srcFile)) { + continue + } + + $destSkillDir = Join-Path $DestDir $skillDir.Name + if (-not (Test-Path $destSkillDir)) { + New-Item -ItemType Directory -Path $destSkillDir -Force | Out-Null + } + + $destFile = Join-Path $destSkillDir "SKILL.md" + Copy-Item -Path $srcFile -Destination $destFile -Force + $synced++ + Write-Host " OK $($skillDir.Name)/SKILL.md" +} + +Write-Host "" +Write-Host "Synced $synced skill(s) from src/agentops/templates/skills/ -> plugins/agentops/skills/" diff --git a/scripts/sync-skills.sh b/scripts/sync-skills.sh new file mode 100644 index 00000000..0e239f61 --- /dev/null +++ b/scripts/sync-skills.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +# sync-skills.sh — Copy skills from the single source of truth +# (src/agentops/templates/skills/) to the VS Code extension +# (plugins/agentops/skills/). +# +# Run this after editing any SKILL.md in src/agentops/templates/skills/. +# CI will fail if the two directories diverge. + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/.." 
&& pwd)" +SRC_DIR="$REPO_ROOT/src/agentops/templates/skills" +DEST_DIR="$REPO_ROOT/plugins/agentops/skills" + +if [ ! -d "$SRC_DIR" ]; then + echo "ERROR: Source directory not found: $SRC_DIR" >&2 + exit 1 +fi + +synced=0 +for skill_dir in "$SRC_DIR"/*/; do + skill_name="$(basename "$skill_dir")" + src_file="$skill_dir/SKILL.md" + dest_file="$DEST_DIR/$skill_name/SKILL.md" + + if [ ! -f "$src_file" ]; then + continue + fi + + mkdir -p "$DEST_DIR/$skill_name" + cp "$src_file" "$dest_file" + synced=$((synced + 1)) + echo " ✔ $skill_name/SKILL.md" +done + +echo "" +echo "Synced $synced skill(s) from src/agentops/templates/skills/ → plugins/agentops/skills/" diff --git a/src/agentops/templates/callable_adapter.py b/src/agentops/templates/callable_adapter.py index 0943bf2f..f66c400b 100644 --- a/src/agentops/templates/callable_adapter.py +++ b/src/agentops/templates/callable_adapter.py @@ -27,6 +27,14 @@ # Set AGENT_HTTP_URL in your environment or replace the default below. ENDPOINT = os.environ.get("AGENT_HTTP_URL", "http://localhost:8000/api/chat") +# ── Authentication ───────────────────────────────────────────────────── +# Set both AGENT_AUTH_HEADER and AGENT_AUTH_TOKEN to enable auth. 
+# Examples: +# Dapr: AGENT_AUTH_HEADER=dapr-api-token AGENT_AUTH_TOKEN=dev-token +# API Key: AGENT_AUTH_HEADER=X-API-KEY AGENT_AUTH_TOKEN=my-key +AUTH_HEADER = os.environ.get("AGENT_AUTH_HEADER", "") +AUTH_TOKEN = os.environ.get("AGENT_AUTH_TOKEN", "") + # ── Response cleaning helpers ────────────────────────────────────────── _HTML_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL) @@ -49,10 +57,13 @@ def run_evaluation(input_text: str, context: dict) -> dict: """ # --- Option 1: Standard JSON POST (default) --- body = json.dumps({"message": input_text}).encode() + headers: dict[str, str] = {"Content-Type": "application/json"} + if AUTH_HEADER and AUTH_TOKEN: + headers[AUTH_HEADER] = AUTH_TOKEN req = urllib.request.Request( ENDPOINT, data=body, - headers={"Content-Type": "application/json"}, + headers=headers, method="POST", ) with urllib.request.urlopen(req) as resp: diff --git a/src/agentops/templates/skills/agentops-config/SKILL.md b/src/agentops/templates/skills/agentops-config/SKILL.md index ba3f1ce8..06845854 100644 --- a/src/agentops/templates/skills/agentops-config/SKILL.md +++ b/src/agentops/templates/skills/agentops-config/SKILL.md @@ -76,6 +76,18 @@ Search these locations **in order** — stop as soon as each value is found: 3. `.azure/<env>/.env` files (azd environments) — also read `AZURE_RESOURCE_GROUP`, `AZURE_SUBSCRIPTION_ID` 4. `.azure/config.json` for `defaultEnvironment` to pick the right env folder +### Validate azd environment (if using `.azure/<env>/.env`) + +Before trusting values from `.azure/<env>/.env`, verify the environment is still valid: + +1. **Check the environment is current** — run `azd env list` and confirm the selected environment appears. If multiple environments exist, list them and ask the user which to use. +2. **Verify the resource group exists**: + ```bash + az group exists --name $RG --subscription $SUB + ``` + If this returns `false`, warn: *"Resource group '$RG' no longer exists. Your azd environment may be outdated."* +3. 
**If validation fails**, ask the user for correct values or to select a different environment. + If values are **not found** in any file, run Azure CLI discovery: ```bash # 1. Confirm auth and get subscription diff --git a/src/agentops/templates/skills/agentops-dataset/SKILL.md b/src/agentops/templates/skills/agentops-dataset/SKILL.md index faa1a0e5..602b334e 100644 --- a/src/agentops/templates/skills/agentops-dataset/SKILL.md +++ b/src/agentops/templates/skills/agentops-dataset/SKILL.md @@ -90,8 +90,17 @@ If the scenario is **RAG** and the generated JSONL has no `context` field: 2. **Build a retrieval script** at `.agentops/rag_context.py` (**never** in `src/`) that: - Reads the project's own retrieval config (env vars, endpoint, index name) from whatever the project uses - For each row in the JSONL, queries the retrieval backend with `row["input"]` and writes the result into `row["context"]` - - Uses only stdlib (`urllib.request`, `json`, `os`) — no third-party dependencies + - Uses only stdlib (`urllib.request`, `json`, `os`, `subprocess`, `sys`, `shutil`) — no third-party dependencies - Accepts the JSONL file path as a CLI argument: `python .agentops/rag_context.py .agentops/data/data.jsonl` + - **Must be cross-platform** (Windows + Linux/macOS) — when calling external CLIs (e.g. `az`), use: + ```python + import shutil, subprocess, sys + def _run_cli(args: list[str], **kwargs) -> subprocess.CompletedProcess: + exe = shutil.which(args[0]) + if exe is None: + raise FileNotFoundError(f"'{args[0]}' not found in PATH.") + return subprocess.run([exe] + args[1:], **kwargs, shell=(sys.platform == "win32")) + ``` 3. Verify: each JSONL row now has a `context` field. 4. Update dataset YAML to include `context_field: context` under `format:`. 
diff --git a/src/agentops/templates/skills/agentops-eval/SKILL.md b/src/agentops/templates/skills/agentops-eval/SKILL.md index 547ee415..2463a46c 100644 --- a/src/agentops/templates/skills/agentops-eval/SKILL.md +++ b/src/agentops/templates/skills/agentops-eval/SKILL.md @@ -40,6 +40,8 @@ Analyze the codebase holistically to understand the agent's **primary purpose**: 5. **Responsible AI (optional)**: Ask *"Do you also want to include safety evaluators (violence, hate/unfairness, self-harm, protected material)? These can be added alongside your main bundle."* If yes, add the safety evaluators from `safe_agent_baseline` to the selected bundle. +6. **Unit tests (optional)**: Only ask this if **all** of the following are true: (a) the codebase has testable agent code in Python, JavaScript, or TypeScript (endpoint handlers, tool definitions, orchestration logic), (b) no existing test directory or test files are detected (e.g., `tests/`, `test_*.py`, `*_test.py`, `*.test.ts`, `*.test.js`, `__tests__/`). If both conditions are met, ask: *"Would you also like me to generate unit tests for your agent code? (e.g., mocked HTTP calls, response parsing, error handling)"*. If the user declines or if conditions are not met, skip silently. See the **Unit Test Generation** section at the end of this skill for details. + ## Step 2 — Detect endpoint type | Search for | `endpoint.kind` | `hosting` | `execution_mode` | @@ -117,8 +119,29 @@ If the scenario is **RAG** and the dataset has no `context` field: 2. 
**Build a retrieval script** at `.agentops/rag_context.py` (**never** in `src/`) that: - Reads the project's own retrieval config (env vars, endpoint, index name) from whatever the project uses - For each row in the JSONL, queries the retrieval backend with `row["input"]` and writes the result into `row["context"]` - Uses only stdlib (`urllib.request`, `json`, `os`) — no third-party dependencies + - Uses only stdlib (`urllib.request`, `json`, `os`, `subprocess`, `sys`, `shutil`) — no third-party dependencies - Accepts the JSONL file path as a CLI argument: `python .agentops/rag_context.py .agentops/data/data.jsonl` + - **Must be cross-platform** (Windows + Linux/macOS) — when calling external CLIs (e.g. `az`), use the following pattern: + ```python + import shutil + import subprocess + import sys + + def _run_cli(args: list[str], **kwargs) -> subprocess.CompletedProcess: + """Run an external CLI command, cross-platform.""" + exe = shutil.which(args[0]) + if exe is None: + raise FileNotFoundError( + f"'{args[0]}' not found in PATH. " + "Make sure it is installed and available." + ) + return subprocess.run( + [exe] + args[1:], + **kwargs, + shell=(sys.platform == "win32"), + ) + ``` + - This avoids `FileNotFoundError` on Windows where `subprocess.run(["az", ...])` fails without `shell=True` 3. Update dataset YAML to include `context_field: context` under `format:`. 4. Now `rag_quality_baseline` with GroundednessEvaluator and RetrievalEvaluator can be used. @@ -134,6 +157,18 @@ Search these locations in order — stop as soon as each value is found: 3. `.azure/<env>/.env` (azd environments) — also read `AZURE_RESOURCE_GROUP`, `AZURE_SUBSCRIPTION_ID` 4. `.azure/config.json` for `defaultEnvironment` to pick the right env folder +### Validate azd environment (if using `.azure/<env>/.env`) + +Before trusting values from `.azure/<env>/.env`, verify the environment is still valid: + +1. 
**Check if the environment is current** — run `azd env list` and confirm the selected environment appears in the output. If multiple environments exist, list them and ask the user which one to use. +2. **Verify the resource group exists** — after reading `AZURE_RESOURCE_GROUP` and `AZURE_SUBSCRIPTION_ID` from the env file, run: + ```bash + az group exists --name $RG --subscription $SUB + ``` + If this returns `false`, the environment is stale (resources were deleted). Warn the user: *"The resource group '$RG' no longer exists. Your azd environment may be outdated. Please re-run `azd up` or provide current Azure values."* +3. **If validation fails**, do not silently proceed with stale values — ask the user for correct values or to select a different environment. + If values are **not found** in files, use Azure CLI to discover them: ```bash @@ -282,13 +317,17 @@ import os import urllib.request ENDPOINT = os.environ["AGENT_HTTP_URL"] -AUTH_TOKEN = os.environ.get("APP_API_TOKEN", "") +# Auth: set AGENT_AUTH_HEADER and AGENT_AUTH_TOKEN env vars if your endpoint requires auth. 
+# Example: AGENT_AUTH_HEADER=dapr-api-token AGENT_AUTH_TOKEN=dev-token +# AGENT_AUTH_HEADER=X-API-KEY AGENT_AUTH_TOKEN=my-key +AUTH_HEADER = os.environ.get("AGENT_AUTH_HEADER", "") +AUTH_TOKEN = os.environ.get("AGENT_AUTH_TOKEN", "") def run_evaluation(input_text: str, context: dict) -> dict: body = json.dumps({"message": input_text}).encode() headers = {"Content-Type": "application/json"} - if AUTH_TOKEN: - headers["dapr-api-token"] = AUTH_TOKEN + if AUTH_HEADER and AUTH_TOKEN: + headers[AUTH_HEADER] = AUTH_TOKEN req = urllib.request.Request(ENDPOINT, data=body, headers=headers, method="POST") with urllib.request.urlopen(req, timeout=120) as resp: data = json.loads(resp.read()) @@ -302,13 +341,15 @@ import os import urllib.request ENDPOINT = os.environ["AGENT_HTTP_URL"] -AUTH_TOKEN = os.environ.get("APP_API_TOKEN", "") +# Auth: set AGENT_AUTH_HEADER and AGENT_AUTH_TOKEN env vars if your endpoint requires auth. +AUTH_HEADER = os.environ.get("AGENT_AUTH_HEADER", "") +AUTH_TOKEN = os.environ.get("AGENT_AUTH_TOKEN", "") def run_evaluation(input_text: str, context: dict) -> dict: body = json.dumps({"message": input_text}).encode() headers = {"Content-Type": "application/json"} - if AUTH_TOKEN: - headers["dapr-api-token"] = AUTH_TOKEN + if AUTH_HEADER and AUTH_TOKEN: + headers[AUTH_HEADER] = AUTH_TOKEN req = urllib.request.Request(ENDPOINT, data=body, headers=headers, method="POST") chunks = [] try: @@ -340,11 +381,20 @@ def run_evaluation(input_text: str, context: dict) -> dict: ``` Customize the adapter: -- **Dapr auth** (`dapr-api-token` / `APP_API_TOKEN` found in code or `.env`) → keep the auth lines above. -- **API key** (`X-API-KEY` / `api_key` / `API_KEY` found in code or `.env`) → change header to `headers["X-API-KEY"] = AUTH_TOKEN` and env var to `API_KEY`. -- **Bearer token** (`Authorization: Bearer` found in code) → recommend using `http` backend with `auth_header_env` instead of callable. -- **No auth found** → remove the `AUTH_TOKEN` lines entirely. 
+- **Apply the auth pattern detected in Step 2.** Use the table below to wire the correct header and env var into the adapter: + +| Auth detected in Step 2 | Adapter env var | Header line in adapter | +|---|---|---| +| `dapr-api-token` / `APP_API_TOKEN` | `AGENT_AUTH_TOKEN` (tell user to set it to their Dapr token) | `headers["dapr-api-token"] = AUTH_TOKEN` | +| `X-API-KEY` / `api_key` / `API_KEY` | `AGENT_AUTH_TOKEN` (tell user to set it to their API key) | `headers["X-API-KEY"] = AUTH_TOKEN` | +| `Authorization: Bearer` | Recommend HTTP backend with `auth_header_env` instead of callable adapter | N/A | +| No auth detected | Remove `AUTH_TOKEN` and auth header lines entirely | N/A | + + **Important**: Do NOT generate the adapter with auth lines commented out or using hardcoded tokens. If auth was detected, the adapter must include the correct header from the start — otherwise the smoke test will fail with 401. + - **Choose the right template:** If the agent code uses `yield`, `StreamingResponse`, `EventSourceResponse`, or `text/event-stream` content type, use the **SSE/streaming adapter** template. Otherwise use the **standard JSON adapter**. +- **Customize the request field:** If the agent expects a different key than `"message"` (e.g. `"ask"`, `"question"`, `"input"`), change the `json.dumps({"message": ...})` line to match. +- **Customize the response extraction:** If the agent returns a different key than `"text"` or `"response"`, update the `.get()` call accordingly. 
+### Context sanitization (RAG scenarios) @@ -424,17 +474,29 @@ import sys; sys.path.insert(0, '.agentops') from callable_adapter import run_evaluation result = run_evaluation('hello', {}) assert 'response' in result, f'Missing response key: {result}' -assert not result['response'].startswith('ERROR:'), f'Adapter error: {result[\"response\"]}' +resp = result['response'] +assert not resp.startswith('ERROR:'), f'Adapter error: {resp}' +assert len(resp.strip()) > 0, 'Empty response — check endpoint and request format' print('Smoke test PASSED') -print('Response preview:', result['response'][:120]) +print(f'Response length: {len(resp)} chars') +print('Response preview:', resp[:200]) " ``` If the smoke test fails: - **Connection refused** → the agent endpoint is not running. Start it first. -- **401 Unauthorized** → auth token is missing or wrong. Check the env var. -- **400/422** → the request body format doesn't match the endpoint. Check `request_field`. +- **401 Unauthorized** → auth token is missing or wrong. Check `AGENT_AUTH_HEADER` and `AGENT_AUTH_TOKEN` env vars. +- **400/422** → the request body format doesn't match the endpoint. Check the `json.dumps({"message": ...})` field name in the adapter — the endpoint may expect a different key (e.g. `"ask"`, `"question"`, `"input"`). - **Response starts with `ERROR:`** → the adapter caught an exception. Read the error message. +- **Empty response** → the endpoint returned successfully but the adapter extracted no text. Check `response_field` / `.get()` key in the adapter. +- **Response contains unexpected prefix** (UUID, metadata, HTML) → add a post-processing step to the adapter to strip it. Common pattern: `re.sub(r'^[0-9a-f-]{36}\s*', '', response_text)` for UUID prefixes. + +### Smoke test response format verification + +After the basic smoke test passes, verify the response format matches expectations: +1. If the response contains HTML tags (`<html>`,
`<div>`, etc.) but the adapter expects plain text → the endpoint may be returning an error page, not agent output. +2. If the response is very short (< 10 chars) for a conversational prompt like "hello" → warn the user: *"Response seems unusually short. Verify the endpoint is returning the full agent response."* +3. If the response starts with `data:` or contains SSE markers but the adapter uses the standard JSON template → switch to the SSE/streaming adapter template. Do NOT proceed to Step 7 until the smoke test passes. @@ -482,7 +544,7 @@ agentops report generate [--in results.json] # Regenerate report - **NEVER** fabricate `agent_id`, model names, or endpoint URLs. - **NEVER** edit `.agentops/` template files (`run-callable.yaml`, `run-http-rag.yaml`, etc.) — always update `.agentops/run.yaml`. - **NEVER** use dotted import paths like `.agentops.callable_adapter` — they fail. -- **NEVER** create files outside `.agentops/` — all generated artifacts (adapters, datasets, configs, scripts) belong in `.agentops/`. +- **NEVER** create files outside `.agentops/` — all generated artifacts (adapters, datasets, configs, scripts) belong in `.agentops/`. Exception: unit tests go in the project's existing test directory. - **NEVER** try `az login` automatically — ask the user to authenticate. - **NEVER** use `requests` or `httpx` in callable adapters — use only stdlib (`urllib.request`, `json`, `os`). - If a bundle uses SDK-version-dependent evaluators, verify availability before running (Step 4.5). Don't block on this — if verification is hard, proceed and fix on failure. @@ -490,3 +552,59 @@ agentops report generate [--in results.json] # Regenerate report - Use generic file names: `dataset.yaml`, `data.jsonl` — not project-specific prefixes. - Use plain language in questions — not technical jargon ("callable adapter", "SSE", "POST"). - Always run pre-flight (Step 6) before executing. Fix all issues first. 
+ +## Unit Test Generation (Optional) + +This section is only executed if the user accepted the unit test offer in Step 1. + +### When to generate + +- The codebase has Python, JavaScript, or TypeScript agent code with testable logic (endpoint handlers, tool definitions, response parsing, orchestration). +- No existing test files or test directories were detected. + +### What to generate + +Create tests in the project's conventional test directory (e.g. `tests/test_agent.py` for Python, `__tests__/agent.test.ts` for TypeScript). Use only standard testing libraries — no extra dependencies. + +**For Python agents**, generate `pytest` tests using `unittest.mock`: + +1. **Endpoint handler test** — mock the HTTP framework (FastAPI `TestClient`, Flask `test_client`) and verify the handler returns expected response format. +2. **Response parsing test** — if the agent has response parsing logic (JSON extraction, SSE chunk assembly, UUID stripping), test it with known inputs/outputs. +3. **Error handling test** — verify the agent handles timeouts, 4xx/5xx from downstream services, and malformed inputs gracefully. +4. **Tool schema test** (if applicable) — if the agent defines tools with schemas, validate the schema structure is correct (required fields, types). + +**Template pattern** (adapt to the detected code): +```python +"""Unit tests for agent endpoint — generated by AgentOps.""" +import json +from unittest.mock import MagicMock, patch + +import pytest + + +class TestAgentEndpoint: + """Tests for the agent's HTTP endpoint handler.""" + + def test_returns_valid_response_format(self): + # Mock the downstream model/service call + # Call the endpoint handler directly + # Assert response has expected keys and types + ... + + def test_handles_empty_input(self): + # Verify the agent handles empty or whitespace-only input + ... 
+ + def test_handles_downstream_timeout(self): + # Mock the downstream call to raise a timeout + # Assert the agent returns an error response (not a crash) + ... +``` + +### Rules for generated tests + +- Tests must run **without** Azure credentials or live services — all external calls must be mocked. +- Do not generate tests that duplicate what AgentOps evaluations already cover (response quality, groundedness, coherence). +- Focus on **functional correctness**: does the code do what it's supposed to do? +- Place tests in the project's existing test directory structure, not in `.agentops/`. +- If the project uses a specific test runner or framework (detected via `pyproject.toml`, `package.json`, `conftest.py`), follow its conventions. diff --git a/tests/unit/test_skills_sync.py b/tests/unit/test_skills_sync.py new file mode 100644 index 00000000..3b4d341c --- /dev/null +++ b/tests/unit/test_skills_sync.py @@ -0,0 +1,81 @@ +"""Verify that plugins/agentops/skills/ is in sync with src/agentops/templates/skills/. + +The single source of truth for skill files is src/agentops/templates/skills/. +The VS Code extension at plugins/agentops/skills/ must be an exact copy. + +If this test fails, run: + scripts/sync-skills.sh (Linux/macOS) + scripts/sync-skills.ps1 (Windows) +""" + +from pathlib import Path + +# Repository root is three levels up from this test file (tests/unit/test_skills_sync.py).
+_REPO_ROOT = Path(__file__).resolve().parent.parent.parent +_SRC_SKILLS = _REPO_ROOT / "src" / "agentops" / "templates" / "skills" +_PLUGIN_SKILLS = _REPO_ROOT / "plugins" / "agentops" / "skills" + + +def _skill_dirs() -> list[str]: + """Return skill directory names from the source of truth.""" + if not _SRC_SKILLS.is_dir(): + return [] + return sorted(d.name for d in _SRC_SKILLS.iterdir() if d.is_dir()) + + +def test_plugin_skills_directory_exists() -> None: + assert _PLUGIN_SKILLS.is_dir(), ( + f"Plugin skills directory missing: {_PLUGIN_SKILLS}\n" + "Run scripts/sync-skills.sh (or .ps1) to create it." + ) + + +def test_all_skills_present_in_plugin() -> None: + """Every skill in src/ must also exist in plugins/.""" + src_skills = _skill_dirs() + assert src_skills, "No skills found in src/agentops/templates/skills/" + + for skill_name in src_skills: + plugin_file = _PLUGIN_SKILLS / skill_name / "SKILL.md" + assert plugin_file.exists(), ( + f"Skill '{skill_name}/SKILL.md' exists in src/ but not in plugins/.\n" + "Run scripts/sync-skills.sh (or .ps1) to fix." + ) + + +def test_skill_contents_match() -> None: + """Content of each SKILL.md must be identical between src/ and plugins/.""" + src_skills = _skill_dirs() + mismatched: list[str] = [] + + for skill_name in src_skills: + src_file = _SRC_SKILLS / skill_name / "SKILL.md" + plugin_file = _PLUGIN_SKILLS / skill_name / "SKILL.md" + + if not src_file.exists() or not plugin_file.exists(): + continue + + src_content = src_file.read_text(encoding="utf-8") + plugin_content = plugin_file.read_text(encoding="utf-8") + + if src_content != plugin_content: + mismatched.append(skill_name) + + assert not mismatched, ( + f"Skill file(s) out of sync: {', '.join(mismatched)}\n" + "Run scripts/sync-skills.sh (or .ps1) to fix." 
+ ) + + +def test_no_extra_skills_in_plugin() -> None: + """Plugin dir should not contain skills that don't exist in src/.""" + src_skills = set(_skill_dirs()) + if not _PLUGIN_SKILLS.is_dir(): + return + + plugin_skills = {d.name for d in _PLUGIN_SKILLS.iterdir() if d.is_dir()} + extra = plugin_skills - src_skills + assert not extra, ( + f"Plugin contains skill(s) not in src/: {', '.join(sorted(extra))}\n" + "Remove them or add them to src/agentops/templates/skills/ first." + )