class _ConversationHistoryFilter(logging.Filter):
    """Suppress 'Conversation history could not be parsed' from azure-ai-evaluation.

    This warning fires on every single-turn evaluation row because plain-text
    inputs are not in conversation-list format. It is expected and harmless.

    The filter is attached to loggers, and logger-level filters run outside
    the handler error handling, so any exception raised here would surface
    in the code that issued the log call. ``filter`` therefore never raises.
    """

    _SUPPRESSED_FRAGMENT = "Conversation history could not be parsed"

    def filter(self, record: logging.LogRecord) -> bool:
        """Return False for the suppressed warning, True for everything else."""
        try:
            message = record.getMessage()
        except Exception:  # noqa: BLE001 - malformed msg/args must not break the caller
            # Fall back to the unformatted template so the record is still
            # classified; the handler reports the formatting problem later.
            message = str(record.msg)
        return self._SUPPRESSED_FRAGMENT not in message
class _CloudOnlyEvaluatorError(ValueError):
    """Raised when an evaluator is only available via Foundry Cloud Evaluation.

    Derives from ``ValueError`` because the evaluator loader raised a plain
    ``ValueError`` for an unknown built-in class before this sentinel was
    introduced. Keeping that base means existing ``except ValueError``
    handlers continue to catch the error, while code that wants to skip
    cloud-only evaluators can catch this narrower type.
    """
) from exc return _instantiate_evaluator_symbol( @@ -691,12 +733,23 @@ def _build_foundry_evaluator_runtimes( ) score_keys = score_keys_raw - evaluator_callable = _load_foundry_evaluator_callable( - evaluator_name=evaluator.name, - evaluator_config=config, - fallback_endpoint=fallback_endpoint, - fallback_deployment=fallback_deployment, - ) + try: + evaluator_callable = _load_foundry_evaluator_callable( + evaluator_name=evaluator.name, + evaluator_config=config, + fallback_endpoint=fallback_endpoint, + fallback_deployment=fallback_deployment, + ) + except _CloudOnlyEvaluatorError: + logger.warning( + "Skipping evaluator '%s' — not available in the local " + "azure-ai-evaluation SDK. This evaluator is only supported " + "via Foundry Cloud Evaluation (hosting: foundry, " + "execution_mode: remote). It will be ignored for this " + "local run.", + evaluator.name, + ) + continue runtimes.append( FoundryEvaluatorRuntime( diff --git a/src/agentops/backends/local_adapter_backend.py b/src/agentops/backends/local_adapter_backend.py index 9eae1b62..1636e97a 100644 --- a/src/agentops/backends/local_adapter_backend.py +++ b/src/agentops/backends/local_adapter_backend.py @@ -222,6 +222,7 @@ def execute(self, context: BackendRunContext) -> BackendExecutionResult: prediction_text = _normalize_text( result.get("response", "") ) + returned_tool_calls = result.get("tool_calls") set_agent_invoke_result(invoke_span) except Exception as exc: # noqa: BLE001 stderr_lines.append(f"row={index} error={exc!s}") @@ -230,6 +231,7 @@ def execute(self, context: BackendRunContext) -> BackendExecutionResult: continue else: # --- Subprocess mode --- + assert adapter_command is not None adapter_input = json.dumps( {"input": prompt_text, "expected": expected_text, **row} ) @@ -267,6 +269,7 @@ def execute(self, context: BackendRunContext) -> BackendExecutionResult: prediction_text = _normalize_text( adapter_output.get("response", "") ) + returned_tool_calls = adapter_output.get("tool_calls") 
set_agent_invoke_result(invoke_span) except subprocess.TimeoutExpired: stderr_lines.append(f"row={index} error=adapter timeout") @@ -333,6 +336,7 @@ def execute(self, context: BackendRunContext) -> BackendExecutionResult: "input": prompt_text, "response": prediction_text, "context": row.get("context"), + "tool_calls": returned_tool_calls, "metrics": row_metric_entries, } ) diff --git a/src/agentops/cli/app.py b/src/agentops/cli/app.py index 693ffa0f..478536c7 100644 --- a/src/agentops/cli/app.py +++ b/src/agentops/cli/app.py @@ -491,6 +491,17 @@ def cmd_skills_install( help="Target platform(s): copilot, claude.", ), ] = None, + from_github: Annotated[ + str | None, + typer.Option( + "--from", + help=( + "Install a community skill from GitHub. " + "Format: org/repo or github:org/repo[@ref]. " + "Example: --from donlee/pptx-designer" + ), + ), + ] = None, force: bool = typer.Option( False, "--force", @@ -507,12 +518,18 @@ def cmd_skills_install( help="Target repository root directory.", ), ) -> None: - """Install AgentOps coding agent skills into the target project.""" - from agentops.services.skills import install_skills + """Install AgentOps coding agent skills into the target project. + + Use --from to install a community skill from GitHub: + agentops skills install --from donlee/pptx-designer + + agentops skills install --from github:org/repo@v1.0 + """ log.debug( - "cmd_skills_install called platform=%s force=%s prompt=%s dir=%s", + "cmd_skills_install called platform=%s from=%s force=%s prompt=%s dir=%s", platform, + from_github, force, prompt, directory, @@ -524,6 +541,31 @@ def cmd_skills_install( typer.echo("No platforms selected. 
Skipping skill installation.") return + if from_github: + # GitHub-based skill installation + from agentops.services.skills import install_github_skill + + typer.echo(f"Installing skill from GitHub: {from_github}") + try: + result = install_github_skill( + source=from_github, + directory=directory, + platforms=resolved_platforms, + force=True, + ) + except ValueError as exc: + typer.echo(f"Error: {exc}", err=True) + raise typer.Exit(code=1) from exc + except Exception as exc: + typer.echo(f"Error: failed to install skill: {exc}", err=True) + raise typer.Exit(code=1) from exc + + _print_skills_result(result) + return + + # Bundled skills installation + from agentops.services.skills import install_skills + try: result = install_skills( directory=directory, platforms=resolved_platforms, force=True diff --git a/src/agentops/services/initializer.py b/src/agentops/services/initializer.py index 28e3d402..5f6e3559 100644 --- a/src/agentops/services/initializer.py +++ b/src/agentops/services/initializer.py @@ -23,11 +23,14 @@ class InitResult: "run.yaml", "run-rag.yaml", "run-agent.yaml", + "run-agent-local.yaml", "run-http-model.yaml", "run-http-rag.yaml", "run-http-agent-tools.yaml", "run-callable.yaml", "callable_adapter.py", + "agent_framework_adapter.py", + "multi_agent_workflow.py", ".gitignore", "bundles/model_quality_baseline.yaml", "bundles/rag_quality_baseline.yaml", diff --git a/src/agentops/services/runner.py b/src/agentops/services/runner.py index 1124cbcf..18319af9 100644 --- a/src/agentops/services/runner.py +++ b/src/agentops/services/runner.py @@ -3,6 +3,7 @@ from __future__ import annotations import json +import logging import shutil from dataclasses import dataclass from datetime import datetime @@ -42,6 +43,8 @@ shutdown as shutdown_tracing, ) +logger = logging.getLogger(__name__) + @dataclass(frozen=True) class EvalRunServiceResult: @@ -218,9 +221,9 @@ def _evaluate_item_thresholds( threshold_results: list[ItemThresholdEvaluationResult] = [] for rule in 
threshold_rules: if rule.evaluator not in row_values: - raise ValueError( - f"Missing evaluator score '{rule.evaluator}' for row {row.row_index}" - ) + # Evaluator may be cloud-only and was skipped during local + # execution — silently skip its threshold check. + continue threshold_results.append( _evaluate_threshold_against_value( @@ -266,8 +269,12 @@ def _validate_enabled_evaluators_scored( missing = [name for name in evaluator_names if name not in scored_names] if missing: - raise ValueError( - "Missing scores for enabled evaluators: " + ", ".join(sorted(missing)) + logger.warning( + "Some enabled evaluators did not produce scores and will be " + "excluded from threshold checks: %s. These evaluators may " + "only be available via Foundry Cloud Evaluation " + "(hosting: foundry, execution_mode: remote).", + ", ".join(sorted(missing)), ) @@ -291,6 +298,11 @@ def _summarize_thresholds_from_items( ): rule_results.append(threshold_result) + # Skip threshold rules for evaluators that produced no scores + # (e.g., cloud-only evaluators skipped during local execution). 
# Allowed sub-directories within a skill folder (agentskills.io spec).
_ALLOWED_SKILL_DIRS = {"references", "scripts", "assets"}

# Directories skipped by default for security (opt-in only).
_RESTRICTED_DIRS = {"scripts"}

# Accepts "org/repo", "github:org/repo" and an optional "@ref" suffix.
# The groups must be named: _parse_github_ref reads them by name.
_GITHUB_REF_RE = re.compile(
    r"^(?:github:)?"
    r"(?P<owner>[A-Za-z0-9._-]+)"
    r"/(?P<repo>[A-Za-z0-9._-]+)"
    r"(?:@(?P<ref>[A-Za-z0-9._/-]+))?$"
)

_PROVENANCE_FILE = ".installed-from.json"

# YAML block-scalar headers ("description: >-"); the text follows on the
# indented continuation lines, so the header itself carries no value.
_BLOCK_SCALAR_INDICATORS = frozenset({">", "|", ">-", ">+", "|-", "|+"})

# Lowercase alphanumeric words joined by single hyphens.
_SKILL_NAME_RE = re.compile(r"[a-z0-9]+(?:-[a-z0-9]+)*")


@dataclass
class GitHubSkillRef:
    """Parsed GitHub skill reference."""

    owner: str
    repo: str
    ref: str  # branch, tag, or commit SHA


def _parse_github_ref(source: str) -> GitHubSkillRef:
    """Parse ``github:org/repo@ref`` or ``org/repo`` into components.

    Surrounding whitespace is ignored and the ref defaults to ``main``.

    Raises:
        ValueError: If *source* does not match the expected format, or if
            the owner, repo, or any ``/``-separated ref segment is empty,
            ``.`` or ``..``.
    """
    error = ValueError(
        f"Invalid GitHub skill reference: '{source}'. "
        "Expected format: github:org/repo or org/repo[@ref]"
    )
    m = _GITHUB_REF_RE.match(source.strip())
    if not m:
        raise error

    owner = m.group("owner")
    repo = m.group("repo")
    ref = m.group("ref") or "main"

    # These values are interpolated into a URL path, so dot segments and
    # empty segments must not get through the character-class check alone.
    if any(part in ("", ".", "..") for part in (owner, repo, *ref.split("/"))):
        raise error

    return GitHubSkillRef(owner=owner, repo=repo, ref=ref)


def _validate_skill_name(name: str) -> str:
    """Validate a skill name taken from SKILL.md frontmatter.

    The name is later used as a directory/file name. The accepted pattern
    contains no dots, slashes or backslashes, so a name that passes cannot
    express path traversal.

    Returns:
        The unchanged *name* when it is valid.

    Raises:
        ValueError: If the name is empty or does not match the pattern.
    """
    if not name or not _SKILL_NAME_RE.fullmatch(name):
        raise ValueError(
            f"Invalid skill name: '{name}'. "
            "Must be lowercase alphanumeric with single hyphens, "
            "e.g. 'pptx-designer'."
        )
    return name


def _parse_skill_frontmatter(content: str) -> dict[str, str]:
    """Extract YAML frontmatter fields from a SKILL.md file.

    Uses simple line parsing to avoid a YAML dependency in this module:
    ``key: value`` lines start a field, and lines indented with a space are
    folded into the preceding field, joined by single spaces.

    Returns:
        A dict with at least ``name`` and ``description`` keys.

    Raises:
        ValueError: If the frontmatter is missing, unclosed, or lacks the
            ``name`` or ``description`` field.
    """
    # A UTF-8 byte-order mark survives a plain "utf-8" decode and would
    # otherwise hide the opening delimiter.
    content = content.lstrip("\ufeff")
    if not content.startswith("---"):
        raise ValueError("SKILL.md is missing YAML frontmatter (must start with ---).")

    lines = content.split("\n")
    end_idx = next(
        (i for i, line in enumerate(lines[1:], 1) if line.strip() == "---"),
        None,
    )
    if end_idx is None:
        raise ValueError("SKILL.md has unclosed YAML frontmatter.")

    meta: dict[str, str] = {}
    current_key = ""
    for line in lines[1:end_idx]:
        if line.startswith(" ") and current_key:
            # Continuation of a multiline value; join without introducing
            # leading or trailing whitespace.
            piece = line.strip()
            if piece:
                meta[current_key] = f"{meta[current_key]} {piece}".strip()
            continue
        if ":" in line:
            key, _, val = line.partition(":")
            key = key.strip()
            val = val.strip()
            if val in _BLOCK_SCALAR_INDICATORS:
                val = ""
            else:
                val = val.strip('"').strip("'").strip()
            if key:
                current_key = key
                meta[key] = val

    if "name" not in meta:
        raise ValueError("SKILL.md frontmatter is missing required 'name' field.")
    if "description" not in meta:
        raise ValueError("SKILL.md frontmatter is missing required 'description' field.")

    return meta
def _fetch_github_tarball(ref: GitHubSkillRef) -> bytes:
    """Download the gzip tarball of a GitHub repository at a given ref.

    The request goes to the GitHub REST API. When ``GITHUB_TOKEN`` (or,
    failing that, ``GH_TOKEN``) is set in the environment it is sent as a
    bearer token.

    Returns:
        The raw response body.

    Raises:
        ValueError: On any HTTP error status or network failure; the
            original ``urllib`` exception is chained as the cause.
    """
    import os

    api_token = os.environ.get("GITHUB_TOKEN") or os.environ.get("GH_TOKEN")
    request_headers: dict[str, str] = {"Accept": "application/vnd.github+json"}
    if api_token:
        request_headers["Authorization"] = f"Bearer {api_token}"

    request = urllib.request.Request(
        f"https://api.github.com/repos/{ref.owner}/{ref.repo}/tarball/{ref.ref}",
        headers=request_headers,
    )

    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            return response.read()
    except urllib.error.HTTPError as http_err:
        # HTTPError is a URLError subclass, so it has to be handled first.
        status = http_err.code
        if status == 404:
            detail = f"GitHub repository not found: {ref.owner}/{ref.repo}@{ref.ref}"
        elif status == 403:
            detail = (
                f"GitHub API rate limit or access denied for {ref.owner}/{ref.repo}. "
                "Set GITHUB_TOKEN env var for authenticated access."
            )
        else:
            detail = f"GitHub API error ({status}): {http_err.reason}"
        raise ValueError(detail) from http_err
    except urllib.error.URLError as net_err:
        raise ValueError(
            f"Network error fetching {ref.owner}/{ref.repo}: {net_err}"
        ) from net_err
def _extract_skill_from_tarball(
    tarball: bytes,
    repo_name: str,
) -> tuple[dict[str, str], dict[str, bytes]]:
    """Locate one skill inside a GitHub repository tarball and read its files.

    A skill is identified by its ``SKILL.md``. Only a ``SKILL.md`` at the
    repository root or exactly one directory below it is considered.
    ``{repo_name}/SKILL.md`` wins when present; otherwise there must be
    exactly one candidate.

    Returns:
        ``(frontmatter_metadata, {relative_path: content_bytes})``. Paths
        are relative to the skill directory. Besides ``SKILL.md``, only
        regular files under the allowed sub-directories are included, minus
        the restricted ones; dot-files and dot-directories are ignored.

    Raises:
        ValueError: If no ``SKILL.md`` is found, several candidates exist
            without a preferred one, or the chosen file cannot be read.
    """
    with tarfile.open(fileobj=io.BytesIO(tarball), mode="r:gz") as archive:
        entry_names = archive.getnames()

        # GitHub tarballs have a prefix like "owner-repo-sha/"; the first
        # entry containing a slash reveals it.
        first_nested = next((n for n in entry_names if "/" in n), None)
        root = "" if first_nested is None else first_nested.split("/")[0] + "/"

        def _without_root(entry_name: str) -> str:
            return entry_name[len(root):] if entry_name.startswith(root) else entry_name

        def _is_shallow_skill_md(rel_path: str) -> bool:
            segments = PurePosixPath(rel_path).parts
            return bool(segments) and segments[-1] == "SKILL.md" and len(segments) <= 2

        skill_md_paths = [
            rel for rel in map(_without_root, entry_names) if _is_shallow_skill_md(rel)
        ]
        if not skill_md_paths:
            raise ValueError(
                f"No SKILL.md found in {repo_name}. "
                "The repository must contain a skill directory with a SKILL.md file "
                "(agentskills.io standard)."
            )

        # Prefer the directory named after the repository; otherwise the
        # choice has to be unambiguous.
        selected = next(
            (p for p in skill_md_paths if p.startswith(repo_name + "/")), None
        )
        if selected is None:
            if len(skill_md_paths) > 1:
                dirs = [str(PurePosixPath(p).parent) for p in skill_md_paths]
                raise ValueError(
                    f"Multiple skills found in {repo_name}: {', '.join(dirs)}. "
                    "Use github:org/repo with a repo that contains a single skill."
                )
            selected = skill_md_paths[0]

        parent = str(PurePosixPath(selected).parent)
        skill_dir = "" if parent == "." else parent

        skill_md_path = root + selected
        handle = archive.extractfile(archive.getmember(skill_md_path))
        if handle is None:
            raise ValueError(f"Cannot read {skill_md_path}")
        skill_md_bytes = handle.read()
        frontmatter = _parse_skill_frontmatter(skill_md_bytes.decode("utf-8"))

        base = root + (skill_dir + "/" if skill_dir else "")
        files: dict[str, bytes] = {}
        for entry in archive.getmembers():
            if not (entry.isfile() and entry.name.startswith(base)):
                continue

            rel_path = entry.name[len(base):]
            segments = PurePosixPath(rel_path).parts
            if not segments:
                continue

            # Security: drop traversal segments and anything hidden.
            if any(seg in ("..", "") or seg.startswith(".") for seg in segments):
                continue

            if segments == ("SKILL.md",):
                files[rel_path] = skill_md_bytes
                continue
            if len(segments) == 1:
                continue  # other top-level files are not part of a skill

            top = segments[0]
            if top not in _ALLOWED_SKILL_DIRS or top in _RESTRICTED_DIRS:
                continue  # includes scripts/, which is skipped by default
            handle = archive.extractfile(entry)
            if handle:
                files[rel_path] = handle.read()

    return frontmatter, files
def install_github_skill(
    source: str,
    directory: Path,
    platforms: list[str],
    force: bool = False,
) -> SkillsInstallResult:
    """Install a skill from a GitHub repository.

    Downloads the repo archive, extracts the skill, validates it,
    and installs to platform-specific directories. For every platform
    except ``claude`` a provenance file is written next to the installed
    ``SKILL.md``, but only when at least one skill file was actually
    written, so it never describes an install that was skipped.

    Args:
        source: GitHub reference, e.g. ``github:org/repo``, ``org/repo@v1.0``.
        directory: Root directory of the consumer repository.
        platforms: Platform identifiers (e.g. ``["copilot"]``). Identifiers
            without a platform configuration are ignored.
        force: When True, overwrite existing skill files.

    Returns:
        SkillsInstallResult with paths of created, overwritten, or skipped files.

    Raises:
        ValueError: If the reference or the skill name is invalid, or the
            repository yields no installable files.
    """
    ref = _parse_github_ref(source)
    result = SkillsInstallResult(platforms=list(platforms))
    resolved = directory.resolve()

    # Fetch and extract
    tarball = _fetch_github_tarball(ref)
    metadata, skill_files = _extract_skill_from_tarball(tarball, ref.repo)

    skill_name = _validate_skill_name(metadata["name"])

    if not skill_files:
        raise ValueError(f"No installable files found in {ref.owner}/{ref.repo}.")

    # Install to each platform
    for platform in platforms:
        config = _PLATFORM_CONFIGS.get(platform)
        if not config:
            continue

        target_dir = resolved / config["target_dir"]
        target_root = target_dir.resolve()

        # SKILL.md uses the platform file pattern, e.g.
        # "pptx-designer/SKILL.md"; its parent is where references/assets
        # and the provenance file go.
        skill_md_rel = config["file_pattern"].format(skill_name=skill_name)
        skill_base = str(PurePosixPath(skill_md_rel).parent)
        wrote_any = False

        for relative_path, content_bytes in skill_files.items():
            if relative_path == "SKILL.md":
                dest = target_dir / skill_md_rel
                text_content = _transform_content(
                    content_bytes.decode("utf-8"), platform
                )
                payload = text_content.encode("utf-8")
            else:
                if platform == "claude":
                    continue  # Claude only gets the single .md file
                dest = target_dir / skill_base / relative_path
                payload = content_bytes

            # Security: ensure dest stays under target_dir
            try:
                dest.resolve().relative_to(target_root)
            except ValueError:
                continue  # path traversal — skip silently

            existed = dest.exists()
            if existed and not force:
                result.skipped_files.append(dest)
                continue

            dest.parent.mkdir(parents=True, exist_ok=True)
            dest.write_bytes(payload)
            wrote_any = True

            if existed:
                result.overwritten_files.append(dest)
            else:
                result.created_files.append(dest)

        # Write provenance file
        if platform != "claude" and wrote_any:
            provenance = {
                "source": f"github:{ref.owner}/{ref.repo}",
                "ref": ref.ref,
                "skill_name": skill_name,
                "description": metadata.get("description", ""),
                "files": sorted(skill_files.keys()),
            }
            prov_path = target_dir / skill_base / _PROVENANCE_FILE
            prov_existed = prov_path.exists()
            prov_path.parent.mkdir(parents=True, exist_ok=True)
            prov_path.write_text(
                json.dumps(provenance, indent=2) + "\n", encoding="utf-8"
            )
            # Report the file under the bucket that matches what happened.
            bucket = (
                result.overwritten_files if prov_existed else result.created_files
            )
            if prov_path not in bucket:
                bucket.append(prov_path)

    return result
+ +For multi-agent workflows with routing, use multi_agent_workflow.py. + +Reference: github.com/microsoft/agent-framework/python/samples/ + 03-workflows/_start-here/step2_agents_in_a_workflow.py + +Prerequisites: + pip install agent-framework[foundry] azure-identity + +Environment variables: + AZURE_AI_FOUNDRY_PROJECT_ENDPOINT — Foundry project endpoint + AZURE_OPENAI_DEPLOYMENT — model deployment name + +Usage in run.yaml: + target: + type: agent + hosting: local + execution_mode: local + framework: agent_framework + local: + callable: agent_framework_adapter:run_evaluation +""" +from __future__ import annotations + +import asyncio +import logging +import os +from typing import Any + +from agent_framework import Agent, AgentResponse, tool + +logger = logging.getLogger(__name__) + +PROJECT_ENDPOINT = os.environ.get("AZURE_AI_FOUNDRY_PROJECT_ENDPOINT", "") +MODEL = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "") + +_client = None +_captured_tool_calls: list[dict[str, Any]] = [] + + +def _get_chat_client(): + """Lazily initialize the FoundryChatClient.""" + global _client + if _client is None: + from azure.identity import DefaultAzureCredential + from agent_framework.foundry import FoundryChatClient + + _client = FoundryChatClient( + project_endpoint=PROJECT_ENDPOINT, + model=MODEL, + credential=DefaultAzureCredential(), + ) + return _client + + +# ── Local tool implementations ───────────────────────────────────────── +# Replace these with your agent's actual tools. + + +@tool +def get_weather(city: str) -> str: + """Get current weather for a city""" + _captured_tool_calls.append({"name": "get_weather", "arguments": {"city": city}}) + return f"Current weather in {city}: 55°F, partly cloudy." 
+ + +@tool +def convert_currency(amount: str, from_currency: str, to_currency: str) -> str: + """Convert an amount from one currency to another""" + amt = float(amount) + _captured_tool_calls.append({ + "name": "convert_currency", + "arguments": {"amount": amt, "from_currency": from_currency, "to_currency": to_currency}, + }) + return f"{amt} {from_currency} = {amt * 0.92:.2f} {to_currency}" + + +@tool +def search_news(query: str, max_results: str = "5") -> str: + """Search for recent news articles""" + _captured_tool_calls.append({ + "name": "search_news", + "arguments": {"query": query, "max_results": int(max_results)}, + }) + return f"Found {max_results} articles about '{query}'." + + +ALL_TOOLS = [get_weather, convert_currency, search_news] + + +async def _run_agent(input_text: str) -> dict[str, Any]: + """Run a single agent with local @tool functions.""" + agent = Agent( + client=_get_chat_client(), + name="EvalAgent", + instructions=( + "You are a helpful assistant with tools. " + "Use the appropriate tool to answer the user's query. " + "Always call a tool before responding." + ), + tools=ALL_TOOLS, + ) + + _captured_tool_calls.clear() + result: AgentResponse = await agent.run(input_text) + + response_text = result.text or "" + + return { + "response": response_text.strip(), + "tool_calls": list(_captured_tool_calls), + } + + +def run_evaluation(input_text: str, context: dict) -> dict: + """Callable entry point for AgentOps evaluation. + + Creates a single Agent with local @tool functions using + Microsoft Agent Framework. Tool calls are captured and + returned alongside the response for evaluator scoring. 
+ """ + if not PROJECT_ENDPOINT or not MODEL: + raise ValueError( + "Set AZURE_AI_FOUNDRY_PROJECT_ENDPOINT and AZURE_OPENAI_DEPLOYMENT" + ) + + return asyncio.run(_run_agent(input_text)) diff --git a/src/agentops/templates/bundles/agent_workflow_baseline.yaml b/src/agentops/templates/bundles/agent_workflow_baseline.yaml index 03d27589..ea6e015b 100644 --- a/src/agentops/templates/bundles/agent_workflow_baseline.yaml +++ b/src/agentops/templates/bundles/agent_workflow_baseline.yaml @@ -5,6 +5,15 @@ description: > Measures task completion, tool call accuracy, intent resolution, task adherence, tool selection, and tool input accuracy using AI-assisted evaluators from the Foundry evaluation suite. + + Note: TaskCompletionEvaluator, ToolSelectionEvaluator, and + ToolInputAccuracyEvaluator are only available via Foundry Cloud + Evaluation and will be gracefully skipped in local execution mode. + + Note: TaskAdherenceEvaluator works best with multi-turn conversation + format (list of message dicts with role/content). Single-turn plain + text inputs may produce low scores because the evaluator cannot assess + procedural adherence without conversation context. evaluators: - name: TaskCompletionEvaluator source: foundry diff --git a/src/agentops/templates/callable_adapter.py b/src/agentops/templates/callable_adapter.py index f66c400b..2c7d7549 100644 --- a/src/agentops/templates/callable_adapter.py +++ b/src/agentops/templates/callable_adapter.py @@ -103,6 +103,14 @@ def run_evaluation(input_text: str, context: dict) -> dict: # result = workflow.invoke(input_text) # return {"response": result.output} + # --- Option 4: Agent Framework (Azure AI Foundry agent) --- + # For Agent Framework agents, use the dedicated adapter template instead: + # + # callable: agent_framework_adapter:run_evaluation + # + # Set AZURE_AI_FOUNDRY_PROJECT_ENDPOINT and AZURE_OPENAI_DEPLOYMENT environment variables. + # See agent_framework_adapter.py for details. 
+ # --- Context sanitization (RAG scenarios) --- # If your dataset has a "context" field with raw document content, # clean it before returning: diff --git a/src/agentops/templates/multi_agent_workflow.py b/src/agentops/templates/multi_agent_workflow.py new file mode 100644 index 00000000..f44c6b9a --- /dev/null +++ b/src/agentops/templates/multi_agent_workflow.py @@ -0,0 +1,281 @@ +"""Multi-agent workflow using Microsoft Agent Framework. + +Demonstrates a router-to-specialist pattern following the official +Agent Framework workflow samples (microsoft/agent-framework): + + Router Agent → Coordinator (custom Executor) → Specialist Agent + +The Coordinator examines the Router's output and forwards the original +user query to the correct Specialist Agent. Each specialist has @tool +functions that Agent Framework auto-executes. + +Reference: github.com/microsoft/agent-framework/python/samples/03-workflows/ + +Prerequisites: + pip install agent-framework[foundry] azure-identity + +Environment variables: + AZURE_AI_FOUNDRY_PROJECT_ENDPOINT — Foundry project endpoint + AZURE_OPENAI_DEPLOYMENT — model deployment name (e.g. 
gpt-5.1) + +Usage in run.yaml: + target: + type: agent + hosting: local + execution_mode: local + framework: agent_framework + local: + callable: multi_agent_workflow:run_evaluation +""" +from __future__ import annotations + +import asyncio +import logging +import os +from typing import Any + +from agent_framework import ( + Agent, + AgentExecutor, + AgentExecutorRequest, + AgentExecutorResponse, + AgentResponse, + Executor, + Message, + WorkflowBuilder, + WorkflowContext, + handler, + tool, +) + +logger = logging.getLogger(__name__) + +PROJECT_ENDPOINT = os.environ.get("AZURE_AI_FOUNDRY_PROJECT_ENDPOINT", "") +MODEL = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "") + +_client = None +_captured_tool_calls: list[dict[str, Any]] = [] + + +def _get_chat_client(): + """Lazily initialize the FoundryChatClient.""" + global _client + if _client is None: + from azure.identity import DefaultAzureCredential + from agent_framework.foundry import FoundryChatClient + + _client = FoundryChatClient( + project_endpoint=PROJECT_ENDPOINT, + model=MODEL, + credential=DefaultAzureCredential(), + ) + return _client + + +# ── Tool functions (decorated with @tool for Agent Framework) ────────── + + +@tool +def get_weather(city: str) -> str: + """Get current weather for a city""" + _captured_tool_calls.append({"name": "get_weather", "arguments": {"city": city}}) + return f"Current weather in {city}: 55°F, partly cloudy." 
+ + +@tool +def convert_currency(amount: str, from_currency: str, to_currency: str) -> str: + """Convert an amount from one currency to another""" + amt = float(amount) + _captured_tool_calls.append({ + "name": "convert_currency", + "arguments": {"amount": amt, "from_currency": from_currency, "to_currency": to_currency}, + }) + return f"{amt} {from_currency} = {amt * 0.92:.2f} {to_currency}" + + +@tool +def calculate_compound_interest(principal: str, rate: str, years: str) -> str: + """Calculate compound interest""" + p, r, y = float(principal), float(rate) / 100, int(float(years)) + total = p * ((1 + r) ** y) + interest = total - p + _captured_tool_calls.append({ + "name": "calculate_compound_interest", + "arguments": {"principal": p, "rate": r, "years": y}, + }) + return f"Compound interest: ${interest:,.2f}, total: ${total:,.2f}" + + +@tool +def search_news(query: str, max_results: str = "5") -> str: + """Search for recent news articles""" + _captured_tool_calls.append({ + "name": "search_news", + "arguments": {"query": query, "max_results": int(max_results)}, + }) + return f"Found {max_results} articles about '{query}'." + + +@tool +def search_flights(origin: str, destination: str, date: str) -> str: + """Search for available flights""" + _captured_tool_calls.append({ + "name": "search_flights", + "arguments": {"origin": origin, "destination": destination, "date": date}, + }) + return f"Found 3 flights from {origin} to {destination} on {date}." + + +# ── Coordinator Executor ────────────────────────────────────────────── +# Routes the user query to the correct specialist based on the Router's +# classification. Follows the official Coordinator pattern from +# microsoft/agent-framework samples. 
class RoutingCoordinator(Executor):
    """Routes between Router Agent and Specialist Agents.

    Receives every agent response in the workflow. A response from the
    executor with id ``"router"`` is read as a routing decision and the
    first user message is forwarded to the chosen specialist. A response
    from any other executor is yielded as the workflow output.
    """

    # Routing category -> id of the specialist executor that handles it.
    SPECIALIST_IDS = {
        "weather": "weather_specialist",
        "finance": "finance_specialist",
        "search": "search_specialist",
    }

    def __init__(self) -> None:
        super().__init__(id="coordinator")

    @handler
    async def on_agent_response(
        self,
        response: AgentExecutorResponse,
        ctx: WorkflowContext[AgentExecutorRequest, AgentResponse],
    ) -> None:
        """Handle responses from Router and Specialist agents."""
        if response.executor_id != "router":
            # Specialist response — yield as workflow output
            await ctx.yield_output(response.agent_response)
            return

        # Router response — parse routing decision and forward to specialist.
        # The text may be missing (other call sites guard with `or ""`);
        # an empty decision falls through to the search specialist.
        routing_text = (response.agent_response.text or "").strip().lower()

        if "weather" in routing_text:
            target = self.SPECIALIST_IDS["weather"]
        elif any(k in routing_text for k in ("finance", "currency", "interest")):
            target = self.SPECIALIST_IDS["finance"]
        else:
            target = self.SPECIALIST_IDS["search"]

        logger.info("Coordinator routing to: %s (router said: %s)", target, routing_text)

        # Forward the original user query (first user message) to the specialist
        user_query = next(
            (
                msg.text or ""
                for msg in response.full_conversation
                if msg.role == "user"
            ),
            "",
        )

        await ctx.send_message(
            AgentExecutorRequest(
                messages=[Message("user", contents=[user_query])],
                should_respond=True,
            ),
            target_id=target,
        )
Analyze the user's query and respond " + "with ONLY one word:\n" + "- 'weather' for weather queries\n" + "- 'finance' for currency or interest calculations\n" + "- 'search' for news, flights, or general queries\n" + "Respond with only the category word, nothing else." + ), + )) + + weather = AgentExecutor(Agent( + client=client, + name="weather_specialist", + instructions="Use the get_weather tool to answer weather queries.", + tools=[get_weather], + )) + + finance = AgentExecutor(Agent( + client=client, + name="finance_specialist", + instructions=( + "Use convert_currency or calculate_compound_interest tools as needed." + ), + tools=[convert_currency, calculate_compound_interest], + )) + + search = AgentExecutor(Agent( + client=client, + name="search_specialist", + instructions="Use search_news or search_flights tools as needed.", + tools=[search_news, search_flights], + )) + + coordinator = RoutingCoordinator() + + # Build workflow: Router → Coordinator ↔ Specialists + workflow = ( + WorkflowBuilder(start_executor=router) + # Router output goes to Coordinator + .add_edge(router, coordinator) + # Coordinator can route to any specialist + .add_edge(coordinator, weather) + .add_edge(coordinator, finance) + .add_edge(coordinator, search) + # Specialist output goes back to Coordinator (which yields output) + .add_edge(weather, coordinator) + .add_edge(finance, coordinator) + .add_edge(search, coordinator) + .build() + ) + + return workflow + + +async def _run_workflow(input_text: str) -> dict[str, Any]: + """Run the multi-agent workflow for a single query.""" + workflow = _build_workflow() + + _captured_tool_calls.clear() + events = await workflow.run(input_text) + + # Extract the final response from workflow outputs + response_text = "" + outputs = events.get_outputs() + for output in outputs: + if isinstance(output, AgentResponse) and output.text: + response_text = output.text + + return { + "response": response_text.strip(), + "tool_calls": 
list(_captured_tool_calls), + } + + +def run_evaluation(input_text: str, context: dict) -> dict: + """Multi-agent workflow entry point for AgentOps evaluation. + + Uses Microsoft Agent Framework WorkflowBuilder with: + Router Agent → RoutingCoordinator → Specialist Agents (@tool) + """ + if not PROJECT_ENDPOINT or not MODEL: + raise ValueError( + "Set AZURE_AI_FOUNDRY_PROJECT_ENDPOINT and AZURE_OPENAI_DEPLOYMENT" + ) + + return asyncio.run(_run_workflow(input_text)) diff --git a/src/agentops/templates/run-agent-local.yaml b/src/agentops/templates/run-agent-local.yaml new file mode 100644 index 00000000..915b023a --- /dev/null +++ b/src/agentops/templates/run-agent-local.yaml @@ -0,0 +1,37 @@ +version: 1 + +# Local agent workflow evaluation via callable adapter. +# Evaluates a local Python function that implements a multi-agent workflow. +# +# Two adapter options: +# +# 1. agent_framework_adapter:run_evaluation +# For Azure AI Foundry agents (Agent Framework SDK). +# Requires: AZURE_AI_FOUNDRY_PROJECT_ENDPOINT and AGENT_ID env vars. +# pip install azure-ai-projects azure-identity +# +# 2. callable_adapter:run_evaluation +# For custom agents — HTTP, direct Python, or any callable. +# +# Cloud-only evaluators (TaskCompletionEvaluator, ToolSelectionEvaluator, +# ToolInputAccuracyEvaluator) will be gracefully skipped in local mode. +# Use 'hosting: foundry' with 'execution_mode: remote' to run all evaluators. + +target: + type: agent + hosting: local + execution_mode: local + framework: agent_framework + local: + # Option 1: Agent Framework adapter (Azure AI Foundry agents) + # callable: agent_framework_adapter:run_evaluation + # Option 2: Custom callable adapter (HTTP, direct Python, etc.) 
+ callable: callable_adapter:run_evaluation +bundle: + name: agent_workflow_baseline +dataset: + name: smoke-agent-tools +execution: + timeout_seconds: 300 +output: + write_report: true diff --git a/tests/unit/test_initializer.py b/tests/unit/test_initializer.py index 67357b48..74daab3e 100644 --- a/tests/unit/test_initializer.py +++ b/tests/unit/test_initializer.py @@ -42,7 +42,7 @@ def test_init_creates_expected_files(tmp_path: Path) -> None: assert (tmp_path / ".agentops" / "data" / "smoke-conversational.jsonl").is_file() assert (tmp_path / ".agentops" / "workflows" / "agentops-eval.yml").is_file() - assert len(result.created_files) == 24 + assert len(result.created_files) == 27 assert len(result.overwritten_files) == 0 run_config = load_yaml(tmp_path / ".agentops" / "run.yaml") diff --git a/tests/unit/test_skills.py b/tests/unit/test_skills.py index 4392f077..5b9a4458 100644 --- a/tests/unit/test_skills.py +++ b/tests/unit/test_skills.py @@ -1,12 +1,22 @@ +import io +import json +import tarfile from pathlib import Path +from unittest.mock import patch +import pytest from typer.testing import CliRunner from agentops.cli.app import app from agentops.services.skills import ( _COPILOT_MARKER_END, _COPILOT_MARKER_START, + _extract_skill_from_tarball, + _parse_github_ref, + _parse_skill_frontmatter, + _validate_skill_name, detect_platforms, + install_github_skill, install_skills, register_skills, ) @@ -403,3 +413,347 @@ def test_cli_init_does_not_install_skills_claude(tmp_path: Path) -> None: for rel in _CLAUDE_SKILL_PATHS: assert not (tmp_path / rel).exists(), f"Should not exist after init: {rel}" + + +# --------------------------------------------------------------------------- +# GitHub ref parsing +# --------------------------------------------------------------------------- + + +def test_parse_github_ref_simple() -> None: + ref = _parse_github_ref("donlee/pptx-designer") + assert ref.owner == "donlee" + assert ref.repo == "pptx-designer" + assert ref.ref == 
"main" + + +def test_parse_github_ref_with_prefix() -> None: + ref = _parse_github_ref("github:org/repo") + assert ref.owner == "org" + assert ref.repo == "repo" + assert ref.ref == "main" + + +def test_parse_github_ref_with_version() -> None: + ref = _parse_github_ref("github:org/repo@v1.2.3") + assert ref.owner == "org" + assert ref.repo == "repo" + assert ref.ref == "v1.2.3" + + +def test_parse_github_ref_with_branch() -> None: + ref = _parse_github_ref("org/repo@feature/my-branch") + assert ref.ref == "feature/my-branch" + + +def test_parse_github_ref_invalid() -> None: + with pytest.raises(ValueError, match="Invalid GitHub skill reference"): + _parse_github_ref("not-valid") + + +def test_parse_github_ref_empty() -> None: + with pytest.raises(ValueError, match="Invalid GitHub skill reference"): + _parse_github_ref("") + + +# --------------------------------------------------------------------------- +# Skill name validation +# --------------------------------------------------------------------------- + + +def test_validate_skill_name_valid() -> None: + assert _validate_skill_name("pptx-designer") == "pptx-designer" + assert _validate_skill_name("myskill") == "myskill" + assert _validate_skill_name("my-cool-skill") == "my-cool-skill" + + +def test_validate_skill_name_invalid() -> None: + with pytest.raises(ValueError, match="Invalid skill name"): + _validate_skill_name("My Skill") + + with pytest.raises(ValueError, match="Invalid skill name"): + _validate_skill_name("../traversal") + + with pytest.raises(ValueError, match="Invalid skill name"): + _validate_skill_name("") + + with pytest.raises(ValueError, match="Invalid skill name"): + _validate_skill_name("UPPERCASE") + + +# --------------------------------------------------------------------------- +# Frontmatter parsing +# --------------------------------------------------------------------------- + + +_VALID_FRONTMATTER = """\ +--- +name: test-skill +description: A test skill for unit testing. 
+license: MIT +--- + +# Test Skill + +Instructions here. +""" + + +def test_parse_frontmatter_valid() -> None: + meta = _parse_skill_frontmatter(_VALID_FRONTMATTER) + assert meta["name"] == "test-skill" + assert "test skill" in meta["description"].lower() + + +def test_parse_frontmatter_missing_name() -> None: + content = "---\ndescription: test\n---\n# Body" + with pytest.raises(ValueError, match="missing required 'name'"): + _parse_skill_frontmatter(content) + + +def test_parse_frontmatter_missing_description() -> None: + content = "---\nname: test\n---\n# Body" + with pytest.raises(ValueError, match="missing required 'description'"): + _parse_skill_frontmatter(content) + + +def test_parse_frontmatter_no_frontmatter() -> None: + with pytest.raises(ValueError, match="missing YAML frontmatter"): + _parse_skill_frontmatter("# Just a heading") + + +def test_parse_frontmatter_unclosed() -> None: + with pytest.raises(ValueError, match="unclosed YAML frontmatter"): + _parse_skill_frontmatter("---\nname: test\n") + + +def test_parse_frontmatter_multiline_description() -> None: + content = "---\nname: test-skill\ndescription: >\n A long\n description here.\n---\n# Body" + meta = _parse_skill_frontmatter(content) + assert "long" in meta["description"] + assert "description here" in meta["description"] + + +# --------------------------------------------------------------------------- +# Tarball extraction +# --------------------------------------------------------------------------- + + +def _make_test_tarball(files: dict[str, str], prefix: str = "owner-repo-abc123") -> bytes: + """Create a gzipped tarball with the given files for testing.""" + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode="w:gz") as tar: + for path, content in files.items(): + full_path = f"{prefix}/{path}" + data = content.encode("utf-8") + info = tarfile.TarInfo(name=full_path) + info.size = len(data) + tar.addfile(info, io.BytesIO(data)) + return buf.getvalue() + + +def 
test_extract_skill_from_tarball() -> None: + tarball = _make_test_tarball({ + "my-skill/SKILL.md": _VALID_FRONTMATTER, + "my-skill/references/guide.md": "# Guide\n\nSome content.", + }) + meta, files = _extract_skill_from_tarball(tarball, "my-skill") + assert meta["name"] == "test-skill" + assert "SKILL.md" in files + assert "references/guide.md" in files + + +def test_extract_skill_prefers_repo_named_dir() -> None: + tarball = _make_test_tarball({ + "my-skill/SKILL.md": _VALID_FRONTMATTER, + "other-dir/SKILL.md": _VALID_FRONTMATTER, + }) + meta, files = _extract_skill_from_tarball(tarball, "my-skill") + assert meta["name"] == "test-skill" + + +def test_extract_skill_root_skill_md() -> None: + tarball = _make_test_tarball({ + "SKILL.md": _VALID_FRONTMATTER, + }) + meta, files = _extract_skill_from_tarball(tarball, "some-repo") + assert meta["name"] == "test-skill" + assert "SKILL.md" in files + + +def test_extract_skill_no_skill_md() -> None: + tarball = _make_test_tarball({ + "README.md": "# Hello", + }) + with pytest.raises(ValueError, match="No SKILL.md found"): + _extract_skill_from_tarball(tarball, "some-repo") + + +def test_extract_skill_multiple_ambiguous() -> None: + tarball = _make_test_tarball({ + "skill-a/SKILL.md": _VALID_FRONTMATTER, + "skill-b/SKILL.md": _VALID_FRONTMATTER.replace("test-skill", "other-skill"), + }) + with pytest.raises(ValueError, match="Multiple skills found"): + _extract_skill_from_tarball(tarball, "unrelated-repo") + + +def test_extract_skill_skips_scripts() -> None: + tarball = _make_test_tarball({ + "my-skill/SKILL.md": _VALID_FRONTMATTER, + "my-skill/scripts/run.py": "print('hello')", + "my-skill/references/ref.md": "# Ref", + }) + _, files = _extract_skill_from_tarball(tarball, "my-skill") + assert "references/ref.md" in files + assert "scripts/run.py" not in files # scripts blocked by default + + +def test_extract_skill_blocks_path_traversal() -> None: + tarball = _make_test_tarball({ + "my-skill/SKILL.md": _VALID_FRONTMATTER, 
+ "my-skill/../../../etc/passwd": "root:x:0:0", + }) + _, files = _extract_skill_from_tarball(tarball, "my-skill") + assert all(".." not in p for p in files) + + +def test_extract_skill_blocks_hidden_files() -> None: + tarball = _make_test_tarball({ + "my-skill/SKILL.md": _VALID_FRONTMATTER, + "my-skill/.env": "SECRET=abc", + "my-skill/references/guide.md": "# Guide", + }) + _, files = _extract_skill_from_tarball(tarball, "my-skill") + assert ".env" not in files + assert "references/guide.md" in files + + +# --------------------------------------------------------------------------- +# install_github_skill (with mocked network) +# --------------------------------------------------------------------------- + + +def test_install_github_skill_copilot(tmp_path: Path) -> None: + tarball = _make_test_tarball({ + "pptx-designer/SKILL.md": _VALID_FRONTMATTER, + "pptx-designer/references/setup.md": "# Setup guide", + }) + + with patch( + "agentops.services.skills._fetch_github_tarball", return_value=tarball + ): + result = install_github_skill( + source="donlee/pptx-designer", + directory=tmp_path, + platforms=["copilot"], + force=True, + ) + + # SKILL.md installed + skill_path = tmp_path / ".github/skills/test-skill/SKILL.md" + assert skill_path.exists() + content = skill_path.read_text(encoding="utf-8") + assert content.startswith("---") # frontmatter preserved for copilot + + # Reference file installed + ref_path = tmp_path / ".github/skills/test-skill/references/setup.md" + assert ref_path.exists() + + # Provenance file created + prov_path = tmp_path / ".github/skills/test-skill/.installed-from.json" + assert prov_path.exists() + prov = json.loads(prov_path.read_text()) + assert prov["source"] == "github:donlee/pptx-designer" + assert prov["skill_name"] == "test-skill" + + assert len(result.created_files) >= 2 + + +def test_install_github_skill_claude(tmp_path: Path) -> None: + tarball = _make_test_tarball({ + "pptx-designer/SKILL.md": _VALID_FRONTMATTER, + 
"pptx-designer/references/setup.md": "# Setup guide", + }) + + with patch( + "agentops.services.skills._fetch_github_tarball", return_value=tarball + ): + install_github_skill( + source="donlee/pptx-designer", + directory=tmp_path, + platforms=["claude"], + ) + + # Claude gets a single .md file with frontmatter stripped + skill_path = tmp_path / ".claude/commands/test-skill.md" + assert skill_path.exists() + content = skill_path.read_text(encoding="utf-8") + assert not content.startswith("---") # frontmatter stripped + + # Claude does NOT get reference files + ref_path = tmp_path / ".claude/commands/references/setup.md" + assert not ref_path.exists() + + +def test_install_github_skill_skip_existing(tmp_path: Path) -> None: + tarball = _make_test_tarball({ + "my-skill/SKILL.md": _VALID_FRONTMATTER, + }) + + # Pre-create the file + dest = tmp_path / ".github/skills/test-skill/SKILL.md" + dest.parent.mkdir(parents=True) + dest.write_text("custom content") + + with patch( + "agentops.services.skills._fetch_github_tarball", return_value=tarball + ): + result = install_github_skill( + source="org/my-skill", + directory=tmp_path, + platforms=["copilot"], + force=False, + ) + + assert len(result.skipped_files) >= 1 + assert dest.read_text() == "custom content" + + +# --------------------------------------------------------------------------- +# CLI — agentops skills install --from +# --------------------------------------------------------------------------- + + +def test_cli_skills_install_from_github(tmp_path: Path) -> None: + tarball = _make_test_tarball({ + "pptx-designer/SKILL.md": _VALID_FRONTMATTER, + }) + + with patch( + "agentops.services.skills._fetch_github_tarball", return_value=tarball + ): + result = runner.invoke( + app, + [ + "skills", "install", + "--from", "donlee/pptx-designer", + "--dir", str(tmp_path), + ], + ) + + assert result.exit_code == 0 + assert "Installing skill from GitHub" in result.stdout + assert "created" in result.stdout + + +def 
test_cli_skills_install_from_invalid_ref(tmp_path: Path) -> None: + result = runner.invoke( + app, + [ + "skills", "install", + "--from", "not-valid-ref", + "--dir", str(tmp_path), + ], + ) + assert result.exit_code == 1