Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 61 additions & 8 deletions src/agentops/backends/eval_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,44 @@

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Suppress noisy SDK warnings for single-turn evaluation inputs
# ---------------------------------------------------------------------------

class _ConversationHistoryFilter(logging.Filter):
    """Drop the 'Conversation history could not be parsed' SDK warning.

    Per the surrounding module comments, azure-ai-evaluation emits this
    warning on every single-turn evaluation row because plain-text inputs
    are not in conversation-list format.  It is expected and harmless, so
    records carrying it are rejected; every other record passes through.
    """

    # Substring that identifies the warning to suppress.
    _SUPPRESSED_FRAGMENT = "Conversation history could not be parsed"

    def filter(self, record: logging.LogRecord) -> bool:
        """Return ``False`` for the suppressed warning, ``True`` otherwise.

        A logger-level filter runs inside the caller's logging call, so an
        exception raised here would surface in the code that logged.  If the
        record's message cannot be rendered (for example, mismatched
        %-format arguments), the record is let through unchanged so the
        handler can report the problem the way it would without this filter.
        """
        try:
            message = record.getMessage()
        except Exception:  # noqa: BLE001 - unrenderable record: do not filter
            return True
        return self._SUPPRESSED_FRAGMENT not in message


# Install the filter on every SDK logger known to emit the warning.
# Each evaluator module hands its own logger to
# reformat_conversation_history(), so the emitting modules are listed
# individually alongside the package root logger.
for _sdk_logger_name in (
    "azure.ai.evaluation._common.utils",
    "azure.ai.evaluation._evaluators._task_adherence._task_adherence",
    "azure.ai.evaluation._evaluators._intent_resolution._intent_resolution",
    "azure.ai.evaluation._evaluators._task_completion._task_completion",
    "azure.ai.evaluation._evaluators._tool_call_accuracy._tool_call_accuracy",
    "azure.ai.evaluation",
):
    # A fresh filter instance is attached to each named logger.
    logging.getLogger(_sdk_logger_name).addFilter(_ConversationHistoryFilter())


# ---------------------------------------------------------------------------
# Cloud-only evaluator sentinel
# ---------------------------------------------------------------------------


class _CloudOnlyEvaluatorError(Exception):
    """Signal that an evaluator can only be run via Foundry Cloud Evaluation."""


# ---------------------------------------------------------------------------
# Credential help (shared by _default_credential and _acquire_token)
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -609,8 +647,12 @@ def _load_foundry_evaluator_callable(
"Install with: pip install azure-ai-evaluation"
) from exc
except AttributeError as exc:
raise ValueError(
f"Unknown built-in Foundry evaluator class: {class_name}"
raise _CloudOnlyEvaluatorError(
f"Evaluator '{class_name}' is not available in the local "
f"azure-ai-evaluation SDK. It may only be available via "
f"Foundry Cloud Evaluation (builtin.{_to_builtin_evaluator_name(class_name)}). "
f"Use 'hosting: foundry' with 'execution_mode: remote' to "
f"run this evaluator, or disable it for local runs."
) from exc

return _instantiate_evaluator_symbol(
Expand Down Expand Up @@ -691,12 +733,23 @@ def _build_foundry_evaluator_runtimes(
)
score_keys = score_keys_raw

evaluator_callable = _load_foundry_evaluator_callable(
evaluator_name=evaluator.name,
evaluator_config=config,
fallback_endpoint=fallback_endpoint,
fallback_deployment=fallback_deployment,
)
try:
evaluator_callable = _load_foundry_evaluator_callable(
evaluator_name=evaluator.name,
evaluator_config=config,
fallback_endpoint=fallback_endpoint,
fallback_deployment=fallback_deployment,
)
except _CloudOnlyEvaluatorError:
logger.warning(
"Skipping evaluator '%s' — not available in the local "
"azure-ai-evaluation SDK. This evaluator is only supported "
"via Foundry Cloud Evaluation (hosting: foundry, "
"execution_mode: remote). It will be ignored for this "
"local run.",
evaluator.name,
)
continue

runtimes.append(
FoundryEvaluatorRuntime(
Expand Down
4 changes: 4 additions & 0 deletions src/agentops/backends/local_adapter_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ def execute(self, context: BackendRunContext) -> BackendExecutionResult:
prediction_text = _normalize_text(
result.get("response", "")
)
returned_tool_calls = result.get("tool_calls")
set_agent_invoke_result(invoke_span)
except Exception as exc: # noqa: BLE001
stderr_lines.append(f"row={index} error={exc!s}")
Expand All @@ -230,6 +231,7 @@ def execute(self, context: BackendRunContext) -> BackendExecutionResult:
continue
else:
# --- Subprocess mode ---
assert adapter_command is not None
adapter_input = json.dumps(
{"input": prompt_text, "expected": expected_text, **row}
)
Expand Down Expand Up @@ -267,6 +269,7 @@ def execute(self, context: BackendRunContext) -> BackendExecutionResult:
prediction_text = _normalize_text(
adapter_output.get("response", "")
)
returned_tool_calls = adapter_output.get("tool_calls")
set_agent_invoke_result(invoke_span)
except subprocess.TimeoutExpired:
stderr_lines.append(f"row={index} error=adapter timeout")
Expand Down Expand Up @@ -333,6 +336,7 @@ def execute(self, context: BackendRunContext) -> BackendExecutionResult:
"input": prompt_text,
"response": prediction_text,
"context": row.get("context"),
"tool_calls": returned_tool_calls,
"metrics": row_metric_entries,
}
)
Expand Down
48 changes: 45 additions & 3 deletions src/agentops/cli/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,17 @@ def cmd_skills_install(
help="Target platform(s): copilot, claude.",
),
] = None,
from_github: Annotated[
str | None,
typer.Option(
"--from",
help=(
"Install a community skill from GitHub. "
"Format: org/repo or github:org/repo[@ref]. "
"Example: --from donlee/pptx-designer"
),
),
] = None,
force: bool = typer.Option(
False,
"--force",
Expand All @@ -507,12 +518,18 @@ def cmd_skills_install(
help="Target repository root directory.",
),
) -> None:
"""Install AgentOps coding agent skills into the target project."""
from agentops.services.skills import install_skills
"""Install AgentOps coding agent skills into the target project.

Use --from to install a community skill from GitHub:

agentops skills install --from donlee/pptx-designer

agentops skills install --from github:org/repo@v1.0
"""
log.debug(
"cmd_skills_install called platform=%s force=%s prompt=%s dir=%s",
"cmd_skills_install called platform=%s from=%s force=%s prompt=%s dir=%s",
platform,
from_github,
force,
prompt,
directory,
Expand All @@ -524,6 +541,31 @@ def cmd_skills_install(
typer.echo("No platforms selected. Skipping skill installation.")
return

if from_github:
# GitHub-based skill installation
from agentops.services.skills import install_github_skill

typer.echo(f"Installing skill from GitHub: {from_github}")
try:
result = install_github_skill(
source=from_github,
directory=directory,
platforms=resolved_platforms,
force=True,
)
except ValueError as exc:
typer.echo(f"Error: {exc}", err=True)
raise typer.Exit(code=1) from exc
except Exception as exc:
typer.echo(f"Error: failed to install skill: {exc}", err=True)
raise typer.Exit(code=1) from exc

_print_skills_result(result)
return

# Bundled skills installation
from agentops.services.skills import install_skills

try:
result = install_skills(
directory=directory, platforms=resolved_platforms, force=True
Expand Down
3 changes: 3 additions & 0 deletions src/agentops/services/initializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,14 @@ class InitResult:
"run.yaml",
"run-rag.yaml",
"run-agent.yaml",
"run-agent-local.yaml",
"run-http-model.yaml",
"run-http-rag.yaml",
"run-http-agent-tools.yaml",
"run-callable.yaml",
"callable_adapter.py",
"agent_framework_adapter.py",
"multi_agent_workflow.py",
".gitignore",
"bundles/model_quality_baseline.yaml",
"bundles/rag_quality_baseline.yaml",
Expand Down
22 changes: 17 additions & 5 deletions src/agentops/services/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import json
import logging
import shutil
from dataclasses import dataclass
from datetime import datetime
Expand Down Expand Up @@ -42,6 +43,8 @@
shutdown as shutdown_tracing,
)

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class EvalRunServiceResult:
Expand Down Expand Up @@ -218,9 +221,9 @@ def _evaluate_item_thresholds(
threshold_results: list[ItemThresholdEvaluationResult] = []
for rule in threshold_rules:
if rule.evaluator not in row_values:
raise ValueError(
f"Missing evaluator score '{rule.evaluator}' for row {row.row_index}"
)
# Evaluator may be cloud-only and was skipped during local
# execution — silently skip its threshold check.
continue

threshold_results.append(
_evaluate_threshold_against_value(
Expand Down Expand Up @@ -266,8 +269,12 @@ def _validate_enabled_evaluators_scored(

missing = [name for name in evaluator_names if name not in scored_names]
if missing:
raise ValueError(
"Missing scores for enabled evaluators: " + ", ".join(sorted(missing))
logger.warning(
"Some enabled evaluators did not produce scores and will be "
"excluded from threshold checks: %s. These evaluators may "
"only be available via Foundry Cloud Evaluation "
"(hosting: foundry, execution_mode: remote).",
", ".join(sorted(missing)),
)


Expand All @@ -291,6 +298,11 @@ def _summarize_thresholds_from_items(
):
rule_results.append(threshold_result)

# Skip threshold rules for evaluators that produced no scores
# (e.g., cloud-only evaluators skipped during local execution).
if not rule_results:
continue

passed_items = sum(1 for result in rule_results if result.passed)
passed = bool(rule_results) and passed_items == len(rule_results)

Expand Down
Loading