Azure · placerda · Apr 27, 2026 · Apr 23, 2026 · Apr 23, 2026 · Apr 23, 2026
diff --git a/src/agentops/backends/eval_engine.py b/src/agentops/backends/eval_engine.py
@@ -23,6 +23,44 @@
 
 logger = logging.getLogger(__name__)
 
+
+# ---------------------------------------------------------------------------
+# Suppress noisy SDK warnings for single-turn evaluation inputs
+# ---------------------------------------------------------------------------
+
+class _ConversationHistoryFilter(logging.Filter):
+    """Drop azure-ai-evaluation's 'Conversation history could not be parsed' warning.
+
+    Expected and harmless: it fires on every single-turn evaluation row because
+    plain-text inputs are not conversation lists.  Matches the formatted message.
+    """
+
+    def filter(self, record: logging.LogRecord) -> bool:
+        return "Conversation history could not be parsed" not in record.getMessage()
+
+
+# Apply filter to SDK loggers that emit the warning.
+# Each evaluator module passes its own logger to reformat_conversation_history().
+for _sdk_logger_name in (
+    "azure.ai.evaluation._common.utils",
+    "azure.ai.evaluation._evaluators._task_adherence._task_adherence",
+    "azure.ai.evaluation._evaluators._intent_resolution._intent_resolution",
+    "azure.ai.evaluation._evaluators._task_completion._task_completion",
+    "azure.ai.evaluation._evaluators._tool_call_accuracy._tool_call_accuracy",
+    "azure.ai.evaluation",  # covers only records logged on the package logger itself
+):
+    logging.getLogger(_sdk_logger_name).addFilter(_ConversationHistoryFilter())
+
+
+# ---------------------------------------------------------------------------
+# Cloud-only evaluator sentinel
+# ---------------------------------------------------------------------------
+
+
+class _CloudOnlyEvaluatorError(Exception):
+    """Raised when an evaluator is missing from the local SDK (possibly cloud-only)."""
+
+
 # ---------------------------------------------------------------------------
 # Credential help (shared by _default_credential and _acquire_token)
 # ---------------------------------------------------------------------------
@@ -609,8 +647,12 @@ def _load_foundry_evaluator_callable(
                 "Install with: pip install azure-ai-evaluation"
             ) from exc
         except AttributeError as exc:
-            raise ValueError(
-                f"Unknown built-in Foundry evaluator class: {class_name}"
+            raise _CloudOnlyEvaluatorError(
+                f"Evaluator '{class_name}' is not available in the local "
+                f"azure-ai-evaluation SDK. It may only be available via "
+                f"Foundry Cloud Evaluation (builtin.{_to_builtin_evaluator_name(class_name)}). "
+                f"Use 'hosting: foundry' with 'execution_mode: remote' to "
+                f"run this evaluator, or disable it for local runs."
             ) from exc
 
         return _instantiate_evaluator_symbol(
@@ -691,12 +733,23 @@ def _build_foundry_evaluator_runtimes(
                 )
             score_keys = score_keys_raw
 
-        evaluator_callable = _load_foundry_evaluator_callable(
-            evaluator_name=evaluator.name,
-            evaluator_config=config,
-            fallback_endpoint=fallback_endpoint,
-            fallback_deployment=fallback_deployment,
-        )
+        try:
+            evaluator_callable = _load_foundry_evaluator_callable(
+                evaluator_name=evaluator.name,
+                evaluator_config=config,
+                fallback_endpoint=fallback_endpoint,
+                fallback_deployment=fallback_deployment,
+            )
+        except _CloudOnlyEvaluatorError:
+            logger.warning(
+                "Skipping evaluator '%s' — not available in the local "
+                "azure-ai-evaluation SDK. This evaluator is only supported "
+                "via Foundry Cloud Evaluation (hosting: foundry, "
+                "execution_mode: remote). It will be ignored for this "
+                "local run.",
+                evaluator.name,
+            )
+            continue
 
         runtimes.append(
             FoundryEvaluatorRuntime(

diff --git a/src/agentops/backends/local_adapter_backend.py b/src/agentops/backends/local_adapter_backend.py
@@ -222,6 +222,7 @@ def execute(self, context: BackendRunContext) -> BackendExecutionResult:
                             prediction_text = _normalize_text(
                                 result.get("response", "")
                             )
+                            returned_tool_calls = result.get("tool_calls")
                             set_agent_invoke_result(invoke_span)
                     except Exception as exc:  # noqa: BLE001
                         stderr_lines.append(f"row={index} error={exc!s}")
@@ -230,6 +231,7 @@ def execute(self, context: BackendRunContext) -> BackendExecutionResult:
                         continue
                 else:
                     # --- Subprocess mode ---
+                    assert adapter_command is not None
                     adapter_input = json.dumps(
                         {"input": prompt_text, "expected": expected_text, **row}
                     )
@@ -267,6 +269,7 @@ def execute(self, context: BackendRunContext) -> BackendExecutionResult:
                             prediction_text = _normalize_text(
                                 adapter_output.get("response", "")
                             )
+                            returned_tool_calls = adapter_output.get("tool_calls")
                             set_agent_invoke_result(invoke_span)
                     except subprocess.TimeoutExpired:
                         stderr_lines.append(f"row={index} error=adapter timeout")
@@ -333,6 +336,7 @@ def execute(self, context: BackendRunContext) -> BackendExecutionResult:
                         "input": prompt_text,
                         "response": prediction_text,
                         "context": row.get("context"),
+                        "tool_calls": returned_tool_calls,
                         "metrics": row_metric_entries,
                     }
                 )

diff --git a/src/agentops/cli/app.py b/src/agentops/cli/app.py
@@ -491,6 +491,17 @@ def cmd_skills_install(
             help="Target platform(s): copilot, claude.",
         ),
     ] = None,
+    from_github: Annotated[
+        str | None,
+        typer.Option(
+            "--from",
+            help=(
+                "Install a community skill from GitHub. "
+                "Format: org/repo or github:org/repo[@ref]. "
+                "Example: --from donlee/pptx-designer"
+            ),
+        ),
+    ] = None,
     force: bool = typer.Option(
         False,
         "--force",
@@ -507,12 +518,18 @@ def cmd_skills_install(
         help="Target repository root directory.",
     ),
 ) -> None:
-    """Install AgentOps coding agent skills into the target project."""
-    from agentops.services.skills import install_skills
+    """Install AgentOps coding agent skills into the target project.
+
+    Use --from to install a community skill from GitHub:
 
+        agentops skills install --from donlee/pptx-designer
+
+        agentops skills install --from github:org/repo@v1.0
+    """
     log.debug(
-        "cmd_skills_install called platform=%s force=%s prompt=%s dir=%s",
+        "cmd_skills_install called platform=%s from=%s force=%s prompt=%s dir=%s",
         platform,
+        from_github,
         force,
         prompt,
         directory,
@@ -524,6 +541,31 @@ def cmd_skills_install(
         typer.echo("No platforms selected. Skipping skill installation.")
         return
 
+    if from_github:
+        # GitHub-based skill installation
+        from agentops.services.skills import install_github_skill
+
+        typer.echo(f"Installing skill from GitHub: {from_github}")
+        try:
+            result = install_github_skill(
+                source=from_github,
+                directory=directory,
+                platforms=resolved_platforms,
+                force=True,
+            )
+        except ValueError as exc:
+            typer.echo(f"Error: {exc}", err=True)
+            raise typer.Exit(code=1) from exc
+        except Exception as exc:
+            typer.echo(f"Error: failed to install skill: {exc}", err=True)
+            raise typer.Exit(code=1) from exc
+
+        _print_skills_result(result)
+        return
+
+    # Bundled skills installation
+    from agentops.services.skills import install_skills
+
     try:
         result = install_skills(
             directory=directory, platforms=resolved_platforms, force=True

diff --git a/src/agentops/services/initializer.py b/src/agentops/services/initializer.py
@@ -23,11 +23,14 @@ class InitResult:
     "run.yaml",
     "run-rag.yaml",
     "run-agent.yaml",
+    "run-agent-local.yaml",
     "run-http-model.yaml",
     "run-http-rag.yaml",
     "run-http-agent-tools.yaml",
     "run-callable.yaml",
     "callable_adapter.py",
+    "agent_framework_adapter.py",
+    "multi_agent_workflow.py",
     ".gitignore",
     "bundles/model_quality_baseline.yaml",
     "bundles/rag_quality_baseline.yaml",

diff --git a/src/agentops/services/runner.py b/src/agentops/services/runner.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import json
+import logging
 import shutil
 from dataclasses import dataclass
 from datetime import datetime
@@ -42,6 +43,8 @@
     shutdown as shutdown_tracing,
 )
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass(frozen=True)
 class EvalRunServiceResult:
@@ -218,9 +221,9 @@ def _evaluate_item_thresholds(
         threshold_results: list[ItemThresholdEvaluationResult] = []
         for rule in threshold_rules:
             if rule.evaluator not in row_values:
-                raise ValueError(
-                    f"Missing evaluator score '{rule.evaluator}' for row {row.row_index}"
-                )
+                # Evaluator may be cloud-only and was skipped during local
+                # execution — silently skip its threshold check.
+                continue
 
             threshold_results.append(
                 _evaluate_threshold_against_value(
@@ -266,8 +269,12 @@ def _validate_enabled_evaluators_scored(
 
     missing = [name for name in evaluator_names if name not in scored_names]
     if missing:
-        raise ValueError(
-            "Missing scores for enabled evaluators: " + ", ".join(sorted(missing))
+        logger.warning(
+            "Some enabled evaluators did not produce scores and will be "
+            "excluded from threshold checks: %s. These evaluators may "
+            "only be available via Foundry Cloud Evaluation "
+            "(hosting: foundry, execution_mode: remote).",
+            ", ".join(sorted(missing)),
         )
 
 
@@ -291,6 +298,11 @@ def _summarize_thresholds_from_items(
                 ):
                     rule_results.append(threshold_result)
 
+        # Skip threshold rules for evaluators that produced no scores
+        # (e.g., cloud-only evaluators skipped during local execution).
+        if not rule_results:
+            continue
+
         passed_items = sum(1 for result in rule_results if result.passed)
         passed = bool(rule_results) and passed_items == len(rule_results)