From 852beea72171a59cd507532df74299edda8337e8 Mon Sep 17 00:00:00 2001
From: Nicolas Borges <nickdb@amazon.com>
Date: Thu, 23 Apr 2026 14:15:33 -0400
Subject: [PATCH] feat: add evaluations client passthrough and tests

---
 src/bedrock_agentcore/evaluation/client.py    | 206 ++++++++++++++++++
 .../evaluation/test_client_passthrough.py     | 155 +++++++++++++
 .../evaluation/test_evaluation_passthrough.py | 197 +++++++++++++++++
 3 files changed, 558 insertions(+)
 create mode 100644 tests/bedrock_agentcore/evaluation/test_client_passthrough.py
 create mode 100644 tests_integ/evaluation/test_evaluation_passthrough.py

diff --git a/src/bedrock_agentcore/evaluation/client.py b/src/bedrock_agentcore/evaluation/client.py
index 9f8a32e6..d07c5c51 100644
--- a/src/bedrock_agentcore/evaluation/client.py
+++ b/src/bedrock_agentcore/evaluation/client.py
@@ -8,6 +8,9 @@
 from botocore.config import Config
 from pydantic import BaseModel
 
+from bedrock_agentcore._utils.config import WaitConfig
+from bedrock_agentcore._utils.polling import wait_until, wait_until_deleted
+from bedrock_agentcore._utils.snake_case import accept_snake_case_kwargs, convert_kwargs
 from bedrock_agentcore._utils.user_agent import build_user_agent_suffix
 from bedrock_agentcore.evaluation.agent_span_collector import CloudWatchAgentSpanCollector
 
@@ -17,6 +20,9 @@
 QUERY_TIMEOUT_SECONDS = 60
 POLL_INTERVAL_SECONDS = 2
 
+_EVALUATOR_FAILED_STATUSES = {"CREATE_FAILED", "UPDATE_FAILED"}  # wait_until failure statuses: evaluators
+_ONLINE_EVAL_FAILED_STATUSES = {"CREATE_FAILED", "UPDATE_FAILED"}  # wait_until failure statuses: online configs
+
 
 class ReferenceInputs(BaseModel):
     """Ground truth inputs for evaluation.
@@ -92,6 +98,46 @@ def __init__(
 
         logger.info("Initialized EvaluationClient in region %s", self.region_name)
 
+    # Pass-through: __getattr__ forwards only the operation names listed here
+    # -------------------------------------------------------------------------
+    _ALLOWED_DP_METHODS = {  # resolved against self._dp_client
+        "evaluate",
+    }
+
+    _ALLOWED_CP_METHODS = {  # resolved against self._cp_client
+        # Evaluator CRUD
+        "create_evaluator",
+        "get_evaluator",
+        "list_evaluators",
+        "update_evaluator",
+        "delete_evaluator",
+        # Online evaluation config CRUD
+        "create_online_evaluation_config",
+        "get_online_evaluation_config",
+        "list_online_evaluation_configs",
+        "update_online_evaluation_config",
+        "delete_online_evaluation_config",
+    }
+
+    def __getattr__(self, name: str):
+        """Return a snake_case-accepting wrapper for an allowlisted boto3 client operation."""
+        # Data plane names are tried first, then control plane names.
+        if name in self._ALLOWED_DP_METHODS and hasattr(self._dp_client, name):
+            dp_operation = getattr(self._dp_client, name)
+            logger.debug("Forwarding method '%s' to _dp_client", name)
+            return accept_snake_case_kwargs(dp_operation)
+        if name in self._ALLOWED_CP_METHODS and hasattr(self._cp_client, name):
+            cp_operation = getattr(self._cp_client, name)
+            logger.debug("Forwarding method '%s' to _cp_client", name)
+            return accept_snake_case_kwargs(cp_operation)
+
+        raise AttributeError(
+            f"'{type(self).__name__}' object has no attribute '{name}'. "
+            "Method not found on data plane or control plane client. "
+            "Available methods can be found in the boto3 documentation for "
+            "'bedrock-agentcore' and 'bedrock-agentcore-control' services."
+        )
+
     def run(
         self,
         evaluator_ids: List[str],
@@ -349,3 +395,163 @@ def _build_reference_inputs(
                     )
 
         return result
+
+    # *_and_wait methods
+    # -------------------------------------------------------------------------
+    def create_evaluator_and_wait(
+        self,
+        wait_config: Optional[WaitConfig] = None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Call create_evaluator, then poll get_evaluator until the status is ACTIVE.
+
+        Args:
+            wait_config: Polling settings passed on to wait_until; optional.
+            **kwargs: create_evaluator request parameters, passed through convert_kwargs.
+
+        Returns:
+            The evaluator details returned by wait_until once the status is ACTIVE.
+
+        Raises:
+            RuntimeError: If the evaluator reaches a failed state.
+            TimeoutError: If the evaluator doesn't become ACTIVE within max_wait.
+        """
+        created = self._cp_client.create_evaluator(**convert_kwargs(kwargs))
+        evaluator_id = created["evaluatorId"]
+
+        def _describe() -> Dict[str, Any]:
+            return self._cp_client.get_evaluator(evaluatorId=evaluator_id)
+
+        # Poll with the ID taken from the create response.
+        return wait_until(_describe, "ACTIVE", _EVALUATOR_FAILED_STATUSES, wait_config)
+
+    def update_evaluator_and_wait(
+        self,
+        wait_config: Optional[WaitConfig] = None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Call update_evaluator, then poll get_evaluator until the status is ACTIVE.
+
+        Args:
+            wait_config: Polling settings passed on to wait_until; optional.
+            **kwargs: update_evaluator request parameters, passed through convert_kwargs.
+
+        Returns:
+            The evaluator details returned by wait_until once the status is ACTIVE.
+
+        Raises:
+            RuntimeError: If the evaluator reaches a failed state.
+            TimeoutError: If the evaluator doesn't become ACTIVE within max_wait.
+        """
+        updated = self._cp_client.update_evaluator(**convert_kwargs(kwargs))
+        evaluator_id = updated["evaluatorId"]
+
+        def _describe() -> Dict[str, Any]:
+            return self._cp_client.get_evaluator(evaluatorId=evaluator_id)
+
+        # Poll with the ID taken from the update response.
+        return wait_until(_describe, "ACTIVE", _EVALUATOR_FAILED_STATUSES, wait_config)
+
+    def create_online_evaluation_config_and_wait(
+        self,
+        wait_config: Optional[WaitConfig] = None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Call create_online_evaluation_config, then poll the config until it is ACTIVE.
+
+        Args:
+            wait_config: Polling settings passed on to wait_until; optional.
+            **kwargs: Request parameters for the create call, passed through convert_kwargs.
+
+        Returns:
+            The config details returned by wait_until once the status is ACTIVE.
+
+        Raises:
+            RuntimeError: If the config reaches a failed state.
+            TimeoutError: If the config doesn't become ACTIVE within max_wait.
+        """
+        created = self._cp_client.create_online_evaluation_config(**convert_kwargs(kwargs))
+        config_id = created["onlineEvaluationConfigId"]
+
+        # Poll by the returned ID; wait_until gets "failureReason" as its error_field.
+        return wait_until(
+            lambda: self._cp_client.get_online_evaluation_config(onlineEvaluationConfigId=config_id),
+            "ACTIVE",
+            _ONLINE_EVAL_FAILED_STATUSES,
+            wait_config,
+            error_field="failureReason",
+        )
+
+    def update_online_evaluation_config_and_wait(
+        self,
+        wait_config: Optional[WaitConfig] = None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Call update_online_evaluation_config, then poll the config until it is ACTIVE.
+
+        Args:
+            wait_config: Polling settings passed on to wait_until; optional.
+            **kwargs: Request parameters for the update call, passed through convert_kwargs.
+
+        Returns:
+            The config details returned by wait_until once the status is ACTIVE.
+
+        Raises:
+            RuntimeError: If the config reaches a failed state.
+            TimeoutError: If the config doesn't become ACTIVE within max_wait.
+        """
+        updated = self._cp_client.update_online_evaluation_config(**convert_kwargs(kwargs))
+        config_id = updated["onlineEvaluationConfigId"]
+
+        # Poll by the returned ID; wait_until gets "failureReason" as its error_field.
+        return wait_until(
+            lambda: self._cp_client.get_online_evaluation_config(onlineEvaluationConfigId=config_id),
+            "ACTIVE",
+            _ONLINE_EVAL_FAILED_STATUSES,
+            wait_config,
+            error_field="failureReason",
+        )
+
+    def delete_evaluator_and_wait(
+        self,
+        wait_config: Optional[WaitConfig] = None,
+        **kwargs,
+    ) -> None:
+        """Call delete_evaluator, then hand a get_evaluator poller to wait_until_deleted.
+
+        Args:
+            wait_config: Polling settings passed on to wait_until_deleted; optional.
+            **kwargs: delete_evaluator request parameters, passed through convert_kwargs.
+
+        Raises:
+            TimeoutError: If the evaluator isn't deleted within max_wait.
+        """
+        deleted = self._cp_client.delete_evaluator(**convert_kwargs(kwargs))
+        evaluator_id = deleted["evaluatorId"]
+        wait_until_deleted(
+            lambda: self._cp_client.get_evaluator(evaluatorId=evaluator_id),
+            wait_config=wait_config,
+        )
+
+    def delete_online_evaluation_config_and_wait(
+        self,
+        wait_config: Optional[WaitConfig] = None,
+        **kwargs,
+    ) -> None:
+        """Call delete_online_evaluation_config, then wait via wait_until_deleted.
+
+        Args:
+            wait_config: Polling settings passed on to wait_until_deleted; optional.
+            **kwargs: Request parameters for the delete call, passed through convert_kwargs.
+
+        Raises:
+            TimeoutError: If the config isn't deleted within max_wait.
+        """
+        deleted = self._cp_client.delete_online_evaluation_config(**convert_kwargs(kwargs))
+        config_id = deleted["onlineEvaluationConfigId"]
+
+        def _describe() -> Dict[str, Any]:
+            return self._cp_client.get_online_evaluation_config(onlineEvaluationConfigId=config_id)
+
+        # Poll with the ID taken from the delete response.
+        wait_until_deleted(_describe, wait_config=wait_config)
diff --git a/tests/bedrock_agentcore/evaluation/test_client_passthrough.py b/tests/bedrock_agentcore/evaluation/test_client_passthrough.py
new file mode 100644
index 00000000..c752c125
--- /dev/null
+++ b/tests/bedrock_agentcore/evaluation/test_client_passthrough.py
@@ -0,0 +1,155 @@
+"""Tests for EvaluationClient passthrough and *_and_wait methods."""
+
+from unittest.mock import Mock, patch
+
+import pytest
+from botocore.exceptions import ClientError
+
+from bedrock_agentcore.evaluation.client import EvaluationClient
+
+
+class TestEvaluationClientPassthrough:
+    """Unit tests for forwarding allowlisted calls through EvaluationClient.__getattr__."""
+
+    def _make_client(self):
+        with patch("boto3.client"):
+            stubbed = EvaluationClient(region_name="us-west-2")
+        stubbed._cp_client = Mock()
+        stubbed._dp_client = Mock()
+        return stubbed
+
+    def test_cp_method_forwarded(self):
+        ev_client = self._make_client()
+        ev_client._cp_client.get_evaluator.return_value = {"evaluatorId": "e-123"}
+        response = ev_client.get_evaluator(evaluatorId="e-123")
+        ev_client._cp_client.get_evaluator.assert_called_once_with(evaluatorId="e-123")
+        assert response["evaluatorId"] == "e-123"
+
+    def test_dp_method_forwarded(self):
+        ev_client = self._make_client()
+        ev_client._dp_client.evaluate.return_value = {"evaluationResults": []}
+        response = ev_client.evaluate(evaluatorId="e-123")
+        ev_client._dp_client.evaluate.assert_called_once_with(evaluatorId="e-123")
+        assert response["evaluationResults"] == []
+
+    def test_snake_case_kwargs_converted(self):
+        ev_client = self._make_client()
+        ev_client._cp_client.get_evaluator.return_value = {"evaluatorId": "e-123"}
+        ev_client.get_evaluator(evaluator_id="e-123")
+        ev_client._cp_client.get_evaluator.assert_called_once_with(evaluatorId="e-123")
+
+    def test_non_allowlisted_method_raises(self):
+        ev_client = self._make_client()
+        with pytest.raises(AttributeError, match="has no attribute"):
+            ev_client.not_a_real_method()
+
+    def test_all_cp_methods_in_allowlist(self):
+        wanted = {
+            "create_evaluator",
+            "get_evaluator",
+            "list_evaluators",
+            "update_evaluator",
+            "delete_evaluator",
+            "create_online_evaluation_config",
+            "get_online_evaluation_config",
+            "list_online_evaluation_configs",
+            "update_online_evaluation_config",
+            "delete_online_evaluation_config",
+        }
+        assert EvaluationClient._ALLOWED_CP_METHODS == wanted
+
+    def test_all_dp_methods_in_allowlist(self):
+        wanted = {"evaluate"}
+        assert EvaluationClient._ALLOWED_DP_METHODS == wanted
+
+
+class TestEvaluationAndWait:
+    """Unit tests for the *_and_wait helpers, run against mocked boto3 clients."""
+
+    def _make_client(self):
+        """Build an EvaluationClient whose boto3 clients are replaced by Mocks."""
+        with patch("boto3.client"):
+            client = EvaluationClient(region_name="us-west-2")
+        client._cp_client, client._dp_client = Mock(), Mock()
+        return client
+
+    def test_create_evaluator_and_wait(self):
+        client = self._make_client()
+        client._cp_client.create_evaluator.return_value = {"evaluatorId": "e-123"}
+        client._cp_client.get_evaluator.return_value = {"status": "ACTIVE", "evaluatorId": "e-123"}
+        result = client.create_evaluator_and_wait(evaluatorName="test")
+        assert result["status"] == "ACTIVE"
+        # The request kwargs must reach create_evaluator and polling must use the returned ID.
+        client._cp_client.create_evaluator.assert_called_once_with(evaluatorName="test")
+        client._cp_client.get_evaluator.assert_called_with(evaluatorId="e-123")
+
+    def test_create_evaluator_and_wait_failed(self):
+        client = self._make_client()
+        client._cp_client.create_evaluator.return_value = {"evaluatorId": "e-123"}
+        client._cp_client.get_evaluator.return_value = {"status": "CREATE_FAILED"}
+        with pytest.raises(RuntimeError, match="CREATE_FAILED"):
+            client.create_evaluator_and_wait(evaluatorName="test")
+
+    def test_update_evaluator_and_wait(self):
+        client = self._make_client()
+        client._cp_client.update_evaluator.return_value = {"evaluatorId": "e-123"}
+        client._cp_client.get_evaluator.return_value = {"status": "ACTIVE", "evaluatorId": "e-123"}
+        result = client.update_evaluator_and_wait(evaluatorId="e-123")
+        assert result["status"] == "ACTIVE"
+        # The request kwargs must reach update_evaluator and polling must use the returned ID.
+        client._cp_client.update_evaluator.assert_called_once_with(evaluatorId="e-123")
+        client._cp_client.get_evaluator.assert_called_with(evaluatorId="e-123")
+
+    def test_create_online_eval_config_and_wait(self):
+        client = self._make_client()
+        cp_client = client._cp_client
+        cp_client.create_online_evaluation_config.return_value = {"onlineEvaluationConfigId": "c-123"}
+        cp_client.get_online_evaluation_config.return_value = {
+            "status": "ACTIVE",
+            "onlineEvaluationConfigId": "c-123",
+        }
+        result = client.create_online_evaluation_config_and_wait(onlineEvaluationConfigName="test")
+        assert result["status"] == "ACTIVE"
+        # The create request and the polled config ID must both be forwarded unchanged.
+        cp_client.create_online_evaluation_config.assert_called_once_with(onlineEvaluationConfigName="test")
+        cp_client.get_online_evaluation_config.assert_called_with(onlineEvaluationConfigId="c-123")
+
+    def test_update_online_eval_config_and_wait(self):
+        client = self._make_client()
+        cp_client = client._cp_client
+        cp_client.update_online_evaluation_config.return_value = {"onlineEvaluationConfigId": "c-123"}
+        cp_client.get_online_evaluation_config.return_value = {
+            "status": "ACTIVE",
+            "onlineEvaluationConfigId": "c-123",
+        }
+        result = client.update_online_evaluation_config_and_wait(onlineEvaluationConfigId="c-123")
+        assert result["status"] == "ACTIVE"
+        # The update request and the polled config ID must both be forwarded unchanged.
+        cp_client.update_online_evaluation_config.assert_called_once_with(onlineEvaluationConfigId="c-123")
+        cp_client.get_online_evaluation_config.assert_called_with(onlineEvaluationConfigId="c-123")
+
+    @patch("bedrock_agentcore._utils.polling.time.sleep")
+    def test_delete_evaluator_and_wait(self, _mock_sleep):
+        client = self._make_client()
+        client._cp_client.delete_evaluator.return_value = {"evaluatorId": "e-123"}
+        client._cp_client.get_evaluator.side_effect = ClientError(
+            {"Error": {"Code": "ResourceNotFoundException", "Message": "gone"}},
+            "GetEvaluator",
+        )
+        client.delete_evaluator_and_wait(evaluatorId="e-123")
+        client._cp_client.delete_evaluator.assert_called_once_with(evaluatorId="e-123")
+
+    @patch("bedrock_agentcore._utils.polling.time.sleep")
+    def test_delete_online_eval_config_and_wait(self, _mock_sleep):
+        client = self._make_client()
+        client._cp_client.delete_online_evaluation_config.return_value = {
+            "onlineEvaluationConfigId": "c-123",
+        }
+        client._cp_client.get_online_evaluation_config.side_effect = ClientError(
+            {"Error": {"Code": "ResourceNotFoundException", "Message": "gone"}},
+            "GetOnlineEvaluationConfig",
+        )
+        client.delete_online_evaluation_config_and_wait(onlineEvaluationConfigId="c-123")
+        client._cp_client.delete_online_evaluation_config.assert_called_once_with(
+            onlineEvaluationConfigId="c-123",
+        )
diff --git a/tests_integ/evaluation/test_evaluation_passthrough.py b/tests_integ/evaluation/test_evaluation_passthrough.py
new file mode 100644
index 00000000..6180d2be
--- /dev/null
+++ b/tests_integ/evaluation/test_evaluation_passthrough.py
@@ -0,0 +1,197 @@
+"""Integration tests for EvaluationClient passthrough and *_and_wait methods."""
+
+import os
+import time
+
+import pytest
+from botocore.exceptions import ClientError
+
+from bedrock_agentcore.evaluation.client import EvaluationClient
+
+
+@pytest.mark.integration
+class TestEvaluationClientPassthrough:
+    """Passthrough calls that only read from the service and create no resources."""
+
+    @classmethod
+    def setup_class(cls):
+        cls.region = os.getenv("BEDROCK_TEST_REGION", "us-west-2")
+        cls.client = EvaluationClient(region_name=cls.region)
+
+    @pytest.mark.order(1)
+    def test_list_evaluators_passthrough(self):
+        listing = self.client.list_evaluators()
+        assert "evaluators" in listing
+
+    @pytest.mark.order(2)
+    def test_list_evaluators_snake_case(self):
+        listing = self.client.list_evaluators(max_results=5)
+        assert "evaluators" in listing
+
+    @pytest.mark.order(3)
+    def test_get_builtin_evaluator(self):
+        builtin = self.client.get_evaluator(evaluatorId="Builtin.Helpfulness")
+        assert builtin["evaluatorId"] == "Builtin.Helpfulness"
+        assert builtin["level"] in ("SESSION", "TRACE", "TOOL_CALL")
+
+    @pytest.mark.order(4)
+    def test_list_online_evaluation_configs_passthrough(self):
+        listing = self.client.list_online_evaluation_configs()
+        assert "onlineEvaluationConfigs" in listing
+
+    @pytest.mark.order(5)
+    def test_non_allowlisted_method_raises(self):
+        with pytest.raises(AttributeError):
+            self.client.not_a_real_method()
+
+
+@pytest.mark.integration
+class TestEvaluatorCrud:
+    """Create, read, update and delete a custom evaluator against the live service."""
+
+    @classmethod
+    def setup_class(cls):
+        cls.region = os.environ.get("BEDROCK_TEST_REGION", "us-west-2")
+        cls.client = EvaluationClient(region_name=cls.region)
+        cls.test_prefix = f"sdk_integ_{int(time.time())}"  # epoch-seconds suffix to tell runs apart
+        cls.evaluator_ids = []  # evaluators still to be cleaned up by teardown_class
+
+    @classmethod
+    def teardown_class(cls):
+        for eid in cls.evaluator_ids:
+            try:
+                cls.client.delete_evaluator(evaluatorId=eid)
+            except Exception as e:  # best-effort cleanup: report and keep going
+                print(f"Failed to delete evaluator {eid}: {e}")
+
+    @pytest.mark.order(6)
+    def test_create_evaluator_and_wait(self):
+        evaluator = self.client.create_evaluator_and_wait(
+            evaluatorName=f"{self.test_prefix}_eval",
+            level="SESSION",
+            evaluatorConfig={
+                "llmAsAJudge": {
+                    "instructions": "Rate the helpfulness of the response. {context}",
+                    "ratingScale": {
+                        "numerical": [
+                            {"definition": "Not helpful", "value": 1, "label": "Poor"},
+                            {"definition": "Very helpful", "value": 5, "label": "Excellent"},
+                        ]
+                    },
+                    "modelConfig": {
+                        "bedrockEvaluatorModelConfig": {
+                            "modelId": "us.anthropic.claude-haiku-4-5-20251001-v1:0",
+                        }
+                    },
+                }
+            },
+        )
+        self.__class__.evaluator_ids.append(evaluator["evaluatorId"])
+        assert evaluator["status"] == "ACTIVE"
+
+    @pytest.mark.order(7)
+    def test_get_evaluator_passthrough(self):
+        if not self.evaluator_ids:
+            pytest.skip("prerequisite test did not create evaluator")
+        evaluator = self.client.get_evaluator(evaluatorId=self.evaluator_ids[0])
+        assert evaluator["status"] == "ACTIVE"
+
+    @pytest.mark.order(8)
+    def test_update_evaluator_and_wait(self):
+        if not self.evaluator_ids:
+            pytest.skip("prerequisite test did not create evaluator")
+        updated = self.client.update_evaluator_and_wait(
+            evaluatorId=self.evaluator_ids[0],
+            description="updated by integ test",
+        )
+        assert updated["status"] == "ACTIVE"
+
+    @pytest.mark.order(9)
+    def test_delete_evaluator_and_wait(self):
+        if not self.evaluator_ids:
+            pytest.skip("prerequisite test did not create evaluator")
+        eid = self.evaluator_ids[0]
+        self.client.delete_evaluator_and_wait(evaluatorId=eid)
+        # Untrack only after a successful delete so teardown_class can retry on failure.
+        self.evaluator_ids.remove(eid)
+        with pytest.raises(ClientError):
+            self.client.get_evaluator(evaluatorId=eid)
+
+
+@pytest.mark.integration
+class TestOnlineEvaluationConfigCrud:
+    """CRUD tests for online evaluation configs against the live service.
+
+    Skipped unless the EVAL_ROLE_ARN and EVAL_LOG_GROUP environment variables are set.
+    """
+
+    @classmethod
+    def setup_class(cls):
+        cls.region = os.environ.get("BEDROCK_TEST_REGION", "us-west-2")
+        cls.role_arn = os.environ.get("EVAL_ROLE_ARN")
+        cls.log_group = os.environ.get("EVAL_LOG_GROUP")
+        if not cls.role_arn or not cls.log_group:
+            pytest.skip("EVAL_ROLE_ARN and EVAL_LOG_GROUP must be set")
+        cls.client = EvaluationClient(region_name=cls.region)
+        cls.test_prefix = f"sdk_integ_{int(time.time())}"
+        cls.config_ids = []  # configs still to be cleaned up by teardown_class
+
+    @classmethod
+    def teardown_class(cls):
+        for cid in cls.config_ids:
+            try:
+                cls.client.delete_online_evaluation_config(
+                    onlineEvaluationConfigId=cid,
+                )
+            except Exception as e:  # best-effort cleanup: report and keep going
+                print(f"Failed to delete config {cid}: {e}")
+
+    @pytest.mark.order(10)
+    def test_create_online_eval_config_and_wait(self):
+        config = self.client.create_online_evaluation_config_and_wait(
+            onlineEvaluationConfigName=f"{self.test_prefix}_config",
+            rule={"samplingConfig": {"samplingPercentage": 100.0}},
+            dataSourceConfig={
+                "cloudWatchLogs": {
+                    "logGroupNames": [self.log_group],
+                    "serviceNames": ["sdk-integ-test"],
+                }
+            },
+            evaluators=[{"evaluatorId": "Builtin.Helpfulness"}],
+            evaluationExecutionRoleArn=self.role_arn,
+            enableOnCreate=False,
+        )
+        self.__class__.config_ids.append(config["onlineEvaluationConfigId"])
+        assert config["status"] == "ACTIVE"
+
+    @pytest.mark.order(11)
+    def test_get_online_eval_config_passthrough(self):
+        if not self.config_ids:
+            pytest.skip("prerequisite test did not create config")
+        config = self.client.get_online_evaluation_config(
+            onlineEvaluationConfigId=self.config_ids[0],
+        )
+        assert config["status"] == "ACTIVE"
+
+    @pytest.mark.order(12)
+    def test_update_online_eval_config_and_wait(self):
+        if not self.config_ids:
+            pytest.skip("prerequisite test did not create config")
+        updated = self.client.update_online_evaluation_config_and_wait(
+            onlineEvaluationConfigId=self.config_ids[0],
+            description="updated by integ test",
+        )
+        assert updated["status"] == "ACTIVE"
+
+    @pytest.mark.order(13)
+    def test_delete_online_eval_config_and_wait(self):
+        if not self.config_ids:
+            pytest.skip("prerequisite test did not create config")
+        cid = self.config_ids[0]
+        self.client.delete_online_evaluation_config_and_wait(
+            onlineEvaluationConfigId=cid,
+        )
+        # Untrack only after a successful delete so teardown_class can retry on failure.
+        self.config_ids.remove(cid)
+        with pytest.raises(ClientError):
+            self.client.get_online_evaluation_config(onlineEvaluationConfigId=cid)