From 852beea72171a59cd507532df74299edda8337e8 Mon Sep 17 00:00:00 2001
From: Nicolas Borges
Date: Thu, 23 Apr 2026 14:15:33 -0400
Subject: [PATCH] feat: add evaluations client passthrough and tests

---
 src/bedrock_agentcore/evaluation/client.py    | 206 ++++++++++++++++++
 .../evaluation/test_client_passthrough.py     | 155 +++++++++++++
 .../evaluation/test_evaluation_passthrough.py | 197 +++++++++++++++++
 3 files changed, 558 insertions(+)
 create mode 100644 tests/bedrock_agentcore/evaluation/test_client_passthrough.py
 create mode 100644 tests_integ/evaluation/test_evaluation_passthrough.py

diff --git a/src/bedrock_agentcore/evaluation/client.py b/src/bedrock_agentcore/evaluation/client.py
index 9f8a32e6..d07c5c51 100644
--- a/src/bedrock_agentcore/evaluation/client.py
+++ b/src/bedrock_agentcore/evaluation/client.py
@@ -8,6 +8,9 @@
 from botocore.config import Config
 from pydantic import BaseModel
 
+from bedrock_agentcore._utils.config import WaitConfig
+from bedrock_agentcore._utils.polling import wait_until, wait_until_deleted
+from bedrock_agentcore._utils.snake_case import accept_snake_case_kwargs, convert_kwargs
 from bedrock_agentcore._utils.user_agent import build_user_agent_suffix
 from bedrock_agentcore.evaluation.agent_span_collector import CloudWatchAgentSpanCollector
 
@@ -17,6 +20,9 @@
 QUERY_TIMEOUT_SECONDS = 60
 POLL_INTERVAL_SECONDS = 2
 
+_EVALUATOR_FAILED_STATUSES = {"CREATE_FAILED", "UPDATE_FAILED"}
+_ONLINE_EVAL_FAILED_STATUSES = {"CREATE_FAILED", "UPDATE_FAILED"}
+
 
 class ReferenceInputs(BaseModel):
     """Ground truth inputs for evaluation.
@@ -92,6 +98,46 @@ def __init__(
 
         logger.info("Initialized EvaluationClient in region %s", self.region_name)
 
+    # Pass-through
+    # -------------------------------------------------------------------------
+    _ALLOWED_DP_METHODS = {
+        "evaluate",
+    }
+
+    _ALLOWED_CP_METHODS = {
+        # Evaluator CRUD
+        "create_evaluator",
+        "get_evaluator",
+        "list_evaluators",
+        "update_evaluator",
+        "delete_evaluator",
+        # Online evaluation config CRUD
+        "create_online_evaluation_config",
+        "get_online_evaluation_config",
+        "list_online_evaluation_configs",
+        "update_online_evaluation_config",
+        "delete_online_evaluation_config",
+    }
+
+    def __getattr__(self, name: str):
+        """Dynamically forward allowlisted method calls to the appropriate boto3 client."""
+        if name in self._ALLOWED_DP_METHODS and hasattr(self._dp_client, name):
+            method = getattr(self._dp_client, name)
+            logger.debug("Forwarding method '%s' to _dp_client", name)
+            return accept_snake_case_kwargs(method)
+
+        if name in self._ALLOWED_CP_METHODS and hasattr(self._cp_client, name):
+            method = getattr(self._cp_client, name)
+            logger.debug("Forwarding method '%s' to _cp_client", name)
+            return accept_snake_case_kwargs(method)
+
+        raise AttributeError(
+            f"'{self.__class__.__name__}' object has no attribute '{name}'. "
+            "Method not found on data plane or control plane client. "
+            "Available methods can be found in the boto3 documentation for "
+            "'bedrock-agentcore' and 'bedrock-agentcore-control' services."
+        )
+
     def run(
         self,
         evaluator_ids: List[str],
@@ -349,3 +395,163 @@ def _build_reference_inputs(
             )
 
         return result
+
+    # *_and_wait methods
+    # -------------------------------------------------------------------------
+    def create_evaluator_and_wait(
+        self,
+        wait_config: Optional[WaitConfig] = None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Create an evaluator and wait for it to reach ACTIVE status.
+
+        Args:
+            wait_config: Optional WaitConfig for polling behavior.
+            **kwargs: Arguments forwarded to the create_evaluator API.
+
+        Returns:
+            Evaluator details when ACTIVE.
+
+        Raises:
+            RuntimeError: If the evaluator reaches a failed state.
+            TimeoutError: If the evaluator doesn't become ACTIVE within max_wait.
+        """
+        response = self._cp_client.create_evaluator(**convert_kwargs(kwargs))
+        eid = response["evaluatorId"]
+        return wait_until(
+            lambda: self._cp_client.get_evaluator(evaluatorId=eid),
+            "ACTIVE",
+            _EVALUATOR_FAILED_STATUSES,
+            wait_config,
+        )
+
+    def update_evaluator_and_wait(
+        self,
+        wait_config: Optional[WaitConfig] = None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Update an evaluator and wait for it to reach ACTIVE status.
+
+        Args:
+            wait_config: Optional WaitConfig for polling behavior.
+            **kwargs: Arguments forwarded to the update_evaluator API.
+
+        Returns:
+            Evaluator details when ACTIVE.
+
+        Raises:
+            RuntimeError: If the evaluator reaches a failed state.
+            TimeoutError: If the evaluator doesn't become ACTIVE within max_wait.
+        """
+        response = self._cp_client.update_evaluator(**convert_kwargs(kwargs))
+        eid = response["evaluatorId"]
+        return wait_until(
+            lambda: self._cp_client.get_evaluator(evaluatorId=eid),
+            "ACTIVE",
+            _EVALUATOR_FAILED_STATUSES,
+            wait_config,
+        )
+
+    def create_online_evaluation_config_and_wait(
+        self,
+        wait_config: Optional[WaitConfig] = None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Create an online evaluation config and wait for ACTIVE status.
+
+        Args:
+            wait_config: Optional WaitConfig for polling behavior.
+            **kwargs: Arguments forwarded to the create_online_evaluation_config API.
+
+        Returns:
+            Online evaluation config details when ACTIVE.
+
+        Raises:
+            RuntimeError: If the config reaches a failed state.
+            TimeoutError: If the config doesn't become ACTIVE within max_wait.
+        """
+        response = self._cp_client.create_online_evaluation_config(**convert_kwargs(kwargs))
+        cid = response["onlineEvaluationConfigId"]
+        return wait_until(
+            lambda: self._cp_client.get_online_evaluation_config(
+                onlineEvaluationConfigId=cid,
+            ),
+            "ACTIVE",
+            _ONLINE_EVAL_FAILED_STATUSES,
+            wait_config,
+            error_field="failureReason",
+        )
+
+    def update_online_evaluation_config_and_wait(
+        self,
+        wait_config: Optional[WaitConfig] = None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Update an online evaluation config and wait for ACTIVE status.
+
+        Args:
+            wait_config: Optional WaitConfig for polling behavior.
+            **kwargs: Arguments forwarded to the update_online_evaluation_config API.
+
+        Returns:
+            Online evaluation config details when ACTIVE.
+
+        Raises:
+            RuntimeError: If the config reaches a failed state.
+            TimeoutError: If the config doesn't become ACTIVE within max_wait.
+        """
+        response = self._cp_client.update_online_evaluation_config(**convert_kwargs(kwargs))
+        cid = response["onlineEvaluationConfigId"]
+        return wait_until(
+            lambda: self._cp_client.get_online_evaluation_config(
+                onlineEvaluationConfigId=cid,
+            ),
+            "ACTIVE",
+            _ONLINE_EVAL_FAILED_STATUSES,
+            wait_config,
+            error_field="failureReason",
+        )
+
+    def delete_evaluator_and_wait(
+        self,
+        wait_config: Optional[WaitConfig] = None,
+        **kwargs,
+    ) -> None:
+        """Delete an evaluator and wait for deletion to complete.
+
+        Args:
+            wait_config: Optional WaitConfig for polling behavior.
+            **kwargs: Arguments forwarded to the delete_evaluator API.
+
+        Raises:
+            TimeoutError: If the evaluator isn't deleted within max_wait.
+        """
+        response = self._cp_client.delete_evaluator(**convert_kwargs(kwargs))
+        eid = response["evaluatorId"]
+        wait_until_deleted(
+            lambda: self._cp_client.get_evaluator(evaluatorId=eid),
+            wait_config=wait_config,
+        )
+
+    def delete_online_evaluation_config_and_wait(
+        self,
+        wait_config: Optional[WaitConfig] = None,
+        **kwargs,
+    ) -> None:
+        """Delete an online evaluation config and wait for deletion to complete.
+
+        Args:
+            wait_config: Optional WaitConfig for polling behavior.
+            **kwargs: Arguments forwarded to the delete_online_evaluation_config API.
+
+        Raises:
+            TimeoutError: If the config isn't deleted within max_wait.
+        """
+        response = self._cp_client.delete_online_evaluation_config(**convert_kwargs(kwargs))
+        cid = response["onlineEvaluationConfigId"]
+        wait_until_deleted(
+            lambda: self._cp_client.get_online_evaluation_config(
+                onlineEvaluationConfigId=cid,
+            ),
+            wait_config=wait_config,
+        )
diff --git a/tests/bedrock_agentcore/evaluation/test_client_passthrough.py b/tests/bedrock_agentcore/evaluation/test_client_passthrough.py
new file mode 100644
index 00000000..c752c125
--- /dev/null
+++ b/tests/bedrock_agentcore/evaluation/test_client_passthrough.py
@@ -0,0 +1,155 @@
+"""Tests for EvaluationClient passthrough and *_and_wait methods."""
+
+from unittest.mock import Mock, patch
+
+import pytest
+from botocore.exceptions import ClientError
+
+from bedrock_agentcore.evaluation.client import EvaluationClient
+
+
+class TestEvaluationClientPassthrough:
+    """Tests for __getattr__ passthrough."""
+
+    def _make_client(self):
+        with patch("boto3.client"):
+            client = EvaluationClient(region_name="us-west-2")
+        client._cp_client = Mock()
+        client._dp_client = Mock()
+        return client
+
+    def test_cp_method_forwarded(self):
+        client = self._make_client()
+        client._cp_client.get_evaluator.return_value = {"evaluatorId": "e-123"}
+        result = client.get_evaluator(evaluatorId="e-123")
+        client._cp_client.get_evaluator.assert_called_once_with(evaluatorId="e-123")
+        assert result["evaluatorId"] == "e-123"
+
+    def test_dp_method_forwarded(self):
+        client = self._make_client()
+        client._dp_client.evaluate.return_value = {"evaluationResults": []}
+        result = client.evaluate(evaluatorId="e-123")
+        client._dp_client.evaluate.assert_called_once_with(evaluatorId="e-123")
+        assert result["evaluationResults"] == []
+
+    def test_snake_case_kwargs_converted(self):
+        client = self._make_client()
+        client._cp_client.get_evaluator.return_value = {"evaluatorId": "e-123"}
+        client.get_evaluator(evaluator_id="e-123")
+        client._cp_client.get_evaluator.assert_called_once_with(evaluatorId="e-123")
+
+    def test_non_allowlisted_method_raises(self):
+        client = self._make_client()
+        with pytest.raises(AttributeError, match="has no attribute"):
+            client.not_a_real_method()
+
+    def test_all_cp_methods_in_allowlist(self):
+        expected = {
+            "create_evaluator",
+            "get_evaluator",
+            "list_evaluators",
+            "update_evaluator",
+            "delete_evaluator",
+            "create_online_evaluation_config",
+            "get_online_evaluation_config",
+            "list_online_evaluation_configs",
+            "update_online_evaluation_config",
+            "delete_online_evaluation_config",
+        }
+        assert expected == EvaluationClient._ALLOWED_CP_METHODS
+
+    def test_all_dp_methods_in_allowlist(self):
+        expected = {"evaluate"}
+        assert expected == EvaluationClient._ALLOWED_DP_METHODS
+
+
+class TestEvaluationAndWait:
+    """Tests for *_and_wait methods."""
+
+    def _make_client(self):
+        with patch("boto3.client"):
+            client = EvaluationClient(region_name="us-west-2")
+        client._cp_client = Mock()
+        client._dp_client = Mock()
+        return client
+
+    def test_create_evaluator_and_wait(self):
+        client = self._make_client()
+        client._cp_client.create_evaluator.return_value = {"evaluatorId": "e-123"}
+        client._cp_client.get_evaluator.return_value = {
+            "status": "ACTIVE",
+            "evaluatorId": "e-123",
+        }
+        result = client.create_evaluator_and_wait(evaluatorName="test")
+        assert result["status"] == "ACTIVE"
+
+    def test_create_evaluator_and_wait_failed(self):
+        client = self._make_client()
+        client._cp_client.create_evaluator.return_value = {"evaluatorId": "e-123"}
+        client._cp_client.get_evaluator.return_value = {"status": "CREATE_FAILED"}
+        with pytest.raises(RuntimeError, match="CREATE_FAILED"):
+            client.create_evaluator_and_wait(evaluatorName="test")
+
+    def test_update_evaluator_and_wait(self):
+        client = self._make_client()
+        client._cp_client.update_evaluator.return_value = {"evaluatorId": "e-123"}
+        client._cp_client.get_evaluator.return_value = {
+            "status": "ACTIVE",
+            "evaluatorId": "e-123",
+        }
+        result = client.update_evaluator_and_wait(evaluatorId="e-123")
+        assert result["status"] == "ACTIVE"
+
+    def test_create_online_eval_config_and_wait(self):
+        client = self._make_client()
+        client._cp_client.create_online_evaluation_config.return_value = {
+            "onlineEvaluationConfigId": "c-123",
+        }
+        client._cp_client.get_online_evaluation_config.return_value = {
+            "status": "ACTIVE",
+            "onlineEvaluationConfigId": "c-123",
+        }
+        result = client.create_online_evaluation_config_and_wait(
+            onlineEvaluationConfigName="test",
+        )
+        assert result["status"] == "ACTIVE"
+
+    def test_update_online_eval_config_and_wait(self):
+        client = self._make_client()
+        client._cp_client.update_online_evaluation_config.return_value = {
+            "onlineEvaluationConfigId": "c-123",
+        }
+        client._cp_client.get_online_evaluation_config.return_value = {
+            "status": "ACTIVE",
+            "onlineEvaluationConfigId": "c-123",
+        }
+        result = client.update_online_evaluation_config_and_wait(
+            onlineEvaluationConfigId="c-123",
+        )
+        assert result["status"] == "ACTIVE"
+
+    @patch("bedrock_agentcore._utils.polling.time.sleep")
+    def test_delete_evaluator_and_wait(self, _mock_sleep):
+        client = self._make_client()
+        client._cp_client.delete_evaluator.return_value = {"evaluatorId": "e-123"}
+        client._cp_client.get_evaluator.side_effect = ClientError(
+            {"Error": {"Code": "ResourceNotFoundException", "Message": "gone"}},
+            "GetEvaluator",
+        )
+        client.delete_evaluator_and_wait(evaluatorId="e-123")
+        client._cp_client.delete_evaluator.assert_called_once()
+
+    @patch("bedrock_agentcore._utils.polling.time.sleep")
+    def test_delete_online_eval_config_and_wait(self, _mock_sleep):
+        client = self._make_client()
+        client._cp_client.delete_online_evaluation_config.return_value = {
+            "onlineEvaluationConfigId": "c-123",
+        }
+        client._cp_client.get_online_evaluation_config.side_effect = ClientError(
+            {"Error": {"Code": "ResourceNotFoundException", "Message": "gone"}},
+            "GetOnlineEvaluationConfig",
+        )
+        client.delete_online_evaluation_config_and_wait(
+            onlineEvaluationConfigId="c-123",
+        )
+        client._cp_client.delete_online_evaluation_config.assert_called_once()
diff --git a/tests_integ/evaluation/test_evaluation_passthrough.py b/tests_integ/evaluation/test_evaluation_passthrough.py
new file mode 100644
index 00000000..6180d2be
--- /dev/null
+++ b/tests_integ/evaluation/test_evaluation_passthrough.py
@@ -0,0 +1,197 @@
+"""Integration tests for EvaluationClient passthrough and *_and_wait methods."""
+
+import os
+import time
+
+import pytest
+from botocore.exceptions import ClientError
+
+from bedrock_agentcore.evaluation.client import EvaluationClient
+
+
+@pytest.mark.integration
+class TestEvaluationClientPassthrough:
+    """Read-only passthrough tests. No resources needed."""
+
+    @classmethod
+    def setup_class(cls):
+        cls.region = os.environ.get("BEDROCK_TEST_REGION", "us-west-2")
+        cls.client = EvaluationClient(region_name=cls.region)
+
+    @pytest.mark.order(1)
+    def test_list_evaluators_passthrough(self):
+        response = self.client.list_evaluators()
+        assert "evaluators" in response
+
+    @pytest.mark.order(2)
+    def test_list_evaluators_snake_case(self):
+        response = self.client.list_evaluators(max_results=5)
+        assert "evaluators" in response
+
+    @pytest.mark.order(3)
+    def test_get_builtin_evaluator(self):
+        response = self.client.get_evaluator(evaluatorId="Builtin.Helpfulness")
+        assert response["evaluatorId"] == "Builtin.Helpfulness"
+        assert response["level"] in ("SESSION", "TRACE", "TOOL_CALL")
+
+    @pytest.mark.order(4)
+    def test_list_online_evaluation_configs_passthrough(self):
+        response = self.client.list_online_evaluation_configs()
+        assert "onlineEvaluationConfigs" in response
+
+    @pytest.mark.order(5)
+    def test_non_allowlisted_method_raises(self):
+        with pytest.raises(AttributeError):
+            self.client.not_a_real_method()
+
+
+@pytest.mark.integration
+class TestEvaluatorCrud:
+    """CRUD tests for custom evaluators."""
+
+    @classmethod
+    def setup_class(cls):
+        cls.region = os.environ.get("BEDROCK_TEST_REGION", "us-west-2")
+        cls.client = EvaluationClient(region_name=cls.region)
+        cls.test_prefix = f"sdk_integ_{int(time.time())}"
+        cls.evaluator_ids = []
+
+    @classmethod
+    def teardown_class(cls):
+        for eid in cls.evaluator_ids:
+            try:
+                cls.client.delete_evaluator(evaluatorId=eid)
+            except Exception as e:
+                print(f"Failed to delete evaluator {eid}: {e}")
+
+    @pytest.mark.order(6)
+    def test_create_evaluator_and_wait(self):
+        evaluator = self.client.create_evaluator_and_wait(
+            evaluatorName=f"{self.test_prefix}_eval",
+            level="SESSION",
+            evaluatorConfig={
+                "llmAsAJudge": {
+                    "instructions": "Rate the helpfulness of the response. {context}",
+                    "ratingScale": {
+                        "numerical": [
+                            {"definition": "Not helpful", "value": 1, "label": "Poor"},
+                            {"definition": "Very helpful", "value": 5, "label": "Excellent"},
+                        ]
+                    },
+                    "modelConfig": {
+                        "bedrockEvaluatorModelConfig": {
+                            "modelId": "us.anthropic.claude-haiku-4-5-20251001-v1:0",
+                        }
+                    },
+                }
+            },
+        )
+        self.__class__.evaluator_ids.append(evaluator["evaluatorId"])
+        assert evaluator["status"] == "ACTIVE"
+
+    @pytest.mark.order(7)
+    def test_get_evaluator_passthrough(self):
+        if not self.evaluator_ids:
+            pytest.skip("prerequisite test did not create evaluator")
+        evaluator = self.client.get_evaluator(
+            evaluatorId=self.evaluator_ids[0],
+        )
+        assert evaluator["status"] == "ACTIVE"
+
+    @pytest.mark.order(8)
+    def test_update_evaluator_and_wait(self):
+        if not self.evaluator_ids:
+            pytest.skip("prerequisite test did not create evaluator")
+        updated = self.client.update_evaluator_and_wait(
+            evaluatorId=self.evaluator_ids[0],
+            description="updated by integ test",
+        )
+        assert updated["status"] == "ACTIVE"
+
+    @pytest.mark.order(9)
+    def test_delete_evaluator_and_wait(self):
+        if not self.evaluator_ids:
+            pytest.skip("prerequisite test did not create evaluator")
+        eid = self.evaluator_ids.pop(0)
+        self.client.delete_evaluator_and_wait(evaluatorId=eid)
+        with pytest.raises(ClientError):
+            self.client.get_evaluator(evaluatorId=eid)
+
+
+@pytest.mark.integration
+class TestOnlineEvaluationConfigCrud:
+    """CRUD tests for online evaluation configs.
+
+    Requires EVAL_ROLE_ARN and EVAL_LOG_GROUP environment variables.
+    """
+
+    @classmethod
+    def setup_class(cls):
+        cls.region = os.environ.get("BEDROCK_TEST_REGION", "us-west-2")
+        cls.role_arn = os.environ.get("EVAL_ROLE_ARN")
+        cls.log_group = os.environ.get("EVAL_LOG_GROUP")
+        if not cls.role_arn or not cls.log_group:
+            pytest.skip("EVAL_ROLE_ARN and EVAL_LOG_GROUP must be set")
+        cls.client = EvaluationClient(region_name=cls.region)
+        cls.test_prefix = f"sdk_integ_{int(time.time())}"
+        cls.config_ids = []
+
+    @classmethod
+    def teardown_class(cls):
+        for cid in cls.config_ids:
+            try:
+                cls.client.delete_online_evaluation_config(
+                    onlineEvaluationConfigId=cid,
+                )
+            except Exception as e:
+                print(f"Failed to delete config {cid}: {e}")
+
+    @pytest.mark.order(10)
+    def test_create_online_eval_config_and_wait(self):
+        config = self.client.create_online_evaluation_config_and_wait(
+            onlineEvaluationConfigName=f"{self.test_prefix}_config",
+            rule={"samplingConfig": {"samplingPercentage": 100.0}},
+            dataSourceConfig={
+                "cloudWatchLogs": {
+                    "logGroupNames": [self.log_group],
+                    "serviceNames": ["sdk-integ-test"],
+                }
+            },
+            evaluators=[{"evaluatorId": "Builtin.Helpfulness"}],
+            evaluationExecutionRoleArn=self.role_arn,
+            enableOnCreate=False,
+        )
+        self.__class__.config_ids.append(config["onlineEvaluationConfigId"])
+        assert config["status"] == "ACTIVE"
+
+    @pytest.mark.order(11)
+    def test_get_online_eval_config_passthrough(self):
+        if not self.config_ids:
+            pytest.skip("prerequisite test did not create config")
+        config = self.client.get_online_evaluation_config(
+            onlineEvaluationConfigId=self.config_ids[0],
+        )
+        assert config["status"] == "ACTIVE"
+
+    @pytest.mark.order(12)
+    def test_update_online_eval_config_and_wait(self):
+        if not self.config_ids:
+            pytest.skip("prerequisite test did not create config")
+        updated = self.client.update_online_evaluation_config_and_wait(
+            onlineEvaluationConfigId=self.config_ids[0],
+            description="updated by integ test",
+        )
+        assert updated["status"] == "ACTIVE"
+
+    @pytest.mark.order(13)
+    def test_delete_online_eval_config_and_wait(self):
+        if not self.config_ids:
+            pytest.skip("prerequisite test did not create config")
+        cid = self.config_ids.pop(0)
+        self.client.delete_online_evaluation_config_and_wait(
+            onlineEvaluationConfigId=cid,
+        )
+        with pytest.raises(ClientError):
+            self.client.get_online_evaluation_config(
+                onlineEvaluationConfigId=cid,
+            )