From 7304c1a184e6c32eb61fc33a60bf9e51e506c3ae Mon Sep 17 00:00:00 2001 From: RheagalFire Date: Tue, 21 Apr 2026 18:16:45 +0530 Subject: [PATCH 1/3] feat: add LiteLLM adapter for provider independence --- py/autoevals/__init__.py | 1 + py/autoevals/litellm.py | 162 +++++++++++++++++++++++++++++++++++ py/autoevals/test_litellm.py | 110 ++++++++++++++++++++++++ setup.py | 1 + 4 files changed, 274 insertions(+) create mode 100644 py/autoevals/litellm.py create mode 100644 py/autoevals/test_litellm.py diff --git a/py/autoevals/__init__.py b/py/autoevals/__init__.py index b6e1dd4..10bda72 100644 --- a/py/autoevals/__init__.py +++ b/py/autoevals/__init__.py @@ -127,6 +127,7 @@ async def evaluate_qa(): from .json import * from .list import * +from .litellm import AsyncLiteLLMClient, LiteLLMClient from .llm import * from .moderation import * from .number import * diff --git a/py/autoevals/litellm.py b/py/autoevals/litellm.py new file mode 100644 index 0000000..32a135b --- /dev/null +++ b/py/autoevals/litellm.py @@ -0,0 +1,162 @@ +"""LiteLLM adapters — route Autoevals through the LiteLLM AI gateway for direct +access to 100+ LLM providers (OpenAI, Anthropic, Bedrock, Vertex, Gemini, Ollama, +OpenRouter, Groq, DeepSeek, etc.) using provider-native API keys. + +Example:: + + from autoevals import init + from autoevals.litellm import LiteLLMClient + from autoevals.llm import Factuality + + init( + client=LiteLLMClient(), + default_model="anthropic/claude-3-5-sonnet-20241022", + ) + + evaluator = Factuality() + result = evaluator.eval(input="...", output="...", expected="...") + +Unlike the Braintrust AI Proxy path (which requires a Braintrust API key), LiteLLM +uses each provider's native key (``ANTHROPIC_API_KEY``, ``GEMINI_API_KEY``, +``AWS_*``, etc.) and routes locally. See https://docs.litellm.ai/docs/providers. 
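
``AsyncLiteLLMClient`` mirrors the same surface via ``litellm.acompletion``, so
async callers can await the client directly (model name illustrative)::

    from autoevals.litellm import AsyncLiteLLMClient

    client = AsyncLiteLLMClient()
    response = await client.chat.completions.create(
        model="openai/gpt-4o-mini",
        messages=[{"role": "user", "content": "ping"}],
    )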
+""" + +from __future__ import annotations + +from typing import Any, Optional + + +class _LiteLLMChatCompletions: + """Sync ``openai.chat.completions`` surface backed by ``litellm.completion``.""" + + def __init__(self, api_key: Optional[str], base_url: Optional[str]): + self._api_key = api_key + self._base_url = base_url + + def create(self, **kwargs: Any) -> Any: + import litellm + + if self._api_key is not None: + kwargs.setdefault("api_key", self._api_key) + if self._base_url is not None: + kwargs.setdefault("api_base", self._base_url) + return litellm.completion(**kwargs) + + +class _AsyncLiteLLMChatCompletions: + """Async counterpart of ``_LiteLLMChatCompletions``.""" + + def __init__(self, api_key: Optional[str], base_url: Optional[str]): + self._api_key = api_key + self._base_url = base_url + + async def create(self, **kwargs: Any) -> Any: + import litellm + + if self._api_key is not None: + kwargs.setdefault("api_key", self._api_key) + if self._base_url is not None: + kwargs.setdefault("api_base", self._base_url) + return await litellm.acompletion(**kwargs) + + +class _LiteLLMChat: + def __init__(self, completions: Any): + self.completions = completions + + +class _LiteLLMEmbeddings: + def __init__(self, api_key: Optional[str], base_url: Optional[str], is_async: bool): + self._api_key = api_key + self._base_url = base_url + self._is_async = is_async + + def _kwargs(self, kwargs: dict[str, Any]) -> dict[str, Any]: + if self._api_key is not None: + kwargs.setdefault("api_key", self._api_key) + if self._base_url is not None: + kwargs.setdefault("api_base", self._base_url) + return kwargs + + def create(self, **kwargs: Any) -> Any: + import litellm + + if self._is_async: + return litellm.aembedding(**self._kwargs(kwargs)) + return litellm.embedding(**self._kwargs(kwargs)) + + +class _LiteLLMModerations: + def __init__(self, api_key: Optional[str], base_url: Optional[str], is_async: bool): + self._api_key = api_key + self._base_url = base_url + self._is_async = is_async + + def _kwargs(self, kwargs: dict[str, Any]) -> dict[str, Any]: + if self._api_key is not None: + kwargs.setdefault("api_key", self._api_key) + if self._base_url is not None: + kwargs.setdefault("api_base", self._base_url) + return kwargs + + def create(self, **kwargs: Any) -> Any: + import litellm + + if self._is_async: + return litellm.amoderation(**self._kwargs(kwargs)) + return litellm.moderation(**self._kwargs(kwargs)) + + +class LiteLLMClient: + """OpenAI-compatible client backed by ``litellm.completion``. + + Pass to ``autoevals.init(client=LiteLLMClient())``. Routes every chat/embedding/ + moderation call through LiteLLM, which resolves the target provider from the + model-name prefix (e.g. ``anthropic/claude-3-5-sonnet``, ``bedrock/anthropic.claude-3-sonnet``). + + Args: + api_key: Optional provider API key. If unset, LiteLLM falls back to the + per-provider env vars (``OPENAI_API_KEY``, ``ANTHROPIC_API_KEY``, etc.). + base_url: Optional custom base URL (forwarded to LiteLLM as ``api_base``). + organization: Accepted for OpenAI-protocol compatibility; not used. 
+ """ + + def __init__( + self, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + organization: Optional[str] = None, + ): + self.api_key = api_key + self.base_url = base_url + self.organization = organization + self.chat = _LiteLLMChat(_LiteLLMChatCompletions(api_key=api_key, base_url=base_url)) + self.embeddings = _LiteLLMEmbeddings(api_key=api_key, base_url=base_url, is_async=False) + self.moderations = _LiteLLMModerations(api_key=api_key, base_url=base_url, is_async=False) + + # responses API is required by the OpenAI v1 protocol check in oai.py but only + # exercised for GPT-5 routing; LiteLLM handles GPT-5 through chat.completions. + @property + def responses(self): # pragma: no cover - protocol conformance only + return self.chat.completions + + +class AsyncLiteLLMClient: + """Async variant of :class:`LiteLLMClient` — uses ``litellm.acompletion`` etc.""" + + def __init__( + self, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + organization: Optional[str] = None, + ): + self.api_key = api_key + self.base_url = base_url + self.organization = organization + self.chat = _LiteLLMChat(_AsyncLiteLLMChatCompletions(api_key=api_key, base_url=base_url)) + self.embeddings = _LiteLLMEmbeddings(api_key=api_key, base_url=base_url, is_async=True) + self.moderations = _LiteLLMModerations(api_key=api_key, base_url=base_url, is_async=True) + + @property + def responses(self): # pragma: no cover - protocol conformance only + return self.chat.completions diff --git a/py/autoevals/test_litellm.py b/py/autoevals/test_litellm.py new file mode 100644 index 0000000..d58e156 --- /dev/null +++ b/py/autoevals/test_litellm.py @@ -0,0 +1,110 @@ +"""Tests for the LiteLLM adapter clients.""" + +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from autoevals import init +from autoevals.litellm import AsyncLiteLLMClient, LiteLLMClient +from autoevals.oai import LLMClient + + +@pytest.fixture(autouse=True) +def reset_autoevals_state(): + init() + yield + init() + + +def _fake_completion_response(content: str = "hi") -> SimpleNamespace: + """Minimal OpenAI-compatible completion response.""" + message = SimpleNamespace(content=content, role="assistant") + choice = SimpleNamespace(message=message, index=0, finish_reason="stop") + return SimpleNamespace(choices=[choice], id="cmpl-1", model="test", object="chat.completion") + + +def test_litellm_client_exposes_openai_v1_surface(): + client = LiteLLMClient(api_key="sk-test", base_url="https://proxy.example/v1") + # openai v1 protocol surface: + assert hasattr(client.chat.completions, "create") + assert hasattr(client.embeddings, "create") + assert hasattr(client.moderations, "create") + assert client.api_key == "sk-test" + assert client.base_url == "https://proxy.example/v1" + + +def test_litellm_chat_completions_forwards_to_litellm(mocker): + stub = mocker.patch("litellm.completion", return_value=_fake_completion_response("pong")) + client = LiteLLMClient(api_key="sk-test", base_url="https://proxy.example/v1") + + resp = client.chat.completions.create( + model="anthropic/claude-3-5-sonnet-20241022", + messages=[{"role": "user", "content": "ping"}], + ) + + assert resp.choices[0].message.content == "pong" + stub.assert_called_once() + kwargs = stub.call_args.kwargs + assert kwargs["model"] == "anthropic/claude-3-5-sonnet-20241022" + assert kwargs["api_key"] == "sk-test" + assert kwargs["api_base"] == "https://proxy.example/v1" + + +def 
test_litellm_client_without_api_key_does_not_forward_key(mocker): + stub = mocker.patch("litellm.completion", return_value=_fake_completion_response()) + client = LiteLLMClient() # LiteLLM will pick up env vars per provider + + client.chat.completions.create(model="openai/gpt-4o-mini", messages=[]) + kwargs = stub.call_args.kwargs + assert "api_key" not in kwargs + assert "api_base" not in kwargs + + +def test_litellm_embeddings_forwards_to_litellm(mocker): + stub = mocker.patch("litellm.embedding", return_value={"data": [{"embedding": [0.1, 0.2]}]}) + client = LiteLLMClient(api_key="sk-test") + + client.embeddings.create(model="text-embedding-3-small", input="hello") + + stub.assert_called_once() + assert stub.call_args.kwargs["model"] == "text-embedding-3-small" + assert stub.call_args.kwargs["api_key"] == "sk-test" + + +def test_litellm_moderations_forwards_to_litellm(mocker): + stub = mocker.patch("litellm.moderation", return_value={"results": [{"flagged": False}]}) + client = LiteLLMClient() + + client.moderations.create(input="some text") + + stub.assert_called_once() + + +@pytest.mark.asyncio +async def test_async_litellm_chat_completions_forwards(mocker): + stub = mocker.patch("litellm.acompletion", new=AsyncMock(return_value=_fake_completion_response("async-pong"))) + client = AsyncLiteLLMClient(api_key="sk-test") + + resp = await client.chat.completions.create( + model="openai/gpt-4o-mini", + messages=[{"role": "user", "content": "ping"}], + ) + + assert resp.choices[0].message.content == "async-pong" + assert stub.await_count == 1 + + +def test_init_accepts_litellm_client(mocker): + """End-to-end: init(client=LiteLLMClient()) builds a usable LLMClient.""" + mocker.patch("litellm.completion", return_value=_fake_completion_response("init-ok")) + + init(client=LiteLLMClient(api_key="sk-test")) + + from autoevals.oai import prepare_openai + + wrapper = prepare_openai() + assert isinstance(wrapper, LLMClient) + # Calling through the wrapper should dispatch to litellm.completion + result = wrapper.complete(model="openai/gpt-4o-mini", messages=[{"role": "user", "content": "ping"}]) + assert result.choices[0].message.content == "init-ok" diff --git a/setup.py b/setup.py index d8b7080..71aee08 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,7 @@ ], "doc": ["pydoc-markdown"], "scipy": ["numpy", "scipy"], + "litellm": ["litellm"], } extras_require["all"] = sorted({package for packages in extras_require.values() for package in packages}) From 0268d008f8f3445647cef19d645044e8e367e146 Mon Sep 17 00:00:00 2001 From: RheagalFire Date: Tue, 21 Apr 2026 23:16:03 +0530 Subject: [PATCH 2/3] fix(litellm): responses-API shim for gpt-5 model routing --- py/autoevals/litellm.py | 90 ++++++++++++++++++++++++++++++++---- py/autoevals/test_litellm.py | 54 ++++++++++++++++++++++ 2 files changed, 134 insertions(+), 10 deletions(-) diff --git a/py/autoevals/litellm.py b/py/autoevals/litellm.py index 32a135b..fb8e74e 100644 --- a/py/autoevals/litellm.py +++ b/py/autoevals/litellm.py @@ -107,6 +107,80 @@ def create(self, **kwargs: Any) -> Any: return litellm.moderation(**self._kwargs(kwargs)) +def _responses_params_to_chat_kwargs(kwargs: dict[str, Any]) -> dict[str, Any]: + """Translate autoevals' Responses-API kwargs back into Chat-Completions kwargs + that ``litellm.completion`` understands. + + autoevals' ``oai.py`` routes GPT-5 models through ``client.responses.create`` + (see ``is_gpt5_model``, ``prepare_responses_params``). 
Those params use + ``input=`` instead of ``messages=`` and a Responses-API tool schema. + ``litellm.completion`` only speaks Chat-Completions, so we translate back. + The resulting ChatCompletion response is then detected by autoevals' + ``convert_responses_to_chat_completion`` as not-a-Responses-object and + returned as-is (see ``oai.py:226``). + """ + chat_kwargs = dict(kwargs) + if "input" in chat_kwargs and "messages" not in chat_kwargs: + chat_kwargs["messages"] = chat_kwargs.pop("input") + # Responses-API tools use flat {type, name, description, parameters}; Chat- + # Completions tools nest the schema under {type, function: {...}}. + if "tools" in chat_kwargs: + translated = [] + for tool in chat_kwargs["tools"]: + if isinstance(tool, dict) and tool.get("type") == "function" and "function" not in tool: + translated.append( + { + "type": "function", + "function": { + "name": tool.get("name"), + "description": tool.get("description"), + "parameters": tool.get("parameters"), + }, + } + ) + else: + translated.append(tool) + chat_kwargs["tools"] = translated + return chat_kwargs + + +class _LiteLLMResponses: + """Adapter for autoevals' Responses-API code path (triggered by GPT-5 models). + + Without this, ``init(client=LiteLLMClient())`` with autoevals' default model + (``gpt-5-mini``) would call ``litellm.completion(input=..., model=...)`` and + crash because LiteLLM requires ``messages=``. + """ + + def __init__(self, api_key: Optional[str], base_url: Optional[str], is_async: bool): + self._api_key = api_key + self._base_url = base_url + self._is_async = is_async + + def _kwargs(self, kwargs: dict[str, Any]) -> dict[str, Any]: + chat_kwargs = _responses_params_to_chat_kwargs(kwargs) + if self._api_key is not None: + chat_kwargs.setdefault("api_key", self._api_key) + if self._base_url is not None: + chat_kwargs.setdefault("api_base", self._base_url) + return chat_kwargs + + def create(self, **kwargs: Any) -> Any: + import litellm + + if self._is_async: + return litellm.acompletion(**self._kwargs(kwargs)) + return litellm.completion(**self._kwargs(kwargs)) + + +class _LiteLLMResponsesContainer: + """Exposes ``.create`` on the ``client.responses`` attribute to match the + OpenAI v1 client shape autoevals' ``oai.py`` duck-types on.""" + + def __init__(self, create_impl: Any): + self.create = create_impl + + class LiteLLMClient: """OpenAI-compatible client backed by ``litellm.completion``. @@ -133,12 +207,9 @@ def __init__( self.chat = _LiteLLMChat(_LiteLLMChatCompletions(api_key=api_key, base_url=base_url)) self.embeddings = _LiteLLMEmbeddings(api_key=api_key, base_url=base_url, is_async=False) self.moderations = _LiteLLMModerations(api_key=api_key, base_url=base_url, is_async=False) - - # responses API is required by the OpenAI v1 protocol check in oai.py but only - # exercised for GPT-5 routing; LiteLLM handles GPT-5 through chat.completions. 
- @property - def responses(self): # pragma: no cover - protocol conformance only - return self.chat.completions + self.responses = _LiteLLMResponsesContainer( + _LiteLLMResponses(api_key=api_key, base_url=base_url, is_async=False).create, + ) class AsyncLiteLLMClient: @@ -156,7 +227,6 @@ def __init__( self.chat = _LiteLLMChat(_AsyncLiteLLMChatCompletions(api_key=api_key, base_url=base_url)) self.embeddings = _LiteLLMEmbeddings(api_key=api_key, base_url=base_url, is_async=True) self.moderations = _LiteLLMModerations(api_key=api_key, base_url=base_url, is_async=True) - - @property - def responses(self): # pragma: no cover - protocol conformance only - return self.chat.completions + self.responses = _LiteLLMResponsesContainer( + _LiteLLMResponses(api_key=api_key, base_url=base_url, is_async=True).create, + ) diff --git a/py/autoevals/test_litellm.py b/py/autoevals/test_litellm.py index d58e156..0327091 100644 --- a/py/autoevals/test_litellm.py +++ b/py/autoevals/test_litellm.py @@ -81,6 +81,60 @@ def test_litellm_moderations_forwards_to_litellm(mocker): stub.assert_called_once() +def test_litellm_responses_create_translates_input_to_messages(mocker): + """autoevals routes GPT-5 models through client.responses.create (see + is_gpt5_model / prepare_responses_params in oai.py), which sends input=... + instead of messages=.... Our shim must translate back to messages= so + litellm.completion doesn't crash.""" + stub = mocker.patch("litellm.completion", return_value=_fake_completion_response("from-responses")) + client = LiteLLMClient(api_key="sk-test") + + resp = client.responses.create( + model="gpt-5-mini", + input=[{"role": "user", "content": "ping"}], + temperature=0.1, + ) + + assert resp.choices[0].message.content == "from-responses" + kwargs = stub.call_args.kwargs + assert "messages" in kwargs, "responses.create must translate input=... to messages=..." + assert kwargs["messages"] == [{"role": "user", "content": "ping"}] + assert "input" not in kwargs, "input=... must not leak through to litellm.completion" + assert kwargs["temperature"] == 0.1 + + +def test_litellm_responses_create_translates_responses_api_tool_schema(mocker): + """autoevals' prepare_responses_params emits flat tool schema {type, name, + description, parameters}. LiteLLM (via Chat-Completions) wants nested schema + {type, function: {...}}. 
Our shim translates.""" + stub = mocker.patch("litellm.completion", return_value=_fake_completion_response("ok")) + client = LiteLLMClient(api_key="sk-test") + + responses_tool = { + "type": "function", + "name": "select_choice", + "description": "Select a choice", + "parameters": {"type": "object", "properties": {"choice": {"type": "string"}}}, + } + client.responses.create( + model="gpt-5-mini", + input=[{"role": "user", "content": "pick"}], + tools=[responses_tool], + ) + + kwargs = stub.call_args.kwargs + assert kwargs["tools"] == [ + { + "type": "function", + "function": { + "name": "select_choice", + "description": "Select a choice", + "parameters": {"type": "object", "properties": {"choice": {"type": "string"}}}, + }, + } + ] + + @pytest.mark.asyncio async def test_async_litellm_chat_completions_forwards(mocker): stub = mocker.patch("litellm.acompletion", new=AsyncMock(return_value=_fake_completion_response("async-pong"))) From 26d367d02c48939f546c6b7d25bdcebbed9da12e Mon Sep 17 00:00:00 2001 From: RheagalFire Date: Wed, 22 Apr 2026 16:17:28 +0530 Subject: [PATCH 3/3] chore: pin litellm to >=1.60,<1.85 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 71aee08..9afbceb 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ ], "doc": ["pydoc-markdown"], "scipy": ["numpy", "scipy"], - "litellm": ["litellm"], + "litellm": ["litellm>=1.60,<1.85"], } extras_require["all"] = sorted({package for packages in extras_require.values() for package in packages})
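
For reference, a minimal end-to-end sketch of the new extra, assuming
pip install autoevals[litellm] and a provider-native key in the environment
(the model name is illustrative):

    from autoevals import init
    from autoevals.litellm import LiteLLMClient
    from autoevals.llm import Factuality

    # Route all evaluator LLM calls through LiteLLM instead of the default client.
    init(
        client=LiteLLMClient(),
        default_model="anthropic/claude-3-5-sonnet-20241022",
    )

    result = Factuality().eval(
        input="What is the capital of France?",
        output="Paris",
        expected="Paris is the capital of France.",
    )
    print(result.score)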