From 7304c1a184e6c32eb61fc33a60bf9e51e506c3ae Mon Sep 17 00:00:00 2001 From: RheagalFire Date: Tue, 21 Apr 2026 18:16:45 +0530 Subject: [PATCH 1/3] feat: add LiteLLM adapter for provider independence --- py/autoevals/__init__.py | 1 + py/autoevals/litellm.py | 162 +++++++++++++++++++++++++++++++++++ py/autoevals/test_litellm.py | 110 ++++++++++++++++++++++++ setup.py | 1 + 4 files changed, 274 insertions(+) create mode 100644 py/autoevals/litellm.py create mode 100644 py/autoevals/test_litellm.py diff --git a/py/autoevals/__init__.py b/py/autoevals/__init__.py index b6e1dd4..10bda72 100644 --- a/py/autoevals/__init__.py +++ b/py/autoevals/__init__.py @@ -127,6 +127,7 @@ async def evaluate_qa(): from .json import * from .list import * +from .litellm import AsyncLiteLLMClient, LiteLLMClient from .llm import * from .moderation import * from .number import * diff --git a/py/autoevals/litellm.py b/py/autoevals/litellm.py new file mode 100644 index 0000000..32a135b --- /dev/null +++ b/py/autoevals/litellm.py @@ -0,0 +1,162 @@ +"""LiteLLM adapters — route Autoevals through the LiteLLM AI gateway for direct +access to 100+ LLM providers (OpenAI, Anthropic, Bedrock, Vertex, Gemini, Ollama, +OpenRouter, Groq, DeepSeek, etc.) using provider-native API keys. + +Example:: + + from autoevals import init + from autoevals.litellm import LiteLLMClient + from autoevals.llm import Factuality + + init( + client=LiteLLMClient(), + default_model="anthropic/claude-3-5-sonnet-20241022", + ) + + evaluator = Factuality() + result = evaluator.eval(input="...", output="...", expected="...") + +Unlike the Braintrust AI Proxy path (which requires a Braintrust API key), LiteLLM +uses each provider's native key (``ANTHROPIC_API_KEY``, ``GEMINI_API_KEY``, +``AWS_*``, etc.) and routes locally. See https://docs.litellm.ai/docs/providers. 
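
``AsyncLiteLLMClient`` mirrors the same surface via ``litellm.acompletion``, so
async callers can await the client directly (model name illustrative)::

    from autoevals.litellm import AsyncLiteLLMClient

    client = AsyncLiteLLMClient()
    response = await client.chat.completions.create(
        model="openai/gpt-4o-mini",
        messages=[{"role": "user", "content": "ping"}],
    )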
+""" + +from __future__ import annotations + +from typing import Any, Optional + + +class _LiteLLMChatCompletions: + """Sync ``openai.chat.completions`` surface backed by ``litellm.completion``.""" + + def __init__(self, api_key: Optional[str], base_url: Optional[str]): + self._api_key = api_key + self._base_url = base_url + + def create(self, **kwargs: Any) -> Any: + import litellm + + if self._api_key is not None: + kwargs.setdefault("api_key", self._api_key) + if self._base_url is not None: + kwargs.setdefault("api_base", self._base_url) + return litellm.completion(**kwargs) + + +class _AsyncLiteLLMChatCompletions: + """Async counterpart of ``_LiteLLMChatCompletions``.""" + + def __init__(self, api_key: Optional[str], base_url: Optional[str]): + self._api_key = api_key + self._base_url = base_url + + async def create(self, **kwargs: Any) -> Any: + import litellm + + if self._api_key is not None: + kwargs.setdefault("api_key", self._api_key) + if self._base_url is not None: + kwargs.setdefault("api_base", self._base_url) + return await litellm.acompletion(**kwargs) + + +class _LiteLLMChat: + def __init__(self, completions: Any): + self.completions = completions + + +class _LiteLLMEmbeddings: + def __init__(self, api_key: Optional[str], base_url: Optional[str], is_async: bool): + self._api_key = api_key + self._base_url = base_url + self._is_async = is_async + + def _kwargs(self, kwargs: dict[str, Any]) -> dict[str, Any]: + if self._api_key is not None: + kwargs.setdefault("api_key", self._api_key) + if self._base_url is not None: + kwargs.setdefault("api_base", self._base_url) + return kwargs + + def create(self, **kwargs: Any) -> Any: + import litellm + + if self._is_async: + return litellm.aembedding(**self._kwargs(kwargs)) + return litellm.embedding(**self._kwargs(kwargs)) + + +class _LiteLLMModerations: + def __init__(self, api_key: Optional[str], base_url: Optional[str], is_async: bool): + self._api_key = api_key + self._base_url = base_url + self._is_async = is_async + + def _kwargs(self, kwargs: dict[str, Any]) -> dict[str, Any]: + if self._api_key is not None: + kwargs.setdefault("api_key", self._api_key) + if self._base_url is not None: + kwargs.setdefault("api_base", self._base_url) + return kwargs + + def create(self, **kwargs: Any) -> Any: + import litellm + + if self._is_async: + return litellm.amoderation(**self._kwargs(kwargs)) + return litellm.moderation(**self._kwargs(kwargs)) + + +class LiteLLMClient: + """OpenAI-compatible client backed by ``litellm.completion``. + + Pass to ``autoevals.init(client=LiteLLMClient())``. Routes every chat/embedding/ + moderation call through LiteLLM, which resolves the target provider from the + model-name prefix (e.g. ``anthropic/claude-3-5-sonnet``, ``bedrock/anthropic.claude-3-sonnet``). + + Args: + api_key: Optional provider API key. If unset, LiteLLM falls back to the + per-provider env vars (``OPENAI_API_KEY``, ``ANTHROPIC_API_KEY``, etc.). + base_url: Optional custom base URL (forwarded to LiteLLM as ``api_base``). + organization: Accepted for OpenAI-protocol compatibility; not used. 
+ """ + + def __init__( + self, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + organization: Optional[str] = None, + ): + self.api_key = api_key + self.base_url = base_url + self.organization = organization + self.chat = _LiteLLMChat(_LiteLLMChatCompletions(api_key=api_key, base_url=base_url)) + self.embeddings = _LiteLLMEmbeddings(api_key=api_key, base_url=base_url, is_async=False) + self.moderations = _LiteLLMModerations(api_key=api_key, base_url=base_url, is_async=False) + + # responses API is required by the OpenAI v1 protocol check in oai.py but only + # exercised for GPT-5 routing; LiteLLM handles GPT-5 through chat.completions. + @property + def responses(self): # pragma: no cover - protocol conformance only + return self.chat.completions + + +class AsyncLiteLLMClient: + """Async variant of :class:`LiteLLMClient` — uses ``litellm.acompletion`` etc.""" + + def __init__( + self, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + organization: Optional[str] = None, + ): + self.api_key = api_key + self.base_url = base_url + self.organization = organization + self.chat = _LiteLLMChat(_AsyncLiteLLMChatCompletions(api_key=api_key, base_url=base_url)) + self.embeddings = _LiteLLMEmbeddings(api_key=api_key, base_url=base_url, is_async=True) + self.moderations = _LiteLLMModerations(api_key=api_key, base_url=base_url, is_async=True) + + @property + def responses(self): # pragma: no cover - protocol conformance only + return self.chat.completions diff --git a/py/autoevals/test_litellm.py b/py/autoevals/test_litellm.py new file mode 100644 index 0000000..d58e156 --- /dev/null +++ b/py/autoevals/test_litellm.py @@ -0,0 +1,110 @@ +"""Tests for the LiteLLM adapter clients.""" + +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from autoevals import init +from autoevals.litellm import AsyncLiteLLMClient, LiteLLMClient +from autoevals.oai import LLMClient + + +@pytest.fixture(autouse=True) +def reset_autoevals_state(): + init() + yield + init() + + +def _fake_completion_response(content: str = "hi") -> SimpleNamespace: + """Minimal OpenAI-compatible completion response.""" + message = SimpleNamespace(content=content, role="assistant") + choice = SimpleNamespace(message=message, index=0, finish_reason="stop") + return SimpleNamespace(choices=[choice], id="cmpl-1", model="test", object="chat.completion") + + +def test_litellm_client_exposes_openai_v1_surface(): + client = LiteLLMClient(api_key="sk-test", base_url="https://proxy.example/v1") + # openai v1 protocol surface: + assert hasattr(client.chat.completions, "create") + assert hasattr(client.embeddings, "create") + assert hasattr(client.moderations, "create") + assert client.api_key == "sk-test" + assert client.base_url == "https://proxy.example/v1" + + +def test_litellm_chat_completions_forwards_to_litellm(mocker): + stub = mocker.patch("litellm.completion", return_value=_fake_completion_response("pong")) + client = LiteLLMClient(api_key="sk-test", base_url="https://proxy.example/v1") + + resp = client.chat.completions.create( + model="anthropic/claude-3-5-sonnet-20241022", + messages=[{"role": "user", "content": "ping"}], + ) + + assert resp.choices[0].message.content == "pong" + stub.assert_called_once() + kwargs = stub.call_args.kwargs + assert kwargs["model"] == "anthropic/claude-3-5-sonnet-20241022" + assert kwargs["api_key"] == "sk-test" + assert kwargs["api_base"] == "https://proxy.example/v1" + + +def 
test_litellm_client_without_api_key_does_not_forward_key(mocker): + stub = mocker.patch("litellm.completion", return_value=_fake_completion_response()) + client = LiteLLMClient() # LiteLLM will pick up env vars per provider + + client.chat.completions.create(model="openai/gpt-4o-mini", messages=[]) + kwargs = stub.call_args.kwargs + assert "api_key" not in kwargs + assert "api_base" not in kwargs + + +def test_litellm_embeddings_forwards_to_litellm(mocker): + stub = mocker.patch("litellm.embedding", return_value={"data": [{"embedding": [0.1, 0.2]}]}) + client = LiteLLMClient(api_key="sk-test") + + client.embeddings.create(model="text-embedding-3-small", input="hello") + + stub.assert_called_once() + assert stub.call_args.kwargs["model"] == "text-embedding-3-small" + assert stub.call_args.kwargs["api_key"] == "sk-test" + + +def test_litellm_moderations_forwards_to_litellm(mocker): + stub = mocker.patch("litellm.moderation", return_value={"results": [{"flagged": False}]}) + client = LiteLLMClient() + + client.moderations.create(input="some text") + + stub.assert_called_once() + + +@pytest.mark.asyncio +async def test_async_litellm_chat_completions_forwards(mocker): + stub = mocker.patch("litellm.acompletion", new=AsyncMock(return_value=_fake_completion_response("async-pong"))) + client = AsyncLiteLLMClient(api_key="sk-test") + + resp = await client.chat.completions.create( + model="openai/gpt-4o-mini", + messages=[{"role": "user", "content": "ping"}], + ) + + assert resp.choices[0].message.content == "async-pong" + assert stub.await_count == 1 + + +def test_init_accepts_litellm_client(mocker): + """End-to-end: init(client=LiteLLMClient()) builds a usable LLMClient.""" + mocker.patch("litellm.completion", return_value=_fake_completion_response("init-ok")) + + init(client=LiteLLMClient(api_key="sk-test")) + + from autoevals.oai import prepare_openai + + wrapper = prepare_openai() + assert isinstance(wrapper, LLMClient) + # Calling through the wrapper should dispatch to litellm.completion + result = wrapper.complete(model="openai/gpt-4o-mini", messages=[{"role": "user", "content": "ping"}]) + assert result.choices[0].message.content == "init-ok" diff --git a/setup.py b/setup.py index d8b7080..71aee08 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,7 @@ ], "doc": ["pydoc-markdown"], "scipy": ["numpy", "scipy"], + "litellm": ["litellm"], } extras_require["all"] = sorted({package for packages in extras_require.values() for package in packages}) From 0268d008f8f3445647cef19d645044e8e367e146 Mon Sep 17 00:00:00 2001 From: RheagalFire Date: Tue, 21 Apr 2026 23:16:03 +0530 Subject: [PATCH 2/3] fix(litellm): responses-API shim for gpt-5 model routing --- py/autoevals/litellm.py | 90 ++++++++++++++++++++++++++++++++---- py/autoevals/test_litellm.py | 54 ++++++++++++++++++++++ 2 files changed, 134 insertions(+), 10 deletions(-) diff --git a/py/autoevals/litellm.py b/py/autoevals/litellm.py index 32a135b..fb8e74e 100644 --- a/py/autoevals/litellm.py +++ b/py/autoevals/litellm.py @@ -107,6 +107,80 @@ def create(self, **kwargs: Any) -> Any: return litellm.moderation(**self._kwargs(kwargs)) +def _responses_params_to_chat_kwargs(kwargs: dict[str, Any]) -> dict[str, Any]: + """Translate autoevals' Responses-API kwargs back into Chat-Completions kwargs + that ``litellm.completion`` understands. + + autoevals' ``oai.py`` routes GPT-5 models through ``client.responses.create`` + (see ``is_gpt5_model``, ``prepare_responses_params``). 
Those params use + ``input=`` instead of ``messages=`` and a Responses-API tool schema. + ``litellm.completion`` only speaks Chat-Completions, so we translate back. + The resulting ChatCompletion response is then detected by autoevals' + ``convert_responses_to_chat_completion`` as not-a-Responses-object and + returned as-is (see ``oai.py:226``). + """ + chat_kwargs = dict(kwargs) + if "input" in chat_kwargs and "messages" not in chat_kwargs: + chat_kwargs["messages"] = chat_kwargs.pop("input") + # Responses-API tools use flat {type, name, description, parameters}; Chat- + # Completions tools nest the schema under {type, function: {...}}. + if "tools" in chat_kwargs: + translated = [] + for tool in chat_kwargs["tools"]: + if isinstance(tool, dict) and tool.get("type") == "function" and "function" not in tool: + translated.append( + { + "type": "function", + "function": { + "name": tool.get("name"), + "description": tool.get("description"), + "parameters": tool.get("parameters"), + }, + } + ) + else: + translated.append(tool) + chat_kwargs["tools"] = translated + return chat_kwargs + + +class _LiteLLMResponses: + """Adapter for autoevals' Responses-API code path (triggered by GPT-5 models). + + Without this, ``init(client=LiteLLMClient())`` with autoevals' default model + (``gpt-5-mini``) would call ``litellm.completion(input=..., model=...)`` and + crash because LiteLLM requires ``messages=``. + """ + + def __init__(self, api_key: Optional[str], base_url: Optional[str], is_async: bool): + self._api_key = api_key + self._base_url = base_url + self._is_async = is_async + + def _kwargs(self, kwargs: dict[str, Any]) -> dict[str, Any]: + chat_kwargs = _responses_params_to_chat_kwargs(kwargs) + if self._api_key is not None: + chat_kwargs.setdefault("api_key", self._api_key) + if self._base_url is not None: + chat_kwargs.setdefault("api_base", self._base_url) + return chat_kwargs + + def create(self, **kwargs: Any) -> Any: + import litellm + + if self._is_async: + return litellm.acompletion(**self._kwargs(kwargs)) + return litellm.completion(**self._kwargs(kwargs)) + + +class _LiteLLMResponsesContainer: + """Exposes ``.create`` on the ``client.responses`` attribute to match the + OpenAI v1 client shape autoevals' ``oai.py`` duck-types on.""" + + def __init__(self, create_impl: Any): + self.create = create_impl + + class LiteLLMClient: """OpenAI-compatible client backed by ``litellm.completion``. @@ -133,12 +207,9 @@ def __init__( self.chat = _LiteLLMChat(_LiteLLMChatCompletions(api_key=api_key, base_url=base_url)) self.embeddings = _LiteLLMEmbeddings(api_key=api_key, base_url=base_url, is_async=False) self.moderations = _LiteLLMModerations(api_key=api_key, base_url=base_url, is_async=False) - - # responses API is required by the OpenAI v1 protocol check in oai.py but only - # exercised for GPT-5 routing; LiteLLM handles GPT-5 through chat.completions. 
- @property - def responses(self): # pragma: no cover - protocol conformance only - return self.chat.completions + self.responses = _LiteLLMResponsesContainer( + _LiteLLMResponses(api_key=api_key, base_url=base_url, is_async=False).create, + ) class AsyncLiteLLMClient: @@ -156,7 +227,6 @@ def __init__( self.chat = _LiteLLMChat(_AsyncLiteLLMChatCompletions(api_key=api_key, base_url=base_url)) self.embeddings = _LiteLLMEmbeddings(api_key=api_key, base_url=base_url, is_async=True) self.moderations = _LiteLLMModerations(api_key=api_key, base_url=base_url, is_async=True) - - @property - def responses(self): # pragma: no cover - protocol conformance only - return self.chat.completions + self.responses = _LiteLLMResponsesContainer( + _LiteLLMResponses(api_key=api_key, base_url=base_url, is_async=True).create, + ) diff --git a/py/autoevals/test_litellm.py b/py/autoevals/test_litellm.py index d58e156..0327091 100644 --- a/py/autoevals/test_litellm.py +++ b/py/autoevals/test_litellm.py @@ -81,6 +81,60 @@ def test_litellm_moderations_forwards_to_litellm(mocker): stub.assert_called_once() +def test_litellm_responses_create_translates_input_to_messages(mocker): + """autoevals routes GPT-5 models through client.responses.create (see + is_gpt5_model / prepare_responses_params in oai.py), which sends input=... + instead of messages=.... Our shim must translate back to messages= so + litellm.completion doesn't crash.""" + stub = mocker.patch("litellm.completion", return_value=_fake_completion_response("from-responses")) + client = LiteLLMClient(api_key="sk-test") + + resp = client.responses.create( + model="gpt-5-mini", + input=[{"role": "user", "content": "ping"}], + temperature=0.1, + ) + + assert resp.choices[0].message.content == "from-responses" + kwargs = stub.call_args.kwargs + assert "messages" in kwargs, "responses.create must translate input=... to messages=..." + assert kwargs["messages"] == [{"role": "user", "content": "ping"}] + assert "input" not in kwargs, "input=... must not leak through to litellm.completion" + assert kwargs["temperature"] == 0.1 + + +def test_litellm_responses_create_translates_responses_api_tool_schema(mocker): + """autoevals' prepare_responses_params emits flat tool schema {type, name, + description, parameters}. LiteLLM (via Chat-Completions) wants nested schema + {type, function: {...}}. 
Our shim translates.""" + stub = mocker.patch("litellm.completion", return_value=_fake_completion_response("ok")) + client = LiteLLMClient(api_key="sk-test") + + responses_tool = { + "type": "function", + "name": "select_choice", + "description": "Select a choice", + "parameters": {"type": "object", "properties": {"choice": {"type": "string"}}}, + } + client.responses.create( + model="gpt-5-mini", + input=[{"role": "user", "content": "pick"}], + tools=[responses_tool], + ) + + kwargs = stub.call_args.kwargs + assert kwargs["tools"] == [ + { + "type": "function", + "function": { + "name": "select_choice", + "description": "Select a choice", + "parameters": {"type": "object", "properties": {"choice": {"type": "string"}}}, + }, + } + ] + + @pytest.mark.asyncio async def test_async_litellm_chat_completions_forwards(mocker): stub = mocker.patch("litellm.acompletion", new=AsyncMock(return_value=_fake_completion_response("async-pong"))) From 26d367d02c48939f546c6b7d25bdcebbed9da12e Mon Sep 17 00:00:00 2001 From: RheagalFire Date: Wed, 22 Apr 2026 16:17:28 +0530 Subject: [PATCH 3/3] chore: pin litellm to >=1.60,<1.85 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 71aee08..9afbceb 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ ], "doc": ["pydoc-markdown"], "scipy": ["numpy", "scipy"], - "litellm": ["litellm"], + "litellm": ["litellm>=1.60,<1.85"], } extras_require["all"] = sorted({package for packages in extras_require.values() for package in packages})
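
For reference, a minimal end-to-end sketch of the new extra, assuming
pip install autoevals[litellm] and a provider-native key in the environment
(the model name is illustrative):

    from autoevals import init
    from autoevals.litellm import LiteLLMClient
    from autoevals.llm import Factuality

    # Route all evaluator LLM calls through LiteLLM instead of the default client.
    init(
        client=LiteLLMClient(),
        default_model="anthropic/claude-3-5-sonnet-20241022",
    )

    result = Factuality().eval(
        input="What is the capital of France?",
        output="Paris",
        expected="Paris is the capital of France.",
    )
    print(result.score)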