From 6f12ac88ea944aeb5875579a1031cbf7a58ef2e5 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 9 Jun 2026 03:32:05 +0000
Subject: [PATCH] feat(llm): add vertex_cached_content config for explicit
 Vertex AI caching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Vertex AI Gemini exposes an explicit context-cache API: the caller creates
a CachedContent resource (https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache)
and references it by name on every subsequent generateContent request.
LiteLLM already understands the kwarg (it pops 'cached_content' from
optional_params in vertex_ai.gemini.transformation.sync_transform_request_body
and forwards it to the API body) but the SDK had no first-class way to
plumb it through — users had to fight with raw litellm_extra_body and a
proxy that may or may not let it through.

This commit adds:

* LLM.vertex_cached_content: str | None  --  optional resource name field.
* select_chat_options() emits 'cached_content=<name>' on the LiteLLM call
  whenever the field is set AND the model name contains 'gemini' (so
  vertex_ai/gemini-*, gemini/* and litellm_proxy/gemini-* all match).
* The gate is a model-name check, not a provider check, so non-Gemini
  models (OpenAI, Anthropic, etc.) whose providers reject unknown kwargs
  stay unaffected.
* A caller-supplied 'cached_content' kwarg always wins over the LLM
  config field, matching the precedence we apply elsewhere.

Cache lifecycle (create / refresh TTL / delete) stays with the caller,
who has the Vertex credentials and project context. This keeps the SDK
free of google-cloud-aiplatform as a hard dependency while still giving
users a clean, type-checked seam for explicit caching.

Tests cover:
* Vertex / Gemini / litellm_proxy positive cases all emit the kwarg.
* OpenAI and Claude negative cases never emit it.
* Default None is silent.
* User kwarg override wins.

This is part of an SDK cost-reduction investigation triggered by the
gemini-3.5-flash swebench run analysed in OpenHands/benchmarks#741 ($1,912
projected on 500 instances, dominated by uncached prompt tokens at
litellm_proxy). PR #3581 covered the thought-signature side of that
investigation; this PR gives a path to explicit caching for users running
against vertex_ai/ directly.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/llm/llm.py        | 19 ++++
 .../openhands/sdk/llm/options/chat_options.py | 26 ++++++
 tests/sdk/llm/test_chat_options.py            | 89 +++++++++++++++++++
 3 files changed, 134 insertions(+)
diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py
index df2dd3f0dc..ba199a69c3 100644
--- a/openhands-sdk/openhands/sdk/llm/llm.py
+++ b/openhands-sdk/openhands/sdk/llm/llm.py
@@ -472,6 +472,25 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin):
         ),
     )
 
+    vertex_cached_content: str | None = Field(
+        default=None,
+        description=(
+            "Reference an existing Vertex AI ``CachedContent`` resource to use as "
+            "the cache prefix for every request. Pass the resource name returned by "
+            "``CachedContent.create``, e.g. ``projects/<project>/locations/<location>"
+            "/cachedContents/<id>`` on Vertex AI or ``cachedContents/<id>`` on the "
+            "Gemini API. The SDK threads it through to LiteLLM, which forwards it to "
+            "the ``generateContent`` API as ``cachedContent``. This bypasses the "
+            "inline ``cache_control`` marker path (which only works for ``vertex_ai/`` "
+            "direct, not via ``litellm_proxy/``) and gives deterministic, explicit "
+            "caching for long-running agent runs whose system + tool prefix exceeds "
+            "Vertex's minimum cache size. The caller is responsible for creating, "
+            "refreshing the TTL, and deleting the cache resource — "
+            "see https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache. "
+            "Only sent when the model name contains ``gemini``; ignored otherwise."
+        ),
+    )
+
     fallback_strategy: FallbackStrategy | None = Field(
         default=None,
         description=(
diff --git a/openhands-sdk/openhands/sdk/llm/options/chat_options.py b/openhands-sdk/openhands/sdk/llm/options/chat_options.py
index 88099d84e9..401ca90fe0 100644
--- a/openhands-sdk/openhands/sdk/llm/options/chat_options.py
+++ b/openhands-sdk/openhands/sdk/llm/options/chat_options.py
@@ -6,6 +6,19 @@
 from openhands.sdk.llm.utils.model_features import get_features
 
 
+def _model_supports_vertex_cached_content(model: str) -> bool:
+    """Tell whether ``cached_content`` may be attached for ``model``.
+
+    The decision is a case-insensitive substring test: any model string that
+    contains ``gemini`` qualifies, whatever provider prefix precedes it
+    (``vertex_ai/gemini-*``, ``gemini/*``, ``litellm_proxy/gemini-*`` or a
+    bare ``gemini-*``). Everything else, including an empty or ``None``
+    model, is refused.
+    """
+    normalized = model.lower() if model else ""
+    return "gemini" in normalized
+
+
 def select_chat_options(
     llm, user_kwargs: dict[str, Any], has_tools: bool
 ) -> dict[str, Any]:
@@ -99,4 +112,17 @@ def select_chat_options(
     if llm._prompt_cache_key:
         out["prompt_cache_key"] = llm._prompt_cache_key
 
+    # Vertex AI explicit context cache: user pre-creates a CachedContent
+    # resource (see https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache)
+    # and references it by name. LiteLLM forwards the kwarg to the Vertex
+    # ``generateContent`` API as ``cachedContent``. We only emit when the model
+    # is Gemini-flavoured so unknown-kwarg-rejecting providers (OpenAI, etc.)
+    # are left untouched. User-supplied kwargs take precedence.
+    if (
+        llm.vertex_cached_content
+        and "cached_content" not in out
+        and _model_supports_vertex_cached_content(llm.model)
+    ):
+        out["cached_content"] = llm.vertex_cached_content
+
     return out
diff --git a/tests/sdk/llm/test_chat_options.py b/tests/sdk/llm/test_chat_options.py
index 4f63e95270..b5f066f520 100644
--- a/tests/sdk/llm/test_chat_options.py
+++ b/tests/sdk/llm/test_chat_options.py
@@ -21,6 +21,7 @@ class DummyLLM:
     _prompt_cache_key: str | None = None
     openrouter_site_url: str = ""
     openrouter_app_name: str = ""
+    vertex_cached_content: str | None = None
 
     def _openrouter_headers(self) -> dict[str, str]:
         headers: dict[str, str] = {}
@@ -243,3 +244,91 @@ def test_chat_options_omits_openrouter_headers_when_unset():
     llm = DummyLLM(model="gpt-4o")
     out = select_chat_options(llm, user_kwargs={}, has_tools=False)
     assert "extra_headers" not in out
+
+
+# ---------------------------------------------------------------------------
+# vertex_cached_content
+# ---------------------------------------------------------------------------
+
+
+def test_vertex_cached_content_emitted_for_vertex_gemini():
+    """A ``vertex_ai/gemini-*`` model receives the configured cache name."""
+    cache_name = "cachedContents/1234567890"
+    configured_llm = LLM(
+        model="vertex_ai/gemini-3-flash", vertex_cached_content=cache_name
+    )
+    options = select_chat_options(configured_llm, user_kwargs={}, has_tools=True)
+    assert options["cached_content"] == cache_name
+
+
+def test_vertex_cached_content_emitted_for_gemini_provider():
+    """A model routed through the ``gemini/`` prefix gets the kwarg as well."""
+    cache_name = "cachedContents/abc"
+    configured_llm = LLM(
+        model="gemini/gemini-3-flash", vertex_cached_content=cache_name
+    )
+    options = select_chat_options(configured_llm, user_kwargs={}, has_tools=True)
+    assert options["cached_content"] == cache_name
+
+
+def test_vertex_cached_content_emitted_for_litellm_proxy_gemini():
+    """The kwarg is also emitted for ``litellm_proxy/gemini-*`` models.
+
+    The SDK only guarantees that the value reaches the LiteLLM call; what the
+    proxy does with it afterwards is outside the scope of this test.
+    """
+    cache_name = "cachedContents/xyz"
+    configured_llm = LLM(
+        model="litellm_proxy/gemini-3.5-flash", vertex_cached_content=cache_name
+    )
+    options = select_chat_options(configured_llm, user_kwargs={}, has_tools=True)
+    assert options["cached_content"] == cache_name
+
+
+def test_vertex_cached_content_suppressed_for_openai():
+    """An OpenAI model never receives ``cached_content``.
+
+    Setting the field on a non-Gemini LLM is a tolerated misconfiguration:
+    the value is dropped silently rather than escalated to an error.
+    """
+    ignored_name = "cachedContents/should-be-ignored"
+    configured_llm = LLM(
+        model="gpt-5-mini",
+        vertex_cached_content=ignored_name,
+    )
+    options = select_chat_options(configured_llm, user_kwargs={}, has_tools=True)
+    assert "cached_content" not in options
+
+
+def test_vertex_cached_content_suppressed_for_claude():
+    """A Claude model never receives ``cached_content`` either."""
+    configured_llm = LLM(
+        model="claude-sonnet-4-5",
+        vertex_cached_content="cachedContents/should-be-ignored",
+    )
+    options = select_chat_options(configured_llm, user_kwargs={}, has_tools=True)
+    assert "cached_content" not in options
+
+
+def test_vertex_cached_content_omitted_when_unset():
+    """Leaving the field at its ``None`` default emits nothing, even for Gemini."""
+    unconfigured_llm = LLM(model="vertex_ai/gemini-3-flash")
+    options = select_chat_options(unconfigured_llm, user_kwargs={}, has_tools=True)
+    assert "cached_content" not in options
+
+
+def test_vertex_cached_content_respects_user_kwarg_override():
+    """An explicit ``cached_content`` kwarg beats the LLM config field.
+
+    Both values are set here; the one passed by the caller must be the one
+    that ends up in the final options.
+    """
+    caller_value = "cachedContents/from-caller"
+    configured_llm = LLM(
+        model="vertex_ai/gemini-3-flash",
+        vertex_cached_content="cachedContents/from-config",
+    )
+    options = select_chat_options(
+        configured_llm, user_kwargs={"cached_content": caller_value}, has_tools=True
+    )
+    assert options["cached_content"] == caller_value