From 6f12ac88ea944aeb5875579a1031cbf7a58ef2e5 Mon Sep 17 00:00:00 2001
From: openhands
Date: Tue, 9 Jun 2026 03:32:05 +0000
Subject: [PATCH] feat(llm): add vertex_cached_content config for explicit
 Vertex AI caching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Vertex AI Gemini exposes an explicit context-cache API: the caller creates
a CachedContent resource
(https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache) and
references it by name on every subsequent generateContent request. LiteLLM
already understands the kwarg (it pops 'cached_content' from
optional_params in
vertex_ai.gemini.transformation.sync_transform_request_body and forwards
it to the API body), but the SDK had no first-class way to plumb it
through — users had to fight with raw litellm_extra_body and a proxy that
may or may not let it through.

This commit adds:

* LLM.vertex_cached_content: str | None -- optional resource name field.
* select_chat_options() emits 'cached_content=<resource name>' on the
  LiteLLM call whenever the field is set AND the model name contains
  'gemini' (so vertex_ai/gemini-*, gemini/*, litellm_proxy/gemini-* and
  bare gemini-* all match).
* The emission is gated by a Gemini-only model-name check so non-Gemini
  providers (OpenAI, Anthropic, etc.) that reject unknown kwargs stay
  unaffected.
* A caller-supplied 'cached_content' kwarg always wins over the LLM config
  field, matching the precedence we apply elsewhere.

Cache lifecycle (create / refresh TTL / delete) stays with the caller, who
has the Vertex credentials and project context. This keeps the SDK free of
google-cloud-aiplatform as a hard dependency while still giving users a
clean, type-checked seam for explicit caching.

Tests cover:

* Vertex / Gemini / litellm_proxy positive cases all emit the kwarg.
* OpenAI and Claude negative cases never emit it.
* Default None is silent.
* User kwarg override wins.
This is part of an SDK cost-reduction investigation triggered by the gemini-3.5-flash swebench run analysed in OpenHands/benchmarks#741 ($1,912 projected on 500 instances, dominated by uncached prompt tokens at litellm_proxy). PR #3581 covered the thought-signature side of that investigation; this PR gives a path to explicit caching for users running against vertex_ai/ directly. Co-authored-by: openhands --- openhands-sdk/openhands/sdk/llm/llm.py | 19 ++++ .../openhands/sdk/llm/options/chat_options.py | 26 ++++++ tests/sdk/llm/test_chat_options.py | 89 +++++++++++++++++++ 3 files changed, 134 insertions(+) diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py index df2dd3f0dc..ba199a69c3 100644 --- a/openhands-sdk/openhands/sdk/llm/llm.py +++ b/openhands-sdk/openhands/sdk/llm/llm.py @@ -472,6 +472,25 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin): ), ) + vertex_cached_content: str | None = Field( + default=None, + description=( + "Reference an existing Vertex AI ``CachedContent`` resource to use as " + "the cache prefix for every request. Pass the full resource name, e.g. " + '``"cachedContents/1234567890"`` returned by ' + "``CachedContent.create``. The SDK threads it through to LiteLLM, " + "which forwards it to the Vertex Gemini ``generateContent`` API as " + "``cachedContent``. This bypasses the inline ``cache_control`` marker " + "path (which only works for ``vertex_ai/`` direct, not via " + "``litellm_proxy/``) and gives deterministic, explicit caching for " + "long-running agent runs whose system + tool prefix exceeds Vertex's " + "minimum cache size. The caller is responsible for creating, " + "refreshing the TTL, and deleting the cache resource — see " + "https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache. " + "Ignored for non-Vertex / non-Gemini providers." 
+ ), + ) + fallback_strategy: FallbackStrategy | None = Field( default=None, description=( diff --git a/openhands-sdk/openhands/sdk/llm/options/chat_options.py b/openhands-sdk/openhands/sdk/llm/options/chat_options.py index 88099d84e9..401ca90fe0 100644 --- a/openhands-sdk/openhands/sdk/llm/options/chat_options.py +++ b/openhands-sdk/openhands/sdk/llm/options/chat_options.py @@ -6,6 +6,19 @@ from openhands.sdk.llm.utils.model_features import get_features +def _model_supports_vertex_cached_content(model: str) -> bool: + """Return True iff sending ``cached_content`` to this model is safe. + + LiteLLM forwards ``cached_content`` as a top-level kwarg through to the + Vertex Gemini ``generateContent`` API. Other providers (OpenAI, Anthropic, + etc.) will reject unknown kwargs, so we gate emission to model names that + look Gemini-flavoured regardless of which LiteLLM provider prefix routes + them: ``vertex_ai/``, ``gemini/``, ``litellm_proxy/gemini-*`` (assuming + the proxy forwards to a Vertex backend), and bare ``gemini-*``. + """ + return "gemini" in (model or "").lower() + + def select_chat_options( llm, user_kwargs: dict[str, Any], has_tools: bool ) -> dict[str, Any]: @@ -99,4 +112,17 @@ def select_chat_options( if llm._prompt_cache_key: out["prompt_cache_key"] = llm._prompt_cache_key + # Vertex AI explicit context cache: user pre-creates a CachedContent + # resource (see https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache) + # and references it by name. LiteLLM forwards the kwarg to the Vertex + # ``generateContent`` API as ``cachedContent``. We only emit when the model + # is Gemini-flavoured so unknown-kwarg-rejecting providers (OpenAI, etc.) + # are left untouched. User-supplied kwargs take precedence. 
+ if ( + llm.vertex_cached_content + and "cached_content" not in out + and _model_supports_vertex_cached_content(llm.model) + ): + out["cached_content"] = llm.vertex_cached_content + return out diff --git a/tests/sdk/llm/test_chat_options.py b/tests/sdk/llm/test_chat_options.py index 4f63e95270..b5f066f520 100644 --- a/tests/sdk/llm/test_chat_options.py +++ b/tests/sdk/llm/test_chat_options.py @@ -21,6 +21,7 @@ class DummyLLM: _prompt_cache_key: str | None = None openrouter_site_url: str = "" openrouter_app_name: str = "" + vertex_cached_content: str | None = None def _openrouter_headers(self) -> dict[str, str]: headers: dict[str, str] = {} @@ -243,3 +244,91 @@ def test_chat_options_omits_openrouter_headers_when_unset(): llm = DummyLLM(model="gpt-4o") out = select_chat_options(llm, user_kwargs={}, has_tools=False) assert "extra_headers" not in out + + +# --------------------------------------------------------------------------- +# vertex_cached_content +# --------------------------------------------------------------------------- + + +def test_vertex_cached_content_emitted_for_vertex_gemini(): + """``vertex_cached_content`` is forwarded as ``cached_content`` for Gemini.""" + llm = LLM( + model="vertex_ai/gemini-3-flash", + vertex_cached_content="cachedContents/1234567890", + ) + out = select_chat_options(llm, user_kwargs={}, has_tools=True) + assert out["cached_content"] == "cachedContents/1234567890" + + +def test_vertex_cached_content_emitted_for_gemini_provider(): + """Bare ``gemini/`` provider also gets the kwarg.""" + llm = LLM( + model="gemini/gemini-3-flash", + vertex_cached_content="cachedContents/abc", + ) + out = select_chat_options(llm, user_kwargs={}, has_tools=True) + assert out["cached_content"] == "cachedContents/abc" + + +def test_vertex_cached_content_emitted_for_litellm_proxy_gemini(): + """Proxy-routed Gemini models receive the kwarg too. 
+ + Whether the proxy actually forwards it to Vertex is a proxy-side concern, + but the SDK must pass it through so a well-configured proxy can act on it. + """ + llm = LLM( + model="litellm_proxy/gemini-3.5-flash", + vertex_cached_content="cachedContents/xyz", + ) + out = select_chat_options(llm, user_kwargs={}, has_tools=True) + assert out["cached_content"] == "cachedContents/xyz" + + +def test_vertex_cached_content_suppressed_for_openai(): + """Non-Gemini providers (OpenAI, Anthropic, …) must not see the kwarg. + + They reject unknown kwargs, so emitting ``cached_content`` would break the + call. The user setting the field on a non-Gemini LLM is a misconfiguration + we silently tolerate rather than escalate to an error. + """ + llm = LLM( + model="gpt-5-mini", + vertex_cached_content="cachedContents/should-be-ignored", + ) + out = select_chat_options(llm, user_kwargs={}, has_tools=True) + assert "cached_content" not in out + + +def test_vertex_cached_content_suppressed_for_claude(): + """Anthropic models likewise.""" + llm = LLM( + model="claude-sonnet-4-5", + vertex_cached_content="cachedContents/should-be-ignored", + ) + out = select_chat_options(llm, user_kwargs={}, has_tools=True) + assert "cached_content" not in out + + +def test_vertex_cached_content_omitted_when_unset(): + """Default ``None`` produces no kwarg even on a Gemini model.""" + llm = LLM(model="vertex_ai/gemini-3-flash") + out = select_chat_options(llm, user_kwargs={}, has_tools=True) + assert "cached_content" not in out + + +def test_vertex_cached_content_respects_user_kwarg_override(): + """A caller-supplied ``cached_content`` wins over the LLM config field. + + This matches the precedence we apply elsewhere (extra_headers, etc.). 
+ """ + llm = LLM( + model="vertex_ai/gemini-3-flash", + vertex_cached_content="cachedContents/from-config", + ) + out = select_chat_options( + llm, + user_kwargs={"cached_content": "cachedContents/from-caller"}, + has_tools=True, + ) + assert out["cached_content"] == "cachedContents/from-caller"