diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py
index df2dd3f0dc..ba199a69c3 100644
--- a/openhands-sdk/openhands/sdk/llm/llm.py
+++ b/openhands-sdk/openhands/sdk/llm/llm.py
@@ -472,6 +472,28 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin):
         ),
     )
 
+    vertex_cached_content: str | None = Field(
+        default=None,
+        description=(
+            "Reference an existing Vertex AI ``CachedContent`` resource to use as "
+            "the cache prefix for every request. Pass the resource name of the cache "
+            "created with ``CachedContent.create``. On Vertex AI the full name is "
+            '``"projects/<project>/locations/<location>/cachedContents/<id>"``; the '
+            'Gemini API uses the short form ``"cachedContents/<id>"``. '
+            "The SDK threads it through to LiteLLM, "
+            "which forwards it to the Vertex Gemini ``generateContent`` API as "
+            "``cachedContent``. This bypasses the inline ``cache_control`` marker "
+            "path (which only works for ``vertex_ai/`` direct, not via "
+            "``litellm_proxy/``) and gives deterministic, explicit caching for "
+            "long-running agent runs whose system + tool prefix exceeds Vertex's "
+            "minimum cache size. The caller is responsible for creating, "
+            "refreshing the TTL, and deleting the cache resource — see "
+            "https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache. "
+            "Only forwarded when the model name contains ``gemini`` "
+            "(case-insensitive); ignored for every other model."
+        ),
+    )
+
     fallback_strategy: FallbackStrategy | None = Field(
         default=None,
         description=(
diff --git a/openhands-sdk/openhands/sdk/llm/options/chat_options.py b/openhands-sdk/openhands/sdk/llm/options/chat_options.py
index 88099d84e9..401ca90fe0 100644
--- a/openhands-sdk/openhands/sdk/llm/options/chat_options.py
+++ b/openhands-sdk/openhands/sdk/llm/options/chat_options.py
@@ -6,6 +6,23 @@
 from openhands.sdk.llm.utils.model_features import get_features
 
 
+def _model_supports_vertex_cached_content(model: str | None) -> bool:
+    """Return True when ``model`` looks Gemini-flavoured.
+
+    LiteLLM forwards ``cached_content`` as a top-level kwarg through to the
+    Vertex Gemini ``generateContent`` API. Other providers (OpenAI, Anthropic,
+    etc.) will reject unknown kwargs, so we gate emission to model names that
+    look Gemini-flavoured regardless of which LiteLLM provider prefix routes
+    them: ``vertex_ai/``, ``gemini/``, ``litellm_proxy/gemini-*`` (assuming
+    the proxy forwards to a Vertex backend), and bare ``gemini-*``.
+
+    This is a case-insensitive substring check for ``"gemini"``, not proof that
+    the backend accepts ``cached_content``; ``None`` or an empty name returns
+    False.
+    """
+    return "gemini" in (model or "").lower()
+
+
 def select_chat_options(
     llm, user_kwargs: dict[str, Any], has_tools: bool
 ) -> dict[str, Any]:
@@ -99,4 +116,17 @@ def select_chat_options(
     if llm._prompt_cache_key:
         out["prompt_cache_key"] = llm._prompt_cache_key
 
+    # Vertex AI explicit context cache: user pre-creates a CachedContent
+    # resource (see https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache)
+    # and references it by name. LiteLLM forwards the kwarg to the Vertex
+    # ``generateContent`` API as ``cachedContent``. We only emit when the model
+    # is Gemini-flavoured so unknown-kwarg-rejecting providers (OpenAI, etc.)
+    # are left untouched. User-supplied kwargs take precedence.
+    if (
+        llm.vertex_cached_content
+        and "cached_content" not in out
+        and _model_supports_vertex_cached_content(llm.model)
+    ):
+        out["cached_content"] = llm.vertex_cached_content
+
     return out
diff --git a/tests/sdk/llm/test_chat_options.py b/tests/sdk/llm/test_chat_options.py
index 4f63e95270..b5f066f520 100644
--- a/tests/sdk/llm/test_chat_options.py
+++ b/tests/sdk/llm/test_chat_options.py
@@ -21,6 +21,7 @@ class DummyLLM:
     _prompt_cache_key: str | None = None
     openrouter_site_url: str = ""
     openrouter_app_name: str = ""
+    vertex_cached_content: str | None = None
 
     def _openrouter_headers(self) -> dict[str, str]:
         headers: dict[str, str] = {}
@@ -243,3 +244,91 @@ def test_chat_options_omits_openrouter_headers_when_unset():
     llm = DummyLLM(model="gpt-4o")
     out = select_chat_options(llm, user_kwargs={}, has_tools=False)
     assert "extra_headers" not in out
+
+
+# ---------------------------------------------------------------------------
+# vertex_cached_content
+# ---------------------------------------------------------------------------
+
+
+def test_vertex_cached_content_emitted_for_vertex_gemini():
+    """``vertex_cached_content`` is forwarded as ``cached_content`` for Gemini."""
+    llm = LLM(
+        model="vertex_ai/gemini-3-flash",
+        vertex_cached_content="cachedContents/1234567890",
+    )
+    out = select_chat_options(llm, user_kwargs={}, has_tools=True)
+    assert out["cached_content"] == "cachedContents/1234567890"
+
+
+def test_vertex_cached_content_emitted_for_gemini_provider():
+    """Bare ``gemini/`` provider also gets the kwarg."""
+    llm = LLM(
+        model="gemini/gemini-3-flash",
+        vertex_cached_content="cachedContents/abc",
+    )
+    out = select_chat_options(llm, user_kwargs={}, has_tools=True)
+    assert out["cached_content"] == "cachedContents/abc"
+
+
+def test_vertex_cached_content_emitted_for_litellm_proxy_gemini():
+    """Proxy-routed Gemini models receive the kwarg too.
+
+    Whether the proxy actually forwards it to Vertex is a proxy-side concern,
+    but the SDK must pass it through so a well-configured proxy can act on it.
+    """
+    llm = LLM(
+        model="litellm_proxy/gemini-3.5-flash",
+        vertex_cached_content="cachedContents/xyz",
+    )
+    out = select_chat_options(llm, user_kwargs={}, has_tools=True)
+    assert out["cached_content"] == "cachedContents/xyz"
+
+
+def test_vertex_cached_content_suppressed_for_openai():
+    """Non-Gemini providers (OpenAI, Anthropic, …) must not see the kwarg.
+
+    They reject unknown kwargs, so emitting ``cached_content`` would break the
+    call. The user setting the field on a non-Gemini LLM is a misconfiguration
+    we silently tolerate rather than escalate to an error.
+    """
+    llm = LLM(
+        model="gpt-5-mini",
+        vertex_cached_content="cachedContents/should-be-ignored",
+    )
+    out = select_chat_options(llm, user_kwargs={}, has_tools=True)
+    assert "cached_content" not in out
+
+
+def test_vertex_cached_content_suppressed_for_claude():
+    """Anthropic models likewise."""
+    llm = LLM(
+        model="claude-sonnet-4-5",
+        vertex_cached_content="cachedContents/should-be-ignored",
+    )
+    out = select_chat_options(llm, user_kwargs={}, has_tools=True)
+    assert "cached_content" not in out
+
+
+def test_vertex_cached_content_omitted_when_unset():
+    """Default ``None`` produces no kwarg even on a Gemini model."""
+    llm = LLM(model="vertex_ai/gemini-3-flash")
+    out = select_chat_options(llm, user_kwargs={}, has_tools=True)
+    assert "cached_content" not in out
+
+
+def test_vertex_cached_content_respects_user_kwarg_override():
+    """A caller-supplied ``cached_content`` wins over the LLM config field.
+
+    This matches the precedence we apply elsewhere (extra_headers, etc.).
+    """
+    llm = LLM(
+        model="vertex_ai/gemini-3-flash",
+        vertex_cached_content="cachedContents/from-config",
+    )
+    out = select_chat_options(
+        llm,
+        user_kwargs={"cached_content": "cachedContents/from-caller"},
+        has_tools=True,
+    )
+    assert out["cached_content"] == "cachedContents/from-caller"