From f59c5c3a2e09ce47414bb42f3005b6df22a7d56e Mon Sep 17 00:00:00 2001
From: Mongol Agent <mongol@chifat.ru>
Date: Tue, 19 May 2026 16:08:03 +0000
Subject: [PATCH] feat(litellm): pass-through cache_control_injection_points
 for Anthropic prompt caching

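Add a config pass-through that exposes the LiteLLM SDK's
cache_control_injection_points kwarg via .pr_agent.toml or configuration.toml.
The setting is a string containing a JSON array; invalid JSON or a non-array
value raises ValueError.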

Enables Anthropic prompt caching for self-hosted PR-Agent setups:

    [litellm]
    cache_control_injection_points = '[{"location": "message", "role": "system"}]'

The LiteLLM SDK supports this kwarg natively (see
https://docs.litellm.ai/docs/tutorials/prompt_caching), but PR-Agent did not
surface it through configuration. With static system prompts of 3-5K tokens
(typical with extra_instructions), caching delivers a 30-50% input-token cost
reduction on iterative review rounds within the 5-minute Anthropic TTL window.

Backward compatible: an empty or missing setting preserves the current
behavior (no caching).
---
 pr_agent/algo/ai_handlers/litellm_ai_handler.py | 12 ++++++++++++
 pr_agent/settings/configuration.toml            |  3 +++
 2 files changed, 15 insertions(+)

diff --git a/pr_agent/algo/ai_handlers/litellm_ai_handler.py b/pr_agent/algo/ai_handlers/litellm_ai_handler.py
index a6e79d7a07..42889fbb40 100644
--- a/pr_agent/algo/ai_handlers/litellm_ai_handler.py
+++ b/pr_agent/algo/ai_handlers/litellm_ai_handler.py
@@ -532,6 +532,18 @@ async def chat_completion(self, model: str, system: str, user: str, temperature:
                 # Support for custom OpenAI body fields (e.g., Flex Processing)
                 kwargs = _process_litellm_extra_body(kwargs)
 
+                # Support for Anthropic prompt caching via LiteLLM's cache_control_injection_points
+                # (https://docs.litellm.ai/docs/tutorials/prompt_caching). Configured as a string holding
+                # a JSON array in the [litellm] section of configuration.toml or .pr_agent.toml.
+                if get_settings().get("LITELLM.CACHE_CONTROL_INJECTION_POINTS", None):
+                    try:
+                        cache_points = json.loads(get_settings().litellm.cache_control_injection_points)
+                        if not isinstance(cache_points, list):
+                            raise ValueError("LITELLM.CACHE_CONTROL_INJECTION_POINTS must be a JSON array")
+                        kwargs["cache_control_injection_points"] = cache_points
+                    except json.JSONDecodeError as e:
+                        raise ValueError(f"LITELLM.CACHE_CONTROL_INJECTION_POINTS contains invalid JSON: {str(e)}")
+
                 # Support for Bedrock custom inference profile via model_id
                 model_id = get_settings().get("litellm.model_id")
                 if model_id and 'bedrock/' in model:
diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml
index f4d63a73f2..f780720168 100644
--- a/pr_agent/settings/configuration.toml
+++ b/pr_agent/settings/configuration.toml
@@ -326,6 +326,9 @@ success_callback = []
 failure_callback = []
 service_callback = []
 # model_id = "" # Optional: Custom inference profile ID for Amazon Bedrock
+# cache_control_injection_points = "" # Optional: string holding a JSON array; enables Anthropic prompt caching via LiteLLM
+# Example: cache_control_injection_points = '[{"location": "message", "role": "system"}]'
+# See https://docs.litellm.ai/docs/tutorials/prompt_caching
 
 [pr_similar_issue]
 skip_comments = false