From f59c5c3a2e09ce47414bb42f3005b6df22a7d56e Mon Sep 17 00:00:00 2001 From: Mongol Agent Date: Tue, 19 May 2026 16:08:03 +0000 Subject: [PATCH] feat(litellm): pass-through cache_control_injection_points for Anthropic prompt caching Add config pass-through to expose LiteLLM SDK's cache_control_injection_points kwarg via .pr_agent.toml or configuration.toml. Enables Anthropic prompt caching for self-hosted PR-Agent setups: [litellm] cache_control_injection_points = '[{"location": "message", "role": "system"}]' LiteLLM SDK supports this kwarg natively per https://docs.litellm.ai/docs/tutorials/prompt_caching but PR-Agent did not surface it through configuration. With static system prompts of 3-5K tokens (typical extra_instructions), caching delivers 30-50% input-token cost reduction on iterative review rounds within the 5-minute Anthropic TTL window. Backwards compatible: empty/missing setting = current behavior (no caching). --- pr_agent/algo/ai_handlers/litellm_ai_handler.py | 12 ++++++++++++ pr_agent/settings/configuration.toml | 3 +++ 2 files changed, 15 insertions(+) diff --git a/pr_agent/algo/ai_handlers/litellm_ai_handler.py b/pr_agent/algo/ai_handlers/litellm_ai_handler.py index a6e79d7a07..42889fbb40 100644 --- a/pr_agent/algo/ai_handlers/litellm_ai_handler.py +++ b/pr_agent/algo/ai_handlers/litellm_ai_handler.py @@ -532,6 +532,18 @@ async def chat_completion(self, model: str, system: str, user: str, temperature: # Support for custom OpenAI body fields (e.g., Flex Processing) kwargs = _process_litellm_extra_body(kwargs) + # Support for Anthropic prompt caching via LiteLLM's cache_control_injection_points + # (https://docs.litellm.ai/docs/tutorials/prompt_caching). Configurable as a JSON + # array in [litellm] section of configuration.toml or .pr_agent.toml. + if get_settings().get("LITELLM.CACHE_CONTROL_INJECTION_POINTS", None): + try: + cache_points = json.loads(get_settings().litellm.cache_control_injection_points) + if not isinstance(cache_points, list): + raise ValueError("LITELLM.CACHE_CONTROL_INJECTION_POINTS must be a JSON array") + kwargs["cache_control_injection_points"] = cache_points + except json.JSONDecodeError as e: + raise ValueError(f"LITELLM.CACHE_CONTROL_INJECTION_POINTS contains invalid JSON: {str(e)}") + # Support for Bedrock custom inference profile via model_id model_id = get_settings().get("litellm.model_id") if model_id and 'bedrock/' in model: diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml index f4d63a73f2..f780720168 100644 --- a/pr_agent/settings/configuration.toml +++ b/pr_agent/settings/configuration.toml @@ -326,6 +326,9 @@ success_callback = [] failure_callback = [] service_callback = [] # model_id = "" # Optional: Custom inference profile ID for Amazon Bedrock +# cache_control_injection_points = "" # Optional: JSON array enabling Anthropic prompt caching via LiteLLM +# Example: cache_control_injection_points = '[{"location": "message", "role": "system"}]' +# See https://docs.litellm.ai/docs/tutorials/prompt_caching [pr_similar_issue] skip_comments = false