paoloanzn · elliothux · Apr 16, 2026 · Apr 16, 2026
diff --git a/src/services/api/claude.ts b/src/services/api/claude.ts
@@ -2281,13 +2281,15 @@ async function* queryModel(
                 max_tokens: maxOutputTokens,
                 output_tokens: usage.output_tokens,
               })
-              // Reuse the max_output_tokens recovery path — from the model's
-              // perspective, both mean "response was cut off, continue from
-              // where you left off."
+              // Keep the public error shape aligned with max_output_tokens so
+              // existing clients keep the same UX, and stash the precise
+              // overflow kind in errorDetails for the query loop's recovery
+              // routing.
               yield createAssistantAPIErrorMessage({
                 content: `${API_ERROR_MESSAGE_PREFIX}: The model has reached its context window limit.`,
                 apiError: 'max_output_tokens',
                 error: 'max_output_tokens',
+                errorDetails: 'context_window_exceeded',
               })
             }
             break

diff --git a/src/utils/context.ts b/src/utils/context.ts
@@ -52,14 +52,11 @@ export function getContextWindowForModel(
   model: string,
   betas?: string[],
 ): number {
-  // Allow override via environment variable (ant-only)
+  // Allow override via the CLAUDE_CODE_MAX_CONTEXT_TOKENS environment variable.
   // This takes precedence over all other context window resolution, including 1M detection,
   // so users can cap the effective context window for local decisions (auto-compact, etc.)
   // while still using a 1M-capable endpoint.
-  if (
-    process.env.USER_TYPE === 'ant' &&
-    process.env.CLAUDE_CODE_MAX_CONTEXT_TOKENS
-  ) {
+  if (process.env.CLAUDE_CODE_MAX_CONTEXT_TOKENS) {
     const override = parseInt(process.env.CLAUDE_CODE_MAX_CONTEXT_TOKENS, 10)
     if (!isNaN(override) && override > 0) {
       return override