From 0f71a688a9cce47ef44b8a46f3496604fcb5d4ff Mon Sep 17 00:00:00 2001
From: thezzisu <thezzisu@gmail.com>
Date: Sun, 24 May 2026 22:45:07 +0800
Subject: [PATCH] feat(server): clamp coreReq.MaxTokens to per-route
 max_output_tokens

When the inbound OpenAI Responses request omits max_output_tokens (or sets
it above the upstream limit), moonbridge previously injected the global
defaults.max_tokens unchanged. Anthropic / Qwen / Gemini upstreams reject
oversized values with 400.

Resolve a per-alias cap from config.Routes[<alias>].MaxOutputTokens, with
fallback to provider catalog ModelMeta.MaxOutputTokens, and clamp
coreReq.MaxTokens before the protocol adapter serializes the upstream
request. Add three unit tests covering route, fallback, and unset paths.
---
 internal/service/server/adapter_dispatch.go   |  6 ++
 .../service/server/adapter_dispatch_test.go   | 72 +++++++++++++++++++
 internal/service/server/server.go             | 20 ++++++
 3 files changed, 98 insertions(+)
diff --git a/internal/service/server/adapter_dispatch.go b/internal/service/server/adapter_dispatch.go
index 6031fa9c..facedb27 100644
--- a/internal/service/server/adapter_dispatch.go
+++ b/internal/service/server/adapter_dispatch.go
@@ -170,6 +170,12 @@ func (s *Server) handleWithAdapters(
 	// the upstream provider receives the correct model identifier.
 	coreReq.Model = preferred.UpstreamModel
 
+	if maxOut := s.routeMaxOutputTokens(openAIReq.Model, preferred); maxOut > 0 {
+		if coreReq.MaxTokens <= 0 || coreReq.MaxTokens > maxOut {
+			coreReq.MaxTokens = maxOut
+		}
+	}
+
 	wsMode := resolvedWebSearchMode(pm, openAIReq.Model, preferred)
 
 	// Inject web search tools at Core level if mode is "injected".
diff --git a/internal/service/server/adapter_dispatch_test.go b/internal/service/server/adapter_dispatch_test.go
index 32c6ce52..cfc61f41 100644
--- a/internal/service/server/adapter_dispatch_test.go
+++ b/internal/service/server/adapter_dispatch_test.go
@@ -265,3 +265,75 @@ func TestInjectCoreWebSearchSkipsWhenCandidateHasNativeSearch(t *testing.T) {
 		t.Fatalf("len(coreReq.Tools) = %d, want 0", len(coreReq.Tools))
 	}
 }
+
+// TestRouteMaxOutputTokensPrefersRouteEntry checks that a cap declared on the
+// route entry is returned even when the provider catalog lists another value.
+func TestRouteMaxOutputTokensPrefersRouteEntry(t *testing.T) {
+	const alias, providerKey, want = "claude-haiku-4-5", "newapi", 64000
+	cfg := config.Config{
+		Routes: map[string]config.RouteEntry{
+			alias: {Provider: providerKey, Model: alias, MaxOutputTokens: want},
+		},
+		ProviderDefs: map[string]config.ProviderDef{
+			providerKey: {
+				Models: map[string]config.ModelMeta{
+					alias: {MaxOutputTokens: 200000},
+				},
+			},
+		},
+	}
+	srv := &Server{runtime: runtime.NewRuntime(cfg, nil, nil)}
+	candidate := provider.ProviderCandidate{
+		ProviderKey:   providerKey,
+		UpstreamModel: alias,
+	}
+
+	got := srv.routeMaxOutputTokens(alias, candidate)
+	if got != want {
+		t.Fatalf("routeMaxOutputTokens() = %d, want 64000", got)
+	}
+}
+
+// TestRouteMaxOutputTokensFallsBackToProviderModelMeta checks that a route
+// without its own cap resolves to the value in the provider model catalog.
+func TestRouteMaxOutputTokensFallsBackToProviderModelMeta(t *testing.T) {
+	const alias, providerKey, want = "qwen3.6-plus", "newapi", 65536
+	cfg := config.Config{
+		Routes: map[string]config.RouteEntry{
+			alias: {Provider: providerKey, Model: alias},
+		},
+		ProviderDefs: map[string]config.ProviderDef{
+			providerKey: {
+				Models: map[string]config.ModelMeta{
+					alias: {MaxOutputTokens: want},
+				},
+			},
+		},
+	}
+	srv := &Server{runtime: runtime.NewRuntime(cfg, nil, nil)}
+	candidate := provider.ProviderCandidate{ProviderKey: providerKey, UpstreamModel: alias}
+
+	if got := srv.routeMaxOutputTokens(alias, candidate); got != want {
+		t.Fatalf("routeMaxOutputTokens() = %d, want 65536", got)
+	}
+}
+
+// TestRouteMaxOutputTokensReturnsZeroWhenUnset checks that 0 is reported when
+// neither the route entry nor the provider model catalog declares a cap.
+func TestRouteMaxOutputTokensReturnsZeroWhenUnset(t *testing.T) {
+	const alias, providerKey = "unbounded", "newapi"
+	cfg := config.Config{
+		Routes: map[string]config.RouteEntry{
+			alias: {Provider: providerKey, Model: alias},
+		},
+		ProviderDefs: map[string]config.ProviderDef{
+			providerKey: {Models: map[string]config.ModelMeta{}},
+		},
+	}
+	srv := &Server{runtime: runtime.NewRuntime(cfg, nil, nil)}
+	candidate := provider.ProviderCandidate{ProviderKey: providerKey, UpstreamModel: alias}
+
+	if got := srv.routeMaxOutputTokens(alias, candidate); got != 0 {
+		t.Fatalf("routeMaxOutputTokens() = %d, want 0", got)
+	}
+}
diff --git a/internal/service/server/server.go b/internal/service/server/server.go
index 4c792212..c8e98342 100644
--- a/internal/service/server/server.go
+++ b/internal/service/server/server.go
@@ -96,6 +96,26 @@ func (s *Server) activeProviderDefs() map[string]config.ProviderDef {
 	return nil
 }
 
+// routeMaxOutputTokens reports the max_output_tokens cap that applies to
+// modelAlias. A positive value on the route entry wins; otherwise the catalog
+// metadata for preferred's provider and upstream model is consulted.
+// A result of 0 means no positive cap was found in either place.
+func (s *Server) routeMaxOutputTokens(modelAlias string, preferred provider.ProviderCandidate) int {
+	snap := s.runtimeSnapshot()
+	if snap == nil {
+		return 0
+	}
+	// Indexing a missing key yields the zero value, so absent entries fall through.
+	if limit := snap.Config.Routes[modelAlias].MaxOutputTokens; limit > 0 {
+		return limit
+	}
+	models := snap.Config.ProviderDefs[preferred.ProviderKey].Models
+	if limit := models[preferred.UpstreamModel].MaxOutputTokens; limit > 0 {
+		return limit
+	}
+	return 0
+}
+
 func (s *Server) activeChatClient(providerKey string) any {
 	if snap := s.runtimeSnapshot(); snap != nil {
 		if def, ok := snap.Config.ProviderDefs[providerKey]; ok && def.Protocol == config.ProtocolOpenAIChat {