LarryXFly · pull · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -3376,19 +3376,26 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep(LlmRequest const& req, bool tw
             // Use the cached summary if provided; otherwise perform a fresh tree walk.
             auto const summary
                 = cachedSummary.has_value() ? cachedSummary.value() : analyzePrefixReuse(req.getUniqueTokens(0), req);
-            auto const numReusableBlocks = summary.reusableBlocksAllocated;
             auto const promptInputLen = std::min(req.mPromptLen, windowSize + mChunkSize);
             // Sequence insertion ignores the last prompt token because its KV cannot be recovered.
             // When the prompt lands exactly on a block boundary, counting reusable full blocks from
             // all unique tokens can over-credit one extra shared block.
             TLLM_CHECK_WITH_INFO(promptInputLen > 0, "Unexpected: promptInputLen == 0");
             auto const maxRecoverableSharedBlocks = (promptInputLen - 1) / getTokensPerBlock();
-            // Only subtract from shared blocks (reusable blocks are always shared)
-            auto const reusableSharedBlocks
-                = std::min(numReusableBlocks, std::min(numSharedBlocks, maxRecoverableSharedBlocks));
-            numRequiredBlocks -= reusableSharedBlocks;
-            // Store on request so the micro batch scheduler can use it for token budget
-            req.setEstimatedReusableTokens(reusableSharedBlocks * getTokensPerBlock());
+            // Block (capacity) budget: only allocated reuse reduces free-pool demand. A free-but-cached
+            // block still pulls one block from the free pool when reused, so crediting it here would
+            // double-count the eviction policy's free count and over-admit requests. Only subtract from
+            // shared blocks, since reusable blocks are always shared.
+            auto const reusableAllocatedBlocks
+                = std::min({summary.reusableBlocksAllocated, numSharedBlocks, maxRecoverableSharedBlocks});
+            numRequiredBlocks -= reusableAllocatedBlocks;
+            // Token (compute) budget: all cached prefix blocks skip recompute regardless of ref state,
+            // since the engine recovers their KV via prepopulatedPromptLen. This matches the accounting
+            // in getRemainingBlocksToCompletion (GUARANTEED_NO_EVICT) so the micro batch scheduler does
+            // not under-credit reuse and serialize context requests.
+            auto const reusableAllBlocks
+                = std::min({summary.reusableBlocksAll, numSharedBlocks, maxRecoverableSharedBlocks});
+            req.setEstimatedReusableTokens(reusableAllBlocks * getTokensPerBlock());
         }
         return numRequiredBlocks;
     }

diff --git a/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp b/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp
@@ -6511,6 +6511,91 @@ TEST(KVCacheManagerReuseAccountingTest, ReuseAwareBlockEstimatesStayConsistentAf
     EXPECT_EQ(remainingAfterContextAlloc, maxNewTokens / tokensPerBlock);
 }
 
+// Validates the two independent budgets getNeededBlocksOneStep drives for a first-chunk context request.
+// The block/capacity budget (numRequiredBlocks) credits only allocated reuse, since a free-but-cached
+// block still costs a free-pool block when reused. The compute/token budget (estimatedReusableTokens)
+// credits all cached prefix blocks, since the engine skips recomputing them via prepopulatedPromptLen
+// regardless of ref state.
+//
+// In steady state a shared prefix is free-but-cached (no active refs), so reusableBlocksAllocated is 0
+// while reusableBlocksAll stays large. Crediting only allocated reuse to the token budget would charge
+// the full prompt as compute and starve the FCFS micro-batch scheduler, serializing context requests.
+TEST(KVCacheManagerReuseAccountingTest, NeededBlocksOneStepCreditsFreeCachedReuseInTokenBudget)
+{
+    auto const stream = std::make_shared<tr::CudaStream>();
+    auto constexpr tokensPerBlock = 16;
+    auto constexpr promptLength = 64; // 4 full context blocks
+    auto constexpr maxNewTokens = 32;
+    auto constexpr maxBeamWidth = 1;
+    auto constexpr maxAttentionWindow = 512;
+    auto constexpr maxNumTokens = 1024;
+
+    auto kvCacheManager = createKvCacheManager(
+        KvCacheManagerInstantiationParameters{
+            /* numLayers */ 1,
+            /* numHeads */ 1,
+            /* sizePerHead */ 1,
+            /* tokensPerBlock */ tokensPerBlock,
+            /* blocksPerWindow */ blocksAndWindow(/* numPrimaryBlocks */ 256, /* windowSize */ maxAttentionWindow),
+            /* sinkTokenLength */ 0,
+            /* maxAttentionWindow */ maxAttentionWindow,
+            /* maxBeamWidth */ maxBeamWidth,
+            /* maxNumTokens */ maxNumTokens,
+            /* kvCacheBlockReuse */ true,
+        },
+        stream);
+    kvCacheManager->allocatePools(/*useUvm=*/false);
+    auto const onlyWindowSize = theOnlyWindowSize(*kvCacheManager);
+
+    auto const baseTokens = std::make_shared<std::vector<TokenIdType>>(static_cast<std::size_t>(promptLength), 7);
+
+    // req0 populates the radix tree, then is removed so its prefix blocks are FREE-but-cached
+    // (present in the reuse tree, but with no active references).
+    auto req0 = LlmRequest{0, maxNewTokens, baseTokens, tensorrt_llm::runtime::SamplingConfig{maxBeamWidth}, true};
+    kvCacheManager->addSequenceBatch({{{req0.mRequestId, req0.getPromptLen(), maxBeamWidth}}}, {std::ref(req0)});
+    tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(req0);
+    kvCacheManager->storeContextBlocks(req0);
+    tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(req0);
+    kvCacheManager->removeSequence(req0.mRequestId, req0);
+
+    // req1 shares the whole prefix. Because req0 was removed, the matching blocks have no refs.
+    auto req1 = LlmRequest{1, maxNewTokens, baseTokens, tensorrt_llm::runtime::SamplingConfig{maxBeamWidth}, true};
+
+    // storeContextBlocks only commits (promptLength - 1) tokens worth of blocks -> 3 full blocks.
+    auto const expectedReusableBlocks = (promptLength - 1) / tokensPerBlock; // 3
+    auto const summary = kvCacheManager->analyzePrefixReuse(req1.getUniqueTokens(0), req1);
+    // Every reusable block is free-but-cached, so all of them count in All and none in Allocated.
+    ASSERT_EQ(summary.reusableBlocksAll, expectedReusableBlocks);
+    ASSERT_EQ(summary.reusableBlocksAllocated, 0);
+
+    auto const numContextBlocks = promptLength / tokensPerBlock; // 4
+
+    // Block/capacity budget: free-cached reuse is not subtracted, since it still costs free blocks.
+    auto const neededOneStep
+        = kvCacheManager->getNeededBlocksOneStep(req1, /*twoStepsLookAhead=*/false, onlyWindowSize);
+    EXPECT_EQ(neededOneStep, numContextBlocks);
+
+    // Token/compute budget: the MAX_UTILIZATION path credits all cached blocks, so the whole prefix is
+    // free for compute.
+    auto const estimatedReusableTokens = req1.getEstimatedReusableTokens();
+    EXPECT_EQ(estimatedReusableTokens, expectedReusableBlocks * tokensPerBlock); // 48
+
+    // Model the FCFS micro-batch scheduler's compute-aware admission for a fixed per-iteration token
+    // budget: each first-chunk context request costs (promptLength - reusableTokens) compute tokens.
+    // Crediting all cached reuse admits more requests than crediting only allocated reuse.
+    auto const ctxTokenBudget = promptLength;                                                  // 64
+    auto const allocatedOnlyReusableTokens = summary.reusableBlocksAllocated * tokensPerBlock; // 0
+    auto const computeCostAllCached = std::max(1, promptLength - estimatedReusableTokens);
+    auto const computeCostAllocatedOnly = std::max(1, promptLength - allocatedOnlyReusableTokens);
+
+    auto const admittedAllocatedOnly = ctxTokenBudget / computeCostAllocatedOnly;
+    auto const admittedAllCached = ctxTokenBudget / computeCostAllCached;
+
+    EXPECT_EQ(admittedAllocatedOnly, 1); // full-prompt compute: only one request fits
+    EXPECT_EQ(admittedAllCached, 4);     // cached prefix is free: four requests fit
+    EXPECT_GT(admittedAllCached, admittedAllocatedOnly);
+}
+
 TEST(KVCacheManagerReuseAccountingTest, NeededBlocksOneStepCapsAllocatedReuseAtExactBlockBoundary)
 {
     auto const stream = std::make_shared<tr::CudaStream>();

diff --git a/examples/visual_gen/serve/README.md b/examples/visual_gen/serve/README.md
@@ -67,7 +67,7 @@ Demonstrates synchronous text-to-image generation using the OpenAI SDK. Supports
 
 **Features:**
 - Generates images from text prompts
-- Supports configurable model, image size, and quality
+- Supports configurable model and image size
 - Returns base64-encoded images or URLs
 - Saves generated images to disk
 
@@ -269,20 +269,52 @@ You can customize these by:
 ## Common Parameters
 
 ### Image Generation
-- `model`: Model identifier (e.g., "flux1", "flux2")
-- `prompt`: Text description
+- `prompt`: Text description (required)
 - `n`: Number of images to generate
-- `size`: Image dimensions (e.g., "512x512", "1024x1024")
-- `quality`: "standard" or "hd"
-- `response_format`: "b64_json" or "url"
+- `size`: Image dimensions in `WxH` format (e.g., `"512x512"`, `"1024x1024"`) — or use the structured pair `width` + `height` (both required when sent)
+- `seed`: Random seed; `null` / omitted means the engine draws a fresh seed
+- `num_inference_steps`, `guidance_scale`, `max_sequence_length`, `negative_prompt`: per-request denoise controls (override pipeline defaults when sent)
+- `extra_params`: model-specific overflow as a JSON object (see "Model-Specific `extra_params`" below). Unknown keys are rejected by the executor.
+- `response_format`: `"b64_json"` or `"url"`
+- `format`: Generation content encoding. Image encoders: `"png"`, `"webp"`, `"jpeg"`. Tensor formats: `"safetensors"`, `"pt"`.
+- Accept-and-warn OpenAI-shape fields (no engine semantic): `model`, `quality`, `style`, `user`. Sending `quality`/`style` logs a server-side WARNING; sending `model` warns on mismatch. None of these change generation behavior.
 
 ### Video Generation
-- `model`: Model identifier (e.g., "wan", "ltx2")
-- `prompt`: Text description
-- `size`: Video resolution (e.g., "256x256", "512x512", "1280x720")
-- `seconds`: Duration in seconds
-- `fps`: Frames per second
-- `input_reference`: Reference image file (for TI2V mode)
+- `prompt`: Text description (required)
+- `size` / `width` / `height`: same convention as image
+- `seconds`: Duration in seconds (engine multiplies by `frame_rate` to derive `num_frames` when the latter is absent)
+- `frame_rate` (canonical) or `fps` (alias): frames per second
+- `num_frames`: when set, wins over the `seconds * frame_rate` derivation
+- `seed`, `num_inference_steps`, `guidance_scale`, `max_sequence_length`, `negative_prompt`: per-request denoise controls
+- `input_reference`: Reference image (TI2V mode); accepted as base64-encoded string in JSON or as a file in multipart form-data
+- `extra_params`: model-specific overflow (see below)
+- `response_format`: `"b64_json"` or `"url"`
+- `format`: Generation content encoding. Video encoders: `"mp4"`, `"avi"`, `"auto"`. Tensor formats: `"safetensors"`, `"pt"` (carries video + audio + scalar metadata in one payload for LTX-2).
+
+#### Tensor-format consumer contract
+
+When `format="safetensors"` or `format="pt"`, the payload bundles every populated media tensor (`image` / `video` / `audio`) and the scalar metadata (`frame_rate`, `audio_sample_rate`) into one file.
+
+- **`pt`**: `torch.load(buf, weights_only=True)` returns a dict with the tensor keys and the scalars as native Python values.
+- **`safetensors`**: `safetensors.torch.load(bytes)` returns a dict with the tensor keys and each scalar as a 0-d tensor under the same key — call `.item()` to unbox (e.g. `loaded["frame_rate"].item()`). The same scalars are also written to the safetensors file header as strings; `safe_open(path, framework="pt").metadata()` exposes them in that form for consumers that prefer header access.
+
+#### Unknown-field policy
+
+The visual-gen endpoints reject unknown top-level fields with HTTP 422 (`extra="forbid"`). Anything model-specific belongs inside `extra_params`. Sending `output_format`, top-level `guidance_rescale`, or — for video — top-level `n` returns 422 with the offending field named in the error body.
+
+#### Model-specific `extra_params`
+
+Use the Python API to discover accepted keys for a loaded pipeline:
+
+```python
+generator = VisualGen(model="...")
+print(generator.extra_param_specs)   # {key: ExtraParamSchema(type=..., range=..., default=..., description=...)}
+```
+
+Examples:
+- **LTX-2**: `stg_scale`, `stg_blocks`, `modality_scale`, `guidance_rescale`, `output_type`, ...
+- **Wan 2.2 A14B**: `guidance_scale_2`, `boundary_ratio`
+- **Wan 2.1 / Flux**: no model-specific `extra_params` declared
 
 > **Note:** LTX-2 generates video **with audio**. The `ltx2.yml` config must include
 > `text_encoder_path` pointing to a Gemma3 model (e.g., `google/gemma-3-12b-it`).

diff --git a/examples/visual_gen/serve/async_video_gen.py b/examples/visual_gen/serve/async_video_gen.py
@@ -33,7 +33,7 @@ def test_async_video_generation(
     fps: int = 24,
     size: str = "256x256",
     output_file: str = "output_async.mp4",
-    output_format: str = "auto",
+    format: str = "auto",
 ):
     """Test asynchronous video generation with OpenAI SDK.
 
@@ -77,7 +77,7 @@ def test_async_video_generation(
             "seconds": duration,
             "extra_body": {
                 "fps": fps,
-                "output_format": output_format,
+                "format": format,
             },
         }
 
@@ -131,12 +131,18 @@ def test_async_video_generation(
         # For binary content, use the underlying HTTP client
         content = client.videos.download_content(video_id, variant="video")
 
-        # Check content type to determine actual file extension
-        content_type = getattr(content.response, "headers", {}).get("content-type", "video/mp4")
-        if "x-msvideo" in content_type or "avi" in content_type:
-            actual_ext = ".avi"
+        # Determine the on-disk extension. Tensor formats are
+        # selected by the request and the server returns
+        # ``application/octet-stream``; encoder formats can be
+        # disambiguated from Content-Type (mp4 vs avi).
+        if format in ("safetensors", "pt"):
+            actual_ext = f".{format}"
         else:
-            actual_ext = ".mp4"
+            content_type = getattr(content.response, "headers", {}).get("content-type", "video/mp4")
+            if "x-msvideo" in content_type or "avi" in content_type:
+                actual_ext = ".avi"
+            else:
+                actual_ext = ".mp4"
 
         # Adjust output filename if extension doesn't match
         output_path = Path(output_file)
@@ -233,11 +239,15 @@ def test_async_video_generation(
     )
 
     parser.add_argument(
-        "--output-format",
+        "--format",
         type=str,
         default="auto",
-        choices=["mp4", "avi", "auto"],
-        help="Output video format: mp4 or avi or auto",
+        choices=["mp4", "avi", "auto", "safetensors", "pt"],
+        help=(
+            "Generation content encoding format. Encoders: mp4 / avi / auto. "
+            "Tensor formats safetensors / pt return raw tensor bytes for "
+            "programmatic post-processing."
+        ),
     )
 
     args = parser.parse_args()
@@ -264,7 +274,7 @@ def test_async_video_generation(
         fps=args.fps,
         size=args.size,
         output_file=args.output,
-        output_format=args.output_format,
+        format=args.format,
     )
 
     sys.exit(0 if success else 1)
diff --git a/examples/visual_gen/serve/sync_image_gen.py b/examples/visual_gen/serve/sync_image_gen.py
@@ -28,11 +28,16 @@ def test_image_generation(
     prompt: str = "A lovely cat lying on a sofa",
     n: int = 1,
     size: str = "512x512",
-    quality: str = "standard",
+    format: str = "png",
     response_format: str = "b64_json",
     output_file: str = "output_generation.png",
 ):
-    """Test image generation endpoint."""
+    """Test image generation endpoint.
+
+    ``format`` selects the encoding for the returned bytes. Image encoders
+    are ``"png"``, ``"webp"``, ``"jpeg"``; tensor payloads are
+    ``"safetensors"`` and ``"pt"``.
+    """
     print("=" * 80)
     print("Testing Image Generation API (POST /v1/images/generations)")
     print("=" * 80)
@@ -44,30 +49,41 @@ def test_image_generation(
     print(f"   Model: {model}")
     print(f"   Prompt: {prompt}")
     print(f"   Size: {size}")
-    print(f"   Quality: {quality}")
+    print(f"   Format: {format}")
     print(f"   Number of images: {n}")
 
     try:
-        # Use OpenAI SDK's images.generate() method
+        # ``format`` is a trtllm-serve extension over the OpenAI image
+        # API; the SDK forwards it via ``extra_body``.
         response = client.images.generate(
             model=model,
             prompt=prompt,
             n=n,
             size=size,
-            quality=quality,
             response_format=response_format,
+            extra_body={"format": format},
         )
 
         print("\n✓ Image generated successfully!")
         print(f"   Number of images: {len(response.data)}")
 
+        # Choose the on-disk extension to match the requested format so
+        # the saved file's suffix reflects its actual contents.
+        ext_map = {
+            "png": ".png",
+            "webp": ".webp",
+            "jpeg": ".jpeg",
+            "safetensors": ".safetensors",
+            "pt": ".pt",
+        }
+        ext = ext_map[format]
+        stem = output_file.rsplit(".", 1)[0]
+
         # Save images
         for i, image in enumerate(response.data):
             if response_format == "b64_json":
-                # Decode base64 image
                 image_data = base64.b64decode(image.b64_json)
-                output = f"{output_file.rsplit('.', 1)[0]}_{i}.png" if n > 1 else output_file
-
+                output = f"{stem}_{i}{ext}" if n > 1 else f"{stem}{ext}"
                 with open(output, "wb") as f:
                     f.write(image_data)
 
@@ -116,6 +132,16 @@ def test_image_generation(
         default="512x512",
         help="Image size in WxH format (e.g., 512x512, 1024x1024)",
     )
+    parser.add_argument(
+        "--format",
+        type=str,
+        default="png",
+        choices=["png", "webp", "jpeg", "safetensors", "pt"],
+        help=(
+            "Generation content encoding format. Image encoders: png / "
+            "webp / jpeg. Tensor payloads: safetensors / pt."
+        ),
+    )
     parser.add_argument(
         "--output",
         type=str,
@@ -137,6 +163,7 @@ def test_image_generation(
         model=args.model,
         prompt=args.prompt,
         size=args.size,
+        format=args.format,
         output_file=args.output,
     )