Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 14 additions & 7 deletions cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3376,19 +3376,26 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep(LlmRequest const& req, bool tw
// Use the cached summary if provided; otherwise perform a fresh tree walk.
auto const summary
= cachedSummary.has_value() ? cachedSummary.value() : analyzePrefixReuse(req.getUniqueTokens(0), req);
auto const numReusableBlocks = summary.reusableBlocksAllocated;
auto const promptInputLen = std::min(req.mPromptLen, windowSize + mChunkSize);
// Sequence insertion ignores the last prompt token because its KV cannot be recovered.
// When the prompt lands exactly on a block boundary, counting reusable full blocks from
// all unique tokens can over-credit one extra shared block.
TLLM_CHECK_WITH_INFO(promptInputLen > 0, "Unexpected: promptInputLen == 0");
auto const maxRecoverableSharedBlocks = (promptInputLen - 1) / getTokensPerBlock();
// Only subtract from shared blocks (reusable blocks are always shared)
auto const reusableSharedBlocks
= std::min(numReusableBlocks, std::min(numSharedBlocks, maxRecoverableSharedBlocks));
numRequiredBlocks -= reusableSharedBlocks;
// Store on request so the micro batch scheduler can use it for token budget
req.setEstimatedReusableTokens(reusableSharedBlocks * getTokensPerBlock());
// Block (capacity) budget: only allocated reuse reduces free-pool demand. A free-but-cached
// block still pulls one block from the free pool when reused, so crediting it here would
// double-count the eviction policy's free count and over-admit requests. Only subtract from
// shared blocks, since reusable blocks are always shared.
auto const reusableAllocatedBlocks
= std::min({summary.reusableBlocksAllocated, numSharedBlocks, maxRecoverableSharedBlocks});
numRequiredBlocks -= reusableAllocatedBlocks;
// Token (compute) budget: all cached prefix blocks skip recompute regardless of ref state,
// since the engine recovers their KV via prepopulatedPromptLen. This matches the accounting
// in getRemainingBlocksToCompletion (GUARANTEED_NO_EVICT) so the micro batch scheduler does
// not under-credit reuse and serialize context requests.
auto const reusableAllBlocks
= std::min({summary.reusableBlocksAll, numSharedBlocks, maxRecoverableSharedBlocks});
req.setEstimatedReusableTokens(reusableAllBlocks * getTokensPerBlock());
}
return numRequiredBlocks;
}
Expand Down
85 changes: 85 additions & 0 deletions cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6511,6 +6511,91 @@ TEST(KVCacheManagerReuseAccountingTest, ReuseAwareBlockEstimatesStayConsistentAf
EXPECT_EQ(remainingAfterContextAlloc, maxNewTokens / tokensPerBlock);
}

// Validates the two independent budgets getNeededBlocksOneStep drives for a first-chunk context request.
// The block/capacity budget (numRequiredBlocks) credits only allocated reuse, since a free-but-cached
// block still costs a free-pool block when reused. The compute/token budget (estimatedReusableTokens)
// credits all cached prefix blocks, since the engine skips recomputing them via prepopulatedPromptLen
// regardless of ref state.
//
// In steady state a shared prefix is free-but-cached (no active refs), so reusableBlocksAllocated is 0
// while reusableBlocksAll stays large. Crediting only allocated reuse to the token budget would charge
// the full prompt as compute and starve the FCFS micro-batch scheduler, serializing context requests.
TEST(KVCacheManagerReuseAccountingTest, NeededBlocksOneStepCreditsFreeCachedReuseInTokenBudget)
{
auto const stream = std::make_shared<tr::CudaStream>();
auto constexpr tokensPerBlock = 16;
auto constexpr promptLength = 64; // 4 full context blocks
auto constexpr maxNewTokens = 32;
auto constexpr maxBeamWidth = 1;
auto constexpr maxAttentionWindow = 512;
auto constexpr maxNumTokens = 1024;

auto kvCacheManager = createKvCacheManager(
KvCacheManagerInstantiationParameters{
/* numLayers */ 1,
/* numHeads */ 1,
/* sizePerHead */ 1,
/* tokensPerBlock */ tokensPerBlock,
/* blocksPerWindow */ blocksAndWindow(/* numPrimaryBlocks */ 256, /* windowSize */ maxAttentionWindow),
/* sinkTokenLength */ 0,
/* maxAttentionWindow */ maxAttentionWindow,
/* maxBeamWidth */ maxBeamWidth,
/* maxNumTokens */ maxNumTokens,
/* kvCacheBlockReuse */ true,
},
stream);
kvCacheManager->allocatePools(/*useUvm=*/false);
auto const onlyWindowSize = theOnlyWindowSize(*kvCacheManager);

auto const baseTokens = std::make_shared<std::vector<TokenIdType>>(static_cast<std::size_t>(promptLength), 7);

// req0 populates the radix tree, then is removed so its prefix blocks are FREE-but-cached
// (present in the reuse tree, but with no active references).
auto req0 = LlmRequest{0, maxNewTokens, baseTokens, tensorrt_llm::runtime::SamplingConfig{maxBeamWidth}, true};
kvCacheManager->addSequenceBatch({{{req0.mRequestId, req0.getPromptLen(), maxBeamWidth}}}, {std::ref(req0)});
tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(req0);
kvCacheManager->storeContextBlocks(req0);
tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(req0);
kvCacheManager->removeSequence(req0.mRequestId, req0);

// req1 shares the whole prefix. Because req0 was removed, the matching blocks have no refs.
auto req1 = LlmRequest{1, maxNewTokens, baseTokens, tensorrt_llm::runtime::SamplingConfig{maxBeamWidth}, true};

// storeContextBlocks only commits (promptLength - 1) tokens worth of blocks -> 3 full blocks.
auto const expectedReusableBlocks = (promptLength - 1) / tokensPerBlock; // 3
auto const summary = kvCacheManager->analyzePrefixReuse(req1.getUniqueTokens(0), req1);
// Every reusable block is free-but-cached, so all of them count in All and none in Allocated.
ASSERT_EQ(summary.reusableBlocksAll, expectedReusableBlocks);
ASSERT_EQ(summary.reusableBlocksAllocated, 0);

auto const numContextBlocks = promptLength / tokensPerBlock; // 4

// Block/capacity budget: free-cached reuse is not subtracted, since it still costs free blocks.
auto const neededOneStep
= kvCacheManager->getNeededBlocksOneStep(req1, /*twoStepsLookAhead=*/false, onlyWindowSize);
EXPECT_EQ(neededOneStep, numContextBlocks);

// Token/compute budget: the MAX_UTILIZATION path credits all cached blocks, so the whole prefix is
// free for compute.
auto const estimatedReusableTokens = req1.getEstimatedReusableTokens();
EXPECT_EQ(estimatedReusableTokens, expectedReusableBlocks * tokensPerBlock); // 48

// Model the FCFS micro-batch scheduler's compute-aware admission for a fixed per-iteration token
// budget: each first-chunk context request costs (promptLength - reusableTokens) compute tokens.
// Crediting all cached reuse admits more requests than crediting only allocated reuse.
auto const ctxTokenBudget = promptLength; // 64
auto const allocatedOnlyReusableTokens = summary.reusableBlocksAllocated * tokensPerBlock; // 0
auto const computeCostAllCached = std::max(1, promptLength - estimatedReusableTokens);
auto const computeCostAllocatedOnly = std::max(1, promptLength - allocatedOnlyReusableTokens);

auto const admittedAllocatedOnly = ctxTokenBudget / computeCostAllocatedOnly;
auto const admittedAllCached = ctxTokenBudget / computeCostAllCached;

EXPECT_EQ(admittedAllocatedOnly, 1); // full-prompt compute: only one request fits
EXPECT_EQ(admittedAllCached, 4); // cached prefix is free: four requests fit
EXPECT_GT(admittedAllCached, admittedAllocatedOnly);
}

TEST(KVCacheManagerReuseAccountingTest, NeededBlocksOneStepCapsAllocatedReuseAtExactBlockBoundary)
{
auto const stream = std::make_shared<tr::CudaStream>();
Expand Down
56 changes: 44 additions & 12 deletions examples/visual_gen/serve/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ Demonstrates synchronous text-to-image generation using the OpenAI SDK. Supports

**Features:**
- Generates images from text prompts
- Supports configurable model, image size, and quality
- Supports configurable model and image size
- Returns base64-encoded images or URLs
- Saves generated images to disk

Expand Down Expand Up @@ -269,20 +269,52 @@ You can customize these by:
## Common Parameters

### Image Generation
- `model`: Model identifier (e.g., "flux1", "flux2")
- `prompt`: Text description
- `prompt`: Text description (required)
- `n`: Number of images to generate
- `size`: Image dimensions (e.g., "512x512", "1024x1024")
- `quality`: "standard" or "hd"
- `response_format`: "b64_json" or "url"
- `size`: Image dimensions in `WxH` format (e.g., `"512x512"`, `"1024x1024"`) — or use the structured pair `width` + `height` (both required when sent)
- `seed`: Random seed; `null` / omitted means the engine draws a fresh seed
- `num_inference_steps`, `guidance_scale`, `max_sequence_length`, `negative_prompt`: per-request denoise controls (override pipeline defaults when sent)
- `extra_params`: model-specific overflow as a JSON object (see "Model-Specific `extra_params`" below). Unknown keys are rejected by the executor.
- `response_format`: `"b64_json"` or `"url"`
- `format`: Generation content encoding. Image encoders: `"png"`, `"webp"`, `"jpeg"`. Tensor formats: `"safetensors"`, `"pt"`.
- Accept-and-warn OpenAI-shape fields (no engine semantic): `model`, `quality`, `style`, `user`. Sending `quality`/`style` logs a server-side WARNING; sending `model` warns on mismatch. None of these change generation behavior.

### Video Generation
- `model`: Model identifier (e.g., "wan", "ltx2")
- `prompt`: Text description
- `size`: Video resolution (e.g., "256x256", "512x512", "1280x720")
- `seconds`: Duration in seconds
- `fps`: Frames per second
- `input_reference`: Reference image file (for TI2V mode)
- `prompt`: Text description (required)
- `size` / `width` / `height`: same convention as image
- `seconds`: Duration in seconds (engine multiplies by `frame_rate` to derive `num_frames` when the latter is absent)
- `frame_rate` (canonical) or `fps` (alias): frames per second
- `num_frames`: when set, wins over the `seconds * frame_rate` derivation
- `seed`, `num_inference_steps`, `guidance_scale`, `max_sequence_length`, `negative_prompt`: per-request denoise controls
- `input_reference`: Reference image (TI2V mode); accepted as base64-encoded string in JSON or as a file in multipart form-data
- `extra_params`: model-specific overflow (see below)
- `response_format`: `"b64_json"` or `"url"`
- `format`: Generation content encoding. Video encoders: `"mp4"`, `"avi"`, `"auto"`. Tensor formats: `"safetensors"`, `"pt"` (carries video + audio + scalar metadata in one payload for LTX-2).

#### Tensor-format consumer contract

When `format="safetensors"` or `format="pt"`, the payload bundles every populated media tensor (`image` / `video` / `audio`) and the scalar metadata (`frame_rate`, `audio_sample_rate`) into one file.

- **`pt`**: `torch.load(buf, weights_only=True)` returns a dict with the tensor keys and the scalars as native Python values.
- **`safetensors`**: `safetensors.torch.load(bytes)` returns a dict with the tensor keys and each scalar as a 0-d tensor under the same key — call `.item()` to unbox (e.g. `loaded["frame_rate"].item()`). The same scalars are also written to the safetensors file header as strings; `safe_open(path, framework="pt").metadata()` exposes them in that form for consumers that prefer header access.

#### Unknown-field policy

The visual-gen endpoints reject unknown top-level fields with HTTP 422 (`extra="forbid"`). Anything model-specific belongs inside `extra_params`. Sending `output_format`, top-level `guidance_rescale`, or — for video — top-level `n` returns 422 with the offending field named in the error body.

#### Model-specific `extra_params`

Use the Python API to discover accepted keys for a loaded pipeline:

```python
generator = VisualGen(model="...")
print(generator.extra_param_specs) # {key: ExtraParamSchema(type=..., range=..., default=..., description=...)}
```

Examples:
- **LTX-2**: `stg_scale`, `stg_blocks`, `modality_scale`, `guidance_rescale`, `output_type`, ...
- **Wan 2.2 A14B**: `guidance_scale_2`, `boundary_ratio`
- **Wan 2.1 / Flux**: no model-specific `extra_params` declared

> **Note:** LTX-2 generates video **with audio**. The `ltx2.yml` config must include
> `text_encoder_path` pointing to a Gemma3 model (e.g., `google/gemma-3-12b-it`).
Expand Down
32 changes: 21 additions & 11 deletions examples/visual_gen/serve/async_video_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def test_async_video_generation(
fps: int = 24,
size: str = "256x256",
output_file: str = "output_async.mp4",
output_format: str = "auto",
format: str = "auto",
):
"""Test asynchronous video generation with OpenAI SDK.

Expand Down Expand Up @@ -77,7 +77,7 @@ def test_async_video_generation(
"seconds": duration,
"extra_body": {
"fps": fps,
"output_format": output_format,
"format": format,
},
}

Expand Down Expand Up @@ -131,12 +131,18 @@ def test_async_video_generation(
# For binary content, use the underlying HTTP client
content = client.videos.download_content(video_id, variant="video")

# Check content type to determine actual file extension
content_type = getattr(content.response, "headers", {}).get("content-type", "video/mp4")
if "x-msvideo" in content_type or "avi" in content_type:
actual_ext = ".avi"
# Determine the on-disk extension. Tensor formats are
# selected by the request and the server returns
# ``application/octet-stream``; encoder formats can be
# disambiguated from Content-Type (mp4 vs avi).
if format in ("safetensors", "pt"):
actual_ext = f".{format}"
else:
actual_ext = ".mp4"
content_type = getattr(content.response, "headers", {}).get("content-type", "video/mp4")
if "x-msvideo" in content_type or "avi" in content_type:
actual_ext = ".avi"
else:
actual_ext = ".mp4"

# Adjust output filename if extension doesn't match
output_path = Path(output_file)
Expand Down Expand Up @@ -233,11 +239,15 @@ def test_async_video_generation(
)

parser.add_argument(
"--output-format",
"--format",
type=str,
default="auto",
choices=["mp4", "avi", "auto"],
help="Output video format: mp4 or avi or auto",
choices=["mp4", "avi", "auto", "safetensors", "pt"],
help=(
"Generation content encoding format. Encoders: mp4 / avi / auto. "
"Tensor formats safetensors / pt return raw tensor bytes for "
"programmatic post-processing."
),
)

args = parser.parse_args()
Expand All @@ -264,7 +274,7 @@ def test_async_video_generation(
fps=args.fps,
size=args.size,
output_file=args.output,
output_format=args.output_format,
format=args.format,
)

sys.exit(0 if success else 1)
43 changes: 35 additions & 8 deletions examples/visual_gen/serve/sync_image_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,16 @@ def test_image_generation(
prompt: str = "A lovely cat lying on a sofa",
n: int = 1,
size: str = "512x512",
quality: str = "standard",
format: str = "png",
response_format: str = "b64_json",
output_file: str = "output_generation.png",
):
"""Test image generation endpoint."""
"""Test image generation endpoint.

``format`` selects the encoding for the returned bytes. Image encoders
are ``"png"``, ``"webp"``, ``"jpeg"``; tensor payloads are
``"safetensors"`` and ``"pt"``.
"""
print("=" * 80)
print("Testing Image Generation API (POST /v1/images/generations)")
print("=" * 80)
Expand All @@ -44,30 +49,41 @@ def test_image_generation(
print(f" Model: {model}")
print(f" Prompt: {prompt}")
print(f" Size: {size}")
print(f" Quality: {quality}")
print(f" Format: {format}")
print(f" Number of images: {n}")

try:
# Use OpenAI SDK's images.generate() method
# ``format`` is a trtllm-serve extension over the OpenAI image
# API; the SDK forwards it via ``extra_body``.
response = client.images.generate(
model=model,
prompt=prompt,
n=n,
size=size,
quality=quality,
response_format=response_format,
extra_body={"format": format},
)

print("\n✓ Image generated successfully!")
print(f" Number of images: {len(response.data)}")

# Choose the on-disk extension to match the requested format so
# the saved file's suffix reflects its actual contents.
ext_map = {
"png": ".png",
"webp": ".webp",
"jpeg": ".jpeg",
"safetensors": ".safetensors",
"pt": ".pt",
}
ext = ext_map[format]
stem = output_file.rsplit(".", 1)[0]

# Save images
for i, image in enumerate(response.data):
if response_format == "b64_json":
# Decode base64 image
image_data = base64.b64decode(image.b64_json)
output = f"{output_file.rsplit('.', 1)[0]}_{i}.png" if n > 1 else output_file

output = f"{stem}_{i}{ext}" if n > 1 else f"{stem}{ext}"
with open(output, "wb") as f:
f.write(image_data)

Expand Down Expand Up @@ -116,6 +132,16 @@ def test_image_generation(
default="512x512",
help="Image size in WxH format (e.g., 512x512, 1024x1024)",
)
parser.add_argument(
"--format",
type=str,
default="png",
choices=["png", "webp", "jpeg", "safetensors", "pt"],
help=(
"Generation content encoding format. Image encoders: png / "
"webp / jpeg. Tensor payloads: safetensors / pt."
),
)
parser.add_argument(
"--output",
type=str,
Expand All @@ -137,6 +163,7 @@ def test_image_generation(
model=args.model,
prompt=args.prompt,
size=args.size,
format=args.format,
output_file=args.output,
)

Expand Down
Loading
Loading