From 4ec8d0467b65d322845c45f34e673ecdaa0b6aaa Mon Sep 17 00:00:00 2001 From: Lianet Magrans Date: Fri, 5 Jun 2026 12:09:20 -0400 Subject: [PATCH 01/11] support dyn for uncompressed --- checkstyle/suppressions.xml | 2 +- .../producer/internals/BufferPool.java | 62 ++-- .../producer/internals/ChunkedBufferPool.java | 200 +++++++++++ .../ChunkedByteBufferOutputStream.java | 246 +++++++++++++ .../internals/ChunkedRecordAccumulator.java | 336 ++++++++++++++++++ .../producer/internals/ProducerBatch.java | 33 ++ .../producer/internals/RecordAccumulator.java | 61 ++-- .../record/internal/MemoryRecordsBuilder.java | 9 + gradle/spotbugs-exclude.xml | 12 + 9 files changed, 914 insertions(+), 47 deletions(-) create mode 100644 clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java create mode 100644 clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java create mode 100644 clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java diff --git a/checkstyle/suppressions.xml b/checkstyle/suppressions.xml index 1881f95ead5ea..a537e61d0217e 100644 --- a/checkstyle/suppressions.xml +++ b/checkstyle/suppressions.xml @@ -71,7 +71,7 @@ files="(Utils|Topic|Lz4BlockOutputStream|JoinGroupRequest).java"/> + files="(AbstractFetch|ChunkedBufferPool|ClientTelemetryReporter|ConsumerCoordinator|CommitRequestManager|FetchCollector|OffsetFetcherUtils|KafkaProducer|Sender|ConfigDef|KerberosLogin|AbstractRequest|AbstractResponse|Selector|SslFactory|SslTransportLayer|SaslClientAuthenticator|SaslClientCallbackHandler|SaslServerAuthenticator|AbstractCoordinator|TransactionManager|AbstractStickyAssignor|DefaultSslEngineFactory|Authorizer|RecordAccumulator|MemoryRecords|FetchSessionHandler|MockAdminClient).java"/> diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/BufferPool.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/BufferPool.java index 517a2bd9ca7a6..f376c0fe2f4a3 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/BufferPool.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/BufferPool.java @@ -48,15 +48,19 @@ public class BufferPool { private final long totalMemory; private final int poolableSize; - private final ReentrantLock lock; - private final Deque free; - private final Deque waiters; - /** Total available memory is the sum of nonPooledAvailableMemory and the number of byte buffers in free * poolableSize. */ - private long nonPooledAvailableMemory; + /** Lock held for any read or write of {@link #free}, {@link #waiters}, {@link #nonPooledAvailableMemory}, or {@link #closed}. */ + protected final ReentrantLock lock; + /** Pooled buffers of capacity {@link #poolableSize}, available for reuse. Guarded by {@link #lock}. */ + protected final Deque free; + /** FIFO queue of pending allocation requests; the longest-waiting thread is woken first. Guarded by {@link #lock}. */ + protected final Deque waiters; + /** Total available memory is the sum of nonPooledAvailableMemory and the number of byte buffers in free * poolableSize. Guarded by {@link #lock}. */ + protected long nonPooledAvailableMemory; private final Metrics metrics; - private final Time time; + protected final Time time; private final Sensor waitTime; - private boolean closed; + /** True once {@link #close()} has been invoked. */ + protected boolean closed; /** * Create a new buffer pool @@ -192,8 +196,7 @@ public ByteBuffer allocate(int size, long maxTimeToBlockMs) throws InterruptedEx // signal any additional waiters if there is more memory left // over for them try { - if (!(this.nonPooledAvailableMemory == 0 && this.free.isEmpty()) && !this.waiters.isEmpty()) - this.waiters.peekFirst().signal(); + signalNextWaiterIfMemoryAvailable(); } finally { // Another finally... otherwise find bugs complains lock.unlock(); @@ -211,6 +214,15 @@ protected void recordWaitTime(long timeNs) { this.waitTime.record(timeNs, time.milliseconds()); } + /** + * Wake the longest-waiting thread if any memory (pooled or non-pooled) is available. + * Must be called with {@link #lock} held. No-op if no waiters or no memory is free. + */ + protected void signalNextWaiterIfMemoryAvailable() { + if (!(this.nonPooledAvailableMemory == 0 && this.free.isEmpty()) && !this.waiters.isEmpty()) + this.waiters.peekFirst().signal(); + } + /** * Allocate a buffer. If buffer allocation fails (e.g. because of OOM) then return the size count back to * available memory and signal the next waiter if it exists. @@ -222,16 +234,24 @@ private ByteBuffer safeAllocateByteBuffer(int size) { error = false; return buffer; } finally { - if (error) { - this.lock.lock(); - try { - this.nonPooledAvailableMemory += size; - if (!this.waiters.isEmpty()) - this.waiters.peekFirst().signal(); - } finally { - this.lock.unlock(); - } - } + if (error) + releaseReservedBytes(size); + } + } + + /** + * Return previously-reserved non-pooled bytes to the pool and signal the next + * waiter. Acquires {@link #lock} internally. Used by callers + * that reserve memory and then need to roll back the reservation (e.g., upon errors). + */ + protected void releaseReservedBytes(long bytes) { + this.lock.lock(); + try { + this.nonPooledAvailableMemory += bytes; + if (!this.waiters.isEmpty()) + this.waiters.peekFirst().signal(); + } finally { + this.lock.unlock(); } } @@ -242,9 +262,9 @@ protected ByteBuffer allocateByteBuffer(int size) { /** * Attempt to ensure we have at least the requested number of bytes of memory for allocation by deallocating pooled - * buffers (if needed) + * buffers (if needed). Must be called with {@link #lock} held. */ - private void freeUp(int size) { + protected void freeUp(int size) { while (!this.free.isEmpty() && this.nonPooledAvailableMemory < size) this.nonPooledAvailableMemory += this.free.pollLast().capacity(); } diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java new file mode 100644 index 0000000000000..96a4e6e0b36e4 --- /dev/null +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.clients.producer.internals; + +import org.apache.kafka.clients.producer.BufferExhaustedException; +import org.apache.kafka.common.KafkaException; +import org.apache.kafka.common.metrics.Metrics; +import org.apache.kafka.common.utils.Time; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Condition; + +/** + * A {@link BufferPool} dedicated to chunk-sized buffer reuse. The chunk size is passed to the + * parent as its {@link #poolableSize()}. + *

+ * Adds {@link #allocateChunks(int, long)} for callers that need to acquire multiple + * chunks at once. The implementation is atomic across the K-chunk request: + * a single lock acquisition for the whole reservation, a single {@link Condition} + * added to {@link #waiters} so the request is FIFO-fair against single-chunk acquirers, and + * any failure (timeout, close, OOM) refunds the entire reservation atomically before the + * exception propagates. No partial chunk holds are visible outside the lock during the wait. + */ +public class ChunkedBufferPool extends BufferPool { + + public ChunkedBufferPool(long memory, int chunkSize, Metrics metrics, Time time, String metricGrpName) { + super(memory, chunkSize, metrics, time, metricGrpName); + } + + /** + * Allocate {@code ceil(totalSize / chunkSize)} chunk-sized buffers atomically. + * This follows the same principle as the allocate in the parent BufferPool class: if there is + * memory available, allocate the requested number of chunks. If not, wait for memory. + * If it needs to wait, it blocks up to {@code maxTimeToBlockMs} for the whole request, + * queued on {@link #waiters} alongside single-chunk acquirers (FIFO). + *

+ * On any failure path every byte reserved during the call is returned to the pool + * and the next waiter is signaled. No partial chunk holds are visible to other threads during the wait. + * The reservation is tracked as bytes against {@link #nonPooledAvailableMemory} plus chunks polled from {@link #free}. + * + * @param totalSize minimum total bytes of capacity required across the returned chunks + * @param maxTimeToBlockMs maximum time in milliseconds to block waiting for memory + * @return list of {@code ceil(totalSize / chunkSize)} {@code ByteBuffer}s, each of capacity + * {@code chunkSize} + * @throws InterruptedException if interrupted while waiting + * @throws IllegalArgumentException if {@code totalSize <= 0} or {@code totalSize > totalMemory()} + * @throws BufferExhaustedException if the request can't be satisfied within {@code maxTimeToBlockMs} + * @throws KafkaException if the pool is closed during the wait + */ + public List allocateChunks(int totalSize, long maxTimeToBlockMs) throws InterruptedException { + if (totalSize <= 0) + throw new IllegalArgumentException("totalSize must be positive: " + totalSize); + if (totalSize > totalMemory()) + throw new IllegalArgumentException("Attempt to allocate " + totalSize + + " bytes across chunks, but there is a hard limit of " + + totalMemory() + " on memory allocations."); + + int chunkSize = poolableSize(); + int numChunks = (int) (((long) totalSize + chunkSize - 1L) / chunkSize); + long memoryRequired = (long) numChunks * chunkSize; + + // Chunks pulled from the free list during the reservation. Remaining bytes + // (memoryRequired - pooled.size() * chunkSize) are reserved against + // nonPooledAvailableMemory and materialized as raw allocations after the lock is + // released. + List pooled = new ArrayList<>(numChunks); + + lock.lock(); + if (this.closed) { + lock.unlock(); + throw new KafkaException("Producer closed while allocating memory"); + } + try { + long freeListBytes = (long) free.size() * chunkSize; + if (this.nonPooledAvailableMemory + freeListBytes >= memoryRequired) { + // Enough memory available to allocate the chunks needed + while (pooled.size() < numChunks && !free.isEmpty()) + pooled.add(free.pollFirst()); + long remainingBytes = memoryRequired - (long) pooled.size() * chunkSize; + if (remainingBytes > 0) { + // remainingBytes is bounded by memoryRequired ≤ totalMemory; the entry check + // rejects requests larger than that, so the int cast is safe in practice. + freeUp((int) remainingBytes); + this.nonPooledAvailableMemory -= remainingBytes; + } + } else { + // Not enough memory available to allocate the chunks needed, so we need to wait for memory. + // Same as in BufferPool.allocate, but wait to acquire the memory needed for all the chunks. + // A single Condition is added to the waiter's list to ensure FIFO fairness at the request level. + // + // `accumulated` tracks bytes drawn from nonPooledAvailableMemory only (pool chunks + // already taken live in `pooled`). Matches BufferPool.allocate's semantics: on + // failure, `accumulated` is exactly the amount to refund; on success it is reset to 0. + long accumulated = 0; + Condition moreMemory = lock.newCondition(); + try { + long remainingTimeToBlockNs = TimeUnit.MILLISECONDS.toNanos(maxTimeToBlockMs); + waiters.addLast(moreMemory); + while ((long) pooled.size() * chunkSize + accumulated < memoryRequired) { + long startWaitNs = time.nanoseconds(); + long timeNs; + boolean waitingTimeElapsed; + try { + waitingTimeElapsed = !moreMemory.await(remainingTimeToBlockNs, TimeUnit.NANOSECONDS); + } finally { + long endWaitNs = time.nanoseconds(); + timeNs = Math.max(0L, endWaitNs - startWaitNs); + recordWaitTime(timeNs); + } + + if (this.closed) + throw new KafkaException("Producer closed while allocating memory"); + + if (waitingTimeElapsed) { + throw new BufferExhaustedException("Failed to allocate " + memoryRequired + + " bytes (" + numChunks + " chunks of " + chunkSize + + ") within the configured max blocking time " + maxTimeToBlockMs + + " ms. Total memory: " + totalMemory() + " bytes. Available memory: " + + availableMemory() + " bytes."); + } + + remainingTimeToBlockNs -= timeNs; + + // Take pool chunks first, then reserve non-pool bytes for the remainder. + // Pool chunks don't bump `accumulated` because their refund path is + // `free.addFirst` in the outer catch, not nonPool. + while (pooled.size() < numChunks + && (long) (pooled.size() + 1) * chunkSize + accumulated <= memoryRequired + && !free.isEmpty()) { + pooled.add(free.pollFirst()); + } + long stillNeeded = memoryRequired - (long) pooled.size() * chunkSize - accumulated; + if (stillNeeded > 0) { + freeUp((int) stillNeeded); + long got = Math.min(stillNeeded, this.nonPooledAvailableMemory); + this.nonPooledAvailableMemory -= got; + accumulated += got; + } + } + // Loop terminated normally — clear the rollback marker (mirrors BufferPool.allocate). + accumulated = 0; + } finally { + // On failure (timeout / close / interrupt), refund the non-pool bytes taken. + // Pool chunks already in `pooled` are returned to `free` separately by the + // outer catch. + this.nonPooledAvailableMemory += accumulated; + waiters.remove(moreMemory); + } + } + } catch (RuntimeException | InterruptedException e) { + // Any chunks taken from the pool must be returned. + for (ByteBuffer chunk : pooled) + free.addFirst(chunk); + throw e; + } finally { + try { + signalNextWaiterIfMemoryAvailable(); + } finally { + lock.unlock(); + } + } + + // Allocate raw chunks for the reserved non-pooled portion outside the lock. On OOM, + // refund all reserved bytes (memoryRequired) and let the next waiter try. + int chunksStillNeeded = numChunks - pooled.size(); + List result = new ArrayList<>(numChunks); + result.addAll(pooled); + boolean error = true; + try { + for (int i = 0; i < chunksStillNeeded; i++) + result.add(allocateByteBuffer(chunkSize)); + error = false; + return result; + } finally { + if (error) { + // The pooled buffers we already drained are also lost on this path; mirror + // BufferPool.safeAllocateByteBuffer's behaviour (the bytes return, the ByteBuffer + // instances become garbage). + releaseReservedBytes(memoryRequired); + } + } + } +} diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java new file mode 100644 index 0000000000000..ccf1d3d7374c9 --- /dev/null +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.clients.producer.internals; + +import org.apache.kafka.common.utils.ByteBufferOutputStream; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; + +/** + * A {@link ByteBufferOutputStream} backed by a linked list of fixed-size chunks instead of a + * single re-allocated buffer. + *

+ * This stream does not grow on its own. Chunks are supplied by the caller — initial + * chunks via the constructor, additional chunks via {@link #addBuffers(List)}. When a write's + * size exceeds the stream's {@link #remaining()} (sum of free bytes across all attached + * chunks), {@link IllegalStateException} is thrown. The caller is responsible for attaching + * additional chunks before any such write. + *

+ * Automatic mid-write growth (allocating a chunk from inside {@code write()}) will be added + * together with compression support in a follow-up: compressor buffering can cause a record's + * write to exceed its reservation, which the caller can't predict in advance. Until then, the + * write path stays allocation-free and deterministic. + *

+ * When {@link #buffer()} is called (typically at batch close), all chunks are flattened into a + * single contiguous ByteBuffer — exactly one copy operation. + */ +public class ChunkedByteBufferOutputStream extends ByteBufferOutputStream { + + private final List chunks; + private final int chunkSize; + private final BufferPool pool; + private ByteBuffer currentChunk; + private int currentChunkIndex; + private ByteBuffer flattenedBuffer; + private boolean dirty; + + /** + * Constructs a chunked output stream backed by the given pre-allocated chunks. Ownership of + * {@code initialChunks} transfers to this stream — they will be returned to the pool via + * {@link #deallocate()}. + * + * @param initialChunks pre-allocated chunks; must be non-empty; each chunk's capacity must + * equal {@code chunkSize} + * @param chunkSize the size of each chunk in bytes + * @param pool the buffer pool used for deallocation + */ + @SuppressWarnings("this-escape") + public ChunkedByteBufferOutputStream(List initialChunks, int chunkSize, BufferPool pool) { + super(initialChunks.get(0)); + this.chunkSize = chunkSize; + this.pool = pool; + this.chunks = new ArrayList<>(initialChunks); + this.currentChunk = this.chunks.get(0); + this.currentChunkIndex = 0; + this.dirty = true; + } + + @Override + public void write(int b) { + ensureChunkCapacity(1); + currentChunk.put((byte) b); + dirty = true; + } + + @Override + public void write(byte[] bytes, int off, int len) { + while (len > 0) { + ensureChunkCapacity(1); + int toWrite = Math.min(len, currentChunk.remaining()); + currentChunk.put(bytes, off, toWrite); + off += toWrite; + len -= toWrite; + } + dirty = true; + } + + @Override + public void write(ByteBuffer sourceBuffer) { + while (sourceBuffer.hasRemaining()) { + ensureChunkCapacity(1); + int toWrite = Math.min(sourceBuffer.remaining(), currentChunk.remaining()); + int oldLimit = sourceBuffer.limit(); + sourceBuffer.limit(sourceBuffer.position() + toWrite); + currentChunk.put(sourceBuffer); + sourceBuffer.limit(oldLimit); + } + dirty = true; + } + + private void ensureChunkCapacity(int needed) { + if (currentChunk.remaining() < needed) { + advanceToNextChunk(); + } + } + + /** + * Advances {@code currentChunk} to the next pre-supplied chunk. The caller is responsible + * for obtaining additional chunks (e.g. {@code ChunkedBufferPool.allocateChunks}) and + * attaching them via {@link #addBuffers(List)} before any write that would exceed + * {@link #remaining()}. The KIP's mid-record growth path (non-blocking pool then direct + * heap) lands together with compression support. + */ + private void advanceToNextChunk() { + if (currentChunkIndex + 1 >= chunks.size()) { + throw new IllegalStateException( + "No more chunks available; the write exceeded the stream's remaining capacity. " + + "Caller must obtain additional chunks (e.g. from ChunkedBufferPool) and attach them " + + "via addBuffers before any write whose size exceeds remaining()"); + } + currentChunkIndex++; + currentChunk = chunks.get(currentChunkIndex); + } + + /** + * Appends pre-allocated chunks to this stream. Ownership of {@code newChunks} transfers to + * the stream; they will be returned to the pool via {@link #deallocate()}. + */ + public void addBuffers(List newChunks) { + chunks.addAll(newChunks); + } + + @Override + public ByteBuffer buffer() { + if (flattenedBuffer != null && !dirty) { + return flattenedBuffer; + } + int totalSize = 0; + for (ByteBuffer chunk : chunks) { + totalSize += chunk.position(); + } + flattenedBuffer = ByteBuffer.allocate(totalSize); + for (ByteBuffer chunk : chunks) { + int chunkPos = chunk.position(); + chunk.flip(); + flattenedBuffer.put(chunk); + chunk.limit(chunk.capacity()); + chunk.position(chunkPos); + } + dirty = false; + return flattenedBuffer; + } + + /** + * Total bytes written across all chunks. + */ + @Override + public int position() { + int total = 0; + for (ByteBuffer chunk : chunks) { + total += chunk.position(); + } + return total; + } + + /** + * Sets the write position, walking across pre-supplied chunks if the requested position + * exceeds the first chunk's capacity. Only valid before any write. + */ + @Override + public void position(int position) { + if (currentChunkIndex != 0 || currentChunk.position() != 0) { + throw new IllegalStateException("position() can only be called before any writes"); + } + int remaining = position; + int idx = 0; + while (remaining > 0 && idx < chunks.size()) { + ByteBuffer chunk = chunks.get(idx); + int take = Math.min(remaining, chunk.capacity()); + chunk.position(take); + remaining -= take; + if (remaining > 0) + idx++; + } + if (remaining > 0) { + throw new IllegalArgumentException("position " + position + + " exceeds total pre-allocated capacity"); + } + currentChunkIndex = idx; + currentChunk = chunks.get(idx); + dirty = true; + } + + /** + * Total bytes available across the current chunk and every queued (not-yet-active) chunk. + */ + @Override + public int remaining() { + int total = currentChunk.remaining(); + for (int i = currentChunkIndex + 1; i < chunks.size(); i++) + total += chunks.get(i).remaining(); + return total; + } + + @Override + public int limit() { + return Integer.MAX_VALUE; + } + + @Override + public int initialCapacity() { + return chunks.isEmpty() ? chunkSize : chunks.get(0).capacity(); + } + + @Override + public void ensureRemaining(int remainingBytesRequired) { + // The stream advances one chunk at a time — the most any single ensureChunkCapacity + // call can guarantee is `chunkSize` of contiguous-by-chunk space. Callers needing more + // than that are expected to attach chunks via addBuffers before writing; write(byte[]) + // itself loops across chunks so contiguous capacity isn't required. + ensureChunkCapacity(Math.min(remainingBytesRequired, chunkSize)); + } + + /** + * Returns all pool-allocated chunks to the buffer pool. + */ + public void deallocate(BufferPool pool) { + if (pool != null) { + for (ByteBuffer chunk : chunks) { + pool.deallocate(chunk); + } + } + chunks.clear(); + currentChunk = null; + flattenedBuffer = null; + } + + public void deallocate() { + deallocate(pool); + } +} diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java new file mode 100644 index 0000000000000..cac18c608d195 --- /dev/null +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java @@ -0,0 +1,336 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.clients.producer.internals; + +import org.apache.kafka.clients.producer.BufferExhaustedException; +import org.apache.kafka.clients.producer.RecordMetadata; +import org.apache.kafka.common.Cluster; +import org.apache.kafka.common.compress.Compression; +import org.apache.kafka.common.header.Header; +import org.apache.kafka.common.metrics.Metrics; +import org.apache.kafka.common.record.TimestampType; +import org.apache.kafka.common.record.internal.AbstractRecords; +import org.apache.kafka.common.record.internal.CompressionType; +import org.apache.kafka.common.record.internal.MemoryRecordsBuilder; +import org.apache.kafka.common.record.internal.Record; +import org.apache.kafka.common.record.internal.RecordBatch; +import org.apache.kafka.common.utils.Time; +import org.apache.kafka.common.utils.internals.LogContext; + +import java.nio.ByteBuffer; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.List; + +/** + * A {@link RecordAccumulator} variant that uses chunked buffer allocation: each new batch + * pre-allocates {@code ceil(recordSize / chunkSize)} chunks from a {@link ChunkedBufferPool}, + * and grows mid-batch by attaching additional chunks via + * {@link ProducerBatch#addBuffers(List)} when later records would overflow. + *

+ * This is the "dynamic" buffer.memory allocation strategy: memory consumption scales with + * actual data buffered, not with {@code active_partition_count × batch.size}. + *

+ * PR1 scope: uncompressed only. Batch creation throws + * {@link UnsupportedOperationException} if compression is requested. The mid-batch flow + * matches the KIP: try a non-blocking pool acquire for the extension chunks; on + * {@link BufferExhaustedException}, close the existing batch (so the sender can drain it) + * and let the new record flow through the first-record path, which blocks on the pool for + * a fresh batch's chunks up to {@code max.block.ms}. + *

+ * TODO: add the new buffer.memory.allocation.strategy config and instantiate this class only when set to "dynamic" + * TODO: add support for compressed data. Throws UnsupportedOperationException for now, + * and IllegalStateException if overflow detected. Follow-ups will support compressed data and growth. + */ +public class ChunkedRecordAccumulator extends RecordAccumulator { + + private final ChunkedBufferPool chunkedFree; + + public ChunkedRecordAccumulator(LogContext logContext, + int batchSize, + Compression compression, + int lingerMs, + long retryBackoffMs, + long retryBackoffMaxMs, + int deliveryTimeoutMs, + PartitionerConfig partitionerConfig, + Metrics metrics, + String metricGrpName, + Time time, + TransactionManager transactionManager, + ChunkedBufferPool bufferPool) { + super(logContext, batchSize, compression, lingerMs, retryBackoffMs, retryBackoffMaxMs, + deliveryTimeoutMs, partitionerConfig, metrics, metricGrpName, time, transactionManager, bufferPool); + this.chunkedFree = bufferPool; + } + + public ChunkedRecordAccumulator(LogContext logContext, + int batchSize, + Compression compression, + int lingerMs, + long retryBackoffMs, + long retryBackoffMaxMs, + int deliveryTimeoutMs, + Metrics metrics, + String metricGrpName, + Time time, + TransactionManager transactionManager, + ChunkedBufferPool bufferPool) { + this(logContext, batchSize, compression, lingerMs, retryBackoffMs, retryBackoffMaxMs, + deliveryTimeoutMs, new PartitionerConfig(), metrics, metricGrpName, time, transactionManager, + bufferPool); + } + + @Override + public RecordAppendResult append(String topic, + int partition, + long timestamp, + byte[] key, + byte[] value, + Header[] headers, + AppendCallbacks callbacks, + long maxTimeToBlock, + long nowMs, + Cluster cluster) throws InterruptedException { + TopicInfo topicInfo = topicInfoMap.computeIfAbsent(topic, + k -> new TopicInfo(createBuiltInPartitioner(logContext, k, batchSize, partitionerRackAware, rack))); + + appendsInProgress.incrementAndGet(); + ChunkedByteBufferOutputStream bufferStream = null; + List extensionChunks = null; + int extensionBytes = 0; + if (headers == null) headers = Record.EMPTY_HEADERS; + try { + while (true) { + final BuiltInPartitioner.StickyPartitionInfo partitionInfo; + final int effectivePartition; + if (partition == RecordMetadata.UNKNOWN_PARTITION) { + partitionInfo = topicInfo.builtInPartitioner.peekCurrentPartitionInfo(cluster); + effectivePartition = partitionInfo.partition(); + } else { + partitionInfo = null; + effectivePartition = partition; + } + setPartition(callbacks, effectivePartition); + + Deque dq = topicInfo.batches.computeIfAbsent(effectivePartition, k -> new ArrayDeque<>()); + synchronized (dq) { + if (partitionChanged(topic, topicInfo, partitionInfo, dq, nowMs, cluster)) + continue; + + // Probe the tail directly. If the existing batch has logical room but the + // underlying stream lacks physical capacity, capture the gap and fall through + // to allocate the extension outside the deque lock. Otherwise, attempt the + // append against the existing batch. + ProducerBatch tail = dq.peekLast(); + int gap = tail == null ? 0 : tail.extensionBytesNeeded(timestamp, key, value, headers); + if (gap > 0) { + extensionBytes = gap; + } else { + extensionBytes = 0; // clear any stale value carried from a prior iteration + RecordAppendResult appendResult = tryAppend(timestamp, key, value, headers, callbacks, dq, nowMs); + if (appendResult != null) { + boolean enableSwitch = allBatchesFull(dq); + topicInfo.builtInPartitioner.updatePartitionInfo(partitionInfo, appendResult.appendedBytes, cluster, enableSwitch); + return appendResult; + } + } + } + + if (extensionBytes > 0 && extensionChunks == null) { + // Mid-batch extension: non-blocking attempt only. The producer thread already + // holds chunks for the open batch; blocking here risks deadlock with the Sender + // thread (which frees pool memory by completing batches). If the pool is + // exhausted, we close the existing batch (so it becomes drainable) and fall + // through to the first-record blocking path on the next loop iteration. + try { + extensionChunks = chunkedFree.allocateChunks(extensionBytes, 0L); + } catch (BufferExhaustedException e) { + log.trace("Pool exhausted while extending batch for topic {} partition {}; closing existing batch", + topic, effectivePartition); + synchronized (dq) { + ProducerBatch last = dq.peekLast(); + if (last != null && last.isWritable()) { + last.closeForRecordAppends(); + } + } + extensionBytes = 0; + continue; + } + nowMs = time.milliseconds(); + } else if (extensionBytes == 0 && bufferStream == null) { + // First-record path: block on the pool for enough chunks to fit this record. + // Temporary while compressed data not supported. + // TODO: drop this throw and route through the + // compressed path (which adds the mid-record growth fallback for compressor overshoot). + if (compression.type() != CompressionType.NONE) { + throw new UnsupportedOperationException( + "Compression is not implemented yet for the dynamic buffer.memory allocation strategy"); + } + int size = AbstractRecords.estimateSizeInBytesUpperBound( + RecordBatch.CURRENT_MAGIC_VALUE, compression.type(), key, value, headers); + log.trace("Allocating {} byte chunked buffer ({} byte chunks) for topic {} partition {} with remaining timeout {}ms", + size, chunkedFree.poolableSize(), topic, effectivePartition, maxTimeToBlock); + List initialChunks = chunkedFree.allocateChunks(size, maxTimeToBlock); + nowMs = time.milliseconds(); + bufferStream = new ChunkedByteBufferOutputStream(initialChunks, chunkedFree.poolableSize(), chunkedFree); + } + + synchronized (dq) { + if (partitionChanged(topic, topicInfo, partitionInfo, dq, nowMs, cluster)) + continue; + + if (extensionChunks != null) { + ProducerBatch last = dq.peekLast(); + // The off-lock allocateChunks window allows the original tail batch to be + // drained and a split batch (plain ByteBufferOutputStream) to be addFirst'd + // into an emptied deque — see splitAndReenqueue. The instanceof guard below + // is load-bearing: addBuffers is only defined on ChunkedByteBufferOutputStream; + // the else branch returns the chunks to the pool to avoid leaking them. + if (last != null && last.isWritable() + && last.bufferStream() instanceof ChunkedByteBufferOutputStream) { + ((ChunkedByteBufferOutputStream) last.bufferStream()).addBuffers(extensionChunks); + extensionChunks = null; + extensionBytes = 0; + // Re-probe after attach: between our first-lock probe and now we were off-lock, + // and another appender (or a tail-replacement) may have consumed capacity that + // our extension allocation was sized against. If a gap remains, loop to acquire + // more chunks against the now-current state. Without this re-probe, a + // tryAppend that passes hasRoomFor (writeLimit) but exceeds the stream's + // physical remaining() trips IllegalStateException in + // ChunkedByteBufferOutputStream.advanceToNextChunk. Termination: writeLimit is + // bounded and fixed; once hasRoomFor turns false, extensionBytesNeeded returns 0 + // and we fall through to new-batch creation. Regression test: + // testConcurrentExtensionRaceDoesNotOverflowChunkedStream. + int recheck = last.extensionBytesNeeded(timestamp, key, value, headers); + if (recheck > 0) { + extensionBytes = recheck; + continue; + } + RecordAppendResult retryResult = tryAppend(timestamp, key, value, headers, callbacks, dq, nowMs); + if (retryResult != null) { + boolean enableSwitch = allBatchesFull(dq); + topicInfo.builtInPartitioner.updatePartitionInfo(partitionInfo, retryResult.appendedBytes, cluster, enableSwitch); + return retryResult; + } + // Batch became full via writeLimit while we were off-lock — fall through + // to new-batch creation on the next iteration. + continue; + } + // Tail is gone, closed, or non-chunked (e.g., a split batch). Return chunks to pool. + for (ByteBuffer chunk : extensionChunks) + chunkedFree.deallocate(chunk); + extensionChunks = null; + extensionBytes = 0; + continue; + } + + // bufferStream is non-null here (first-record path). Before creating a new + // batch with it, re-probe the tail: a concurrent appender may have created a + // chunked batch we should extend instead. If so, release the oversized + // bufferStream and let the next iteration take the extension path. + ProducerBatch tail = dq.peekLast(); + if (tail != null && tail.isWritable() + && tail.bufferStream() instanceof ChunkedByteBufferOutputStream + && tail.extensionBytesNeeded(timestamp, key, value, headers) > 0) { + bufferStream.deallocate(); + bufferStream = null; + continue; + } + + int firstRecordSize = AbstractRecords.estimateSizeInBytesUpperBound( + RecordBatch.CURRENT_MAGIC_VALUE, compression.type(), key, value, headers); + final ChunkedByteBufferOutputStream batchStream = bufferStream; + RecordAppendResult appendResult = appendNewBatch(topic, effectivePartition, dq, timestamp, key, value, headers, callbacks, + () -> chunkedRecordsBuilder(batchStream, firstRecordSize), nowMs); + if (appendResult.newBatchCreated) + bufferStream = null; + boolean enableSwitch = allBatchesFull(dq); + topicInfo.builtInPartitioner.updatePartitionInfo(partitionInfo, appendResult.appendedBytes, cluster, enableSwitch); + return appendResult; + } + } + } finally { + if (bufferStream != null) + bufferStream.deallocate(); + if (extensionChunks != null) { + for (ByteBuffer chunk : extensionChunks) + chunkedFree.deallocate(chunk); + } + appendsInProgress.decrementAndGet(); + } + } + + /** + * Build a {@link MemoryRecordsBuilder} backed by the chunked stream. + * {@code writeLimit} is set to {@code max(batchSize, firstRecordSize)} — the same logical + * cap the legacy path uses. Physical chunk capacity grows on demand via + * {@link ChunkedByteBufferOutputStream#addBuffers(List)}; {@code writeLimit} bounds when a + * batch is considered logically full and a new one must be started. + */ + private MemoryRecordsBuilder chunkedRecordsBuilder(ChunkedByteBufferOutputStream bufferStream, + int firstRecordSize) { + int writeLimit = Math.max(batchSize, firstRecordSize); + return new MemoryRecordsBuilder(bufferStream, RecordBatch.CURRENT_MAGIC_VALUE, compression, + TimestampType.CREATE_TIME, 0L, RecordBatch.NO_TIMESTAMP, RecordBatch.NO_PRODUCER_ID, + RecordBatch.NO_PRODUCER_EPOCH, RecordBatch.NO_SEQUENCE, false, false, + RecordBatch.NO_PARTITION_LEADER_EPOCH, writeLimit); + } + + /** + * Chunked-aware deallocation for {@link ProducerBatch}. + *

+ * Intercepts the {@link #isInflight()} branch of the parent's + * {@link RecordAccumulator#deallocate(ProducerBatch)} when the batch is backed by a + * {@link ChunkedByteBufferOutputStream}. The parent's defensive path credits only + * {@code initialCapacity} (one {@code chunkSize}) back to the pool and throws — that's the + * KAFKA-19012-aware safety net for the legacy single-buffer path, where the in-flight + * buffer is the pooled buffer. A chunked batch holds K chunks but the in-flight + * network bytes live in a separate flattened heap buffer (see + * {@link ChunkedByteBufferOutputStream#buffer()}), so it is safe to return all K chunks + * to the pool here. The {@code IllegalStateException} is still propagated to preserve the + * contract that deallocating an inflight batch is unexpected upstream. + */ + @Override + public void deallocate(ProducerBatch batch) { + if (!batch.isSplitBatch() + && !batch.isBufferDeallocated() + && batch.isInflight() + && batch.bufferStream() instanceof ChunkedByteBufferOutputStream) { + batch.markBufferDeallocated(); + deallocateBatchBuffer(batch); + throw new IllegalStateException("Attempting to deallocate a batch that is inflight. Batch is " + batch); + } + super.deallocate(batch); + } + + /** + * Route chunked batch deallocation through the stream rather than the legacy + * single-buffer pool path. The flattened buffer returned by {@code batch.buffer()} is a + * fresh heap allocation and must NOT be passed to the pool — instead, the stream owns + * the chunk-sized buffers and returns them to the pool. + */ + @Override + protected void deallocateBatchBuffer(ProducerBatch batch) { + if (batch.bufferStream() instanceof ChunkedByteBufferOutputStream) { + ((ChunkedByteBufferOutputStream) batch.bufferStream()).deallocate(chunkedFree); + } else { + // Split batches and any non-chunked batches go through the default deallocation. + super.deallocateBatchBuffer(batch); + } + } +} diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ProducerBatch.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ProducerBatch.java index 14d0d6ef6334d..a7e47583d310b 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ProducerBatch.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ProducerBatch.java @@ -31,6 +31,7 @@ import org.apache.kafka.common.record.internal.Record; import org.apache.kafka.common.record.internal.RecordBatch; import org.apache.kafka.common.requests.ProduceResponse; +import org.apache.kafka.common.utils.ByteBufferOutputStream; import org.apache.kafka.common.utils.ProducerIdAndEpoch; import org.apache.kafka.common.utils.Time; @@ -139,6 +140,38 @@ boolean hasLeaderChangedForTheOngoingRetry() { } + /** + * Bytes of physical buffer this batch needs before {@code tryAppend} could accept the given + * record. Returns 0 when the record fits (either logically full, or stream has room). + * Positive when {@code hasRoomFor} allows the record but the underlying stream lacks + * physical capacity — used by the dynamic strategy to allocate exactly the missing bytes + * before retrying. + *

+ * For batches with the default {@link ByteBufferOutputStream} (legacy path), the stream's + * {@code remaining()} is large enough that this almost always returns 0. + */ + int extensionBytesNeeded(long timestamp, byte[] key, byte[] value, Header[] headers) { + if (recordCount == 0) + return 0; + if (!recordsBuilder.hasRoomFor(timestamp, key, value, headers)) + return 0; + // Upper-bound estimate is fine: over-counting by header bytes only over-allocates by a + // small fraction of one chunk. Under-estimation would be unsafe. + int recordSize = AbstractRecords.estimateSizeInBytesUpperBound( + magic(), recordsBuilder.compression().type(), key, value, headers); + int gap = recordSize - recordsBuilder.bufferStream().remaining(); + return Math.max(0, gap); + } + + /** + * The batch's underlying output stream. Exposed for the dynamic strategy to attach + * extension chunks and route deallocation through the chunked stream rather than the + * standard {@code BufferPool.deallocate(ByteBuffer, int)} path. + */ + ByteBufferOutputStream bufferStream() { + return recordsBuilder.bufferStream(); + } + /** * Append the record to the current record set and return the relative offset within that record set * diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/RecordAccumulator.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/RecordAccumulator.java index b1eced17e4f25..29ddd07a489bc 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/RecordAccumulator.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/RecordAccumulator.java @@ -58,6 +58,7 @@ import java.util.Set; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Supplier; /** * This class acts as a queue that accumulates records into {@link MemoryRecords} @@ -68,23 +69,23 @@ */ public class RecordAccumulator { - private final LogContext logContext; - private final Logger log; - private volatile boolean closed; + protected final LogContext logContext; + protected final Logger log; + protected volatile boolean closed; private final AtomicInteger flushesInProgress; - private final AtomicInteger appendsInProgress; - private final int batchSize; - private final Compression compression; + protected final AtomicInteger appendsInProgress; + protected final int batchSize; + protected final Compression compression; private final int lingerMs; private final ExponentialBackoff retryBackoff; private final int deliveryTimeoutMs; private final long partitionAvailabilityTimeoutMs; // latency threshold for marking partition temporary unavailable - private final boolean partitionerRackAware; - private final String rack; + protected final boolean partitionerRackAware; + protected final String rack; private final boolean enableAdaptivePartitioning; private final BufferPool free; - private final Time time; - private final ConcurrentMap topicInfoMap = new CopyOnWriteMap<>(); + protected final Time time; + protected final ConcurrentMap topicInfoMap = new CopyOnWriteMap<>(); private final ConcurrentMap nodeStats = new CopyOnWriteMap<>(); private final IncompleteBatches incomplete; // The following variables are only accessed by the sender thread, so we don't need to protect them. @@ -217,7 +218,7 @@ private void registerMetrics(Metrics metrics, String metricGrpName) { (config, now) -> free.availableMemory()); } - private void setPartition(AppendCallbacks callbacks, int partition) { + protected void setPartition(AppendCallbacks callbacks, int partition) { if (callbacks != null) callbacks.setPartition(partition); } @@ -234,7 +235,7 @@ private void setPartition(AppendCallbacks callbacks, int partition) { * @return 'true' if partition changed and we need to get new partition info and retry, * 'false' otherwise */ - private boolean partitionChanged(String topic, + protected boolean partitionChanged(String topic, TopicInfo topicInfo, BuiltInPartitioner.StickyPartitionInfo partitionInfo, Deque deque, long nowMs, @@ -347,7 +348,10 @@ public RecordAppendResult append(String topic, if (partitionChanged(topic, topicInfo, partitionInfo, dq, nowMs, cluster)) continue; - RecordAppendResult appendResult = appendNewBatch(topic, effectivePartition, dq, timestamp, key, value, headers, callbacks, buffer, nowMs); + final ByteBuffer batchBuffer = buffer; + RecordAppendResult appendResult = appendNewBatch(topic, effectivePartition, dq, timestamp, key, value, headers, callbacks, + () -> MemoryRecords.builder(batchBuffer, RecordBatch.CURRENT_MAGIC_VALUE, compression, TimestampType.CREATE_TIME, 0L), + nowMs); // Set buffer to null, so that deallocate doesn't return it back to free pool, since it's used in the batch. if (appendResult.newBatchCreated) buffer = null; @@ -374,10 +378,13 @@ public RecordAppendResult append(String topic, * @param value The value for the record * @param headers the Headers for the record * @param callbacks The callbacks to execute - * @param buffer The buffer for the new batch + * @param recordsBuilderSupplier Supplies the {@link MemoryRecordsBuilder} for the new + * batch. Invoked lazily, only when this method is committed to creating a new + * batch (i.e. after the in-lock concurrent-append race check has lost). The + * chunked subclass passes a supplier that produces a stream-backed builder. * @param nowMs The current time, in milliseconds */ - private RecordAppendResult appendNewBatch(String topic, + protected RecordAppendResult appendNewBatch(String topic, int partition, Deque dq, long timestamp, @@ -385,7 +392,7 @@ private RecordAppendResult appendNewBatch(String topic, byte[] value, Header[] headers, AppendCallbacks callbacks, - ByteBuffer buffer, + Supplier recordsBuilderSupplier, long nowMs) { assert partition != RecordMetadata.UNKNOWN_PARTITION; @@ -395,7 +402,7 @@ private RecordAppendResult appendNewBatch(String topic, return appendResult; } - MemoryRecordsBuilder recordsBuilder = recordsBuilder(buffer); + MemoryRecordsBuilder recordsBuilder = recordsBuilderSupplier.get(); ProducerBatch batch = new ProducerBatch(new TopicPartition(topic, partition), recordsBuilder, nowMs); FutureRecordMetadata future = Objects.requireNonNull(batch.tryAppend(timestamp, key, value, headers, callbacks, nowMs)); @@ -406,14 +413,10 @@ private RecordAppendResult appendNewBatch(String topic, return new RecordAppendResult(future, dq.size() > 1 || batch.isFull(), true, batch.estimatedSizeInBytes()); } - private MemoryRecordsBuilder recordsBuilder(ByteBuffer buffer) { - return MemoryRecords.builder(buffer, RecordBatch.CURRENT_MAGIC_VALUE, compression, TimestampType.CREATE_TIME, 0L); - } - /** * Check if all batches in the queue are full. */ - private boolean allBatchesFull(Deque deque) { + protected boolean allBatchesFull(Deque deque) { // Only the last batch may be incomplete, so we just check that. ProducerBatch last = deque.peekLast(); return last == null || last.isFull(); @@ -427,7 +430,7 @@ private boolean allBatchesFull(Deque deque) { * and memory records built) in one of the following cases (whichever comes first): right before send, * if it is expired, or when the producer is closed. */ - private RecordAppendResult tryAppend(long timestamp, byte[] key, byte[] value, Header[] headers, + protected RecordAppendResult tryAppend(long timestamp, byte[] key, byte[] value, Header[] headers, Callback callback, Deque deque, long nowMs) { if (closed) throw new KafkaException("Producer closed while send in progress"); @@ -1058,11 +1061,19 @@ public void deallocate(ProducerBatch batch) { free.deallocate(ByteBuffer.allocate(batch.initialCapacity())); throw new IllegalStateException("Attempting to deallocate a batch that is inflight. Batch is " + batch); } - free.deallocate(batch.buffer(), batch.initialCapacity()); + deallocateBatchBuffer(batch); } } } + /** + * Return the batch's underlying buffer to the pool. + * This default implementation returns the buffer at its initial capacity (single buffer). + */ + protected void deallocateBatchBuffer(ProducerBatch batch) { + free.deallocate(batch.buffer(), batch.initialCapacity()); + } + /** * Remove from the incomplete list but do not free memory yet */ @@ -1298,7 +1309,7 @@ public ReadyCheckResult(Set readyNodes, long nextReadyCheckDelayMs, Set> batches = new CopyOnWriteMap<>(); public final BuiltInPartitioner builtInPartitioner; diff --git a/clients/src/main/java/org/apache/kafka/common/record/internal/MemoryRecordsBuilder.java b/clients/src/main/java/org/apache/kafka/common/record/internal/MemoryRecordsBuilder.java index 3fd89deaba135..47a955cfcf18e 100644 --- a/clients/src/main/java/org/apache/kafka/common/record/internal/MemoryRecordsBuilder.java +++ b/clients/src/main/java/org/apache/kafka/common/record/internal/MemoryRecordsBuilder.java @@ -211,6 +211,15 @@ public int initialCapacity() { return bufferStream.initialCapacity(); } + /** + * The underlying output stream. Exposed so the dynamic-strategy callers can attach + * pre-allocated chunks to a {@code ChunkedByteBufferOutputStream} and route deallocation + * polymorphically. + */ + public ByteBufferOutputStream bufferStream() { + return bufferStream; + } + public double compressionRatio() { return actualCompressionRatio; } diff --git a/gradle/spotbugs-exclude.xml b/gradle/spotbugs-exclude.xml index 6d18691973a45..1a56dc05037c9 100644 --- a/gradle/spotbugs-exclude.xml +++ b/gradle/spotbugs-exclude.xml @@ -127,6 +127,17 @@ For a detailed description of spotbugs bug categories, see https://spotbugs.read + + + + + + @@ -546,6 +557,7 @@ For a detailed description of spotbugs bug categories, see https://spotbugs.read --> + From 203a410dac3ec480aae2a483b26852d378df8513 Mon Sep 17 00:00:00 2001 From: Lianet Magrans Date: Fri, 5 Jun 2026 12:09:30 -0400 Subject: [PATCH 02/11] tests --- .../internals/ChunkedBufferPoolTest.java | 422 ++++++++++++++++++ .../ChunkedByteBufferOutputStreamTest.java | 171 +++++++ .../ChunkedRecordAccumulatorTest.java | 350 +++++++++++++++ 3 files changed, 943 insertions(+) create mode 100644 clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPoolTest.java create mode 100644 clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStreamTest.java create mode 100644 clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java diff --git a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPoolTest.java b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPoolTest.java new file mode 100644 index 0000000000000..e0de8ae19e4dc --- /dev/null +++ b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPoolTest.java @@ -0,0 +1,422 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.clients.producer.internals; + +import org.apache.kafka.clients.producer.BufferExhaustedException; +import org.apache.kafka.common.metrics.Metrics; +import org.apache.kafka.common.utils.MockTime; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; + +import org.apache.kafka.common.KafkaException; + +import java.nio.ByteBuffer; +import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class ChunkedBufferPoolTest { + + private final MockTime time = new MockTime(); + private final Metrics metrics = new Metrics(time); + private final String metricGroup = "producer-metrics"; + + @AfterEach + public void teardown() { + metrics.close(); + } + + private ChunkedBufferPool pool(long totalMemory, int chunkSize) { + return new ChunkedBufferPool(totalMemory, chunkSize, metrics, time, metricGroup); + } + + /** Single-chunk request returns a list of one buffer at the chunk size. */ + @Test + public void testAllocateOneChunk() throws Exception { + int chunkSize = 64; + ChunkedBufferPool p = pool(1024, chunkSize); + List chunks = p.allocateChunks(chunkSize, 100); + assertEquals(1, chunks.size()); + assertEquals(chunkSize, chunks.get(0).capacity()); + } + + /** Total size that's not a multiple of chunk size rounds up; returned chunks cover it. */ + @Test + public void testAllocateRoundsUpToChunkBoundary() throws Exception { + int chunkSize = 64; + ChunkedBufferPool p = pool(1024, chunkSize); + // 65 bytes requested → 2 chunks (128 bytes total). + List chunks = p.allocateChunks(65, 100); + assertEquals(2, chunks.size()); + for (ByteBuffer chunk : chunks) + assertEquals(chunkSize, chunk.capacity()); + } + + /** Multi-chunk request returns ceil(totalSize / chunkSize) chunks. */ + @Test + public void testAllocateMultipleChunks() throws Exception { + int chunkSize = 64; + ChunkedBufferPool p = pool(1024, chunkSize); + List chunks = p.allocateChunks(4 * chunkSize, 100); + assertEquals(4, chunks.size()); + } + + /** Pool memory accounting: after allocation, the unallocated portion shrinks by the request. */ + @Test + public void testAvailableMemoryAfterAllocation() throws Exception { + int chunkSize = 64; + long total = 256; + ChunkedBufferPool p = pool(total, chunkSize); + p.allocateChunks(3 * chunkSize, 100); + assertEquals(total - 3L * chunkSize, p.availableMemory()); + } + + /** Returning chunks via deallocate restores pool memory. */ + @Test + public void testDeallocationRestoresMemory() throws Exception { + int chunkSize = 64; + long total = 256; + ChunkedBufferPool p = pool(total, chunkSize); + List chunks = p.allocateChunks(2 * chunkSize, 100); + for (ByteBuffer chunk : chunks) + p.deallocate(chunk); + assertEquals(total, p.availableMemory()); + } + + /** Requesting more than total memory throws IllegalArgumentException. */ + @Test + public void testRejectsRequestExceedingTotalMemory() { + ChunkedBufferPool p = pool(128, 64); + assertThrows(IllegalArgumentException.class, () -> p.allocateChunks(129, 100)); + } + + /** Non-positive totalSize is rejected. */ + @Test + public void testRejectsNonPositiveRequest() { + ChunkedBufferPool p = pool(128, 64); + assertThrows(IllegalArgumentException.class, () -> p.allocateChunks(0, 100)); + assertThrows(IllegalArgumentException.class, () -> p.allocateChunks(-1, 100)); + } + + /** + * When a multi-chunk request can't be fully satisfied within the deadline, chunks already + * acquired during the call must be returned to the pool — the caller must not leak memory. + */ + @Test + public void testRollbackOnPartialFailure() throws Exception { + int chunkSize = 64; + long total = 2 * chunkSize; // only 2 chunks worth of memory + ChunkedBufferPool p = pool(total, chunkSize); + // Reserve one chunk so the pool has only 1 left. + ByteBuffer held = p.allocate(chunkSize, 100); + + // Request 2 chunks with a zero deadline — first chunk fits, second can't, must throw. + assertThrows(BufferExhaustedException.class, () -> p.allocateChunks(2 * chunkSize, 0)); + + // Available memory must reflect only the chunk we deliberately hold; the request's + // first-chunk acquisition was rolled back. + assertEquals(total - chunkSize, p.availableMemory()); + + p.deallocate(held); + assertEquals(total, p.availableMemory()); + } + + /** + * No partial chunk holds during the wait. While a multi-chunk request blocks, the pool's + * {@code availableMemory()} must reflect bytes the waiter has not yet "earned" — i.e., the + * reservation is via the accumulator and not via chunks pulled out of the pool's accounting. + * Distinguishes the atomic implementation from the older loop-with-rollback (which held + * partials off the accounting between iterations). + */ + @Test + public void testNoPartialHoldsDuringWait() throws Exception { + int chunkSize = 64; + long total = 4L * chunkSize; + ChunkedBufferPool p = pool(total, chunkSize); + // Drain the pool down to 1 chunk's worth so a 2-chunk request can't be satisfied immediately. + ByteBuffer h1 = p.allocate(chunkSize, 100); + ByteBuffer h2 = p.allocate(chunkSize, 100); + ByteBuffer h3 = p.allocate(chunkSize, 100); + long available = p.availableMemory(); + assertEquals(chunkSize, available, "pool should have exactly 1 chunk free"); + + // Background thread requests 2 chunks; will block on the 2nd. + AtomicReference err = new AtomicReference<>(); + Thread t = new Thread(() -> { + try { + p.allocateChunks(2 * chunkSize, 5_000); + } catch (Throwable th) { + err.set(th); + } + }, "chunked-waiter"); + t.start(); + // Wait until the thread has joined the waiters queue. + long deadlineMs = System.currentTimeMillis() + 2_000; + while (p.queued() == 0 && System.currentTimeMillis() < deadlineMs) + Thread.sleep(5); + assertEquals(1, p.queued(), "waiter should be parked on the pool's queue"); + + // While the waiter is parked, the pool's available memory must still report the + // 1 chunk's worth — the waiter has NOT consumed any of it (no partial hold). + assertEquals(chunkSize, p.availableMemory(), + "pool memory must reflect no partial holds during the atomic wait"); + + // Free enough chunks for the waiter to complete. + p.deallocate(h1); + p.deallocate(h2); + t.join(5_000); + assertNull(err.get(), "waiter unexpectedly threw: " + err.get()); + p.deallocate(h3); + } + + /** + * FIFO fairness across the K-chunk request. A multi-chunk request that joins the wait queue + * before a single-chunk request must complete first when memory becomes available — the + * K-chunk request occupies a single waiter slot, not K of them. + */ + @Test + public void testFifoFairnessAcrossChunkedAndSingleChunkRequests() throws Exception { + int chunkSize = 64; + long total = 4L * chunkSize; + ChunkedBufferPool p = pool(total, chunkSize); + // Drain the pool entirely so any new request must wait. + ByteBuffer h1 = p.allocate(chunkSize, 100); + ByteBuffer h2 = p.allocate(chunkSize, 100); + ByteBuffer h3 = p.allocate(chunkSize, 100); + ByteBuffer h4 = p.allocate(chunkSize, 100); + assertEquals(0, p.availableMemory()); + + // T_multi enters first, requesting 2 chunks; T_single enters second, requesting 1 chunk. + AtomicReference multiCompletionMs = new AtomicReference<>(); + AtomicReference singleCompletionMs = new AtomicReference<>(); + CountDownLatch multiStarted = new CountDownLatch(1); + Thread tMulti = new Thread(() -> { + try { + multiStarted.countDown(); + List got = p.allocateChunks(2 * chunkSize, 10_000); + multiCompletionMs.set(System.nanoTime()); + for (ByteBuffer b : got) p.deallocate(b); + } catch (Throwable th) { + multiCompletionMs.set(-1L); + } + }, "multi"); + Thread tSingle = new Thread(() -> { + try { + ByteBuffer got = p.allocate(chunkSize, 10_000); + singleCompletionMs.set(System.nanoTime()); + p.deallocate(got); + } catch (Throwable th) { + singleCompletionMs.set(-1L); + } + }, "single"); + + tMulti.start(); + assertTrue(multiStarted.await(2, TimeUnit.SECONDS)); + // Wait for tMulti to be parked before starting tSingle, so the FIFO order is deterministic. + long deadlineMs = System.currentTimeMillis() + 2_000; + while (p.queued() == 0 && System.currentTimeMillis() < deadlineMs) + Thread.sleep(5); + assertEquals(1, p.queued(), "multi-chunk waiter should be the only one queued"); + tSingle.start(); + // Wait for tSingle to also be parked. + deadlineMs = System.currentTimeMillis() + 2_000; + while (p.queued() < 2 && System.currentTimeMillis() < deadlineMs) + Thread.sleep(5); + assertEquals(2, p.queued(), "single-chunk waiter joined after the multi-chunk one"); + + // Now free chunks one at a time, allowing the multi-chunk request to accumulate. + p.deallocate(h1); + Thread.sleep(50); + p.deallocate(h2); + // Both freed — the FIFO leader (multi) should claim both before the single-chunk waiter + // gets any. The multi completes first. + tMulti.join(5_000); + // Now free another chunk for the single-chunk waiter. + p.deallocate(h3); + tSingle.join(5_000); + + assertNotNull(multiCompletionMs.get(), "multi did not complete"); + assertNotNull(singleCompletionMs.get(), "single did not complete"); + assertTrue(multiCompletionMs.get() != -1L && singleCompletionMs.get() != -1L, + "both threads must complete without exception"); + assertTrue(multiCompletionMs.get() <= singleCompletionMs.get(), + "FIFO violated: single (joined later) completed at " + + singleCompletionMs.get() + " before multi at " + multiCompletionMs.get()); + p.deallocate(h4); + } + + /** + * Slow-path rollback must not double-refund pooled chunks. When the waiter polls some + * chunks from the free list, then times out on a subsequent iteration, the inner finally + * refunds {@code accumulated} bytes to {@code nonPooledAvailableMemory} and the outer + * catch re-inserts the polled chunks into {@code free}. If {@code accumulated} includes + * bytes that came from polled chunks, the same memory is refunded twice — the pool + * over-reports {@code availableMemory()} and subsequent allocations can exceed the + * configured {@code buffer.memory} limit. + */ + @Test + public void testSlowPathDoesNotDoubleRefundPolledChunksOnTimeout() throws Exception { + int chunkSize = 64; + long total = 3L * chunkSize; + ChunkedBufferPool p = pool(total, chunkSize); + + // Drain so the next allocateChunks must wait. + ByteBuffer h1 = p.allocate(chunkSize, 100); + ByteBuffer h2 = p.allocate(chunkSize, 100); + ByteBuffer h3 = p.allocate(chunkSize, 100); + assertEquals(0, p.availableMemory()); + + AtomicReference err = new AtomicReference<>(); + Thread t = new Thread(() -> { + try { + // Asks for 3 chunks with a finite deadline. After we deallocate 2 chunks below + // the waiter will poll them, find itself still 1 short, and time out on the + // next await iteration — with 2 chunks held in `pooled` and 2*chunkSize in + // `accumulated`. + p.allocateChunks(3 * chunkSize, 500); + } catch (Throwable th) { + err.set(th); + } + }, "partial-holder"); + t.start(); + + long deadline = System.currentTimeMillis() + 2_000; + while (p.queued() == 0 && System.currentTimeMillis() < deadline) + Thread.sleep(5); + assertEquals(1, p.queued()); + + // Hand the waiter 2 chunks — it polls them into `pooled` then awaits again for the 3rd. + p.deallocate(h1); + p.deallocate(h2); + + // Give the waiter a moment to wake, poll, and re-enter await before the timeout fires. + Thread.sleep(100); + + t.join(5_000); + assertTrue(err.get() instanceof BufferExhaustedException, + "expected BufferExhaustedException, got " + err.get()); + + // After the throw, exactly 2 chunkSizes of physical memory should be back in the pool + // (h1 + h2). h3 is still held outside the pool. The bug double-refunds: it credits + // 2*chunkSize to `nonPooledAvailableMemory` (inner finally) AND re-inserts h1, h2 into + // `free` (outer catch), so availableMemory() reports 4*chunkSize. + assertEquals(2L * chunkSize, p.availableMemory(), + "pool over-reports availableMemory due to double-refund of pooled chunks on rollback"); + + p.deallocate(h3); + } + + /** + * Slow-path success must not corrupt the pool's accounting. When the waiter polls chunks + * from the free list to satisfy its request, the finally block runs with the inner loop's + * "success" reset still in effect — it must not refund or decrement {@code nonPool}. + * Pool chunks transferred to the caller via {@code pooled} are accounted for by removing + * them from {@code free}, not by deducting their bytes from {@code nonPool}. + */ + @Test + public void testSlowPathSuccessDoesNotCorruptAccounting() throws Exception { + int chunkSize = 64; + long total = 3L * chunkSize; + ChunkedBufferPool p = pool(total, chunkSize); + + // Drain so a 2-chunk request must wait. + ByteBuffer h1 = p.allocate(chunkSize, 100); + ByteBuffer h2 = p.allocate(chunkSize, 100); + ByteBuffer h3 = p.allocate(chunkSize, 100); + assertEquals(0, p.availableMemory()); + + AtomicReference err = new AtomicReference<>(); + AtomicReference> got = new AtomicReference<>(); + Thread t = new Thread(() -> { + try { + got.set(p.allocateChunks(2 * chunkSize, 10_000)); + } catch (Throwable th) { + err.set(th); + } + }, "slow-success"); + t.start(); + + long deadline = System.currentTimeMillis() + 2_000; + while (p.queued() == 0 && System.currentTimeMillis() < deadline) + Thread.sleep(5); + assertEquals(1, p.queued()); + + // Deallocate 2 chunks → waiter wakes, polls both, accumulated reaches memoryRequired, + // exits normally. The success path's finally must NOT refund or decrement nonPool. + p.deallocate(h1); + p.deallocate(h2); + t.join(5_000); + assertNull(err.get(), "waiter unexpectedly threw: " + err.get()); + + // After success: waiter owns the 2 chunks (returned via `got`), the test still holds h3. + // The pool has lent out everything it had — availableMemory must be exactly 0. + assertEquals(0, p.availableMemory(), + "slow-path success corrupted accounting (likely a phantom nonPool decrement in the finally)"); + + // Returning all the held buffers must fully restore the pool. + for (ByteBuffer chunk : got.get()) + p.deallocate(chunk); + p.deallocate(h3); + assertEquals(total, p.availableMemory(), + "pool not fully restored after returning all chunks"); + } + + /** + * Closing the pool while a multi-chunk request is parked must surface a + * {@link KafkaException} and refund all reserved bytes. + */ + @Test + public void testCloseDuringAtomicWait() throws Exception { + int chunkSize = 64; + long total = 2L * chunkSize; + ChunkedBufferPool p = pool(total, chunkSize); + // Drain so the next request must wait. + ByteBuffer h1 = p.allocate(chunkSize, 100); + ByteBuffer h2 = p.allocate(chunkSize, 100); + assertEquals(0, p.availableMemory()); + + AtomicReference err = new AtomicReference<>(); + Thread t = new Thread(() -> { + try { + p.allocateChunks(2 * chunkSize, 10_000); + } catch (Throwable th) { + err.set(th); + } + }, "close-during-wait"); + t.start(); + long deadlineMs = System.currentTimeMillis() + 2_000; + while (p.queued() == 0 && System.currentTimeMillis() < deadlineMs) + Thread.sleep(5); + assertEquals(1, p.queued()); + + // Close the pool: signals all waiters; the waiter must throw KafkaException. + p.close(); + t.join(5_000); + assertTrue(err.get() instanceof KafkaException, + "expected KafkaException, got " + err.get()); + p.deallocate(h1); + p.deallocate(h2); + } +} diff --git a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStreamTest.java b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStreamTest.java new file mode 100644 index 0000000000000..569c3bbcfcbdc --- /dev/null +++ b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStreamTest.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.clients.producer.internals; + +import org.apache.kafka.common.metrics.Metrics; +import org.apache.kafka.common.utils.MockTime; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; + +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class ChunkedByteBufferOutputStreamTest { + + private final MockTime time = new MockTime(); + private final Metrics metrics = new Metrics(time); + private final String metricGroup = "test"; + + @AfterEach + public void teardown() { + metrics.close(); + } + + private ChunkedBufferPool pool(long total, int chunkSize) { + return new ChunkedBufferPool(total, chunkSize, metrics, time, metricGroup); + } + + private List chunks(ChunkedBufferPool pool, int chunkSize, int count) throws InterruptedException { + return pool.allocateChunks(chunkSize * count, 100); + } + + /** Writes that fit in the initial chunk: stream produces the bytes back via buffer(). */ + @Test + public void testSingleChunkWriteRoundtrip() throws Exception { + int chunkSize = 16; + ChunkedBufferPool p = pool(64, chunkSize); + ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 1), chunkSize, p); + + byte[] payload = new byte[]{1, 2, 3, 4, 5}; + stream.write(payload, 0, payload.length); + + ByteBuffer flat = stream.buffer(); + flat.flip(); + byte[] out = new byte[flat.remaining()]; + flat.get(out); + assertArrayEquals(payload, out); + + stream.deallocate(); + } + + /** Writes that span multiple pre-supplied chunks land in subsequent chunks in order. */ + @Test + public void testWriteSpansChunks() throws Exception { + int chunkSize = 8; + ChunkedBufferPool p = pool(64, chunkSize); + ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 3), chunkSize, p); + + byte[] payload = new byte[20]; + for (int i = 0; i < payload.length; i++) payload[i] = (byte) i; + stream.write(payload, 0, payload.length); + + ByteBuffer flat = stream.buffer(); + flat.flip(); + byte[] out = new byte[flat.remaining()]; + flat.get(out); + assertArrayEquals(payload, out); + + stream.deallocate(); + } + + /** remaining() sums current + queued chunk capacities and shrinks as writes consume bytes. */ + @Test + public void testRemainingSumsChunks() throws Exception { + int chunkSize = 8; + ChunkedBufferPool p = pool(64, chunkSize); + ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 2), chunkSize, p); + + assertEquals(2 * chunkSize, stream.remaining()); + stream.write(new byte[3], 0, 3); + assertEquals(2 * chunkSize - 3, stream.remaining()); + stream.write(new byte[chunkSize], 0, chunkSize); // crosses into chunk 2 + assertEquals(chunkSize - 3, stream.remaining()); + + stream.deallocate(); + } + + /** addBuffers extends capacity so a write that would have overflowed now fits. */ + @Test + public void testAddBuffersExtendsStream() throws Exception { + int chunkSize = 8; + ChunkedBufferPool p = pool(64, chunkSize); + ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 1), chunkSize, p); + + // Fill the initial chunk. + stream.write(new byte[chunkSize], 0, chunkSize); + assertEquals(0, stream.remaining()); + + // Extend and write more — must land in the new chunk. + stream.addBuffers(Collections.singletonList(p.allocate(chunkSize, 100))); + assertEquals(chunkSize, stream.remaining()); + + byte[] more = new byte[]{9, 9, 9}; + stream.write(more, 0, more.length); + assertEquals(chunkSize - 3, stream.remaining()); + + stream.deallocate(); + } + + /** Writing past all pre-supplied (and added) chunks throws — uncompressed-only contract. */ + @Test + public void testOverflowThrows() throws Exception { + int chunkSize = 4; + ChunkedBufferPool p = pool(64, chunkSize); + ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 1), chunkSize, p); + + stream.write(new byte[chunkSize], 0, chunkSize); + assertThrows(IllegalStateException.class, () -> stream.write((byte) 1)); + + stream.deallocate(); + } + + /** position(int) at construction walks across pre-supplied chunks. */ + @Test + public void testPositionWalksAcrossChunks() throws Exception { + int chunkSize = 4; + ChunkedBufferPool p = pool(32, chunkSize); + ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 3), chunkSize, p); + + stream.position(6); // straddles chunk 0 (4 bytes) and chunk 1 (2 bytes) + assertEquals(6, stream.position()); + assertEquals(6, stream.remaining()); + + stream.deallocate(); + } + + /** deallocate returns all chunks to the pool. */ + @Test + public void testDeallocateReturnsAllChunks() throws Exception { + int chunkSize = 8; + long total = 64; + ChunkedBufferPool p = pool(total, chunkSize); + List initial = chunks(p, chunkSize, 3); + ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(initial, chunkSize, p); + // Also attach one extra chunk so deallocate must handle the added-buffer case too. + stream.addBuffers(Collections.singletonList(p.allocate(chunkSize, 100))); + assertEquals(total - 4L * chunkSize, p.availableMemory()); + + stream.deallocate(); + assertEquals(total, p.availableMemory()); + } +} diff --git a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java new file mode 100644 index 0000000000000..b191aadb4645b --- /dev/null +++ b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java @@ -0,0 +1,350 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.clients.producer.internals; + +import org.apache.kafka.clients.MetadataSnapshot; +import org.apache.kafka.common.Cluster; +import org.apache.kafka.common.Node; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.compress.Compression; +import org.apache.kafka.common.metrics.Metrics; +import org.apache.kafka.common.protocol.Errors; +import org.apache.kafka.common.record.internal.MemoryRecords; +import org.apache.kafka.common.record.internal.Record; +import org.apache.kafka.common.record.internal.RecordBatch; +import org.apache.kafka.common.requests.MetadataResponse.PartitionMetadata; +import org.apache.kafka.common.utils.MockTime; +import org.apache.kafka.common.utils.internals.LogContext; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Deque; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class ChunkedRecordAccumulatorTest { + + private final String topic = "test"; + private final int partition1 = 0; + private final Node node1 = new Node(0, "localhost", 1111); + private final TopicPartition tp1 = new TopicPartition(topic, partition1); + + private final PartitionMetadata partMetadata1 = + new PartitionMetadata(Errors.NONE, tp1, Optional.of(node1.id()), Optional.empty(), null, null, null); + private final List partMetadatas = new ArrayList<>(Arrays.asList(partMetadata1)); + private final Map nodes = + Stream.of(node1).collect(Collectors.toMap(Node::id, java.util.function.Function.identity())); + private final MetadataSnapshot metadataCache = new MetadataSnapshot(null, nodes, partMetadatas, + Collections.emptySet(), Collections.emptySet(), Collections.emptySet(), null, Collections.emptyMap()); + private final Cluster cluster = metadataCache.cluster(); + + private final MockTime time = new MockTime(); + private final Metrics metrics = new Metrics(time); + private final LogContext logContext = new LogContext(); + private final long maxBlockTimeMs = 1000; + private final byte[] key = "k".getBytes(); + + @AfterEach + public void teardown() { + metrics.close(); + } + + private ChunkedRecordAccumulator newAccumulator(int batchSize, int chunkSize, long totalMemory, Compression compression) { + ChunkedBufferPool pool = new ChunkedBufferPool(totalMemory, chunkSize, metrics, time, "producer-metrics"); + return new ChunkedRecordAccumulator(logContext, batchSize, compression, + /* lingerMs */ 0, /* retryBackoffMs */ 0L, /* retryBackoffMaxMs */ 0L, + /* deliveryTimeoutMs */ 3200, metrics, "producer-metrics", time, + /* transactionManager */ null, pool); + } + + /** First record creates a batch backed by chunked allocation. */ + @Test + public void testFirstRecordCreatesChunkedBatch() throws Exception { + int chunkSize = 256; + ChunkedRecordAccumulator accum = newAccumulator(1024, chunkSize, 16L * chunkSize, Compression.NONE); + + byte[] value = new byte[64]; + accum.append(topic, partition1, 0L, key, value, Record.EMPTY_HEADERS, null, + maxBlockTimeMs, time.milliseconds(), cluster); + + Deque dq = batchesFor(accum, tp1); + assertEquals(1, dq.size()); + ProducerBatch batch = dq.peekFirst(); + assertNotNull(batch); + assertEquals(1, batch.recordCount); + accum.close(); + } + + /** + * A small follow-up record fits in the existing batch's pre-allocated chunks — no extension + * needed. + */ + @Test + public void testFollowupRecordFitsWithoutExtension() throws Exception { + int chunkSize = 256; + ChunkedRecordAccumulator accum = newAccumulator(1024, chunkSize, 16L * chunkSize, Compression.NONE); + + accum.append(topic, partition1, 0L, key, new byte[16], Record.EMPTY_HEADERS, null, + maxBlockTimeMs, time.milliseconds(), cluster); + accum.append(topic, partition1, 0L, key, new byte[16], Record.EMPTY_HEADERS, null, + maxBlockTimeMs, time.milliseconds(), cluster); + + Deque dq = batchesFor(accum, tp1); + assertEquals(1, dq.size(), "Both records should land in the same batch"); + assertEquals(2, dq.peekFirst().recordCount); + accum.close(); + } + + /** + * A follow-up record that would overflow the existing chunks triggers mid-batch extension: + * additional chunks are attached and the record lands in the same batch. + */ + @Test + public void testMidBatchExtensionGrowsExistingBatch() throws Exception { + int chunkSize = 128; + ChunkedRecordAccumulator accum = newAccumulator(8192, chunkSize, 16L * chunkSize, Compression.NONE); + + // First record fits in 1 chunk (~64+overhead). + accum.append(topic, partition1, 0L, key, new byte[32], Record.EMPTY_HEADERS, null, + maxBlockTimeMs, time.milliseconds(), cluster); + + Deque dq = batchesFor(accum, tp1); + assertEquals(1, dq.size()); + int initialRecordCount = dq.peekFirst().recordCount; + assertEquals(1, initialRecordCount); + + // Second record big enough to require an extra chunk. + accum.append(topic, partition1, 0L, key, new byte[200], Record.EMPTY_HEADERS, null, + maxBlockTimeMs, time.milliseconds(), cluster); + + // Same batch, now with the second record. + assertEquals(1, dq.size(), "Should still be the same batch — extension should not roll"); + assertEquals(2, dq.peekFirst().recordCount, + "Second record should land in the extended batch"); + accum.close(); + } + + /** Compression is rejected at batch creation in PR1 with a clear message. */ + @Test + public void testCompressionThrowsAtBatchCreation() { + int chunkSize = 256; + ChunkedRecordAccumulator accum = newAccumulator(1024, chunkSize, 16L * chunkSize, Compression.gzip().build()); + + UnsupportedOperationException ex = assertThrows(UnsupportedOperationException.class, () -> + accum.append(topic, partition1, 0L, key, new byte[32], Record.EMPTY_HEADERS, null, + maxBlockTimeMs, time.milliseconds(), cluster)); + assertTrue(ex.getMessage().contains("not implemented yet"), + "Expected 'not implemented yet' in message, got: " + ex.getMessage()); + accum.close(); + } + + /** + * Race between two concurrent appenders on the same chunked tail. Both threads probe + * {@code extensionBytesNeeded} under the deque lock against the same physical remaining + * capacity, drop the lock, and allocate gap-sized chunks off-lock. When the first appender + * re-takes the lock and consumes most of the original remaining, the second appender's + * allocation — sized against the pre-race remaining — is short by the bytes the first + * appender consumed. Without a re-probe in the second-lock block, the second appender + * proceeds to write past the stream's physical capacity and {@code advanceToNextChunk} + * throws {@link IllegalStateException}. + *

+ * Coordination: a {@link ChunkedBufferPool} subclass blocks the FIRST {@code allocateChunks} + * call after the test arms a latch (i.e. thread A's). Thread B's allocate is the second call + * and proceeds normally — B re-takes the lock, attaches its chunks, appends its record, and + * returns. Only then is thread A released, so A attaches its chunks and tries to append + * against a tail whose remaining has already shrunk. + */ + @Test + public void testConcurrentExtensionRaceDoesNotOverflowChunkedStream() throws Exception { + final int chunkSize = 256; + final int recordValueSize = 350; // sized so gap ≈ chunkSize → minimal rounding cushion + + final AtomicBoolean armed = new AtomicBoolean(false); + final CountDownLatch aHoldsChunks = new CountDownLatch(1); + final CountDownLatch bDoneAttaching = new CountDownLatch(1); + + ChunkedBufferPool pool = new ChunkedBufferPool(64L * chunkSize, chunkSize, metrics, time, "producer-metrics") { + @Override + public List allocateChunks(int totalSize, long maxTimeToBlockMs) throws InterruptedException { + boolean blockThisCall = armed.compareAndSet(true, false); + List chunks = super.allocateChunks(totalSize, maxTimeToBlockMs); + if (blockThisCall) { + aHoldsChunks.countDown(); + if (!bDoneAttaching.await(10, TimeUnit.SECONDS)) { + throw new InterruptedException("test timeout waiting for B"); + } + } + return chunks; + } + }; + ChunkedRecordAccumulator accum = new ChunkedRecordAccumulator(logContext, 8192, Compression.NONE, + /* lingerMs */ 0, /* retryBackoffMs */ 0L, /* retryBackoffMaxMs */ 0L, + /* deliveryTimeoutMs */ 3200, metrics, "producer-metrics", time, + /* transactionManager */ null, pool); + try { + // Warmup: tiny first record establishes a chunked tail with a known remainingInStream. + accum.append(topic, partition1, 0L, key, new byte[1], Record.EMPTY_HEADERS, null, + maxBlockTimeMs, time.milliseconds(), cluster); + + // Arm the race: the next allocateChunks call (A's) will block until B has appended. + armed.set(true); + + AtomicReference aError = new AtomicReference<>(); + Thread tA = new Thread(() -> { + try { + accum.append(topic, partition1, 0L, key, new byte[recordValueSize], + Record.EMPTY_HEADERS, null, maxBlockTimeMs, time.milliseconds(), cluster); + } catch (Throwable t) { + aError.set(t); + } + }, "test-appender-A"); + tA.start(); + + // Wait for A to be parked inside allocateChunks holding its (pre-attach) chunks. + assertTrue(aHoldsChunks.await(10, TimeUnit.SECONDS), "A did not reach allocateChunks"); + + // B runs on this thread. Its allocateChunks is the second call — armed=false now, + // so it proceeds without blocking. B probes the same R as A (A hasn't attached yet), + // attaches its own chunks, appends its record. Now the tail's remaining has shrunk + // by B's actual record size, but A's still-off-lock allocation didn't know that. + accum.append(topic, partition1, 0L, key, new byte[recordValueSize], Record.EMPTY_HEADERS, null, + maxBlockTimeMs, time.milliseconds(), cluster); + + // Release A. Without the fix in the second-lock block, A's tryAppendToExisting + // will overflow the chunked stream and throw IllegalStateException. + bDoneAttaching.countDown(); + tA.join(10_000); + assertFalse(tA.isAlive(), "Thread A did not complete in time"); + + assertNull(aError.get(), + "Thread A failed: " + (aError.get() == null ? "" : aError.get().toString())); + } finally { + // Drain any state regardless of outcome. + accum.close(); + } + } + + /** + * Regression guard: an inflight chunked batch returns all K chunks to the pool on the + * expiration / broker-disconnect path, not just one {@code initialCapacity} chunkSize. + * The parent {@code RecordAccumulator.deallocate} for inflight batches credits only + * {@code initialCapacity} back to the pool and throws; {@link ChunkedRecordAccumulator} + * overrides {@code deallocate} to return all chunks before the throw. Removing that + * override would re-introduce a (K−1)×chunkSize leak and fail this test. + */ + @Test + public void testInflightExpirationReturnsAllChunksToPool() throws Exception { + int chunkSize = 128; + long totalMemory = 32L * chunkSize; + ChunkedBufferPool pool = new ChunkedBufferPool(totalMemory, chunkSize, metrics, time, "producer-metrics"); + ChunkedRecordAccumulator accum = new ChunkedRecordAccumulator(logContext, 8192, Compression.NONE, + /* lingerMs */ 0, /* retryBackoffMs */ 0L, /* retryBackoffMaxMs */ 0L, + /* deliveryTimeoutMs */ 3200, metrics, "producer-metrics", time, + /* transactionManager */ null, pool); + + // A record large enough that allocateChunks reserves multiple chunks (K > 1). + byte[] value = new byte[400]; + accum.append(topic, partition1, 0L, key, value, Record.EMPTY_HEADERS, null, + maxBlockTimeMs, time.milliseconds(), cluster); + + Deque dq = batchesFor(accum, tp1); + assertEquals(1, dq.size()); + ProducerBatch batch = dq.peekFirst(); + + // Confirm the batch actually consumed multiple chunks. estimateSizeInBytesUpperBound for + // a 400-byte value with v2 framing is well over chunkSize, so K should be >= 2. + long heldBeforeDeallocate = totalMemory - pool.availableMemory(); + assertTrue(heldBeforeDeallocate >= 2L * chunkSize, + "test setup expects K >= 2; pool held " + heldBeforeDeallocate + " bytes"); + + // Mark the batch as if the Sender had drained it. + batch.setInflight(true); + + // The inflight branch throws after deallocating. The chunked override must return + // all K chunks (not just initialCapacity) before propagating the exception. + assertThrows(IllegalStateException.class, () -> accum.deallocate(batch)); + + assertEquals(totalMemory, pool.availableMemory(), + "pool should be fully restored after inflight-expiration deallocate; " + + "any K-1 unsurrendered chunks indicate the chunked-leak regression"); + + accum.close(); + } + + private Deque batchesFor(RecordAccumulator accum, TopicPartition tp) { + return accum.getDeque(tp); + } + + /** + * Sender's drain (and any other caller of {@code batch.close()}) cascades through + * {@code MemoryRecordsBuilder.closeForRecordAppends()} → + * {@code appendStream.close()} → {@code bufferStream.close()}. The chunked stream's + * buffer must remain readable after that cascade so the builder can flatten chunks into + * the heap buffer it sends over the network. If the chunked stream's {@code close()} + * prematurely returns chunks to the pool, the flattened buffer is empty and the header + * write fails (or produces an empty record set). + */ + @Test + public void testBatchCloseDoesNotDeallocateChunksPrematurely() throws Exception { + int chunkSize = 256; + ChunkedRecordAccumulator accum = newAccumulator(8192, chunkSize, 32L * chunkSize, Compression.NONE); + + byte[] value = new byte[64]; + accum.append(topic, partition1, 0L, key, value, Record.EMPTY_HEADERS, null, + maxBlockTimeMs, time.milliseconds(), cluster); + + Deque dq = batchesFor(accum, tp1); + ProducerBatch batch = dq.peekFirst(); + assertNotNull(batch); + + // Matches the call sites in RecordAccumulator.drain (line 941) and Sender. + batch.close(); + MemoryRecords records = batch.records(); + assertTrue(records.sizeInBytes() > 0, + "batch must produce a non-empty record set after close; chunks were deallocated prematurely"); + + int count = 0; + for (RecordBatch rb : records.batches()) { + for (Record r : rb) { + count++; + assertNotNull(r.value()); + } + } + assertEquals(1, count, "expected exactly 1 record after close"); + + accum.close(); + } +} From 64147f6b492933e343d983f3a168edcdc427976b Mon Sep 17 00:00:00 2001 From: Lianet Magrans Date: Thu, 18 Jun 2026 15:27:38 -0400 Subject: [PATCH 03/11] config & ratio-aware chunk sizing --- .../kafka/clients/producer/KafkaProducer.java | 58 ++++- .../clients/producer/ProducerConfig.java | 21 ++ .../producer/internals/ChunkedBufferPool.java | 47 ++-- .../ChunkedByteBufferOutputStream.java | 36 +-- .../internals/ChunkedProducerBatch.java | 94 +++++++ .../internals/ChunkedRecordAccumulator.java | 236 ++++++++---------- .../producer/internals/ProducerBatch.java | 38 +-- .../producer/internals/RecordAccumulator.java | 54 +++- .../record/internal/AbstractRecords.java | 25 ++ .../record/internal/MemoryRecordsBuilder.java | 48 +++- .../clients/producer/ProducerConfigTest.java | 35 +++ .../ChunkedRecordAccumulatorTest.java | 91 +++++-- 12 files changed, 544 insertions(+), 239 deletions(-) create mode 100644 clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedProducerBatch.java diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/KafkaProducer.java b/clients/src/main/java/org/apache/kafka/clients/producer/KafkaProducer.java index dfa72f2adad71..0990a68439171 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/KafkaProducer.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/KafkaProducer.java @@ -28,6 +28,8 @@ import org.apache.kafka.clients.consumer.OffsetCommitCallback; import org.apache.kafka.clients.producer.internals.BufferPool; import org.apache.kafka.clients.producer.internals.BuiltInPartitioner; +import org.apache.kafka.clients.producer.internals.ChunkedBufferPool; +import org.apache.kafka.clients.producer.internals.ChunkedRecordAccumulator; import org.apache.kafka.clients.producer.internals.KafkaProducerMetrics; import org.apache.kafka.clients.producer.internals.ProducerInterceptors; import org.apache.kafka.clients.producer.internals.ProducerMetadata; @@ -90,6 +92,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.Optional; @@ -456,19 +459,48 @@ public KafkaProducer(Properties properties, Serializer keySerializer, Seriali // As per Kafka producer configuration documentation batch.size may be set to 0 to explicitly disable // batching which in practice actually means using a batch size of 1. int batchSize = Math.max(1, config.getInt(ProducerConfig.BATCH_SIZE_CONFIG)); - this.accumulator = new RecordAccumulator(logContext, - batchSize, - compression, - lingerMs(config), - retryBackoffMs, - retryBackoffMaxMs, - deliveryTimeoutMs, - partitionerConfig, - metrics, - PRODUCER_METRIC_GROUP_NAME, - time, - transactionManager, - new BufferPool(this.totalMemorySize, batchSize, metrics, time, PRODUCER_METRIC_GROUP_NAME)); + String allocationStrategy = config.getString(ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_CONFIG) + .toLowerCase(Locale.ROOT); + boolean incremental = ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_INCREMENTAL.equals(allocationStrategy); + if (incremental && compression.type() != CompressionType.NONE) { + throw new ConfigException("The " + ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_INCREMENTAL + + " " + ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_CONFIG + + " does not support compression yet. " + ProducerConfig.COMPRESSION_TYPE_CONFIG + + " must be set to none."); + } + // Use the chunked path only when a batch is at least one full chunk + // (batch.size >= CHUNK_SIZE). Below that, a batch can't fill even one chunk, so chunking + // would reserve a whole chunk for a batch capped under it (over-reservation); the full + // strategy (reserving up to batch.size per batch) is both leaner and identical to trunk. + if (incremental && batchSize >= ChunkedRecordAccumulator.CHUNK_SIZE) { + this.accumulator = new ChunkedRecordAccumulator(logContext, + batchSize, + compression, + lingerMs(config), + retryBackoffMs, + retryBackoffMaxMs, + deliveryTimeoutMs, + partitionerConfig, + metrics, + PRODUCER_METRIC_GROUP_NAME, + time, + transactionManager, + new ChunkedBufferPool(this.totalMemorySize, ChunkedRecordAccumulator.CHUNK_SIZE, metrics, time, PRODUCER_METRIC_GROUP_NAME)); + } else { + this.accumulator = new RecordAccumulator(logContext, + batchSize, + compression, + lingerMs(config), + retryBackoffMs, + retryBackoffMaxMs, + deliveryTimeoutMs, + partitionerConfig, + metrics, + PRODUCER_METRIC_GROUP_NAME, + time, + transactionManager, + new BufferPool(this.totalMemorySize, batchSize, metrics, time, PRODUCER_METRIC_GROUP_NAME)); + } this.errors = this.metrics.sensor("errors"); this.sender = newSender(logContext, kafkaClient, this.metadata); diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/ProducerConfig.java b/clients/src/main/java/org/apache/kafka/clients/producer/ProducerConfig.java index 38a007b6d212b..286f3e37dc823 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/ProducerConfig.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/ProducerConfig.java @@ -228,6 +228,20 @@ public class ProducerConfig extends AbstractConfig { + "not all memory the producer uses is used for buffering. Some additional memory will be used for compression (if " + "compression is enabled) as well as for maintaining in-flight requests."; + /** buffer.memory.allocation.strategy */ + public static final String BUFFER_MEMORY_ALLOCATION_STRATEGY_CONFIG = "buffer.memory.allocation.strategy"; + public static final String BUFFER_MEMORY_ALLOCATION_STRATEGY_FULL = "full"; + public static final String BUFFER_MEMORY_ALLOCATION_STRATEGY_INCREMENTAL = "incremental"; + private static final String BUFFER_MEMORY_ALLOCATION_STRATEGY_DOC = "Controls how the producer allocates memory from " + BUFFER_MEMORY_CONFIG + " for record batches. The following values are supported: " + + "

    " + + "
  • " + BUFFER_MEMORY_ALLOCATION_STRATEGY_FULL + ": reserve " + BATCH_SIZE_CONFIG + " bytes up front for each in-progress batch, " + + "regardless of the actual size of the records appended to it.
  • " + + "
  • " + BUFFER_MEMORY_ALLOCATION_STRATEGY_INCREMENTAL + ": allocate memory on demand as records are appended, growing each batch " + + "up to " + BATCH_SIZE_CONFIG + " bytes. This keeps memory usage proportional to the data actually buffered rather than to the number of " + + "active partitions, which allows larger " + BATCH_SIZE_CONFIG + " values (e.g. for high-latency clusters) without reserving " + + "" + BATCH_SIZE_CONFIG + " bytes for every active partition.
  • " + + "
"; + /** retry.backoff.ms */ public static final String RETRY_BACKOFF_MS_CONFIG = CommonClientConfigs.RETRY_BACKOFF_MS_CONFIG; @@ -399,6 +413,13 @@ public class ProducerConfig extends AbstractConfig { Importance.MEDIUM, CommonClientConfigs.CLIENT_DNS_LOOKUP_DOC) .define(BUFFER_MEMORY_CONFIG, Type.LONG, 32 * 1024 * 1024L, atLeast(0L), Importance.HIGH, BUFFER_MEMORY_DOC) + .define(BUFFER_MEMORY_ALLOCATION_STRATEGY_CONFIG, + Type.STRING, + BUFFER_MEMORY_ALLOCATION_STRATEGY_FULL, + ConfigDef.CaseInsensitiveValidString + .in(BUFFER_MEMORY_ALLOCATION_STRATEGY_FULL, BUFFER_MEMORY_ALLOCATION_STRATEGY_INCREMENTAL), + Importance.MEDIUM, + BUFFER_MEMORY_ALLOCATION_STRATEGY_DOC) .define(RETRIES_CONFIG, Type.INT, Integer.MAX_VALUE, between(0, Integer.MAX_VALUE), Importance.HIGH, RETRIES_DOC) .define(ACKS_CONFIG, Type.STRING, diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java index 96a4e6e0b36e4..9fa367d80e889 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java @@ -28,15 +28,12 @@ import java.util.concurrent.locks.Condition; /** - * A {@link BufferPool} dedicated to chunk-sized buffer reuse. The chunk size is passed to the - * parent as its {@link #poolableSize()}. + * A {@link BufferPool} dedicated to chunk-sized buffer reuse (chunk size = {@link #poolableSize()}). *

- * Adds {@link #allocateChunks(int, long)} for callers that need to acquire multiple - * chunks at once. The implementation is atomic across the K-chunk request: - * a single lock acquisition for the whole reservation, a single {@link Condition} - * added to {@link #waiters} so the request is FIFO-fair against single-chunk acquirers, and - * any failure (timeout, close, OOM) refunds the entire reservation atomically before the - * exception propagates. No partial chunk holds are visible outside the lock during the wait. + * Adds {@link #allocateChunks(int, long)} to acquire multiple chunks atomically: one lock + * acquisition and a single {@link Condition} on {@link #waiters} (FIFO-fair against single-chunk + * acquirers), with any failure (timeout, close, OOM) refunding the whole reservation before the + * exception propagates. No partial holds are visible during the wait. */ public class ChunkedBufferPool extends BufferPool { @@ -45,22 +42,19 @@ public ChunkedBufferPool(long memory, int chunkSize, Metrics metrics, Time time, } /** - * Allocate {@code ceil(totalSize / chunkSize)} chunk-sized buffers atomically. - * This follows the same principle as the allocate in the parent BufferPool class: if there is - * memory available, allocate the requested number of chunks. If not, wait for memory. - * If it needs to wait, it blocks up to {@code maxTimeToBlockMs} for the whole request, - * queued on {@link #waiters} alongside single-chunk acquirers (FIFO). - *

- * On any failure path every byte reserved during the call is returned to the pool - * and the next waiter is signaled. No partial chunk holds are visible to other threads during the wait. - * The reservation is tracked as bytes against {@link #nonPooledAvailableMemory} plus chunks polled from {@link #free}. + * Allocate {@code ceil(totalSize / chunkSize)} chunk-sized buffers atomically, mirroring + * {@link BufferPool#allocate}: satisfied immediately if memory is available, else blocks up to + * {@code maxTimeToBlockMs} for the whole request (FIFO on {@link #waiters}). The reservation is + * tracked as bytes against {@link #nonPooledAvailableMemory} plus chunks polled from + * {@link #free}; on any failure path every reserved byte is returned and the next waiter signaled. * * @param totalSize minimum total bytes of capacity required across the returned chunks * @param maxTimeToBlockMs maximum time in milliseconds to block waiting for memory * @return list of {@code ceil(totalSize / chunkSize)} {@code ByteBuffer}s, each of capacity * {@code chunkSize} * @throws InterruptedException if interrupted while waiting - * @throws IllegalArgumentException if {@code totalSize <= 0} or {@code totalSize > totalMemory()} + * @throws IllegalArgumentException if {@code totalSize <= 0}, or if the request rounded up to + * whole chunks exceeds {@code totalMemory()} * @throws BufferExhaustedException if the request can't be satisfied within {@code maxTimeToBlockMs} * @throws KafkaException if the pool is closed during the wait */ @@ -75,11 +69,17 @@ public List allocateChunks(int totalSize, long maxTimeToBlockMs) thr int chunkSize = poolableSize(); int numChunks = (int) (((long) totalSize + chunkSize - 1L) / chunkSize); long memoryRequired = (long) numChunks * chunkSize; + // Rounding totalSize up to whole chunks can require more pool memory than buffer.memory + // holds even when totalSize alone fits (the single-buffer "full" path reserves exactly the + // record size, so it never hits this). Reject it here to fail fast, instead of blocking for + // max.block.ms and then throwing BufferExhausted for a request that can never be satisfied. + if (memoryRequired > totalMemory()) + throw new IllegalArgumentException("Attempt to allocate " + numChunks + " chunks of " + + chunkSize + " bytes (" + memoryRequired + " bytes total), but there is a hard limit of " + + totalMemory() + " on memory allocations."); - // Chunks pulled from the free list during the reservation. Remaining bytes - // (memoryRequired - pooled.size() * chunkSize) are reserved against - // nonPooledAvailableMemory and materialized as raw allocations after the lock is - // released. + // Chunks pulled from the free list; the remaining bytes are reserved against + // nonPooledAvailableMemory and materialized as raw allocations after the lock is released. List pooled = new ArrayList<>(numChunks); lock.lock(); @@ -95,8 +95,7 @@ public List allocateChunks(int totalSize, long maxTimeToBlockMs) thr pooled.add(free.pollFirst()); long remainingBytes = memoryRequired - (long) pooled.size() * chunkSize; if (remainingBytes > 0) { - // remainingBytes is bounded by memoryRequired ≤ totalMemory; the entry check - // rejects requests larger than that, so the int cast is safe in practice. + // remainingBytes <= memoryRequired <= totalMemory (validated above), so the int cast is safe. freeUp((int) remainingBytes); this.nonPooledAvailableMemory -= remainingBytes; } diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java index ccf1d3d7374c9..3b9a9ed42ff5e 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java @@ -32,10 +32,9 @@ * chunks), {@link IllegalStateException} is thrown. The caller is responsible for attaching * additional chunks before any such write. *

- * Automatic mid-write growth (allocating a chunk from inside {@code write()}) will be added - * together with compression support in a follow-up: compressor buffering can cause a record's - * write to exceed its reservation, which the caller can't predict in advance. Until then, the - * write path stays allocation-free and deterministic. + * Automatic mid-write growth (allocating from inside {@code write()}) is a follow-up tied to + * compression support, where compressor buffering can make a write exceed its reservation; until + * then the write path stays allocation-free and deterministic. *

* When {@link #buffer()} is called (typically at batch close), all chunks are flattened into a * single contiguous ByteBuffer — exactly one copy operation. @@ -110,11 +109,10 @@ private void ensureChunkCapacity(int needed) { } /** - * Advances {@code currentChunk} to the next pre-supplied chunk. The caller is responsible - * for obtaining additional chunks (e.g. {@code ChunkedBufferPool.allocateChunks}) and - * attaching them via {@link #addBuffers(List)} before any write that would exceed - * {@link #remaining()}. The KIP's mid-record growth path (non-blocking pool then direct - * heap) lands together with compression support. + * Advances {@code currentChunk} to the next pre-supplied chunk; throws if none is left. The + * caller must attach additional chunks via {@link #addBuffers(List)} before any write that + * exceeds {@link #remaining()}. (Mid-record growth — non-blocking pool then heap — is a + * follow-up that lands with compression support.) */ private void advanceToNextChunk() { if (currentChunkIndex + 1 >= chunks.size()) { @@ -140,6 +138,11 @@ public ByteBuffer buffer() { if (flattenedBuffer != null && !dirty) { return flattenedBuffer; } + // TODO: KAFKA-20687. This flatten runs at batch close, when the chunk set is final. + // Today all chunks (used and unused) are returned to the pool only when the batch + // completes (via deallocate(pool)). Consider releasing the fully-unused chunks + // early, at close, rather than holding them until completion — removing them from + // `chunks` here so the completion-time deallocate(pool) does not double-return them. int totalSize = 0; for (ByteBuffer chunk : chunks) { totalSize += chunk.position(); @@ -153,6 +156,12 @@ public ByteBuffer buffer() { chunk.position(chunkPos); } dirty = false; + // The bytes are now copied into flattenedBuffer, but we intentionally do not release the + // chunks until batch completion, so the in-flight data stays reserved against the + // buffer.memory budget (the pool's available memory reflects it), consistent with the + // "full" strategy. Releasing here would return that memory to the pool while the bytes are + // still in flight in the heap copy, letting the pool admit more than buffer.memory intends. + // This flattening is an initial approach and will be removed with KAFKA-20580. return flattenedBuffer; } @@ -219,15 +228,14 @@ public int initialCapacity() { @Override public void ensureRemaining(int remainingBytesRequired) { - // The stream advances one chunk at a time — the most any single ensureChunkCapacity - // call can guarantee is `chunkSize` of contiguous-by-chunk space. Callers needing more - // than that are expected to attach chunks via addBuffers before writing; write(byte[]) - // itself loops across chunks so contiguous capacity isn't required. + // A single call can guarantee at most `chunkSize` of space (the stream advances one chunk + // at a time); callers needing more attach chunks via addBuffers first. write(byte[]) loops + // across chunks, so contiguous capacity isn't required. ensureChunkCapacity(Math.min(remainingBytesRequired, chunkSize)); } /** - * Returns all pool-allocated chunks to the buffer pool. + * Returns all pool-allocated chunks to the buffer pool. Called at batch completion. */ public void deallocate(BufferPool pool) { if (pool != null) { diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedProducerBatch.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedProducerBatch.java new file mode 100644 index 0000000000000..e80ed7861826d --- /dev/null +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedProducerBatch.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.clients.producer.internals; + +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.header.Header; +import org.apache.kafka.common.record.internal.MemoryRecordsBuilder; +import org.apache.kafka.common.utils.ByteBufferOutputStream; + +import java.nio.ByteBuffer; +import java.util.List; + +/** + * A {@link ProducerBatch} for the incremental (chunked) buffer.memory allocation strategy, backed + * by a {@link MemoryRecordsBuilder} whose stream is a {@link ChunkedByteBufferOutputStream}. + * It adds mid-batch chunk extension support ({@link #extensionBytesNeeded} / + * {@link #addBuffers}) and overrides the pool deallocation hooks so all chunks are returned to + * the pool rather than a single buffer. + *

+ * This class is not thread safe and external synchronization must be used when modifying it. + */ +public class ChunkedProducerBatch extends ProducerBatch { + + // The parent's builder reference is private; keep our own to access stream capacity state. + private final MemoryRecordsBuilder recordsBuilder; + + public ChunkedProducerBatch(TopicPartition tp, MemoryRecordsBuilder recordsBuilder, long createdMs) { + super(tp, recordsBuilder, createdMs); + this.recordsBuilder = recordsBuilder; + } + + /** + * Bytes of physical buffer this batch needs before {@code tryAppend} could accept the given + * record. Returns 0 when the record fits (either logically full, or the stream's combined + * chunk capacity has room). Positive when {@code hasRoomFor} allows the record but the + * chunks lack physical capacity — the accumulator allocates exactly the missing bytes + * (rounded up to whole chunks) and attaches them via {@link #addBuffers} before retrying. + */ + int extensionBytesNeeded(long timestamp, byte[] key, byte[] value, Header[] headers) { + if (recordCount == 0) + return 0; + if (!recordsBuilder.hasRoomFor(timestamp, key, value, headers)) + return 0; + // Size against the batch's projected total output after this record (header counted once, + // ratio-adjusted when compressed), not per-record. Per-record sizing would over-count the + // header and miss the compressor's flush-accumulation behavior. + int target = recordsBuilder.estimatedBytesWrittenAfter(key, value, headers); + ByteBufferOutputStream stream = recordsBuilder.bufferStream(); + int totalAttachedCapacity = stream.position() + stream.remaining(); + return Math.max(0, target - totalAttachedCapacity); + } + + /** + * Attach pre-allocated chunks to this batch's stream so the next {@code tryAppend} can + * spill into them. Ownership of the chunks transfers to the stream. + */ + void addBuffers(List chunks) { + stream().addBuffers(chunks); + } + + @Override + protected void deallocateBuffer(BufferPool pool) { + stream().deallocate(pool); + } + + /** + * Unlike the single-buffer batch — which must donate a fresh buffer because the network + * layer may still be reading the pooled one — a chunked batch's inflight bytes live in the + * separate flattened buffer (see {@link ChunkedByteBufferOutputStream#buffer()}), so it is + * safe to return the actual chunks to the pool here. + */ + @Override + protected void deallocateInflightBuffer(BufferPool pool) { + stream().deallocate(pool); + } + + private ChunkedByteBufferOutputStream stream() { + return (ChunkedByteBufferOutputStream) recordsBuilder.bufferStream(); + } +} diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java index cac18c608d195..a587754d80c44 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java @@ -17,13 +17,17 @@ package org.apache.kafka.clients.producer.internals; import org.apache.kafka.clients.producer.BufferExhaustedException; +import org.apache.kafka.clients.producer.Callback; import org.apache.kafka.clients.producer.RecordMetadata; import org.apache.kafka.common.Cluster; +import org.apache.kafka.common.KafkaException; +import org.apache.kafka.common.TopicPartition; import org.apache.kafka.common.compress.Compression; import org.apache.kafka.common.header.Header; import org.apache.kafka.common.metrics.Metrics; import org.apache.kafka.common.record.TimestampType; import org.apache.kafka.common.record.internal.AbstractRecords; +import org.apache.kafka.common.record.internal.CompressionRatioEstimator; import org.apache.kafka.common.record.internal.CompressionType; import org.apache.kafka.common.record.internal.MemoryRecordsBuilder; import org.apache.kafka.common.record.internal.Record; @@ -40,24 +44,31 @@ * A {@link RecordAccumulator} variant that uses chunked buffer allocation: each new batch * pre-allocates {@code ceil(recordSize / chunkSize)} chunks from a {@link ChunkedBufferPool}, * and grows mid-batch by attaching additional chunks via - * {@link ProducerBatch#addBuffers(List)} when later records would overflow. + * {@link ChunkedProducerBatch#addBuffers(List)} when later records would overflow. *

- * This is the "dynamic" buffer.memory allocation strategy: memory consumption scales with + * This is the "incremental" buffer.memory allocation strategy: memory consumption scales with * actual data buffered, not with {@code active_partition_count × batch.size}. *

- * PR1 scope: uncompressed only. Batch creation throws - * {@link UnsupportedOperationException} if compression is requested. The mid-batch flow - * matches the KIP: try a non-blocking pool acquire for the extension chunks; on - * {@link BufferExhaustedException}, close the existing batch (so the sender can drain it) - * and let the new record flow through the first-record path, which blocks on the pool for - * a fresh batch's chunks up to {@code max.block.ms}. + * When a record arrives mid-batch (an open batch already exists) and the batch's chunks are + * full, the flow is: under the deque lock, probe that open batch — the tail of the partition's + * deque — via the {@link #tryAppend} override, which returns + * {@link RecordAppendResult#needsExtension(int)} when the batch has logical room but lacks + * physical chunk capacity, then attempt a non-blocking pool acquire for the extra chunks. If the pool is exhausted, close the current batch (so the sender can drain it) + * and route the record through the first-record path, which blocks on the pool up to + * {@code max.block.ms}. As a result, unlike the "full" strategy, {@code send()} may block (or + * fail with {@link BufferExhaustedException}) on records other than the first of a batch. *

- * TODO: add the new buffer.memory.allocation.strategy config and instantiate this class only when set to "dynamic" - * TODO: add support for compressed data. Throws UnsupportedOperationException for now, - * and IllegalStateException if overflow detected. Follow-ups will support compressed data and growth. + * TODO: support compressed data (with mid-record growth); the constructor rejects compression for now. */ public class ChunkedRecordAccumulator extends RecordAccumulator { + /** + * Fixed size of every chunk, independent of {@code batch.size}. The incremental strategy is + * only used when {@code batch.size >= CHUNK_SIZE} (see {@code KafkaProducer}); below it a batch + * is smaller than a single chunk, so the producer uses the full strategy instead. + */ + public static final int CHUNK_SIZE = 16 * 1024; + private final ChunkedBufferPool chunkedFree; public ChunkedRecordAccumulator(LogContext logContext, @@ -75,6 +86,11 @@ public ChunkedRecordAccumulator(LogContext logContext, ChunkedBufferPool bufferPool) { super(logContext, batchSize, compression, lingerMs, retryBackoffMs, retryBackoffMaxMs, deliveryTimeoutMs, partitionerConfig, metrics, metricGrpName, time, transactionManager, bufferPool); + // TODO: drop this once the incremental strategy supports compressed data (with the + // mid-record growth fallback for compressor overshoot). + if (compression.type() != CompressionType.NONE) + throw new UnsupportedOperationException( + "Compression is not yet supported with the incremental buffer.memory allocation strategy"); this.chunkedFree = bufferPool; } @@ -132,31 +148,25 @@ public RecordAppendResult append(String topic, if (partitionChanged(topic, topicInfo, partitionInfo, dq, nowMs, cluster)) continue; - // Probe the tail directly. If the existing batch has logical room but the - // underlying stream lacks physical capacity, capture the gap and fall through - // to allocate the extension outside the deque lock. Otherwise, attempt the - // append against the existing batch. - ProducerBatch tail = dq.peekLast(); - int gap = tail == null ? 0 : tail.extensionBytesNeeded(timestamp, key, value, headers); - if (gap > 0) { - extensionBytes = gap; - } else { - extensionBytes = 0; // clear any stale value carried from a prior iteration - RecordAppendResult appendResult = tryAppend(timestamp, key, value, headers, callbacks, dq, nowMs); - if (appendResult != null) { - boolean enableSwitch = allBatchesFull(dq); - topicInfo.builtInPartitioner.updatePartitionInfo(partitionInfo, appendResult.appendedBytes, cluster, enableSwitch); - return appendResult; - } + // The tryAppend override probes the physical capacity of the tail batch + // (dq.peekLast(), the partition's open batch): a needsBufferExtension result + // means it has logical room but its chunks lack space for this record — fall + // through to allocate the gap outside the deque lock. A null result means the + // tail batch is full or absent — fall through to the first-record (new batch) path. + RecordAppendResult appendResult = tryAppend(timestamp, key, value, headers, callbacks, dq, nowMs); + if (appendResult != null && !appendResult.needsBufferExtension) { + boolean enableSwitch = allBatchesFull(dq); + topicInfo.builtInPartitioner.updatePartitionInfo(partitionInfo, appendResult.appendedBytes, cluster, enableSwitch); + return appendResult; } + extensionBytes = appendResult == null ? 0 : appendResult.extensionBytesNeeded; } if (extensionBytes > 0 && extensionChunks == null) { - // Mid-batch extension: non-blocking attempt only. The producer thread already - // holds chunks for the open batch; blocking here risks deadlock with the Sender - // thread (which frees pool memory by completing batches). If the pool is - // exhausted, we close the existing batch (so it becomes drainable) and fall - // through to the first-record blocking path on the next loop iteration. + // Mid-batch extension: non-blocking only. The thread already holds the open + // batch's chunks, so blocking here could deadlock with the Sender (which frees + // pool memory by completing batches). On exhaustion, close the batch (making it + // drainable) and fall through to the blocking first-record path next iteration. try { extensionChunks = chunkedFree.allocateChunks(extensionBytes, 0L); } catch (BufferExhaustedException e) { @@ -168,21 +178,19 @@ public RecordAppendResult append(String topic, last.closeForRecordAppends(); } } - extensionBytes = 0; continue; } nowMs = time.milliseconds(); } else if (extensionBytes == 0 && bufferStream == null) { - // First-record path: block on the pool for enough chunks to fit this record. - // Temporary while compressed data not supported. - // TODO: drop this throw and route through the - // compressed path (which adds the mid-record growth fallback for compressor overshoot). - if (compression.type() != CompressionType.NONE) { - throw new UnsupportedOperationException( - "Compression is not implemented yet for the dynamic buffer.memory allocation strategy"); - } - int size = AbstractRecords.estimateSizeInBytesUpperBound( + // First-record path: block on the pool for enough chunks to fit this record, + // sized with the same cumulative estimator used mid-batch (header + record bytes + // for NONE, ratio-adjusted when compressed) so the two stay consistent. + int recordUncompressed = AbstractRecords.recordSizeUpperBound( RecordBatch.CURRENT_MAGIC_VALUE, compression.type(), key, value, headers); + int size = MemoryRecordsBuilder.estimatedBytesWritten( + RecordBatch.CURRENT_MAGIC_VALUE, compression.type(), + CompressionRatioEstimator.estimation(topic, compression.type()), + recordUncompressed); log.trace("Allocating {} byte chunked buffer ({} byte chunks) for topic {} partition {} with remaining timeout {}ms", size, chunkedFree.poolableSize(), topic, effectivePartition, maxTimeToBlock); List initialChunks = chunkedFree.allocateChunks(size, maxTimeToBlock); @@ -196,67 +204,48 @@ public RecordAppendResult append(String topic, if (extensionChunks != null) { ProducerBatch last = dq.peekLast(); - // The off-lock allocateChunks window allows the original tail batch to be - // drained and a split batch (plain ByteBufferOutputStream) to be addFirst'd - // into an emptied deque — see splitAndReenqueue. The instanceof guard below - // is load-bearing: addBuffers is only defined on ChunkedByteBufferOutputStream; - // the else branch returns the chunks to the pool to avoid leaking them. - if (last != null && last.isWritable() - && last.bufferStream() instanceof ChunkedByteBufferOutputStream) { - ((ChunkedByteBufferOutputStream) last.bufferStream()).addBuffers(extensionChunks); + // The off-lock allocateChunks window allows the probed tail batch to be + // drained and replaced — possibly by a split batch (a plain + // ProducerBatch), which can't take extension chunks. Only attach to a + // writable chunked tail; otherwise refund the chunks and re-evaluate. + if (last instanceof ChunkedProducerBatch && last.isWritable()) { + ((ChunkedProducerBatch) last).addBuffers(extensionChunks); extensionChunks = null; - extensionBytes = 0; - // Re-probe after attach: between our first-lock probe and now we were off-lock, - // and another appender (or a tail-replacement) may have consumed capacity that - // our extension allocation was sized against. If a gap remains, loop to acquire - // more chunks against the now-current state. Without this re-probe, a - // tryAppend that passes hasRoomFor (writeLimit) but exceeds the stream's - // physical remaining() trips IllegalStateException in - // ChunkedByteBufferOutputStream.advanceToNextChunk. Termination: writeLimit is - // bounded and fixed; once hasRoomFor turns false, extensionBytesNeeded returns 0 - // and we fall through to new-batch creation. Regression test: - // testConcurrentExtensionRaceDoesNotOverflowChunkedStream. - int recheck = last.extensionBytesNeeded(timestamp, key, value, headers); - if (recheck > 0) { - extensionBytes = recheck; - continue; - } RecordAppendResult retryResult = tryAppend(timestamp, key, value, headers, callbacks, dq, nowMs); - if (retryResult != null) { + if (retryResult != null && !retryResult.needsBufferExtension) { boolean enableSwitch = allBatchesFull(dq); topicInfo.builtInPartitioner.updatePartitionInfo(partitionInfo, retryResult.appendedBytes, cluster, enableSwitch); return retryResult; } - // Batch became full via writeLimit while we were off-lock — fall through - // to new-batch creation on the next iteration. + // needsBufferExtension: a concurrent appender consumed capacity our + // extension was sized against — loop to re-probe. null: batch became + // full — loop into new-batch creation. Terminates because writeLimit is + // fixed: once full, the probe stops requesting extension. Regression: + // testConcurrentExtensionRaceDoesNotOverflowChunkedStream. continue; } // Tail is gone, closed, or non-chunked (e.g., a split batch). Return chunks to pool. for (ByteBuffer chunk : extensionChunks) chunkedFree.deallocate(chunk); extensionChunks = null; - extensionBytes = 0; - continue; - } - - // bufferStream is non-null here (first-record path). Before creating a new - // batch with it, re-probe the tail: a concurrent appender may have created a - // chunked batch we should extend instead. If so, release the oversized - // bufferStream and let the next iteration take the extension path. - ProducerBatch tail = dq.peekLast(); - if (tail != null && tail.isWritable() - && tail.bufferStream() instanceof ChunkedByteBufferOutputStream - && tail.extensionBytesNeeded(timestamp, key, value, headers) > 0) { - bufferStream.deallocate(); - bufferStream = null; continue; } + // bufferStream is non-null here (first-record path). int firstRecordSize = AbstractRecords.estimateSizeInBytesUpperBound( RecordBatch.CURRENT_MAGIC_VALUE, compression.type(), key, value, headers); final ChunkedByteBufferOutputStream batchStream = bufferStream; RecordAppendResult appendResult = appendNewBatch(topic, effectivePartition, dq, timestamp, key, value, headers, callbacks, () -> chunkedRecordsBuilder(batchStream, firstRecordSize), nowMs); + if (appendResult.needsBufferExtension) { + // A concurrent appender created a tail batch we should extend rather + // than start a new one (detected by appendNewBatch's in-lock tryAppend). + // Our bufferStream was sized for a fresh batch — release it and loop so + // the extension path allocates exactly the gap-sized chunks. + bufferStream.deallocate(); + bufferStream = null; + continue; + } if (appendResult.newBatchCreated) bufferStream = null; boolean enableSwitch = allBatchesFull(dq); @@ -276,61 +265,46 @@ public RecordAppendResult append(String topic, } /** - * Build a {@link MemoryRecordsBuilder} backed by the chunked stream. - * {@code writeLimit} is set to {@code max(batchSize, firstRecordSize)} — the same logical - * cap the legacy path uses. Physical chunk capacity grows on demand via - * {@link ChunkedByteBufferOutputStream#addBuffers(List)}; {@code writeLimit} bounds when a - * batch is considered logically full and a new one must be started. - */ - private MemoryRecordsBuilder chunkedRecordsBuilder(ChunkedByteBufferOutputStream bufferStream, - int firstRecordSize) { - int writeLimit = Math.max(batchSize, firstRecordSize); - return new MemoryRecordsBuilder(bufferStream, RecordBatch.CURRENT_MAGIC_VALUE, compression, - TimestampType.CREATE_TIME, 0L, RecordBatch.NO_TIMESTAMP, RecordBatch.NO_PRODUCER_ID, - RecordBatch.NO_PRODUCER_EPOCH, RecordBatch.NO_SEQUENCE, false, false, - RecordBatch.NO_PARTITION_LEADER_EPOCH, writeLimit); - } - - /** - * Chunked-aware deallocation for {@link ProducerBatch}. + * Try to append to a ProducerBatch, with mid-batch chunk extension support. *

- * Intercepts the {@link #isInflight()} branch of the parent's - * {@link RecordAccumulator#deallocate(ProducerBatch)} when the batch is backed by a - * {@link ChunkedByteBufferOutputStream}. The parent's defensive path credits only - * {@code initialCapacity} (one {@code chunkSize}) back to the pool and throws — that's the - * KAFKA-19012-aware safety net for the legacy single-buffer path, where the in-flight - * buffer is the pooled buffer. A chunked batch holds K chunks but the in-flight - * network bytes live in a separate flattened heap buffer (see - * {@link ChunkedByteBufferOutputStream#buffer()}), so it is safe to return all K chunks - * to the pool here. The {@code IllegalStateException} is still propagated to preserve the - * contract that deallocating an inflight batch is unexpected upstream. + * If the tail batch has logical room (writeLimit-wise) but its chunked stream lacks + * physical capacity, returns {@link RecordAppendResult#needsExtension(int)} without + * attempting the append; the caller allocates chunks outside the deque lock, attaches + * them, and retries. Otherwise defers to the parent. */ @Override - public void deallocate(ProducerBatch batch) { - if (!batch.isSplitBatch() - && !batch.isBufferDeallocated() - && batch.isInflight() - && batch.bufferStream() instanceof ChunkedByteBufferOutputStream) { - batch.markBufferDeallocated(); - deallocateBatchBuffer(batch); - throw new IllegalStateException("Attempting to deallocate a batch that is inflight. Batch is " + batch); + protected RecordAppendResult tryAppend(long timestamp, byte[] key, byte[] value, Header[] headers, + Callback callback, Deque deque, long nowMs) { + if (closed) + throw new KafkaException("Producer closed while send in progress"); + ProducerBatch last = deque.peekLast(); + // Split batches in an incremental deque are plain ProducerBatch (heap-backed, grow-on-demand) + // and never need chunk extension, so the probe only applies to chunked batches. + if (last instanceof ChunkedProducerBatch) { + int extensionBytes = ((ChunkedProducerBatch) last).extensionBytesNeeded(timestamp, key, value, headers); + if (extensionBytes > 0) + return RecordAppendResult.needsExtension(extensionBytes); } - super.deallocate(batch); + return super.tryAppend(timestamp, key, value, headers, callback, deque, nowMs); + } + + @Override + protected ProducerBatch createProducerBatch(TopicPartition tp, MemoryRecordsBuilder recordsBuilder, long nowMs) { + return new ChunkedProducerBatch(tp, recordsBuilder, nowMs); } /** - * Route chunked batch deallocation through the stream rather than the legacy - * single-buffer pool path. The flattened buffer returned by {@code batch.buffer()} is a - * fresh heap allocation and must NOT be passed to the pool — instead, the stream owns - * the chunk-sized buffers and returns them to the pool. + * Build a {@link MemoryRecordsBuilder} backed by the chunked stream. {@code writeLimit} = + * {@code max(batchSize, firstRecordSize)} (the same logical cap as the full path) bounds when + * the batch is full; physical capacity grows on demand via + * {@link ChunkedByteBufferOutputStream#addBuffers(List)}. */ - @Override - protected void deallocateBatchBuffer(ProducerBatch batch) { - if (batch.bufferStream() instanceof ChunkedByteBufferOutputStream) { - ((ChunkedByteBufferOutputStream) batch.bufferStream()).deallocate(chunkedFree); - } else { - // Split batches and any non-chunked batches go through the default deallocation. - super.deallocateBatchBuffer(batch); - } + private MemoryRecordsBuilder chunkedRecordsBuilder(ChunkedByteBufferOutputStream bufferStream, + int firstRecordSize) { + int writeLimit = Math.max(batchSize, firstRecordSize); + return new MemoryRecordsBuilder(bufferStream, RecordBatch.CURRENT_MAGIC_VALUE, compression, + TimestampType.CREATE_TIME, 0L, RecordBatch.NO_TIMESTAMP, RecordBatch.NO_PRODUCER_ID, + RecordBatch.NO_PRODUCER_EPOCH, RecordBatch.NO_SEQUENCE, false, false, + RecordBatch.NO_PARTITION_LEADER_EPOCH, writeLimit); } } diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ProducerBatch.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ProducerBatch.java index a7e47583d310b..6adac7a24424d 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ProducerBatch.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ProducerBatch.java @@ -31,7 +31,6 @@ import org.apache.kafka.common.record.internal.Record; import org.apache.kafka.common.record.internal.RecordBatch; import org.apache.kafka.common.requests.ProduceResponse; -import org.apache.kafka.common.utils.ByteBufferOutputStream; import org.apache.kafka.common.utils.ProducerIdAndEpoch; import org.apache.kafka.common.utils.Time; @@ -58,7 +57,7 @@ * * This class is not thread safe and external synchronization must be used when modifying it */ -public final class ProducerBatch { +public class ProducerBatch { private static final Logger log = LoggerFactory.getLogger(ProducerBatch.class); @@ -141,35 +140,22 @@ boolean hasLeaderChangedForTheOngoingRetry() { /** - * Bytes of physical buffer this batch needs before {@code tryAppend} could accept the given - * record. Returns 0 when the record fits (either logically full, or stream has room). - * Positive when {@code hasRoomFor} allows the record but the underlying stream lacks - * physical capacity — used by the dynamic strategy to allocate exactly the missing bytes - * before retrying. - *

- * For batches with the default {@link ByteBufferOutputStream} (legacy path), the stream's - * {@code remaining()} is large enough that this almost always returns 0. + * Return this batch's buffer memory to the pool. The default single-buffer batch returns + * the pooled buffer at its initial capacity; {@link ChunkedProducerBatch} overrides this + * to return all of its chunks. */ - int extensionBytesNeeded(long timestamp, byte[] key, byte[] value, Header[] headers) { - if (recordCount == 0) - return 0; - if (!recordsBuilder.hasRoomFor(timestamp, key, value, headers)) - return 0; - // Upper-bound estimate is fine: over-counting by header bytes only over-allocates by a - // small fraction of one chunk. Under-estimation would be unsafe. - int recordSize = AbstractRecords.estimateSizeInBytesUpperBound( - magic(), recordsBuilder.compression().type(), key, value, headers); - int gap = recordSize - recordsBuilder.bufferStream().remaining(); - return Math.max(0, gap); + protected void deallocateBuffer(BufferPool pool) { + pool.deallocate(buffer(), initialCapacity()); } /** - * The batch's underlying output stream. Exposed for the dynamic strategy to attach - * extension chunks and route deallocation through the chunked stream rather than the - * standard {@code BufferPool.deallocate(ByteBuffer, int)} path. + * Credit this batch's memory back to the pool when it is unexpectedly still inflight + * (KAFKA-19012): the buffer can't be touched (the network may still be reading it), so the + * default donates a fresh same-capacity buffer. {@link ChunkedProducerBatch} instead returns + * the actual chunks (safe — inflight bytes live in the separate flattened buffer). */ - ByteBufferOutputStream bufferStream() { - return recordsBuilder.bufferStream(); + protected void deallocateInflightBuffer(BufferPool pool) { + pool.deallocate(ByteBuffer.allocate(initialCapacity())); } /** diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/RecordAccumulator.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/RecordAccumulator.java index 29ddd07a489bc..7bb4243e24f9d 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/RecordAccumulator.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/RecordAccumulator.java @@ -398,12 +398,15 @@ protected RecordAppendResult appendNewBatch(String topic, RecordAppendResult appendResult = tryAppend(timestamp, key, value, headers, callbacks, dq, nowMs); if (appendResult != null) { - // Somebody else found us a batch, return the one we waited for! Hopefully this doesn't happen often... + // Propagate without creating a new batch: either another thread already made us a batch + // (success), or — incremental strategy — a concurrent appender created an extendable tail + // (needsBufferExtension), so the caller releases its pre-allocated buffer and retries via + // the extension path. return appendResult; } MemoryRecordsBuilder recordsBuilder = recordsBuilderSupplier.get(); - ProducerBatch batch = new ProducerBatch(new TopicPartition(topic, partition), recordsBuilder, nowMs); + ProducerBatch batch = createProducerBatch(new TopicPartition(topic, partition), recordsBuilder, nowMs); FutureRecordMetadata future = Objects.requireNonNull(batch.tryAppend(timestamp, key, value, headers, callbacks, nowMs)); @@ -413,6 +416,14 @@ protected RecordAppendResult appendNewBatch(String topic, return new RecordAppendResult(future, dq.size() > 1 || batch.isFull(), true, batch.estimatedSizeInBytes()); } + /** + * Create the {@link ProducerBatch} for a new batch. The incremental strategy overrides this to + * create a {@link ChunkedProducerBatch}. + */ + protected ProducerBatch createProducerBatch(TopicPartition tp, MemoryRecordsBuilder recordsBuilder, long nowMs) { + return new ProducerBatch(tp, recordsBuilder, nowMs); + } + /** * Check if all batches in the queue are full. */ @@ -1057,23 +1068,16 @@ public void deallocate(ProducerBatch batch) { } else { batch.markBufferDeallocated(); if (batch.isInflight()) { - // Create a fresh ByteBuffer to give to BufferPool to reuse since we can't safely call deallocate with the ProduceBatch's buffer - free.deallocate(ByteBuffer.allocate(batch.initialCapacity())); + // We can't safely deallocate the buffer of an inflight batch; the batch credits + // the memory back to the pool in a way that suits its buffer type. + batch.deallocateInflightBuffer(free); throw new IllegalStateException("Attempting to deallocate a batch that is inflight. Batch is " + batch); } - deallocateBatchBuffer(batch); + batch.deallocateBuffer(free); } } } - /** - * Return the batch's underlying buffer to the pool. - * This default implementation returns the buffer at its initial capacity (single buffer). - */ - protected void deallocateBatchBuffer(ProducerBatch batch) { - free.deallocate(batch.buffer(), batch.initialCapacity()); - } - /** * Remove from the incomplete list but do not free memory yet */ @@ -1267,17 +1271,41 @@ public static final class RecordAppendResult { public final FutureRecordMetadata future; public final boolean batchIsFull; public final boolean newBatchCreated; + /** + * Signal (incremental strategy) that the tail batch has logical room (writeLimit-wise) but + * its chunks lack physical capacity. The append was NOT attempted; the caller allocates + * {@link #extensionBytesNeeded} bytes of chunk capacity, attaches them via + * {@link ChunkedProducerBatch#addBuffers}, and retries. When {@code true}, {@code future} is null. + */ + public final boolean needsBufferExtension; + public final int extensionBytesNeeded; public final int appendedBytes; public RecordAppendResult(FutureRecordMetadata future, boolean batchIsFull, boolean newBatchCreated, int appendedBytes) { + this(future, batchIsFull, newBatchCreated, false, 0, appendedBytes); + } + + private RecordAppendResult(FutureRecordMetadata future, + boolean batchIsFull, + boolean newBatchCreated, + boolean needsBufferExtension, + int extensionBytesNeeded, + int appendedBytes) { this.future = future; this.batchIsFull = batchIsFull; this.newBatchCreated = newBatchCreated; + this.needsBufferExtension = needsBufferExtension; + this.extensionBytesNeeded = extensionBytesNeeded; this.appendedBytes = appendedBytes; } + + /** A signal-only result indicating the caller must allocate more chunk capacity. */ + static RecordAppendResult needsExtension(int extensionBytesNeeded) { + return new RecordAppendResult(null, false, false, true, extensionBytesNeeded, 0); + } } /* diff --git a/clients/src/main/java/org/apache/kafka/common/record/internal/AbstractRecords.java b/clients/src/main/java/org/apache/kafka/common/record/internal/AbstractRecords.java index d0704d18ec37a..d8eb6c4dd0be7 100644 --- a/clients/src/main/java/org/apache/kafka/common/record/internal/AbstractRecords.java +++ b/clients/src/main/java/org/apache/kafka/common/record/internal/AbstractRecords.java @@ -143,6 +143,31 @@ else if (compressionType != CompressionType.NONE) return Records.LOG_OVERHEAD + LegacyRecord.recordSize(magic, key, value); } + /** + * Upper-bound size of one record's contribution to a batch, excluding the batch header. + * For magic v2 it's {@link #estimateSizeInBytesUpperBound} minus + * {@link #recordBatchHeaderSizeInBytes}; for older magic it equals + * {@link #estimateSizeInBytesUpperBound}. Used by the incremental strategy's cumulative sizing, + * where the header is counted once via {@link MemoryRecordsBuilder#estimatedBytesWritten}. + */ + public static int recordSizeUpperBound(byte magic, CompressionType compressionType, byte[] key, + byte[] value, Header[] headers) { + return recordSizeUpperBound(magic, compressionType, Utils.wrapNullable(key), Utils.wrapNullable(value), headers); + } + + /** + * @see #recordSizeUpperBound(byte, CompressionType, byte[], byte[], Header[]) + */ + private static int recordSizeUpperBound(byte magic, CompressionType compressionType, ByteBuffer key, + ByteBuffer value, Header[] headers) { + if (magic >= RecordBatch.MAGIC_VALUE_V2) + return DefaultRecord.recordSizeUpperBound(key, value, headers); + else if (compressionType != CompressionType.NONE) + return Records.LOG_OVERHEAD + LegacyRecord.recordOverhead(magic) + LegacyRecord.recordSize(magic, key, value); + else + return Records.LOG_OVERHEAD + LegacyRecord.recordSize(magic, key, value); + } + /** * Return the size of the record batch header. * diff --git a/clients/src/main/java/org/apache/kafka/common/record/internal/MemoryRecordsBuilder.java b/clients/src/main/java/org/apache/kafka/common/record/internal/MemoryRecordsBuilder.java index 47a955cfcf18e..2bf48518b2d6f 100644 --- a/clients/src/main/java/org/apache/kafka/common/record/internal/MemoryRecordsBuilder.java +++ b/clients/src/main/java/org/apache/kafka/common/record/internal/MemoryRecordsBuilder.java @@ -212,7 +212,7 @@ public int initialCapacity() { } /** - * The underlying output stream. Exposed so the dynamic-strategy callers can attach + * The underlying output stream. Exposed so the incremental-strategy callers can attach * pre-allocated chunks to a {@code ChunkedByteBufferOutputStream} and route deallocation * polymorphically. */ @@ -824,12 +824,52 @@ private void ensureOpenForRecordBatchWrite() { * @return The estimated number of bytes written */ private int estimatedBytesWritten() { - if (compression.type() == CompressionType.NONE) { + return estimatedBytesWritten(magic, compression.type(), estimatedCompressionRatio, uncompressedRecordsSizeInBytes); + } + + /** + * Static projection of the bytes a builder with the given magic, compression type, and ratio + * would have written for {@code uncompressedRecordsSizeInBytes} of records. Exact for + * uncompressed ({@code header + uncompressedBytes}); for compressed it applies the ratio and 5% + * safety multiplier to the whole total, like the per-builder {@link #estimatedBytesWritten()}. + *

+ * Used by the incremental strategy to size chunk reservations (first record, and mid-batch via + * {@link #estimatedBytesWrittenAfter}). This is the batch's projected total output, + * distinct from the {@link #hasRoomFor} writeLimit gate, which counts the incoming record at + * full uncompressed size; the two coincide for uncompressed data and diverge under compression. + */ + public static int estimatedBytesWritten(byte magic, CompressionType compressionType, + float compressionRatio, + int uncompressedRecordsSizeInBytes) { + int batchHeaderSizeInBytes = AbstractRecords.recordBatchHeaderSizeInBytes(magic, compressionType); + if (compressionType == CompressionType.NONE) { return batchHeaderSizeInBytes + uncompressedRecordsSizeInBytes; } else { - // estimate the written bytes to the underlying byte buffer based on uncompressed written bytes - return batchHeaderSizeInBytes + (int) (uncompressedRecordsSizeInBytes * estimatedCompressionRatio * COMPRESSION_RATE_ESTIMATION_FACTOR); + return batchHeaderSizeInBytes + (int) (uncompressedRecordsSizeInBytes * compressionRatio * COMPRESSION_RATE_ESTIMATION_FACTOR); + } + } + + /** + * Projected value of {@link #estimatedBytesWritten} after appending one more record with the + * given fields, using the record's worst-case (upper-bound) per-record size. Used by the + * incremental strategy to size mid-batch chunk extensions. + */ + public int estimatedBytesWrittenAfter(byte[] key, byte[] value, Header[] headers) { + return estimatedBytesWrittenAfter(wrapNullable(key), wrapNullable(value), headers); + } + + /** + * @see #estimatedBytesWrittenAfter(byte[], byte[], Header[]) + */ + private int estimatedBytesWrittenAfter(ByteBuffer key, ByteBuffer value, Header[] headers) { + final int recordSize; + if (magic < RecordBatch.MAGIC_VALUE_V2) { + recordSize = Records.LOG_OVERHEAD + LegacyRecord.recordSize(magic, key, value); + } else { + recordSize = DefaultRecord.recordSizeUpperBound(key, value, headers); } + return estimatedBytesWritten(magic, compression.type(), estimatedCompressionRatio, + uncompressedRecordsSizeInBytes + recordSize); } /** diff --git a/clients/src/test/java/org/apache/kafka/clients/producer/ProducerConfigTest.java b/clients/src/test/java/org/apache/kafka/clients/producer/ProducerConfigTest.java index 5fd9ab727e046..45615be25dce0 100644 --- a/clients/src/test/java/org/apache/kafka/clients/producer/ProducerConfigTest.java +++ b/clients/src/test/java/org/apache/kafka/clients/producer/ProducerConfigTest.java @@ -129,6 +129,41 @@ public void testInvalidMetadataRecoveryStrategy() { assertTrue(ce.getMessage().contains(CommonClientConfigs.METADATA_RECOVERY_STRATEGY_CONFIG)); } + @Test + public void testDefaultBufferMemoryAllocationStrategy() { + Map configs = new HashMap<>(); + configs.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, keySerializerClass); + configs.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, valueSerializerClass); + configs.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); + final ProducerConfig producerConfig = new ProducerConfig(configs); + assertEquals(ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_FULL, + producerConfig.getString(ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_CONFIG)); + } + + @Test + public void testValidBufferMemoryAllocationStrategy() { + Map configs = new HashMap<>(); + configs.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, keySerializerClass); + configs.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, valueSerializerClass); + configs.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); + configs.put(ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_CONFIG, + ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_INCREMENTAL); + final ProducerConfig producerConfig = new ProducerConfig(configs); + assertEquals(ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_INCREMENTAL, + producerConfig.getString(ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_CONFIG)); + } + + @Test + public void testInvalidBufferMemoryAllocationStrategy() { + Map configs = new HashMap<>(); + configs.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, keySerializerClass); + configs.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, valueSerializerClass); + configs.put(ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_CONFIG, "abc"); + configs.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); + ConfigException ce = assertThrows(ConfigException.class, () -> new ProducerConfig(configs)); + assertTrue(ce.getMessage().contains(ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_CONFIG)); + } + @Test public void testCaseInsensitiveSecurityProtocol() { final String saslSslLowerCase = SecurityProtocol.SASL_SSL.name.toLowerCase(Locale.ROOT); diff --git a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java index b191aadb4645b..191224cd437d8 100644 --- a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java +++ b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java @@ -157,20 +157,6 @@ public void testMidBatchExtensionGrowsExistingBatch() throws Exception { accum.close(); } - /** Compression is rejected at batch creation in PR1 with a clear message. */ - @Test - public void testCompressionThrowsAtBatchCreation() { - int chunkSize = 256; - ChunkedRecordAccumulator accum = newAccumulator(1024, chunkSize, 16L * chunkSize, Compression.gzip().build()); - - UnsupportedOperationException ex = assertThrows(UnsupportedOperationException.class, () -> - accum.append(topic, partition1, 0L, key, new byte[32], Record.EMPTY_HEADERS, null, - maxBlockTimeMs, time.milliseconds(), cluster)); - assertTrue(ex.getMessage().contains("not implemented yet"), - "Expected 'not implemented yet' in message, got: " + ex.getMessage()); - accum.close(); - } - /** * Race between two concurrent appenders on the same chunked tail. Both threads probe * {@code extensionBytesNeeded} under the deque lock against the same physical remaining @@ -347,4 +333,81 @@ public void testBatchCloseDoesNotDeallocateChunksPrematurely() throws Exception accum.close(); } + + /** + * Chunks attached grow with the batch's cumulative projected output, not per-record. With a + * chunkSize smaller than the batch's eventual content, a sequence of small records should + * extend the chunks as the running total (header + uncompressed bytes for NONE) crosses + * chunk boundaries — not allocate the first record's worth and then never extend until + * physical capacity is exhausted (which the per-record formula would do). + */ + @Test + public void testExtensionTracksCumulativeBatchSize() throws Exception { + int chunkSize = 64; + int batchSize = 512; + ChunkedRecordAccumulator accum = newAccumulator(batchSize, chunkSize, 64L * chunkSize, Compression.NONE); + + byte[] smallValue = new byte[24]; + for (int i = 0; i < 6; i++) { + accum.append(topic, partition1, 0L, key, smallValue, Record.EMPTY_HEADERS, null, + maxBlockTimeMs, time.milliseconds(), cluster); + } + + Deque dq = batchesFor(accum, tp1); + assertEquals(1, dq.size()); + ProducerBatch batch = dq.peekFirst(); + assertNotNull(batch); + assertEquals(6, batch.recordCount); + + // batch.close() flattens chunks and writes the header. If chunks under-allocated, this + // throws (insufficient capacity in the underlying buffer). + batch.close(); + MemoryRecords records = batch.records(); + int actualSize = records.sizeInBytes(); + // Under NONE compression, estimatedBytesWritten is exact: physical bytes ≈ header + sum + // of per-record bytes. The chunks attached must cover that, so actualSize must be > + // chunkSize for a multi-record batch with non-trivial content. + assertTrue(actualSize > chunkSize, + "batch should have grown beyond a single chunk; got " + actualSize); + + accum.close(); + } + + /** + * The cumulative sizing formula counts the batch header once. The previous per-record + * formula effectively included the batch header in each record's upper bound, which + * over-allocated as records accumulated. With the cumulative formula, chunk count for N + * small records of total uncompressed size {@code U} is {@code ceil((header + U) / chunkSize)}, + * not {@code N × ceil((header + recordSize) / chunkSize)}. + */ + @Test + public void testCumulativeAccountsForHeaderOnce() throws Exception { + int chunkSize = 256; + int batchSize = 8192; + long totalMemory = 64L * chunkSize; + ChunkedBufferPool pool = new ChunkedBufferPool(totalMemory, chunkSize, metrics, time, "producer-metrics"); + ChunkedRecordAccumulator accum = new ChunkedRecordAccumulator(logContext, batchSize, Compression.NONE, + /* lingerMs */ 0, /* retryBackoffMs */ 0L, /* retryBackoffMaxMs */ 0L, + /* deliveryTimeoutMs */ 3200, metrics, "producer-metrics", time, + /* transactionManager */ null, pool); + + long beforeAlloc = pool.availableMemory(); + // First record establishes the batch. Each subsequent small record contributes its + // uncompressed bytes to the cumulative target; the batch header is NOT re-counted. + byte[] smallValue = new byte[8]; + for (int i = 0; i < 4; i++) { + accum.append(topic, partition1, 0L, key, smallValue, Record.EMPTY_HEADERS, null, + maxBlockTimeMs, time.milliseconds(), cluster); + } + + // Each per-record append adds ~10-30 bytes (key + value + record overhead, V2). Cumulative + // total for 4 records is well below chunkSize=256, so only 1 chunk should ever be attached. + // The per-record formula (header counted once per record) would have allocated more. + long held = beforeAlloc - pool.availableMemory(); + assertEquals(chunkSize, held, + "cumulative formula should hold exactly one chunk for a small-record batch; " + + "header double-counting (per-record formula) would inflate this"); + + accum.close(); + } } From 8c82cd261c2c79d091697f35779bf7b5067cb635 Mon Sep 17 00:00:00 2001 From: Lianet Magrans Date: Thu, 18 Jun 2026 15:27:50 -0400 Subject: [PATCH 04/11] integration tests --- .../kafka/api/BaseProducerSendTest.scala | 10 +- ...ncrementalAllocationProducerSendTest.scala | 94 +++++++++++++++++++ .../scala/unit/kafka/utils/TestUtils.scala | 4 +- 3 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 core/src/test/scala/integration/kafka/api/IncrementalAllocationProducerSendTest.scala diff --git a/core/src/test/scala/integration/kafka/api/BaseProducerSendTest.scala b/core/src/test/scala/integration/kafka/api/BaseProducerSendTest.scala index 71d0dd8273992..1d50bd8910549 100644 --- a/core/src/test/scala/integration/kafka/api/BaseProducerSendTest.scala +++ b/core/src/test/scala/integration/kafka/api/BaseProducerSendTest.scala @@ -101,6 +101,13 @@ abstract class BaseProducerSendTest extends KafkaServerTestHarness { super.tearDown() } + /** + * Additional producer properties applied to every producer created via [[createProducer]]. + * Subclasses override this to run the whole suite under a different producer configuration + * (e.g. a different buffer.memory.allocation.strategy). + */ + protected def producerOverrides: Properties = new Properties() + protected def createProducer(lingerMs: Int = 0, deliveryTimeoutMs: Int = 2 * 60 * 1000, batchSize: Int = 16384, @@ -117,7 +124,8 @@ abstract class BaseProducerSendTest extends KafkaServerTestHarness { deliveryTimeoutMs = deliveryTimeoutMs, maxBlockMs = maxBlockMs, batchSize = batchSize, - bufferSize = bufferSize) + bufferSize = bufferSize, + additionalProperties = Some(producerOverrides)) registerProducer(producer) } diff --git a/core/src/test/scala/integration/kafka/api/IncrementalAllocationProducerSendTest.scala b/core/src/test/scala/integration/kafka/api/IncrementalAllocationProducerSendTest.scala new file mode 100644 index 0000000000000..74bcd892813c7 --- /dev/null +++ b/core/src/test/scala/integration/kafka/api/IncrementalAllocationProducerSendTest.scala @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package kafka.api + +import java.util.concurrent.ThreadLocalRandom +import java.util.{Properties, List => JList} +import kafka.utils.{TestInfoUtils, TestUtils} +import org.apache.kafka.clients.producer.{ProducerConfig, ProducerRecord} +import org.apache.kafka.common.TopicPartition +import org.apache.kafka.common.network.ListenerName +import org.apache.kafka.common.security.auth.SecurityProtocol +import org.junit.jupiter.api.Assertions.{assertArrayEquals, assertEquals} +import org.junit.jupiter.params.ParameterizedTest +import org.junit.jupiter.params.provider.MethodSource + +/** + * Runs the whole [[BaseProducerSendTest]] suite with the producer configured with the incremental + * buffer.memory allocation strategy, plus a few scenarios specific to chunked allocation. + */ +class IncrementalAllocationProducerSendTest extends BaseProducerSendTest { + + override protected def producerOverrides: Properties = { + val props = new Properties() + props.put(ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_CONFIG, + ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_INCREMENTAL) + props + } + + // The incremental strategy does not support compression yet; the base test would fail at + // producer construction. Overriding without the test annotations removes it from this + // subclass's run. TODO: remove this override when compression support lands. + override def testSendCompressedMessageWithCreateTime(groupProtocol: String): Unit = {} + + // batch.size=0 is below the internal chunk size, so the incremental strategy falls back to the full + // allocation path (a batch is smaller than one chunk). Verifies that fallback yields a working producer. + @ParameterizedTest(name = TestInfoUtils.TestWithParameterizedGroupProtocolNames) + @MethodSource(Array("getTestGroupProtocolParametersConsumerGroupProtocolOnly")) + def testBatchSizeZero(groupProtocol: String): Unit = { + sendAndVerify(createProducer( + lingerMs = Int.MaxValue, + deliveryTimeoutMs = Int.MaxValue, + batchSize = 0)) + } + + @ParameterizedTest(name = TestInfoUtils.TestWithParameterizedGroupProtocolNames) + @MethodSource(Array("getTestGroupProtocolParametersConsumerGroupProtocolOnly")) + def testSendLargeRecordsSpanningMultipleChunks(groupProtocol: String): Unit = { + TestUtils.createTopicWithAdmin(admin, topic, brokers, controllerServers, 1, 2) + val partition = 0 + val tp = new TopicPartition(topic, partition) + val valueSize = 600_000 // far larger than one chunk, so each record spans many chunks + // TODO: also exercise the compressed codecs here once the incremental strategy supports compression. + + val producer = createProducer(batchSize = 1024 * 1024, bufferSize = 8L * 1024 * 1024) + val value = randomBytes(valueSize) + val metadata = producer.send(new ProducerRecord(topic, partition, "key".getBytes, value)).get + assertEquals(valueSize, metadata.serializedValueSize) + + // Verify the exact bytes round-trip. + val consumer = TestUtils.createConsumer( + bootstrapServers(listenerName = ListenerName.forSecurityProtocol(SecurityProtocol.PLAINTEXT)), + groupProtocolFromTestParameters()) + try { + consumer.assign(JList.of(tp)) + consumer.seekToBeginning(JList.of(tp)) + val consumed = TestUtils.consumeRecords(consumer, 1) + assertEquals(metadata.offset, consumed.head.offset) + assertArrayEquals(value, consumed.head.value) + } finally { + consumer.close() + } + } + + private def randomBytes(size: Int): Array[Byte] = { + val bytes = new Array[Byte](size) + ThreadLocalRandom.current().nextBytes(bytes) + bytes + } +} diff --git a/core/src/test/scala/unit/kafka/utils/TestUtils.scala b/core/src/test/scala/unit/kafka/utils/TestUtils.scala index bc41c2ed2aeeb..088fe6384ba6d 100755 --- a/core/src/test/scala/unit/kafka/utils/TestUtils.scala +++ b/core/src/test/scala/unit/kafka/utils/TestUtils.scala @@ -491,7 +491,8 @@ object TestUtils extends Logging { saslProperties: Option[Properties] = None, keySerializer: Serializer[K] = new ByteArraySerializer, valueSerializer: Serializer[V] = new ByteArraySerializer, - enableIdempotence: Boolean = false): KafkaProducer[K, V] = { + enableIdempotence: Boolean = false, + additionalProperties: Option[Properties] = None): KafkaProducer[K, V] = { val producerProps = new Properties producerProps.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokerList) producerProps.put(ProducerConfig.ACKS_CONFIG, acks.toString) @@ -504,6 +505,7 @@ object TestUtils extends Logging { producerProps.put(ProducerConfig.BATCH_SIZE_CONFIG, batchSize.toString) producerProps.put(ProducerConfig.COMPRESSION_TYPE_CONFIG, compressionType) producerProps.put(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, enableIdempotence.toString) + additionalProperties.foreach(producerProps ++= _) producerProps ++= JaasTestUtils.producerSecurityConfigs(securityProtocol, OptionConverters.toJava(trustStoreFile), OptionConverters.toJava(saslProperties)) new KafkaProducer[K, V](producerProps, keySerializer, valueSerializer) } From b71d5b716d3427f2cdaea14839ffa16ff83a6ae6 Mon Sep 17 00:00:00 2001 From: Lianet Magrans Date: Thu, 18 Jun 2026 15:54:53 -0400 Subject: [PATCH 05/11] refactor validation & upd docs --- .../kafka/clients/producer/KafkaProducer.java | 2 +- .../producer/internals/ChunkedBufferPool.java | 26 ++++++++++--------- .../internals/ChunkedProducerBatch.java | 5 ++-- .../internals/ChunkedRecordAccumulator.java | 4 +-- .../ChunkedRecordAccumulatorTest.java | 22 ++++++++-------- 5 files changed, 31 insertions(+), 28 deletions(-) diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/KafkaProducer.java b/clients/src/main/java/org/apache/kafka/clients/producer/KafkaProducer.java index 0990a68439171..51835478ea5df 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/KafkaProducer.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/KafkaProducer.java @@ -471,7 +471,7 @@ public KafkaProducer(Properties properties, Serializer keySerializer, Seriali // Use the chunked path only when a batch is at least one full chunk // (batch.size >= CHUNK_SIZE). Below that, a batch can't fill even one chunk, so chunking // would reserve a whole chunk for a batch capped under it (over-reservation); the full - // strategy (reserving up to batch.size per batch) is both leaner and identical to trunk. + // strategy (reserving max(batch.size, record size) per batch) is both leaner and identical to trunk. if (incremental && batchSize >= ChunkedRecordAccumulator.CHUNK_SIZE) { this.accumulator = new ChunkedRecordAccumulator(logContext, batchSize, diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java index 9fa367d80e889..1c40451c28dbe 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java @@ -61,22 +61,11 @@ public ChunkedBufferPool(long memory, int chunkSize, Metrics metrics, Time time, public List allocateChunks(int totalSize, long maxTimeToBlockMs) throws InterruptedException { if (totalSize <= 0) throw new IllegalArgumentException("totalSize must be positive: " + totalSize); - if (totalSize > totalMemory()) - throw new IllegalArgumentException("Attempt to allocate " + totalSize - + " bytes across chunks, but there is a hard limit of " - + totalMemory() + " on memory allocations."); + throwIfChunksNeededExceedsPool(totalSize); int chunkSize = poolableSize(); int numChunks = (int) (((long) totalSize + chunkSize - 1L) / chunkSize); long memoryRequired = (long) numChunks * chunkSize; - // Rounding totalSize up to whole chunks can require more pool memory than buffer.memory - // holds even when totalSize alone fits (the single-buffer "full" path reserves exactly the - // record size, so it never hits this). Reject it here to fail fast, instead of blocking for - // max.block.ms and then throwing BufferExhausted for a request that can never be satisfied. - if (memoryRequired > totalMemory()) - throw new IllegalArgumentException("Attempt to allocate " + numChunks + " chunks of " - + chunkSize + " bytes (" + memoryRequired + " bytes total), but there is a hard limit of " - + totalMemory() + " on memory allocations."); // Chunks pulled from the free list; the remaining bytes are reserved against // nonPooledAvailableMemory and materialized as raw allocations after the lock is released. @@ -196,4 +185,17 @@ public List allocateChunks(int totalSize, long maxTimeToBlockMs) thr } } } + + /** + * Throw if rounding {@code totalSize} up to whole chunks would exceed the pool. + */ + private void throwIfChunksNeededExceedsPool(int totalSize) { + int chunkSize = poolableSize(); + int numChunks = (int) (((long) totalSize + chunkSize - 1L) / chunkSize); + long memoryRequired = (long) numChunks * chunkSize; + if (memoryRequired > totalMemory()) + throw new IllegalArgumentException("Attempt to allocate " + totalSize + " bytes (" + + numChunks + " chunks of " + chunkSize + " = " + memoryRequired + " bytes), but the " + + "hard limit on memory allocations is " + totalMemory() + "."); + } } diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedProducerBatch.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedProducerBatch.java index e80ed7861826d..78a187cf431d7 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedProducerBatch.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedProducerBatch.java @@ -45,8 +45,9 @@ public ChunkedProducerBatch(TopicPartition tp, MemoryRecordsBuilder recordsBuild /** * Bytes of physical buffer this batch needs before {@code tryAppend} could accept the given - * record. Returns 0 when the record fits (either logically full, or the stream's combined - * chunk capacity has room). Positive when {@code hasRoomFor} allows the record but the + * record. Returns 0 when no extension is needed: the batch is empty (first record), it is + * logically full, or the stream's combined chunk capacity already has room. Positive when + * {@code hasRoomFor} allows the record but the * chunks lack physical capacity — the accumulator allocates exactly the missing bytes * (rounded up to whole chunks) and attaches them via {@link #addBuffers} before retrying. */ diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java index a587754d80c44..a16f85dbf9ef7 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java @@ -42,8 +42,8 @@ /** * A {@link RecordAccumulator} variant that uses chunked buffer allocation: each new batch - * pre-allocates {@code ceil(recordSize / chunkSize)} chunks from a {@link ChunkedBufferPool}, - * and grows mid-batch by attaching additional chunks via + * pre-allocates the chunks needed for the first record's estimated size (batch header plus the + * record), and grows mid-batch by attaching additional chunks via * {@link ChunkedProducerBatch#addBuffers(List)} when later records would overflow. *

* This is the "incremental" buffer.memory allocation strategy: memory consumption scales with diff --git a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java index 191224cd437d8..35bc979ae86a4 100644 --- a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java +++ b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java @@ -246,10 +246,10 @@ public List allocateChunks(int totalSize, long maxTimeToBlockMs) thr /** * Regression guard: an inflight chunked batch returns all K chunks to the pool on the * expiration / broker-disconnect path, not just one {@code initialCapacity} chunkSize. - * The parent {@code RecordAccumulator.deallocate} for inflight batches credits only - * {@code initialCapacity} back to the pool and throws; {@link ChunkedRecordAccumulator} - * overrides {@code deallocate} to return all chunks before the throw. Removing that - * override would re-introduce a (K−1)×chunkSize leak and fail this test. + * For an inflight batch the parent {@code RecordAccumulator.deallocate} calls + * {@code batch.deallocateInflightBuffer(pool)} and then throws; {@link ChunkedProducerBatch} + * overrides that hook to return all chunks (instead of donating one fresh buffer). Removing + * that override would re-introduce a (K−1)×chunkSize leak and fail this test. */ @Test public void testInflightExpirationReturnsAllChunksToPool() throws Exception { @@ -297,11 +297,11 @@ private Deque batchesFor(RecordAccumulator accum, TopicPartition /** * Sender's drain (and any other caller of {@code batch.close()}) cascades through * {@code MemoryRecordsBuilder.closeForRecordAppends()} → - * {@code appendStream.close()} → {@code bufferStream.close()}. The chunked stream's - * buffer must remain readable after that cascade so the builder can flatten chunks into - * the heap buffer it sends over the network. If the chunked stream's {@code close()} - * prematurely returns chunks to the pool, the flattened buffer is empty and the header - * write fails (or produces an empty record set). + * {@code appendStream.close()} → {@code bufferStream.close()}. The chunk data must remain + * readable through that cascade so the builder can flatten the chunks into the heap buffer it + * sends over the network — chunks are returned to the pool only at batch completion + * (deallocate), never at close. If they were released at close, the flattened buffer would be + * empty and the header write would fail (or produce an empty record set). */ @Test public void testBatchCloseDoesNotDeallocateChunksPrematurely() throws Exception { @@ -316,7 +316,7 @@ public void testBatchCloseDoesNotDeallocateChunksPrematurely() throws Exception ProducerBatch batch = dq.peekFirst(); assertNotNull(batch); - // Matches the call sites in RecordAccumulator.drain (line 941) and Sender. + // Matches the call sites in RecordAccumulator.drain and Sender. batch.close(); MemoryRecords records = batch.records(); assertTrue(records.sizeInBytes() > 0, @@ -364,7 +364,7 @@ public void testExtensionTracksCumulativeBatchSize() throws Exception { batch.close(); MemoryRecords records = batch.records(); int actualSize = records.sizeInBytes(); - // Under NONE compression, estimatedBytesWritten is exact: physical bytes ≈ header + sum + // Under NONE compression, estimatedBytesWritten is exact: physical bytes = header + sum // of per-record bytes. The chunks attached must cover that, so actualSize must be > // chunkSize for a multi-record batch with non-trivial content. assertTrue(actualSize > chunkSize, From bdc14e90f471b4c11fd881c98de4f165c009afc5 Mon Sep 17 00:00:00 2001 From: Lianet Magrans Date: Mon, 22 Jun 2026 09:54:12 -0400 Subject: [PATCH 06/11] update config --- .../apache/kafka/clients/producer/KafkaProducer.java | 3 +-- .../kafka/clients/producer/ProducerConfig.java | 12 ++++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/KafkaProducer.java b/clients/src/main/java/org/apache/kafka/clients/producer/KafkaProducer.java index 51835478ea5df..3b13014768281 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/KafkaProducer.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/KafkaProducer.java @@ -470,8 +470,7 @@ public KafkaProducer(Properties properties, Serializer keySerializer, Seriali } // Use the chunked path only when a batch is at least one full chunk // (batch.size >= CHUNK_SIZE). Below that, a batch can't fill even one chunk, so chunking - // would reserve a whole chunk for a batch capped under it (over-reservation); the full - // strategy (reserving max(batch.size, record size) per batch) is both leaner and identical to trunk. + // would over-reserve. if (incremental && batchSize >= ChunkedRecordAccumulator.CHUNK_SIZE) { this.accumulator = new ChunkedRecordAccumulator(logContext, batchSize, diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/ProducerConfig.java b/clients/src/main/java/org/apache/kafka/clients/producer/ProducerConfig.java index 286f3e37dc823..696e9c103118f 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/ProducerConfig.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/ProducerConfig.java @@ -234,12 +234,12 @@ public class ProducerConfig extends AbstractConfig { public static final String BUFFER_MEMORY_ALLOCATION_STRATEGY_INCREMENTAL = "incremental"; private static final String BUFFER_MEMORY_ALLOCATION_STRATEGY_DOC = "Controls how the producer allocates memory from " + BUFFER_MEMORY_CONFIG + " for record batches. The following values are supported: " + "

    " - + "
  • " + BUFFER_MEMORY_ALLOCATION_STRATEGY_FULL + ": reserve " + BATCH_SIZE_CONFIG + " bytes up front for each in-progress batch, " - + "regardless of the actual size of the records appended to it.
  • " - + "
  • " + BUFFER_MEMORY_ALLOCATION_STRATEGY_INCREMENTAL + ": allocate memory on demand as records are appended, growing each batch " - + "up to " + BATCH_SIZE_CONFIG + " bytes. This keeps memory usage proportional to the data actually buffered rather than to the number of " - + "active partitions, which allows larger " + BATCH_SIZE_CONFIG + " values (e.g. for high-latency clusters) without reserving " - + "" + BATCH_SIZE_CONFIG + " bytes for every active partition.
  • " + + "
  • " + BUFFER_MEMORY_ALLOCATION_STRATEGY_FULL + ": reserves a full " + BATCH_SIZE_CONFIG + " up front when a batch is created, " + + "regardless of how much data it ends up holding. Pool memory therefore scales with the number of active partitions.
  • " + + "
  • " + BUFFER_MEMORY_ALLOCATION_STRATEGY_INCREMENTAL + ": allocates memory on demand as records are appended, growing a batch " + + "up to " + BATCH_SIZE_CONFIG + ". Pool memory therefore scales with the data actually buffered rather than the number of active " + + "partitions, allowing larger " + BATCH_SIZE_CONFIG + " values (e.g. for high-latency clusters) without reserving " + + "" + BATCH_SIZE_CONFIG + " for every active partition.
  • " + "
"; /** retry.backoff.ms */ From 4e914999d0af74b25e3714d8cc118194b550ea49 Mon Sep 17 00:00:00 2001 From: Lianet Magrans Date: Mon, 22 Jun 2026 14:55:55 -0400 Subject: [PATCH 07/11] fix for partition change & misc --- .../producer/internals/BufferPool.java | 1 - .../producer/internals/ChunkedBufferPool.java | 14 ++-- .../ChunkedByteBufferOutputStream.java | 21 ++++- .../internals/ChunkedRecordAccumulator.java | 12 ++- .../ChunkedByteBufferOutputStreamTest.java | 22 +++++ .../ChunkedRecordAccumulatorTest.java | 82 +++++++++++++++++++ 6 files changed, 139 insertions(+), 13 deletions(-) diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/BufferPool.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/BufferPool.java index f376c0fe2f4a3..c515cf0a8d79f 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/BufferPool.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/BufferPool.java @@ -59,7 +59,6 @@ public class BufferPool { private final Metrics metrics; protected final Time time; private final Sensor waitTime; - /** True once {@link #close()} has been invoked. */ protected boolean closed; /** diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java index 1c40451c28dbe..c81fef81cffbe 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java @@ -30,10 +30,7 @@ /** * A {@link BufferPool} dedicated to chunk-sized buffer reuse (chunk size = {@link #poolableSize()}). *

- * Adds {@link #allocateChunks(int, long)} to acquire multiple chunks atomically: one lock - * acquisition and a single {@link Condition} on {@link #waiters} (FIFO-fair against single-chunk - * acquirers), with any failure (timeout, close, OOM) refunding the whole reservation before the - * exception propagates. No partial holds are visible during the wait. + * Adds {@link #allocateChunks(int, long)} to acquire multiple chunks atomically. */ public class ChunkedBufferPool extends BufferPool { @@ -44,9 +41,10 @@ public ChunkedBufferPool(long memory, int chunkSize, Metrics metrics, Time time, /** * Allocate {@code ceil(totalSize / chunkSize)} chunk-sized buffers atomically, mirroring * {@link BufferPool#allocate}: satisfied immediately if memory is available, else blocks up to - * {@code maxTimeToBlockMs} for the whole request (FIFO on {@link #waiters}). The reservation is - * tracked as bytes against {@link #nonPooledAvailableMemory} plus chunks polled from - * {@link #free}; on any failure path every reserved byte is returned and the next waiter signaled. + * {@code maxTimeToBlockMs} for the whole request (FIFO on {@link #waiters}). + * The reservation is tracked as bytes against {@link #nonPooledAvailableMemory} plus chunks polled + * from {@link #free}. Any failure refunds the whole reservation and signals + * the next waiter before the exception propagates, so no partial holds are visible during the wait. * * @param totalSize minimum total bytes of capacity required across the returned chunks * @param maxTimeToBlockMs maximum time in milliseconds to block waiting for memory @@ -67,7 +65,7 @@ public List allocateChunks(int totalSize, long maxTimeToBlockMs) thr int numChunks = (int) (((long) totalSize + chunkSize - 1L) / chunkSize); long memoryRequired = (long) numChunks * chunkSize; - // Chunks pulled from the free list; the remaining bytes are reserved against + // Chunks taken from the free list. The remaining bytes are reserved against // nonPooledAvailableMemory and materialized as raw allocations after the lock is released. List pooled = new ArrayList<>(numChunks); diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java index 3b9a9ed42ff5e..89110438381f9 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java @@ -59,9 +59,8 @@ public class ChunkedByteBufferOutputStream extends ByteBufferOutputStream { * @param chunkSize the size of each chunk in bytes * @param pool the buffer pool used for deallocation */ - @SuppressWarnings("this-escape") public ChunkedByteBufferOutputStream(List initialChunks, int chunkSize, BufferPool pool) { - super(initialChunks.get(0)); + super(validatedFirstChunk(initialChunks, chunkSize)); this.chunkSize = chunkSize; this.pool = pool; this.chunks = new ArrayList<>(initialChunks); @@ -70,6 +69,21 @@ public ChunkedByteBufferOutputStream(List initialChunks, int chunkSi this.dirty = true; } + /** + * Validates the chunk contract: {@code initialChunks} non-empty, each chunk's capacity equal to + * {@code chunkSize}. Returns the first chunk. + */ + private static ByteBuffer validatedFirstChunk(List initialChunks, int chunkSize) { + if (initialChunks == null || initialChunks.isEmpty()) + throw new IllegalArgumentException("initialChunks must be non-empty"); + for (ByteBuffer chunk : initialChunks) { + if (chunk.capacity() != chunkSize) + throw new IllegalArgumentException("each chunk must have capacity " + chunkSize + + ", but found a chunk of capacity " + chunk.capacity()); + } + return initialChunks.get(0); + } + @Override public void write(int b) { ensureChunkCapacity(1); @@ -223,7 +237,8 @@ public int limit() { @Override public int initialCapacity() { - return chunks.isEmpty() ? chunkSize : chunks.get(0).capacity(); + // Every chunk is chunkSize (enforced by the constructor), so this is constant. + return chunkSize; } @Override diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java index a16f85dbf9ef7..2e74f3b096261 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java @@ -199,8 +199,18 @@ public RecordAppendResult append(String topic, } synchronized (dq) { - if (partitionChanged(topic, topicInfo, partitionInfo, dq, nowMs, cluster)) + if (partitionChanged(topic, topicInfo, partitionInfo, dq, nowMs, cluster)) { + // The partition switched while we allocated extension chunks off-lock. They + // were sized against the previous partition's tail batch, so they must not be + // attached to a different partition's tail — refund them and let the next + // iteration re-probe the new partition from scratch. + if (extensionChunks != null) { + for (ByteBuffer chunk : extensionChunks) + chunkedFree.deallocate(chunk); + extensionChunks = null; + } continue; + } if (extensionChunks != null) { ProducerBatch last = dq.peekLast(); diff --git a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStreamTest.java b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStreamTest.java index 569c3bbcfcbdc..c96824b09c137 100644 --- a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStreamTest.java +++ b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStreamTest.java @@ -49,6 +49,28 @@ private List chunks(ChunkedBufferPool pool, int chunkSize, int count return pool.allocateChunks(chunkSize * count, 100); } + /** + * The constructor fails fast with {@link IllegalArgumentException} when its chunk contract is + * violated — a null or empty list, or a chunk whose capacity differs from {@code chunkSize} — + * rather than the bare NPE / IndexOutOfBoundsException that {@code initialChunks.get(0)} would + * otherwise throw from the {@code super(...)} call. + */ + @Test + public void testConstructorRejectsInvalidChunks() throws Exception { + int chunkSize = 16; + ChunkedBufferPool p = pool(64, chunkSize); + + assertThrows(IllegalArgumentException.class, + () -> new ChunkedByteBufferOutputStream(null, chunkSize, p)); + assertThrows(IllegalArgumentException.class, + () -> new ChunkedByteBufferOutputStream(Collections.emptyList(), chunkSize, p)); + + // A chunk whose capacity doesn't match chunkSize violates the contract. + List wrongSize = Collections.singletonList(ByteBuffer.allocate(chunkSize + 1)); + assertThrows(IllegalArgumentException.class, + () -> new ChunkedByteBufferOutputStream(wrongSize, chunkSize, p)); + } + /** Writes that fit in the initial chunk: stream produces the bytes back via buffer(). */ @Test public void testSingleChunkWriteRoundtrip() throws Exception { diff --git a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java index 35bc979ae86a4..97bdaa09c29c0 100644 --- a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java +++ b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java @@ -44,6 +44,7 @@ import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -243,6 +244,87 @@ public List allocateChunks(int totalSize, long maxTimeToBlockMs) thr } } + /** + * Regression: when the sticky partition switches while extension chunks are held off-lock, + * those chunks — sized against the previous partition's tail batch — must be refunded + * to the pool on the retry, not carried into the next iteration and attached to a different + * partition's tail (which would over-extend the wrong batch). + *

+ * Deterministic reproduction without threads: the extension path is the only one that calls + * {@code allocateChunks} non-blocking ({@code maxTimeToBlockMs == 0}), so the pool flags when an + * extension allocation has happened; a {@code partitionChanged} override then fires exactly one + * spurious switch on the very next check — which is the second-sync-block check, with the + * extension chunks held. The fix refunds those chunks before the retry; without it they are + * carried and attached instead, so no refund is observed during the append. + */ + @Test + public void testPartitionSwitchRefundsHeldExtensionChunks() throws Exception { + int chunkSize = 128; + long totalMemory = 32L * chunkSize; + + AtomicBoolean extensionAllocated = new AtomicBoolean(false); + AtomicInteger chunkDeallocations = new AtomicInteger(0); + + ChunkedBufferPool pool = new ChunkedBufferPool(totalMemory, chunkSize, metrics, time, "producer-metrics") { + @Override + public List allocateChunks(int totalSize, long maxTimeToBlockMs) throws InterruptedException { + List chunks = super.allocateChunks(totalSize, maxTimeToBlockMs); + // The mid-batch extension path is the only non-blocking caller. + if (maxTimeToBlockMs == 0L) + extensionAllocated.set(true); + return chunks; + } + + @Override + public void deallocate(ByteBuffer buffer) { + chunkDeallocations.incrementAndGet(); + super.deallocate(buffer); + } + }; + + AtomicBoolean switchFired = new AtomicBoolean(false); + ChunkedRecordAccumulator accum = new ChunkedRecordAccumulator(logContext, 8192, Compression.NONE, + /* lingerMs */ 0, /* retryBackoffMs */ 0L, /* retryBackoffMaxMs */ 0L, + /* deliveryTimeoutMs */ 3200, metrics, "producer-metrics", time, + /* transactionManager */ null, pool) { + @Override + protected boolean partitionChanged(String topic, TopicInfo topicInfo, + BuiltInPartitioner.StickyPartitionInfo partitionInfo, + Deque deque, long nowMs, Cluster cluster) { + // Inject one spurious switch, but only once an extension allocation has happened — + // i.e. on the second-sync-block check, while extension chunks are held. + if (extensionAllocated.get() && switchFired.compareAndSet(false, true)) + return true; + return super.partitionChanged(topic, topicInfo, partitionInfo, deque, nowMs, cluster); + } + }; + try { + // Warmup: tiny first record establishes a chunked tail (first-record path, blocking + // allocate — does not flag extensionAllocated). + accum.append(topic, partition1, 0L, key, new byte[1], Record.EMPTY_HEADERS, null, + maxBlockTimeMs, time.milliseconds(), cluster); + int deallocBeforeExtend = chunkDeallocations.get(); + + // Second record overflows the chunk → extension allocated off-lock → injected switch + // fires in the second sync block with those chunks held. + accum.append(topic, partition1, 0L, key, new byte[200], Record.EMPTY_HEADERS, null, + maxBlockTimeMs, time.milliseconds(), cluster); + + assertTrue(switchFired.get(), "the injected partition switch should have fired"); + assertTrue(chunkDeallocations.get() > deallocBeforeExtend, + "extension chunks held at the partition switch must be refunded to the pool; " + + "without the fix they are carried and attached instead"); + + // The record still appends correctly after the switch-retry. + Deque dq = batchesFor(accum, tp1); + assertEquals(1, dq.size()); + assertEquals(2, dq.peekFirst().recordCount, + "second record should still land in the batch after the switch-retry"); + } finally { + accum.close(); + } + } + /** * Regression guard: an inflight chunked batch returns all K chunks to the pool on the * expiration / broker-disconnect path, not just one {@code initialCapacity} chunkSize. From 9ccd47dc673aeaaa44d6c2b9ae6bef28ba811d77 Mon Sep 17 00:00:00 2001 From: Lianet Magrans Date: Tue, 23 Jun 2026 12:59:43 -0400 Subject: [PATCH 08/11] test updates & docs --- .../producer/internals/ChunkedBufferPool.java | 6 +- .../ChunkedByteBufferOutputStream.java | 51 +++---- .../internals/ChunkedProducerBatch.java | 14 +- .../internals/ChunkedRecordAccumulator.java | 70 ++++----- .../producer/internals/RecordAccumulator.java | 12 +- .../record/internal/AbstractRecords.java | 7 +- .../record/internal/MemoryRecordsBuilder.java | 16 +-- .../internals/ChunkedBufferPoolTest.java | 51 ++----- .../ChunkedByteBufferOutputStreamTest.java | 135 ++++++++---------- .../ChunkedRecordAccumulatorTest.java | 86 +++-------- 10 files changed, 167 insertions(+), 281 deletions(-) diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java index c81fef81cffbe..f2c8ba35f1568 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPool.java @@ -125,8 +125,6 @@ public List allocateChunks(int totalSize, long maxTimeToBlockMs) thr remainingTimeToBlockNs -= timeNs; // Take pool chunks first, then reserve non-pool bytes for the remainder. - // Pool chunks don't bump `accumulated` because their refund path is - // `free.addFirst` in the outer catch, not nonPool. while (pooled.size() < numChunks && (long) (pooled.size() + 1) * chunkSize + accumulated <= memoryRequired && !free.isEmpty()) { @@ -140,7 +138,7 @@ public List allocateChunks(int totalSize, long maxTimeToBlockMs) thr accumulated += got; } } - // Loop terminated normally — clear the rollback marker (mirrors BufferPool.allocate). + // Clear the rollback tracker. accumulated = 0; } finally { // On failure (timeout / close / interrupt), refund the non-pool bytes taken. @@ -163,7 +161,7 @@ public List allocateChunks(int totalSize, long maxTimeToBlockMs) thr } } - // Allocate raw chunks for the reserved non-pooled portion outside the lock. On OOM, + // Allocate raw chunks for the reserved non-pooled portion outside the lock. On error, // refund all reserved bytes (memoryRequired) and let the next waiter try. int chunksStillNeeded = numChunks - pooled.size(); List result = new ArrayList<>(numChunks); diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java index 89110438381f9..97b89cd1b4416 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java @@ -23,21 +23,20 @@ import java.util.List; /** - * A {@link ByteBufferOutputStream} backed by a linked list of fixed-size chunks instead of a - * single re-allocated buffer. + * A {@link ByteBufferOutputStream} backed by a linked list of fixed-size chunks instead of a single + * re-allocated buffer. Chunks are supplied by the caller (initial chunks via the constructor, + * additional chunks via {@link #addBuffers(List)}). *

- * This stream does not grow on its own. Chunks are supplied by the caller — initial - * chunks via the constructor, additional chunks via {@link #addBuffers(List)}. When a write's - * size exceeds the stream's {@link #remaining()} (sum of free bytes across all attached - * chunks), {@link IllegalStateException} is thrown. The caller is responsible for attaching - * additional chunks before any such write. - *

- * Automatic mid-write growth (allocating from inside {@code write()}) is a follow-up tied to - * compression support, where compressor buffering can make a write exceed its reservation; until - * then the write path stays allocation-free and deterministic. - *

- * When {@link #buffer()} is called (typically at batch close), all chunks are flattened into a - * single contiguous ByteBuffer — exactly one copy operation. + * Current/temporary behavior: + *

    + *
  • The stream does not grow on its own: a write whose size exceeds the remaining free bytes + * across all attached chunks throws {@link IllegalStateException}, so the caller must attach + * enough chunks before any such write. + * TODO: KAFKA-20579 (automatic mid-write growth for compression support).
  • + *
  • {@link #buffer()} returns the written bytes as a single contiguous {@link ByteBuffer}, + * flattening all chunks into a new buffer with an extra copy. + * TODO: KAFKA-20580 (remove the extra copy on send, scatter-gather send).
  • + *
*/ public class ChunkedByteBufferOutputStream extends ByteBufferOutputStream { @@ -51,10 +50,10 @@ public class ChunkedByteBufferOutputStream extends ByteBufferOutputStream { /** * Constructs a chunked output stream backed by the given pre-allocated chunks. Ownership of - * {@code initialChunks} transfers to this stream — they will be returned to the pool via - * {@link #deallocate()}. + * {@code initialChunks} transfers to this stream (they will be returned to the pool via + * {@link #deallocate()}). * - * @param initialChunks pre-allocated chunks; must be non-empty; each chunk's capacity must + * @param initialChunks pre-allocated chunks. Must be non-empty and each chunk's capacity must * equal {@code chunkSize} * @param chunkSize the size of each chunk in bytes * @param pool the buffer pool used for deallocation @@ -123,17 +122,12 @@ private void ensureChunkCapacity(int needed) { } /** - * Advances {@code currentChunk} to the next pre-supplied chunk; throws if none is left. The - * caller must attach additional chunks via {@link #addBuffers(List)} before any write that - * exceeds {@link #remaining()}. (Mid-record growth — non-blocking pool then heap — is a - * follow-up that lands with compression support.) + * Advances {@code currentChunk} to the next pre-supplied chunk. */ private void advanceToNextChunk() { if (currentChunkIndex + 1 >= chunks.size()) { - throw new IllegalStateException( - "No more chunks available; the write exceeded the stream's remaining capacity. " - + "Caller must obtain additional chunks (e.g. from ChunkedBufferPool) and attach them " - + "via addBuffers before any write whose size exceeds remaining()"); + // TODO: KAFKA-20579. With compression support, grow here instead of throwing. + throw new IllegalStateException("write exceeded the stream's remaining chunk capacity"); } currentChunkIndex++; currentChunk = chunks.get(currentChunkIndex); @@ -154,9 +148,7 @@ public ByteBuffer buffer() { } // TODO: KAFKA-20687. This flatten runs at batch close, when the chunk set is final. // Today all chunks (used and unused) are returned to the pool only when the batch - // completes (via deallocate(pool)). Consider releasing the fully-unused chunks - // early, at close, rather than holding them until completion — removing them from - // `chunks` here so the completion-time deallocate(pool) does not double-return them. + // completes (via deallocate(pool)). Consider releasing the fully-unused chunks early, here. int totalSize = 0; for (ByteBuffer chunk : chunks) { totalSize += chunk.position(); @@ -237,14 +229,13 @@ public int limit() { @Override public int initialCapacity() { - // Every chunk is chunkSize (enforced by the constructor), so this is constant. return chunkSize; } @Override public void ensureRemaining(int remainingBytesRequired) { // A single call can guarantee at most `chunkSize` of space (the stream advances one chunk - // at a time); callers needing more attach chunks via addBuffers first. write(byte[]) loops + // at a time). Callers needing more attach chunks via addBuffers first. write(byte[]) loops // across chunks, so contiguous capacity isn't required. ensureChunkCapacity(Math.min(remainingBytesRequired, chunkSize)); } diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedProducerBatch.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedProducerBatch.java index 78a187cf431d7..05c290737fe24 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedProducerBatch.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedProducerBatch.java @@ -25,7 +25,7 @@ import java.util.List; /** - * A {@link ProducerBatch} for the incremental (chunked) buffer.memory allocation strategy, backed + * A {@link ProducerBatch} for the incremental buffer.memory allocation strategy, backed * by a {@link MemoryRecordsBuilder} whose stream is a {@link ChunkedByteBufferOutputStream}. * It adds mid-batch chunk extension support ({@link #extensionBytesNeeded} / * {@link #addBuffers}) and overrides the pool deallocation hooks so all chunks are returned to @@ -44,12 +44,12 @@ public ChunkedProducerBatch(TopicPartition tp, MemoryRecordsBuilder recordsBuild } /** - * Bytes of physical buffer this batch needs before {@code tryAppend} could accept the given - * record. Returns 0 when no extension is needed: the batch is empty (first record), it is - * logically full, or the stream's combined chunk capacity already has room. Positive when - * {@code hasRoomFor} allows the record but the - * chunks lack physical capacity — the accumulator allocates exactly the missing bytes - * (rounded up to whole chunks) and attaches them via {@link #addBuffers} before retrying. + * Bytes of chunk capacity this batch needs before {@code tryAppend} could accept the given + * record. Returns 0 when no extension is needed: the batch is empty (first record), it is at + * its batch-size limit, or the attached chunk capacity already has room. Positive when the + * record is within the batch-size limit but the attached chunks lack capacity — the accumulator + * then allocates exactly the missing bytes (rounded up to whole chunks) and attaches them via + * {@link #addBuffers} before retrying. */ int extensionBytesNeeded(long timestamp, byte[] key, byte[] value, Header[] headers) { if (recordCount == 0) diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java index 2e74f3b096261..297dac0dca91f 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulator.java @@ -41,22 +41,12 @@ import java.util.List; /** - * A {@link RecordAccumulator} variant that uses chunked buffer allocation: each new batch - * pre-allocates the chunks needed for the first record's estimated size (batch header plus the - * record), and grows mid-batch by attaching additional chunks via - * {@link ChunkedProducerBatch#addBuffers(List)} when later records would overflow. + * A {@link RecordAccumulator} variant that backs each batch with fixed-size chunks drawn from a + * {@link ChunkedBufferPool}, attaching more chunks on demand as records are appended instead of + * reserving {@code batch.size} per batch up front. Buffered memory therefore scales with the data + * actually written rather than with {@code active_partition_count × batch.size}. *

- * This is the "incremental" buffer.memory allocation strategy: memory consumption scales with - * actual data buffered, not with {@code active_partition_count × batch.size}. - *

- * When a record arrives mid-batch (an open batch already exists) and the batch's chunks are - * full, the flow is: under the deque lock, probe that open batch — the tail of the partition's - * deque — via the {@link #tryAppend} override, which returns - * {@link RecordAppendResult#needsExtension(int)} when the batch has logical room but lacks - * physical chunk capacity, then attempt a non-blocking pool acquire for the extra chunks. If the pool is exhausted, close the current batch (so the sender can drain it) - * and route the record through the first-record path, which blocks on the pool up to - * {@code max.block.ms}. As a result, unlike the "full" strategy, {@code send()} may block (or - * fail with {@link BufferExhaustedException}) on records other than the first of a batch. + * See {@link #append} and {@link #tryAppend} for how batches are created and grown. *

* TODO: support compressed data (with mid-record growth); the constructor rejects compression for now. */ @@ -128,7 +118,7 @@ public RecordAppendResult append(String topic, appendsInProgress.incrementAndGet(); ChunkedByteBufferOutputStream bufferStream = null; List extensionChunks = null; - int extensionBytes = 0; + int extensionBytes; if (headers == null) headers = Record.EMPTY_HEADERS; try { while (true) { @@ -148,11 +138,11 @@ public RecordAppendResult append(String topic, if (partitionChanged(topic, topicInfo, partitionInfo, dq, nowMs, cluster)) continue; - // The tryAppend override probes the physical capacity of the tail batch - // (dq.peekLast(), the partition's open batch): a needsBufferExtension result - // means it has logical room but its chunks lack space for this record — fall - // through to allocate the gap outside the deque lock. A null result means the - // tail batch is full or absent — fall through to the first-record (new batch) path. + // The tryAppend override checks the open batch (dq.peekLast()) for chunk + // capacity: a needsBufferExtension result means it is within its batch-size limit + // but its chunks lack capacity for this record — fall through to allocate the gap + // outside the deque lock. A null result means there is no open batch (it is full + // or absent) — fall through to the first-record (new batch) path. RecordAppendResult appendResult = tryAppend(timestamp, key, value, headers, callbacks, dq, nowMs); if (appendResult != null && !appendResult.needsBufferExtension) { boolean enableSwitch = allBatchesFull(dq); @@ -162,7 +152,7 @@ public RecordAppendResult append(String topic, extensionBytes = appendResult == null ? 0 : appendResult.extensionBytesNeeded; } - if (extensionBytes > 0 && extensionChunks == null) { + if (extensionBytes > 0) { // Mid-batch extension: non-blocking only. The thread already holds the open // batch's chunks, so blocking here could deadlock with the Sender (which frees // pool memory by completing batches). On exhaustion, close the batch (making it @@ -201,9 +191,9 @@ public RecordAppendResult append(String topic, synchronized (dq) { if (partitionChanged(topic, topicInfo, partitionInfo, dq, nowMs, cluster)) { // The partition switched while we allocated extension chunks off-lock. They - // were sized against the previous partition's tail batch, so they must not be - // attached to a different partition's tail — refund them and let the next - // iteration re-probe the new partition from scratch. + // were sized against the previous partition's open batch, so they must not be + // attached to a different partition's open batch — refund them and let the next + // iteration re-check the new partition from scratch. if (extensionChunks != null) { for (ByteBuffer chunk : extensionChunks) chunkedFree.deallocate(chunk); @@ -214,10 +204,10 @@ public RecordAppendResult append(String topic, if (extensionChunks != null) { ProducerBatch last = dq.peekLast(); - // The off-lock allocateChunks window allows the probed tail batch to be + // The off-lock allocateChunks window allows the open batch we checked to be // drained and replaced — possibly by a split batch (a plain // ProducerBatch), which can't take extension chunks. Only attach to a - // writable chunked tail; otherwise refund the chunks and re-evaluate. + // writable chunked batch; otherwise refund the chunks and re-evaluate. if (last instanceof ChunkedProducerBatch && last.isWritable()) { ((ChunkedProducerBatch) last).addBuffers(extensionChunks); extensionChunks = null; @@ -228,27 +218,28 @@ public RecordAppendResult append(String topic, return retryResult; } // needsBufferExtension: a concurrent appender consumed capacity our - // extension was sized against — loop to re-probe. null: batch became + // extension was sized against — loop to re-check. null: batch became // full — loop into new-batch creation. Terminates because writeLimit is - // fixed: once full, the probe stops requesting extension. Regression: - // testConcurrentExtensionRaceDoesNotOverflowChunkedStream. + // fixed: once full, the check stops requesting extension. continue; } - // Tail is gone, closed, or non-chunked (e.g., a split batch). Return chunks to pool. + // The open batch is gone, closed, or non-chunked (e.g., a split batch). Return chunks to pool. for (ByteBuffer chunk : extensionChunks) chunkedFree.deallocate(chunk); extensionChunks = null; continue; } - // bufferStream is non-null here (first-record path). + // First-record path: extensionChunks == null here implies extensionBytes == 0, + // so bufferStream was allocated (this iteration or carried from a prior one). + assert bufferStream != null; int firstRecordSize = AbstractRecords.estimateSizeInBytesUpperBound( RecordBatch.CURRENT_MAGIC_VALUE, compression.type(), key, value, headers); final ChunkedByteBufferOutputStream batchStream = bufferStream; RecordAppendResult appendResult = appendNewBatch(topic, effectivePartition, dq, timestamp, key, value, headers, callbacks, () -> chunkedRecordsBuilder(batchStream, firstRecordSize), nowMs); if (appendResult.needsBufferExtension) { - // A concurrent appender created a tail batch we should extend rather + // A concurrent appender created an open batch we should extend rather // than start a new one (detected by appendNewBatch's in-lock tryAppend). // Our bufferStream was sized for a fresh batch — release it and loop so // the extension path allocates exactly the gap-sized chunks. @@ -277,8 +268,8 @@ public RecordAppendResult append(String topic, /** * Try to append to a ProducerBatch, with mid-batch chunk extension support. *

- * If the tail batch has logical room (writeLimit-wise) but its chunked stream lacks - * physical capacity, returns {@link RecordAppendResult#needsExtension(int)} without + * If the open batch is within its batch-size limit but its chunked stream lacks chunk + * capacity, returns {@link RecordAppendResult#needsExtension(int)} without * attempting the append; the caller allocates chunks outside the deque lock, attaches * them, and retries. Otherwise defers to the parent. */ @@ -289,7 +280,7 @@ protected RecordAppendResult tryAppend(long timestamp, byte[] key, byte[] value, throw new KafkaException("Producer closed while send in progress"); ProducerBatch last = deque.peekLast(); // Split batches in an incremental deque are plain ProducerBatch (heap-backed, grow-on-demand) - // and never need chunk extension, so the probe only applies to chunked batches. + // and never need chunk extension, so the check only applies to chunked batches. if (last instanceof ChunkedProducerBatch) { int extensionBytes = ((ChunkedProducerBatch) last).extensionBytesNeeded(timestamp, key, value, headers); if (extensionBytes > 0) @@ -304,10 +295,9 @@ protected ProducerBatch createProducerBatch(TopicPartition tp, MemoryRecordsBuil } /** - * Build a {@link MemoryRecordsBuilder} backed by the chunked stream. {@code writeLimit} = - * {@code max(batchSize, firstRecordSize)} (the same logical cap as the full path) bounds when - * the batch is full; physical capacity grows on demand via - * {@link ChunkedByteBufferOutputStream#addBuffers(List)}. + * Build a {@link MemoryRecordsBuilder} backed by the chunked stream. Its {@code writeLimit} + * bounds when the batch is full (the same batch-size limit as the full path), while chunk + * capacity grows on demand via {@link ChunkedByteBufferOutputStream#addBuffers(List)}. */ private MemoryRecordsBuilder chunkedRecordsBuilder(ChunkedByteBufferOutputStream bufferStream, int firstRecordSize) { diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/RecordAccumulator.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/RecordAccumulator.java index 7bb4243e24f9d..4125b29acaf5c 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/RecordAccumulator.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/RecordAccumulator.java @@ -379,9 +379,9 @@ public RecordAppendResult append(String topic, * @param headers the Headers for the record * @param callbacks The callbacks to execute * @param recordsBuilderSupplier Supplies the {@link MemoryRecordsBuilder} for the new - * batch. Invoked lazily, only when this method is committed to creating a new - * batch (i.e. after the in-lock concurrent-append race check has lost). The - * chunked subclass passes a supplier that produces a stream-backed builder. + * batch. Invoked lazily, only when a new batch is actually created. The chunked + * subclass passes a supplier that produces a builder backed by a + * {@link ChunkedByteBufferOutputStream}. * @param nowMs The current time, in milliseconds */ protected RecordAppendResult appendNewBatch(String topic, @@ -399,7 +399,7 @@ protected RecordAppendResult appendNewBatch(String topic, RecordAppendResult appendResult = tryAppend(timestamp, key, value, headers, callbacks, dq, nowMs); if (appendResult != null) { // Propagate without creating a new batch: either another thread already made us a batch - // (success), or — incremental strategy — a concurrent appender created an extendable tail + // (success), or — incremental strategy — a concurrent appender created an extendable open batch // (needsBufferExtension), so the caller releases its pre-allocated buffer and retries via // the extension path. return appendResult; @@ -1272,8 +1272,8 @@ public static final class RecordAppendResult { public final boolean batchIsFull; public final boolean newBatchCreated; /** - * Signal (incremental strategy) that the tail batch has logical room (writeLimit-wise) but - * its chunks lack physical capacity. The append was NOT attempted; the caller allocates + * Signal (incremental strategy) that the open batch is within its batch-size limit but its + * chunks lack capacity. The append was NOT attempted; the caller allocates * {@link #extensionBytesNeeded} bytes of chunk capacity, attaches them via * {@link ChunkedProducerBatch#addBuffers}, and retries. When {@code true}, {@code future} is null. */ diff --git a/clients/src/main/java/org/apache/kafka/common/record/internal/AbstractRecords.java b/clients/src/main/java/org/apache/kafka/common/record/internal/AbstractRecords.java index d8eb6c4dd0be7..4e6e32d75a556 100644 --- a/clients/src/main/java/org/apache/kafka/common/record/internal/AbstractRecords.java +++ b/clients/src/main/java/org/apache/kafka/common/record/internal/AbstractRecords.java @@ -144,11 +144,8 @@ else if (compressionType != CompressionType.NONE) } /** - * Upper-bound size of one record's contribution to a batch, excluding the batch header. - * For magic v2 it's {@link #estimateSizeInBytesUpperBound} minus - * {@link #recordBatchHeaderSizeInBytes}; for older magic it equals - * {@link #estimateSizeInBytesUpperBound}. Used by the incremental strategy's cumulative sizing, - * where the header is counted once via {@link MemoryRecordsBuilder#estimatedBytesWritten}. + * Upper-bound on the bytes a single record contributes to a batch, excluding the batch header. + * Used by the incremental strategy, which counts the header once, separately. */ public static int recordSizeUpperBound(byte magic, CompressionType compressionType, byte[] key, byte[] value, Header[] headers) { diff --git a/clients/src/main/java/org/apache/kafka/common/record/internal/MemoryRecordsBuilder.java b/clients/src/main/java/org/apache/kafka/common/record/internal/MemoryRecordsBuilder.java index 2bf48518b2d6f..611654ed3e595 100644 --- a/clients/src/main/java/org/apache/kafka/common/record/internal/MemoryRecordsBuilder.java +++ b/clients/src/main/java/org/apache/kafka/common/record/internal/MemoryRecordsBuilder.java @@ -212,9 +212,8 @@ public int initialCapacity() { } /** - * The underlying output stream. Exposed so the incremental-strategy callers can attach - * pre-allocated chunks to a {@code ChunkedByteBufferOutputStream} and route deallocation - * polymorphically. + * The underlying output stream, exposed so the incremental strategy can manage its + * chunk-backed stream. */ public ByteBufferOutputStream bufferStream() { return bufferStream; @@ -828,15 +827,12 @@ private int estimatedBytesWritten() { } /** - * Static projection of the bytes a builder with the given magic, compression type, and ratio - * would have written for {@code uncompressedRecordsSizeInBytes} of records. Exact for - * uncompressed ({@code header + uncompressedBytes}); for compressed it applies the ratio and 5% - * safety multiplier to the whole total, like the per-builder {@link #estimatedBytesWritten()}. + * Returns the projected number of bytes the builder would write for + * {@code uncompressedRecordsSizeInBytes} of records under the given magic, compression type, + * and ratio: exact for uncompressed, a ratio-aware estimate for compressed. *

* Used by the incremental strategy to size chunk reservations (first record, and mid-batch via - * {@link #estimatedBytesWrittenAfter}). This is the batch's projected total output, - * distinct from the {@link #hasRoomFor} writeLimit gate, which counts the incoming record at - * full uncompressed size; the two coincide for uncompressed data and diverge under compression. + * {@link #estimatedBytesWrittenAfter}). */ public static int estimatedBytesWritten(byte magic, CompressionType compressionType, float compressionRatio, diff --git a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPoolTest.java b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPoolTest.java index e0de8ae19e4dc..45e9aceb12cf5 100644 --- a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPoolTest.java +++ b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPoolTest.java @@ -19,6 +19,7 @@ import org.apache.kafka.clients.producer.BufferExhaustedException; import org.apache.kafka.common.metrics.Metrics; import org.apache.kafka.common.utils.MockTime; +import org.apache.kafka.test.TestUtils; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; @@ -32,6 +33,7 @@ import java.util.concurrent.atomic.AtomicReference; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertThrows; @@ -41,7 +43,6 @@ public class ChunkedBufferPoolTest { private final MockTime time = new MockTime(); private final Metrics metrics = new Metrics(time); - private final String metricGroup = "producer-metrics"; @AfterEach public void teardown() { @@ -49,6 +50,7 @@ public void teardown() { } private ChunkedBufferPool pool(long totalMemory, int chunkSize) { + String metricGroup = "producer-metrics"; return new ChunkedBufferPool(totalMemory, chunkSize, metrics, time, metricGroup); } @@ -62,7 +64,7 @@ public void testAllocateOneChunk() throws Exception { assertEquals(chunkSize, chunks.get(0).capacity()); } - /** Total size that's not a multiple of chunk size rounds up; returned chunks cover it. */ + /** Total size that's not a multiple of chunk size rounds up */ @Test public void testAllocateRoundsUpToChunkBoundary() throws Exception { int chunkSize = 64; @@ -105,14 +107,12 @@ public void testDeallocationRestoresMemory() throws Exception { assertEquals(total, p.availableMemory()); } - /** Requesting more than total memory throws IllegalArgumentException. */ @Test public void testRejectsRequestExceedingTotalMemory() { ChunkedBufferPool p = pool(128, 64); assertThrows(IllegalArgumentException.class, () -> p.allocateChunks(129, 100)); } - /** Non-positive totalSize is rejected. */ @Test public void testRejectsNonPositiveRequest() { ChunkedBufferPool p = pool(128, 64); @@ -122,7 +122,7 @@ public void testRejectsNonPositiveRequest() { /** * When a multi-chunk request can't be fully satisfied within the deadline, chunks already - * acquired during the call must be returned to the pool — the caller must not leak memory. + * acquired during the call must be returned to the pool. */ @Test public void testRollbackOnPartialFailure() throws Exception { @@ -145,10 +145,7 @@ public void testRollbackOnPartialFailure() throws Exception { /** * No partial chunk holds during the wait. While a multi-chunk request blocks, the pool's - * {@code availableMemory()} must reflect bytes the waiter has not yet "earned" — i.e., the - * reservation is via the accumulator and not via chunks pulled out of the pool's accounting. - * Distinguishes the atomic implementation from the older loop-with-rollback (which held - * partials off the accounting between iterations). + * {@code availableMemory()} must reflect bytes the waiter has not yet "earned". */ @Test public void testNoPartialHoldsDuringWait() throws Exception { @@ -173,10 +170,7 @@ public void testNoPartialHoldsDuringWait() throws Exception { }, "chunked-waiter"); t.start(); // Wait until the thread has joined the waiters queue. - long deadlineMs = System.currentTimeMillis() + 2_000; - while (p.queued() == 0 && System.currentTimeMillis() < deadlineMs) - Thread.sleep(5); - assertEquals(1, p.queued(), "waiter should be parked on the pool's queue"); + TestUtils.waitForCondition(() -> p.queued() == 1, "waiter should be parked on the pool's queue"); // While the waiter is parked, the pool's available memory must still report the // 1 chunk's worth — the waiter has NOT consumed any of it (no partial hold). @@ -235,16 +229,10 @@ public void testFifoFairnessAcrossChunkedAndSingleChunkRequests() throws Excepti tMulti.start(); assertTrue(multiStarted.await(2, TimeUnit.SECONDS)); // Wait for tMulti to be parked before starting tSingle, so the FIFO order is deterministic. - long deadlineMs = System.currentTimeMillis() + 2_000; - while (p.queued() == 0 && System.currentTimeMillis() < deadlineMs) - Thread.sleep(5); - assertEquals(1, p.queued(), "multi-chunk waiter should be the only one queued"); + TestUtils.waitForCondition(() -> p.queued() == 1, "multi-chunk waiter should be the only one queued"); tSingle.start(); // Wait for tSingle to also be parked. - deadlineMs = System.currentTimeMillis() + 2_000; - while (p.queued() < 2 && System.currentTimeMillis() < deadlineMs) - Thread.sleep(5); - assertEquals(2, p.queued(), "single-chunk waiter joined after the multi-chunk one"); + TestUtils.waitForCondition(() -> p.queued() == 2, "single-chunk waiter joined after the multi-chunk one"); // Now free chunks one at a time, allowing the multi-chunk request to accumulate. p.deallocate(h1); @@ -302,10 +290,7 @@ public void testSlowPathDoesNotDoubleRefundPolledChunksOnTimeout() throws Except }, "partial-holder"); t.start(); - long deadline = System.currentTimeMillis() + 2_000; - while (p.queued() == 0 && System.currentTimeMillis() < deadline) - Thread.sleep(5); - assertEquals(1, p.queued()); + TestUtils.waitForCondition(() -> p.queued() == 1, "waiter should be parked on the pool's queue"); // Hand the waiter 2 chunks — it polls them into `pooled` then awaits again for the 3rd. p.deallocate(h1); @@ -315,8 +300,7 @@ public void testSlowPathDoesNotDoubleRefundPolledChunksOnTimeout() throws Except Thread.sleep(100); t.join(5_000); - assertTrue(err.get() instanceof BufferExhaustedException, - "expected BufferExhaustedException, got " + err.get()); + assertInstanceOf(BufferExhaustedException.class, err.get(), "expected BufferExhaustedException, got " + err.get()); // After the throw, exactly 2 chunkSizes of physical memory should be back in the pool // (h1 + h2). h3 is still held outside the pool. The bug double-refunds: it credits @@ -358,10 +342,7 @@ public void testSlowPathSuccessDoesNotCorruptAccounting() throws Exception { }, "slow-success"); t.start(); - long deadline = System.currentTimeMillis() + 2_000; - while (p.queued() == 0 && System.currentTimeMillis() < deadline) - Thread.sleep(5); - assertEquals(1, p.queued()); + TestUtils.waitForCondition(() -> p.queued() == 1, "waiter should be parked on the pool's queue"); // Deallocate 2 chunks → waiter wakes, polls both, accumulated reaches memoryRequired, // exits normally. The success path's finally must NOT refund or decrement nonPool. @@ -406,16 +387,12 @@ public void testCloseDuringAtomicWait() throws Exception { } }, "close-during-wait"); t.start(); - long deadlineMs = System.currentTimeMillis() + 2_000; - while (p.queued() == 0 && System.currentTimeMillis() < deadlineMs) - Thread.sleep(5); - assertEquals(1, p.queued()); + TestUtils.waitForCondition(() -> p.queued() == 1, "waiter should be parked on the pool's queue"); // Close the pool: signals all waiters; the waiter must throw KafkaException. p.close(); t.join(5_000); - assertTrue(err.get() instanceof KafkaException, - "expected KafkaException, got " + err.get()); + assertInstanceOf(KafkaException.class, err.get(), "expected KafkaException, got " + err.get()); p.deallocate(h1); p.deallocate(h2); } diff --git a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStreamTest.java b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStreamTest.java index c96824b09c137..145876730e977 100644 --- a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStreamTest.java +++ b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStreamTest.java @@ -34,7 +34,6 @@ public class ChunkedByteBufferOutputStreamTest { private final MockTime time = new MockTime(); private final Metrics metrics = new Metrics(time); - private final String metricGroup = "test"; @AfterEach public void teardown() { @@ -42,6 +41,7 @@ public void teardown() { } private ChunkedBufferPool pool(long total, int chunkSize) { + String metricGroup = "test"; return new ChunkedBufferPool(total, chunkSize, metrics, time, metricGroup); } @@ -49,14 +49,8 @@ private List chunks(ChunkedBufferPool pool, int chunkSize, int count return pool.allocateChunks(chunkSize * count, 100); } - /** - * The constructor fails fast with {@link IllegalArgumentException} when its chunk contract is - * violated — a null or empty list, or a chunk whose capacity differs from {@code chunkSize} — - * rather than the bare NPE / IndexOutOfBoundsException that {@code initialChunks.get(0)} would - * otherwise throw from the {@code super(...)} call. - */ @Test - public void testConstructorRejectsInvalidChunks() throws Exception { + public void testConstructorRejectsInvalidChunks() { int chunkSize = 16; ChunkedBufferPool p = pool(64, chunkSize); @@ -71,123 +65,110 @@ public void testConstructorRejectsInvalidChunks() throws Exception { () -> new ChunkedByteBufferOutputStream(wrongSize, chunkSize, p)); } - /** Writes that fit in the initial chunk: stream produces the bytes back via buffer(). */ @Test public void testSingleChunkWriteRoundtrip() throws Exception { int chunkSize = 16; ChunkedBufferPool p = pool(64, chunkSize); - ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 1), chunkSize, p); + try (ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 1), chunkSize, p)) { - byte[] payload = new byte[]{1, 2, 3, 4, 5}; - stream.write(payload, 0, payload.length); + byte[] payload = new byte[]{1, 2, 3, 4, 5}; + stream.write(payload, 0, payload.length); - ByteBuffer flat = stream.buffer(); - flat.flip(); - byte[] out = new byte[flat.remaining()]; - flat.get(out); - assertArrayEquals(payload, out); + ByteBuffer flat = stream.buffer(); + flat.flip(); + byte[] out = new byte[flat.remaining()]; + flat.get(out); + assertArrayEquals(payload, out); - stream.deallocate(); + stream.deallocate(); + } } - /** Writes that span multiple pre-supplied chunks land in subsequent chunks in order. */ @Test - public void testWriteSpansChunks() throws Exception { + public void testWriteAcrossMultipleChunks() throws Exception { int chunkSize = 8; ChunkedBufferPool p = pool(64, chunkSize); - ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 3), chunkSize, p); + try (ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 3), chunkSize, p)) { - byte[] payload = new byte[20]; - for (int i = 0; i < payload.length; i++) payload[i] = (byte) i; - stream.write(payload, 0, payload.length); + byte[] payload = new byte[20]; + for (int i = 0; i < payload.length; i++) payload[i] = (byte) i; + stream.write(payload, 0, payload.length); - ByteBuffer flat = stream.buffer(); - flat.flip(); - byte[] out = new byte[flat.remaining()]; - flat.get(out); - assertArrayEquals(payload, out); + ByteBuffer flat = stream.buffer(); + flat.flip(); + byte[] out = new byte[flat.remaining()]; + flat.get(out); + assertArrayEquals(payload, out); - stream.deallocate(); + stream.deallocate(); + } } - /** remaining() sums current + queued chunk capacities and shrinks as writes consume bytes. */ @Test - public void testRemainingSumsChunks() throws Exception { + public void testRemainingSumsFreeBytesAcrossChunks() throws Exception { int chunkSize = 8; ChunkedBufferPool p = pool(64, chunkSize); - ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 2), chunkSize, p); + try (ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 2), chunkSize, p)) { - assertEquals(2 * chunkSize, stream.remaining()); - stream.write(new byte[3], 0, 3); - assertEquals(2 * chunkSize - 3, stream.remaining()); - stream.write(new byte[chunkSize], 0, chunkSize); // crosses into chunk 2 - assertEquals(chunkSize - 3, stream.remaining()); + assertEquals(2 * chunkSize, stream.remaining()); + stream.write(new byte[3], 0, 3); + assertEquals(2 * chunkSize - 3, stream.remaining()); + stream.write(new byte[chunkSize], 0, chunkSize); // crosses into chunk 2 + assertEquals(chunkSize - 3, stream.remaining()); - stream.deallocate(); + stream.deallocate(); + } } - /** addBuffers extends capacity so a write that would have overflowed now fits. */ @Test public void testAddBuffersExtendsStream() throws Exception { int chunkSize = 8; ChunkedBufferPool p = pool(64, chunkSize); - ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 1), chunkSize, p); + try (ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 1), chunkSize, p)) { - // Fill the initial chunk. - stream.write(new byte[chunkSize], 0, chunkSize); - assertEquals(0, stream.remaining()); + // Fill the initial chunk. + stream.write(new byte[chunkSize], 0, chunkSize); + assertEquals(0, stream.remaining()); - // Extend and write more — must land in the new chunk. - stream.addBuffers(Collections.singletonList(p.allocate(chunkSize, 100))); - assertEquals(chunkSize, stream.remaining()); + // Extend and write more — must land in the new chunk. + stream.addBuffers(Collections.singletonList(p.allocate(chunkSize, 100))); + assertEquals(chunkSize, stream.remaining()); - byte[] more = new byte[]{9, 9, 9}; - stream.write(more, 0, more.length); - assertEquals(chunkSize - 3, stream.remaining()); + byte[] more = new byte[]{9, 9, 9}; + stream.write(more, 0, more.length); + assertEquals(chunkSize - 3, stream.remaining()); - stream.deallocate(); + stream.deallocate(); + } } - /** Writing past all pre-supplied (and added) chunks throws — uncompressed-only contract. */ - @Test - public void testOverflowThrows() throws Exception { - int chunkSize = 4; - ChunkedBufferPool p = pool(64, chunkSize); - ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 1), chunkSize, p); - - stream.write(new byte[chunkSize], 0, chunkSize); - assertThrows(IllegalStateException.class, () -> stream.write((byte) 1)); - - stream.deallocate(); - } - - /** position(int) at construction walks across pre-supplied chunks. */ @Test public void testPositionWalksAcrossChunks() throws Exception { int chunkSize = 4; ChunkedBufferPool p = pool(32, chunkSize); - ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 3), chunkSize, p); + try (ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(chunks(p, chunkSize, 3), chunkSize, p)) { - stream.position(6); // straddles chunk 0 (4 bytes) and chunk 1 (2 bytes) - assertEquals(6, stream.position()); - assertEquals(6, stream.remaining()); + stream.position(6); // straddles chunk 0 (4 bytes) and chunk 1 (2 bytes) + assertEquals(6, stream.position()); + assertEquals(6, stream.remaining()); - stream.deallocate(); + stream.deallocate(); + } } - /** deallocate returns all chunks to the pool. */ @Test public void testDeallocateReturnsAllChunks() throws Exception { int chunkSize = 8; long total = 64; ChunkedBufferPool p = pool(total, chunkSize); List initial = chunks(p, chunkSize, 3); - ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(initial, chunkSize, p); - // Also attach one extra chunk so deallocate must handle the added-buffer case too. - stream.addBuffers(Collections.singletonList(p.allocate(chunkSize, 100))); - assertEquals(total - 4L * chunkSize, p.availableMemory()); + try (ChunkedByteBufferOutputStream stream = new ChunkedByteBufferOutputStream(initial, chunkSize, p)) { + // Also attach one extra chunk so deallocate must handle the added-buffer case too. + stream.addBuffers(Collections.singletonList(p.allocate(chunkSize, 100))); + assertEquals(total - 4L * chunkSize, p.availableMemory()); - stream.deallocate(); + stream.deallocate(); + } assertEquals(total, p.availableMemory()); } -} +} \ No newline at end of file diff --git a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java index 97bdaa09c29c0..d658e201083fe 100644 --- a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java +++ b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedRecordAccumulatorTest.java @@ -35,7 +35,6 @@ import java.nio.ByteBuffer; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.Deque; import java.util.List; @@ -65,7 +64,7 @@ public class ChunkedRecordAccumulatorTest { private final PartitionMetadata partMetadata1 = new PartitionMetadata(Errors.NONE, tp1, Optional.of(node1.id()), Optional.empty(), null, null, null); - private final List partMetadatas = new ArrayList<>(Arrays.asList(partMetadata1)); + private final List partMetadatas = new ArrayList<>(List.of(partMetadata1)); private final Map nodes = Stream.of(node1).collect(Collectors.toMap(Node::id, java.util.function.Function.identity())); private final MetadataSnapshot metadataCache = new MetadataSnapshot(null, nodes, partMetadatas, @@ -91,7 +90,6 @@ private ChunkedRecordAccumulator newAccumulator(int batchSize, int chunkSize, lo /* transactionManager */ null, pool); } - /** First record creates a batch backed by chunked allocation. */ @Test public void testFirstRecordCreatesChunkedBatch() throws Exception { int chunkSize = 256; @@ -109,12 +107,8 @@ public void testFirstRecordCreatesChunkedBatch() throws Exception { accum.close(); } - /** - * A small follow-up record fits in the existing batch's pre-allocated chunks — no extension - * needed. - */ @Test - public void testFollowupRecordFitsWithoutExtension() throws Exception { + public void testSmallFollowupRecordFitsWithoutExtension() throws Exception { int chunkSize = 256; ChunkedRecordAccumulator accum = newAccumulator(1024, chunkSize, 16L * chunkSize, Compression.NONE); @@ -125,6 +119,7 @@ public void testFollowupRecordFitsWithoutExtension() throws Exception { Deque dq = batchesFor(accum, tp1); assertEquals(1, dq.size(), "Both records should land in the same batch"); + assertNotNull(dq.peekFirst()); assertEquals(2, dq.peekFirst().recordCount); accum.close(); } @@ -144,6 +139,7 @@ public void testMidBatchExtensionGrowsExistingBatch() throws Exception { Deque dq = batchesFor(accum, tp1); assertEquals(1, dq.size()); + assertNotNull(dq.peekFirst()); int initialRecordCount = dq.peekFirst().recordCount; assertEquals(1, initialRecordCount); @@ -153,26 +149,17 @@ public void testMidBatchExtensionGrowsExistingBatch() throws Exception { // Same batch, now with the second record. assertEquals(1, dq.size(), "Should still be the same batch — extension should not roll"); + assertNotNull(dq.peekFirst()); assertEquals(2, dq.peekFirst().recordCount, "Second record should land in the extended batch"); accum.close(); } /** - * Race between two concurrent appenders on the same chunked tail. Both threads probe - * {@code extensionBytesNeeded} under the deque lock against the same physical remaining - * capacity, drop the lock, and allocate gap-sized chunks off-lock. When the first appender - * re-takes the lock and consumes most of the original remaining, the second appender's - * allocation — sized against the pre-race remaining — is short by the bytes the first - * appender consumed. Without a re-probe in the second-lock block, the second appender - * proceeds to write past the stream's physical capacity and {@code advanceToNextChunk} - * throws {@link IllegalStateException}. - *

- * Coordination: a {@link ChunkedBufferPool} subclass blocks the FIRST {@code allocateChunks} - * call after the test arms a latch (i.e. thread A's). Thread B's allocate is the second call - * and proceeds normally — B re-takes the lock, attaches its chunks, appends its record, and - * returns. Only then is thread A released, so A attaches its chunks and tries to append - * against a tail whose remaining has already shrunk. + * Two concurrent appenders race to extend the same open batch, each sizing its extension + * against the same remaining capacity off-lock; once one attaches its chunks, the other's is + * short. Verifies the second-lock re-check makes the short appender re-check rather than write + * past the stream's capacity, which would fail. */ @Test public void testConcurrentExtensionRaceDoesNotOverflowChunkedStream() throws Exception { @@ -202,7 +189,7 @@ public List allocateChunks(int totalSize, long maxTimeToBlockMs) thr /* deliveryTimeoutMs */ 3200, metrics, "producer-metrics", time, /* transactionManager */ null, pool); try { - // Warmup: tiny first record establishes a chunked tail with a known remainingInStream. + // Warmup: tiny first record establishes an open batch with a known remainingInStream. accum.append(topic, partition1, 0L, key, new byte[1], Record.EMPTY_HEADERS, null, maxBlockTimeMs, time.milliseconds(), cluster); @@ -224,8 +211,8 @@ public List allocateChunks(int totalSize, long maxTimeToBlockMs) thr assertTrue(aHoldsChunks.await(10, TimeUnit.SECONDS), "A did not reach allocateChunks"); // B runs on this thread. Its allocateChunks is the second call — armed=false now, - // so it proceeds without blocking. B probes the same R as A (A hasn't attached yet), - // attaches its own chunks, appends its record. Now the tail's remaining has shrunk + // so it proceeds without blocking. B reads the same R as A (A hasn't attached yet), + // attaches its own chunks, appends its record. Now the open batch's remaining has shrunk // by B's actual record size, but A's still-off-lock allocation didn't know that. accum.append(topic, partition1, 0L, key, new byte[recordValueSize], Record.EMPTY_HEADERS, null, maxBlockTimeMs, time.milliseconds(), cluster); @@ -245,17 +232,9 @@ public List allocateChunks(int totalSize, long maxTimeToBlockMs) thr } /** - * Regression: when the sticky partition switches while extension chunks are held off-lock, - * those chunks — sized against the previous partition's tail batch — must be refunded - * to the pool on the retry, not carried into the next iteration and attached to a different - * partition's tail (which would over-extend the wrong batch). - *

- * Deterministic reproduction without threads: the extension path is the only one that calls - * {@code allocateChunks} non-blocking ({@code maxTimeToBlockMs == 0}), so the pool flags when an - * extension allocation has happened; a {@code partitionChanged} override then fires exactly one - * spurious switch on the very next check — which is the second-sync-block check, with the - * extension chunks held. The fix refunds those chunks before the retry; without it they are - * carried and attached instead, so no refund is observed during the append. + * When the sticky partition switches while extension chunks are held off-lock, those + * chunks (sized against the previous partition's open batch) must be refunded to the pool on the + * retry (not carried over and attached to a different partition's open batch) */ @Test public void testPartitionSwitchRefundsHeldExtensionChunks() throws Exception { @@ -299,7 +278,7 @@ protected boolean partitionChanged(String topic, TopicInfo topicInfo, } }; try { - // Warmup: tiny first record establishes a chunked tail (first-record path, blocking + // Warmup: tiny first record establishes an open batch (first-record path, blocking // allocate — does not flag extensionAllocated). accum.append(topic, partition1, 0L, key, new byte[1], Record.EMPTY_HEADERS, null, maxBlockTimeMs, time.milliseconds(), cluster); @@ -318,6 +297,7 @@ protected boolean partitionChanged(String topic, TopicInfo topicInfo, // The record still appends correctly after the switch-retry. Deque dq = batchesFor(accum, tp1); assertEquals(1, dq.size()); + assertNotNull(dq.peekFirst()); assertEquals(2, dq.peekFirst().recordCount, "second record should still land in the batch after the switch-retry"); } finally { @@ -325,14 +305,6 @@ protected boolean partitionChanged(String topic, TopicInfo topicInfo, } } - /** - * Regression guard: an inflight chunked batch returns all K chunks to the pool on the - * expiration / broker-disconnect path, not just one {@code initialCapacity} chunkSize. - * For an inflight batch the parent {@code RecordAccumulator.deallocate} calls - * {@code batch.deallocateInflightBuffer(pool)} and then throws; {@link ChunkedProducerBatch} - * overrides that hook to return all chunks (instead of donating one fresh buffer). Removing - * that override would re-introduce a (K−1)×chunkSize leak and fail this test. - */ @Test public void testInflightExpirationReturnsAllChunksToPool() throws Exception { int chunkSize = 128; @@ -377,13 +349,7 @@ private Deque batchesFor(RecordAccumulator accum, TopicPartition } /** - * Sender's drain (and any other caller of {@code batch.close()}) cascades through - * {@code MemoryRecordsBuilder.closeForRecordAppends()} → - * {@code appendStream.close()} → {@code bufferStream.close()}. The chunk data must remain - * readable through that cascade so the builder can flatten the chunks into the heap buffer it - * sends over the network — chunks are returned to the pool only at batch completion - * (deallocate), never at close. If they were released at close, the flattened buffer would be - * empty and the header write would fail (or produce an empty record set). + * Test that chunks are returned to the pool only at batch completion (deallocate), never at close. */ @Test public void testBatchCloseDoesNotDeallocateChunksPrematurely() throws Exception { @@ -417,11 +383,8 @@ public void testBatchCloseDoesNotDeallocateChunksPrematurely() throws Exception } /** - * Chunks attached grow with the batch's cumulative projected output, not per-record. With a - * chunkSize smaller than the batch's eventual content, a sequence of small records should - * extend the chunks as the running total (header + uncompressed bytes for NONE) crosses - * chunk boundaries — not allocate the first record's worth and then never extend until - * physical capacity is exhausted (which the per-record formula would do). + * As small records accumulate in a batch, the attached chunks grow with the batch's cumulative + * projected output (not per-record). */ @Test public void testExtensionTracksCumulativeBatchSize() throws Exception { @@ -455,15 +418,8 @@ public void testExtensionTracksCumulativeBatchSize() throws Exception { accum.close(); } - /** - * The cumulative sizing formula counts the batch header once. The previous per-record - * formula effectively included the batch header in each record's upper bound, which - * over-allocated as records accumulated. With the cumulative formula, chunk count for N - * small records of total uncompressed size {@code U} is {@code ceil((header + U) / chunkSize)}, - * not {@code N × ceil((header + recordSize) / chunkSize)}. - */ @Test - public void testCumulativeAccountsForHeaderOnce() throws Exception { + public void testCumulativeAccountsForBatchHeaderOnce() throws Exception { int chunkSize = 256; int batchSize = 8192; long totalMemory = 64L * chunkSize; From 860bde4364550fae304a834b2c5c1dea12320af2 Mon Sep 17 00:00:00 2001 From: Lianet Magrans Date: Tue, 23 Jun 2026 13:38:24 -0400 Subject: [PATCH 09/11] fixes --- .../apache/kafka/clients/producer/KafkaProducer.java | 12 +++++++----- .../internals/ChunkedByteBufferOutputStream.java | 2 ++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/KafkaProducer.java b/clients/src/main/java/org/apache/kafka/clients/producer/KafkaProducer.java index 3b13014768281..85bd74a4fc61d 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/KafkaProducer.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/KafkaProducer.java @@ -462,16 +462,18 @@ public KafkaProducer(Properties properties, Serializer keySerializer, Seriali String allocationStrategy = config.getString(ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_CONFIG) .toLowerCase(Locale.ROOT); boolean incremental = ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_INCREMENTAL.equals(allocationStrategy); - if (incremental && compression.type() != CompressionType.NONE) { + // Use the chunked path only when a batch is at least one full chunk + // (batch.size >= CHUNK_SIZE). Below that, a batch can't fill even one chunk, so chunking + // would over-reserve and the producer falls back to the full strategy instead. + boolean useIncremental = incremental && batchSize >= ChunkedRecordAccumulator.CHUNK_SIZE; + // The chunked path does not support compression yet (TODO: KAFKA-20579) + if (useIncremental && compression.type() != CompressionType.NONE) { throw new ConfigException("The " + ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_INCREMENTAL + " " + ProducerConfig.BUFFER_MEMORY_ALLOCATION_STRATEGY_CONFIG + " does not support compression yet. " + ProducerConfig.COMPRESSION_TYPE_CONFIG + " must be set to none."); } - // Use the chunked path only when a batch is at least one full chunk - // (batch.size >= CHUNK_SIZE). Below that, a batch can't fill even one chunk, so chunking - // would over-reserve. - if (incremental && batchSize >= ChunkedRecordAccumulator.CHUNK_SIZE) { + if (useIncremental) { this.accumulator = new ChunkedRecordAccumulator(logContext, batchSize, compression, diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java index 97b89cd1b4416..101a5b15cabdc 100644 --- a/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java +++ b/clients/src/main/java/org/apache/kafka/clients/producer/internals/ChunkedByteBufferOutputStream.java @@ -216,6 +216,8 @@ public void position(int position) { */ @Override public int remaining() { + if (currentChunk == null) // after deallocate no chunks attached, no free capacity + return 0; int total = currentChunk.remaining(); for (int i = currentChunkIndex + 1; i < chunks.size(); i++) total += chunks.get(i).remaining(); From 179e6750601c41c5155105786a2f6dad476c0d37 Mon Sep 17 00:00:00 2001 From: Lianet Magrans Date: Tue, 23 Jun 2026 16:27:11 -0400 Subject: [PATCH 10/11] checkstyle --- .../clients/producer/internals/ChunkedBufferPoolTest.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPoolTest.java b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPoolTest.java index 45e9aceb12cf5..5b0f6f9b4fbb4 100644 --- a/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPoolTest.java +++ b/clients/src/test/java/org/apache/kafka/clients/producer/internals/ChunkedBufferPoolTest.java @@ -17,6 +17,7 @@ package org.apache.kafka.clients.producer.internals; import org.apache.kafka.clients.producer.BufferExhaustedException; +import org.apache.kafka.common.KafkaException; import org.apache.kafka.common.metrics.Metrics; import org.apache.kafka.common.utils.MockTime; import org.apache.kafka.test.TestUtils; @@ -24,8 +25,6 @@ import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; -import org.apache.kafka.common.KafkaException; - import java.nio.ByteBuffer; import java.util.List; import java.util.concurrent.CountDownLatch; From 9b45abc201bf270bd093d417bbd0589ca0768e84 Mon Sep 17 00:00:00 2001 From: Lianet Magrans Date: Tue, 23 Jun 2026 17:38:44 -0400 Subject: [PATCH 11/11] fix test --- .../org/apache/kafka/tools/other/ReplicationQuotasTestRig.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/src/test/java/org/apache/kafka/tools/other/ReplicationQuotasTestRig.java b/tools/src/test/java/org/apache/kafka/tools/other/ReplicationQuotasTestRig.java index 8e646534263a7..7830889a5313d 100644 --- a/tools/src/test/java/org/apache/kafka/tools/other/ReplicationQuotasTestRig.java +++ b/tools/src/test/java/org/apache/kafka/tools/other/ReplicationQuotasTestRig.java @@ -414,7 +414,8 @@ KafkaProducer createProducer() { Option.empty(), new ByteArraySerializer(), new ByteArraySerializer(), - false + false, + Option.empty() ); } }