-
Notifications
You must be signed in to change notification settings - Fork 15.3k
KAFKA-20678: Add timeout in replica manager log reader remote read. #22707
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: trunk
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,6 +25,7 @@ | |
| import org.apache.kafka.server.log.remote.storage.RemoteLogManager; | ||
| import org.apache.kafka.server.share.LogReader; | ||
| import org.apache.kafka.server.storage.log.FetchParams; | ||
| import org.apache.kafka.server.util.timer.TimerTask; | ||
| import org.apache.kafka.storage.internals.log.FetchDataInfo; | ||
| import org.apache.kafka.storage.internals.log.LogReadResult; | ||
| import org.apache.kafka.storage.internals.log.RemoteStorageFetchInfo; | ||
|
|
@@ -37,6 +38,7 @@ | |
| import java.util.Set; | ||
| import java.util.concurrent.CompletableFuture; | ||
| import java.util.concurrent.CompletionException; | ||
| import java.util.concurrent.TimeoutException; | ||
| import java.util.stream.Collectors; | ||
|
|
||
| import scala.Tuple2; | ||
|
|
@@ -116,13 +118,13 @@ public CompletableFuture<LinkedHashMap<TopicIdPartition, LogReadResult>> readAsy | |
| LinkedHashMap<TopicIdPartition, LogReadResult> localReadResults = | ||
| read(fetchParams, partitionsToFetch, topicPartitionFetchOffsets, partitionMaxBytes); | ||
|
|
||
| // One future per partition; combined into a single future once every partition has resolved. | ||
| // Only look at partitions with non-null read results. | ||
| LinkedHashMap<TopicIdPartition, CompletableFuture<LogReadResult>> futures = new LinkedHashMap<>(); | ||
| for (TopicIdPartition topicIdPartition : partitionsToFetch) { | ||
| LogReadResult logReadResult = localReadResults.get(topicIdPartition); | ||
| if (logReadResult == null) { | ||
| futures.put(topicIdPartition, CompletableFuture.completedFuture( | ||
| new LogReadResult(Errors.UNKNOWN_SERVER_ERROR))); | ||
| // Just skip | ||
| log.debug("Log read result for partition {} is null", topicIdPartition); | ||
| continue; | ||
| } | ||
|
|
||
|
|
@@ -147,6 +149,7 @@ public CompletableFuture<LinkedHashMap<TopicIdPartition, LogReadResult>> readAsy | |
| return withInfoAndError(logReadResult, localFetchDataInfo, Errors.forException(cause)); | ||
| } | ||
| if (remoteFetchDataInfo == null) { | ||
| // We want to return successful local read results so no skipping. | ||
| return withInfoAndError(logReadResult, localFetchDataInfo, Errors.UNKNOWN_SERVER_ERROR); | ||
| } | ||
| return withInfoAndError(logReadResult, remoteFetchDataInfo, Errors.NONE); | ||
|
|
@@ -185,8 +188,9 @@ private static LogReadResult withInfoAndError(LogReadResult base, FetchDataInfo | |
| * RemoteStorageFetchInfo is the descriptor surfaced by a prior local read as | ||
| * FetchDataInfo#delayedRemoteStorageFetch. The read runs on the remote storage reader pool so the | ||
| * caller's thread is not blocked; the future completes exceptionally when remote storage is not | ||
| * configured or the read could not be completed. Used internally by readAsync (package-private so | ||
| * it remains unit-testable). | ||
| * configured, the read could not be completed, or the read does not finish within the configured | ||
| * remote fetch timeout ({@code remote.fetch.max.wait.ms}). Used internally by readAsync | ||
| * (package-private so it remains unit-testable). | ||
| */ | ||
| // Visibility for testing | ||
| CompletableFuture<FetchDataInfo> readRemote(RemoteStorageFetchInfo remoteStorageFetchInfo) { | ||
|
|
@@ -218,6 +222,33 @@ CompletableFuture<FetchDataInfo> readRemote(RemoteStorageFetchInfo remoteStorage | |
| future.completeExceptionally(e); | ||
| } | ||
|
|
||
| // Bound the wait on the remote read so a stalled remote tier cannot pin the read (and the | ||
| // resources held while it is pending) indefinitely. Use the broker's timer wheel - as | ||
| // DelayedShareFetch does for its remote fetch - rather than CompletableFuture#orTimeout. On | ||
| // expiry the future completes exceptionally with a TimeoutException, which the caller treats | ||
| // as a (skippable) read error. | ||
| if (!future.isDone()) { | ||
| long timeoutMs = remoteFetchMaxWaitMs(); | ||
| TimerTask timeoutTask = new TimerTask(timeoutMs) { | ||
| @Override | ||
| public void run() { | ||
| future.completeExceptionally(new TimeoutException( | ||
| "Remote read for " + remoteStorageFetchInfo + " did not complete within " + timeoutMs + " ms.")); | ||
| } | ||
|
Comment on lines
+235
to
+237
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't we also need not to cancel the pending future for remote calls? |
||
| }; | ||
| // Cancel the timer task once the read completes (either outcome) so it does not linger in the wheel. | ||
| future.whenComplete((info, exception) -> timeoutTask.cancel()); | ||
| replicaManager.addShareFetchTimerRequest(timeoutTask); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also why share fetch timer API being used here? Also should we make it callers responsibility to handle the timeouts? |
||
| } | ||
|
|
||
| return future; | ||
| } | ||
|
|
||
| /** | ||
| * The maximum time to wait for a remote-tier read, taken from the broker's | ||
| * {@code remote.fetch.max.wait.ms}. Read per call since the config is dynamically reconfigurable. | ||
| */ | ||
| private long remoteFetchMaxWaitMs() { | ||
| return replicaManager.config().remoteLogManagerConfig().remoteFetchMaxWaitMs(); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just completing it exceptionally might not be a good idea as the method can still return the locally fetched data while skipping the remote storage partitions.