From c643d05c647e3208f1b26c370e91cf66978d87ec Mon Sep 17 00:00:00 2001
From: Ray Andrew <rs@rs.ht>
Date: Mon, 18 May 2026 17:23:41 -0500
Subject: [PATCH 1/2] feat(dlio): add DLIO config generation

---
 cmake/modules/Dependencies.cmake              |  89 +++
 .../common/statistics/distributions.h         |  71 +++
 .../utilities/common/statistics/mixture.h     |  76 +++
 .../utilities/common/statistics/statistic.h   |  55 ++
 .../utilities/common/statistics/statistics.h  |   2 +
 .../dft/aggregators/aggregation_runner.h      |  72 +++
 .../utils/utilities/dlio/barrier_simulator.h  | 115 ++++
 .../dftracer/utils/utilities/dlio/optimizer.h |  56 ++
 .../dftracer/utils/utilities/dlio/statistic.h |  35 ++
 .../utils/utilities/dlio/trace_loader.h       | 102 +++
 .../utils/utilities/dlio/worker_queue.h       |  51 ++
 .../dftracer/utils/utilities/dlio/yaml_emit.h |  42 ++
 src/CMakeLists.txt                            |  19 +-
 .../utils/binaries/dftracer_aggregator.cpp    | 587 +++---------------
 .../binaries/dftracer_gen_dlio_config.cpp     | 354 +++++++++++
 .../common/statistics/distributions.cpp       | 448 +++++++++++++
 .../utilities/common/statistics/mixture.cpp   | 259 ++++++++
 .../dft/aggregators/aggregation_runner.cpp    | 444 +++++++++++++
 .../utilities/dlio/barrier_simulator.cpp      | 450 ++++++++++++++
 .../utils/utilities/dlio/optimizer.cpp        |  91 +++
 .../utils/utilities/dlio/trace_loader.cpp     | 380 ++++++++++++
 .../utils/utilities/dlio/yaml_emit.cpp        | 112 ++++
 tests/CMakeLists.txt                          |   4 +
 .../test_dftracer_gen_dlio_config.cpp         | 222 +++++++
 tests/utilities/CMakeLists.txt                |   7 +
 .../common/statistics/test_distributions.cpp  | 174 ++++++
 .../common/statistics/test_mixture.cpp        | 176 ++++++
 .../utilities/dlio/test_barrier_simulator.cpp | 241 +++++++
 28 files changed, 4239 insertions(+), 495 deletions(-)
 create mode 100644 include/dftracer/utils/utilities/common/statistics/distributions.h
 create mode 100644 include/dftracer/utils/utilities/common/statistics/mixture.h
 create mode 100644 include/dftracer/utils/utilities/common/statistics/statistic.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.h
 create mode 100644 include/dftracer/utils/utilities/dlio/barrier_simulator.h
 create mode 100644 include/dftracer/utils/utilities/dlio/optimizer.h
 create mode 100644 include/dftracer/utils/utilities/dlio/statistic.h
 create mode 100644 include/dftracer/utils/utilities/dlio/trace_loader.h
 create mode 100644 include/dftracer/utils/utilities/dlio/worker_queue.h
 create mode 100644 include/dftracer/utils/utilities/dlio/yaml_emit.h
 create mode 100644 src/dftracer/utils/binaries/dftracer_gen_dlio_config.cpp
 create mode 100644 src/dftracer/utils/utilities/common/statistics/distributions.cpp
 create mode 100644 src/dftracer/utils/utilities/common/statistics/mixture.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.cpp
 create mode 100644 src/dftracer/utils/utilities/dlio/barrier_simulator.cpp
 create mode 100644 src/dftracer/utils/utilities/dlio/optimizer.cpp
 create mode 100644 src/dftracer/utils/utilities/dlio/trace_loader.cpp
 create mode 100644 src/dftracer/utils/utilities/dlio/yaml_emit.cpp
 create mode 100644 tests/binaries/test_dftracer_gen_dlio_config.cpp
 create mode 100644 tests/utilities/common/statistics/test_distributions.cpp
 create mode 100644 tests/utilities/common/statistics/test_mixture.cpp
 create mode 100644 tests/utilities/dlio/test_barrier_simulator.cpp

diff --git a/cmake/modules/Dependencies.cmake b/cmake/modules/Dependencies.cmake
index 454b2ae4..6862d675 100644
--- a/cmake/modules/Dependencies.cmake
+++ b/cmake/modules/Dependencies.cmake
@@ -1766,6 +1766,95 @@ function(link_nanoarrow TARGET_NAME LIBRARY_TYPE)
 
 endfunction()
 
+# ==============================================================================
+# Boost.Math (standalone, header-only); for statistical distributions
+# ==============================================================================
+
+function(need_boost_math)
+  # Fetch the header-only Boost.Math sources through CPM. DOWNLOAD_ONLY asks
+  # CPM to download the tree without configuring it as a subproject, so only
+  # boost_math_SOURCE_DIR is consumed from this call.
+  if(NOT boost_math_ADDED)
+    cpmaddpackage(
+      NAME boost_math
+      GITHUB_REPOSITORY boostorg/math
+      GIT_TAG boost-1.91.0
+      DOWNLOAD_ONLY YES)
+  endif()
+
+  # CPM exports boost_math_SOURCE_DIR into this function's scope only. Mirror
+  # it into the cache (INTERNAL) so link_boost_math() can resolve the include
+  # directory no matter which directory scope it is called from.
+  if(boost_math_SOURCE_DIR)
+    set(boost_math_SOURCE_DIR "${boost_math_SOURCE_DIR}"
+        CACHE INTERNAL "Boost.Math source tree from CPM")
+    message(STATUS "Added Boost.Math (standalone) headers from "
+                   "${boost_math_SOURCE_DIR}/include")
+  endif()
+endfunction()
+
+# Apply Boost.Math standalone headers + BOOST_MATH_STANDALONE define as PRIVATE
+# build-only properties. We deliberately avoid an INTERFACE link target so the
+# headers/defines never enter the installed/exported target set.
+function(link_boost_math TARGET_NAME)
+  if("${TARGET_NAME}" STREQUAL "")
+    message(FATAL_ERROR "link_boost_math: TARGET_NAME is required")
+  endif()
+  if(NOT TARGET "${TARGET_NAME}")
+    message(FATAL_ERROR "link_boost_math: target '${TARGET_NAME}' does not exist")
+  endif()
+  if(NOT boost_math_SOURCE_DIR)
+    message(FATAL_ERROR
+      "link_boost_math: boost_math_SOURCE_DIR is unset; call need_boost_math() first")
+  endif()
+  # Names and paths are quoted so odd target names or paths expand as one arg.
+  target_include_directories("${TARGET_NAME}" SYSTEM PRIVATE
+                             "${boost_math_SOURCE_DIR}/include")
+  target_compile_definitions("${TARGET_NAME}" PRIVATE BOOST_MATH_STANDALONE)
+  message(STATUS "Linked ${TARGET_NAME} to Boost.Math (standalone)")
+endfunction()
+
+# ==============================================================================
+# yaml-cpp - YAML emit/parse for DLIO config generation
+# ==============================================================================
+
+function(need_yaml_cpp)
+  # Add yaml-cpp through CPM as a static library: tests, tools and contrib are
+  # switched off, shared libs are disabled, and its install rules stay on.
+  # NOTE(review): FORCE YES presumably makes CPM skip any local-package lookup
+  # for this dependency - confirm against the CPM documentation.
+  if(NOT yaml-cpp_ADDED)
+    cpmaddpackage(
+      NAME yaml-cpp
+      GITHUB_REPOSITORY jbeder/yaml-cpp
+      GIT_TAG yaml-cpp-0.9.0
+      OPTIONS
+        "YAML_CPP_BUILD_TESTS OFF"
+        "YAML_CPP_BUILD_TOOLS OFF"
+        "YAML_CPP_BUILD_CONTRIB OFF"
+        "YAML_BUILD_SHARED_LIBS OFF"
+        "YAML_CPP_INSTALL ON"
+      FORCE YES)
+  endif()
+endfunction()
+
+# Link yaml-cpp PRIVATE so the static library is bundled into the consumer and
+# the header path stays out of the installed/exported target set.
+function(link_yaml_cpp TARGET_NAME)
+  # Quoted checks: a name that reads as a CMake false constant is still a name.
+  if("${TARGET_NAME}" STREQUAL "")
+    message(FATAL_ERROR "link_yaml_cpp: TARGET_NAME is required")
+  endif()
+  if(NOT TARGET "${TARGET_NAME}")
+    message(FATAL_ERROR "link_yaml_cpp: target '${TARGET_NAME}' does not exist")
+  endif()
+  if(NOT TARGET yaml-cpp::yaml-cpp)
+    message(FATAL_ERROR "link_yaml_cpp: yaml-cpp::yaml-cpp target missing; call need_yaml_cpp() first")
+  endif()
+  target_link_libraries("${TARGET_NAME}" PRIVATE yaml-cpp::yaml-cpp)
+  message(STATUS "Linked ${TARGET_NAME} to yaml-cpp")
+endfunction()
+
 # ==============================================================================
 # Testing Dependencies
 # ==============================================================================
diff --git a/include/dftracer/utils/utilities/common/statistics/distributions.h b/include/dftracer/utils/utilities/common/statistics/distributions.h
new file mode 100644
index 00000000..03d042ef
--- /dev/null
+++ b/include/dftracer/utils/utilities/common/statistics/distributions.h
@@ -0,0 +1,71 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_DISTRIBUTIONS_H
+#define DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_DISTRIBUTIONS_H
+
+#include <array>
+#include <cstdint>
+#include <functional>
+#include <optional>
+#include <random>
+#include <string_view>
+#include <vector>
+
+namespace dftracer::utils::utilities::common::statistics {
+
+enum class DistributionKind : std::uint8_t {
+    Normal,       // params = {mean, stddev, _}
+    Lognormal,    // params = {mu, sigma, _}      (mu, sigma in log space)
+    Gamma,        // params = {shape, scale, _}
+    Exponential,  // params = {rate, _, _}        (rate = 1/scale)
+    Weibull,      // params = {shape, scale, _}
+};
+
+std::string_view distribution_name(DistributionKind k);
+
+// Result of fitting a single distribution to a sample array.
+// `params` semantics depend on `kind`.
+struct FittedDistribution {
+    DistributionKind kind = DistributionKind::Normal;  // never indeterminate
+    std::array<double, 3> params{};  // slot meaning depends on `kind`
+    double ks_stat = 1.0;  // Kolmogorov-Smirnov statistic (lower = better)
+    double log_likelihood = 0.0;  // sum of log pdf(x_i)
+    double bic = 0.0;             // k*ln(n) - 2*log_likelihood
+    bool valid = false;           // true when MLE succeeded
+};
+
+// MLE fit for a single distribution. Returns valid=false when fitting fails
+// (e.g. non-positive data for lognormal, sample size < 2, Newton
+// non-convergence).
+FittedDistribution fit_single_distribution(DistributionKind kind,
+                                           const std::vector<double>& data);
+
+// Fits all five distributions and returns them ordered by ascending ks_stat.
+// Invalid fits are kept at the back of the result.
+std::vector<FittedDistribution> fit_all_single_distributions(
+    const std::vector<double>& data);
+
+// Picks the lowest-KS valid fit. Returns nullopt if none of the fits succeeded.
+std::optional<FittedDistribution> best_fit_by_ks(
+    const std::vector<FittedDistribution>& fits);
+
+// Distribution PDF / CDF / inverse-CDF. Behavior is undefined when
+// `fit.valid == false`.
+double pdf(const FittedDistribution& fit, double x);
+double cdf(const FittedDistribution& fit, double x);
+double quantile(const FittedDistribution& fit, double p);
+
+// Sampler signature. Matches dlio::Sampler so dlio::BarrierSimulator can
+// consume it directly without an explicit cast.
+using Sampler = std::function<double(std::mt19937_64&)>;
+
+// Builds a Sampler from a fitted distribution.
+// Optional min/max bounds clamp the output (applied after sampling).
+Sampler make_sampler(const FittedDistribution& fit,
+                     std::optional<double> min_bound = std::nullopt,
+                     std::optional<double> max_bound = std::nullopt);
+
+// Returns the parameter count used for BIC. Useful when extending to mixtures.
+int free_parameter_count(DistributionKind kind);
+
+}  // namespace dftracer::utils::utilities::common::statistics
+
+#endif
diff --git a/include/dftracer/utils/utilities/common/statistics/mixture.h b/include/dftracer/utils/utilities/common/statistics/mixture.h
new file mode 100644
index 00000000..7856fd9c
--- /dev/null
+++ b/include/dftracer/utils/utilities/common/statistics/mixture.h
@@ -0,0 +1,76 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_MIXTURE_H
+#define DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_MIXTURE_H
+
+#include <dftracer/utils/utilities/common/statistics/distributions.h>
+
+#include <cstdint>
+#include <optional>
+#include <variant>
+#include <vector>
+
+namespace dftracer::utils::utilities::common::statistics {
+
+struct GmmComponent {
+    double mean = 0.0;
+    double stddev = 0.0;
+};
+
+// Univariate Gaussian Mixture Model fit. Component count is `weights.size()`.
+struct FittedMixture {
+    std::vector<double> weights;           // sum to 1
+    std::vector<GmmComponent> components;  // same length as weights
+    double log_likelihood = 0.0;
+    double bic = 0.0;
+    int iterations = 0;
+    bool converged = false;
+    bool valid = false;
+};
+
+struct MixtureFitOptions {
+    int max_iter = 200;
+    double tol = 1e-6;
+    double variance_floor = 1e-12;  // prevent component collapse
+    std::uint64_t seed = 0xC0FFEE;
+};
+
+// Fits a K-component Gaussian Mixture via EM. K-means-style initialization on
+// quantile-spread means and total-variance / K for each component.
+FittedMixture fit_gaussian_mixture(const std::vector<double>& data, int K,
+                                   const MixtureFitOptions& options = {});
+
+double pdf(const FittedMixture& mix, double x);
+double cdf(const FittedMixture& mix, double x);
+
+// Free-parameter count for BIC: 3K - 1  (K means + K stddevs + K-1 free
+// weights).
+int free_parameter_count(const FittedMixture& mix);
+
+// Sampler from a fitted mixture. Draws a component by weight then a Normal.
+Sampler make_sampler(const FittedMixture& mix,
+                     std::optional<double> min_bound = std::nullopt,
+                     std::optional<double> max_bound = std::nullopt);
+
+using BestModel = std::variant<FittedDistribution, FittedMixture>;
+
+struct ModelSelection {
+    BestModel model;
+    double bic = 0.0;
+    int free_params = 0;
+};
+
+// Selects the lowest-BIC model among the candidates. `single_fits` is typically
+// the output of fit_all_single_distributions(); `mixtures` is typically two
+// entries (GMM-2 and GMM-3). Invalid fits are ignored.
+std::optional<ModelSelection> select_best_model(
+    const std::vector<FittedDistribution>& single_fits,
+    const std::vector<FittedMixture>& mixtures);
+
+double pdf(const BestModel& m, double x);
+double cdf(const BestModel& m, double x);
+Sampler make_sampler(const BestModel& m,
+                     std::optional<double> min_bound = std::nullopt,
+                     std::optional<double> max_bound = std::nullopt);
+
+}  // namespace dftracer::utils::utilities::common::statistics
+
+#endif
diff --git a/include/dftracer/utils/utilities/common/statistics/statistic.h b/include/dftracer/utils/utilities/common/statistics/statistic.h
new file mode 100644
index 00000000..315c449f
--- /dev/null
+++ b/include/dftracer/utils/utilities/common/statistics/statistic.h
@@ -0,0 +1,55 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_STATISTIC_H
+#define DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_STATISTIC_H
+
+#include <dftracer/utils/utilities/common/statistics/ddsketch.h>
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+
+namespace dftracer::utils::utilities::common::statistics {
+
+// Lightweight min/max/mean/count accumulator with an optional DDSketch backing
+// for quantile queries. When no sketch is attached, quantile() falls back to a
+// uniform interpolation between observed min and max.
+class Statistic {
+   public:
+    Statistic() = default;
+
+    void attach_sketch(std::shared_ptr<const DDSketch> sketch) {
+        sketch_ = std::move(sketch);  // replaces any previously attached one
+    }
+
+    void update(double value) {  // folds one sample into min/max/sum/count
+        if (value < min_val_) min_val_ = value;
+        if (value > max_val_) max_val_ = value;
+        sum_ += value;
+        ++count_;
+        mean_ = sum_ / static_cast<double>(count_);
+    }
+    // Sketch if attached; else lerp over [min, max] with q clamped to [0, 1].
+    double quantile(double q) const {
+        if (sketch_ && !sketch_->empty()) return sketch_->quantile(q);
+        if (count_ == 0 || min_val_ == std::numeric_limits<double>::infinity())
+            return 0.0;  // no sample has lowered the running minimum yet
+        const double t = q < 0.0 ? 0.0 : (q > 1.0 ? 1.0 : q);
+        return min_val_ + t * (max_val_ - min_val_);
+    }
+    double min() const { return count_ == 0 ? 0.0 : min_val_; }
+    double max() const { return count_ == 0 ? 0.0 : max_val_; }
+    double mean() const { return mean_; }
+    std::uint64_t count() const { return count_; }
+
+   private:
+    double min_val_ = std::numeric_limits<double>::infinity();
+    double max_val_ = -std::numeric_limits<double>::infinity();
+    double sum_ = 0.0;
+    double mean_ = 0.0;
+    std::uint64_t count_ = 0;
+    std::shared_ptr<const DDSketch> sketch_;
+};
+
+}  // namespace dftracer::utils::utilities::common::statistics
+
+#endif
diff --git a/include/dftracer/utils/utilities/common/statistics/statistics.h b/include/dftracer/utils/utilities/common/statistics/statistics.h
index d56497d8..882dd0bb 100644
--- a/include/dftracer/utils/utilities/common/statistics/statistics.h
+++ b/include/dftracer/utils/utilities/common/statistics/statistics.h
@@ -9,5 +9,7 @@
  */
 
 #include <dftracer/utils/utilities/common/statistics/ddsketch.h>
+#include <dftracer/utils/utilities/common/statistics/distributions.h>
+#include <dftracer/utils/utilities/common/statistics/statistic.h>
 
 #endif  // DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_STATISTICS_H
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.h
new file mode 100644
index 00000000..73746434
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.h
@@ -0,0 +1,72 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_RUNNER_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_RUNNER_H
+
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/pipeline/pipeline.h>
+#include <dftracer/utils/core/pipeline/pipeline_config.h>
+#include <dftracer/utils/core/utils/timer.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.h>
+
+#include <cstddef>
+#include <optional>
+#include <string>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+// Input bundle for the aggregation pipeline. Callers (binaries) translate their
+// CLI flags into this struct, then call run_aggregation().
+struct AggregationRunInput {
+    // Raw trace input.
+    std::string log_dir;           // directory containing .pfw[.gz]
+    std::string index_dir;         // where the shared RocksDB lives
+    AggregationConfig agg_config;  // time_interval_us, sketch, etc.
+    ::dftracer::utils::PipelineConfig
+        pipeline_config;           // executor threads, etc.
+
+    // Optional output writer. When `output_file` is std::nullopt the run only
+    // populates the AGGREGATION column family in the RocksDB and skips Perfetto
+    // / Arrow emission; downstream consumers (e.g. dftracer_gen_dlio_config)
+    // open the CF directly. When set, the output is written in `output_format`.
+    std::optional<std::string> output_file;
+    std::string output_format = AggregationConfig::FORMAT_JSON;
+    PerfettoEventFormat event_format = PerfettoEventFormat::COUNTER;
+    bool compress_output = false;
+    int compression_level = 1;
+
+    // Indexing controls.
+    bool force_rebuild = false;
+    std::size_t checkpoint_size = 0;
+
+    // Optional staged Timer for profiling. Caller owns lifetime.
+    ::dftracer::utils::Timer* stages = nullptr;
+    bool verbose = true;  // controls console banner output
+};
+
+struct AggregationRunResult {
+    bool success = false;
+
+    // Path to the shared RocksDB index that now contains the AGGREGATION CF.
+    // Downstream tools (dftracer_gen_dlio_config) open this read-only.
+    std::string index_path;
+
+    std::size_t total_keys = 0;
+    std::size_t input_file_count = 0;
+    std::size_t processed_file_count = 0;
+    std::size_t cached_file_count = 0;
+
+    double elapsed_ms = 0.0;
+};
+
+// Runs the full index + aggregate pipeline:
+//   1. Scan log_dir for input files; consult the existing index.
+//   2. Re-index any file that needs it (or all, if force_rebuild).
+//   3. Run the aggregation visitor pipeline across all files needing it.
+//   4. Optionally write the aggregated events to a Perfetto JSON / Arrow IPC
+//      file (when input.output_file is set).
+//   5. Write per-file tracking entries and global config to the AGGREGATION CF.
+coro::CoroTask<AggregationRunResult> run_aggregation(AggregationRunInput input);
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
+
+#endif
diff --git a/include/dftracer/utils/utilities/dlio/barrier_simulator.h b/include/dftracer/utils/utilities/dlio/barrier_simulator.h
new file mode 100644
index 00000000..c889bdd7
--- /dev/null
+++ b/include/dftracer/utils/utilities/dlio/barrier_simulator.h
@@ -0,0 +1,115 @@
+#ifndef DFTRACER_UTILS_UTILITIES_DLIO_BARRIER_SIMULATOR_H
+#define DFTRACER_UTILS_UTILITIES_DLIO_BARRIER_SIMULATOR_H
+
+#include <dftracer/utils/utilities/dlio/statistic.h>
+
+#include <cstdint>
+#include <functional>
+#include <optional>
+#include <random>
+#include <vector>
+
+namespace dftracer::utils::utilities::dlio {
+
+using Rng = std::mt19937_64;
+using Sampler = std::function<double(Rng&)>;
+
+struct BarrierSimulatorContext {
+    int num_ranks = 0;
+    int num_steps = 0;
+    double trace_e2e_duration = 0.0;
+
+    std::vector<std::vector<double>> fetch_block_trace;
+    std::vector<std::vector<double>> fetch_iter_trace;
+
+    Statistic fetch_block_stats;
+    Statistic fetch_iter_stats;
+    Statistic preprocess_stats;
+    Statistic getitem_stats;
+
+    ComponentTimeMetrics trace_preprocess_metrics;
+    ComponentTimeMetrics trace_fetch_iter_metrics;
+    ComponentTimeMetrics trace_fetch_block_metrics;
+    double trace_rank_variance = 0.0;
+    std::vector<double> trace_per_rank_throughput;
+
+    std::optional<std::vector<std::vector<double>>> getitem_trace;
+    std::optional<Statistic> io_stats;
+    std::optional<std::vector<double>> io_samples;
+
+    bool sync_mode = false;
+    int accumulate_grad_batches = 1;
+    bool enable_preprocess_simulation = false;
+    int num_workers = 8;
+    int prefetch_factor = 2;
+    double preprocess_slowdown_factor = 1.0;
+    double base_fetch_iter_overhead = 0.0;
+    bool is_aggregated_trace = false;
+    double avg_calls_per_epoch = 1.0;
+};
+
+struct BarrierSimulationResult {
+    double e2e_duration = 0.0;
+    double e2e_error = 0.0;
+
+    double avg_barrier_overhead = 0.0;
+    double max_barrier_overhead = 0.0;
+
+    std::vector<double> per_rank_completion_time;
+    double rank_variance = 0.0;
+    double trace_rank_variance = 0.0;
+    double rank_variance_error = 0.0;
+    double load_imbalance = 0.0;
+
+    std::vector<double> simulated_fetch_block;
+    double fetch_block_cdf_similarity = 0.0;
+
+    std::vector<double> simulated_preprocess;
+    std::vector<double> simulated_getitem;
+    std::vector<double> simulated_fetch_iter;
+    double fetch_iter_cdf_similarity = 0.0;
+    double getitem_cdf_similarity = 0.0;
+    double avg_queue_depth = 0.0;
+    double avg_queue_stalls = 0.0;
+
+    ComponentTimeMetrics preprocess_metrics;
+    ComponentTimeMetrics fetch_iter_metrics;
+    ComponentTimeMetrics fetch_block_metrics;
+
+    std::optional<ComponentTimeMetrics> trace_preprocess_metrics;
+    std::optional<ComponentTimeMetrics> trace_fetch_iter_metrics;
+    std::optional<ComponentTimeMetrics> trace_fetch_block_metrics;
+
+    std::vector<double> simulated_per_rank_throughput;
+    std::vector<double> trace_per_rank_throughput;
+
+    double throughput_mean = 0.0;
+    double trace_throughput_mean = 0.0;
+    double throughput_mean_error = 0.0;
+
+    double throughput_variance = 0.0;
+    double trace_throughput_variance = 0.0;
+
+    double throughput_cdf_similarity = 0.0;
+};
+
+class BarrierSimulator {
+   public:
+    // preprocess_sampler may be empty; pass {} to use trace-derived
+    // getitem/preprocess stats.
+    BarrierSimulationResult simulate(
+        const BarrierSimulatorContext& context, std::uint64_t base_seed,
+        const Sampler& fetch_block_sampler,
+        const Sampler& preprocess_sampler = {}) const;
+};
+
+// 1 - Kolmogorov-Smirnov statistic between the two empirical distributions.
+// Returns 1.0 for perfect match, 0.0 for fully disjoint.
+double cdf_similarity(const std::vector<double>& a,
+                      const std::vector<double>& b);
+
+double variance(const std::vector<double>& values);
+
+}  // namespace dftracer::utils::utilities::dlio
+
+#endif
diff --git a/include/dftracer/utils/utilities/dlio/optimizer.h b/include/dftracer/utils/utilities/dlio/optimizer.h
new file mode 100644
index 00000000..90163053
--- /dev/null
+++ b/include/dftracer/utils/utilities/dlio/optimizer.h
@@ -0,0 +1,56 @@
+#ifndef DFTRACER_UTILS_UTILITIES_DLIO_OPTIMIZER_H
+#define DFTRACER_UTILS_UTILITIES_DLIO_OPTIMIZER_H
+
+#include <dftracer/utils/utilities/common/statistics/mixture.h>
+#include <dftracer/utils/utilities/dlio/barrier_simulator.h>
+
+#include <cstdint>
+#include <vector>
+
+namespace dftracer::utils::utilities::dlio {
+
+using BestModel = ::dftracer::utils::utilities::common::statistics::BestModel;
+
+struct OptimizerOptions {  // tunables for optimize_max_bound_percentile()
+    int max_iterations = 5;  // presumably the cap on optimizer iterations
+    double target_e2e_error = 0.05;  // presumably a fraction, not percent
+    double target_cdf_similarity = 0.90;
+    int patience = 10;  // NOTE(review): default exceeds max_iterations - confirm
+    double epsilon = 1.0;
+    double momentum = 0.9;
+    double min_percentile = 50.0;      // percent scale (0-100) - TODO confirm
+    double initial_percentile = 95.0;  // presumably the starting percentile
+    std::uint64_t base_seed = 42;
+};
+
+struct OptimizerResult {
+    BarrierSimulationResult best;
+    double best_percentile = 0.0;
+    int iterations_used = 0;
+    bool converged = false;
+};
+
+// Sequential momentum-based optimizer.
+// Searches for the max_bound percentile that minimizes simulator e2e_error
+// while preserving fetch_block_cdf_similarity. `sample_times` is the flat
+// per-call sample array used for percentile lookups; it is taken by value,
+// so the local copy (not the caller's vector) is sorted if it is not already.
+//
+// Each iteration:
+//   1. comp_max_bound = percentile(sample_times, current_percentile)
+//   2. comp_sampler = make_sampler(model, min=sample_times.front(),
+//                                  max=comp_max_bound)
+//   3. result = simulator.simulate(context, base_seed, comp_sampler)
+//   4. Adjust current_percentile via a momentum-smoothed step proportional
+//      to the error.
+OptimizerResult optimize_max_bound_percentile(
+    const BarrierSimulatorContext& context, const BestModel& model,
+    std::vector<double> sample_times, const OptimizerOptions& options = {});
+
+// Helper: percentile by sorted index (linear interpolation between adjacent
+// samples). Returns 0 if data is empty.
+double percentile(const std::vector<double>& sorted_data, double pct);
+
+}  // namespace dftracer::utils::utilities::dlio
+
+#endif
diff --git a/include/dftracer/utils/utilities/dlio/statistic.h b/include/dftracer/utils/utilities/dlio/statistic.h
new file mode 100644
index 00000000..260c9805
--- /dev/null
+++ b/include/dftracer/utils/utilities/dlio/statistic.h
@@ -0,0 +1,35 @@
+#ifndef DFTRACER_UTILS_UTILITIES_DLIO_STATISTIC_H
+#define DFTRACER_UTILS_UTILITIES_DLIO_STATISTIC_H
+
+#include <dftracer/utils/utilities/common/statistics/statistic.h>
+
+#include <cstdint>
+#include <vector>
+
+namespace dftracer::utils::utilities::dlio {
+
+using Statistic = dftracer::utils::utilities::common::statistics::Statistic;
+
+struct ComponentTimeMetrics {
+    double union_time = 0.0;
+    double accumulated_time = 0.0;
+    std::uint64_t num_samples = 0;
+    Statistic stats;
+
+    double concurrency() const {  // accumulated / union; 0 unless union > 0
+        return union_time > 0.0 ? accumulated_time / union_time : 0.0;
+    }
+};
+
+struct Boundary {
+    std::int64_t time;
+    int delta;  // +1 start, -1 end
+};
+
+// Sweep-line union of [start, end] intervals encoded as boundaries.
+// Times are in microseconds; return value is seconds.
+double sweep_union(std::vector<Boundary>& boundaries);
+
+}  // namespace dftracer::utils::utilities::dlio
+
+#endif
diff --git a/include/dftracer/utils/utilities/dlio/trace_loader.h b/include/dftracer/utils/utilities/dlio/trace_loader.h
new file mode 100644
index 00000000..c6fb23dc
--- /dev/null
+++ b/include/dftracer/utils/utilities/dlio/trace_loader.h
@@ -0,0 +1,102 @@
+#ifndef DFTRACER_UTILS_UTILITIES_DLIO_TRACE_LOADER_H
+#define DFTRACER_UTILS_UTILITIES_DLIO_TRACE_LOADER_H
+
+#include <dftracer/utils/utilities/common/statistics/ddsketch.h>
+#include <dftracer/utils/utilities/dlio/barrier_simulator.h>
+#include <dftracer/utils/utilities/dlio/statistic.h>
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace dftracer::utils::utilities::dlio {
+
+using DDSketch = ::dftracer::utils::utilities::common::statistics::DDSketch;
+
+// DLIO category / event names matched in the AGGREGATION CF.
+inline constexpr std::string_view CATEGORY_DATALOADER = "dataloader";
+inline constexpr std::string_view CATEGORY_DATA = "data";
+inline constexpr std::string_view EVENT_FETCH_BLOCK = "fetch.block";
+inline constexpr std::string_view EVENT_FETCH_ITER = "fetch.iter";
+inline constexpr std::string_view EVENT_PREPROCESS = "preprocess";
+inline constexpr std::string_view EVENT_ITEM = "item";
+
+struct AggregatedTraces {
+    // Per-rank concatenated sample sequences (seconds), in pid-ascending,
+    // then time-bucket-ascending order.
+    std::vector<std::vector<double>> fetch_block_trace;
+    std::vector<std::vector<double>> fetch_iter_trace;
+    std::vector<std::vector<double>> getitem_trace;
+
+    // Flat sample arrays for distribution fitting (seconds).
+    std::vector<double> computation_times;  // fetch.block
+    std::vector<double> preprocess_times;   // preprocess
+
+    // Sketches merged across all (rank, bucket) entries for each component.
+    // Nullable: only populated if the aggregator was run with
+    // --compute-percentiles.
+    std::shared_ptr<DDSketch> fetch_block_sketch;
+    std::shared_ptr<DDSketch> fetch_iter_sketch;
+    std::shared_ptr<DDSketch> preprocess_sketch;
+    std::shared_ptr<DDSketch> getitem_sketch;
+
+    // Statistics with min/max/mean/count populated, sketch attached when
+    // present.
+    Statistic fetch_block_stats;
+    Statistic fetch_iter_stats;
+    Statistic preprocess_stats;
+    Statistic getitem_stats;
+
+    // Aggregate metrics (seconds).
+    double trace_e2e_duration = 0.0;
+    double trace_rank_variance = 0.0;
+    std::vector<double> trace_per_rank_throughput;
+
+    // Per-component accumulated / union times. Both computed from actual
+    // (ts, te) boundaries in the CF rather than a fixed-ratio heuristic.
+    ComponentTimeMetrics trace_preprocess_metrics;
+    ComponentTimeMetrics trace_fetch_iter_metrics;
+    ComponentTimeMetrics trace_fetch_block_metrics;
+
+    // Discovered PIDs (sorted) - index in this list defines rank ID.
+    std::vector<std::uint64_t> rank_pids;
+
+    int num_ranks = 0;
+    int num_steps = 0;  // min length across ranks of fetch_block_trace
+
+    // True if any AGGREGATION entry was found.
+    bool any_data = false;
+
+    // True if at least one DDSketch was present in the CF. Drives whether
+    // sample synthesis uses inverse-CDF or mean replication.
+    bool sketches_available = false;
+
+    // Time bucket width in microseconds (from AggGlobalConfig).
+    std::uint64_t time_interval_us = 0;
+};
+
+struct TraceLoaderOptions {
+    // Hard cap on samples synthesized per (cat, name, pid, bucket) entry, so a
+    // single high-count bucket cannot blow up memory. 0 disables the cap.
+    std::uint64_t max_samples_per_entry = 100;
+    // Seed for inverse-CDF sketch sampling.
+    std::uint64_t seed = 0xD15710;
+};
+
+// Loads aggregated DLIO trace data from a dftracer RocksDB. Opens the database
+// read-only, iterates the AGGREGATION column family, and materializes per-rank
+// trace arrays, distribution sample arrays, and trace-side
+// ComponentTimeMetrics.
+AggregatedTraces load_aggregated_traces(const std::string& db_path,
+                                        const TraceLoaderOptions& options = {});
+
+// Convenience: build a BarrierSimulatorContext from loaded traces.
+BarrierSimulatorContext make_simulator_context(const AggregatedTraces& traces,
+                                               int num_workers = 8,
+                                               int prefetch_factor = 2);
+
+}  // namespace dftracer::utils::utilities::dlio
+
+#endif
diff --git a/include/dftracer/utils/utilities/dlio/worker_queue.h b/include/dftracer/utils/utilities/dlio/worker_queue.h
new file mode 100644
index 00000000..6210ce99
--- /dev/null
+++ b/include/dftracer/utils/utilities/dlio/worker_queue.h
@@ -0,0 +1,51 @@
+#ifndef DFTRACER_UTILS_UTILITIES_DLIO_WORKER_QUEUE_H
+#define DFTRACER_UTILS_UTILITIES_DLIO_WORKER_QUEUE_H
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <utility>
+#include <vector>
+
+namespace dftracer::utils::utilities::dlio {
+
+struct WorkInterval {
+    double start_time;
+    double end_time;
+    double preprocess_duration;
+};
+
+// Sampler returns {batch_time, preprocess_time}, where batch_time is the
+// wall-clock time a worker spends on the batch (preprocess + I/O, scaled).
+using BatchTimeSampler = std::function<std::pair<double, double>()>;
+
+class WorkerQueue {
+   public:
+    WorkerQueue(int num_workers, int prefetch_factor)
+        : num_workers_(num_workers),
+          prefetch_factor_(prefetch_factor),
+          queue_capacity_(static_cast<std::size_t>(num_workers) *
+                          static_cast<std::size_t>(prefetch_factor)) {}
+
+    std::vector<WorkInterval> produce_batches(double current_time,
+                                              const BatchTimeSampler& sampler);
+
+    // Returns time consumed (stall + base_overhead).
+    double consume_batch(double current_time, double base_overhead);
+
+    std::size_t queue_depth() const { return ready_batches_.size(); }
+    bool had_stall() const { return stall_count_ > 0; }
+    std::uint64_t stall_count() const { return stall_count_; }
+
+   private:
+    int num_workers_;
+    int prefetch_factor_;
+    std::size_t queue_capacity_;  // num_workers * prefetch_factor (as size_t)
+    std::uint64_t stall_count_ = 0;
+    std::vector<double> ready_batches_;  // sorted ready times
+    std::vector<double> worker_free_times_;
+};
+
+}  // namespace dftracer::utils::utilities::dlio
+
+#endif
diff --git a/include/dftracer/utils/utilities/dlio/yaml_emit.h b/include/dftracer/utils/utilities/dlio/yaml_emit.h
new file mode 100644
index 00000000..a0ad3634
--- /dev/null
+++ b/include/dftracer/utils/utilities/dlio/yaml_emit.h
@@ -0,0 +1,42 @@
+#ifndef DFTRACER_UTILS_UTILITIES_DLIO_YAML_EMIT_H
+#define DFTRACER_UTILS_UTILITIES_DLIO_YAML_EMIT_H
+
+#include <dftracer/utils/utilities/common/statistics/mixture.h>
+
+#include <iosfwd>
+#include <string>
+
+namespace dftracer::utils::utilities::dlio {
+
+using BestModel = ::dftracer::utils::utilities::common::statistics::BestModel;
+
+// Parameters needed to emit one timing block in the DLIO config.
+struct DlioTimingBlock {
+    BestModel model;
+    double max_bound = 0.0;  // seconds
+};
+
+// Renders the DLIO config YAML:
+//
+//   train:
+//     computation_time:
+//       type: <distribution>
+//       ...
+//       max_bound: <seconds>
+//   reader:
+//     preprocess_time:
+//       type: <distribution>
+//       ...
+//       max_bound: <seconds>
+//
+// Either or both blocks can be omitted by passing `nullptr`.
+std::string render_dlio_yaml(const DlioTimingBlock* computation,
+                             const DlioTimingBlock* preprocess);
+
+// Writes the YAML to `out` (e.g. an ofstream). Returns true on success.
+bool write_dlio_yaml(std::ostream& out, const DlioTimingBlock* computation,
+                     const DlioTimingBlock* preprocess);
+
+}  // namespace dftracer::utils::utilities::dlio
+
+#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 503bb86e..46829ecd 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -20,6 +20,8 @@ need_readerwriterqueue()
 need_concurrentqueue()
 need_tl_expected()
 need_unordered_dense()
+need_boost_math()
+need_yaml_cpp()
 
 if(DFTRACER_UTILS_ENABLE_ARROW)
   need_nanoarrow()
@@ -117,7 +119,14 @@ set(DFTRACER_UTILS_UTILITIES_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/ddsketch.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/log2_histogram.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/timestamp_histogram.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/distributions.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/mixture.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/json/json_value.cpp
+    # DLIO config generation
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/dlio/barrier_simulator.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/dlio/trace_loader.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/dlio/optimizer.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/dlio/yaml_emit.cpp
 )
 
 list(APPEND DFTRACER_UTILS_UTILITIES_SOURCES
@@ -130,6 +139,7 @@ list(APPEND DFTRACER_UTILS_UTILITIES_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/query/query.cpp
     # DFT Aggregators
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.cpp
@@ -469,6 +479,12 @@ foreach(variant shared static)
       link_nanoarrow(dftracer_utils_utilities_${variant} ${VARIANT_UPPER})
     endif()
 
+    # Boost.Math (standalone, header-only) for DLIO distribution fitting.
+    link_boost_math(dftracer_utils_utilities_${variant})
+
+    # yaml-cpp for DLIO config emit.
+    link_yaml_cpp(dftracer_utils_utilities_${variant})
+
     # Link zstd when ENABLE_ZSTD is on so headers propagate to consumers
     # (e.g. arrow ipc_writer.cpp guarded by DFTRACER_UTILS_ENABLE_ZSTD).
     if(DFTRACER_UTILS_ENABLE_ZSTD)
@@ -830,7 +846,8 @@ if(DFTRACER_UTILS_BUILD_BINARIES)
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_organize.cpp
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_reconstruct.cpp
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_server.cpp
-      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_comparator.cpp)
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_comparator.cpp
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_gen_dlio_config.cpp)
 
   set(DFTRACER_MPI_BINARIES "")
   if(DFTRACER_UTILS_ENABLE_MPI)
diff --git a/src/dftracer/utils/binaries/dftracer_aggregator.cpp b/src/dftracer/utils/binaries/dftracer_aggregator.cpp
index 6e1a4d12..03deb880 100644
--- a/src/dftracer/utils/binaries/dftracer_aggregator.cpp
+++ b/src/dftracer/utils/binaries/dftracer_aggregator.cpp
@@ -1,22 +1,15 @@
 #include <dftracer/utils/core/common/config.h>
+#include <dftracer/utils/core/common/logging.h>
 #include <dftracer/utils/core/coro/task.h>
-#include <dftracer/utils/core/pipeline/pipeline.h>
-#include <dftracer/utils/core/tasks/coro_scope.h>
-#include <dftracer/utils/core/tasks/task.h>
-#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
-#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h>
+#include <dftracer/utils/core/utils/timer.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/aggregators.h>
-#include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
-#include <dftracer/utils/utilities/indexer/index_builder_utility.h>
-#include <dftracer/utils/utilities/indexer/index_database.h>
 
-#include "common_cli.h"
-#include "dftracer/utils/core/utils/timer.h"
-#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
-#include <dftracer/utils/utilities/common/arrow/ipc_writer.h>
-#endif
 #include <sstream>
-#include <unordered_set>
+#include <string>
+#include <vector>
+
+#include "common_cli.h"
 
 using namespace dftracer::utils;
 using namespace dftracer::utils::utilities;
@@ -196,540 +189,146 @@ class AggregatorArgParse : public cli::ArgParse {
     }
 };
 
-// Write global config and per-file tracking entries.
-static void write_aggregation_tracking(
-    dftracer::utils::rocksdb::RocksDatabase* db,
-    const AggregationConfig& config,
-    const std::vector<std::string>& processed_files,
-    const std::string& index_path) {
-    namespace rcf = dftracer::utils::rocksdb::cf;
-
-    // Open index database to get file_ids
-    indexer::IndexDatabase idx_db(
-        index_path,
-        dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
-
-    auto batch = db->begin_batch();
-
-    // Write global config once
-    AggGlobalConfig global_cfg;
-    global_cfg.time_interval_us = config.time_interval_us;
-    global_cfg.config_hash = 0;
-    db->put(batch, rcf::AGGREGATION, std::string_view(AGG_GLOBAL_CONFIG_KEY, 2),
-            serialize_agg_global_config(global_cfg));
-
-    // Per-file: empty value (presence = aggregated)
-    for (const auto& file_path : processed_files) {
-        int file_id = idx_db.find_file(file_path);
-        if (file_id >= 0) {
-            auto key = make_agg_file_key(file_id);
-            db->put(batch, rcf::AGGREGATION, key, "");
-        }
-    }
+namespace {
 
-    db->commit_batch(batch);
+std::vector<std::string> split_csv(const std::string& str) {
+    // Split on ',' and drop empty fields; "" yields an empty vector.
+    std::vector<std::string> fields;
+    std::istringstream stream(str);
+    for (std::string field; std::getline(stream, field, ',');) {
+        if (field.empty()) continue;
+        fields.push_back(field);
+    }
+    return fields;
 }
 
-static coro::CoroTask<indexer::IndexBuildBatchResult> batch_index_and_aggregate(
-    CoroScope* scope, std::vector<std::string> file_paths,
-    std::string index_dir, std::size_t checkpoint_size, bool force_rebuild,
-    std::size_t parallelism, AggregationConfig agg_config,
-    std::shared_ptr<dftracer::utils::rocksdb::RocksDatabase> agg_db,
-    std::uint32_t config_hash) {
-    auto batch_config = std::make_shared<indexer::IndexBuildBatchConfig>();
-    batch_config->file_paths = std::move(file_paths);
-    batch_config->index_dir = std::move(index_dir);
-    batch_config->checkpoint_size = checkpoint_size;
-    batch_config->parallelism = parallelism;
-    batch_config->force_rebuild = force_rebuild;
-    batch_config->use_batch_write = true;
-
-    auto agg_config_ptr =
-        std::make_shared<AggregationConfig>(std::move(agg_config));
-    batch_config->dft_visitor_factory =
-        [agg_db, config_hash, agg_config_ptr](const std::string& file_path)
-        -> std::vector<std::unique_ptr<composites::dft::DftEventVisitor>> {
-        std::vector<std::unique_ptr<composites::dft::DftEventVisitor>> visitors;
-        visitors.push_back(std::make_unique<AggregationVisitor>(
-            agg_db, config_hash, *agg_config_ptr, file_path));
-        return visitors;
-    };
-
-    co_return co_await indexer::IndexBatchBuilderUtility::process(
-        scope, std::move(batch_config));
-}
+}  // namespace
 
-static PerfettoTraceWriterInput build_streaming_input(
-    EventAggregator* merger_ptr, const AggregationConfig* agg_config,
-    const std::string* output_file, bool compress_output, int compression_level,
-    PerfettoEventFormat event_format) {
-    auto global_tracker = merger_ptr->build_global_tracker();
-
-    PerfettoTraceWriterInput input;
-    input.output_path = *output_file;
-    input.aggregator = merger_ptr;
-    input.tracker = global_tracker.get();
-    input.agg_config = agg_config;
-    input.owned_tracker = std::move(global_tracker);
-    input.root_pids = input.tracker->get_root_pids();
-    input.compute_statistics = agg_config->compute_statistics;
-    input.compute_percentiles = agg_config->compute_percentiles;
-    input.percentiles = agg_config->percentiles;
-    input.compress = compress_output;
-    input.compression_level = compression_level;
-    input.format = event_format;
-
-    const auto& intervals = input.tracker->get_all_intervals();
-    if (!intervals.empty()) {
-        std::uint64_t global_min = UINT64_MAX;
-        std::uint64_t global_max = 0;
-        for (const auto& interval : intervals) {
-            global_min = std::min(global_min, interval.start_ts);
-            global_max = std::max(global_max, interval.end_ts);
-            auto& range = input.boundary_ranges[interval.name][interval.value];
-            if (range.ts == 0 && range.te == 0) {
-                range.ts = interval.start_ts;
-                range.te = interval.end_ts;
-            } else {
-                range.ts = std::min(range.ts, interval.start_ts);
-                range.te = std::max(range.te, interval.end_ts);
-            }
-        }
-        if (global_max > global_min) {
-            input.trace_duration = global_max - global_min;
-        }
-    }
+int main(int argc, char** argv) {
+    DFTRACER_UTILS_LOGGER_INIT();
 
-    return input;
-}
+    argparse::ArgumentParser program("dftracer_aggregator",
+                                     DFTRACER_UTILS_PACKAGE_VERSION);
+    program.add_description(
+        "Aggregate DFTracer events into time-series counters using streaming "
+        "coroutine pipeline with minimal memory footprint");
 
-static coro::CoroTask<int> run_aggregator(const AggregatorArgParse* cli) {
-    auto log_dir = cli->directory.value;
-    auto output_file = cli->output;
-    auto time_interval_ms = cli->time_interval;
-    std::uint64_t time_interval_us =
-        static_cast<std::uint64_t>(time_interval_ms * 1000.0);
-    const auto& group_keys_str = cli->group_keys;
-    const auto& metric_fields_str = cli->metric_fields;
-    const auto& query_str = cli->query_args.query;
-    auto force_rebuild = cli->indexing.force;
-    auto checkpoint_size = cli->indexing.checkpoint_size;
-    auto executor_threads = cli->pipeline.executor_threads;
-    auto index_dir = cli->indexing.index_dir;
-    auto compress_output = cli->compress;
-    auto compression_level = cli->compression_level;
-    const auto& boundary_events_str = cli->boundary_events;
-    auto no_track_parents = cli->no_track_parents;
-    const auto& event_format_str = cli->event_format;
-    auto compute_percentiles = cli->compute_percentiles;
-    const auto& percentiles_str = cli->percentiles;
-    auto relative_accuracy = cli->relative_accuracy;
-    const auto& output_format = cli->format;
-
-    if (!AggregationConfig::is_valid_format(output_format)) {
-        DFTRACER_UTILS_LOG_ERROR(
-            "Invalid output format: %s (supported: %s)", output_format.c_str(),
-            AggregationConfig::supported_formats_str().c_str());
-        co_return 1;
-    }
+    AggregatorArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;
 
+    // Resolve enum-like CLI strings.
     PerfettoEventFormat event_format = PerfettoEventFormat::COUNTER;
-    if (event_format_str == "async") {
+    if (cli.event_format == "async") {
         event_format = PerfettoEventFormat::ASYNC;
-    } else if (event_format_str == "regular") {
+    } else if (cli.event_format == "regular") {
         event_format = PerfettoEventFormat::REGULAR;
-    } else if (event_format_str != "counter") {
+    } else if (cli.event_format != "counter") {
         DFTRACER_UTILS_LOG_ERROR(
             "Invalid event format: %s (must be 'counter', 'async', or "
             "'regular')",
-            event_format_str.c_str());
-        co_return 1;
+            cli.event_format.c_str());
+        return 1;
     }
 
-    if (output_format == AggregationConfig::FORMAT_ARROW) {
+    // Output filename: append extension if missing.
+    std::string output_file = cli.output;
+    if (cli.format == AggregationConfig::FORMAT_ARROW) {
         constexpr std::string_view ext = ".arrows";
         if (output_file.size() < ext.size() ||
             output_file.substr(output_file.size() - ext.size()) != ext) {
             output_file += ext;
         }
-    } else if (compress_output) {
+    } else if (cli.compress) {
         if (output_file.size() < 3 ||
             output_file.substr(output_file.size() - 3) != ".gz") {
             output_file += ".gz";
         }
     }
 
-    auto split_string = [](const std::string& str) {
-        std::vector<std::string> result;
-        if (str.empty()) return result;
-        std::stringstream ss(str);
+    // Parse boundary events.
+    std::vector<BoundaryEventConfig> boundary_events;
+    {
+        std::stringstream ss(cli.boundary_events);
         std::string item;
         while (std::getline(ss, item, ',')) {
-            if (!item.empty()) {
-                result.push_back(item);
+            std::stringstream item_ss(item);
+            std::string event_name, value_field, output_name;
+            if (std::getline(item_ss, event_name, ':') &&
+                std::getline(item_ss, value_field, ':') &&
+                std::getline(item_ss, output_name, ':')) {
+                BoundaryEventConfig bec;
+                bec.event_name = event_name;
+                bec.value_field = value_field;
+                bec.output_name = output_name;
+                boundary_events.push_back(bec);
             }
         }
-        return result;
-    };
+    }
 
-    std::vector<std::string> group_keys = split_string(group_keys_str);
-    std::vector<std::string> metric_fields = split_string(metric_fields_str);
+    // Parse percentiles.
     std::vector<double> percentiles;
-    if (compute_percentiles) {
-        auto percentile_strs = split_string(percentiles_str);
-        for (const auto& p_str : percentile_strs) {
+    if (cli.compute_percentiles) {
+        for (const auto& p_str : split_csv(cli.percentiles)) {
             try {
                 double p = std::stod(p_str);
-                if (p >= 0.0 && p <= 1.0) {
-                    percentiles.push_back(p);
-                } else {
+                if (!(p >= 0.0 && p <= 1.0)) {  // negated form rejects NaN
                     DFTRACER_UTILS_LOG_ERROR(
                         "Invalid percentile value: %s (must be in [0.0, 1.0])",
                         p_str.c_str());
-                    co_return 1;
+                    return 1;
                 }
-            } catch (const std::exception& e) {
+                percentiles.push_back(p);
+            } catch (const std::exception&) {
                 DFTRACER_UTILS_LOG_ERROR("Failed to parse percentile: %s",
                                          p_str.c_str());
-                co_return 1;
+                return 1;
             }
         }
         if (percentiles.empty()) {
             DFTRACER_UTILS_LOG_ERROR(
                 "No valid percentiles specified with --compute-percentiles");
-            co_return 1;
-        }
-    }
-
-    log_dir = fs::absolute(log_dir).string();
-    output_file = fs::absolute(output_file).string();
-
-    std::printf("==========================================\n");
-    std::printf("DFTracer Aggregator (Streaming Pipeline)\n");
-    std::printf("==========================================\n");
-    std::printf("Arguments:\n");
-    std::printf("  Input directory: %s\n", log_dir.c_str());
-    std::printf("  Output file: %s\n", output_file.c_str());
-    std::printf("  Time interval: %.2f ms (%llu us)\n", time_interval_ms,
-                static_cast<unsigned long long>(time_interval_us));
-    std::printf("  Force rebuild: %s\n", force_rebuild ? "true" : "false");
-    std::printf("  Checkpoint size: %zu bytes (%.2f MB)\n", checkpoint_size,
-                static_cast<double>(checkpoint_size) / (1024.0 * 1024.0));
-    std::printf("  Executor threads: %zu\n", executor_threads);
-
-    if (!group_keys.empty()) {
-        std::printf("  Extra group keys: ");
-        for (std::size_t i = 0; i < group_keys.size(); ++i) {
-            std::printf("%s%s", group_keys[i].c_str(),
-                        i < group_keys.size() - 1 ? ", " : "\n");
-        }
-    }
-
-    if (!metric_fields.empty()) {
-        std::printf("  Custom metric fields: ");
-        for (std::size_t i = 0; i < metric_fields.size(); ++i) {
-            std::printf("%s%s", metric_fields[i].c_str(),
-                        i < metric_fields.size() - 1 ? ", " : "\n");
+            return 1;
         }
     }
 
-    std::printf("==========================================\n\n");
-
-    std::vector<BoundaryEventConfig> boundary_events;
-    if (!boundary_events_str.empty()) {
-        std::stringstream ss(boundary_events_str);
-        std::string item;
-        while (std::getline(ss, item, ',')) {
-            std::stringstream item_ss(item);
-            std::string event_name, value_field, output_name;
-
-            if (std::getline(item_ss, event_name, ':') &&
-                std::getline(item_ss, value_field, ':') &&
-                std::getline(item_ss, output_name, ':')) {
-                BoundaryEventConfig config;
-                config.event_name = event_name;
-                config.value_field = value_field;
-                config.output_name = output_name;
-                boundary_events.push_back(config);
-            }
-        }
+    if (!cli.query_args.query.empty()) {
+        DFTRACER_UTILS_LOG_WARN(
+            "--query is not yet supported in fused mode, ignoring");
     }
 
     AggregationConfig agg_config;
-    agg_config.time_interval_us = time_interval_us;
-    agg_config.extra_group_keys = group_keys;
-    agg_config.custom_metric_fields = metric_fields;
+    agg_config.time_interval_us =
+        static_cast<std::uint64_t>(cli.time_interval * 1000.0);
+    agg_config.extra_group_keys = split_csv(cli.group_keys);
+    agg_config.custom_metric_fields = split_csv(cli.metric_fields);
     agg_config.compute_statistics = true;
-    agg_config.compute_percentiles = compute_percentiles;
-    agg_config.sketch_accuracy = relative_accuracy;
+    agg_config.compute_percentiles = cli.compute_percentiles;
+    agg_config.sketch_accuracy = cli.relative_accuracy;
     agg_config.percentiles = percentiles;
     agg_config.boundary_events = boundary_events;
-    agg_config.track_process_parents = !no_track_parents;
-    agg_config.track_default_args = !cli->no_default_args;
-
-    if (!query_str.empty()) {
-        DFTRACER_UTILS_LOG_WARN(
-            "--query is not yet supported in fused mode, ignoring");
-    }
-
-    // Use hash=0 for simplicity (no config-based filtering)
-    constexpr std::uint32_t config_hash = 0;
+    agg_config.track_process_parents = !cli.no_track_parents;
+    agg_config.track_default_args = !cli.no_default_args;
 
     Timer stages_storage("dftracer_aggregator");
-    Timer* stages = cli->pipeline.time_profiling ? &stages_storage : nullptr;
-    Timer overall(true);
-
-    namespace idx = composites::dft::indexing;
-
-    auto scan_result = std::make_unique<idx::ResolverResult>();
-    {
-        ScopedTimer _t(stages, "scan_and_resolve");
-        idx::IndexResolverUtility resolver;
-        idx::ResolverInput input;
-        input.directory = log_dir;
-        input.index_dir = index_dir;
-        input.require_aggregation = !force_rebuild;
-        input.aggregation_config = agg_config;
-        *scan_result = co_await resolver.process(input);
-    }
-
-    auto& input_files = scan_result->all_files;
-    if (input_files.empty()) {
-        DFTRACER_UTILS_LOG_ERROR("No .pfw or .pfw.gz files found in: %s",
-                                 log_dir.c_str());
-        co_return 1;
-    }
-
-    DFTRACER_UTILS_LOG_INFO("Found %zu input files", input_files.size());
-
-    auto& shared_index_path = scan_result->index_path;
-
-    auto pipeline_config =
-        cli::build_pipeline_config("DFTracer Aggregator", cli->pipeline);
-
-    Pipeline pipeline(pipeline_config);
-
-    if (force_rebuild && fs::exists(shared_index_path)) {
-        DFTRACER_UTILS_LOG_INFO("Clearing shared index store: %s",
-                                shared_index_path.c_str());
-        fs::remove_all(shared_index_path);
-    }
-
-    std::shared_ptr<dftracer::utils::rocksdb::RocksDatabase> agg_db;
-    std::unique_ptr<EventAggregator> merger;
-    {
-        ScopedTimer _t(stages, "open_rocksdb");
-        agg_db = EventAggregator::open_with_merge_operator(shared_index_path);
-        merger = std::make_unique<EventAggregator>(agg_db, config_hash);
-    }
-
-    // Files to process: needs_checkpoint (index + aggregate) +
-    // needs_aggregation
-    const std::size_t num_needing_index = scan_result->needs_checkpoint.size();
-    const std::size_t num_needing_agg_only =
-        force_rebuild ? scan_result->cached.size()
-                      : scan_result->needs_aggregation.size();
-    const std::size_t num_cached =
-        force_rebuild ? 0 : scan_result->total_cached();
-
-    std::vector<std::string> files_to_process;
-    files_to_process.reserve(num_needing_index + num_needing_agg_only);
-    for (auto& item : scan_result->needs_checkpoint) {
-        files_to_process.push_back(std::move(item.file_path));
-    }
-    if (force_rebuild) {
-        for (auto& item : scan_result->cached) {
-            files_to_process.push_back(std::move(item.file_path));
-        }
-    } else {
-        for (auto& item : scan_result->needs_aggregation) {
-            files_to_process.push_back(std::move(item.file_path));
-        }
-    }
-
-    DFTRACER_UTILS_LOG_INFO(
-        "Files to process: %zu (%zu need indexing, %zu need aggregation only, "
-        "%zu cached)",
-        files_to_process.size(), num_needing_index, num_needing_agg_only,
-        num_cached);
-
-    bool write_success = false;
-    std::size_t total_keys = 0;
-    std::atomic<std::size_t> perfetto_keys_written{0};
-
-    auto main_task = make_task(
-        [&](CoroScope& scope) -> coro::CoroTask<void> {
-            if (!files_to_process.empty()) {
-                {
-                    ScopedTimer _t(stages, "index_and_aggregate");
-                    auto batch_result = co_await batch_index_and_aggregate(
-                        &scope, files_to_process, index_dir, checkpoint_size,
-                        force_rebuild, executor_threads, agg_config, agg_db,
-                        config_hash);
-
-                    {
-                        ScopedTimer _vd(stages, "visitor_drain");
-                        for (auto& file_visitors :
-                             batch_result.extra_visitors) {
-                            for (auto& visitor : file_visitors) {
-                                auto* agg_visitor =
-                                    dynamic_cast<AggregationVisitor*>(
-                                        visitor.get());
-                                if (agg_visitor) {
-                                    for (const auto& k :
-                                         agg_visitor->observed_extra_keys())
-                                        merger->add_observed_extra_key(k);
-                                    for (const auto& m :
-                                         agg_visitor->observed_custom_metrics())
-                                        merger->add_observed_custom_metric(m);
-                                    auto output = agg_visitor->take_output();
-                                    merger->merge_chunk(std::move(output));
-                                }
-                            }
-                            file_visitors.clear();
-                        }
-                    }
-                }
-
-                // Write tracking entries for processed files
-                {
-                    ScopedTimer _wt(stages, "write_tracking");
-                    write_aggregation_tracking(agg_db.get(), agg_config,
-                                               files_to_process,
-                                               shared_index_path);
-                }
-            }
-
-            ScopedTimer _pp(stages, "post_processing");
-
-#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
-            if (output_format == AggregationConfig::FORMAT_ARROW) {
-                using namespace utilities::common::arrow;
-
-                std::unique_ptr<AssociationTracker> global_tracker;
-                {
-                    ScopedTimer _bt(stages, "build_global_tracker");
-                    global_tracker = merger->build_global_tracker();
-                }
-                (void)global_tracker;
-
-                EventAggregator::ObservedColumns obs;
-                {
-                    ScopedTimer _oc(stages, "observed_columns");
-                    obs = merger->observed_columns();
-                }
-                auto& global_extra_key_ids = obs.extra_key_ids;
-                auto& global_custom_metric_names = obs.custom_metric_names;
-
-                IpcWriter ipc;
-                if (co_await ipc.open(output_file) != 0) {
-                    DFTRACER_UTILS_LOG_ERROR(
-                        "Failed to open Arrow IPC file: %s",
-                        output_file.c_str());
-                } else {
-                    ScopedTimer _aw(stages, "arrow_scan_write");
-                    constexpr std::size_t BATCH_ROWS = 10000;
-                    AggregationBatch batch;
-                    batch.entries.reserve(BATCH_ROWS);
-                    batch.global_extra_key_ids = &global_extra_key_ids;
-                    batch.global_custom_metric_names =
-                        &global_custom_metric_names;
-
-                    std::vector<ArrowExportResult> pending_batches;
-                    merger->scan([&](AggMapType, const AggregationKey& key,
-                                     AggregationMetrics& metrics) {
-                        total_keys++;
-                        batch.entries.emplace_back(key, std::move(metrics));
-                        if (batch.entries.size() >= BATCH_ROWS) {
-                            pending_batches.push_back(batch.to_arrow());
-                            batch.entries.clear();
-                        }
-                        return true;
-                    });
-                    if (!batch.entries.empty()) {
-                        pending_batches.push_back(batch.to_arrow());
-                    }
-
-                    write_success = true;
-                    for (auto& ab : pending_batches) {
-                        if (co_await ipc.write_batch(ab) != 0) {
-                            write_success = false;
-                            break;
-                        }
-                    }
-                    if (write_success) {
-                        write_success = (co_await ipc.close() == 0);
-                    } else {
-                        co_await ipc.close();
-                    }
-                }
-            } else
-#endif
-            {
-                PerfettoTraceWriterInput streaming_input;
-                {
-                    ScopedTimer _si(stages, "build_streaming_input");
-                    streaming_input = build_streaming_input(
-                        merger.get(), &agg_config, &output_file,
-                        compress_output, compression_level, event_format);
-                    streaming_input.keys_written = &perfetto_keys_written;
-                    streaming_input.merge_on_sharded = true;
-                }
-                {
-                    ScopedTimer _pw(stages, "perfetto_write");
-                    PerfettoTraceWriterUtility writer;
-                    write_success = co_await scope.spawn(
-                        writer, std::move(streaming_input));
-                }
-                total_keys = perfetto_keys_written.load();
-            }
-        },
-        "AggregatorMain");
-
-    pipeline.set_source(main_task);
-    {
-        ScopedTimer _t(stages, "pipeline_execute");
-        pipeline.execute();
-    }
-
-    {
-        ScopedTimer _t(stages, "close_rocksdb");
-        merger.reset();
-        agg_db.reset();
-    }
-
-    overall.stop();
-    double duration_ms = static_cast<double>(overall.elapsed()) / 1e6;
-
-    std::printf("\n");
-    std::printf("==========================================\n");
-    std::printf("Aggregation Results\n");
-    std::printf("==========================================\n");
-    std::printf("  Execution time: %.2f seconds\n", duration_ms / 1000.0);
-    std::printf("  Files: %zu total, %zu processed, %zu cached\n",
-                input_files.size(), files_to_process.size(), num_cached);
-    std::printf("  Unique aggregation keys: %zu\n", total_keys);
-    std::printf("  Output file: %s\n", output_file.c_str());
-    std::printf("  Write status: %s\n", write_success ? "SUCCESS" : "FAILED");
-    std::printf("==========================================\n");
-
-    if (stages) stages->print_stages();
-
-    co_return write_success ? 0 : 1;
-}
-
-int main(int argc, char** argv) {
-    DFTRACER_UTILS_LOGGER_INIT();
-
-    argparse::ArgumentParser program("dftracer_aggregator",
-                                     DFTRACER_UTILS_PACKAGE_VERSION);
-    program.add_description(
-        "Aggregate DFTracer events into time-series counters using streaming "
-        "coroutine pipeline with minimal memory footprint");
-
-    AggregatorArgParse cli(program);
-    cli.setup();
-    if (!cli.parse(argc, argv)) return 1;
-
-    return run_aggregator(&cli).get();
+    Timer* stages = cli.pipeline.time_profiling ? &stages_storage : nullptr;
+
+    AggregationRunInput input;
+    input.log_dir = cli.directory.value;
+    input.index_dir = cli.indexing.index_dir;
+    input.agg_config = std::move(agg_config);
+    input.pipeline_config =
+        cli::build_pipeline_config("DFTracer Aggregator", cli.pipeline);
+    input.output_file = std::move(output_file);
+    input.output_format = cli.format;
+    input.event_format = event_format;
+    input.compress_output = cli.compress;
+    input.compression_level = cli.compression_level;
+    input.force_rebuild = cli.indexing.force;
+    input.checkpoint_size = cli.indexing.checkpoint_size;
+    input.stages = stages;
+    input.verbose = true;
+
+    auto result = run_aggregation(std::move(input)).get();
+    return result.success ? 0 : 1;
 }
diff --git a/src/dftracer/utils/binaries/dftracer_gen_dlio_config.cpp b/src/dftracer/utils/binaries/dftracer_gen_dlio_config.cpp
new file mode 100644
index 00000000..ee171068
--- /dev/null
+++ b/src/dftracer/utils/binaries/dftracer_gen_dlio_config.cpp
@@ -0,0 +1,354 @@
+#include <dftracer/utils/core/common/config.h>
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/utils/timer.h>
+#include <dftracer/utils/utilities/common/statistics/distributions.h>
+#include <dftracer/utils/utilities/common/statistics/mixture.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.h>
+#include <dftracer/utils/utilities/dlio/barrier_simulator.h>
+#include <dftracer/utils/utilities/dlio/optimizer.h>
+#include <dftracer/utils/utilities/dlio/trace_loader.h>
+#include <dftracer/utils/utilities/dlio/yaml_emit.h>
+
+#include <algorithm>
+#include <cstdio>
+#include <exception>
+#include <fstream>
+#include <optional>
+#include <string>
+#include <type_traits>
+#include <variant>
+#include <vector>
+
+#include "common_cli.h"
+
+using namespace dftracer::utils;
+using namespace dftracer::utils::utilities;
+namespace agg = dftracer::utils::utilities::composites::dft::aggregators;
+namespace dlio = dftracer::utils::utilities::dlio;
+namespace stats = dftracer::utils::utilities::common::statistics;
+
+namespace {
+// CLI for dftracer_gen_dlio_config: shared arg groups + tool-specific flags.
+class GenDlioConfigArgParse : public cli::ArgParse {
+   public:
+    cli::DirectoryArgs directory{
+        cli::DirMode::DEFAULT_DOT,
+        "Input directory containing .pfw or .pfw.gz traces"};
+    cli::PipelineArgs pipeline;
+    cli::IndexingArgs indexing;
+    // Tool-specific settings; post_parse() overwrites them with parsed values.
+    std::string output;
+    double max_bound_percentile = 95.0;
+    int simulation_iterations = 5;
+    double target_e2e_error = 0.05;
+    double target_cdf_similarity = 0.90;
+    int patience = 10;
+    double epsilon = 1.0;
+    double momentum = 0.9;
+    double min_percentile = 50.0;
+    int num_workers = 8;
+    int prefetch_factor = 2;
+    std::uint64_t seed = 42;
+    std::uint64_t max_samples_per_entry = 100;
+    double time_interval = 5000.0;
+    // Overrides the indexing help strings, then passes the groups to schema().
+    explicit GenDlioConfigArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        indexing.index_dir_help =
+            "Directory to store index files (default: system temp directory)";
+        indexing.force_help = "Force index recreation";
+        schema(directory, pipeline, indexing);
+    }
+
+   protected:
+    void register_args() override {
+        parser()
+            .add_argument("-o", "--output")
+            .required()
+            .help("Output path for the DLIO YAML config.");
+        parser()
+            .add_argument("--max-bound-percentile")
+            .help("Initial max_bound percentile (0-100, default: 95)")
+            .scan<'g', double>()
+            .default_value(95.0);
+        parser()
+            .add_argument("--simulation-iterations")
+            .help(
+                "Max simulator iterations for percentile refinement (default: "
+                "5)")
+            .scan<'d', int>()
+            .default_value(5);
+        parser()
+            .add_argument("--target-e2e-error")
+            .help(
+                "Target relative E2E error to declare convergence (default: "
+                "0.05)")
+            .scan<'g', double>()
+            .default_value(0.05);
+        parser()
+            .add_argument("--target-cdf-similarity")
+            .help("Target fetch_block CDF similarity (default: 0.90)")
+            .scan<'g', double>()
+            .default_value(0.90);
+        parser()
+            .add_argument("--patience")
+            .help("Early-stop after this many iterations without improvement")
+            .scan<'d', int>()
+            .default_value(10);
+        parser()
+            .add_argument("--epsilon")
+            .help("Base step size for percentile adjustment (default: 1.0)")
+            .scan<'g', double>()
+            .default_value(1.0);
+        parser()
+            .add_argument("--momentum")
+            .help("Momentum factor in [0, 1) (default: 0.9)")
+            .scan<'g', double>()
+            .default_value(0.9);
+        parser()
+            .add_argument("--min-percentile")
+            .help("Floor on max_bound percentile (default: 50)")
+            .scan<'g', double>()
+            .default_value(50.0);
+        parser()
+            .add_argument("--num-workers")
+            .help("DataLoader worker count for the simulator (default: 8)")
+            .scan<'d', int>()
+            .default_value(8);
+        parser()
+            .add_argument("--prefetch-factor")
+            .help("DataLoader prefetch factor (default: 2)")
+            .scan<'d', int>()
+            .default_value(2);
+        parser()
+            .add_argument("--seed")
+            .help("Base seed for simulator + sampler (default: 42)")
+            .scan<'i', std::uint64_t>()
+            .default_value<std::uint64_t>(42);
+        parser()
+            .add_argument("--max-samples-per-entry")
+            .help(
+                "Cap on synthesized samples per AGGREGATION entry (default: "
+                "100)")
+            .scan<'i', std::uint64_t>()
+            .default_value<std::uint64_t>(100);
+        parser()
+            .add_argument("-t", "--time-interval")
+            .help("Aggregation time interval in ms (default: 5000)")
+            .scan<'g', double>()
+            .default_value(5000.0);
+    }
+    // Copies each parsed flag into its member; no range checks are done here.
+    void post_parse() override {
+        output = parser().get<std::string>("--output");
+        max_bound_percentile = parser().get<double>("--max-bound-percentile");
+        simulation_iterations = parser().get<int>("--simulation-iterations");
+        target_e2e_error = parser().get<double>("--target-e2e-error");
+        target_cdf_similarity = parser().get<double>("--target-cdf-similarity");
+        patience = parser().get<int>("--patience");
+        epsilon = parser().get<double>("--epsilon");
+        momentum = parser().get<double>("--momentum");
+        min_percentile = parser().get<double>("--min-percentile");
+        num_workers = parser().get<int>("--num-workers");
+        prefetch_factor = parser().get<int>("--prefetch-factor");
+        seed = parser().get<std::uint64_t>("--seed");
+        max_samples_per_entry =
+            parser().get<std::uint64_t>("--max-samples-per-entry");
+        time_interval = parser().get<double>("--time-interval");
+    }
+};
+// Best-scoring model for `data`; Gaussian mixtures are tried from 20 samples.
+std::optional<stats::BestModel> fit_best_model(
+    const std::vector<double>& data) {
+    if (data.empty()) return std::nullopt;
+    const auto single_fits = stats::fit_all_single_distributions(data);
+    std::vector<stats::FittedMixture> gmm_fits;
+    if (data.size() >= 20) {
+        for (const int components : {2, 3})
+            gmm_fits.push_back(stats::fit_gaussian_mixture(data, components));
+    }
+    const auto winner = stats::select_best_model(single_fits, gmm_fits);
+    if (winner) return winner->model;
+    return std::nullopt;
+}
+// Short display label for whichever alternative the variant `m` holds.
+const char* model_label(const stats::BestModel& m) {
+    return std::visit(
+        [](const auto& fitted) -> const char* {
+            using Model = std::decay_t<decltype(fitted)>;
+            if constexpr (!std::is_same_v<Model, stats::FittedDistribution>) {
+                return fitted.weights.size() == 2 ? "GMM-2" : "GMM-3";
+            } else {
+                switch (fitted.kind) {
+                    case stats::DistributionKind::Normal:
+                        return "Normal";
+                    case stats::DistributionKind::Lognormal:
+                        return "Lognormal";
+                    case stats::DistributionKind::Gamma:
+                        return "Gamma";
+                    case stats::DistributionKind::Exponential:
+                        return "Exponential";
+                    case stats::DistributionKind::Weibull:
+                        return "Weibull";
+                }
+                return "Unknown";
+            }
+        },
+        m);
+}
+
+}  // namespace
+
+// Entry point: aggregates the trace directory, fits timing distributions,
+// tunes the max_bound percentile with the simulator, and writes the DLIO YAML.
+int main(int argc, char** argv) {
+    DFTRACER_UTILS_LOGGER_INIT();
+    argparse::ArgumentParser program("dftracer_gen_dlio_config",
+                                     DFTRACER_UTILS_PACKAGE_VERSION);
+    program.add_description(
+        "Generate a DLIO YAML configuration from raw DFTracer logs. Indexes "
+        "and aggregates the input directory automatically; users do not need "
+        "to run dftracer_aggregator separately.");
+    GenDlioConfigArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;
+    // Negative/NaN intervals would make the unsigned cast below undefined.
+    if (!(cli.time_interval >= 0.0)) {
+        DFTRACER_UTILS_LOG_ERROR("--time-interval must be non-negative");
+        return 1;
+    }
+
+    // --- Aggregation phase: produce / reuse the AGGREGATION CF ---------------
+    agg::AggregationConfig agg_config;
+    agg_config.time_interval_us =
+        static_cast<std::uint64_t>(cli.time_interval * 1000.0);
+    agg_config.compute_statistics = true;
+    // DDSketch is required for high-fidelity DLIO config generation. Force it
+    // on so users don't have to remember the flag.
+    agg_config.compute_percentiles = true;
+    agg_config.sketch_accuracy = 0.01;
+    agg_config.percentiles = {0.25, 0.5, 0.75, 0.90};
+    agg_config.track_process_parents = true;
+    agg_config.track_default_args = true;
+    agg::AggregationRunInput run_input;
+    run_input.log_dir = cli.directory.value;
+    run_input.index_dir = cli.indexing.index_dir;
+    run_input.agg_config = std::move(agg_config);
+    run_input.pipeline_config =
+        cli::build_pipeline_config("DLIO Config Generator", cli.pipeline);
+    run_input.output_file = std::nullopt;  // populate AGGREGATION CF only
+    run_input.force_rebuild = cli.indexing.force;
+    run_input.checkpoint_size = cli.indexing.checkpoint_size;
+    run_input.verbose = true;
+    auto run_result = agg::run_aggregation(std::move(run_input)).get();
+    if (!run_result.success || run_result.index_path.empty()) {
+        DFTRACER_UTILS_LOG_ERROR(
+            "Aggregation failed; cannot generate DLIO config");
+        return 1;
+    }
+
+    // --- Load aggregated traces ---------------------------------------------
+    dlio::TraceLoaderOptions loader_opts;
+    loader_opts.max_samples_per_entry = cli.max_samples_per_entry;
+    loader_opts.seed = cli.seed;
+    dlio::AggregatedTraces traces;
+    try {
+        traces =
+            dlio::load_aggregated_traces(run_result.index_path, loader_opts);
+    } catch (const std::exception& e) {
+        DFTRACER_UTILS_LOG_ERROR("Failed to load AGGREGATION CF: %s", e.what());
+        return 1;
+    }
+    if (!traces.any_data) {
+        DFTRACER_UTILS_LOG_ERROR(
+            "No DLIO events (fetch.block / preprocess) found in %s",
+            cli.directory.value.c_str());
+        return 1;
+    }
+    std::printf("\n");
+    std::printf("==========================================\n");
+    std::printf("DLIO Config Generation\n");
+    std::printf("==========================================\n");
+    std::printf("  Loaded %d rank(s), %d step(s) from index at %s\n",
+                traces.num_ranks, traces.num_steps,
+                run_result.index_path.c_str());
+    std::printf("  computation_times: %zu samples (min %.6fs, max %.6fs)\n",
+                traces.computation_times.size(), traces.fetch_block_stats.min(),
+                traces.fetch_block_stats.max());
+    std::printf("  preprocess_times:  %zu samples (min %.6fs, max %.6fs)\n",
+                traces.preprocess_times.size(), traces.preprocess_stats.min(),
+                traces.preprocess_stats.max());
+
+    // --- Fit distributions ---------------------------------------------------
+    auto comp_model = fit_best_model(traces.computation_times);
+    auto prep_model = fit_best_model(traces.preprocess_times);
+    if (!comp_model) {
+        DFTRACER_UTILS_LOG_ERROR(
+            "Failed to fit a computation_time distribution");
+        return 1;
+    }
+    std::printf("  Best computation_time model: %s\n",
+                model_label(*comp_model));
+    if (prep_model) {
+        std::printf("  Best preprocess_time  model: %s\n",
+                    model_label(*prep_model));
+    } else {
+        std::printf("  No preprocess events; skipping preprocess block\n");
+    }
+
+    // --- Optimize max_bound percentile via simulator -------------------------
+    auto ctx = dlio::make_simulator_context(traces, cli.num_workers,
+                                            cli.prefetch_factor);
+    dlio::OptimizerOptions opts;
+    opts.max_iterations = cli.simulation_iterations;
+    opts.target_e2e_error = cli.target_e2e_error;
+    opts.target_cdf_similarity = cli.target_cdf_similarity;
+    opts.patience = cli.patience;
+    opts.epsilon = cli.epsilon;
+    opts.momentum = cli.momentum;
+    opts.min_percentile = cli.min_percentile;
+    opts.initial_percentile = cli.max_bound_percentile;
+    opts.base_seed = cli.seed;
+    auto opt = dlio::optimize_max_bound_percentile(
+        ctx, *comp_model, traces.computation_times, opts);
+    std::printf(
+        "  Optimizer: iters=%d, best_percentile=%.2f%%, e2e_error=%.2f%%, "
+        "fetch_block_cdf_similarity=%.4f%s\n",
+        opt.iterations_used, opt.best_percentile, opt.best.e2e_error * 100.0,
+        opt.best.fetch_block_cdf_similarity,
+        opt.converged ? " (converged)" : "");
+    auto comp_sorted = traces.computation_times;
+    std::sort(comp_sorted.begin(), comp_sorted.end());
+    const double comp_max_bound =
+        dlio::percentile(comp_sorted, opt.best_percentile);
+    double prep_max_bound = 0.0;
+    if (prep_model) {
+        auto prep_sorted = traces.preprocess_times;
+        std::sort(prep_sorted.begin(), prep_sorted.end());
+        prep_max_bound = dlio::percentile(prep_sorted, opt.best_percentile);
+    }
+
+    // --- Emit YAML -----------------------------------------------------------
+    dlio::DlioTimingBlock comp_block{*comp_model, comp_max_bound};
+    std::optional<dlio::DlioTimingBlock> prep_block_storage;
+    const dlio::DlioTimingBlock* prep_block_ptr = nullptr;
+    if (prep_model) {
+        prep_block_storage = dlio::DlioTimingBlock{*prep_model, prep_max_bound};
+        prep_block_ptr = &(*prep_block_storage);
+    }
+    std::ofstream out(cli.output);
+    if (!out) {
+        DFTRACER_UTILS_LOG_ERROR("Cannot open %s for writing",
+                                 cli.output.c_str());
+        return 1;
+    }
+    // Flush here so a buffered write failure is reported instead of success.
+    if (!dlio::write_dlio_yaml(out, &comp_block, prep_block_ptr) ||
+        !out.flush()) {
+        DFTRACER_UTILS_LOG_ERROR("Failed to write %s", cli.output.c_str());
+        return 1;
+    }
+    std::printf("  Wrote DLIO config: %s\n", cli.output.c_str());
+    std::printf("==========================================\n");
+    return 0;
+}
diff --git a/src/dftracer/utils/utilities/common/statistics/distributions.cpp b/src/dftracer/utils/utilities/common/statistics/distributions.cpp
new file mode 100644
index 00000000..546b0e41
--- /dev/null
+++ b/src/dftracer/utils/utilities/common/statistics/distributions.cpp
@@ -0,0 +1,448 @@
+#include <dftracer/utils/utilities/common/statistics/distributions.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <limits>
+#include <random>
+#include <stdexcept>
+
+// Boost.Math standalone is configured globally via -DBOOST_MATH_STANDALONE.
+#include <boost/math/distributions/exponential.hpp>
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/distributions/lognormal.hpp>
+#include <boost/math/distributions/normal.hpp>
+#include <boost/math/distributions/weibull.hpp>
+
+namespace dftracer::utils::utilities::common::statistics {
+
+namespace bm = boost::math;
+
+namespace {
+
+constexpr double kMinPositive = 1e-12;
+constexpr int kNewtonMaxIter = 100;
+constexpr double kNewtonTol = 1e-8;
+
+// Summary statistics of a sample, as filled in by summarize().
+struct SampleSummary {
+    std::size_t n = 0;      // number of samples seen
+    double mean = 0.0;      // arithmetic mean
+    double variance = 0.0;  // population variance (1/n)
+    double min = std::numeric_limits<double>::infinity();
+    double max = -std::numeric_limits<double>::infinity();
+    bool any_non_positive = false;  // true once any sample is <= 0
+};
+// One-pass Welford accumulation of count, mean, 1/n variance and range.
+SampleSummary summarize(const std::vector<double>& data) {
+    SampleSummary acc;
+    double sum_sq_dev = 0.0;
+    for (const double value : data) {
+        acc.any_non_positive = acc.any_non_positive || value <= 0.0;
+        acc.n += 1;
+        const double before = value - acc.mean;
+        acc.mean += before / static_cast<double>(acc.n);
+        sum_sq_dev += before * (value - acc.mean);
+        acc.min = std::min(acc.min, value);
+        acc.max = std::max(acc.max, value);
+    }
+    if (acc.n != 0) acc.variance = sum_sq_dev / static_cast<double>(acc.n);
+    return acc;
+}
+// Kolmogorov-Smirnov distance of `dist_cdf` from ascending `sorted_data`.
+template <typename DistCdf>
+double ks_statistic(const std::vector<double>& sorted_data,
+                    DistCdf&& dist_cdf) {
+    const std::size_t count = sorted_data.size();
+    const auto n = static_cast<double>(count);
+    double worst = 0.0;
+    // The empirical CDF steps from i/n up to (i+1)/n at the i-th sorted point,
+    // so the model CDF is checked against both edges of every step.
+    for (std::size_t i = 0; i != count; ++i) {
+        const double model = dist_cdf(sorted_data[i]);
+        const double below = static_cast<double>(i) / n;
+        const double above = static_cast<double>(i + 1) / n;
+        worst = std::max(worst, std::abs(model - below));
+        worst = std::max(worst, std::abs(above - model));
+    }
+    return worst;
+}
+// Log-likelihood of `data`; -inf if any density is <= 0 or not finite.
+template <typename DistPdf>
+double log_likelihood(const std::vector<double>& data, DistPdf&& dist_pdf) {
+    double total = 0.0;
+    for (const double sample : data) {
+        const double density = dist_pdf(sample);
+        const bool usable = std::isfinite(density) && density > 0.0;
+        if (!usable) return -std::numeric_limits<double>::infinity();
+        total += std::log(density);
+    }
+    return total;
+}
+// Bayesian information criterion: k * ln(n) - 2 * log_l.
+double compute_bic(double log_l, std::size_t n, int k) {
+    const double ln_n = std::log(static_cast<double>(n));
+    return static_cast<double>(k) * ln_n - 2.0 * log_l;
+}
+
+// ---- Per-distribution MLE -------------------------------------------------
+// Normal fit (mean, sqrt of 1/n variance); unfitted if n < 2 or var <= 0.
+FittedDistribution fit_normal(const std::vector<double>& data,
+                              const SampleSummary& s) {
+    FittedDistribution f;
+    f.kind = DistributionKind::Normal;
+    if (s.n < 2 || s.variance <= 0.0) return f;
+    const double sigma = std::sqrt(s.variance);
+    f.params = {s.mean, sigma, 0.0};
+    f.valid = true;
+    const bm::normal model(s.mean, sigma);
+    const auto model_cdf = [&model](double x) { return bm::cdf(model, x); };
+    const auto model_pdf = [&model](double x) { return bm::pdf(model, x); };
+    // The KS statistic needs ascending data; the likelihood does not.
+    auto ascending = data;
+    std::sort(ascending.begin(), ascending.end());
+    f.ks_stat = ks_statistic(ascending, model_cdf);
+    f.log_likelihood = log_likelihood(data, model_pdf);
+    f.bic = compute_bic(f.log_likelihood, s.n, free_parameter_count(f.kind));
+    return f;
+}
+// Lognormal fit from the mean and 1/n variance of ln(x); needs n >= 2, x > 0.
+FittedDistribution fit_lognormal(const std::vector<double>& data,
+                                 const SampleSummary& s) {
+    FittedDistribution f;
+    f.kind = DistributionKind::Lognormal;
+    if (s.n < 2 || s.any_non_positive) return f;
+    const auto n = static_cast<double>(s.n);
+
+    double log_sum = 0.0;
+    for (const double x : data) log_sum += std::log(x);
+    const double mean_log = log_sum / n;
+
+    double sq_dev_sum = 0.0;
+    for (const double x : data) {
+        const double dev = std::log(x) - mean_log;
+        sq_dev_sum += dev * dev;
+    }
+    const double var_log = sq_dev_sum / n;
+    if (var_log <= 0.0) return f;
+
+    const double sigma = std::sqrt(var_log);
+    f.params = {mean_log, sigma, 0.0};
+    f.valid = true;
+    const bm::lognormal model(mean_log, sigma);
+    const auto model_cdf = [&model](double x) { return bm::cdf(model, x); };
+    const auto model_pdf = [&model](double x) { return bm::pdf(model, x); };
+    auto ascending = data;
+    std::sort(ascending.begin(), ascending.end());
+    f.ks_stat = ks_statistic(ascending, model_cdf);
+    f.log_likelihood = log_likelihood(data, model_pdf);
+    f.bic = compute_bic(f.log_likelihood, s.n, free_parameter_count(f.kind));
+    return f;
+}
+// Exponential fit, rate = 1/mean; unfitted if data is empty or has x <= 0.
+FittedDistribution fit_exponential(const std::vector<double>& data,
+                                   const SampleSummary& s) {
+    FittedDistribution f;
+    f.kind = DistributionKind::Exponential;
+    if (s.n < 1 || s.mean <= 0.0 || s.any_non_positive) return f;
+    const double rate = 1.0 / s.mean;
+    f.params = {rate, 0.0, 0.0};
+    f.valid = true;
+    const bm::exponential model(rate);
+    const auto model_cdf = [&model](double x) { return bm::cdf(model, x); };
+    const auto model_pdf = [&model](double x) { return bm::pdf(model, x); };
+    // The KS statistic needs ascending data; the likelihood does not.
+    auto ascending = data;
+    std::sort(ascending.begin(), ascending.end());
+    f.ks_stat = ks_statistic(ascending, model_cdf);
+    f.log_likelihood = log_likelihood(data, model_pdf);
+    f.bic = compute_bic(f.log_likelihood, s.n, free_parameter_count(f.kind));
+    return f;
+}
+
+// Gamma MLE: the shape k has no closed form. Start from the method-of-moments
+// estimate (mean^2 / variance) and refine it with Newton-Raphson on
+//
+//   d/dk log L = n*ln(k/mean) - n*digamma(k) + sum(ln x_i) = 0
+//
+// i.e. ln(k) - digamma(k) = ln(mean) - mean(ln x); scale is theta = mean / k.
+// NOTE(review): bm::digamma / bm::trigamma are called below but only the
+// Boost distribution headers are included in this file -- confirm includes.
+FittedDistribution fit_gamma(const std::vector<double>& data,
+                             const SampleSummary& s) {
+    FittedDistribution f;
+    f.kind = DistributionKind::Gamma;
+    if (s.n < 2 || s.any_non_positive || s.variance <= 0.0) return f;
+
+    // Method-of-moments initial estimate.
+    double k = s.mean * s.mean / s.variance;
+    if (k <= 0.0 || !std::isfinite(k)) return f;
+
+    double sum_log = 0.0;
+    for (double x : data) sum_log += std::log(x);
+    const double mean_log = sum_log / static_cast<double>(s.n);
+    const double log_mean = std::log(s.mean);
+    // rhs = log_mean - mean_log; for k > 0 the MLE shape satisfies
+    //   ln(k) - digamma(k) = rhs.
+    const double rhs = log_mean - mean_log;
+    if (rhs <= 0.0) {
+        // Degenerate data: keep the method-of-moments k as-is.
+    } else {
+        for (int it = 0; it < kNewtonMaxIter; ++it) {
+            const double g = std::log(k) - bm::digamma(k) - rhs;
+            const double gp = 1.0 / k - bm::trigamma(k);
+            if (!std::isfinite(g) || !std::isfinite(gp) || gp == 0.0) break;
+            const double dk = g / gp;
+            k -= dk;
+            if (k <= kMinPositive) {
+                k = kMinPositive;  // pin k just above zero and stop
+                break;
+            }
+            if (std::abs(dk) < kNewtonTol) break;
+        }
+    }
+    const double theta = s.mean / k;
+    if (k <= 0.0 || theta <= 0.0) return f;
+    f.params = {k, theta, 0.0};
+    f.valid = true;
+
+    bm::gamma_distribution<double> dist(k, theta);
+    auto sorted = data;
+    std::sort(sorted.begin(), sorted.end());
+    f.ks_stat =
+        ks_statistic(sorted, [&](double x) { return bm::cdf(dist, x); });
+    f.log_likelihood =
+        log_likelihood(data, [&](double x) { return bm::pdf(dist, x); });
+    f.bic = compute_bic(f.log_likelihood, s.n, free_parameter_count(f.kind));
+    return f;
+}
+
+// Weibull MLE: shape `k` is the root of
+//   f(k) = sum(x^k ln x) / sum(x^k) - 1/k - mean(ln x) = 0
+// found by Newton-Raphson from k = 1; the scale is (mean(x^k))^(1/k).
+FittedDistribution fit_weibull(const std::vector<double>& data,
+                               const SampleSummary& s) {
+    FittedDistribution f;
+    f.kind = DistributionKind::Weibull;
+    if (s.n < 2 || s.any_non_positive || s.variance <= 0.0) return f;
+
+    double sum_log = 0.0;
+    for (double x : data) sum_log += std::log(x);
+    const double mean_log = sum_log / static_cast<double>(s.n);
+
+    // Newton starts from a fixed shape of 1.0 (no data-driven initial guess).
+    double k = 1.0;
+
+    for (int it = 0; it < kNewtonMaxIter; ++it) {
+        double s_xk = 0.0, s_xk_lnx = 0.0, s_xk_lnx2 = 0.0;
+        for (double x : data) {
+            const double lx = std::log(x);
+            const double xk = std::pow(x, k);
+            s_xk += xk;
+            s_xk_lnx += xk * lx;
+            s_xk_lnx2 += xk * lx * lx;
+        }
+        if (s_xk <= 0.0 || !std::isfinite(s_xk)) return f;
+        const double a = s_xk_lnx / s_xk;
+        const double a_prime =
+            (s_xk_lnx2 * s_xk - s_xk_lnx * s_xk_lnx) / (s_xk * s_xk);
+        const double g = a - 1.0 / k - mean_log;
+        const double gp = a_prime + 1.0 / (k * k);
+        if (!std::isfinite(g) || !std::isfinite(gp) || gp == 0.0) break;
+        const double dk = g / gp;
+        k -= dk;
+        if (k <= kMinPositive) {
+            k = kMinPositive;
+            break;
+        }
+        if (std::abs(dk) < kNewtonTol) break;
+    }
+    // x^k can overflow: a non-finite scale must not reach bm::weibull below.
+    double s_xk = 0.0;
+    for (double x : data) s_xk += std::pow(x, k);
+    const double lambda = std::pow(s_xk / static_cast<double>(s.n), 1.0 / k);
+    if (!std::isfinite(lambda) || k <= 0.0 || lambda <= 0.0) return f;
+    f.params = {k, lambda, 0.0};
+    f.valid = true;
+
+    bm::weibull dist(k, lambda);
+    auto sorted = data;
+    std::sort(sorted.begin(), sorted.end());
+    f.ks_stat =
+        ks_statistic(sorted, [&](double x) { return bm::cdf(dist, x); });
+    f.log_likelihood =
+        log_likelihood(data, [&](double x) { return bm::pdf(dist, x); });
+    f.bic = compute_bic(f.log_likelihood, s.n, free_parameter_count(f.kind));
+    return f;
+}
+
+}  // namespace
+// Display name for a distribution family; "Unknown" for unlisted values.
+std::string_view distribution_name(DistributionKind k) {
+    switch (k) {
+        case DistributionKind::Exponential:
+            return "Exponential";
+        case DistributionKind::Gamma:
+            return "Gamma";
+        case DistributionKind::Lognormal:
+            return "Lognormal";
+        case DistributionKind::Normal:
+            return "Normal";
+        case DistributionKind::Weibull:
+            return "Weibull";
+    }
+    return "Unknown";
+}
+// Number of fitted parameters per family (passed to compute_bic as k).
+int free_parameter_count(DistributionKind kind) {
+    switch (kind) {
+        case DistributionKind::Exponential:
+            return 1;
+        case DistributionKind::Normal:
+        case DistributionKind::Lognormal:
+        case DistributionKind::Gamma:
+        case DistributionKind::Weibull:
+            return 2;
+    }
+    return 0;
+}
+// Fits one family to `data`, computing the sample summary on every call.
+FittedDistribution fit_single_distribution(DistributionKind kind,
+                                           const std::vector<double>& data) {
+    const SampleSummary summary = summarize(data);
+    switch (kind) {
+        case DistributionKind::Weibull:
+            return fit_weibull(data, summary);
+        case DistributionKind::Exponential:
+            return fit_exponential(data, summary);
+        case DistributionKind::Gamma:
+            return fit_gamma(data, summary);
+        case DistributionKind::Lognormal:
+            return fit_lognormal(data, summary);
+        case DistributionKind::Normal:
+            return fit_normal(data, summary);
+    }
+    return {};
+}
+// Fits all five families to `data`; valid fits first, each group by rising KS.
+std::vector<FittedDistribution> fit_all_single_distributions(
+    const std::vector<double>& data) {
+    const auto summary = summarize(data);
+    std::vector<FittedDistribution> candidates{
+        fit_normal(data, summary),      fit_lognormal(data, summary),
+        fit_gamma(data, summary),       fit_exponential(data, summary),
+        fit_weibull(data, summary),
+    };
+
+    const auto ranks_before = [](const FittedDistribution& lhs,
+                                 const FittedDistribution& rhs) {
+        // Valid fits outrank invalid ones; ties fall back to the KS statistic.
+        if (lhs.valid != rhs.valid) return lhs.valid;
+        return lhs.ks_stat < rhs.ks_stat;
+    };
+    std::sort(candidates.begin(), candidates.end(), ranks_before);
+    return candidates;
+}
+// Returns the first valid entry of `fits`, or nullopt when none is valid.
+std::optional<FittedDistribution> best_fit_by_ks(
+    const std::vector<FittedDistribution>& fits) {
+    const auto hit = std::find_if(
+        fits.begin(), fits.end(), [](const auto& f) { return f.valid; });
+    if (hit == fits.end()) return std::nullopt;
+    return *hit;
+}
+// Density of the fitted distribution at x, evaluated through Boost.Math.
+double pdf(const FittedDistribution& fit, double x) {
+    const auto a = fit.params[0];
+    const auto b = fit.params[1];
+    switch (fit.kind) {
+        case DistributionKind::Normal:
+            return bm::pdf(bm::normal(a, b), x);
+        case DistributionKind::Lognormal:
+            return bm::pdf(bm::lognormal(a, b), x);
+        case DistributionKind::Gamma:
+            return bm::pdf(bm::gamma_distribution<double>(a, b), x);
+        case DistributionKind::Exponential:
+            return bm::pdf(bm::exponential(a), x);
+        case DistributionKind::Weibull:
+            return bm::pdf(bm::weibull(a, b), x);
+    }
+    return 0.0;
+}
+// Cumulative probability P(X <= x) of the fitted distribution via Boost.Math.
+double cdf(const FittedDistribution& fit, double x) {
+    const auto a = fit.params[0];
+    const auto b = fit.params[1];
+    switch (fit.kind) {
+        case DistributionKind::Normal:
+            return bm::cdf(bm::normal(a, b), x);
+        case DistributionKind::Lognormal:
+            return bm::cdf(bm::lognormal(a, b), x);
+        case DistributionKind::Gamma:
+            return bm::cdf(bm::gamma_distribution<double>(a, b), x);
+        case DistributionKind::Exponential:
+            return bm::cdf(bm::exponential(a), x);
+        case DistributionKind::Weibull:
+            return bm::cdf(bm::weibull(a, b), x);
+    }
+    return 0.0;
+}
+// Inverse CDF of the fitted distribution at probability p via Boost.Math.
+double quantile(const FittedDistribution& fit, double p) {
+    const auto a = fit.params[0];
+    const auto b = fit.params[1];
+    switch (fit.kind) {
+        case DistributionKind::Normal:
+            return bm::quantile(bm::normal(a, b), p);
+        case DistributionKind::Lognormal:
+            return bm::quantile(bm::lognormal(a, b), p);
+        case DistributionKind::Gamma:
+            return bm::quantile(bm::gamma_distribution<double>(a, b), p);
+        case DistributionKind::Exponential:
+            return bm::quantile(bm::exponential(a), p);
+        case DistributionKind::Weibull:
+            return bm::quantile(bm::weibull(a, b), p);
+    }
+    return 0.0;
+}
+// Builds a sampler that draws from `fit` via <random>, clamped to the optional
+// [min_bound, max_bound] range. Throws std::invalid_argument if `fit` is not
+// valid or if min_bound exceeds max_bound (std::clamp would be undefined).
+Sampler make_sampler(const FittedDistribution& fit,
+                     std::optional<double> min_bound,
+                     std::optional<double> max_bound) {
+    if (!fit.valid) {
+        throw std::invalid_argument(
+            "make_sampler called with invalid FittedDistribution");
+    }
+    const double lo =
+        min_bound.value_or(-std::numeric_limits<double>::infinity());
+    const double hi =
+        max_bound.value_or(std::numeric_limits<double>::infinity());
+    if (hi < lo) throw std::invalid_argument("min_bound exceeds max_bound");
+    const auto p0 = fit.params[0];
+    const auto p1 = fit.params[1];
+    const auto kind = fit.kind;
+    auto draw = [kind, p0, p1](std::mt19937_64& rng) -> double {
+        switch (kind) {
+            case DistributionKind::Normal:
+                return std::normal_distribution<double>(p0, p1)(rng);
+            case DistributionKind::Lognormal:
+                return std::lognormal_distribution<double>(p0, p1)(rng);
+            case DistributionKind::Gamma:
+                return std::gamma_distribution<double>(p0, p1)(rng);
+            case DistributionKind::Exponential:
+                return std::exponential_distribution<double>(p0)(rng);
+            case DistributionKind::Weibull:
+                return std::weibull_distribution<double>(p0, p1)(rng);
+        }
+        return 0.0;
+    };
+
+    if (!min_bound && !max_bound) return draw;
+    return [draw, lo, hi](std::mt19937_64& rng) {
+        return std::clamp(draw(rng), lo, hi);
+    };
+}
+
+}  // namespace dftracer::utils::utilities::common::statistics
diff --git a/src/dftracer/utils/utilities/common/statistics/mixture.cpp b/src/dftracer/utils/utilities/common/statistics/mixture.cpp
new file mode 100644
index 00000000..3518fdf8
--- /dev/null
+++ b/src/dftracer/utils/utilities/common/statistics/mixture.cpp
@@ -0,0 +1,259 @@
+#include <dftracer/utils/utilities/common/statistics/mixture.h>
+
+#include <algorithm>
+#include <boost/math/distributions/normal.hpp>
+#include <cmath>
+#include <limits>
+#include <numeric>
+#include <random>
+#include <stdexcept>
+
+namespace dftracer::utils::utilities::common::statistics {
+
+namespace bm = boost::math;
+
+namespace {
+
+// 1 / sqrt(2*pi): the normalising constant of the standard normal density.
+constexpr double kInvSqrt2Pi = 0.3989422804014327;
+
+// Density of a normal distribution with the given mean and stddev at x.
+// Nothing is validated here; with stddev == 0 the result is not finite and
+// the caller has to deal with it.
+inline double normal_pdf(double x, double mean, double stddev) {
+    const double scale = kInvSqrt2Pi / stddev;
+    const double z = (x - mean) / stddev;
+    return scale * std::exp(-0.5 * z * z);
+}
+
+// Numerically stable log(sum_k exp(log_x[k])): the largest term is factored
+// out before exponentiating so the sum cannot underflow to zero just because
+// all terms are very negative. Returns the maximum itself when it is not
+// finite (this includes -inf for an empty input).
+double log_sum_exp(const std::vector<double>& log_x) {
+    double peak = -std::numeric_limits<double>::infinity();
+    for (const double term : log_x) {
+        peak = (term > peak) ? term : peak;
+    }
+    if (!std::isfinite(peak)) {
+        return peak;
+    }
+    const double scaled_sum = std::accumulate(
+        log_x.begin(), log_x.end(), 0.0, [peak](double acc, double term) {
+            return acc + std::exp(term - peak);
+        });
+    return peak + std::log(scaled_sum);
+}
+
+// Seed the K component means from the data itself: component k starts at the
+// (k + 0.5) / K quantile of the already sorted samples. Spreading the seeds
+// over the data range keeps the components from all starting at the same
+// place. `sorted` must be non-empty (the index is clamped to size() - 1).
+std::vector<double> initial_means(const std::vector<double>& sorted, int K) {
+    const std::size_t last_index = sorted.size() - 1;
+    const auto quantile_index = [&sorted, K](int k) {
+        const double q =
+            (static_cast<double>(k) + 0.5) / static_cast<double>(K);
+        return static_cast<std::size_t>(q * sorted.size());
+    };
+
+    std::vector<double> seeds;
+    seeds.reserve(K);
+    for (int k = 0; k < K; ++k) {
+        const std::size_t pick =
+            std::min<std::size_t>(last_index, quantile_index(k));
+        seeds.push_back(sorted[pick]);
+    }
+    return seeds;
+}
+
+// Mean squared deviation of `data` around the supplied `mean`. The divisor is
+// data.size() (not size() - 1). `data` is expected to be non-empty; an empty
+// input evaluates 0 / 0.
+double sample_variance(const std::vector<double>& data, double mean) {
+    const double squared_deviations = std::accumulate(
+        data.begin(), data.end(), 0.0, [mean](double acc, double value) {
+            const double offset = value - mean;
+            return acc + offset * offset;
+        });
+    return squared_deviations / static_cast<double>(data.size());
+}
+
+}  // namespace
+
+// Fit a K-component univariate Gaussian mixture to `data` with EM.
+//
+// Initialisation: equal weights, means at evenly spaced quantiles of the
+// sorted data, and every stddev set to the overall (floored) data stddev.
+// Iteration stops after options.max_iter rounds, or as soon as the
+// log-likelihood changes by less than options.tol between two rounds.
+// Component variances are never allowed below options.variance_floor.
+//
+// Returns a default-constructed FittedMixture when K <= 0 or when fewer than
+// 2 * K samples are supplied. Otherwise `valid` reports whether the final
+// log-likelihood is finite, and `bic` is
+// free_parameter_count * log(N) - 2 * log_likelihood.
+FittedMixture fit_gaussian_mixture(const std::vector<double>& data, int K,
+                                   const MixtureFitOptions& options) {
+    FittedMixture m;
+    if (K <= 0 || data.size() < static_cast<std::size_t>(K) * 2) return m;
+
+    auto sorted = data;
+    std::sort(sorted.begin(), sorted.end());
+
+    const double total_mean = std::accumulate(data.begin(), data.end(), 0.0) /
+                              static_cast<double>(data.size());
+    const double total_var =
+        std::max(sample_variance(data, total_mean), options.variance_floor);
+    const double total_stddev = std::sqrt(total_var);
+
+    m.weights.assign(K, 1.0 / static_cast<double>(K));
+    m.components.resize(K);
+    const auto means_init = initial_means(sorted, K);
+    for (int k = 0; k < K; ++k) {
+        m.components[k].mean = means_init[k];
+        m.components[k].stddev = total_stddev;
+    }
+
+    // log(weight_k * N(x | mean_k, stddev_k)) evaluated entirely in log
+    // space. Taking std::log of the density instead underflows to log(0) for
+    // points far out in a component's tail; once that happens for every
+    // component the responsibilities become NaN and poison the M-step.
+    constexpr double kNegInf = -std::numeric_limits<double>::infinity();
+    const double log_norm = std::log(kInvSqrt2Pi);
+    const auto log_weighted_density = [&m, log_norm](int k,
+                                                     double x) -> double {
+        const double w = m.weights[k];
+        const double sd = m.components[k].stddev;
+        // Empty or degenerate components contribute nothing.
+        if (!(w > 0.0) || !(sd > 0.0) || !std::isfinite(sd)) return kNegInf;
+        const double z = (x - m.components[k].mean) / sd;
+        const double v = std::log(w) + log_norm - std::log(sd) - 0.5 * z * z;
+        if (!std::isfinite(v)) return kNegInf;
+        return v;
+    };
+
+    const std::size_t N = data.size();
+    std::vector<std::vector<double>> resp(K, std::vector<double>(N, 0.0));
+    double prev_ll = -std::numeric_limits<double>::infinity();
+
+    std::vector<double> log_comp(K);
+    for (int it = 0; it < options.max_iter; ++it) {
+        // E-step: responsibilities via log-sum-exp.
+        double ll = 0.0;
+        for (std::size_t i = 0; i < N; ++i) {
+            const double x = data[i];
+            for (int k = 0; k < K; ++k) {
+                log_comp[k] = log_weighted_density(k, x);
+            }
+            const double lse = log_sum_exp(log_comp);
+            ll += lse;
+            // If no component can explain x (lse == -inf) assign zero
+            // responsibility rather than exp(-inf - -inf) == NaN.
+            const bool explained = std::isfinite(lse);
+            for (int k = 0; k < K; ++k) {
+                resp[k][i] = explained ? std::exp(log_comp[k] - lse) : 0.0;
+            }
+        }
+
+        // M-step.
+        for (int k = 0; k < K; ++k) {
+            double n_k = 0.0;
+            for (std::size_t i = 0; i < N; ++i) n_k += resp[k][i];
+
+            // Guard against an empty component: drop its weight and reset
+            // its spread to the overall data stddev; its mean is kept.
+            if (n_k < 1e-12) {
+                m.weights[k] = 0.0;
+                m.components[k].stddev = total_stddev;
+                continue;
+            }
+
+            double mean = 0.0;
+            for (std::size_t i = 0; i < N; ++i) mean += resp[k][i] * data[i];
+            mean /= n_k;
+
+            double var = 0.0;
+            for (std::size_t i = 0; i < N; ++i) {
+                const double d = data[i] - mean;
+                var += resp[k][i] * d * d;
+            }
+            var /= n_k;
+            if (var < options.variance_floor) var = options.variance_floor;
+
+            m.weights[k] = n_k / static_cast<double>(N);
+            m.components[k].mean = mean;
+            m.components[k].stddev = std::sqrt(var);
+        }
+
+        m.iterations = it + 1;
+        // `ll` was computed with the parameters from before this M-step.
+        if (std::isfinite(ll) && std::abs(ll - prev_ll) < options.tol) {
+            m.converged = true;
+            prev_ll = ll;
+            break;
+        }
+        prev_ll = ll;
+    }
+
+    // Final log-likelihood pass (uses the converged parameters).
+    double ll = 0.0;
+    for (double x : data) {
+        for (int k = 0; k < K; ++k) {
+            log_comp[k] = log_weighted_density(k, x);
+        }
+        ll += log_sum_exp(log_comp);
+    }
+    m.log_likelihood = ll;
+    m.bic = static_cast<double>(free_parameter_count(m)) *
+                std::log(static_cast<double>(N)) -
+            2.0 * ll;
+    m.valid = std::isfinite(ll);
+    return m;
+}
+
+// Free-parameter count used for the mixture's BIC: 3 * K - 1, where K is the
+// number of entries in mix.weights.
+int free_parameter_count(const FittedMixture& mix) {
+    const auto component_count = static_cast<int>(mix.weights.size());
+    return component_count * 3 - 1;
+}
+
+// Mixture density at x: the weight-scaled sum of the component normal
+// densities. Components are indexed in step with mix.weights.
+double pdf(const FittedMixture& mix, double x) {
+    double density = 0.0;
+    for (std::size_t i = 0; i != mix.weights.size(); ++i) {
+        const auto& component = mix.components[i];
+        const double component_density =
+            normal_pdf(x, component.mean, component.stddev);
+        density += mix.weights[i] * component_density;
+    }
+    return density;
+}
+
+// Mixture CDF at x: the weight-scaled sum of the component normal CDFs, each
+// evaluated through boost::math::normal.
+double cdf(const FittedMixture& mix, double x) {
+    double cumulative = 0.0;
+    for (std::size_t i = 0; i != mix.weights.size(); ++i) {
+        const auto& component = mix.components[i];
+        const bm::normal gaussian(component.mean, component.stddev);
+        cumulative += mix.weights[i] * bm::cdf(gaussian, x);
+    }
+    return cumulative;
+}
+
+// Build a sampler for a fitted Gaussian mixture.
+//
+// Each draw first picks a component with probability proportional to its
+// weight, then samples that component's normal distribution. When a bound is
+// given, each draw is clamped into [min_bound, max_bound]; a missing bound is
+// treated as -inf / +inf. Clamping moves out-of-range draws onto the bound,
+// it does not redraw them.
+//
+// Throws std::invalid_argument if `mix` is not valid or if both bounds are
+// given with min_bound > max_bound.
+Sampler make_sampler(const FittedMixture& mix, std::optional<double> min_bound,
+                     std::optional<double> max_bound) {
+    if (!mix.valid) {
+        throw std::invalid_argument(
+            "make_sampler called with invalid FittedMixture");
+    }
+    // std::clamp(v, lo, hi) is undefined when hi < lo, so reject inverted
+    // bounds here instead of returning a sampler with undefined behaviour.
+    if (min_bound && max_bound && *max_bound < *min_bound) {
+        throw std::invalid_argument(
+            "make_sampler called with min_bound greater than max_bound");
+    }
+
+    // Copies are moved into the closure so the sampler does not depend on
+    // the lifetime of `mix`.
+    auto weights = mix.weights;
+    auto comps = mix.components;
+
+    auto draw = [weights = std::move(weights),
+                 comps = std::move(comps)](std::mt19937_64& rng) -> double {
+        std::discrete_distribution<int> cat(weights.begin(), weights.end());
+        const int k = cat(rng);
+        return std::normal_distribution<double>(comps[k].mean,
+                                                comps[k].stddev)(rng);
+    };
+
+    if (!min_bound && !max_bound) {
+        return [draw = std::move(draw)](std::mt19937_64& rng) {
+            return draw(rng);
+        };
+    }
+    const double lo =
+        min_bound.value_or(-std::numeric_limits<double>::infinity());
+    const double hi =
+        max_bound.value_or(std::numeric_limits<double>::infinity());
+    return [draw = std::move(draw), lo, hi](std::mt19937_64& rng) {
+        return std::clamp(draw(rng), lo, hi);
+    };
+}
+
+// Pick the candidate with the lowest BIC among all valid single-distribution
+// fits and all valid mixtures. Single fits are examined first and a later
+// candidate only wins with a strictly smaller BIC. Returns std::nullopt when
+// no candidate is valid.
+std::optional<ModelSelection> select_best_model(
+    const std::vector<FittedDistribution>& single_fits,
+    const std::vector<FittedMixture>& mixtures) {
+    std::optional<ModelSelection> winner;
+    const auto beats_winner = [&winner](double candidate_bic) {
+        return !winner || candidate_bic < winner->bic;
+    };
+
+    for (const auto& fit : single_fits) {
+        if (fit.valid && beats_winner(fit.bic)) {
+            winner = ModelSelection{BestModel{fit}, fit.bic,
+                                    free_parameter_count(fit.kind)};
+        }
+    }
+    for (const auto& mixture : mixtures) {
+        if (mixture.valid && beats_winner(mixture.bic)) {
+            winner = ModelSelection{BestModel{mixture}, mixture.bic,
+                                    free_parameter_count(mixture)};
+        }
+    }
+    return winner;
+}
+
+// Density at x of whichever model alternative `m` currently holds.
+double pdf(const BestModel& m, double x) {
+    const auto density_at_x = [x](const auto& model) { return pdf(model, x); };
+    return std::visit(density_at_x, m);
+}
+
+// CDF at x of whichever model alternative `m` currently holds.
+double cdf(const BestModel& m, double x) {
+    const auto cdf_at_x = [x](const auto& model) { return cdf(model, x); };
+    return std::visit(cdf_at_x, m);
+}
+
+// Build a sampler for whichever model alternative `m` currently holds,
+// forwarding the optional bounds to that alternative's make_sampler overload.
+Sampler make_sampler(const BestModel& m, std::optional<double> min_bound,
+                     std::optional<double> max_bound) {
+    const auto build = [&min_bound, &max_bound](const auto& model) {
+        return make_sampler(model, min_bound, max_bound);
+    };
+    return std::visit(build, m);
+}
+
+}  // namespace dftracer::utils::utilities::common::statistics
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.cpp
new file mode 100644
index 00000000..9a3ee4fe
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.cpp
@@ -0,0 +1,444 @@
+#include <dftracer/utils/core/common/config.h>
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/pipeline/pipeline.h>
+#include <dftracer/utils/core/rocksdb/column_families.h>
+#include <dftracer/utils/core/rocksdb/database.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/core/tasks/task.h>
+#include <dftracer/utils/core/utils/timer.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregators.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
+#include <dftracer/utils/utilities/indexer/index_builder_utility.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+#include <dftracer/utils/utilities/common/arrow/ipc_writer.h>
+#endif
+
+#include <algorithm>
+#include <atomic>
+#include <cstdio>
+#include <memory>
+#include <utility>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+namespace {
+
+namespace rcf = ::dftracer::utils::rocksdb::cf;
+namespace idx = composites::dft::indexing;
+
+// Record aggregation bookkeeping in the AGGREGATION column family of `db`,
+// all in a single batch: the global settings entry (time interval plus a
+// config hash of 0) and one empty-valued key per processed file. File ids are
+// looked up in the index store at `index_path`, opened read-only; files for
+// which find_file() returns a negative id are skipped.
+void write_aggregation_tracking(::dftracer::utils::rocksdb::RocksDatabase* db,
+                                const AggregationConfig& config,
+                                const std::vector<std::string>& processed_files,
+                                const std::string& index_path) {
+    using ::dftracer::utils::rocksdb::RocksDatabase;
+
+    indexer::IndexDatabase index_lookup(index_path,
+                                        RocksDatabase::OpenMode::ReadOnly);
+
+    auto pending = db->begin_batch();
+
+    AggGlobalConfig settings;
+    settings.time_interval_us = config.time_interval_us;
+    settings.config_hash = 0;
+    db->put(pending, rcf::AGGREGATION,
+            std::string_view(AGG_GLOBAL_CONFIG_KEY, 2),
+            serialize_agg_global_config(settings));
+
+    for (const auto& path : processed_files) {
+        int file_id = index_lookup.find_file(path);
+        if (file_id < 0) continue;
+        auto file_key = make_agg_file_key(file_id);
+        db->put(pending, rcf::AGGREGATION, file_key, "");
+    }
+
+    db->commit_batch(pending);
+}
+
+// Index `file_paths` as one batch through IndexBatchBuilderUtility and attach
+// an AggregationVisitor to every file, so events are aggregated into `agg_db`
+// while the files are being indexed. The aggregation config is held in a
+// shared_ptr that the visitor factory captures by value. Yields the batch
+// builder's result unchanged.
+coro::CoroTask<indexer::IndexBuildBatchResult> batch_index_and_aggregate(
+    CoroScope* scope, std::vector<std::string> file_paths,
+    std::string index_dir, std::size_t checkpoint_size, bool force_rebuild,
+    std::size_t parallelism, AggregationConfig agg_config,
+    std::shared_ptr<::dftracer::utils::rocksdb::RocksDatabase> agg_db,
+    std::uint32_t config_hash) {
+    using VisitorList =
+        std::vector<std::unique_ptr<composites::dft::DftEventVisitor>>;
+
+    auto settings = std::make_shared<indexer::IndexBuildBatchConfig>();
+    settings->file_paths = std::move(file_paths);
+    settings->index_dir = std::move(index_dir);
+    settings->checkpoint_size = checkpoint_size;
+    settings->parallelism = parallelism;
+    settings->force_rebuild = force_rebuild;
+    settings->use_batch_write = true;
+
+    auto shared_agg_config =
+        std::make_shared<AggregationConfig>(std::move(agg_config));
+    // One AggregationVisitor per file, all writing to the same database.
+    settings->dft_visitor_factory =
+        [agg_db, config_hash,
+         shared_agg_config](const std::string& file_path) -> VisitorList {
+        VisitorList created;
+        created.push_back(std::make_unique<AggregationVisitor>(
+            agg_db, config_hash, *shared_agg_config, file_path));
+        return created;
+    };
+
+    co_return co_await indexer::IndexBatchBuilderUtility::process(
+        scope, std::move(settings));
+}
+
+// Assemble the PerfettoTraceWriterInput for the writer: build the global
+// association tracker from the aggregator (the input takes ownership of it),
+// copy the statistics/compression/format settings, and derive from the
+// tracker's intervals both the per-(name, value) boundary ranges and the
+// overall trace duration. trace_duration is only assigned when the latest
+// end timestamp is greater than the earliest start timestamp.
+PerfettoTraceWriterInput build_streaming_input(
+    EventAggregator* merger_ptr, const AggregationConfig* agg_config,
+    const std::string* output_file, bool compress_output, int compression_level,
+    PerfettoEventFormat event_format) {
+    auto tracker_owner = merger_ptr->build_global_tracker();
+
+    PerfettoTraceWriterInput input;
+    input.output_path = *output_file;
+    input.aggregator = merger_ptr;
+    // Take the raw pointer before ownership moves into the input.
+    input.tracker = tracker_owner.get();
+    input.agg_config = agg_config;
+    input.owned_tracker = std::move(tracker_owner);
+    input.root_pids = input.tracker->get_root_pids();
+    input.compute_statistics = agg_config->compute_statistics;
+    input.compute_percentiles = agg_config->compute_percentiles;
+    input.percentiles = agg_config->percentiles;
+    input.compress = compress_output;
+    input.compression_level = compression_level;
+    input.format = event_format;
+
+    const auto& intervals = input.tracker->get_all_intervals();
+    if (!intervals.empty()) {
+        std::uint64_t earliest = UINT64_MAX;
+        std::uint64_t latest = 0;
+        for (const auto& iv : intervals) {
+            earliest = std::min(earliest, iv.start_ts);
+            latest = std::max(latest, iv.end_ts);
+
+            auto& span = input.boundary_ranges[iv.name][iv.value];
+            // A (0, 0) range is treated as "not set yet".
+            const bool unset = (span.ts == 0 && span.te == 0);
+            if (unset) {
+                span.ts = iv.start_ts;
+                span.te = iv.end_ts;
+                continue;
+            }
+            span.ts = std::min(span.ts, iv.start_ts);
+            span.te = std::max(span.te, iv.end_ts);
+        }
+        if (latest > earliest) {
+            input.trace_duration = latest - earliest;
+        }
+    }
+
+    return input;
+}
+
+}  // namespace
+
+// Run the end-to-end aggregation flow for the traces in input.log_dir:
+// resolve the input files, index and aggregate the ones that still need it,
+// record tracking metadata, then optionally write the aggregated result to
+// input.output_file (Arrow IPC when compiled in and requested, otherwise
+// through PerfettoTraceWriterUtility). With no output file the aggregated
+// keys are only counted.
+//
+// The result is returned early, with only the fields assigned up to that
+// point, when the output format is invalid or no input files are found.
+// Otherwise result.success mirrors whether the output write succeeded and is
+// trivially true when no output file was requested.
+coro::CoroTask<AggregationRunResult> run_aggregation(
+    AggregationRunInput input) {
+    AggregationRunResult result;
+
+    if (!AggregationConfig::is_valid_format(input.output_format)) {
+        DFTRACER_UTILS_LOG_ERROR(
+            "Invalid output format: %s (supported: %s)",
+            input.output_format.c_str(),
+            AggregationConfig::supported_formats_str().c_str());
+        co_return result;
+    }
+
+    // Work with absolute paths from here on.
+    input.log_dir = fs::absolute(input.log_dir).string();
+    if (input.output_file) {
+        *input.output_file = fs::absolute(*input.output_file).string();
+    }
+
+    if (input.verbose) {
+        std::printf("==========================================\n");
+        std::printf("DFTracer Aggregator (Streaming Pipeline)\n");
+        std::printf("==========================================\n");
+        std::printf("Arguments:\n");
+        std::printf("  Input directory: %s\n", input.log_dir.c_str());
+        std::printf("  Output file: %s\n",
+                    input.output_file ? input.output_file->c_str() : "<none>");
+        std::printf(
+            "  Time interval: %llu us\n",
+            static_cast<unsigned long long>(input.agg_config.time_interval_us));
+        std::printf("  Force rebuild: %s\n",
+                    input.force_rebuild ? "true" : "false");
+        std::printf(
+            "  Checkpoint size: %zu bytes (%.2f MB)\n", input.checkpoint_size,
+            static_cast<double>(input.checkpoint_size) / (1024.0 * 1024.0));
+        std::printf("  Executor threads: %zu\n",
+                    input.pipeline_config.executor_threads);
+        std::printf("==========================================\n\n");
+    }
+
+    // A constant 0 is used as the config hash everywhere in this function.
+    constexpr std::uint32_t config_hash = 0;
+
+    // `stages` may be null; it is only dereferenced below after a null check.
+    ::dftracer::utils::Timer* stages = input.stages;
+    ::dftracer::utils::Timer overall(true);
+
+    auto scan_result = std::make_unique<idx::ResolverResult>();
+    {
+        ::dftracer::utils::ScopedTimer _t(stages, "scan_and_resolve");
+        idx::IndexResolverUtility resolver;
+        idx::ResolverInput resolver_input;
+        resolver_input.directory = input.log_dir;
+        resolver_input.index_dir = input.index_dir;
+        resolver_input.require_aggregation = !input.force_rebuild;
+        resolver_input.aggregation_config = input.agg_config;
+        *scan_result = co_await resolver.process(resolver_input);
+    }
+
+    auto& input_files = scan_result->all_files;
+    if (input_files.empty()) {
+        DFTRACER_UTILS_LOG_ERROR("No .pfw or .pfw.gz files found in: %s",
+                                 input.log_dir.c_str());
+        co_return result;
+    }
+
+    DFTRACER_UTILS_LOG_INFO("Found %zu input files", input_files.size());
+
+    auto& shared_index_path = scan_result->index_path;
+    result.index_path = shared_index_path;
+    result.input_file_count = input_files.size();
+
+    Pipeline pipeline(input.pipeline_config);
+
+    // A forced rebuild deletes the existing index store before reopening it.
+    if (input.force_rebuild && fs::exists(shared_index_path)) {
+        DFTRACER_UTILS_LOG_INFO("Clearing shared index store: %s",
+                                shared_index_path.c_str());
+        fs::remove_all(shared_index_path);
+    }
+
+    std::shared_ptr<::dftracer::utils::rocksdb::RocksDatabase> agg_db;
+    std::unique_ptr<EventAggregator> merger;
+    {
+        ::dftracer::utils::ScopedTimer _t(stages, "open_rocksdb");
+        agg_db = EventAggregator::open_with_merge_operator(shared_index_path);
+        merger = std::make_unique<EventAggregator>(agg_db, config_hash);
+    }
+
+    // On a forced rebuild the cached files are reprocessed as well, so they
+    // count as "aggregation only" and nothing is reported as cached.
+    const std::size_t num_needing_index = scan_result->needs_checkpoint.size();
+    const std::size_t num_needing_agg_only =
+        input.force_rebuild ? scan_result->cached.size()
+                            : scan_result->needs_aggregation.size();
+    const std::size_t num_cached =
+        input.force_rebuild ? 0 : scan_result->total_cached();
+    result.cached_file_count = num_cached;
+
+    // File paths are moved out of scan_result into the work list.
+    std::vector<std::string> files_to_process;
+    files_to_process.reserve(num_needing_index + num_needing_agg_only);
+    for (auto& item : scan_result->needs_checkpoint) {
+        files_to_process.push_back(std::move(item.file_path));
+    }
+    if (input.force_rebuild) {
+        for (auto& item : scan_result->cached) {
+            files_to_process.push_back(std::move(item.file_path));
+        }
+    } else {
+        for (auto& item : scan_result->needs_aggregation) {
+            files_to_process.push_back(std::move(item.file_path));
+        }
+    }
+    result.processed_file_count = files_to_process.size();
+
+    DFTRACER_UTILS_LOG_INFO(
+        "Files to process: %zu (%zu need indexing, %zu need aggregation only, "
+        "%zu cached)",
+        files_to_process.size(), num_needing_index, num_needing_agg_only,
+        num_cached);
+
+    bool write_success =
+        !input.output_file.has_value();  // no output -> trivially OK
+    std::size_t total_keys = 0;
+    std::atomic<std::size_t> perfetto_keys_written{0};
+
+    // The whole workload is one pipeline task. It captures the locals above
+    // by reference and is run by pipeline.execute() further down, while those
+    // locals are still in scope.
+    auto main_task = make_task(
+        [&](CoroScope& scope) -> coro::CoroTask<void> {
+            if (!files_to_process.empty()) {
+                {
+                    ::dftracer::utils::ScopedTimer _t(stages,
+                                                      "index_and_aggregate");
+                    auto batch_result = co_await batch_index_and_aggregate(
+                        &scope, files_to_process, input.index_dir,
+                        input.checkpoint_size, input.force_rebuild,
+                        input.pipeline_config.executor_threads,
+                        input.agg_config, agg_db, config_hash);
+
+                    {
+                        // Fold every AggregationVisitor's observations and
+                        // output into the merger; other visitor types are
+                        // ignored.
+                        ::dftracer::utils::ScopedTimer _vd(stages,
+                                                           "visitor_drain");
+                        for (auto& file_visitors :
+                             batch_result.extra_visitors) {
+                            for (auto& visitor : file_visitors) {
+                                auto* agg_visitor =
+                                    dynamic_cast<AggregationVisitor*>(
+                                        visitor.get());
+                                if (agg_visitor) {
+                                    for (const auto& k :
+                                         agg_visitor->observed_extra_keys())
+                                        merger->add_observed_extra_key(k);
+                                    for (const auto& m :
+                                         agg_visitor->observed_custom_metrics())
+                                        merger->add_observed_custom_metric(m);
+                                    auto output = agg_visitor->take_output();
+                                    merger->merge_chunk(std::move(output));
+                                }
+                            }
+                            file_visitors.clear();
+                        }
+                    }
+                }
+
+                {
+                    ::dftracer::utils::ScopedTimer _wt(stages,
+                                                       "write_tracking");
+                    write_aggregation_tracking(agg_db.get(), input.agg_config,
+                                               files_to_process,
+                                               shared_index_path);
+                }
+            }
+
+            ::dftracer::utils::ScopedTimer _pp(stages, "post_processing");
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+            if (input.output_file &&
+                input.output_format == AggregationConfig::FORMAT_ARROW) {
+                using namespace ::dftracer::utils::utilities::common::arrow;
+
+                std::unique_ptr<AssociationTracker> global_tracker;
+                {
+                    ::dftracer::utils::ScopedTimer _bt(stages,
+                                                       "build_global_tracker");
+                    global_tracker = merger->build_global_tracker();
+                }
+                // The tracker is built but not otherwise used in this branch.
+                (void)global_tracker;
+
+                EventAggregator::ObservedColumns obs;
+                {
+                    ::dftracer::utils::ScopedTimer _oc(stages,
+                                                       "observed_columns");
+                    obs = merger->observed_columns();
+                }
+                auto& global_extra_key_ids = obs.extra_key_ids;
+                auto& global_custom_metric_names = obs.custom_metric_names;
+
+                IpcWriter ipc;
+                // If the file cannot be opened write_success keeps its
+                // initial value, which is false whenever an output file was
+                // requested.
+                if (co_await ipc.open(*input.output_file) != 0) {
+                    DFTRACER_UTILS_LOG_ERROR(
+                        "Failed to open Arrow IPC file: %s",
+                        input.output_file->c_str());
+                } else {
+                    ::dftracer::utils::ScopedTimer _aw(stages,
+                                                       "arrow_scan_write");
+                    constexpr std::size_t BATCH_ROWS = 10000;
+                    AggregationBatch batch;
+                    batch.entries.reserve(BATCH_ROWS);
+                    batch.global_extra_key_ids = &global_extra_key_ids;
+                    batch.global_custom_metric_names =
+                        &global_custom_metric_names;
+
+                    // All batches are converted during the scan and written
+                    // afterwards, outside the scan callback.
+                    std::vector<ArrowExportResult> pending_batches;
+                    merger->scan([&](AggMapType, const AggregationKey& key,
+                                     AggregationMetrics& metrics) {
+                        total_keys++;
+                        batch.entries.emplace_back(key, std::move(metrics));
+                        if (batch.entries.size() >= BATCH_ROWS) {
+                            pending_batches.push_back(batch.to_arrow());
+                            batch.entries.clear();
+                        }
+                        return true;
+                    });
+                    if (!batch.entries.empty()) {
+                        pending_batches.push_back(batch.to_arrow());
+                    }
+
+                    write_success = true;
+                    for (auto& ab : pending_batches) {
+                        if (co_await ipc.write_batch(ab) != 0) {
+                            write_success = false;
+                            break;
+                        }
+                    }
+                    // The writer is closed either way; the close status only
+                    // counts when every batch was written.
+                    if (write_success) {
+                        write_success = (co_await ipc.close() == 0);
+                    } else {
+                        co_await ipc.close();
+                    }
+                }
+            } else
+#endif
+                if (input.output_file) {
+                PerfettoTraceWriterInput streaming_input;
+                {
+                    ::dftracer::utils::ScopedTimer _si(stages,
+                                                       "build_streaming_input");
+                    streaming_input = build_streaming_input(
+                        merger.get(), &input.agg_config, &(*input.output_file),
+                        input.compress_output, input.compression_level,
+                        input.event_format);
+                    streaming_input.keys_written = &perfetto_keys_written;
+                    streaming_input.merge_on_sharded = true;
+                }
+                {
+                    ::dftracer::utils::ScopedTimer _pw(stages,
+                                                       "perfetto_write");
+                    PerfettoTraceWriterUtility writer;
+                    write_success = co_await scope.spawn(
+                        writer, std::move(streaming_input));
+                }
+                total_keys = perfetto_keys_written.load();
+            } else {
+                // No output file requested: just count keys in the AGGREGATION
+                // CF so callers get a meaningful total_keys.
+                merger->scan([&](AggMapType, const AggregationKey&,
+                                 AggregationMetrics&) {
+                    total_keys++;
+                    return true;
+                });
+            }
+        },
+        "AggregatorMain");
+
+    pipeline.set_source(main_task);
+    {
+        ::dftracer::utils::ScopedTimer _t(stages, "pipeline_execute");
+        pipeline.execute();
+    }
+
+    {
+        // Release the aggregator first, then this function's database handle.
+        ::dftracer::utils::ScopedTimer _t(stages, "close_rocksdb");
+        merger.reset();
+        agg_db.reset();
+    }
+
+    overall.stop();
+    // NOTE(review): dividing by 1e6 yields milliseconds only if
+    // Timer::elapsed() reports nanoseconds - confirm against Timer.
+    result.elapsed_ms = static_cast<double>(overall.elapsed()) / 1e6;
+    result.total_keys = total_keys;
+    result.success = write_success;
+
+    if (input.verbose) {
+        std::printf("\n==========================================\n");
+        std::printf("Aggregation Results\n");
+        std::printf("==========================================\n");
+        std::printf("  Execution time: %.2f seconds\n",
+                    result.elapsed_ms / 1000.0);
+        std::printf("  Files: %zu total, %zu processed, %zu cached\n",
+                    result.input_file_count, result.processed_file_count,
+                    result.cached_file_count);
+        std::printf("  Unique aggregation keys: %zu\n", result.total_keys);
+        if (input.output_file) {
+            std::printf("  Output file: %s\n", input.output_file->c_str());
+            std::printf("  Write status: %s\n",
+                        result.success ? "SUCCESS" : "FAILED");
+        }
+        std::printf("==========================================\n");
+    }
+
+    if (stages) stages->print_stages();
+
+    co_return result;
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/src/dftracer/utils/utilities/dlio/barrier_simulator.cpp b/src/dftracer/utils/utilities/dlio/barrier_simulator.cpp
new file mode 100644
index 00000000..201fcfef
--- /dev/null
+++ b/src/dftracer/utils/utilities/dlio/barrier_simulator.cpp
@@ -0,0 +1,450 @@
+#include <dftracer/utils/utilities/dlio/barrier_simulator.h>
+#include <dftracer/utils/utilities/dlio/statistic.h>
+#include <dftracer/utils/utilities/dlio/worker_queue.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+
+namespace dftracer::utils::utilities::dlio {
+
+namespace {
+
+constexpr std::uint64_t PREPROCESS_RNG_OFFSET = 888888;  // preprocess RNG seeds
+constexpr std::uint64_t STEP_RNG_OFFSET = 999999;  // shared step RNG seed
+
+inline double uniform01(Rng& rng) {  // one draw from uniform_real(0.0, 1.0)
+    return std::uniform_real_distribution<double>(0.0, 1.0)(rng);
+}
+
+}  // namespace
+
+// Time covered by >= 1 open interval, / 1e6; sorts `boundaries` in place.
+double sweep_union(std::vector<Boundary>& boundaries) {
+    if (boundaries.empty()) return 0.0;
+    const auto earlier = [](const Boundary& lhs, const Boundary& rhs) {
+        return lhs.time < rhs.time;
+    };
+    std::sort(boundaries.begin(), boundaries.end(), earlier);
+    double covered_us = 0.0;
+    int open_count = 0;
+    std::int64_t prev = 0;
+    for (const auto& edge : boundaries) {
+        if (open_count > 0) covered_us += static_cast<double>(edge.time - prev);
+        open_count += edge.delta;
+        prev = edge.time;
+    }
+    return covered_us / 1e6;
+}
+
+// Population variance of `values` (sum of squared deviations divided by N,
+// not N - 1). Returns 0.0 for an empty input.
+double variance(const std::vector<double>& values) {
+    if (values.empty()) return 0.0;
+    const auto count = static_cast<double>(values.size());
+    double total = 0.0;
+    for (const double x : values) total += x;
+    const double avg = total / count;
+
+    double sum_sq_dev = 0.0;
+    for (const double x : values) sum_sq_dev += (x - avg) * (x - avg);
+    return sum_sq_dev / count;
+}
+
+// Returns 1 - D, where D is the largest absolute difference between the
+// empirical CDFs of `a` and `b` at any sample value; 0.0 if either is empty.
+double cdf_similarity(const std::vector<double>& a,
+                      const std::vector<double>& b) {
+    if (a.empty() || b.empty()) return 0.0;
+
+    std::vector<double> sorted_a(a), sorted_b(b);
+    std::sort(sorted_a.begin(), sorted_a.end());
+    std::sort(sorted_b.begin(), sorted_b.end());
+
+    // Evaluation points: the sorted, de-duplicated union of both samples.
+    std::vector<double> points(sorted_a);
+    points.insert(points.end(), sorted_b.begin(), sorted_b.end());
+    std::sort(points.begin(), points.end());
+    points.erase(std::unique(points.begin(), points.end()), points.end());
+
+    const auto ecdf = [](const std::vector<double>& sorted, double x) {
+        const auto rank = std::upper_bound(sorted.begin(), sorted.end(), x);
+        return static_cast<double>(rank - sorted.begin()) /
+               static_cast<double>(sorted.size());
+    };
+
+    double worst_gap = 0.0;
+    for (const double x : points) {
+        worst_gap = std::max(worst_gap,
+                             std::abs(ecdf(sorted_a, x) - ecdf(sorted_b, x)));
+    }
+    return 1.0 - worst_gap;
+}
+
+// Tops the ready queue up to queue_capacity_, giving each batch to the worker
+// that frees up first; returns one WorkInterval per batch produced here.
+std::vector<WorkInterval> WorkerQueue::produce_batches(
+    double current_time, const BatchTimeSampler& sampler) {
+    if (worker_free_times_.empty()) {
+        // Treat num_workers_ <= 0 as one worker so min_element() never hits end().
+        const auto n = num_workers_ > 0 ? num_workers_ : 1;
+        worker_free_times_.assign(static_cast<std::size_t>(n), current_time);
+    }
+    std::vector<WorkInterval> intervals;
+    while (ready_batches_.size() < queue_capacity_) {
+        const auto earliest_it = std::min_element(worker_free_times_.begin(),
+                                                  worker_free_times_.end());
+        const double worker_available = *earliest_it;
+        auto [batch_time, preprocess_time] = sampler();  // durations to add
+        const double batch_ready = worker_available + batch_time;
+        intervals.push_back({worker_available, batch_ready, preprocess_time});
+        *earliest_it = batch_ready;  // worker is busy until the batch is ready
+        ready_batches_.push_back(batch_ready);
+    }
+    // consume_batch() pops from the front, so keep earliest-ready first.
+    std::sort(ready_batches_.begin(), ready_batches_.end());
+    return intervals;
+}
+
+// Pops the earliest ready batch and returns the fetch latency: base_overhead
+// plus any wait until it is ready. An empty queue or a wait counts as a stall.
+double WorkerQueue::consume_batch(double current_time, double base_overhead) {
+    if (ready_batches_.empty()) {
+        ++stall_count_;
+        return base_overhead;
+    }
+    const double ready_at = ready_batches_.front();
+    ready_batches_.erase(ready_batches_.begin());
+    if (ready_at <= current_time) return base_overhead;  // already waiting
+    ++stall_count_;
+    return (ready_at - current_time) + base_overhead;
+}
+
+// Simulates ctx.num_steps training steps across ctx.num_ranks ranks, with
+// optional per-rank prefetch queues and gradient-accumulation barriers, and
+// returns timing, union-time, CDF-similarity and throughput metrics.
+BarrierSimulationResult BarrierSimulator::simulate(
+    const BarrierSimulatorContext& ctx, std::uint64_t base_seed,
+    const Sampler& fetch_block_sampler,
+    const Sampler& preprocess_sampler) const {
+    BarrierSimulationResult result;
+    const auto num_ranks = static_cast<std::size_t>(ctx.num_ranks);
+
+    std::vector<Rng> rank_rngs;
+    rank_rngs.reserve(num_ranks);
+    for (int rank = 0; rank < ctx.num_ranks; ++rank) {
+        rank_rngs.emplace_back(base_seed + static_cast<std::uint64_t>(rank));
+    }
+
+    std::vector<double> rank_times(num_ranks, 0.0);
+    std::vector<double> rank_work_times(num_ranks, 0.0);
+    std::vector<double> barrier_overheads;
+
+    std::vector<Boundary> boundaries;
+    std::vector<Boundary> preprocess_boundaries;
+    std::vector<Boundary> fetch_iter_boundaries;
+    std::vector<Boundary> fetch_block_boundaries;
+
+    std::vector<WorkerQueue> rank_queues;
+    std::vector<Rng> preprocess_rngs;
+    preprocess_rngs.reserve(num_ranks);
+    for (int rank = 0; rank < ctx.num_ranks; ++rank) {
+        preprocess_rngs.emplace_back(base_seed + PREPROCESS_RNG_OFFSET +
+                                     static_cast<std::uint64_t>(rank));
+    }
+
+    auto worker_batch_sampler_for = [&](int rank, bool record_simulated) {
+        return [&, rank, record_simulated]() -> std::pair<double, double> {
+            Rng& rrng = preprocess_rngs[static_cast<std::size_t>(rank)];
+            double sampled_preprocess = 0.0;
+            double sampled_getitem = 0.0;
+            if (preprocess_sampler && ctx.io_stats) {
+                sampled_preprocess = preprocess_sampler(rrng);
+                const double sampled_io =
+                    ctx.io_stats->quantile(uniform01(rrng));
+                sampled_getitem = sampled_preprocess + sampled_io;
+            } else {
+                sampled_getitem = ctx.getitem_stats.quantile(uniform01(rrng));
+                sampled_preprocess =
+                    ctx.preprocess_stats.quantile(uniform01(rrng));
+            }
+
+            const double io_time =
+                std::max(0.0, sampled_getitem - sampled_preprocess);
+            const double total_time = io_time + sampled_preprocess;
+            const double adjusted_time =
+                total_time * ctx.preprocess_slowdown_factor;
+
+            if (record_simulated) {
+                result.simulated_preprocess.push_back(sampled_preprocess);
+                result.simulated_getitem.push_back(sampled_getitem);
+            }
+
+            result.preprocess_metrics.accumulated_time += sampled_preprocess;
+            ++result.preprocess_metrics.num_samples;
+            result.preprocess_metrics.stats.update(sampled_preprocess);
+
+            return {adjusted_time, sampled_preprocess};
+        };
+    };
+
+    // WorkInterval times share the rank timeline (seconds); sweep_union()
+    // divides by 1e6, so boundaries must be scaled to microseconds first.
+    auto record_preprocess = [&](const std::vector<WorkInterval>& intervals) {
+        for (const auto& interval : intervals) {
+            preprocess_boundaries.push_back(
+                {static_cast<std::int64_t>(interval.start_time * 1e6), +1});
+            preprocess_boundaries.push_back(
+                {static_cast<std::int64_t>(interval.end_time * 1e6), -1});
+        }
+    };
+
+    if (ctx.enable_preprocess_simulation) {
+        for (int rank = 0; rank < ctx.num_ranks; ++rank) {
+            rank_queues.emplace_back(ctx.num_workers, ctx.prefetch_factor);
+        }
+        for (int rank = 0; rank < ctx.num_ranks; ++rank) {
+            auto sampler =
+                worker_batch_sampler_for(rank, /*record_simulated=*/false);
+            record_preprocess(
+                rank_queues[static_cast<std::size_t>(rank)].produce_batches(
+                    0.0, sampler));
+        }
+    }
+
+    Rng step_rng(base_seed + STEP_RNG_OFFSET);  // shared by all ranks
+
+    std::uint64_t total_queue_stalls = 0;
+    std::uint64_t total_queue_depth_samples = 0;
+    double sum_queue_depth = 0.0;
+
+    for (int step = 0; step < ctx.num_steps; ++step) {
+        for (int rank = 0; rank < ctx.num_ranks; ++rank) {
+            const auto r = static_cast<std::size_t>(rank);
+            double fetch_iter = 0.0;
+            double fetch_block = 0.0;
+
+            const bool have_aggregated_fetch_iter =
+                ctx.is_aggregated_trace && !ctx.enable_preprocess_simulation &&
+                static_cast<int>(ctx.fetch_iter_trace.size()) > rank &&
+                static_cast<int>(ctx.fetch_iter_trace[r].size()) > step;
+
+            if (have_aggregated_fetch_iter) {
+                fetch_iter =
+                    ctx.fetch_iter_trace[r][static_cast<std::size_t>(step)];
+            } else if (ctx.enable_preprocess_simulation) {
+                auto& queue = rank_queues[r];
+
+                fetch_iter = queue.consume_batch(rank_times[r],
+                                                 ctx.base_fetch_iter_overhead);
+                result.simulated_fetch_iter.push_back(fetch_iter);
+                if (queue.had_stall()) ++total_queue_stalls;
+                sum_queue_depth += static_cast<double>(queue.queue_depth());
+                ++total_queue_depth_samples;
+
+                auto sampler =
+                    worker_batch_sampler_for(rank, /*record_simulated=*/true);
+                record_preprocess(
+                    queue.produce_batches(rank_times[r] + fetch_iter, sampler));
+            } else {
+                fetch_iter = ctx.fetch_iter_stats.quantile(uniform01(step_rng));
+                if (!ctx.is_aggregated_trace) {
+                    fetch_iter =
+                        std::clamp(fetch_iter, ctx.fetch_iter_stats.min(),
+                                   ctx.fetch_iter_stats.max());
+                }
+            }
+
+            fetch_block = fetch_block_sampler(rank_rngs[r]);
+            if (!ctx.is_aggregated_trace) {
+                fetch_block =
+                    std::clamp(fetch_block, ctx.fetch_block_stats.min(),
+                               ctx.fetch_block_stats.max());
+            }
+
+            result.simulated_fetch_block.push_back(fetch_block);
+            rank_work_times[r] += fetch_block + fetch_iter;
+
+            const double start_time = rank_times[r];
+            const double fetch_iter_end_time = start_time + fetch_iter;
+            const double end_time = fetch_iter_end_time + fetch_block;
+
+            result.fetch_iter_metrics.accumulated_time += fetch_iter;
+            ++result.fetch_iter_metrics.num_samples;
+            result.fetch_iter_metrics.stats.update(fetch_iter);
+            fetch_iter_boundaries.push_back(
+                {static_cast<std::int64_t>(start_time * 1e6), +1});
+            fetch_iter_boundaries.push_back(
+                {static_cast<std::int64_t>(fetch_iter_end_time * 1e6), -1});
+
+            result.fetch_block_metrics.accumulated_time += fetch_block;
+            ++result.fetch_block_metrics.num_samples;
+            result.fetch_block_metrics.stats.update(fetch_block);
+            fetch_block_boundaries.push_back(
+                {static_cast<std::int64_t>(fetch_iter_end_time * 1e6), +1});
+            fetch_block_boundaries.push_back(
+                {static_cast<std::int64_t>(end_time * 1e6), -1});
+
+            if (!ctx.sync_mode) {
+                boundaries.push_back(
+                    {static_cast<std::int64_t>(start_time * 1e6), +1});
+                boundaries.push_back(
+                    {static_cast<std::int64_t>(end_time * 1e6), -1});
+            }
+
+            rank_times[r] = end_time;
+        }
+
+        const bool is_barrier_step =
+            ctx.accumulate_grad_batches > 0 &&
+            ((step + 1) % ctx.accumulate_grad_batches == 0);
+        if (ctx.sync_mode && is_barrier_step) {
+            const double max_time =
+                *std::max_element(rank_times.begin(), rank_times.end());
+            for (int rank = 0; rank < ctx.num_ranks; ++rank) {
+                const auto r = static_cast<std::size_t>(rank);
+                barrier_overheads.push_back(max_time - rank_times[r]);
+                rank_times[r] = max_time;  // every rank leaves together
+            }
+        }
+    }
+
+    if (!ctx.sync_mode && !boundaries.empty()) {
+        result.e2e_duration = sweep_union(boundaries);
+    } else if (!rank_times.empty()) {
+        // The slowest rank defines completion. Ranks are only equal right
+        // after a barrier, so rank 0 alone is wrong for a partial window.
+        result.e2e_duration =
+            *std::max_element(rank_times.begin(), rank_times.end());
+    }
+
+    if (!preprocess_boundaries.empty())
+        result.preprocess_metrics.union_time =
+            sweep_union(preprocess_boundaries);
+    if (!fetch_iter_boundaries.empty())
+        result.fetch_iter_metrics.union_time =
+            sweep_union(fetch_iter_boundaries);
+    if (!fetch_block_boundaries.empty())
+        result.fetch_block_metrics.union_time =
+            sweep_union(fetch_block_boundaries);
+
+    result.trace_preprocess_metrics = ctx.trace_preprocess_metrics;
+    result.trace_fetch_iter_metrics = ctx.trace_fetch_iter_metrics;
+    result.trace_fetch_block_metrics = ctx.trace_fetch_block_metrics;
+
+    if (!barrier_overheads.empty()) {
+        double sum = 0.0;
+        double max_ov = -std::numeric_limits<double>::infinity();
+        for (double v : barrier_overheads) {
+            sum += v;
+            if (v > max_ov) max_ov = v;
+        }
+        result.avg_barrier_overhead =
+            sum / static_cast<double>(barrier_overheads.size());
+        result.max_barrier_overhead = max_ov;
+    }
+
+    {
+        std::vector<double> trace_flat;
+        for (const auto& rank_data : ctx.fetch_block_trace) {
+            trace_flat.insert(trace_flat.end(), rank_data.begin(),
+                              rank_data.end());
+        }
+        result.fetch_block_cdf_similarity =
+            cdf_similarity(result.simulated_fetch_block, trace_flat);
+    }
+
+    if (!result.simulated_fetch_iter.empty() && !ctx.fetch_iter_trace.empty()) {
+        std::vector<double> trace_flat;
+        for (const auto& rank_data : ctx.fetch_iter_trace) {
+            trace_flat.insert(trace_flat.end(), rank_data.begin(),
+                              rank_data.end());
+        }
+        result.fetch_iter_cdf_similarity =
+            cdf_similarity(result.simulated_fetch_iter, trace_flat);
+    }
+
+    if (!result.simulated_getitem.empty() && ctx.getitem_trace) {
+        std::vector<double> trace_flat;
+        for (const auto& rank_data : *ctx.getitem_trace) {
+            trace_flat.insert(trace_flat.end(), rank_data.begin(),
+                              rank_data.end());
+        }
+        result.getitem_cdf_similarity =
+            cdf_similarity(result.simulated_getitem, trace_flat);
+    }
+
+    if (ctx.trace_e2e_duration > 0.0) {
+        result.e2e_error =
+            std::abs(result.e2e_duration - ctx.trace_e2e_duration) /
+            ctx.trace_e2e_duration;
+    }
+
+    result.per_rank_completion_time = rank_times;
+    result.rank_variance = variance(rank_work_times);
+    result.trace_rank_variance = ctx.trace_rank_variance;
+    if (ctx.trace_rank_variance > 0.0) {
+        result.rank_variance_error =
+            std::abs(result.rank_variance - ctx.trace_rank_variance) /
+            ctx.trace_rank_variance;
+    }
+
+    if (!rank_times.empty()) {
+        const double min_t =
+            *std::min_element(rank_times.begin(), rank_times.end());
+        const double max_t =
+            *std::max_element(rank_times.begin(), rank_times.end());
+        result.load_imbalance = (max_t - min_t) / (min_t + 1e-9);
+    }
+
+    if (ctx.enable_preprocess_simulation && total_queue_depth_samples > 0) {
+        result.avg_queue_depth =
+            sum_queue_depth / static_cast<double>(total_queue_depth_samples);
+        result.avg_queue_stalls =
+            static_cast<double>(total_queue_stalls) /
+            static_cast<double>(ctx.num_ranks * ctx.num_steps);
+    }
+
+    result.simulated_per_rank_throughput.reserve(num_ranks);
+    for (int rank = 0; rank < ctx.num_ranks; ++rank) {
+        const auto r = static_cast<std::size_t>(rank);
+        result.simulated_per_rank_throughput.push_back(
+            rank_times[r] > 0.0
+                ? static_cast<double>(ctx.num_steps) / rank_times[r]
+                : 0.0);
+    }
+    result.trace_per_rank_throughput = ctx.trace_per_rank_throughput;
+
+    if (!result.simulated_per_rank_throughput.empty() &&
+        !result.trace_per_rank_throughput.empty()) {
+        double sim_sum = 0.0;
+        for (double v : result.simulated_per_rank_throughput) sim_sum += v;
+        result.throughput_mean =
+            sim_sum /
+            static_cast<double>(result.simulated_per_rank_throughput.size());
+
+        double tr_sum = 0.0;
+        for (double v : result.trace_per_rank_throughput) tr_sum += v;
+        result.trace_throughput_mean =
+            tr_sum /
+            static_cast<double>(result.trace_per_rank_throughput.size());
+
+        if (result.trace_throughput_mean > 0.0) {
+            result.throughput_mean_error =
+                std::abs(result.throughput_mean -
+                         result.trace_throughput_mean) /
+                result.trace_throughput_mean;
+        }
+        result.throughput_variance =
+            variance(result.simulated_per_rank_throughput);
+        result.trace_throughput_variance =
+            variance(result.trace_per_rank_throughput);
+        result.throughput_cdf_similarity =
+            cdf_similarity(result.simulated_per_rank_throughput,
+                           result.trace_per_rank_throughput);
+    }
+
+    return result;
+}
+
+}  // namespace dftracer::utils::utilities::dlio
diff --git a/src/dftracer/utils/utilities/dlio/optimizer.cpp b/src/dftracer/utils/utilities/dlio/optimizer.cpp
new file mode 100644
index 00000000..9c2b5d16
--- /dev/null
+++ b/src/dftracer/utils/utilities/dlio/optimizer.cpp
@@ -0,0 +1,91 @@
+#include <dftracer/utils/utilities/dlio/optimizer.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <limits>
+
+namespace dftracer::utils::utilities::dlio {
+
+namespace stats = ::dftracer::utils::utilities::common::statistics;
+
+double percentile(const std::vector<double>& sorted_data, double pct) {
+    if (sorted_data.empty()) return 0.0;  // nothing to interpolate
+    const double fraction = std::clamp(pct, 0.0, 100.0) / 100.0;
+    const double pos = fraction * static_cast<double>(sorted_data.size() - 1);
+    const auto below = static_cast<std::size_t>(std::floor(pos));
+    const auto above = static_cast<std::size_t>(std::ceil(pos));
+    if (above == below) return sorted_data[below];  // pos is an exact index
+    const double w = pos - static_cast<double>(below);  // weight of `above`
+    return sorted_data[below] * (1.0 - w) + sorted_data[above] * w;
+}
+
+// Searches for the upper sampling bound, expressed as a percentile of
+// `sample_times`, whose simulation best reproduces the trace E2E duration.
+// Returns the lowest-error run seen; `converged` is set if targets were met.
+OptimizerResult optimize_max_bound_percentile(
+    const BarrierSimulatorContext& context, const BestModel& model,
+    std::vector<double> sample_times, const OptimizerOptions& options) {
+    constexpr double kImprovementThreshold = 0.001;  // 0.1% relative
+
+    OptimizerResult out;
+    out.best_percentile = options.initial_percentile;
+    // Nothing to fit against: report the initial percentile unchanged.
+    if (sample_times.empty()) return out;
+    std::sort(sample_times.begin(), sample_times.end());
+    const double min_bound = sample_times.front();
+
+    BarrierSimulator simulator;
+    double pct = options.initial_percentile;
+    double velocity = 0.0;
+    double lowest_error = std::numeric_limits<double>::infinity();
+    int stale_iterations = 0;
+
+    for (int iter = 0; iter < options.max_iterations; ++iter) {
+        const double max_bound = percentile(sample_times, pct);
+        auto sampler = stats::make_sampler(model, min_bound, max_bound);
+        auto run = simulator.simulate(context, options.base_seed, sampler);
+        out.iterations_used = iter + 1;
+
+        // Keep the run with the lowest E2E error; after the first iteration
+        // a run must beat the incumbent by the relative threshold.
+        const double required = lowest_error * (1.0 - kImprovementThreshold);
+        if (iter == 0 || run.e2e_error < required) {
+            out.best = run;
+            out.best_percentile = pct;
+            lowest_error = run.e2e_error;
+            stale_iterations = 0;
+        } else {
+            ++stale_iterations;
+        }
+
+        const bool e2e_ok = run.e2e_error < options.target_e2e_error;
+        const bool cdf_ok =
+            run.fetch_block_cdf_similarity > options.target_cdf_similarity;
+        if (e2e_ok && cdf_ok) {
+            out.converged = true;
+            return out;
+        }
+        if (stale_iterations >= options.patience) return out;
+
+        // Momentum-smoothed update: shrink the percentile when the
+        // simulation runs long (twice as hard above 10% error), grow it
+        // when more than 5% short, otherwise nudge down on a low CDF score.
+        double step = 0.0;
+        if (run.e2e_duration > context.trace_e2e_duration) {
+            step = -options.epsilon * (run.e2e_error > 0.10 ? 2.0 : 1.0);
+        } else if (run.e2e_duration < context.trace_e2e_duration * 0.95) {
+            step = options.epsilon * 0.5;
+        } else if (run.fetch_block_cdf_similarity <
+                   options.target_cdf_similarity) {
+            step = -options.epsilon * 0.5;
+        }
+        velocity = options.momentum * velocity + step;
+        // The percentile stays within [min_percentile, 100].
+        pct = std::clamp(pct + velocity, options.min_percentile, 100.0);
+    }
+
+    return out;
+}
+
+}  // namespace dftracer::utils::utilities::dlio
diff --git a/src/dftracer/utils/utilities/dlio/trace_loader.cpp b/src/dftracer/utils/utilities/dlio/trace_loader.cpp
new file mode 100644
index 00000000..be1a25c0
--- /dev/null
+++ b/src/dftracer/utils/utilities/dlio/trace_loader.cpp
@@ -0,0 +1,380 @@
+#include <dftracer/utils/core/rocksdb/column_families.h>
+#include <dftracer/utils/core/rocksdb/database.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h>
+#include <dftracer/utils/utilities/dlio/trace_loader.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <map>
+#include <random>
+#include <stdexcept>
+#include <string_view>
+#include <unordered_set>
+#include <utility>
+
+namespace dftracer::utils::utilities::dlio {
+
+namespace {
+
+namespace agg = ::dftracer::utils::utilities::composites::dft::aggregators;
+namespace rdb = ::dftracer::utils::rocksdb;
+
+constexpr double US_TO_S = 1e-6;  // multiply microseconds to get seconds
+
+// Reserved system keys (0xFFFD, 0xFFFE, 0xFFFF prefixes) share the AGGREGATION
+// CF with the varint-encoded data keys; callers skip keys flagged here.
+inline bool is_system_key(std::string_view key) {
+    return !(key.size() < 2) && static_cast<std::uint8_t>(key.front()) == 0xFF;
+}
+
+inline bool matches(std::string_view a, std::string_view b) { return a == b; }  // exact equality
+
+struct ComponentAccumulator {
+    // Synthesized samples keyed by pid, then by time_bucket. Both levels are
+    // std::map, so iteration is ascending by pid and then by time_bucket,
+    // which keeps rank assignment and per-rank sample order deterministic.
+    std::map<std::uint64_t, std::map<std::uint64_t, std::vector<double>>>
+        per_pid_bucket_samples;
+    // Boundary list (in microseconds) for sweep_union of trace-side wall clock.
+    std::vector<Boundary> boundaries;
+    // Merge of every per-entry sketch seen so far; nullptr until the first one.
+    std::shared_ptr<DDSketch> sketch;
+    // Running totals over all entries folded into this component.
+    double accumulated_time_s = 0.0;  // sum of per-entry duration.total (s)
+    std::uint64_t total_count = 0;
+    double min_us = 0.0;  // meaningful only once min_max_seen is true
+    double max_us = 0.0;  // meaningful only once min_max_seen is true
+    bool min_max_seen = false;
+};
+
+// Folds one entry's min/max (microseconds) into the accumulator's running
+// range. The first call adopts the values as-is; later calls only ever
+// widen the range.
+void apply_minmax(ComponentAccumulator& acc, double min_us, double max_us) {
+    const bool first = !acc.min_max_seen;
+
+    acc.min_us = first ? min_us : std::min(acc.min_us, min_us);
+    acc.max_us = first ? max_us : std::max(acc.max_us, max_us);
+    acc.min_max_seen = true;
+}
+
+// Appends synthetic per-call durations (seconds) for one aggregation entry to
+// `out`. The number of samples is metrics.count, capped at `max_samples` when
+// that is non-zero. With a sketch, each sample is an inverse-CDF draw at a
+// uniform random quantile; without one, the per-call mean is repeated.
+void synthesize_samples(const agg::AggregationMetrics& metrics,
+                        std::uint64_t max_samples, std::mt19937_64& rng,
+                        std::vector<double>& out) {
+    std::uint64_t n = metrics.count;
+    if (max_samples != 0) n = std::min(n, max_samples);
+    if (n == 0) return;
+
+    if (!metrics.duration.sketch) {
+        // No distribution available: repeat the per-call mean.
+        const double mean_s = metrics.duration.mean * US_TO_S;
+        out.insert(out.end(), n, mean_s);
+        return;
+    }
+
+    std::uniform_real_distribution<double> unit(0.0, 1.0);
+    out.reserve(out.size() + n);
+    for (std::uint64_t drawn = 0; drawn < n; ++drawn) {
+        const double duration_us = metrics.duration.sketch->quantile(unit(rng));
+        out.push_back(duration_us * US_TO_S);
+    }
+}
+
+// Concatenates each rank pid's per-bucket samples, ascending by bucket key,
+// into one vector per entry of `rank_pids` (same order). A pid with no
+// samples in `acc` yields an empty vector.
+std::vector<std::vector<double>> flatten_per_rank(
+    const ComponentAccumulator& acc,
+    const std::vector<std::uint64_t>& rank_pids) {
+    std::vector<std::vector<double>> out;
+    out.reserve(rank_pids.size());
+
+    for (const auto pid : rank_pids) {
+        // Create this rank's slot first so a missing pid simply stays empty.
+        auto& rank_samples = out.emplace_back();
+        const auto found = acc.per_pid_bucket_samples.find(pid);
+        if (found == acc.per_pid_bucket_samples.end()) continue;
+        // std::map iteration visits the buckets in ascending key order.
+        for (const auto& bucket_entry : found->second) {
+            const auto& samples = bucket_entry.second;
+            rank_samples.insert(rank_samples.end(), samples.begin(),
+                                samples.end());
+        }
+    }
+    return out;
+}
+
+}  // namespace
+
+AggregatedTraces load_aggregated_traces(const std::string& db_path,
+                                        const TraceLoaderOptions& options) {
+    // The AGGREGATION CF was created with a merge operator; we must re-attach
+    // it (read-only) so RocksDB will let us iterate the CF. Without the merge
+    // operator, NewIterator() returns "merge_operator_ must be set".
+    auto db_handle =
+        agg::EventAggregator::open_read_only_with_merge_operator(db_path);
+    if (!db_handle) {
+        throw std::runtime_error("dlio: failed to open RocksDB at " + db_path);
+    }
+    auto& db = *db_handle;
+
+    // The intern dictionary must be populated before any key parsing happens.
+    agg::load_intern_dictionary(db);
+
+    AggregatedTraces out;
+
+    // Global config (time_interval_us) lives at key 0xFFFE in AGGREGATION CF.
+    {
+        std::string val;
+        const auto st = db.get(std::string_view(agg::AGG_GLOBAL_CONFIG_KEY, 2),
+                               &val, rdb::cf::AGGREGATION);
+        if (st.ok() && !val.empty()) {
+            const auto cfg = agg::deserialize_agg_global_config(val);
+            out.time_interval_us = cfg.time_interval_us;
+        }
+    }
+
+    ComponentAccumulator acc_fetch_block;
+    ComponentAccumulator acc_fetch_iter;
+    ComponentAccumulator acc_preprocess;
+    ComponentAccumulator acc_getitem;
+
+    std::unordered_set<std::uint64_t> pid_set;
+    std::mt19937_64 rng(options.seed);
+
+    auto it = db.new_iterator(rdb::cf::AGGREGATION);
+    if (!it) {
+        throw std::runtime_error("dlio: failed to obtain AGGREGATION iterator");
+    }
+
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+        const auto key_slice = it->key();
+        std::string_view key_sv(key_slice.data(), key_slice.size());
+        if (is_system_key(key_sv)) continue;
+
+        agg::AggKeyView kv;
+        if (!agg::parse_agg_key_view(key_sv, kv)) continue;
+
+        ComponentAccumulator* target = nullptr;
+        if (matches(kv.cat, CATEGORY_DATALOADER) &&
+            matches(kv.name, EVENT_FETCH_BLOCK)) {
+            target = &acc_fetch_block;
+        } else if (matches(kv.cat, CATEGORY_DATALOADER) &&
+                   matches(kv.name, EVENT_FETCH_ITER)) {
+            target = &acc_fetch_iter;
+        } else if (matches(kv.cat, CATEGORY_DATA) &&
+                   matches(kv.name, EVENT_PREPROCESS)) {
+            target = &acc_preprocess;
+        } else if (matches(kv.cat, CATEGORY_DATA) &&
+                   matches(kv.name, EVENT_ITEM)) {
+            target = &acc_getitem;
+        } else {
+            continue;
+        }
+
+        const auto val_slice = it->value();
+        std::string_view val_sv(val_slice.data(), val_slice.size());
+        auto metrics = agg::deserialize_agg_value(val_sv);
+        if (metrics.count == 0) continue;
+
+        out.any_data = true;
+        pid_set.insert(kv.pid);
+
+        // Synthesize per-call samples for this entry.
+        auto& bucket_vec =
+            target->per_pid_bucket_samples[kv.pid][kv.time_bucket];
+        synthesize_samples(metrics, options.max_samples_per_entry, rng,
+                           bucket_vec);
+
+        // Accumulate component-level state.
+        target->accumulated_time_s +=
+            static_cast<double>(metrics.duration.total) * US_TO_S;
+        target->total_count += metrics.count;
+        apply_minmax(*target, static_cast<double>(metrics.duration.min),
+                     static_cast<double>(metrics.duration.max));
+
+        // Real per-entry (ts, te) interval for trace-side union time.
+        if (metrics.te > metrics.ts) {
+            target->boundaries.push_back(
+                {static_cast<std::int64_t>(metrics.ts), +1});
+            target->boundaries.push_back(
+                {static_cast<std::int64_t>(metrics.te), -1});
+        }
+
+        // Merge sketch when present.
+        if (metrics.duration.sketch) {
+            out.sketches_available = true;
+            if (!target->sketch) {
+                target->sketch =
+                    std::make_shared<DDSketch>(*metrics.duration.sketch);
+            } else {
+                target->sketch->merge(*metrics.duration.sketch);
+            }
+        }
+    }
+
+    if (!it->status().ok()) {
+        throw std::runtime_error(
+            "dlio: iteration over AGGREGATION CF failed: " +
+            it->status().ToString());
+    }
+
+    if (!out.any_data) {
+        return out;
+    }
+
+    // Rank PIDs are the pids that emitted at least one fetch.block event.
+    // fetch_iter is intentionally not required since some traces omit it.
+    std::vector<std::uint64_t> rank_pids;
+    rank_pids.reserve(acc_fetch_block.per_pid_bucket_samples.size());
+    for (const auto& [pid, _] : acc_fetch_block.per_pid_bucket_samples) {
+        rank_pids.push_back(pid);
+    }
+    std::sort(rank_pids.begin(), rank_pids.end());
+    out.rank_pids = rank_pids;
+    out.num_ranks = static_cast<int>(rank_pids.size());
+
+    // Per-rank traces.
+    out.fetch_block_trace = flatten_per_rank(acc_fetch_block, rank_pids);
+    out.fetch_iter_trace = flatten_per_rank(acc_fetch_iter, rank_pids);
+    out.getitem_trace = flatten_per_rank(acc_getitem, rank_pids);
+
+    // num_steps = min length across ranks (conservative — drop straggler
+    // steps).
+    out.num_steps = out.num_ranks > 0
+                        ? static_cast<int>(out.fetch_block_trace.front().size())
+                        : 0;
+    for (const auto& r : out.fetch_block_trace) {
+        out.num_steps = std::min(out.num_steps, static_cast<int>(r.size()));
+    }
+
+    // Flat fitting arrays.
+    for (const auto& r : out.fetch_block_trace) {
+        out.computation_times.insert(out.computation_times.end(), r.begin(),
+                                     r.end());
+    }
+    for (const auto& [pid, buckets] : acc_preprocess.per_pid_bucket_samples) {
+        (void)pid;
+        for (const auto& [bucket, samples] : buckets) {
+            (void)bucket;
+            out.preprocess_times.insert(out.preprocess_times.end(),
+                                        samples.begin(), samples.end());
+        }
+    }
+
+    // Sketches + Statistic objects.
+    auto attach_stat = [](Statistic& stat, const ComponentAccumulator& a) {
+        if (!a.min_max_seen) return;
+        // Seed Statistic with min and max (in seconds) so its fallback quantile
+        // path has reasonable bounds even without a sketch.
+        stat.update(a.min_us * US_TO_S);
+        if (a.max_us != a.min_us) stat.update(a.max_us * US_TO_S);
+        if (a.sketch) {
+            stat.attach_sketch(a.sketch);
+        }
+    };
+    attach_stat(out.fetch_block_stats, acc_fetch_block);
+    attach_stat(out.fetch_iter_stats, acc_fetch_iter);
+    attach_stat(out.preprocess_stats, acc_preprocess);
+    attach_stat(out.getitem_stats, acc_getitem);
+    out.fetch_block_sketch = acc_fetch_block.sketch;
+    out.fetch_iter_sketch = acc_fetch_iter.sketch;
+    out.preprocess_sketch = acc_preprocess.sketch;
+    out.getitem_sketch = acc_getitem.sketch;
+
+    // Trace-side ComponentTimeMetrics: accumulated + union (in seconds).
+    auto fill_metrics = [](ComponentTimeMetrics& m, ComponentAccumulator& a) {
+        m.accumulated_time = a.accumulated_time_s;
+        m.num_samples = a.total_count;
+        m.union_time = sweep_union(a.boundaries);  // returns seconds
+    };
+    fill_metrics(out.trace_fetch_block_metrics, acc_fetch_block);
+    fill_metrics(out.trace_fetch_iter_metrics, acc_fetch_iter);
+    fill_metrics(out.trace_preprocess_metrics, acc_preprocess);
+
+    // Overall e2e: union of fetch_block intervals across all ranks gives the
+    // tightest defensible estimate for "time spent in the data path".
+    out.trace_e2e_duration = out.trace_fetch_block_metrics.union_time;
+
+    // Per-rank throughput = events / rank-side fetch_block wall clock. We
+    // approximate rank wall clock as sum(per-entry te-ts) for that rank's
+    // fetch_block keys; matches the granularity available without raw events.
+    out.trace_per_rank_throughput.reserve(rank_pids.size());
+    std::vector<double> per_rank_wall;
+    per_rank_wall.reserve(rank_pids.size());
+    for (auto pid : rank_pids) {
+        double wall_us = 0.0;
+        std::uint64_t count = 0;
+        auto pit = acc_fetch_block.per_pid_bucket_samples.find(pid);
+        if (pit != acc_fetch_block.per_pid_bucket_samples.end()) {
+            for (const auto& [bucket, samples] : pit->second) {
+                (void)bucket;
+                count += samples.size();
+            }
+            // Wall clock per rank = sum of sample durations as an upper bound.
+            for (const auto& [bucket, samples] : pit->second) {
+                (void)bucket;
+                for (double s : samples) wall_us += s / US_TO_S;
+            }
+        }
+        const double wall_s = wall_us * US_TO_S;
+        per_rank_wall.push_back(wall_s);
+        out.trace_per_rank_throughput.push_back(
+            wall_s > 0.0 ? static_cast<double>(count) / wall_s : 0.0);
+    }
+
+    // Trace rank variance = variance of per-rank wall clock.
+    out.trace_rank_variance = variance(per_rank_wall);
+
+    if (!out.sketches_available) {
+        std::fprintf(stderr,
+                     "dlio: warning - AGGREGATION CF has no DDSketch data. "
+                     "Distribution fitting will use mean-replication samples; "
+                     "re-run dftracer_aggregator with --compute-percentiles "
+                     "for higher-fidelity DLIO configs.\n");
+    }
+
+    return out;
+}
+// Copies loaded traces into a simulator context (aggregated, sync_mode off).
+BarrierSimulatorContext make_simulator_context(const AggregatedTraces& traces,
+                                               int num_workers,
+                                               int prefetch_factor) {
+    BarrierSimulatorContext ctx;
+    ctx.num_ranks = traces.num_ranks;
+    ctx.num_steps = traces.num_steps;
+    ctx.is_aggregated_trace = true;
+    ctx.sync_mode = false;
+    // Per-rank traces; getitem is only copied when the loader produced any.
+    ctx.fetch_block_trace = traces.fetch_block_trace;
+    ctx.fetch_iter_trace = traces.fetch_iter_trace;
+    if (!traces.getitem_trace.empty()) {
+        ctx.getitem_trace = traces.getitem_trace;
+    }
+    // Per-component Statistic objects.
+    ctx.fetch_block_stats = traces.fetch_block_stats;
+    ctx.fetch_iter_stats = traces.fetch_iter_stats;
+    ctx.preprocess_stats = traces.preprocess_stats;
+    ctx.getitem_stats = traces.getitem_stats;
+    // Trace-side time metrics (no getitem metrics are copied here).
+    ctx.trace_fetch_block_metrics = traces.trace_fetch_block_metrics;
+    ctx.trace_fetch_iter_metrics = traces.trace_fetch_iter_metrics;
+    ctx.trace_preprocess_metrics = traces.trace_preprocess_metrics;
+    // Whole-trace summaries.
+    ctx.trace_e2e_duration = traces.trace_e2e_duration;
+    ctx.trace_rank_variance = traces.trace_rank_variance;
+    ctx.trace_per_rank_throughput = traces.trace_per_rank_throughput;
+    // Caller-supplied settings.
+    ctx.num_workers = num_workers;
+    ctx.prefetch_factor = prefetch_factor;
+    return ctx;
+}
+
+}  // namespace dftracer::utils::utilities::dlio
diff --git a/src/dftracer/utils/utilities/dlio/yaml_emit.cpp b/src/dftracer/utils/utilities/dlio/yaml_emit.cpp
new file mode 100644
index 00000000..5ecb9916
--- /dev/null
+++ b/src/dftracer/utils/utilities/dlio/yaml_emit.cpp
@@ -0,0 +1,112 @@
+#include <dftracer/utils/utilities/dlio/yaml_emit.h>
+#include <yaml-cpp/yaml.h>
+
+#include <cmath>
+#include <ostream>
+#include <sstream>
+#include <string>
+#include <variant>
+
+namespace dftracer::utils::utilities::dlio {
+
+namespace stats = ::dftracer::utils::utilities::common::statistics;
+
+namespace {
+// Maps a single fit to a YAML node: "type" plus keys filled from f.params.
+YAML::Node emit_single(const stats::FittedDistribution& f) {
+    YAML::Node n;
+    switch (f.kind) {
+        case stats::DistributionKind::Normal:
+            n["type"] = "normal";
+            n["mean"] = f.params[0];
+            n["stdev"] = f.params[1];
+            break;
+        case stats::DistributionKind::Lognormal:
+            n["type"] = "lognormal";
+            n["mean"] = f.params[0];  // log-space mu; TODO confirm vs DLIO
+            n["sigma"] = f.params[1];
+            break;
+        case stats::DistributionKind::Gamma:
+            n["type"] = "gamma";
+            n["shape"] = f.params[0];
+            n["scale"] = f.params[1];
+            break;
+        case stats::DistributionKind::Exponential:
+            // params[0] is rate; DLIO config expects "scale" = 1/rate.
+            n["type"] = "exponential";
+            n["scale"] = f.params[0] > 0.0 ? 1.0 / f.params[0] : 0.0;
+            break;
+        case stats::DistributionKind::Weibull:
+            n["type"] = "weibull";
+            n["shape"] = f.params[0];
+            n["scale"] = f.params[1];
+            break;
+    }
+    return n;  // default-constructed if f.kind matched no case above
+}
+// Emits a "mixture" node; each component is written as a weighted "normal".
+YAML::Node emit_mixture(const stats::FittedMixture& m) {
+    YAML::Node n;
+    n["type"] = "mixture";
+    n["n_components"] = static_cast<int>(m.weights.size());
+    YAML::Node components(YAML::NodeType::Sequence);
+    for (std::size_t k = 0; k < m.weights.size(); ++k) {
+        YAML::Node comp;  // components[k] below assumes sizes match weights
+        comp["weight"] = m.weights[k];
+        YAML::Node params;
+        params["type"] = "normal";
+        params["mean"] = m.components[k].mean;
+        params["stdev"] = m.components[k].stddev;
+        comp["params"] = params;
+        components.push_back(comp);
+    }
+    n["components"] = components;
+    return n;
+}
+// Emits the block's model via emit_single/emit_mixture and adds "max_bound".
+YAML::Node emit_block(const DlioTimingBlock& block) {
+    YAML::Node n = std::visit(
+        [](const auto& v) -> YAML::Node {
+            using T = std::decay_t<decltype(v)>;
+            if constexpr (std::is_same_v<T, stats::FittedDistribution>) {
+                return emit_single(v);
+            } else {  // every other alternative is emitted as a mixture
+                return emit_mixture(v);
+            }
+        },
+        block.model);
+    n["max_bound"] = block.max_bound;
+    return n;
+}
+
+}  // namespace
+// Renders non-null blocks as train.computation_time / reader.preprocess_time.
+std::string render_dlio_yaml(const DlioTimingBlock* computation,
+                             const DlioTimingBlock* preprocess) {
+    YAML::Node root;
+    if (computation) {
+        YAML::Node train;
+        train["computation_time"] = emit_block(*computation);
+        root["train"] = train;
+    }
+    if (preprocess) {
+        YAML::Node reader;
+        reader["preprocess_time"] = emit_block(*preprocess);
+        root["reader"] = reader;
+    }
+    YAML::Emitter emit;
+    emit.SetIndent(2);
+    emit.SetMapFormat(YAML::Block);
+    emit.SetSeqFormat(YAML::Block);
+    emit << root;
+    return std::string(emit.c_str());
+}
+
+// Writes the rendered YAML plus a newline; returns false if `out` failed.
+bool write_dlio_yaml(std::ostream& out, const DlioTimingBlock* computation,
+                     const DlioTimingBlock* preprocess) {
+    out << render_dlio_yaml(computation, preprocess) << "\n";
+    return !out.fail();
+}
+
+}  // namespace dftracer::utils::utilities::dlio
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index f6c2ea51..6a58eef2 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -203,6 +203,7 @@ set(TEST_BINARY_SOURCES
     binaries/test_dftracer_stats.cpp
     binaries/test_dftracer_index.cpp
     binaries/test_dftracer_aggregator.cpp
+    binaries/test_dftracer_gen_dlio_config.cpp
     binaries/test_dftracer_organize.cpp
     binaries/test_dftracer_view.cpp
     binaries/test_dftracer_tar.cpp
@@ -306,6 +307,9 @@ foreach(test_file ${TEST_BINARY_SOURCES})
   elseif(bin_exec STREQUAL "binaries/test_dftracer_aggregator")
     set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT
       "DFTRACER_AGGREGATOR_PATH=$<TARGET_FILE:dftracer_aggregator>")
+  elseif(bin_exec STREQUAL "binaries/test_dftracer_gen_dlio_config")
+    set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT
+      "DFTRACER_GEN_DLIO_CONFIG_PATH=$<TARGET_FILE:dftracer_gen_dlio_config>")
   elseif(bin_exec STREQUAL "binaries/test_dftracer_organize")
     set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT
       "DFTRACER_ORGANIZE_PATH=$<TARGET_FILE:dftracer_organize>;DFTRACER_RECONSTRUCT_PATH=$<TARGET_FILE:dftracer_reconstruct>")
diff --git a/tests/binaries/test_dftracer_gen_dlio_config.cpp b/tests/binaries/test_dftracer_gen_dlio_config.cpp
new file mode 100644
index 00000000..13299308
--- /dev/null
+++ b/tests/binaries/test_dftracer_gen_dlio_config.cpp
@@ -0,0 +1,222 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/core/common/filesystem.h>
+#include <doctest/doctest.h>
+#include <sys/wait.h>
+#include <testing_utilities.h>
+#include <unistd.h>
+#include <zlib.h>
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace {
+
+// Writes a gzip-compressed .pfw trace into env's directory and returns its
+// path ("" on any I/O or zlib failure). Each of `num_ranks` pids (1000 + rank)
+// emits `events_per_rank` fetch.block and fetch.iter events (cat=dataloader);
+// a matching preprocess event (cat=data) comes from worker pid = pid + 100000.
+std::string create_dlio_pfw_gz(dft_utils_test::TestEnvironment& env, int id,
+                               int num_ranks, int events_per_rank) {
+    const std::string plain_path =
+        env.get_dir() + "/dlio_trace_" + std::to_string(id) + ".pfw";
+    {
+        std::ofstream ofs(plain_path);
+        if (!ofs.is_open()) return "";
+        ofs << "[\n";
+        std::uint64_t event_id = 1;
+        const std::uint64_t base_ts = 1000000000ULL;
+        for (int rank = 0; rank < num_ranks; ++rank) {
+            const std::uint64_t pid = 1000 + static_cast<std::uint64_t>(rank);
+            const std::uint64_t tid = 2000 + static_cast<std::uint64_t>(rank);
+            std::uint64_t ts = base_ts + rank * 100000ULL;
+            for (int i = 0; i < events_per_rank; ++i) {
+                // fetch.block: durations cycle deterministically in [100, 600).
+                const std::uint64_t fb_dur = 100 + (i * 7) % 500;
+                ofs << R"({"id":)" << event_id++ << R"(,"pid":)" << pid
+                    << R"(,"tid":)" << tid
+                    << R"(,"name":"fetch.block","cat":"dataloader")"
+                    << R"(,"ph":"X","ts":)" << ts << R"(,"dur":)" << fb_dur
+                    << R"(,"args":{"hhash":"h1"}})"
+                    << "\n";
+                ts += fb_dur;
+
+                // fetch.iter: shorter durations.
+                const std::uint64_t fi_dur = 50 + (i * 3) % 100;
+                ofs << R"({"id":)" << event_id++ << R"(,"pid":)" << pid
+                    << R"(,"tid":)" << tid
+                    << R"(,"name":"fetch.iter","cat":"dataloader")"
+                    << R"(,"ph":"X","ts":)" << ts << R"(,"dur":)" << fi_dur
+                    << R"(,"args":{"hhash":"h1"}})"
+                    << "\n";
+                ts += fi_dur;
+
+                // preprocess: emitted by a worker pid (different from main).
+                const std::uint64_t worker_pid = pid + 100000ULL;
+                const std::uint64_t pre_dur = 80 + (i * 5) % 200;
+                ofs << R"({"id":)" << event_id++ << R"(,"pid":)" << worker_pid
+                    << R"(,"tid":)" << tid
+                    << R"(,"name":"preprocess","cat":"data")"
+                    << R"(,"ph":"X","ts":)" << ts << R"(,"dur":)" << pre_dur
+                    << R"(,"args":{"hhash":"h1"}})"
+                    << "\n";
+                ts += pre_dur;
+            }
+        }
+        ofs << "]\n";
+    }
+
+    // Compress to .pfw.gz. Read-back, short-write and close failures all yield
+    // "" so the caller's REQUIRE fires instead of a truncated trace being used.
+    std::string gz_path = plain_path + ".gz";
+    std::stringstream ss;
+    ss << std::ifstream(plain_path, std::ios::binary).rdbuf();
+    const std::string body = ss.str();
+    fs::remove(plain_path);
+    if (body.empty()) return "";
+    gzFile gz = gzopen(gz_path.c_str(), "wb");
+    if (!gz) return "";
+    const unsigned len = static_cast<unsigned>(body.size());
+    const bool wrote = gzwrite(gz, body.data(), len) == static_cast<int>(len);
+    const bool closed = gzclose(gz) == Z_OK;  // always close, even on failure
+    return (wrote && closed) ? gz_path : "";
+}
+
+// Locates the CLI: $DFTRACER_GEN_DLIO_CONFIG_PATH first, then relative paths.
+std::string find_binary() {
+    const char* from_env = std::getenv("DFTRACER_GEN_DLIO_CONFIG_PATH");
+    if (from_env != nullptr && ::access(from_env, X_OK) == 0) return from_env;
+    const std::vector<std::string> fallbacks = {
+        "./dftracer_gen_dlio_config",         "../dftracer_gen_dlio_config",
+        "../../dftracer_gen_dlio_config",     "../bin/dftracer_gen_dlio_config",
+        "../../bin/dftracer_gen_dlio_config",
+    };
+    for (const auto& candidate : fallbacks) {
+        if (::access(candidate.c_str(), X_OK) == 0) return candidate;
+    }
+    return "";
+}
+
+int run_binary(const std::string& binary,
+               const std::vector<std::string>& args) {
+    pid_t pid = ::fork();
+    if (pid < 0) return -1;
+    if (pid == 0) {
+        std::vector<const char*> argv;
+        argv.push_back(binary.c_str());
+        for (const auto& a : args) argv.push_back(a.c_str());
+        argv.push_back(nullptr);
+        ::execv(binary.c_str(), const_cast<char* const*>(argv.data()));
+        ::_exit(127);
+    }
+    int status = 0;
+    ::waitpid(pid, &status, 0);
+    if (WIFEXITED(status)) return WEXITSTATUS(status);
+    return -1;
+}
+
+// Returns the whole contents of `path`; empty when nothing could be read.
+std::string read_file(const std::string& path) {
+    std::ostringstream buffer;
+    buffer << std::ifstream(path).rdbuf();
+    return buffer.str();
+}
+
+}  // namespace
+// End-to-end CLI tests; every case returns early when the binary is missing.
+TEST_SUITE("DFTracerGenDlioConfig") {
+    TEST_CASE("binary exists") {
+        const auto binary = find_binary();
+        if (binary.empty()) {
+            MESSAGE(
+                "dftracer_gen_dlio_config not found. Set "
+                "DFTRACER_GEN_DLIO_CONFIG_PATH to locate it.");
+            return;
+        }
+        CHECK(!binary.empty());  // always holds: the empty case returned above
+    }
+
+    TEST_CASE("--help exits 0") {
+        const auto binary = find_binary();
+        if (binary.empty()) return;
+        CHECK(run_binary(binary, {"--help"}) == 0);
+    }
+
+    TEST_CASE("missing --output rejected") {
+        const auto binary = find_binary();
+        if (binary.empty()) return;
+        // Pointing -d at a valid empty dir but no -o; argparse should fail.
+        dft_utils_test::TestEnvironment env(10);
+        REQUIRE(env.is_valid());
+        CHECK(run_binary(binary, {"-d", env.get_dir()}) != 0);
+    }
+
+    TEST_CASE("directory without DLIO events fails gracefully") {
+        const auto binary = find_binary();
+        if (binary.empty()) return;
+        dft_utils_test::TestEnvironment env(50);
+        REQUIRE(env.is_valid());
+
+        // Generic POSIX trace (no fetch.block/preprocess events).
+        auto trace_gz = env.create_dft_test_gzip_file(50);
+        REQUIRE(!trace_gz.empty());
+
+        const std::string out = env.get_dir() + "/dlio_config.yaml";
+        const int rc = run_binary(binary, {"-d", env.get_dir(), "-o", out});
+        // Non-zero exit and no YAML produced.
+        CHECK(rc != 0);
+        CHECK_FALSE(fs::exists(out));
+    }
+
+    TEST_CASE(
+        "happy path: DLIO traces produce a valid YAML with train + reader") {
+        const auto binary = find_binary();
+        if (binary.empty()) return;
+        dft_utils_test::TestEnvironment env(200);
+        REQUIRE(env.is_valid());
+
+        for (int i = 0; i < 2; ++i) {
+            auto f = create_dlio_pfw_gz(env, i, /*num_ranks=*/2,
+                                        /*events_per_rank=*/200);
+            REQUIRE(!f.empty());
+        }
+
+        const std::string out = env.get_dir() + "/dlio_config.yaml";
+        const int rc = run_binary(binary, {"-d", env.get_dir(), "-o", out,
+                                           "--simulation-iterations", "2"});
+        CHECK(rc == 0);
+        REQUIRE(fs::exists(out));
+
+        const std::string contents = read_file(out);
+        // Spot-check the YAML schema. Don't pin specific distribution choice
+        // (fitter may pick any of single/GMM-2/GMM-3 depending on data).
+        CHECK(contents.find("train:") != std::string::npos);
+        CHECK(contents.find("computation_time:") != std::string::npos);
+        CHECK(contents.find("reader:") != std::string::npos);
+        CHECK(contents.find("preprocess_time:") != std::string::npos);
+        CHECK(contents.find("type:") != std::string::npos);
+        CHECK(contents.find("max_bound:") != std::string::npos);
+    }
+
+    TEST_CASE("respects --num-workers and --prefetch-factor") {
+        const auto binary = find_binary();
+        if (binary.empty()) return;
+        dft_utils_test::TestEnvironment env(200);
+        REQUIRE(env.is_valid());
+
+        auto f = create_dlio_pfw_gz(env, 0, /*num_ranks=*/1,
+                                    /*events_per_rank=*/150);
+        REQUIRE(!f.empty());
+        // Only success and output creation are checked; the YAML is not read.
+        const std::string out = env.get_dir() + "/dlio_config.yaml";
+        const int rc = run_binary(
+            binary, {"-d", env.get_dir(), "-o", out, "--num-workers", "4",
+                     "--prefetch-factor", "1", "--simulation-iterations", "2"});
+        CHECK(rc == 0);
+        CHECK(fs::exists(out));
+    }
+}
diff --git a/tests/utilities/CMakeLists.txt b/tests/utilities/CMakeLists.txt
index c3ab418f..fbc09bbf 100644
--- a/tests/utilities/CMakeLists.txt
+++ b/tests/utilities/CMakeLists.txt
@@ -56,6 +56,13 @@ set(UTILITIES_TEST_SOURCES
     composites/dft/statistics/test_statistics_query.cpp
     common/statistics/test_log2_histogram.cpp
     common/statistics/test_timestamp_histogram.cpp
+
+    # Distribution fitting
+    common/statistics/test_distributions.cpp
+    common/statistics/test_mixture.cpp
+
+    # DLIO config generation
+    dlio/test_barrier_simulator.cpp
     composites/dft/statistics/test_detailed_statistics.cpp
 
     # Query language
diff --git a/tests/utilities/common/statistics/test_distributions.cpp b/tests/utilities/common/statistics/test_distributions.cpp
new file mode 100644
index 00000000..b2f34518
--- /dev/null
+++ b/tests/utilities/common/statistics/test_distributions.cpp
@@ -0,0 +1,174 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/utilities/common/statistics/distributions.h>
+#include <doctest/doctest.h>
+
+#include <cmath>
+#include <functional>
+#include <random>
+#include <vector>
+
+using namespace dftracer::utils::utilities::common::statistics;
+using Rng = std::mt19937_64;
+
+namespace {
+
+// Draws n samples by calling f on one mt19937_64 engine seeded with `seed`.
+std::vector<double> generate_samples(std::size_t n,
+                                     std::function<double(std::mt19937_64&)> f,
+                                     std::uint64_t seed = 12345) {
+    std::mt19937_64 engine(seed);
+    std::vector<double> samples(n);
+    for (double& slot : samples) slot = f(engine);
+    return samples;
+}
+
+}  // namespace
+// Parameter recovery per family on seeded draws, plus rejection of bad input.
+TEST_SUITE("fit_single_distribution") {
+    TEST_CASE("Normal: recovers mean and stddev") {
+        auto data = generate_samples(5000, [](auto& r) {
+            return std::normal_distribution<double>(2.5, 0.7)(r);
+        });
+        const auto fit =
+            fit_single_distribution(DistributionKind::Normal, data);
+        REQUIRE(fit.valid);
+        CHECK(fit.params[0] == doctest::Approx(2.5).epsilon(0.05));
+        CHECK(fit.params[1] == doctest::Approx(0.7).epsilon(0.05));
+        // Same-family fit should have small KS statistic on 5k samples.
+        CHECK(fit.ks_stat < 0.05);
+    }
+
+    TEST_CASE("Lognormal: recovers mu and sigma in log space") {
+        auto data = generate_samples(5000, [](auto& r) {
+            return std::lognormal_distribution<double>(-1.0, 0.5)(r);
+        });
+        const auto fit =
+            fit_single_distribution(DistributionKind::Lognormal, data);
+        REQUIRE(fit.valid);
+        CHECK(fit.params[0] == doctest::Approx(-1.0).epsilon(0.05));
+        CHECK(fit.params[1] == doctest::Approx(0.5).epsilon(0.05));
+        CHECK(fit.ks_stat < 0.05);
+    }
+
+    TEST_CASE("Exponential: recovers rate") {
+        auto data = generate_samples(5000, [](auto& r) {
+            return std::exponential_distribution<double>(3.0)(r);
+        });
+        const auto fit =
+            fit_single_distribution(DistributionKind::Exponential, data);
+        REQUIRE(fit.valid);
+        CHECK(fit.params[0] == doctest::Approx(3.0).epsilon(0.05));
+        CHECK(fit.ks_stat < 0.05);
+    }
+
+    TEST_CASE("Gamma: recovers shape and scale within 5%") {
+        auto data = generate_samples(5000, [](auto& r) {
+            return std::gamma_distribution<double>(2.0, 0.3)(r);
+        });
+        const auto fit = fit_single_distribution(DistributionKind::Gamma, data);
+        REQUIRE(fit.valid);
+        CHECK(fit.params[0] == doctest::Approx(2.0).epsilon(0.05));
+        CHECK(fit.params[1] == doctest::Approx(0.3).epsilon(0.05));
+        CHECK(fit.ks_stat < 0.05);
+    }
+
+    TEST_CASE("Weibull: recovers shape and scale within 5%") {
+        auto data = generate_samples(5000, [](auto& r) {
+            return std::weibull_distribution<double>(1.5, 2.0)(r);
+        });
+        const auto fit =
+            fit_single_distribution(DistributionKind::Weibull, data);
+        REQUIRE(fit.valid);
+        CHECK(fit.params[0] == doctest::Approx(1.5).epsilon(0.05));
+        CHECK(fit.params[1] == doctest::Approx(2.0).epsilon(0.05));
+        CHECK(fit.ks_stat < 0.05);
+    }
+
+    TEST_CASE("Lognormal: rejects non-positive data") {
+        std::vector<double> data{1.0, -2.0, 3.0, 4.0};
+        const auto fit =
+            fit_single_distribution(DistributionKind::Lognormal, data);
+        CHECK_FALSE(fit.valid);
+    }
+
+    TEST_CASE("Normal: rejects too few samples") {
+        std::vector<double> data{1.0};
+        const auto fit =
+            fit_single_distribution(DistributionKind::Normal, data);
+        CHECK_FALSE(fit.valid);
+    }
+}
+// Checks KS-based ranking across all single-family fits of one sample set.
+TEST_SUITE("fit_all_single_distributions") {
+    TEST_CASE("ranks correct family at the top") {
+        auto data = generate_samples(2000, [](auto& r) {
+            return std::lognormal_distribution<double>(0.0, 0.4)(r);
+        });
+        const auto fits = fit_all_single_distributions(data);
+        REQUIRE_FALSE(fits.empty());
+        const auto best = best_fit_by_ks(fits);
+        REQUIRE(best.has_value());
+        CHECK(best->kind == DistributionKind::Lognormal);
+    }
+
+    TEST_CASE("all valid fits sorted ascending by KS") {
+        auto data = generate_samples(2000, [](auto& r) {
+            return std::gamma_distribution<double>(2.0, 0.5)(r);
+        });
+        const auto fits = fit_all_single_distributions(data);
+        REQUIRE(fits.size() == 5);
+        double last_ks = -1.0;
+        for (const auto& f : fits) {
+            if (!f.valid) break;  // only the leading run of valid fits is checked
+            CHECK(f.ks_stat >= last_ks);
+            last_ks = f.ks_stat;
+        }
+    }
+}
+// Spot checks of quantile()/cdf() on hand-built fits with known parameters.
+TEST_SUITE("FittedDistribution pdf/cdf/quantile") {
+    TEST_CASE("Normal cdf matches Gaussian quantiles") {
+        FittedDistribution fit{
+            DistributionKind::Normal, {0.0, 1.0, 0.0}, 0.0, 0.0, 0.0, true};
+        // 97.5th percentile of standard normal ~ 1.96.
+        CHECK(quantile(fit, 0.975) == doctest::Approx(1.959964).epsilon(1e-3));
+        CHECK(cdf(fit, 0.0) == doctest::Approx(0.5));
+    }
+
+    TEST_CASE("Lognormal quantile at median") {
+        FittedDistribution fit{
+            DistributionKind::Lognormal, {0.0, 1.0, 0.0}, 0.0, 0.0, 0.0, true};
+        CHECK(quantile(fit, 0.5) == doctest::Approx(1.0).epsilon(1e-6));
+    }
+}
+// make_sampler: sample mean, clamping to bounds, and rejection of invalid fits.
+TEST_SUITE("make_sampler") {
+    TEST_CASE("Normal sampler reproduces fit mean within tolerance") {
+        FittedDistribution fit{
+            DistributionKind::Normal, {1.0, 0.2, 0.0}, 0.0, 0.0, 0.0, true};
+        auto sampler = make_sampler(fit);
+        Rng rng(99);
+        double sum = 0.0;
+        const int n = 4000;
+        for (int i = 0; i < n; ++i) sum += sampler(rng);
+        const double mean = sum / n;
+        CHECK(mean == doctest::Approx(1.0).epsilon(0.05));
+    }
+
+    TEST_CASE("clamps to provided bounds") {
+        FittedDistribution fit{
+            DistributionKind::Normal, {0.0, 1.0, 0.0}, 0.0, 0.0, 0.0, true};
+        auto sampler = make_sampler(fit, /*min_bound=*/-0.5, /*max_bound=*/0.5);
+        Rng rng(7);
+        for (int i = 0; i < 200; ++i) {
+            const double s = sampler(rng);
+            CHECK(s >= -0.5);
+            CHECK(s <= 0.5);
+        }
+    }
+
+    TEST_CASE("throws on invalid fit") {
+        FittedDistribution fit;  // valid = false
+        CHECK_THROWS_AS(make_sampler(fit), std::invalid_argument);
+    }
+}
diff --git a/tests/utilities/common/statistics/test_mixture.cpp b/tests/utilities/common/statistics/test_mixture.cpp
new file mode 100644
index 00000000..f268beec
--- /dev/null
+++ b/tests/utilities/common/statistics/test_mixture.cpp
@@ -0,0 +1,176 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/utilities/common/statistics/distributions.h>
+#include <dftracer/utils/utilities/common/statistics/mixture.h>
+#include <doctest/doctest.h>
+
+#include <algorithm>
+#include <cmath>
+#include <random>
+#include <variant>
+#include <vector>
+
+using namespace dftracer::utils::utilities::common::statistics;
+
+namespace {
+
+// Draws n points from a Normal mixture: each point first picks component k
+// with probability proportional to weights[k], then samples
+// N(means[k], stddevs[k]). Seeded, so repeated calls give the same data.
+std::vector<double> sample_mixture(std::size_t n,
+                                   const std::vector<double>& weights,
+                                   const std::vector<double>& means,
+                                   const std::vector<double>& stddevs,
+                                   std::uint64_t seed = 4242) {
+    std::mt19937_64 engine(seed);
+    std::discrete_distribution<int> pick(weights.begin(), weights.end());
+    std::vector<double> draws(n);
+    for (double& draw : draws) {
+        const int k = pick(engine);
+        std::normal_distribution<double> component(means[k], stddevs[k]);
+        draw = component(engine);
+    }
+    return draws;
+}
+
+}  // namespace
+// EM fit checks on seeded synthetic mixtures, plus rejection of tiny inputs.
+TEST_SUITE("fit_gaussian_mixture") {
+    TEST_CASE("K=2: recovers means within tolerance") {
+        // Two well-separated Gaussians: N(0, 0.3) with weight 0.4, N(3, 0.5)
+        // with 0.6.
+        auto data = sample_mixture(5000, {0.4, 0.6}, {0.0, 3.0}, {0.3, 0.5},
+                                   /*seed=*/7);
+        const auto fit = fit_gaussian_mixture(data, 2);
+        REQUIRE(fit.valid);
+        REQUIRE(fit.components.size() == 2);
+
+        // Sort by mean for deterministic comparison (EM has label-switching
+        // freedom).
+        std::vector<std::size_t> idx{0, 1};
+        std::sort(idx.begin(), idx.end(), [&](std::size_t a, std::size_t b) {
+            return fit.components[a].mean < fit.components[b].mean;
+        });
+        const auto& c0 = fit.components[idx[0]];
+        const auto& c1 = fit.components[idx[1]];
+
+        CHECK(c0.mean == doctest::Approx(0.0).epsilon(0.1));
+        CHECK(c1.mean == doctest::Approx(3.0).epsilon(0.05));
+        CHECK(c0.stddev == doctest::Approx(0.3).epsilon(0.15));
+        CHECK(c1.stddev == doctest::Approx(0.5).epsilon(0.15));
+        CHECK(fit.weights[idx[0]] == doctest::Approx(0.4).epsilon(0.1));
+        CHECK(fit.weights[idx[1]] == doctest::Approx(0.6).epsilon(0.1));
+        CHECK(fit.converged);
+    }
+
+    TEST_CASE("K=3: converges and weights sum to 1") {
+        auto data = sample_mixture(6000, {0.3, 0.4, 0.3}, {-2.0, 0.5, 3.0},
+                                   {0.4, 0.3, 0.5},
+                                   /*seed=*/11);
+        const auto fit = fit_gaussian_mixture(data, 3);
+        REQUIRE(fit.valid);
+        REQUIRE(fit.components.size() == 3);
+
+        double wsum = 0.0;
+        for (double w : fit.weights) wsum += w;
+        CHECK(wsum == doctest::Approx(1.0).epsilon(1e-9));
+        CHECK(fit.iterations > 0);  // fit.converged itself is not asserted here
+    }
+
+    TEST_CASE("rejects too few samples") {
+        std::vector<double> data{1.0, 2.0};
+        const auto fit = fit_gaussian_mixture(data, 2);
+        CHECK_FALSE(fit.valid);
+    }
+}
+// pdf/cdf/make_sampler checks on hand-built two-component mixtures.
+TEST_SUITE("FittedMixture pdf/cdf/sampler") {
+    TEST_CASE("pdf matches hand-computed value") {
+        FittedMixture m;
+        m.weights = {0.5, 0.5};
+        m.components = {{0.0, 1.0}, {2.0, 1.0}};
+        m.valid = true;
+        // pdf at x=1: 0.5 * N(1; 0, 1) + 0.5 * N(1; 2, 1)
+        // = 0.5 * 0.24197 + 0.5 * 0.24197 = 0.24197
+        CHECK(pdf(m, 1.0) == doctest::Approx(0.24197).epsilon(1e-4));
+    }
+
+    TEST_CASE("cdf is monotone and bounded") {
+        FittedMixture m;
+        m.weights = {0.3, 0.7};
+        m.components = {{-1.0, 0.5}, {1.0, 0.5}};
+        m.valid = true;
+        CHECK(cdf(m, -10.0) == doctest::Approx(0.0).epsilon(1e-6));
+        CHECK(cdf(m, 10.0) == doctest::Approx(1.0).epsilon(1e-6));
+        CHECK(cdf(m, 0.0) > cdf(m, -1.0));  // monotonicity sampled at one pair
+    }
+
+    TEST_CASE("sampler reproduces mixture mean within tolerance") {
+        FittedMixture m;
+        m.weights = {0.5, 0.5};
+        m.components = {{1.0, 0.2}, {3.0, 0.2}};
+        m.valid = true;
+        auto sampler = make_sampler(m);
+        std::mt19937_64 rng(17);
+        double sum = 0.0;
+        const int n = 6000;
+        for (int i = 0; i < n; ++i) sum += sampler(rng);
+        // True mixture mean = 0.5*1 + 0.5*3 = 2.0
+        CHECK(sum / n == doctest::Approx(2.0).epsilon(0.05));
+    }
+}
+// select_best_model: mixture for bimodal data, single fit for unimodal data.
+TEST_SUITE("select_best_model") {
+    TEST_CASE("picks mixture for bimodal data") {
+        auto data = sample_mixture(4000, {0.5, 0.5}, {0.0, 4.0}, {0.3, 0.4},
+                                   /*seed=*/22);
+        const auto singles = fit_all_single_distributions(data);
+        std::vector<FittedMixture> mixes{fit_gaussian_mixture(data, 2),
+                                         fit_gaussian_mixture(data, 3)};
+        const auto best = select_best_model(singles, mixes);
+        REQUIRE(best.has_value());
+        // For clearly bimodal data, no single dist should win.
+        CHECK(std::holds_alternative<FittedMixture>(best->model));
+    }
+
+    TEST_CASE("picks single for unimodal Normal data") {
+        std::mt19937_64 rng(33);
+        std::vector<double> data;
+        data.reserve(3000);
+        for (int i = 0; i < 3000; ++i)
+            data.push_back(std::normal_distribution<double>(2.0, 0.5)(rng));
+        const auto singles = fit_all_single_distributions(data);
+        std::vector<FittedMixture> mixes{fit_gaussian_mixture(data, 2),
+                                         fit_gaussian_mixture(data, 3)};
+        const auto best = select_best_model(singles, mixes);
+        REQUIRE(best.has_value());
+        // With BIC's parameter penalty, a single Normal should beat GMM-2/3.
+        CHECK(std::holds_alternative<FittedDistribution>(best->model));
+    }
+
+    TEST_CASE("empty inputs return nullopt") {
+        const auto best = select_best_model({}, {});
+        CHECK_FALSE(best.has_value());
+    }
+}
+// pdf() on a wrapped FittedDistribution, make_sampler() on a wrapped mixture.
+TEST_SUITE("BestModel variant dispatch") {
+    TEST_CASE("pdf dispatches through variant") {
+        FittedDistribution f{
+            DistributionKind::Normal, {0.0, 1.0, 0.0}, 0.0, 0.0, 0.0, true};
+        BestModel bm{f};
+        // pdf(std normal, 0) = 1/sqrt(2*pi) ~ 0.39894
+        CHECK(pdf(bm, 0.0) == doctest::Approx(0.39894).epsilon(1e-3));
+    }
+
+    TEST_CASE("sampler dispatches through variant") {
+        FittedMixture m;
+        m.weights = {1.0};
+        m.components = {{5.0, 0.1}};
+        m.valid = true;
+        BestModel bm{m};
+        auto s = make_sampler(bm);
+        std::mt19937_64 rng(55);
+        const double draw = s(rng);  // a single seeded draw from one component
+        CHECK(draw == doctest::Approx(5.0).epsilon(0.1));
+    }
+}
diff --git a/tests/utilities/dlio/test_barrier_simulator.cpp b/tests/utilities/dlio/test_barrier_simulator.cpp
new file mode 100644
index 00000000..385548f9
--- /dev/null
+++ b/tests/utilities/dlio/test_barrier_simulator.cpp
@@ -0,0 +1,241 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/utilities/dlio/barrier_simulator.h>
+#include <dftracer/utils/utilities/dlio/statistic.h>
+#include <dftracer/utils/utilities/dlio/worker_queue.h>
+#include <doctest/doctest.h>
+
+#include <cmath>
+#include <vector>
+
+using namespace dftracer::utils::utilities::dlio;
+
+namespace {
+
+Statistic constant_stat(double value) {
+    Statistic s;
+    s.update(value);
+    return s;
+}
+
+constexpr double kEpsilon = 1e-9;
+
+}  // namespace
+
+TEST_SUITE("sweep_union") {
+    TEST_CASE("empty") {
+        std::vector<Boundary> b;
+        CHECK(sweep_union(b) == doctest::Approx(0.0));
+    }
+
+    TEST_CASE("single interval - returns its width in seconds") {
+        // 1 second = 1,000,000 us.
+        std::vector<Boundary> b{{0, +1}, {1'000'000, -1}};
+        CHECK(sweep_union(b) == doctest::Approx(1.0));
+    }
+
+    TEST_CASE("two disjoint intervals - sum of widths") {
+        std::vector<Boundary> b{
+            {0, +1}, {500'000, -1}, {1'000'000, +1}, {1'500'000, -1}};
+        CHECK(sweep_union(b) == doctest::Approx(1.0));
+    }
+
+    TEST_CASE("two overlapping intervals - merged width") {
+        // [0, 1s] and [0.5s, 1.5s] -> union is [0, 1.5s] = 1.5s.
+        std::vector<Boundary> b{
+            {0, +1}, {1'000'000, -1}, {500'000, +1}, {1'500'000, -1}};
+        CHECK(sweep_union(b) == doctest::Approx(1.5));
+    }
+
+    TEST_CASE("nested intervals - outer width") {
+        // [0, 2s] contains [0.5s, 1.5s].
+        std::vector<Boundary> b{
+            {0, +1}, {2'000'000, -1}, {500'000, +1}, {1'500'000, -1}};
+        CHECK(sweep_union(b) == doctest::Approx(2.0));
+    }
+}
+
+TEST_SUITE("cdf_similarity") {
+    TEST_CASE("identical samples - exactly 1.0") {
+        std::vector<double> a{0.1, 0.2, 0.3, 0.4};
+        CHECK(cdf_similarity(a, a) == doctest::Approx(1.0));
+    }
+
+    TEST_CASE("fully disjoint samples") {
+        // a in [0.1, 0.4], b in [10, 10.4]: for any v in [0.4, 10) the CDF of
+        // a is 1 while the CDF of b is 0 -> KS = 1, similarity = 0.
+        std::vector<double> a{0.1, 0.2, 0.3, 0.4};
+        std::vector<double> b{10.0, 10.1, 10.2, 10.4};
+        CHECK(cdf_similarity(a, b) == doctest::Approx(0.0));
+    }
+
+    TEST_CASE("empty inputs") {
+        std::vector<double> a{1.0, 2.0};
+        std::vector<double> empty;
+        CHECK(cdf_similarity(a, empty) == doctest::Approx(0.0));
+        CHECK(cdf_similarity(empty, a) == doctest::Approx(0.0));
+    }
+}
+
+TEST_SUITE("variance") {
+    TEST_CASE("empty") { CHECK(variance({}) == doctest::Approx(0.0)); }
+
+    TEST_CASE("constant values - zero variance") {
+        CHECK(variance({3.0, 3.0, 3.0}) == doctest::Approx(0.0));
+    }
+
+    TEST_CASE("known sample") {
+        // Population variance of {1, 2, 3, 4, 5} = 2.0.
+        CHECK(variance({1.0, 2.0, 3.0, 4.0, 5.0}) == doctest::Approx(2.0));
+    }
+}
+
+TEST_SUITE("WorkerQueue") {
+    TEST_CASE("produce_batches fills exactly to capacity") {
+        WorkerQueue q(/*num_workers=*/2, /*prefetch_factor=*/3);
+        auto sampler = []() -> std::pair<double, double> { return {1.0, 0.5}; };
+        auto intervals = q.produce_batches(0.0, sampler);
+        CHECK(intervals.size() == 6);  // presumably 2 workers * 3 prefetch
+        CHECK(q.queue_depth() == 6);
+    }
+
+    TEST_CASE("consume_batch on ready batch returns base_overhead") {
+        WorkerQueue q(1, 1);
+        auto sampler = []() -> std::pair<double, double> { return {2.0, 1.0}; };
+        q.produce_batches(0.0, sampler);
+        const double consumed =
+            q.consume_batch(/*current_time=*/5.0, /*base_overhead=*/0.1);
+        CHECK(consumed == doctest::Approx(0.1));
+        CHECK(q.queue_depth() == 0);
+        CHECK_FALSE(q.had_stall());
+    }
+
+    TEST_CASE("consume_batch on empty queue counts as stall") {
+        WorkerQueue q(1, 1);
+        const double consumed = q.consume_batch(0.0, 0.1);
+        CHECK(consumed == doctest::Approx(0.1));
+        CHECK(q.had_stall());
+        CHECK(q.stall_count() == 1);
+    }
+
+    TEST_CASE(
+        "consume_batch on not-yet-ready batch returns wait + base_overhead") {
+        WorkerQueue q(1, 1);
+        auto sampler = []() -> std::pair<double, double> {
+            return {10.0, 5.0};
+        };
+        q.produce_batches(0.0, sampler);
+        const double consumed =
+            q.consume_batch(/*current_time=*/3.0, /*base_overhead=*/0.2);
+        // Batch ready at 10.0, current 3.0 -> wait 7.0 + base 0.2 = 7.2.
+        CHECK(consumed == doctest::Approx(7.2));
+        CHECK(q.had_stall());
+    }
+}
+
+TEST_SUITE("BarrierSimulator") {
+    // Build a context with deterministic constant samplers and trace data.
+    // Each rank runs num_steps iterations with constant fetch_iter=0.05s,
+    // constant fetch_block=0.1s, no preprocess simulation, aggregated mode.
+    static BarrierSimulatorContext make_ctx(int num_ranks, int num_steps) {
+        BarrierSimulatorContext ctx;
+        ctx.num_ranks = num_ranks;
+        ctx.num_steps = num_steps;
+        ctx.is_aggregated_trace = true;  // skip clamp-to-trace-bounds
+        ctx.sync_mode = false;
+
+        // Pre-loaded fetch_iter trace lets the simulator skip RNG-driven path.
+        ctx.fetch_iter_trace.assign(num_ranks,
+                                    std::vector<double>(num_steps, 0.05));
+        // Trace fetch_block values (used by CDF similarity check).
+        ctx.fetch_block_trace.assign(num_ranks,
+                                     std::vector<double>(num_steps, 0.1));
+
+        ctx.fetch_iter_stats = constant_stat(0.05);
+        ctx.fetch_block_stats = constant_stat(0.1);
+        ctx.preprocess_stats = constant_stat(0.01);
+        ctx.getitem_stats = constant_stat(0.02);
+
+        ctx.trace_e2e_duration = num_steps * (0.05 + 0.1);
+        ctx.trace_rank_variance = 0.0;
+        ctx.trace_per_rank_throughput.assign(num_ranks, 1.0 / 0.15);
+        return ctx;
+    }
+
+    TEST_CASE("async mode, constant work - e2e equals per-rank wall clock") {
+        const int num_ranks = 4;
+        const int num_steps = 10;
+        auto ctx = make_ctx(num_ranks, num_steps);
+
+        BarrierSimulator sim;
+        auto result =
+            sim.simulate(ctx, /*base_seed=*/42,
+                         /*fetch_block_sampler=*/[](Rng&) { return 0.1; });
+
+        const double expected_per_rank = num_steps * (0.05 + 0.1);  // 1.5s
+        CHECK(result.per_rank_completion_time.size() ==
+              static_cast<std::size_t>(num_ranks));
+        for (double t : result.per_rank_completion_time) {
+            CHECK(t == doctest::Approx(expected_per_rank).epsilon(kEpsilon));
+        }
+        // All ranks run in lockstep over the same wall clock; union = per-rank
+        // time.
+        CHECK(result.e2e_duration ==
+              doctest::Approx(expected_per_rank).epsilon(1e-6));
+        CHECK(result.rank_variance == doctest::Approx(0.0));
+        CHECK(result.load_imbalance == doctest::Approx(0.0).epsilon(1e-6));
+        CHECK(result.e2e_error == doctest::Approx(0.0).epsilon(1e-6));
+
+        // Simulated fetch_block matches trace exactly -> CDF similarity == 1.0.
+        CHECK(result.fetch_block_cdf_similarity == doctest::Approx(1.0));
+
+        // Component accumulated times.
+        CHECK(result.fetch_block_metrics.accumulated_time ==
+              doctest::Approx(num_ranks * num_steps * 0.1));
+        CHECK(result.fetch_iter_metrics.accumulated_time ==
+              doctest::Approx(num_ranks * num_steps * 0.05));
+        CHECK(result.fetch_block_metrics.num_samples ==
+              static_cast<std::uint64_t>(num_ranks * num_steps));
+    }
+
+    TEST_CASE("sync mode with barrier every step - lockstep advance") {
+        const int num_ranks = 3;
+        const int num_steps = 5;
+        auto ctx = make_ctx(num_ranks, num_steps);
+        ctx.sync_mode = true;
+        ctx.accumulate_grad_batches = 1;
+
+        BarrierSimulator sim;
+        auto result =
+            sim.simulate(ctx, /*base_seed=*/123, [](Rng&) { return 0.1; });
+
+        const double expected = num_steps * 0.15;  // 0.05 iter + 0.1 block
+        // In sync mode all ranks finish at the same time; e2e = rank_times[0].
+        CHECK(result.e2e_duration == doctest::Approx(expected).epsilon(1e-6));
+        for (double t : result.per_rank_completion_time) {
+            CHECK(t == doctest::Approx(expected).epsilon(kEpsilon));
+        }
+        // Constant work + barrier -> zero barrier overhead.
+        CHECK(result.avg_barrier_overhead == doctest::Approx(0.0));
+        CHECK(result.max_barrier_overhead == doctest::Approx(0.0));
+    }
+
+    TEST_CASE(
+        "throughput metrics populated when trace throughput is provided") {
+        auto ctx = make_ctx(/*num_ranks=*/2, /*num_steps=*/8);
+        BarrierSimulator sim;
+        auto result = sim.simulate(ctx, 7, [](Rng&) { return 0.1; });
+
+        const double expected_throughput = 8.0 / (8 * 0.15);
+        CHECK(result.simulated_per_rank_throughput.size() == 2);
+        for (double tp : result.simulated_per_rank_throughput) {
+            CHECK(tp == doctest::Approx(expected_throughput));
+        }
+        CHECK(result.throughput_mean == doctest::Approx(expected_throughput));
+        CHECK(result.throughput_variance == doctest::Approx(0.0).epsilon(1e-9));
+        // CDF similarity is computed but not asserted exactly: simulated
+        // throughput can differ from trace throughput by ~1 ULP due to
+        // accumulation order, and KS is exact so a 1-ULP gap collapses
+        // similarity to 0. Just check it ran.
+        CHECK(result.trace_per_rank_throughput.size() == 2);
+    }
+}

From eafcd87412bcf35508e985ce53e51d08322d603c Mon Sep 17 00:00:00 2001
From: Ray Andrew <rs@rs.ht>
Date: Mon, 18 May 2026 22:21:18 -0500
Subject: [PATCH 2/2] chore(docs): update DLIO generator docs

---
 docs/source/cli.rst              |  95 ++++++++++++++++++-
 docs/source/pipeline.rst         |   2 +-
 docs/source/utilities.rst        |  32 ++++++-
 docs/source/utilities/common.rst | 101 +++++++++++++++++++-
 docs/source/utilities/dlio.rst   | 157 +++++++++++++++++++++++++++++++
 5 files changed, 377 insertions(+), 10 deletions(-)
 create mode 100644 docs/source/utilities/dlio.rst

diff --git a/docs/source/cli.rst b/docs/source/cli.rst
index e4b74464..1b34d463 100644
--- a/docs/source/cli.rst
+++ b/docs/source/cli.rst
@@ -13,7 +13,7 @@ Most tools wire in a common set of argument schemas defined in
 semantics across every binary that exposes the relevant schema and are not
 repeated in each tool's section.
 
-**Pipeline (``PipelineArgs``)**
+**Pipeline** (``PipelineArgs``)
 
 - ``--executor-threads <count>`` - Number of worker threads for parallel
   processing (default: number of CPU cores)
@@ -21,19 +21,19 @@ repeated in each tool's section.
   cores)
 - ``--time-profiling`` - Print stage timing breakdown to stderr
 
-**Indexing (``IndexingArgs``)**
+**Indexing** (``IndexingArgs``)
 
 - ``--index-dir <path>`` - Directory for ``.dftindex`` stores
 - ``--checkpoint-size <bytes>`` - Checkpoint size for gzip indexing in bytes
   (default: 33554432 B / 32 MB)
 - ``-f, --force`` - Force index recreation
 
-**Query (``QueryArgs``)**
+**Query** (``QueryArgs``)
 
 - ``--query <query>`` - Query DSL filter
   (e.g., ``'cat == "POSIX" and dur > 1000'``)
 
-**Watchdog (``WatchdogArgs``)**
+**Watchdog** (``WatchdogArgs``)
 
 - ``--disable-watchdog`` - Disable watchdog for hang detection
 - ``--watchdog-global-timeout <s>`` - Watchdog global timeout for pipeline
@@ -49,7 +49,7 @@ repeated in each tool's section.
 - ``--watchdog-deadlock-timeout <s>`` - Watchdog deadlock timeout in seconds
   (0 = use default, default: 600)
 
-**Inputs (``DirectoryArgs`` / ``FilesArgs``)**
+**Inputs** (``DirectoryArgs`` / ``FilesArgs``)
 
 - ``-d, --directory <path>`` - Directory containing trace files
 - ``--files <files...>`` - Trace files (``.pfw``, ``.pfw.gz``)
@@ -516,6 +516,91 @@ The Arrow output always includes the base columns ``batch_type``, ``cat``,
     import duckdb
     result = duckdb.sql("SELECT * FROM 'agg.arrows'")
 
+dftracer_gen_dlio_config
+------------------------
+
+**Description:** Generate a DLIO YAML configuration directly from a directory
+of raw DFTracer traces. The tool indexes the inputs, aggregates them into the
+internal ``AGGREGATION`` column family (DDSketch forced on), fits per-component
+distributions, refines ``max_bound`` against an internal barrier simulator, and
+emits a DLIO ``train.computation_time`` + ``reader.preprocess_time`` block. The
+user does not need to run ``dftracer_aggregator`` separately.
+
+Required input event names: ``cat=dataloader`` with ``name=fetch.block`` /
+``fetch.iter``, and ``cat=data`` with ``name=preprocess`` / ``item``. The tool
+exits non-zero with an explanatory message if no DLIO events are present.
+
+**Usage:**
+
+.. code-block:: bash
+
+    dftracer_gen_dlio_config [OPTIONS] -o <config.yaml>
+
+**Options:**
+
+- ``-d, --directory <path>`` - Input directory containing .pfw or .pfw.gz traces (default: .)
+- ``-o, --output <path>`` - Output path for the DLIO YAML config [required]
+- ``--max-bound-percentile <pct>`` - Initial max_bound percentile, 0-100 (default: 95)
+- ``--simulation-iterations <n>`` - Max simulator iterations for percentile refinement (default: 5)
+- ``--target-e2e-error <frac>`` - Target relative E2E error to declare convergence (default: 0.05)
+- ``--target-cdf-similarity <frac>`` - Target fetch_block CDF similarity (default: 0.90)
+- ``--patience <n>`` - Early-stop after this many iterations without improvement (default: 10)
+- ``--epsilon <step>`` - Base step size for percentile adjustment (default: 1.0)
+- ``--momentum <m>`` - Momentum factor in [0, 1) (default: 0.9)
+- ``--min-percentile <pct>`` - Floor on max_bound percentile during optimization (default: 50)
+- ``--num-workers <n>`` - DataLoader worker count for the simulator (default: 8)
+- ``--prefetch-factor <n>`` - DataLoader prefetch factor (default: 2)
+- ``--seed <n>`` - Base seed for simulator and sampler (default: 42)
+- ``--max-samples-per-entry <n>`` - Cap on synthesized samples per aggregation entry; 0 disables (default: 100)
+- ``-t, --time-interval <ms>`` - Aggregation time interval in ms (default: 5000)
+- ``--index-dir <path>`` - Directory for the shared index store (default: system temp dir)
+- ``--checkpoint-size <bytes>`` - Checkpoint size for indexing in bytes (default: 33554432 B / 32 MB)
+- ``--executor-threads <count>`` - Number of executor threads for parallel processing
+- ``-f, --force`` - Force index recreation
+
+**Distribution pool:** Each component is fit as the lowest-BIC choice among
+{Normal, Lognormal, Gamma, Exponential, Weibull, Gaussian Mixture (K=2),
+Gaussian Mixture (K=3)}. Mixture candidates are only considered when the
+sample count is at least 20.
+
+**Example:**
+
+.. code-block:: bash
+
+    # Generate config from a directory of raw traces
+    dftracer_gen_dlio_config -d ./traces -o dlio_config.yaml
+
+    # Refine harder against the simulator with a tighter convergence target
+    dftracer_gen_dlio_config -d ./traces -o dlio_config.yaml \
+        --simulation-iterations 20 --target-e2e-error 0.02 --patience 5
+
+    # Reuse a shared index directory across runs to skip re-indexing
+    dftracer_gen_dlio_config -d ./traces -o dlio_config.yaml \
+        --index-dir /var/cache/dftracer/idx
+
+**Output schema:**
+
+.. code-block:: yaml
+
+    train:
+      computation_time:
+        type: <normal|lognormal|gamma|exponential|weibull|mixture>
+        # single distribution: per-family params (mean/stdev, mu/sigma,
+        # shape/scale, rate)
+        # mixture: n_components + components: [{weight, params: {type, ...}}]
+        max_bound: <seconds>
+    reader:
+      preprocess_time:
+        # same structure
+
+**Comparing against an external generator:** ``scripts/compare_dlio_yamls.py``
+diffs two DLIO YAMLs with a tolerance check on parameters and a two-sample
+Kolmogorov-Smirnov check on samples drawn from each fit. Run via ``uv run
+scripts/compare_dlio_yamls.py --python <a.yaml> --cpp <b.yaml>`` (the inline
+PEP-723 metadata installs ``pyyaml`` and ``numpy`` automatically). When both
+YAMLs select the same model family and the KS statistic is small, they produce
+statistically indistinguishable DLIO sample streams.
+
 dftracer_organize
 -----------------
 
diff --git a/docs/source/pipeline.rst b/docs/source/pipeline.rst
index 05faad8a..49019439 100644
--- a/docs/source/pipeline.rst
+++ b/docs/source/pipeline.rst
@@ -818,7 +818,7 @@ Control execution duration and cooperative cancellation using ``PipelineConfig``
         co_return;
     });
 
-**Timing out a race with ``when_any`` + timeout:** Use ``when_any`` with a timeout awaitable to race operations:
+**Timing out a race with** ``when_any`` **+ timeout:** Use ``when_any`` with a timeout awaitable to race operations:
 
 .. code-block:: cpp
 
diff --git a/docs/source/utilities.rst b/docs/source/utilities.rst
index f462d079..3b393220 100644
--- a/docs/source/utilities.rst
+++ b/docs/source/utilities.rst
@@ -17,6 +17,7 @@ dftracer-utils provides a collection of composable utilities for trace file proc
    utilities/indexer
    utilities/reader
    utilities/common
+   utilities/dlio
    call-tree
 
 Overview
@@ -43,8 +44,9 @@ Utilities follow a consistent pattern:
            Hash["Hash<br/>FNV1a, Std, MurmurHash3"]
            Indexer["Indexer<br/>Checkpoint, BloomFilter"]
            Reader["Reader<br/>Stream, LineProcessor"]
-           Common["Common<br/>JSON, DDSketch, Log2Histogram"]
+           Common["Common<br/>JSON, DDSketch, Statistic, Distributions, Mixture"]
            Composites["Composites<br/>DFTracer-specific pipelines"]
+           Dlio["DLIO<br/>BarrierSimulator, TraceLoader, Optimizer, YAML emit"]
        end
 
        Utility --> FileIO
@@ -55,6 +57,7 @@ Utilities follow a consistent pattern:
        Utility --> Reader
        Utility --> Common
        Utility --> Composites
+       Utility --> Dlio
 
 File I/O
 --------
@@ -71,13 +74,36 @@ See :doc:`/utilities/fileio` for detailed usage.
 Statistics
 ----------
 
-Enhanced statistics collection for trace analysis:
+Enhanced statistics collection and distribution fitting for trace analysis:
 
 - **DDSketch**: Deterministic, merge-order-independent percentile estimation with bounded relative error
 - **Log2Histogram**: Fixed 65-bin logarithmic histogram for duration and size distributions
+- **Statistic**: Min/max/mean/count accumulator that optionally delegates to an attached DDSketch for quantile queries
+- **Distributions**: MLE fitting + KS / BIC scoring for Normal, Lognormal, Gamma, Exponential, Weibull; sampler factory backed by ``<random>`` and `Boost.Math standalone <https://www.boost.org/doc/libs/release/libs/math/doc/html/math_toolkit/standalone.html>`_
+- **Mixture**: Univariate Gaussian Mixture EM (K=2, K=3) with log-sum-exp responsibilities and BIC-based selection across single + mixture models
 - **Chunk statistics**: Per-chunk event tracking with online variance calculation and per-name duration sketches
 
-These are used in indexing and aggregation pipelines to compute event distributions and percentiles efficiently.
+These are used in indexing and aggregation pipelines to compute event distributions and percentiles efficiently, and by the DLIO config generator to fit per-component timing distributions.
+
+DLIO Config Generation
+----------------------
+
+End-to-end pipeline that converts a directory of raw DFTracer logs into a DLIO
+training-loop YAML configuration:
+
+- **trace_loader**: pulls the ``AGGREGATION`` column family (re-attaches the
+  merge operator at open time) and synthesizes per-rank sample arrays from
+  per-(pid, time_bucket) entries.
+- **BarrierSimulator**: simulates one DLIO training run across the captured
+  ranks/steps, scoring an end-to-end duration, rank variance, and ``fetch.block``
+  CDF similarity against the empirical trace.
+- **optimizer**: sequential momentum loop refining the ``max_bound`` percentile
+  on the fitted sampler to minimize simulator E2E error.
+- **yaml_emit**: renders single distributions or Gaussian mixtures into the
+  DLIO ``train.computation_time`` / ``reader.preprocess_time`` schema.
+
+See :doc:`/utilities/dlio` for the API and ``dftracer_gen_dlio_config`` in
+:doc:`/cli` for the user-facing binary.
 
 Indexing
 --------
diff --git a/docs/source/utilities/common.rst b/docs/source/utilities/common.rst
index 3b571b29..cede5cfd 100644
--- a/docs/source/utilities/common.rst
+++ b/docs/source/utilities/common.rst
@@ -241,13 +241,19 @@ The query AST uses ``std::variant``-based nodes:
 Statistics
 ----------
 
-Percentile estimation and histogram utilities for trace analysis.
+Percentile estimation, histogram, accumulator, and distribution-fitting
+utilities for trace analysis.
 
 .. code-block:: cpp
 
    #include <dftracer/utils/utilities/common/statistics/ddsketch.h>
    #include <dftracer/utils/utilities/common/statistics/log2_histogram.h>
    #include <dftracer/utils/utilities/common/statistics/timestamp_histogram.h>
+   #include <dftracer/utils/utilities/common/statistics/statistic.h>
+   #include <dftracer/utils/utilities/common/statistics/distributions.h>
+   #include <dftracer/utils/utilities/common/statistics/mixture.h>
+   // Or use the umbrella header:
+   #include <dftracer/utils/utilities/common/statistics/statistics.h>
 
 DDSketch
 ~~~~~~~~
@@ -350,6 +356,99 @@ expansions for adaptive aggregation.
    auto bytes = th.serialize();
    auto restored = TimestampHistogram::deserialize(bytes.data(), bytes.size());
 
+Statistic
+~~~~~~~~~
+
+Lightweight min/max/mean/count accumulator with an optional DDSketch backing
+for quantile queries. When a sketch is attached, ``quantile()`` consults it;
+when no sketch is present, the fallback is a uniform interpolation between
+observed min and max.
+
+.. code-block:: cpp
+
+   Statistic stat;
+   for (double v : samples) stat.update(v);
+
+   double mean = stat.mean();
+   double approx_p50 = stat.quantile(0.5);  // uses linear-interp without a sketch
+
+   // Promote to DDSketch-backed quantiles by attaching a populated sketch.
+   auto sketch = std::make_shared<DDSketch>(0.01);
+   for (double v : samples) sketch->add(v);
+   stat.attach_sketch(std::move(sketch));
+   double real_p99 = stat.quantile(0.99);  // now consults the sketch
+
+Distributions
+~~~~~~~~~~~~~
+
+Maximum-likelihood fitting for five parametric families plus a Kolmogorov-
+Smirnov goodness-of-fit score and BIC. Backed by `Boost.Math standalone
+<https://www.boost.org/doc/libs/release/libs/math/doc/html/math_toolkit/standalone.html>`_
+for CDF/PDF/quantile evaluation; samplers use ``<random>``.
+
+Supported families: Normal, Lognormal, Gamma, Exponential, Weibull.
+
+.. code-block:: cpp
+
+   std::vector<double> data = ...;
+
+   // Fit one family directly.
+   FittedDistribution fit = fit_single_distribution(
+       DistributionKind::Lognormal, data);
+   if (fit.valid) {
+       printf("lognormal mu=%.4f sigma=%.4f KS=%.4f BIC=%.2f\n",
+              fit.params[0], fit.params[1], fit.ks_stat, fit.bic);
+   }
+
+   // Fit all five and pick the lowest-KS valid fit.
+   auto fits = fit_all_single_distributions(data);
+   if (auto best = best_fit_by_ks(fits)) {
+       printf("best family: %s\n",
+              std::string(distribution_name(best->kind)).c_str());
+
+       // Build a sampler from the best fit (optionally bounded).
+       auto sampler = make_sampler(*best, /*min_bound=*/0.0,
+                                   /*max_bound=*/0.5);
+       std::mt19937_64 rng(42);
+       double draw = sampler(rng);
+   }
+
+Mixture
+~~~~~~~
+
+Univariate Gaussian Mixture Model fitting via EM (K=2, K=3) with log-sum-exp
+responsibilities, quantile-spread initial means, and a variance floor to
+prevent component collapse. Plus a BIC-based selector across single
+distributions and mixtures.
+
+.. code-block:: cpp
+
+   // Fit a 2-component Gaussian mixture.
+   FittedMixture m = fit_gaussian_mixture(data, /*K=*/2);
+   if (m.valid && m.converged) {
+       for (size_t k = 0; k < m.weights.size(); ++k) {
+           printf("comp %zu weight=%.3f mean=%.6f stddev=%.6f\n",
+                  k, m.weights[k],
+                  m.components[k].mean, m.components[k].stddev);
+       }
+   }
+
+   // Pick the lowest-BIC model across {singles, GMM-2, GMM-3}.
+   auto singles = fit_all_single_distributions(data);
+   std::vector<FittedMixture> mixes{
+       fit_gaussian_mixture(data, 2),
+       fit_gaussian_mixture(data, 3),
+   };
+   auto selection = select_best_model(singles, mixes);
+
+   if (selection) {
+       // `BestModel` is a std::variant<FittedDistribution, FittedMixture>.
+       // pdf / cdf / make_sampler are overloaded and dispatch through it.
+       auto sampler = make_sampler(selection->model);
+       std::mt19937_64 rng(42);
+       double draw = sampler(rng);
+   }
+
 Arrow
 -----
 
diff --git a/docs/source/utilities/dlio.rst b/docs/source/utilities/dlio.rst
new file mode 100644
index 00000000..26b19530
--- /dev/null
+++ b/docs/source/utilities/dlio.rst
@@ -0,0 +1,157 @@
+DLIO Config Generation
+======================
+
+The ``dlio`` utilities power the ``dftracer_gen_dlio_config`` binary
+(see the ``dftracer_gen_dlio_config`` section of :doc:`/cli`).
+They consume an already-populated ``AGGREGATION`` column family (produced by
+``dftracer_aggregator``, see :doc:`/cli`, or by the shared
+``aggregation_runner`` library function) and emit a DLIO-compatible YAML
+config describing per-component timing distributions.
+
+.. code-block:: cpp
+
+   #include <dftracer/utils/utilities/dlio/barrier_simulator.h>
+   #include <dftracer/utils/utilities/dlio/optimizer.h>
+   #include <dftracer/utils/utilities/dlio/statistic.h>
+   #include <dftracer/utils/utilities/dlio/trace_loader.h>
+   #include <dftracer/utils/utilities/dlio/worker_queue.h>
+   #include <dftracer/utils/utilities/dlio/yaml_emit.h>
+
+Pipeline overview
+-----------------
+
+End-to-end, the module composes five pieces:
+
+1. **trace_loader** opens an existing RocksDB read-only (with the AGGREGATION
+   merge operator re-attached), iterates the ``AGGREGATION`` column family,
+   groups entries by ``(cat, name, pid, time_bucket)``, and synthesizes a flat
+   per-rank sample stream per component (``fetch.block``, ``fetch.iter``,
+   ``preprocess``, ``item``). Sketches are used for inverse-CDF sampling when
+   the aggregator was run with ``--compute-percentiles``; otherwise the
+   per-call mean is replicated.
+
+2. The :doc:`distribution fitter <common>` (under
+   ``common/statistics/distributions.h`` and ``mixture.h``) fits the lowest-BIC
+   model from {Normal, Lognormal, Gamma, Exponential, Weibull, GMM-2, GMM-3}.
+
+3. **BarrierSimulator** simulates one DLIO training run across the captured
+   ranks/steps using the fitted distribution as the ``fetch.block`` sampler.
+   It produces an end-to-end duration, rank variance, and a Kolmogorov-Smirnov
+   similarity between simulated and trace ``fetch.block`` samples.
+
+4. **optimizer** runs a sequential momentum loop tuning the ``max_bound``
+   percentile used to clamp the sampler, minimizing simulator E2E error while
+   keeping the CDF similarity above a target.
+
+5. **yaml_emit** renders the final YAML.
+
+trace_loader
+------------
+
+.. code-block:: cpp
+
+   TraceLoaderOptions opts;
+   opts.max_samples_per_entry = 100;   // cap per (pid, bucket) entry, 0 = unlimited
+   opts.seed = 0xD15710;                // seed for inverse-CDF sampling
+
+   AggregatedTraces traces = load_aggregated_traces(db_path, opts);
+
+   if (!traces.any_data) { /* no DLIO events in this DB */ }
+   if (!traces.sketches_available) {
+       // The aggregator was run without --compute-percentiles. We fell back to
+       // mean replication; rerun with the flag for higher-fidelity output.
+   }
+
+Returns an ``AggregatedTraces`` with:
+
+- ``fetch_block_trace`` / ``fetch_iter_trace`` / ``getitem_trace`` —
+  per-rank ``std::vector<std::vector<double>>`` of seconds, in pid-ascending
+  then time-bucket-ascending order.
+- ``computation_times`` / ``preprocess_times`` — flat sample arrays in seconds
+  (the input to ``fit_all_single_distributions``).
+- ``fetch_block_stats`` / ``fetch_iter_stats`` / ``preprocess_stats`` /
+  ``getitem_stats`` — :doc:`Statistic <common>` objects, with merged DDSketches
+  attached when available.
+- ``trace_e2e_duration`` and per-component ``ComponentTimeMetrics`` with both
+  ``accumulated_time`` (sum of ``count × mean``) and ``union_time`` (true
+  wall-clock union via ``sweep_union`` over per-entry ``(ts, te)`` boundaries).
+
+BarrierSimulator
+----------------
+
+.. code-block:: cpp
+
+   BarrierSimulatorContext ctx = make_simulator_context(
+       traces, /*num_workers=*/8, /*prefetch_factor=*/2);
+
+   auto sampler = make_sampler(fitted_model);  // from common/statistics
+   BarrierSimulator sim;
+   BarrierSimulationResult result = sim.simulate(
+       ctx, /*base_seed=*/42, sampler);
+
+   printf("e2e=%.3fs error=%.2f%% fetch_block_cdf_sim=%.4f\n",
+          result.e2e_duration,
+          result.e2e_error * 100.0,
+          result.fetch_block_cdf_similarity);
+
+Free helpers exposed alongside ``BarrierSimulator``:
+
+- ``sweep_union(boundaries)`` — sweep-line interval union, microseconds to
+  seconds.
+- ``cdf_similarity(a, b)`` — ``1 − KS`` between two empirical samples.
+- ``variance(values)`` — population variance.
+
+Distribution fitting
+--------------------
+
+Lives under ``common/statistics`` and works on any sample array, not just DLIO
+traces — see :doc:`common` for ``FittedDistribution``, ``FittedMixture``,
+``BestModel`` (the ``std::variant``), ``select_best_model``, ``make_sampler``,
+and free ``pdf`` / ``cdf`` / ``quantile`` overloads.
+
+optimizer
+---------
+
+.. code-block:: cpp
+
+   OptimizerOptions opt_opts;
+   opt_opts.max_iterations = 5;
+   opt_opts.target_e2e_error = 0.05;
+   opt_opts.target_cdf_similarity = 0.90;
+   opt_opts.patience = 10;
+   opt_opts.epsilon = 1.0;
+   opt_opts.momentum = 0.9;
+   opt_opts.min_percentile = 50.0;
+   opt_opts.initial_percentile = 95.0;
+   opt_opts.base_seed = 42;
+
+   OptimizerResult opt = optimize_max_bound_percentile(
+       ctx, fitted_model, traces.computation_times, opt_opts);
+
+   double max_bound = percentile(sorted_samples, opt.best_percentile);
+
+Each iteration constructs a fresh sampler clamped at
+``percentile(sample_times, current_percentile)``, runs ``simulate()``, and
+adjusts ``current_percentile`` by a momentum-smoothed step proportional to the
+E2E error sign. Convergence: ``e2e_error < target_e2e_error`` AND
+``fetch_block_cdf_similarity > target_cdf_similarity``. Early-stops after
+``patience`` iterations without improvement.
+
+yaml_emit
+---------
+
+.. code-block:: cpp
+
+   DlioTimingBlock comp{best_comp_model, comp_max_bound};
+   DlioTimingBlock prep{best_prep_model, prep_max_bound};
+
+   std::ofstream out("dlio_config.yaml");
+   write_dlio_yaml(out, &comp, &prep);
+
+   // Or render to a string:
+   std::string yaml = render_dlio_yaml(&comp, &prep);
+
+Renders both single distributions and mixtures into the DLIO schema
+(``type: <family>`` + family-specific params, or ``type: mixture`` +
+``n_components`` + ``components: [{weight, params: {...}}]``). Pass ``nullptr``
+to either block argument to omit it.