llnl · hariharan-devarajan · May 19, 2026 · May 18, 2026 · May 19, 2026
diff --git a/cmake/modules/Dependencies.cmake b/cmake/modules/Dependencies.cmake
@@ -1766,6 +1766,95 @@ function(link_nanoarrow TARGET_NAME LIBRARY_TYPE)
 
 endfunction()
 
+# ==============================================================================
+# Boost.Math (standalone, header-only); for statistical distributions
+# ==============================================================================
+
+function(need_boost_math)
+  # Fetch the Boost.Math sources through CPM (DOWNLOAD_ONLY) unless this scope
+  # already sees boost_math_ADDED.
+  if(NOT boost_math_ADDED)
+    cpmaddpackage(
+      NAME boost_math
+      GITHUB_REPOSITORY boostorg/math
+      GIT_TAG boost-1.91.0
+      DOWNLOAD_ONLY YES)
+  endif()
+
+  # Nothing to publish when no source directory is known.
+  if(NOT boost_math_SOURCE_DIR)
+    return()
+  endif()
+
+  # CPM defines boost_math_SOURCE_DIR only in this function's scope; mirror it
+  # into the cache so link_boost_math() can read it from any directory.
+  set(boost_math_SOURCE_DIR "${boost_math_SOURCE_DIR}"
+      CACHE INTERNAL "Boost.Math source tree from CPM")
+  message(STATUS "Added Boost.Math (standalone) headers from ${boost_math_SOURCE_DIR}/include")
+endfunction()
+
+# link_boost_math(<target>): add the standalone Boost.Math include directory
+# (SYSTEM PRIVATE) and the BOOST_MATH_STANDALONE definition (PRIVATE) to
+# <target>. No INTERFACE link target is used, so neither reaches the
+# installed/exported target set. Fails with FATAL_ERROR when <target> is empty
+# or unknown, or when boost_math_SOURCE_DIR is unset (call need_boost_math()).
+function(link_boost_math TARGET_NAME)
+  # Compare against "" explicitly: if(NOT TARGET_NAME) would also reject valid
+  # target names that CMake treats as false constants (e.g. "no" or "off").
+  if("${TARGET_NAME}" STREQUAL "")
+    message(FATAL_ERROR "link_boost_math: TARGET_NAME is required")
+  elseif(NOT TARGET "${TARGET_NAME}")
+    message(FATAL_ERROR "link_boost_math: target '${TARGET_NAME}' does not exist")
+  elseif(NOT boost_math_SOURCE_DIR)
+    message(FATAL_ERROR
+      "link_boost_math: boost_math_SOURCE_DIR is unset; call need_boost_math() first")
+  endif()
+  target_include_directories("${TARGET_NAME}" SYSTEM PRIVATE "${boost_math_SOURCE_DIR}/include")
+  target_compile_definitions("${TARGET_NAME}" PRIVATE BOOST_MATH_STANDALONE)
+  message(STATUS "Linked ${TARGET_NAME} to Boost.Math (standalone)")
+endfunction()
+
+# ==============================================================================
+# yaml-cpp - YAML emit/parse for DLIO config generation
+# ==============================================================================
+
+function(need_yaml_cpp)
+  # Already added (yaml-cpp_ADDED is set): nothing left to do.
+  if(yaml-cpp_ADDED)
+    return()
+  endif()
+
+  # Add yaml-cpp via CPM: tests/tools/contrib and shared libs off, install on.
+  cpmaddpackage(
+    NAME yaml-cpp
+    GITHUB_REPOSITORY jbeder/yaml-cpp
+    GIT_TAG yaml-cpp-0.9.0
+    OPTIONS
+      "YAML_CPP_BUILD_TESTS OFF"
+      "YAML_CPP_BUILD_TOOLS OFF"
+      "YAML_CPP_BUILD_CONTRIB OFF"
+      "YAML_BUILD_SHARED_LIBS OFF"
+      "YAML_CPP_INSTALL ON"
+    FORCE YES)
+endfunction()
+
+# link_yaml_cpp(<target>): link <target> PRIVATE against yaml-cpp::yaml-cpp so
+# the static library is bundled into the consumer and the header path stays out
+# of the installed/exported target set; FATAL_ERROR on missing targets.
+function(link_yaml_cpp TARGET_NAME)
+  # Compare to "": if(NOT TARGET_NAME) would reject names like "no" or "off".
+  if("${TARGET_NAME}" STREQUAL "")
+    message(FATAL_ERROR "link_yaml_cpp: TARGET_NAME is required")
+  elseif(NOT TARGET "${TARGET_NAME}")
+    message(FATAL_ERROR "link_yaml_cpp: target '${TARGET_NAME}' does not exist")
+  elseif(NOT TARGET yaml-cpp::yaml-cpp)
+    message(FATAL_ERROR
+      "link_yaml_cpp: yaml-cpp::yaml-cpp target missing; call need_yaml_cpp() first")
+  endif()
+  target_link_libraries("${TARGET_NAME}" PRIVATE yaml-cpp::yaml-cpp)
+  message(STATUS "Linked ${TARGET_NAME} to yaml-cpp")
+endfunction()
+
 # ==============================================================================
 # Testing Dependencies
 # ==============================================================================

diff --git a/docs/source/cli.rst b/docs/source/cli.rst
@@ -13,27 +13,27 @@ Most tools wire in a common set of argument schemas defined in
 semantics across every binary that exposes the relevant schema and are not
 repeated in each tool's section.
 
-**Pipeline (``PipelineArgs``)**
+**Pipeline** (``PipelineArgs``)
 
 - ``--executor-threads <count>`` - Number of worker threads for parallel
   processing (default: number of CPU cores)
 - ``--io-threads <count>`` - Number of I/O threads (default: number of CPU
   cores)
 - ``--time-profiling`` - Print stage timing breakdown to stderr
 
-**Indexing (``IndexingArgs``)**
+**Indexing** (``IndexingArgs``)
 
 - ``--index-dir <path>`` - Directory for ``.dftindex`` stores
 - ``--checkpoint-size <bytes>`` - Checkpoint size for gzip indexing in bytes
   (default: 33554432 B / 32 MB)
 - ``-f, --force`` - Force index recreation
 
-**Query (``QueryArgs``)**
+**Query** (``QueryArgs``)
 
 - ``--query <query>`` - Query DSL filter
   (e.g., ``'cat == "POSIX" and dur > 1000'``)
 
-**Watchdog (``WatchdogArgs``)**
+**Watchdog** (``WatchdogArgs``)
 
 - ``--disable-watchdog`` - Disable watchdog for hang detection
 - ``--watchdog-global-timeout <s>`` - Watchdog global timeout for pipeline
@@ -49,7 +49,7 @@ repeated in each tool's section.
 - ``--watchdog-deadlock-timeout <s>`` - Watchdog deadlock timeout in seconds
   (0 = use default, default: 600)
 
-**Inputs (``DirectoryArgs`` / ``FilesArgs``)**
+**Inputs** (``DirectoryArgs`` / ``FilesArgs``)
 
 - ``-d, --directory <path>`` - Directory containing trace files
 - ``--files <files...>`` - Trace files (``.pfw``, ``.pfw.gz``)
@@ -516,6 +516,91 @@ The Arrow output always includes the base columns ``batch_type``, ``cat``,
     import duckdb
     result = duckdb.sql("SELECT * FROM 'agg.arrows'")
 
+dftracer_gen_dlio_config
+------------------------
+
+**Description:** Generate a DLIO YAML configuration directly from a directory
+of raw DFTracer traces. The tool indexes the inputs, aggregates them into the
+internal ``AGGREGATION`` column family (DDSketch forced on), fits per-component
+distributions, refines ``max_bound`` against an internal barrier simulator, and
+emits a DLIO ``train.computation_time`` + ``reader.preprocess_time`` block. The
+user does not need to run ``dftracer_aggregator`` separately.
+
+Required input event names: ``cat=dataloader`` with ``name=fetch.block`` /
+``fetch.iter``, and ``cat=data`` with ``name=preprocess`` / ``item``. The tool
+exits non-zero with an explanatory message if no DLIO events are present.
+
+**Usage:**
+
+.. code-block:: bash
+
+    dftracer_gen_dlio_config [OPTIONS] -o <config.yaml>
+
+**Options:**
+
+- ``-d, --directory <path>`` - Input directory containing ``.pfw`` or ``.pfw.gz`` traces (default: ``.``)
+- ``-o, --output <path>`` - Output path for the DLIO YAML config [required]
+- ``--max-bound-percentile <pct>`` - Initial ``max_bound`` percentile, 0-100 (default: 95)
+- ``--simulation-iterations <n>`` - Max simulator iterations for percentile refinement (default: 5)
+- ``--target-e2e-error <frac>`` - Target relative E2E error to declare convergence (default: 0.05)
+- ``--target-cdf-similarity <frac>`` - Target fetch_block CDF similarity (default: 0.90)
+- ``--patience <n>`` - Early-stop after this many iterations without improvement (default: 10)
+- ``--epsilon <step>`` - Base step size for percentile adjustment (default: 1.0)
+- ``--momentum <m>`` - Momentum factor in [0, 1) (default: 0.9)
+- ``--min-percentile <pct>`` - Floor on the ``max_bound`` percentile during optimization (default: 50)
+- ``--num-workers <n>`` - DataLoader worker count for the simulator (default: 8)
+- ``--prefetch-factor <n>`` - DataLoader prefetch factor (default: 2)
+- ``--seed <n>`` - Base seed for simulator and sampler (default: 42)
+- ``--max-samples-per-entry <n>`` - Cap on synthesized samples per aggregation entry; 0 disables (default: 100)
+- ``-t, --time-interval <ms>`` - Aggregation time interval in ms (default: 5000)
+- ``--index-dir <path>`` - Directory for the shared index store (default: system temp dir)
+- ``--checkpoint-size <bytes>`` - Checkpoint size for indexing in bytes (default: 33554432 B / 32 MB)
+- ``--executor-threads <count>`` - Number of executor threads for parallel processing
+- ``-f, --force`` - Force index recreation
+
+**Distribution pool:** Each component is fit as the lowest-BIC choice among
+{Normal, Lognormal, Gamma, Exponential, Weibull, Gaussian Mixture (K=2),
+Gaussian Mixture (K=3)}. Mixture candidates are only considered when the
+sample count is at least 20.
+
+**Example:**
+
+.. code-block:: bash
+
+    # Generate config from a directory of raw traces
+    dftracer_gen_dlio_config -d ./traces -o dlio_config.yaml
+
+    # Refine harder against the simulator with a tighter convergence target
+    dftracer_gen_dlio_config -d ./traces -o dlio_config.yaml \
+        --simulation-iterations 20 --target-e2e-error 0.02 --patience 5
+
+    # Reuse a shared index directory across runs to skip re-indexing
+    dftracer_gen_dlio_config -d ./traces -o dlio_config.yaml \
+        --index-dir /var/cache/dftracer/idx
+
+**Output schema:**
+
+.. code-block:: yaml
+
+    train:
+      computation_time:
+        type: <normal|lognormal|gamma|exponential|weibull|mixture>
+        # single distribution: per-family params (mean/stdev, mu/sigma,
+        # shape/scale, rate)
+        # mixture: n_components + components: [{weight, params: {type, ...}}]
+        max_bound: <seconds>
+    reader:
+      preprocess_time:
+        # same structure
+
+**Comparing against an external generator:** ``scripts/compare_dlio_yamls.py``
+diffs two DLIO YAMLs with a tolerance check on parameters and a two-sample
+Kolmogorov-Smirnov check on samples drawn from each fit. Run via ``uv run
+scripts/compare_dlio_yamls.py --python <a.yaml> --cpp <b.yaml>`` (the inline
+PEP 723 metadata installs ``pyyaml`` and ``numpy`` automatically). When both
+YAMLs select the same model family and the KS statistic is small, they produce
+statistically indistinguishable DLIO sample streams.
+
 dftracer_organize
 -----------------
 

diff --git a/docs/source/pipeline.rst b/docs/source/pipeline.rst
@@ -818,7 +818,7 @@ Control execution duration and cooperative cancellation using ``PipelineConfig``
         co_return;
     });
 
-**Timing out a race with ``when_any`` + timeout:** Use ``when_any`` with a timeout awaitable to race operations:
+**Timing out a race with** ``when_any`` **+ timeout:** Use ``when_any`` with a timeout awaitable to race operations:
 
 .. code-block:: cpp
 

diff --git a/docs/source/utilities.rst b/docs/source/utilities.rst
@@ -17,6 +17,7 @@ dftracer-utils provides a collection of composable utilities for trace file proc
    utilities/indexer
    utilities/reader
    utilities/common
+   utilities/dlio
    call-tree
 
 Overview
@@ -43,8 +44,9 @@ Utilities follow a consistent pattern:
            Hash["Hash<br/>FNV1a, Std, MurmurHash3"]
            Indexer["Indexer<br/>Checkpoint, BloomFilter"]
            Reader["Reader<br/>Stream, LineProcessor"]
-           Common["Common<br/>JSON, DDSketch, Log2Histogram"]
+           Common["Common<br/>JSON, DDSketch, Statistic, Distributions, Mixture"]
            Composites["Composites<br/>DFTracer-specific pipelines"]
+           Dlio["DLIO<br/>BarrierSimulator, TraceLoader, Optimizer, YAML emit"]
        end
 
        Utility --> FileIO
@@ -55,6 +57,7 @@ Utilities follow a consistent pattern:
        Utility --> Reader
        Utility --> Common
        Utility --> Composites
+       Utility --> Dlio
 
 File I/O
 --------
@@ -71,13 +74,36 @@ See :doc:`/utilities/fileio` for detailed usage.
 Statistics
 ----------
 
-Enhanced statistics collection for trace analysis:
+Enhanced statistics collection and distribution fitting for trace analysis:
 
 - **DDSketch**: Deterministic, merge-order-independent percentile estimation with bounded relative error
 - **Log2Histogram**: Fixed 65-bin logarithmic histogram for duration and size distributions
+- **Statistic**: Min/max/mean/count accumulator that optionally delegates to an attached DDSketch for quantile queries
+- **Distributions**: MLE fitting + KS / BIC scoring for Normal, Lognormal, Gamma, Exponential, Weibull; sampler factory backed by ``<random>`` and `Boost.Math standalone <https://www.boost.org/doc/libs/release/libs/math/doc/html/math_toolkit/standalone.html>`_
+- **Mixture**: Univariate Gaussian Mixture EM (K=2, K=3) with log-sum-exp responsibilities and BIC-based selection across single + mixture models
 - **Chunk statistics**: Per-chunk event tracking with online variance calculation and per-name duration sketches
 
-These are used in indexing and aggregation pipelines to compute event distributions and percentiles efficiently.
+These are used in indexing and aggregation pipelines to compute event distributions and percentiles efficiently, and by the DLIO config generator to fit per-component timing distributions.
+
+DLIO Config Generation
+----------------------
+
+End-to-end pipeline that converts a directory of raw DFTracer logs into a DLIO
+training-loop YAML configuration:
+
+- **trace_loader**: pulls the ``AGGREGATION`` column family (re-attaches the
+  merge operator at open time) and synthesizes per-rank sample arrays from
+  per-(pid, time_bucket) entries.
+- **BarrierSimulator**: simulates one DLIO training run across the captured
+  ranks/steps, scoring an end-to-end duration, rank variance, and ``fetch.block``
+  CDF similarity against the empirical trace.
+- **optimizer**: sequential momentum loop refining the ``max_bound`` percentile
+  on the fitted sampler to minimize simulator E2E error.
+- **yaml_emit**: renders single distributions or Gaussian mixtures into the
+  DLIO ``train.computation_time`` / ``reader.preprocess_time`` schema.
+
+See :doc:`/utilities/dlio` for the API and ``dftracer_gen_dlio_config`` in
+:doc:`/cli` for the user-facing binary.
 
 Indexing
 --------