From c643d05c647e3208f1b26c370e91cf66978d87ec Mon Sep 17 00:00:00 2001 From: Ray Andrew Date: Mon, 18 May 2026 17:23:41 -0500 Subject: [PATCH 1/2] feat(dlio): add DLIO config generation --- cmake/modules/Dependencies.cmake | 89 +++ .../common/statistics/distributions.h | 71 +++ .../utilities/common/statistics/mixture.h | 76 +++ .../utilities/common/statistics/statistic.h | 55 ++ .../utilities/common/statistics/statistics.h | 2 + .../dft/aggregators/aggregation_runner.h | 72 +++ .../utils/utilities/dlio/barrier_simulator.h | 115 ++++ .../dftracer/utils/utilities/dlio/optimizer.h | 56 ++ .../dftracer/utils/utilities/dlio/statistic.h | 35 ++ .../utils/utilities/dlio/trace_loader.h | 102 +++ .../utils/utilities/dlio/worker_queue.h | 51 ++ .../dftracer/utils/utilities/dlio/yaml_emit.h | 42 ++ src/CMakeLists.txt | 19 +- .../utils/binaries/dftracer_aggregator.cpp | 587 +++--------------- .../binaries/dftracer_gen_dlio_config.cpp | 354 +++++++++++ .../common/statistics/distributions.cpp | 448 +++++++++++++ .../utilities/common/statistics/mixture.cpp | 259 ++++++++ .../dft/aggregators/aggregation_runner.cpp | 444 +++++++++++++ .../utilities/dlio/barrier_simulator.cpp | 450 ++++++++++++++ .../utils/utilities/dlio/optimizer.cpp | 91 +++ .../utils/utilities/dlio/trace_loader.cpp | 380 ++++++++++++ .../utils/utilities/dlio/yaml_emit.cpp | 112 ++++ tests/CMakeLists.txt | 4 + .../test_dftracer_gen_dlio_config.cpp | 222 +++++++ tests/utilities/CMakeLists.txt | 7 + .../common/statistics/test_distributions.cpp | 174 ++++++ .../common/statistics/test_mixture.cpp | 176 ++++++ .../utilities/dlio/test_barrier_simulator.cpp | 241 +++++++ 28 files changed, 4239 insertions(+), 495 deletions(-) create mode 100644 include/dftracer/utils/utilities/common/statistics/distributions.h create mode 100644 include/dftracer/utils/utilities/common/statistics/mixture.h create mode 100644 include/dftracer/utils/utilities/common/statistics/statistic.h create mode 100644 
include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.h create mode 100644 include/dftracer/utils/utilities/dlio/barrier_simulator.h create mode 100644 include/dftracer/utils/utilities/dlio/optimizer.h create mode 100644 include/dftracer/utils/utilities/dlio/statistic.h create mode 100644 include/dftracer/utils/utilities/dlio/trace_loader.h create mode 100644 include/dftracer/utils/utilities/dlio/worker_queue.h create mode 100644 include/dftracer/utils/utilities/dlio/yaml_emit.h create mode 100644 src/dftracer/utils/binaries/dftracer_gen_dlio_config.cpp create mode 100644 src/dftracer/utils/utilities/common/statistics/distributions.cpp create mode 100644 src/dftracer/utils/utilities/common/statistics/mixture.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.cpp create mode 100644 src/dftracer/utils/utilities/dlio/barrier_simulator.cpp create mode 100644 src/dftracer/utils/utilities/dlio/optimizer.cpp create mode 100644 src/dftracer/utils/utilities/dlio/trace_loader.cpp create mode 100644 src/dftracer/utils/utilities/dlio/yaml_emit.cpp create mode 100644 tests/binaries/test_dftracer_gen_dlio_config.cpp create mode 100644 tests/utilities/common/statistics/test_distributions.cpp create mode 100644 tests/utilities/common/statistics/test_mixture.cpp create mode 100644 tests/utilities/dlio/test_barrier_simulator.cpp diff --git a/cmake/modules/Dependencies.cmake b/cmake/modules/Dependencies.cmake index 454b2ae4..6862d675 100644 --- a/cmake/modules/Dependencies.cmake +++ b/cmake/modules/Dependencies.cmake @@ -1766,6 +1766,95 @@ function(link_nanoarrow TARGET_NAME LIBRARY_TYPE) endfunction() +# ============================================================================== +# Boost.Math (standalone, header-only); for statistical distributions +# ============================================================================== + +function(need_boost_math) + if(NOT boost_math_ADDED) + cpmaddpackage( + NAME + 
boost_math + GITHUB_REPOSITORY + boostorg/math + GIT_TAG + boost-1.91.0 + DOWNLOAD_ONLY + YES) + endif() + + # CPMAddPackage only sets boost_math_SOURCE_DIR in the calling scope. Cache + # it so link_boost_math() can find the include dir from anywhere in the tree. + if(boost_math_SOURCE_DIR) + set(boost_math_SOURCE_DIR + "${boost_math_SOURCE_DIR}" + CACHE INTERNAL "Boost.Math source tree from CPM") + message(STATUS "Added Boost.Math (standalone) headers from ${boost_math_SOURCE_DIR}/include") + endif() +endfunction() + +# Apply Boost.Math standalone headers + BOOST_MATH_STANDALONE define as PRIVATE +# build-only properties. We deliberately avoid an INTERFACE link target so the +# headers/defines never enter the installed/exported target set. +function(link_boost_math TARGET_NAME) + if(NOT TARGET_NAME) + message(FATAL_ERROR "link_boost_math: TARGET_NAME is required") + endif() + if(NOT TARGET ${TARGET_NAME}) + message(FATAL_ERROR "link_boost_math: target '${TARGET_NAME}' does not exist") + endif() + if(NOT boost_math_SOURCE_DIR) + message(FATAL_ERROR + "link_boost_math: boost_math_SOURCE_DIR is unset; call need_boost_math() first") + endif() + + target_include_directories(${TARGET_NAME} SYSTEM PRIVATE + ${boost_math_SOURCE_DIR}/include) + target_compile_definitions(${TARGET_NAME} PRIVATE BOOST_MATH_STANDALONE) + message(STATUS "Linked ${TARGET_NAME} to Boost.Math (standalone)") +endfunction() + +# ============================================================================== +# yaml-cpp - YAML emit/parse for DLIO config generation +# ============================================================================== + +function(need_yaml_cpp) + if(NOT yaml-cpp_ADDED) + cpmaddpackage( + NAME + yaml-cpp + GITHUB_REPOSITORY + jbeder/yaml-cpp + GIT_TAG + yaml-cpp-0.9.0 + OPTIONS + "YAML_CPP_BUILD_TESTS OFF" + "YAML_CPP_BUILD_TOOLS OFF" + "YAML_CPP_BUILD_CONTRIB OFF" + "YAML_BUILD_SHARED_LIBS OFF" + "YAML_CPP_INSTALL ON" + FORCE + YES) + endif() +endfunction() + +# Link 
yaml-cpp PRIVATE so the static library is bundled into the consumer and +# the header path stays out of the installed/exported target set. +function(link_yaml_cpp TARGET_NAME) + if(NOT TARGET_NAME) + message(FATAL_ERROR "link_yaml_cpp: TARGET_NAME is required") + endif() + if(NOT TARGET ${TARGET_NAME}) + message(FATAL_ERROR "link_yaml_cpp: target '${TARGET_NAME}' does not exist") + endif() + if(NOT TARGET yaml-cpp::yaml-cpp) + message(FATAL_ERROR + "link_yaml_cpp: yaml-cpp::yaml-cpp target missing; call need_yaml_cpp() first") + endif() + target_link_libraries(${TARGET_NAME} PRIVATE yaml-cpp::yaml-cpp) + message(STATUS "Linked ${TARGET_NAME} to yaml-cpp") +endfunction() + # ============================================================================== # Testing Dependencies # ============================================================================== diff --git a/include/dftracer/utils/utilities/common/statistics/distributions.h b/include/dftracer/utils/utilities/common/statistics/distributions.h new file mode 100644 index 00000000..03d042ef --- /dev/null +++ b/include/dftracer/utils/utilities/common/statistics/distributions.h @@ -0,0 +1,71 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_DISTRIBUTIONS_H +#define DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_DISTRIBUTIONS_H + +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::common::statistics { + +enum class DistributionKind : std::uint8_t { + Normal, // params = {mean, stddev, _} + Lognormal, // params = {mu, sigma, _} (mu, sigma in log space) + Gamma, // params = {shape, scale, _} + Exponential, // params = {rate, _, _} (rate = 1/scale) + Weibull, // params = {shape, scale, _} +}; + +std::string_view distribution_name(DistributionKind k); + +// Result of fitting a single distribution to a sample array. +// `params` semantics depend on `kind`. 
+struct FittedDistribution { + DistributionKind kind; + std::array params{}; + double ks_stat = 1.0; // Kolmogorov-Smirnov statistic (lower = better) + double log_likelihood = 0.0; // sum of log pdf(x_i) + double bic = 0.0; // k*ln(n) - 2*log_likelihood + bool valid = false; // true when MLE succeeded +}; + +// MLE fit for a single distribution. Returns valid=false when fitting fails +// (e.g. non-positive data for lognormal, sample size < 2, Newton +// non-convergence). +FittedDistribution fit_single_distribution(DistributionKind kind, + const std::vector& data); + +// Fits all five distributions and returns them ordered by ascending ks_stat. +// Invalid fits are kept at the back of the result. +std::vector fit_all_single_distributions( + const std::vector& data); + +// Picks the lowest-KS valid fit. Returns nullopt if none of the fits succeeded. +std::optional best_fit_by_ks( + const std::vector& fits); + +// Distribution PDF / CDF / inverse-CDF. Behavior is undefined when +// `fit.valid == false`. +double pdf(const FittedDistribution& fit, double x); +double cdf(const FittedDistribution& fit, double x); +double quantile(const FittedDistribution& fit, double p); + +// Sampler signature. Matches dlio::Sampler so dlio::BarrierSimulator can +// consume it directly without an explicit cast. +using Sampler = std::function; + +// Builds a Sampler from a fitted distribution. +// Optional min/max bounds clamp the output (applied after sampling). +Sampler make_sampler(const FittedDistribution& fit, + std::optional min_bound = std::nullopt, + std::optional max_bound = std::nullopt); + +// Returns the parameter count used for BIC. Useful when extending to mixtures. 
+int free_parameter_count(DistributionKind kind); + +} // namespace dftracer::utils::utilities::common::statistics + +#endif diff --git a/include/dftracer/utils/utilities/common/statistics/mixture.h b/include/dftracer/utils/utilities/common/statistics/mixture.h new file mode 100644 index 00000000..7856fd9c --- /dev/null +++ b/include/dftracer/utils/utilities/common/statistics/mixture.h @@ -0,0 +1,76 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_MIXTURE_H +#define DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_MIXTURE_H + +#include + +#include +#include +#include +#include + +namespace dftracer::utils::utilities::common::statistics { + +struct GmmComponent { + double mean = 0.0; + double stddev = 0.0; +}; + +// Univariate Gaussian Mixture Model fit. Component count is `weights.size()`. +struct FittedMixture { + std::vector weights; // sum to 1 + std::vector components; // same length as weights + double log_likelihood = 0.0; + double bic = 0.0; + int iterations = 0; + bool converged = false; + bool valid = false; +}; + +struct MixtureFitOptions { + int max_iter = 200; + double tol = 1e-6; + double variance_floor = 1e-12; // prevent component collapse + std::uint64_t seed = 0xC0FFEE; +}; + +// Fits a K-component Gaussian Mixture via EM. K-means-style initialization on +// quantile-spread means and total-variance / K for each component. +FittedMixture fit_gaussian_mixture(const std::vector& data, int K, + const MixtureFitOptions& options = {}); + +double pdf(const FittedMixture& mix, double x); +double cdf(const FittedMixture& mix, double x); + +// Free-parameter count for BIC: 3K - 1 (K means + K stddevs + K-1 free +// weights). +int free_parameter_count(const FittedMixture& mix); + +// Sampler from a fitted mixture. Draws a component by weight then a Normal. 
+Sampler make_sampler(const FittedMixture& mix, + std::optional min_bound = std::nullopt, + std::optional max_bound = std::nullopt); + +using BestModel = std::variant; + +struct ModelSelection { + BestModel model; + double bic = 0.0; + int free_params = 0; +}; + +// Selects the lowest-BIC model among the candidates. `single_fits` is typically +// the output of fit_all_single_distributions(); `mixtures` is typically two +// entries (GMM-2 and GMM-3). Invalid fits are ignored. +std::optional select_best_model( + const std::vector& single_fits, + const std::vector& mixtures); + +double pdf(const BestModel& m, double x); +double cdf(const BestModel& m, double x); +Sampler make_sampler(const BestModel& m, + std::optional min_bound = std::nullopt, + std::optional max_bound = std::nullopt); + +} // namespace dftracer::utils::utilities::common::statistics + +#endif diff --git a/include/dftracer/utils/utilities/common/statistics/statistic.h b/include/dftracer/utils/utilities/common/statistics/statistic.h new file mode 100644 index 00000000..315c449f --- /dev/null +++ b/include/dftracer/utils/utilities/common/statistics/statistic.h @@ -0,0 +1,55 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_STATISTIC_H +#define DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_STATISTIC_H + +#include + +#include +#include +#include +#include + +namespace dftracer::utils::utilities::common::statistics { + +// Lightweight min/max/mean/count accumulator with an optional DDSketch backing +// for quantile queries. When no sketch is attached, quantile() falls back to a +// uniform interpolation between observed min and max. 
+class Statistic { + public: + Statistic() = default; + + void attach_sketch(std::shared_ptr sketch) { + sketch_ = std::move(sketch); + } + + void update(double value) { + if (value < min_val_) min_val_ = value; + if (value > max_val_) max_val_ = value; + sum_ += value; + ++count_; + mean_ = sum_ / static_cast(count_); + } + + double quantile(double q) const { + if (sketch_ && !sketch_->empty()) return sketch_->quantile(q); + if (count_ == 0 || min_val_ == std::numeric_limits::infinity()) + return 0.0; + return min_val_ + q * (max_val_ - min_val_); + } + + double min() const { return count_ == 0 ? 0.0 : min_val_; } + double max() const { return count_ == 0 ? 0.0 : max_val_; } + double mean() const { return mean_; } + std::uint64_t count() const { return count_; } + + private: + double min_val_ = std::numeric_limits::infinity(); + double max_val_ = -std::numeric_limits::infinity(); + double sum_ = 0.0; + double mean_ = 0.0; + std::uint64_t count_ = 0; + std::shared_ptr sketch_; +}; + +} // namespace dftracer::utils::utilities::common::statistics + +#endif diff --git a/include/dftracer/utils/utilities/common/statistics/statistics.h b/include/dftracer/utils/utilities/common/statistics/statistics.h index d56497d8..882dd0bb 100644 --- a/include/dftracer/utils/utilities/common/statistics/statistics.h +++ b/include/dftracer/utils/utilities/common/statistics/statistics.h @@ -9,5 +9,7 @@ */ #include +#include +#include #endif // DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_STATISTICS_H diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.h new file mode 100644 index 00000000..73746434 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.h @@ -0,0 +1,72 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_RUNNER_H +#define 
DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_RUNNER_H + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +// Input bundle for the aggregation pipeline. Callers (binaries) translate their +// CLI flags into this struct, then call run_aggregation(). +struct AggregationRunInput { + // Raw trace input. + std::string log_dir; // directory containing .pfw[.gz] + std::string index_dir; // where the shared RocksDB lives + AggregationConfig agg_config; // time_interval_us, sketch, etc. + ::dftracer::utils::PipelineConfig + pipeline_config; // executor threads, etc. + + // Optional output writer. When `output_file` is std::nullopt the run only + // populates the AGGREGATION column family in the RocksDB and skips Perfetto + // / Arrow emission; downstream consumers (e.g. dftracer_gen_dlio_config) + // open the CF directly. When set, the output is written in `output_format`. + std::optional output_file; + std::string output_format = AggregationConfig::FORMAT_JSON; + PerfettoEventFormat event_format = PerfettoEventFormat::COUNTER; + bool compress_output = false; + int compression_level = 1; + + // Indexing controls. + bool force_rebuild = false; + std::size_t checkpoint_size = 0; + + // Optional staged Timer for profiling. Caller owns lifetime. + ::dftracer::utils::Timer* stages = nullptr; + bool verbose = true; // controls console banner output +}; + +struct AggregationRunResult { + bool success = false; + + // Path to the shared RocksDB index that now contains the AGGREGATION CF. + // Downstream tools (dftracer_gen_dlio_config) open this read-only. + std::string index_path; + + std::size_t total_keys = 0; + std::size_t input_file_count = 0; + std::size_t processed_file_count = 0; + std::size_t cached_file_count = 0; + + double elapsed_ms = 0.0; +}; + +// Runs the full index + aggregate pipeline: +// 1. 
Scan log_dir for input files; consult the existing index. +// 2. Re-index any file that needs it (or all, if force_rebuild). +// 3. Run the aggregation visitor pipeline across all files needing it. +// 4. Optionally write the aggregated events to a Perfetto JSON / Arrow IPC +// file (when input.output_file is set). +// 5. Write per-file tracking entries and global config to the AGGREGATION CF. +coro::CoroTask run_aggregation(AggregationRunInput input); + +} // namespace dftracer::utils::utilities::composites::dft::aggregators + +#endif diff --git a/include/dftracer/utils/utilities/dlio/barrier_simulator.h b/include/dftracer/utils/utilities/dlio/barrier_simulator.h new file mode 100644 index 00000000..c889bdd7 --- /dev/null +++ b/include/dftracer/utils/utilities/dlio/barrier_simulator.h @@ -0,0 +1,115 @@ +#ifndef DFTRACER_UTILS_UTILITIES_DLIO_BARRIER_SIMULATOR_H +#define DFTRACER_UTILS_UTILITIES_DLIO_BARRIER_SIMULATOR_H + +#include + +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::dlio { + +using Rng = std::mt19937_64; +using Sampler = std::function; + +struct BarrierSimulatorContext { + int num_ranks = 0; + int num_steps = 0; + double trace_e2e_duration = 0.0; + + std::vector> fetch_block_trace; + std::vector> fetch_iter_trace; + + Statistic fetch_block_stats; + Statistic fetch_iter_stats; + Statistic preprocess_stats; + Statistic getitem_stats; + + ComponentTimeMetrics trace_preprocess_metrics; + ComponentTimeMetrics trace_fetch_iter_metrics; + ComponentTimeMetrics trace_fetch_block_metrics; + double trace_rank_variance = 0.0; + std::vector trace_per_rank_throughput; + + std::optional>> getitem_trace; + std::optional io_stats; + std::optional> io_samples; + + bool sync_mode = false; + int accumulate_grad_batches = 1; + bool enable_preprocess_simulation = false; + int num_workers = 8; + int prefetch_factor = 2; + double preprocess_slowdown_factor = 1.0; + double base_fetch_iter_overhead = 0.0; + bool is_aggregated_trace = 
false; + double avg_calls_per_epoch = 1.0; +}; + +struct BarrierSimulationResult { + double e2e_duration = 0.0; + double e2e_error = 0.0; + + double avg_barrier_overhead = 0.0; + double max_barrier_overhead = 0.0; + + std::vector per_rank_completion_time; + double rank_variance = 0.0; + double trace_rank_variance = 0.0; + double rank_variance_error = 0.0; + double load_imbalance = 0.0; + + std::vector simulated_fetch_block; + double fetch_block_cdf_similarity = 0.0; + + std::vector simulated_preprocess; + std::vector simulated_getitem; + std::vector simulated_fetch_iter; + double fetch_iter_cdf_similarity = 0.0; + double getitem_cdf_similarity = 0.0; + double avg_queue_depth = 0.0; + double avg_queue_stalls = 0.0; + + ComponentTimeMetrics preprocess_metrics; + ComponentTimeMetrics fetch_iter_metrics; + ComponentTimeMetrics fetch_block_metrics; + + std::optional trace_preprocess_metrics; + std::optional trace_fetch_iter_metrics; + std::optional trace_fetch_block_metrics; + + std::vector simulated_per_rank_throughput; + std::vector trace_per_rank_throughput; + + double throughput_mean = 0.0; + double trace_throughput_mean = 0.0; + double throughput_mean_error = 0.0; + + double throughput_variance = 0.0; + double trace_throughput_variance = 0.0; + + double throughput_cdf_similarity = 0.0; +}; + +class BarrierSimulator { + public: + // preprocess_sampler may be empty; pass {} to use trace-derived + // getitem/preprocess stats. + BarrierSimulationResult simulate( + const BarrierSimulatorContext& context, std::uint64_t base_seed, + const Sampler& fetch_block_sampler, + const Sampler& preprocess_sampler = {}) const; +}; + +// 1 - Kolmogorov-Smirnov statistic between the two empirical distributions. +// Returns 1.0 for perfect match, 0.0 for fully disjoint. 
+double cdf_similarity(const std::vector& a, + const std::vector& b); + +double variance(const std::vector& values); + +} // namespace dftracer::utils::utilities::dlio + +#endif diff --git a/include/dftracer/utils/utilities/dlio/optimizer.h b/include/dftracer/utils/utilities/dlio/optimizer.h new file mode 100644 index 00000000..90163053 --- /dev/null +++ b/include/dftracer/utils/utilities/dlio/optimizer.h @@ -0,0 +1,56 @@ +#ifndef DFTRACER_UTILS_UTILITIES_DLIO_OPTIMIZER_H +#define DFTRACER_UTILS_UTILITIES_DLIO_OPTIMIZER_H + +#include +#include + +#include +#include + +namespace dftracer::utils::utilities::dlio { + +using BestModel = ::dftracer::utils::utilities::common::statistics::BestModel; + +struct OptimizerOptions { + int max_iterations = 5; + double target_e2e_error = 0.05; + double target_cdf_similarity = 0.90; + int patience = 10; + double epsilon = 1.0; + double momentum = 0.9; + double min_percentile = 50.0; + double initial_percentile = 95.0; + std::uint64_t base_seed = 42; +}; + +struct OptimizerResult { + BarrierSimulationResult best; + double best_percentile = 0.0; + int iterations_used = 0; + bool converged = false; +}; + +// Sequential momentum-based optimizer. +// Searches for the max_bound percentile that minimizes simulator e2e_error +// while preserving fetch_block_cdf_similarity. `sample_times` is the sorted +// flat per-call sample array used for percentile lookups (sorted in-place if +// not). +// +// Each iteration: +// 1. comp_max_bound = percentile(sample_times, current_percentile) +// 2. comp_sampler = make_sampler(model, min=sample_times.front(), +// max=comp_max_bound) +// 3. result = simulator.simulate(context, base_seed, comp_sampler) +// 4. Adjust current_percentile via momentum-smoothed step proportional to +// error. 
+OptimizerResult optimize_max_bound_percentile( + const BarrierSimulatorContext& context, const BestModel& model, + std::vector sample_times, const OptimizerOptions& options = {}); + +// Helper: percentile by sorted index (linear interpolation between adjacent +// samples). Returns 0 if data is empty. +double percentile(const std::vector& sorted_data, double pct); + +} // namespace dftracer::utils::utilities::dlio + +#endif diff --git a/include/dftracer/utils/utilities/dlio/statistic.h b/include/dftracer/utils/utilities/dlio/statistic.h new file mode 100644 index 00000000..260c9805 --- /dev/null +++ b/include/dftracer/utils/utilities/dlio/statistic.h @@ -0,0 +1,35 @@ +#ifndef DFTRACER_UTILS_UTILITIES_DLIO_STATISTIC_H +#define DFTRACER_UTILS_UTILITIES_DLIO_STATISTIC_H + +#include + +#include +#include + +namespace dftracer::utils::utilities::dlio { + +using Statistic = dftracer::utils::utilities::common::statistics::Statistic; + +struct ComponentTimeMetrics { + double union_time = 0.0; + double accumulated_time = 0.0; + std::uint64_t num_samples = 0; + Statistic stats; + + double concurrency() const { + return union_time > 0.0 ? accumulated_time / union_time : 0.0; + } +}; + +struct Boundary { + std::int64_t time; + int delta; // +1 start, -1 end +}; + +// Sweep-line union of [start, end] intervals encoded as boundaries. +// Times are in microseconds; return value is seconds. 
+double sweep_union(std::vector& boundaries); + +} // namespace dftracer::utils::utilities::dlio + +#endif diff --git a/include/dftracer/utils/utilities/dlio/trace_loader.h b/include/dftracer/utils/utilities/dlio/trace_loader.h new file mode 100644 index 00000000..c6fb23dc --- /dev/null +++ b/include/dftracer/utils/utilities/dlio/trace_loader.h @@ -0,0 +1,102 @@ +#ifndef DFTRACER_UTILS_UTILITIES_DLIO_TRACE_LOADER_H +#define DFTRACER_UTILS_UTILITIES_DLIO_TRACE_LOADER_H + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::dlio { + +using DDSketch = ::dftracer::utils::utilities::common::statistics::DDSketch; + +// DLIO category / event names matched in the AGGREGATION CF. +inline constexpr std::string_view CATEGORY_DATALOADER = "dataloader"; +inline constexpr std::string_view CATEGORY_DATA = "data"; +inline constexpr std::string_view EVENT_FETCH_BLOCK = "fetch.block"; +inline constexpr std::string_view EVENT_FETCH_ITER = "fetch.iter"; +inline constexpr std::string_view EVENT_PREPROCESS = "preprocess"; +inline constexpr std::string_view EVENT_ITEM = "item"; + +struct AggregatedTraces { + // Per-rank concatenated sample sequences (seconds), in pid-ascending, + // then time-bucket-ascending order. + std::vector> fetch_block_trace; + std::vector> fetch_iter_trace; + std::vector> getitem_trace; + + // Flat sample arrays for distribution fitting (seconds). + std::vector computation_times; // fetch.block + std::vector preprocess_times; // preprocess + + // Sketches merged across all (rank, bucket) entries for each component. + // Nullable: only populated if the aggregator was run with + // --compute-percentiles. + std::shared_ptr fetch_block_sketch; + std::shared_ptr fetch_iter_sketch; + std::shared_ptr preprocess_sketch; + std::shared_ptr getitem_sketch; + + // Statistics with min/max/mean/count populated, sketch attached when + // present. 
+ Statistic fetch_block_stats; + Statistic fetch_iter_stats; + Statistic preprocess_stats; + Statistic getitem_stats; + + // Aggregate metrics (seconds). + double trace_e2e_duration = 0.0; + double trace_rank_variance = 0.0; + std::vector trace_per_rank_throughput; + + // Per-component accumulated / union times. Both computed from actual + // (ts, te) boundaries in the CF rather than a fixed-ratio heuristic. + ComponentTimeMetrics trace_preprocess_metrics; + ComponentTimeMetrics trace_fetch_iter_metrics; + ComponentTimeMetrics trace_fetch_block_metrics; + + // Discovered PIDs (sorted) - index in this list defines rank ID. + std::vector rank_pids; + + int num_ranks = 0; + int num_steps = 0; // min length across ranks of fetch_block_trace + + // True if any AGGREGATION entry was found. + bool any_data = false; + + // True if at least one DDSketch was present in the CF. Drives whether + // sample synthesis uses inverse-CDF or mean replication. + bool sketches_available = false; + + // Time bucket width in microseconds (from AggGlobalConfig). + std::uint64_t time_interval_us = 0; +}; + +struct TraceLoaderOptions { + // Hard cap on samples synthesized per (cat, name, pid, bucket) entry, so a + // single high-count bucket cannot blow up memory. 0 disables the cap. + std::uint64_t max_samples_per_entry = 100; + // Seed for inverse-CDF sketch sampling. + std::uint64_t seed = 0xD15710; +}; + +// Loads aggregated DLIO trace data from a dftracer RocksDB. Opens the database +// read-only, iterates the AGGREGATION column family, and materializes per-rank +// trace arrays, distribution sample arrays, and trace-side +// ComponentTimeMetrics. +AggregatedTraces load_aggregated_traces(const std::string& db_path, + const TraceLoaderOptions& options = {}); + +// Convenience: build a BarrierSimulatorContext from loaded traces. 
+BarrierSimulatorContext make_simulator_context(const AggregatedTraces& traces, + int num_workers = 8, + int prefetch_factor = 2); + +} // namespace dftracer::utils::utilities::dlio + +#endif diff --git a/include/dftracer/utils/utilities/dlio/worker_queue.h b/include/dftracer/utils/utilities/dlio/worker_queue.h new file mode 100644 index 00000000..6210ce99 --- /dev/null +++ b/include/dftracer/utils/utilities/dlio/worker_queue.h @@ -0,0 +1,51 @@ +#ifndef DFTRACER_UTILS_UTILITIES_DLIO_WORKER_QUEUE_H +#define DFTRACER_UTILS_UTILITIES_DLIO_WORKER_QUEUE_H + +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::dlio { + +struct WorkInterval { + double start_time; + double end_time; + double preprocess_duration; +}; + +// Sampler returns {batch_time, preprocess_time} where batch_time is the wall +// clock the worker spends on the batch (preprocess + I/O, scaled). +using BatchTimeSampler = std::function()>; + +class WorkerQueue { + public: + WorkerQueue(int num_workers, int prefetch_factor) + : num_workers_(num_workers), + prefetch_factor_(prefetch_factor), + queue_capacity_(static_cast(num_workers) * + static_cast(prefetch_factor)) {} + + std::vector produce_batches(double current_time, + const BatchTimeSampler& sampler); + + // Returns time consumed (stall + base_overhead). 
+ double consume_batch(double current_time, double base_overhead); + + std::size_t queue_depth() const { return ready_batches_.size(); } + bool had_stall() const { return stall_count_ > 0; } + std::uint64_t stall_count() const { return stall_count_; } + + private: + int num_workers_; + int prefetch_factor_; + std::size_t queue_capacity_; + std::uint64_t stall_count_ = 0; + std::vector ready_batches_; // sorted ready times + std::vector worker_free_times_; +}; + +} // namespace dftracer::utils::utilities::dlio + +#endif diff --git a/include/dftracer/utils/utilities/dlio/yaml_emit.h b/include/dftracer/utils/utilities/dlio/yaml_emit.h new file mode 100644 index 00000000..a0ad3634 --- /dev/null +++ b/include/dftracer/utils/utilities/dlio/yaml_emit.h @@ -0,0 +1,42 @@ +#ifndef DFTRACER_UTILS_UTILITIES_DLIO_YAML_EMIT_H +#define DFTRACER_UTILS_UTILITIES_DLIO_YAML_EMIT_H + +#include + +#include +#include + +namespace dftracer::utils::utilities::dlio { + +using BestModel = ::dftracer::utils::utilities::common::statistics::BestModel; + +// Parameters needed to emit one timing block in the DLIO config. +struct DlioTimingBlock { + BestModel model; + double max_bound = 0.0; // seconds +}; + +// Renders the DLIO config YAML: +// +// train: +// computation_time: +// type: +// ... +// max_bound: +// reader: +// preprocess_time: +// type: +// ... +// max_bound: +// +// Either or both blocks can be omitted by passing `nullptr`. +std::string render_dlio_yaml(const DlioTimingBlock* computation, + const DlioTimingBlock* preprocess); + +// Writes the YAML to `out` (e.g. an ofstream). Returns true on success. 
+bool write_dlio_yaml(std::ostream& out, const DlioTimingBlock* computation, + const DlioTimingBlock* preprocess); + +} // namespace dftracer::utils::utilities::dlio + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 503bb86e..46829ecd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -20,6 +20,8 @@ need_readerwriterqueue() need_concurrentqueue() need_tl_expected() need_unordered_dense() +need_boost_math() +need_yaml_cpp() if(DFTRACER_UTILS_ENABLE_ARROW) need_nanoarrow() @@ -117,7 +119,14 @@ set(DFTRACER_UTILS_UTILITIES_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/ddsketch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/log2_histogram.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/timestamp_histogram.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/distributions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/mixture.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/json/json_value.cpp + # DLIO config generation + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/dlio/barrier_simulator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/dlio/trace_loader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/dlio/optimizer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/dlio/yaml_emit.cpp ) list(APPEND DFTRACER_UTILS_UTILITIES_SOURCES @@ -130,6 +139,7 @@ list(APPEND DFTRACER_UTILS_UTILITIES_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/query/query.cpp # DFT Aggregators ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.cpp @@ -469,6 +479,12 @@ foreach(variant shared static) link_nanoarrow(dftracer_utils_utilities_${variant} ${VARIANT_UPPER}) endif() + # Boost.Math (standalone, header-only) for DLIO distribution fitting. + link_boost_math(dftracer_utils_utilities_${variant}) + + # yaml-cpp for DLIO config emit. + link_yaml_cpp(dftracer_utils_utilities_${variant}) + # Link zstd when ENABLE_ZSTD is on so headers propagate to consumers # (e.g. arrow ipc_writer.cpp guarded by DFTRACER_UTILS_ENABLE_ZSTD). if(DFTRACER_UTILS_ENABLE_ZSTD) @@ -830,7 +846,8 @@ if(DFTRACER_UTILS_BUILD_BINARIES) ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_organize.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_reconstruct.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_server.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_comparator.cpp) + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_comparator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_gen_dlio_config.cpp) set(DFTRACER_MPI_BINARIES "") if(DFTRACER_UTILS_ENABLE_MPI) diff --git a/src/dftracer/utils/binaries/dftracer_aggregator.cpp b/src/dftracer/utils/binaries/dftracer_aggregator.cpp index 6e1a4d12..03deb880 100644 --- a/src/dftracer/utils/binaries/dftracer_aggregator.cpp +++ b/src/dftracer/utils/binaries/dftracer_aggregator.cpp @@ -1,22 +1,15 @@ #include +#include #include -#include -#include -#include -#include -#include +#include +#include #include -#include -#include -#include -#include "common_cli.h" -#include "dftracer/utils/core/utils/timer.h" -#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC -#include -#endif #include -#include +#include +#include + +#include "common_cli.h" using namespace dftracer::utils; using namespace 
namespace {

/// Splits a comma-separated CLI value into its non-empty fields.
///
/// Fields are returned verbatim (no whitespace trimming); empty fields, such
/// as those produced by leading, trailing or doubled commas, are dropped.
///
/// @param str Raw option value, e.g. "cat,name,,pid".
/// @return The non-empty fields in their original order; empty when `str` is
///         empty or contains only commas.
std::vector<std::string> split_csv(const std::string& str) {
    std::vector<std::string> out;
    std::size_t begin = 0;
    while (begin < str.size()) {
        std::size_t end = str.find(',', begin);
        if (end == std::string::npos) end = str.size();
        // Skip empty fields instead of emitting "" entries.
        if (end > begin) out.emplace_back(str, begin, end - begin);
        begin = end + 1;
    }
    return out;
}

}  // namespace
batch_config->use_batch_write = true; - - auto agg_config_ptr = - std::make_shared(std::move(agg_config)); - batch_config->dft_visitor_factory = - [agg_db, config_hash, agg_config_ptr](const std::string& file_path) - -> std::vector> { - std::vector> visitors; - visitors.push_back(std::make_unique( - agg_db, config_hash, *agg_config_ptr, file_path)); - return visitors; - }; - - co_return co_await indexer::IndexBatchBuilderUtility::process( - scope, std::move(batch_config)); -} +} // namespace -static PerfettoTraceWriterInput build_streaming_input( - EventAggregator* merger_ptr, const AggregationConfig* agg_config, - const std::string* output_file, bool compress_output, int compression_level, - PerfettoEventFormat event_format) { - auto global_tracker = merger_ptr->build_global_tracker(); - - PerfettoTraceWriterInput input; - input.output_path = *output_file; - input.aggregator = merger_ptr; - input.tracker = global_tracker.get(); - input.agg_config = agg_config; - input.owned_tracker = std::move(global_tracker); - input.root_pids = input.tracker->get_root_pids(); - input.compute_statistics = agg_config->compute_statistics; - input.compute_percentiles = agg_config->compute_percentiles; - input.percentiles = agg_config->percentiles; - input.compress = compress_output; - input.compression_level = compression_level; - input.format = event_format; - - const auto& intervals = input.tracker->get_all_intervals(); - if (!intervals.empty()) { - std::uint64_t global_min = UINT64_MAX; - std::uint64_t global_max = 0; - for (const auto& interval : intervals) { - global_min = std::min(global_min, interval.start_ts); - global_max = std::max(global_max, interval.end_ts); - auto& range = input.boundary_ranges[interval.name][interval.value]; - if (range.ts == 0 && range.te == 0) { - range.ts = interval.start_ts; - range.te = interval.end_ts; - } else { - range.ts = std::min(range.ts, interval.start_ts); - range.te = std::max(range.te, interval.end_ts); - } - } - if (global_max > 
global_min) { - input.trace_duration = global_max - global_min; - } - } +int main(int argc, char** argv) { + DFTRACER_UTILS_LOGGER_INIT(); - return input; -} + argparse::ArgumentParser program("dftracer_aggregator", + DFTRACER_UTILS_PACKAGE_VERSION); + program.add_description( + "Aggregate DFTracer events into time-series counters using streaming " + "coroutine pipeline with minimal memory footprint"); -static coro::CoroTask run_aggregator(const AggregatorArgParse* cli) { - auto log_dir = cli->directory.value; - auto output_file = cli->output; - auto time_interval_ms = cli->time_interval; - std::uint64_t time_interval_us = - static_cast(time_interval_ms * 1000.0); - const auto& group_keys_str = cli->group_keys; - const auto& metric_fields_str = cli->metric_fields; - const auto& query_str = cli->query_args.query; - auto force_rebuild = cli->indexing.force; - auto checkpoint_size = cli->indexing.checkpoint_size; - auto executor_threads = cli->pipeline.executor_threads; - auto index_dir = cli->indexing.index_dir; - auto compress_output = cli->compress; - auto compression_level = cli->compression_level; - const auto& boundary_events_str = cli->boundary_events; - auto no_track_parents = cli->no_track_parents; - const auto& event_format_str = cli->event_format; - auto compute_percentiles = cli->compute_percentiles; - const auto& percentiles_str = cli->percentiles; - auto relative_accuracy = cli->relative_accuracy; - const auto& output_format = cli->format; - - if (!AggregationConfig::is_valid_format(output_format)) { - DFTRACER_UTILS_LOG_ERROR( - "Invalid output format: %s (supported: %s)", output_format.c_str(), - AggregationConfig::supported_formats_str().c_str()); - co_return 1; - } + AggregatorArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; + // Resolve enum-like CLI strings. 
PerfettoEventFormat event_format = PerfettoEventFormat::COUNTER; - if (event_format_str == "async") { + if (cli.event_format == "async") { event_format = PerfettoEventFormat::ASYNC; - } else if (event_format_str == "regular") { + } else if (cli.event_format == "regular") { event_format = PerfettoEventFormat::REGULAR; - } else if (event_format_str != "counter") { + } else if (cli.event_format != "counter") { DFTRACER_UTILS_LOG_ERROR( "Invalid event format: %s (must be 'counter', 'async', or " "'regular')", - event_format_str.c_str()); - co_return 1; + cli.event_format.c_str()); + return 1; } - if (output_format == AggregationConfig::FORMAT_ARROW) { + // Output filename: append extension if missing. + std::string output_file = cli.output; + if (cli.format == AggregationConfig::FORMAT_ARROW) { constexpr std::string_view ext = ".arrows"; if (output_file.size() < ext.size() || output_file.substr(output_file.size() - ext.size()) != ext) { output_file += ext; } - } else if (compress_output) { + } else if (cli.compress) { if (output_file.size() < 3 || output_file.substr(output_file.size() - 3) != ".gz") { output_file += ".gz"; } } - auto split_string = [](const std::string& str) { - std::vector result; - if (str.empty()) return result; - std::stringstream ss(str); + // Parse boundary events. 
+ std::vector boundary_events; + { + std::stringstream ss(cli.boundary_events); std::string item; while (std::getline(ss, item, ',')) { - if (!item.empty()) { - result.push_back(item); + std::stringstream item_ss(item); + std::string event_name, value_field, output_name; + if (std::getline(item_ss, event_name, ':') && + std::getline(item_ss, value_field, ':') && + std::getline(item_ss, output_name, ':')) { + BoundaryEventConfig bec; + bec.event_name = event_name; + bec.value_field = value_field; + bec.output_name = output_name; + boundary_events.push_back(bec); } } - return result; - }; + } - std::vector group_keys = split_string(group_keys_str); - std::vector metric_fields = split_string(metric_fields_str); + // Parse percentiles. std::vector percentiles; - if (compute_percentiles) { - auto percentile_strs = split_string(percentiles_str); - for (const auto& p_str : percentile_strs) { + if (cli.compute_percentiles) { + for (const auto& p_str : split_csv(cli.percentiles)) { try { double p = std::stod(p_str); - if (p >= 0.0 && p <= 1.0) { - percentiles.push_back(p); - } else { + if (p < 0.0 || p > 1.0) { DFTRACER_UTILS_LOG_ERROR( "Invalid percentile value: %s (must be in [0.0, 1.0])", p_str.c_str()); - co_return 1; + return 1; } - } catch (const std::exception& e) { + percentiles.push_back(p); + } catch (const std::exception&) { DFTRACER_UTILS_LOG_ERROR("Failed to parse percentile: %s", p_str.c_str()); - co_return 1; + return 1; } } if (percentiles.empty()) { DFTRACER_UTILS_LOG_ERROR( "No valid percentiles specified with --compute-percentiles"); - co_return 1; - } - } - - log_dir = fs::absolute(log_dir).string(); - output_file = fs::absolute(output_file).string(); - - std::printf("==========================================\n"); - std::printf("DFTracer Aggregator (Streaming Pipeline)\n"); - std::printf("==========================================\n"); - std::printf("Arguments:\n"); - std::printf(" Input directory: %s\n", log_dir.c_str()); - std::printf(" Output file: 
%s\n", output_file.c_str()); - std::printf(" Time interval: %.2f ms (%llu us)\n", time_interval_ms, - static_cast(time_interval_us)); - std::printf(" Force rebuild: %s\n", force_rebuild ? "true" : "false"); - std::printf(" Checkpoint size: %zu bytes (%.2f MB)\n", checkpoint_size, - static_cast(checkpoint_size) / (1024.0 * 1024.0)); - std::printf(" Executor threads: %zu\n", executor_threads); - - if (!group_keys.empty()) { - std::printf(" Extra group keys: "); - for (std::size_t i = 0; i < group_keys.size(); ++i) { - std::printf("%s%s", group_keys[i].c_str(), - i < group_keys.size() - 1 ? ", " : "\n"); - } - } - - if (!metric_fields.empty()) { - std::printf(" Custom metric fields: "); - for (std::size_t i = 0; i < metric_fields.size(); ++i) { - std::printf("%s%s", metric_fields[i].c_str(), - i < metric_fields.size() - 1 ? ", " : "\n"); + return 1; } } - std::printf("==========================================\n\n"); - - std::vector boundary_events; - if (!boundary_events_str.empty()) { - std::stringstream ss(boundary_events_str); - std::string item; - while (std::getline(ss, item, ',')) { - std::stringstream item_ss(item); - std::string event_name, value_field, output_name; - - if (std::getline(item_ss, event_name, ':') && - std::getline(item_ss, value_field, ':') && - std::getline(item_ss, output_name, ':')) { - BoundaryEventConfig config; - config.event_name = event_name; - config.value_field = value_field; - config.output_name = output_name; - boundary_events.push_back(config); - } - } + if (!cli.query_args.query.empty()) { + DFTRACER_UTILS_LOG_WARN( + "--query is not yet supported in fused mode, ignoring"); } AggregationConfig agg_config; - agg_config.time_interval_us = time_interval_us; - agg_config.extra_group_keys = group_keys; - agg_config.custom_metric_fields = metric_fields; + agg_config.time_interval_us = + static_cast(cli.time_interval * 1000.0); + agg_config.extra_group_keys = split_csv(cli.group_keys); + agg_config.custom_metric_fields = 
split_csv(cli.metric_fields); agg_config.compute_statistics = true; - agg_config.compute_percentiles = compute_percentiles; - agg_config.sketch_accuracy = relative_accuracy; + agg_config.compute_percentiles = cli.compute_percentiles; + agg_config.sketch_accuracy = cli.relative_accuracy; agg_config.percentiles = percentiles; agg_config.boundary_events = boundary_events; - agg_config.track_process_parents = !no_track_parents; - agg_config.track_default_args = !cli->no_default_args; - - if (!query_str.empty()) { - DFTRACER_UTILS_LOG_WARN( - "--query is not yet supported in fused mode, ignoring"); - } - - // Use hash=0 for simplicity (no config-based filtering) - constexpr std::uint32_t config_hash = 0; + agg_config.track_process_parents = !cli.no_track_parents; + agg_config.track_default_args = !cli.no_default_args; Timer stages_storage("dftracer_aggregator"); - Timer* stages = cli->pipeline.time_profiling ? &stages_storage : nullptr; - Timer overall(true); - - namespace idx = composites::dft::indexing; - - auto scan_result = std::make_unique(); - { - ScopedTimer _t(stages, "scan_and_resolve"); - idx::IndexResolverUtility resolver; - idx::ResolverInput input; - input.directory = log_dir; - input.index_dir = index_dir; - input.require_aggregation = !force_rebuild; - input.aggregation_config = agg_config; - *scan_result = co_await resolver.process(input); - } - - auto& input_files = scan_result->all_files; - if (input_files.empty()) { - DFTRACER_UTILS_LOG_ERROR("No .pfw or .pfw.gz files found in: %s", - log_dir.c_str()); - co_return 1; - } - - DFTRACER_UTILS_LOG_INFO("Found %zu input files", input_files.size()); - - auto& shared_index_path = scan_result->index_path; - - auto pipeline_config = - cli::build_pipeline_config("DFTracer Aggregator", cli->pipeline); - - Pipeline pipeline(pipeline_config); - - if (force_rebuild && fs::exists(shared_index_path)) { - DFTRACER_UTILS_LOG_INFO("Clearing shared index store: %s", - shared_index_path.c_str()); - 
fs::remove_all(shared_index_path); - } - - std::shared_ptr agg_db; - std::unique_ptr merger; - { - ScopedTimer _t(stages, "open_rocksdb"); - agg_db = EventAggregator::open_with_merge_operator(shared_index_path); - merger = std::make_unique(agg_db, config_hash); - } - - // Files to process: needs_checkpoint (index + aggregate) + - // needs_aggregation - const std::size_t num_needing_index = scan_result->needs_checkpoint.size(); - const std::size_t num_needing_agg_only = - force_rebuild ? scan_result->cached.size() - : scan_result->needs_aggregation.size(); - const std::size_t num_cached = - force_rebuild ? 0 : scan_result->total_cached(); - - std::vector files_to_process; - files_to_process.reserve(num_needing_index + num_needing_agg_only); - for (auto& item : scan_result->needs_checkpoint) { - files_to_process.push_back(std::move(item.file_path)); - } - if (force_rebuild) { - for (auto& item : scan_result->cached) { - files_to_process.push_back(std::move(item.file_path)); - } - } else { - for (auto& item : scan_result->needs_aggregation) { - files_to_process.push_back(std::move(item.file_path)); - } - } - - DFTRACER_UTILS_LOG_INFO( - "Files to process: %zu (%zu need indexing, %zu need aggregation only, " - "%zu cached)", - files_to_process.size(), num_needing_index, num_needing_agg_only, - num_cached); - - bool write_success = false; - std::size_t total_keys = 0; - std::atomic perfetto_keys_written{0}; - - auto main_task = make_task( - [&](CoroScope& scope) -> coro::CoroTask { - if (!files_to_process.empty()) { - { - ScopedTimer _t(stages, "index_and_aggregate"); - auto batch_result = co_await batch_index_and_aggregate( - &scope, files_to_process, index_dir, checkpoint_size, - force_rebuild, executor_threads, agg_config, agg_db, - config_hash); - - { - ScopedTimer _vd(stages, "visitor_drain"); - for (auto& file_visitors : - batch_result.extra_visitors) { - for (auto& visitor : file_visitors) { - auto* agg_visitor = - dynamic_cast( - visitor.get()); - if 
(agg_visitor) { - for (const auto& k : - agg_visitor->observed_extra_keys()) - merger->add_observed_extra_key(k); - for (const auto& m : - agg_visitor->observed_custom_metrics()) - merger->add_observed_custom_metric(m); - auto output = agg_visitor->take_output(); - merger->merge_chunk(std::move(output)); - } - } - file_visitors.clear(); - } - } - } - - // Write tracking entries for processed files - { - ScopedTimer _wt(stages, "write_tracking"); - write_aggregation_tracking(agg_db.get(), agg_config, - files_to_process, - shared_index_path); - } - } - - ScopedTimer _pp(stages, "post_processing"); - -#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC - if (output_format == AggregationConfig::FORMAT_ARROW) { - using namespace utilities::common::arrow; - - std::unique_ptr global_tracker; - { - ScopedTimer _bt(stages, "build_global_tracker"); - global_tracker = merger->build_global_tracker(); - } - (void)global_tracker; - - EventAggregator::ObservedColumns obs; - { - ScopedTimer _oc(stages, "observed_columns"); - obs = merger->observed_columns(); - } - auto& global_extra_key_ids = obs.extra_key_ids; - auto& global_custom_metric_names = obs.custom_metric_names; - - IpcWriter ipc; - if (co_await ipc.open(output_file) != 0) { - DFTRACER_UTILS_LOG_ERROR( - "Failed to open Arrow IPC file: %s", - output_file.c_str()); - } else { - ScopedTimer _aw(stages, "arrow_scan_write"); - constexpr std::size_t BATCH_ROWS = 10000; - AggregationBatch batch; - batch.entries.reserve(BATCH_ROWS); - batch.global_extra_key_ids = &global_extra_key_ids; - batch.global_custom_metric_names = - &global_custom_metric_names; - - std::vector pending_batches; - merger->scan([&](AggMapType, const AggregationKey& key, - AggregationMetrics& metrics) { - total_keys++; - batch.entries.emplace_back(key, std::move(metrics)); - if (batch.entries.size() >= BATCH_ROWS) { - pending_batches.push_back(batch.to_arrow()); - batch.entries.clear(); - } - return true; - }); - if (!batch.entries.empty()) { - 
pending_batches.push_back(batch.to_arrow()); - } - - write_success = true; - for (auto& ab : pending_batches) { - if (co_await ipc.write_batch(ab) != 0) { - write_success = false; - break; - } - } - if (write_success) { - write_success = (co_await ipc.close() == 0); - } else { - co_await ipc.close(); - } - } - } else -#endif - { - PerfettoTraceWriterInput streaming_input; - { - ScopedTimer _si(stages, "build_streaming_input"); - streaming_input = build_streaming_input( - merger.get(), &agg_config, &output_file, - compress_output, compression_level, event_format); - streaming_input.keys_written = &perfetto_keys_written; - streaming_input.merge_on_sharded = true; - } - { - ScopedTimer _pw(stages, "perfetto_write"); - PerfettoTraceWriterUtility writer; - write_success = co_await scope.spawn( - writer, std::move(streaming_input)); - } - total_keys = perfetto_keys_written.load(); - } - }, - "AggregatorMain"); - - pipeline.set_source(main_task); - { - ScopedTimer _t(stages, "pipeline_execute"); - pipeline.execute(); - } - - { - ScopedTimer _t(stages, "close_rocksdb"); - merger.reset(); - agg_db.reset(); - } - - overall.stop(); - double duration_ms = static_cast(overall.elapsed()) / 1e6; - - std::printf("\n"); - std::printf("==========================================\n"); - std::printf("Aggregation Results\n"); - std::printf("==========================================\n"); - std::printf(" Execution time: %.2f seconds\n", duration_ms / 1000.0); - std::printf(" Files: %zu total, %zu processed, %zu cached\n", - input_files.size(), files_to_process.size(), num_cached); - std::printf(" Unique aggregation keys: %zu\n", total_keys); - std::printf(" Output file: %s\n", output_file.c_str()); - std::printf(" Write status: %s\n", write_success ? "SUCCESS" : "FAILED"); - std::printf("==========================================\n"); - - if (stages) stages->print_stages(); - - co_return write_success ? 
0 : 1; -} - -int main(int argc, char** argv) { - DFTRACER_UTILS_LOGGER_INIT(); - - argparse::ArgumentParser program("dftracer_aggregator", - DFTRACER_UTILS_PACKAGE_VERSION); - program.add_description( - "Aggregate DFTracer events into time-series counters using streaming " - "coroutine pipeline with minimal memory footprint"); - - AggregatorArgParse cli(program); - cli.setup(); - if (!cli.parse(argc, argv)) return 1; - - return run_aggregator(&cli).get(); + Timer* stages = cli.pipeline.time_profiling ? &stages_storage : nullptr; + + AggregationRunInput input; + input.log_dir = cli.directory.value; + input.index_dir = cli.indexing.index_dir; + input.agg_config = std::move(agg_config); + input.pipeline_config = + cli::build_pipeline_config("DFTracer Aggregator", cli.pipeline); + input.output_file = std::move(output_file); + input.output_format = cli.format; + input.event_format = event_format; + input.compress_output = cli.compress; + input.compression_level = cli.compression_level; + input.force_rebuild = cli.indexing.force; + input.checkpoint_size = cli.indexing.checkpoint_size; + input.stages = stages; + input.verbose = true; + + auto result = run_aggregation(std::move(input)).get(); + return result.success ? 
0 : 1; } diff --git a/src/dftracer/utils/binaries/dftracer_gen_dlio_config.cpp b/src/dftracer/utils/binaries/dftracer_gen_dlio_config.cpp new file mode 100644 index 00000000..ee171068 --- /dev/null +++ b/src/dftracer/utils/binaries/dftracer_gen_dlio_config.cpp @@ -0,0 +1,354 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common_cli.h" + +using namespace dftracer::utils; +using namespace dftracer::utils::utilities; +namespace agg = dftracer::utils::utilities::composites::dft::aggregators; +namespace dlio = dftracer::utils::utilities::dlio; +namespace stats = dftracer::utils::utilities::common::statistics; + +namespace { + +class GenDlioConfigArgParse : public cli::ArgParse { + public: + cli::DirectoryArgs directory{ + cli::DirMode::DEFAULT_DOT, + "Input directory containing .pfw or .pfw.gz traces"}; + cli::PipelineArgs pipeline; + cli::IndexingArgs indexing; + + std::string output; + double max_bound_percentile = 95.0; + int simulation_iterations = 5; + double target_e2e_error = 0.05; + double target_cdf_similarity = 0.90; + int patience = 10; + double epsilon = 1.0; + double momentum = 0.9; + double min_percentile = 50.0; + int num_workers = 8; + int prefetch_factor = 2; + std::uint64_t seed = 42; + std::uint64_t max_samples_per_entry = 100; + double time_interval = 5000.0; + + explicit GenDlioConfigArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + indexing.index_dir_help = + "Directory to store index files (default: system temp directory)"; + indexing.force_help = "Force index recreation"; + schema(directory, pipeline, indexing); + } + + protected: + void register_args() override { + parser() + .add_argument("-o", "--output") + .required() + .help("Output path for the DLIO YAML config."); + parser() + .add_argument("--max-bound-percentile") + .help("Initial max_bound percentile (0-100, 
default: 95)") + .scan<'g', double>() + .default_value(95.0); + parser() + .add_argument("--simulation-iterations") + .help( + "Max simulator iterations for percentile refinement (default: " + "5)") + .scan<'d', int>() + .default_value(5); + parser() + .add_argument("--target-e2e-error") + .help( + "Target relative E2E error to declare convergence (default: " + "0.05)") + .scan<'g', double>() + .default_value(0.05); + parser() + .add_argument("--target-cdf-similarity") + .help("Target fetch_block CDF similarity (default: 0.90)") + .scan<'g', double>() + .default_value(0.90); + parser() + .add_argument("--patience") + .help("Early-stop after this many iterations without improvement") + .scan<'d', int>() + .default_value(10); + parser() + .add_argument("--epsilon") + .help("Base step size for percentile adjustment (default: 1.0)") + .scan<'g', double>() + .default_value(1.0); + parser() + .add_argument("--momentum") + .help("Momentum factor in [0, 1) (default: 0.9)") + .scan<'g', double>() + .default_value(0.9); + parser() + .add_argument("--min-percentile") + .help("Floor on max_bound percentile (default: 50)") + .scan<'g', double>() + .default_value(50.0); + parser() + .add_argument("--num-workers") + .help("DataLoader worker count for the simulator (default: 8)") + .scan<'d', int>() + .default_value(8); + parser() + .add_argument("--prefetch-factor") + .help("DataLoader prefetch factor (default: 2)") + .scan<'d', int>() + .default_value(2); + parser() + .add_argument("--seed") + .help("Base seed for simulator + sampler (default: 42)") + .scan<'i', std::uint64_t>() + .default_value(42); + parser() + .add_argument("--max-samples-per-entry") + .help( + "Cap on synthesized samples per AGGREGATION entry (default: " + "100)") + .scan<'i', std::uint64_t>() + .default_value(100); + parser() + .add_argument("-t", "--time-interval") + .help("Aggregation time interval in ms (default: 5000)") + .scan<'g', double>() + .default_value(5000.0); + } + + void post_parse() override 
{ + output = parser().get("--output"); + max_bound_percentile = parser().get("--max-bound-percentile"); + simulation_iterations = parser().get("--simulation-iterations"); + target_e2e_error = parser().get("--target-e2e-error"); + target_cdf_similarity = parser().get("--target-cdf-similarity"); + patience = parser().get("--patience"); + epsilon = parser().get("--epsilon"); + momentum = parser().get("--momentum"); + min_percentile = parser().get("--min-percentile"); + num_workers = parser().get("--num-workers"); + prefetch_factor = parser().get("--prefetch-factor"); + seed = parser().get("--seed"); + max_samples_per_entry = + parser().get("--max-samples-per-entry"); + time_interval = parser().get("--time-interval"); + } +}; + +std::optional fit_best_model( + const std::vector& data) { + if (data.empty()) return std::nullopt; + const auto singles = stats::fit_all_single_distributions(data); + std::vector mixtures; + if (data.size() >= 20) { + mixtures.push_back(stats::fit_gaussian_mixture(data, 2)); + mixtures.push_back(stats::fit_gaussian_mixture(data, 3)); + } + const auto best = stats::select_best_model(singles, mixtures); + if (!best) return std::nullopt; + return best->model; +} + +const char* model_label(const stats::BestModel& m) { + return std::visit( + [](const auto& v) -> const char* { + using T = std::decay_t; + if constexpr (std::is_same_v) { + switch (v.kind) { + case stats::DistributionKind::Normal: + return "Normal"; + case stats::DistributionKind::Lognormal: + return "Lognormal"; + case stats::DistributionKind::Gamma: + return "Gamma"; + case stats::DistributionKind::Exponential: + return "Exponential"; + case stats::DistributionKind::Weibull: + return "Weibull"; + } + return "Unknown"; + } else { + return v.weights.size() == 2 ? 
"GMM-2" : "GMM-3"; + } + }, + m); +} + +} // namespace + +int main(int argc, char** argv) { + DFTRACER_UTILS_LOGGER_INIT(); + + argparse::ArgumentParser program("dftracer_gen_dlio_config", + DFTRACER_UTILS_PACKAGE_VERSION); + program.add_description( + "Generate a DLIO YAML configuration from raw DFTracer logs. Indexes " + "and aggregates the input directory automatically; users do not need " + "to run dftracer_aggregator separately."); + + GenDlioConfigArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; + + // --- Aggregation phase: produce / reuse the AGGREGATION CF --------------- + agg::AggregationConfig agg_config; + agg_config.time_interval_us = + static_cast(cli.time_interval * 1000.0); + agg_config.compute_statistics = true; + // DDSketch is required for high-fidelity DLIO config generation. Force it + // on so users don't have to remember the flag. + agg_config.compute_percentiles = true; + agg_config.sketch_accuracy = 0.01; + agg_config.percentiles = {0.25, 0.5, 0.75, 0.90}; + agg_config.track_process_parents = true; + agg_config.track_default_args = true; + + agg::AggregationRunInput run_input; + run_input.log_dir = cli.directory.value; + run_input.index_dir = cli.indexing.index_dir; + run_input.agg_config = std::move(agg_config); + run_input.pipeline_config = + cli::build_pipeline_config("DLIO Config Generator", cli.pipeline); + run_input.output_file = std::nullopt; // populate AGGREGATION CF only + run_input.force_rebuild = cli.indexing.force; + run_input.checkpoint_size = cli.indexing.checkpoint_size; + run_input.verbose = true; + + auto run_result = agg::run_aggregation(std::move(run_input)).get(); + if (!run_result.success || run_result.index_path.empty()) { + DFTRACER_UTILS_LOG_ERROR( + "Aggregation failed; cannot generate DLIO config"); + return 1; + } + + // --- Load aggregated traces --------------------------------------------- + dlio::TraceLoaderOptions loader_opts; + loader_opts.max_samples_per_entry = 
cli.max_samples_per_entry; + loader_opts.seed = cli.seed; + dlio::AggregatedTraces traces; + try { + traces = + dlio::load_aggregated_traces(run_result.index_path, loader_opts); + } catch (const std::exception& e) { + DFTRACER_UTILS_LOG_ERROR("Failed to load AGGREGATION CF: %s", e.what()); + return 1; + } + if (!traces.any_data) { + DFTRACER_UTILS_LOG_ERROR( + "No DLIO events (fetch.block / preprocess) found in %s", + cli.directory.value.c_str()); + return 1; + } + std::printf("\n"); + std::printf("==========================================\n"); + std::printf("DLIO Config Generation\n"); + std::printf("==========================================\n"); + std::printf(" Loaded %d rank(s), %d step(s) from index at %s\n", + traces.num_ranks, traces.num_steps, + run_result.index_path.c_str()); + std::printf(" computation_times: %zu samples (min %.6fs, max %.6fs)\n", + traces.computation_times.size(), traces.fetch_block_stats.min(), + traces.fetch_block_stats.max()); + std::printf(" preprocess_times: %zu samples (min %.6fs, max %.6fs)\n", + traces.preprocess_times.size(), traces.preprocess_stats.min(), + traces.preprocess_stats.max()); + + // --- Fit distributions --------------------------------------------------- + auto comp_model = fit_best_model(traces.computation_times); + auto prep_model = fit_best_model(traces.preprocess_times); + if (!comp_model) { + DFTRACER_UTILS_LOG_ERROR( + "Failed to fit a computation_time distribution"); + return 1; + } + std::printf(" Best computation_time model: %s\n", + model_label(*comp_model)); + if (prep_model) { + std::printf(" Best preprocess_time model: %s\n", + model_label(*prep_model)); + } else { + std::printf(" No preprocess events; skipping preprocess block\n"); + } + + // --- Optimize max_bound percentile via simulator ------------------------- + auto ctx = dlio::make_simulator_context(traces, cli.num_workers, + cli.prefetch_factor); + + dlio::OptimizerOptions opts; + opts.max_iterations = cli.simulation_iterations; + 
opts.target_e2e_error = cli.target_e2e_error; + opts.target_cdf_similarity = cli.target_cdf_similarity; + opts.patience = cli.patience; + opts.epsilon = cli.epsilon; + opts.momentum = cli.momentum; + opts.min_percentile = cli.min_percentile; + opts.initial_percentile = cli.max_bound_percentile; + opts.base_seed = cli.seed; + + auto opt = dlio::optimize_max_bound_percentile( + ctx, *comp_model, traces.computation_times, opts); + std::printf( + " Optimizer: iters=%d, best_percentile=%.2f%%, e2e_error=%.2f%%, " + "fetch_block_cdf_similarity=%.4f%s\n", + opt.iterations_used, opt.best_percentile, opt.best.e2e_error * 100.0, + opt.best.fetch_block_cdf_similarity, + opt.converged ? " (converged)" : ""); + + auto comp_sorted = traces.computation_times; + std::sort(comp_sorted.begin(), comp_sorted.end()); + const double comp_max_bound = + dlio::percentile(comp_sorted, opt.best_percentile); + + double prep_max_bound = 0.0; + if (prep_model) { + auto prep_sorted = traces.preprocess_times; + std::sort(prep_sorted.begin(), prep_sorted.end()); + prep_max_bound = dlio::percentile(prep_sorted, opt.best_percentile); + } + + // --- Emit YAML ----------------------------------------------------------- + dlio::DlioTimingBlock comp_block{*comp_model, comp_max_bound}; + std::optional prep_block_storage; + const dlio::DlioTimingBlock* prep_block_ptr = nullptr; + if (prep_model) { + prep_block_storage = dlio::DlioTimingBlock{*prep_model, prep_max_bound}; + prep_block_ptr = &(*prep_block_storage); + } + + std::ofstream out(cli.output); + if (!out) { + DFTRACER_UTILS_LOG_ERROR("Cannot open %s for writing", + cli.output.c_str()); + return 1; + } + if (!dlio::write_dlio_yaml(out, &comp_block, prep_block_ptr)) { + DFTRACER_UTILS_LOG_ERROR("Failed to write %s", cli.output.c_str()); + return 1; + } + std::printf(" Wrote DLIO config: %s\n", cli.output.c_str()); + std::printf("==========================================\n"); + return 0; +} diff --git 
a/src/dftracer/utils/utilities/common/statistics/distributions.cpp b/src/dftracer/utils/utilities/common/statistics/distributions.cpp new file mode 100644 index 00000000..546b0e41 --- /dev/null +++ b/src/dftracer/utils/utilities/common/statistics/distributions.cpp @@ -0,0 +1,448 @@ +#include + +#include +#include +#include +#include +#include +#include + +// Boost.Math standalone is configured globally via -DBOOST_MATH_STANDALONE. +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::common::statistics { + +namespace bm = boost::math; + +namespace { + +constexpr double kMinPositive = 1e-12; +constexpr int kNewtonMaxIter = 100; +constexpr double kNewtonTol = 1e-8; + +// Sample statistics computed in one pass via Welford for numerical stability. +struct SampleSummary { + std::size_t n = 0; + double mean = 0.0; + double variance = 0.0; // population variance (1/n) + double min = std::numeric_limits::infinity(); + double max = -std::numeric_limits::infinity(); + bool any_non_positive = false; +}; + +SampleSummary summarize(const std::vector& data) { + SampleSummary s; + double m2 = 0.0; + for (double x : data) { + if (x <= 0.0) s.any_non_positive = true; + ++s.n; + const double delta = x - s.mean; + s.mean += delta / static_cast(s.n); + m2 += delta * (x - s.mean); + if (x < s.min) s.min = x; + if (x > s.max) s.max = x; + } + if (s.n > 0) s.variance = m2 / static_cast(s.n); + return s; +} + +template +double ks_statistic(const std::vector& sorted_data, + DistCdf&& dist_cdf) { + const auto n = static_cast(sorted_data.size()); + double max_diff = 0.0; + // Two-sided KS: at each xi the empirical CDF jumps from (i-1)/n to i/n. + // Compare the theoretical CDF against both sides. 
+ for (std::size_t i = 0; i < sorted_data.size(); ++i) { + const double x = sorted_data[i]; + const double f_theo = dist_cdf(x); + const double f_lo = static_cast(i) / n; + const double f_hi = static_cast(i + 1) / n; + max_diff = std::max(max_diff, std::abs(f_theo - f_lo)); + max_diff = std::max(max_diff, std::abs(f_hi - f_theo)); + } + return max_diff; +} + +template +double log_likelihood(const std::vector& data, DistPdf&& dist_pdf) { + double ll = 0.0; + for (double x : data) { + const double p = dist_pdf(x); + if (p <= 0.0 || !std::isfinite(p)) + return -std::numeric_limits::infinity(); + ll += std::log(p); + } + return ll; +} + +double compute_bic(double log_l, std::size_t n, int k) { + return static_cast(k) * std::log(static_cast(n)) - + 2.0 * log_l; +} + +// ---- Per-distribution MLE ------------------------------------------------- + +FittedDistribution fit_normal(const std::vector& data, + const SampleSummary& s) { + FittedDistribution f; + f.kind = DistributionKind::Normal; + if (s.n < 2 || s.variance <= 0.0) return f; + const double sigma = std::sqrt(s.variance); + f.params = {s.mean, sigma, 0.0}; + f.valid = true; + + bm::normal dist(s.mean, sigma); + auto sorted = data; + std::sort(sorted.begin(), sorted.end()); + f.ks_stat = + ks_statistic(sorted, [&](double x) { return bm::cdf(dist, x); }); + f.log_likelihood = + log_likelihood(data, [&](double x) { return bm::pdf(dist, x); }); + f.bic = compute_bic(f.log_likelihood, s.n, free_parameter_count(f.kind)); + return f; +} + +FittedDistribution fit_lognormal(const std::vector& data, + const SampleSummary& s) { + FittedDistribution f; + f.kind = DistributionKind::Lognormal; + if (s.n < 2 || s.any_non_positive) return f; + + double mean_log = 0.0; + for (double x : data) mean_log += std::log(x); + mean_log /= static_cast(s.n); + + double var_log = 0.0; + for (double x : data) { + const double d = std::log(x) - mean_log; + var_log += d * d; + } + var_log /= static_cast(s.n); + if (var_log <= 0.0) return f; + 
+ const double sigma = std::sqrt(var_log); + f.params = {mean_log, sigma, 0.0}; + f.valid = true; + + bm::lognormal dist(mean_log, sigma); + auto sorted = data; + std::sort(sorted.begin(), sorted.end()); + f.ks_stat = + ks_statistic(sorted, [&](double x) { return bm::cdf(dist, x); }); + f.log_likelihood = + log_likelihood(data, [&](double x) { return bm::pdf(dist, x); }); + f.bic = compute_bic(f.log_likelihood, s.n, free_parameter_count(f.kind)); + return f; +} + +FittedDistribution fit_exponential(const std::vector& data, + const SampleSummary& s) { + FittedDistribution f; + f.kind = DistributionKind::Exponential; + if (s.n < 1 || s.mean <= 0.0 || s.any_non_positive) return f; + const double rate = 1.0 / s.mean; + f.params = {rate, 0.0, 0.0}; + f.valid = true; + + bm::exponential dist(rate); + auto sorted = data; + std::sort(sorted.begin(), sorted.end()); + f.ks_stat = + ks_statistic(sorted, [&](double x) { return bm::cdf(dist, x); }); + f.log_likelihood = + log_likelihood(data, [&](double x) { return bm::pdf(dist, x); }); + f.bic = compute_bic(f.log_likelihood, s.n, free_parameter_count(f.kind)); + return f; +} + +// Gamma MLE: there's no closed form. We use method-of-moments as the initial +// estimate (good enough for most timing distributions) and then refine the +// shape parameter via Newton-Raphson on the log-likelihood derivative. +// +// d/dk log L = n*ln(k/mean) - n*digamma(k) + sum(ln x_i) +// +// digamma(k) is the polygamma_0; both digamma and its derivative (trigamma) +// are provided by Boost.Math. +FittedDistribution fit_gamma(const std::vector& data, + const SampleSummary& s) { + FittedDistribution f; + f.kind = DistributionKind::Gamma; + if (s.n < 2 || s.any_non_positive || s.variance <= 0.0) return f; + + // Method-of-moments initial estimate. 
+    double k = s.mean * s.mean / s.variance;
+    if (k <= 0.0 || !std::isfinite(k)) return f;
+
+    double sum_log = 0.0;
+    for (double x : data) sum_log += std::log(x);
+    const double mean_log = sum_log / static_cast<double>(s.n);
+    const double log_mean = std::log(s.mean);
+    // s = log_mean - mean_log; for k > 0, k satisfies
+    //   ln(k) - digamma(k) = s.
+    const double rhs = log_mean - mean_log;
+    if (rhs <= 0.0) {
+        // Data is degenerate; fall back to MoM.
+    } else {
+        for (int it = 0; it < kNewtonMaxIter; ++it) {
+            const double g = std::log(k) - bm::digamma(k) - rhs;
+            const double gp = 1.0 / k - bm::trigamma(k);
+            if (!std::isfinite(g) || !std::isfinite(gp) || gp == 0.0) break;
+            const double dk = g / gp;
+            k -= dk;
+            if (k <= kMinPositive) {
+                k = kMinPositive;
+                break;
+            }
+            if (std::abs(dk) < kNewtonTol) break;
+        }
+    }
+    const double theta = s.mean / k;
+    if (k <= 0.0 || theta <= 0.0) return f;
+    f.params = {k, theta, 0.0};
+    f.valid = true;
+
+    bm::gamma_distribution<double> dist(k, theta);
+    auto sorted = data;
+    std::sort(sorted.begin(), sorted.end());
+    f.ks_stat =
+        ks_statistic(sorted, [&](double x) { return bm::cdf(dist, x); });
+    f.log_likelihood =
+        log_likelihood(data, [&](double x) { return bm::pdf(dist, x); });
+    f.bic = compute_bic(f.log_likelihood, s.n, free_parameter_count(f.kind));
+    return f;
+}
+
+// Weibull MLE: shape `k` is the root of
+//   f(k) = sum(x^k ln x) / sum(x^k) - 1/k - mean(ln x) = 0
+// solved with Newton-Raphson starting from the fixed guess k = 1. The scale
+// then follows in closed form: lambda = (mean(x^k))^(1/k).
+//
+// Returns a fit with `valid == false` when there are fewer than two samples,
+// any sample is non-positive, the variance is zero, or the iteration ends on
+// parameters that are not finite and strictly positive.
+FittedDistribution fit_weibull(const std::vector<double>& data,
+                               const SampleSummary& s) {
+    FittedDistribution f;
+    f.kind = DistributionKind::Weibull;
+    if (s.n < 2 || s.any_non_positive || s.variance <= 0.0) return f;
+
+    double sum_log = 0.0;
+    for (double x : data) sum_log += std::log(x);
+    const double mean_log = sum_log / static_cast<double>(s.n);
+
+    // Constant starting shape; no data-driven initial estimate is computed.
+    double k = 1.0;
+
+    for (int it = 0; it < kNewtonMaxIter; ++it) {
+        double s_xk = 0.0, s_xk_lnx = 0.0, s_xk_lnx2 = 0.0;
+        for (double x : data) {
+            const double lx = std::log(x);
+            const double xk = std::pow(x, k);
+            s_xk += xk;
+            s_xk_lnx += xk * lx;
+            s_xk_lnx2 += xk * lx * lx;
+        }
+        if (s_xk <= 0.0 || !std::isfinite(s_xk)) return f;
+        const double a = s_xk_lnx / s_xk;
+        // Derivative of `a` with respect to k (quotient rule).
+        const double a_prime =
+            (s_xk_lnx2 * s_xk - s_xk_lnx * s_xk_lnx) / (s_xk * s_xk);
+        const double g = a - 1.0 / k - mean_log;
+        const double gp = a_prime + 1.0 / (k * k);
+        if (!std::isfinite(g) || !std::isfinite(gp) || gp == 0.0) break;
+        const double dk = g / gp;
+        k -= dk;
+        if (k <= kMinPositive) {
+            k = kMinPositive;
+            break;
+        }
+        if (std::abs(dk) < kNewtonTol) break;
+    }
+
+    double s_xk = 0.0;
+    for (double x : data) s_xk += std::pow(x, k);
+    const double lambda = std::pow(s_xk / static_cast<double>(s.n), 1.0 / k);
+    // The last Newton update of `k` is not re-checked inside the loop, so an
+    // overflowing step can leave k (or the lambda derived from it) infinite.
+    // Reject that here rather than passing it to the distribution constructor.
+    if (!std::isfinite(k) || !std::isfinite(lambda) || k <= 0.0 ||
+        lambda <= 0.0)
+        return f;
+    f.params = {k, lambda, 0.0};
+    f.valid = true;
+
+    bm::weibull dist(k, lambda);
+    auto sorted = data;
+    std::sort(sorted.begin(), sorted.end());
+    f.ks_stat =
+        ks_statistic(sorted, [&](double x) { return bm::cdf(dist, x); });
+    f.log_likelihood =
+        log_likelihood(data, [&](double x) { return bm::pdf(dist, x); });
+    f.bic = compute_bic(f.log_likelihood, s.n, free_parameter_count(f.kind));
+    return f;
+}
+
+}  // namespace
+
+std::string_view distribution_name(DistributionKind k) {
+    switch (k) {
+        case DistributionKind::Normal:
+            return "Normal";
+        case DistributionKind::Lognormal:
+            return "Lognormal";
+        case DistributionKind::Gamma:
+            return "Gamma";
+        case DistributionKind::Exponential:
+            return "Exponential";
+        case DistributionKind::Weibull:
+            return "Weibull";
+    }
+    return "Unknown";
+}
+
+int free_parameter_count(DistributionKind kind) {
+    switch (kind) {
+        case DistributionKind::Normal:
+        case DistributionKind::Lognormal:
+        case DistributionKind::Gamma:
+        case DistributionKind::Weibull:
+            return 2;
+        case
DistributionKind::Exponential: + return 1; + } + return 0; +} + +FittedDistribution fit_single_distribution(DistributionKind kind, + const std::vector& data) { + const auto s = summarize(data); + switch (kind) { + case DistributionKind::Normal: + return fit_normal(data, s); + case DistributionKind::Lognormal: + return fit_lognormal(data, s); + case DistributionKind::Gamma: + return fit_gamma(data, s); + case DistributionKind::Exponential: + return fit_exponential(data, s); + case DistributionKind::Weibull: + return fit_weibull(data, s); + } + return {}; +} + +std::vector fit_all_single_distributions( + const std::vector& data) { + const auto s = summarize(data); + std::vector fits; + fits.reserve(5); + fits.push_back(fit_normal(data, s)); + fits.push_back(fit_lognormal(data, s)); + fits.push_back(fit_gamma(data, s)); + fits.push_back(fit_exponential(data, s)); + fits.push_back(fit_weibull(data, s)); + + std::sort(fits.begin(), fits.end(), + [](const FittedDistribution& a, const FittedDistribution& b) { + if (a.valid != b.valid) return a.valid; // valid first + return a.ks_stat < b.ks_stat; + }); + return fits; +} + +std::optional best_fit_by_ks( + const std::vector& fits) { + for (const auto& f : fits) { + if (f.valid) return f; + } + return std::nullopt; +} + +double pdf(const FittedDistribution& fit, double x) { + switch (fit.kind) { + case DistributionKind::Normal: + return bm::pdf(bm::normal(fit.params[0], fit.params[1]), x); + case DistributionKind::Lognormal: + return bm::pdf(bm::lognormal(fit.params[0], fit.params[1]), x); + case DistributionKind::Gamma: + return bm::pdf( + bm::gamma_distribution(fit.params[0], fit.params[1]), + x); + case DistributionKind::Exponential: + return bm::pdf(bm::exponential(fit.params[0]), x); + case DistributionKind::Weibull: + return bm::pdf(bm::weibull(fit.params[0], fit.params[1]), x); + } + return 0.0; +} + +double cdf(const FittedDistribution& fit, double x) { + switch (fit.kind) { + case DistributionKind::Normal: + return 
bm::cdf(bm::normal(fit.params[0], fit.params[1]), x); + case DistributionKind::Lognormal: + return bm::cdf(bm::lognormal(fit.params[0], fit.params[1]), x); + case DistributionKind::Gamma: + return bm::cdf( + bm::gamma_distribution(fit.params[0], fit.params[1]), + x); + case DistributionKind::Exponential: + return bm::cdf(bm::exponential(fit.params[0]), x); + case DistributionKind::Weibull: + return bm::cdf(bm::weibull(fit.params[0], fit.params[1]), x); + } + return 0.0; +} + +double quantile(const FittedDistribution& fit, double p) { + switch (fit.kind) { + case DistributionKind::Normal: + return bm::quantile(bm::normal(fit.params[0], fit.params[1]), p); + case DistributionKind::Lognormal: + return bm::quantile(bm::lognormal(fit.params[0], fit.params[1]), p); + case DistributionKind::Gamma: + return bm::quantile( + bm::gamma_distribution(fit.params[0], fit.params[1]), + p); + case DistributionKind::Exponential: + return bm::quantile(bm::exponential(fit.params[0]), p); + case DistributionKind::Weibull: + return bm::quantile(bm::weibull(fit.params[0], fit.params[1]), p); + } + return 0.0; +} + +Sampler make_sampler(const FittedDistribution& fit, + std::optional min_bound, + std::optional max_bound) { + if (!fit.valid) { + throw std::invalid_argument( + "make_sampler called with invalid FittedDistribution"); + } + const auto p0 = fit.params[0]; + const auto p1 = fit.params[1]; + const auto kind = fit.kind; + + auto draw = [kind, p0, p1](std::mt19937_64& rng) -> double { + switch (kind) { + case DistributionKind::Normal: + return std::normal_distribution(p0, p1)(rng); + case DistributionKind::Lognormal: + return std::lognormal_distribution(p0, p1)(rng); + case DistributionKind::Gamma: + return std::gamma_distribution(p0, p1)(rng); + case DistributionKind::Exponential: + return std::exponential_distribution(p0)(rng); + case DistributionKind::Weibull: + return std::weibull_distribution(p0, p1)(rng); + } + return 0.0; + }; + + if (!min_bound && !max_bound) { + return 
[draw](std::mt19937_64& rng) { return draw(rng); }; + } + const double lo = + min_bound.value_or(-std::numeric_limits::infinity()); + const double hi = + max_bound.value_or(std::numeric_limits::infinity()); + return [draw, lo, hi](std::mt19937_64& rng) { + return std::clamp(draw(rng), lo, hi); + }; +} + +} // namespace dftracer::utils::utilities::common::statistics diff --git a/src/dftracer/utils/utilities/common/statistics/mixture.cpp b/src/dftracer/utils/utilities/common/statistics/mixture.cpp new file mode 100644 index 00000000..3518fdf8 --- /dev/null +++ b/src/dftracer/utils/utilities/common/statistics/mixture.cpp @@ -0,0 +1,259 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::common::statistics { + +namespace bm = boost::math; + +namespace { + +constexpr double kInvSqrt2Pi = 0.3989422804014327; // 1 / sqrt(2*pi) + +inline double normal_pdf(double x, double mean, double stddev) { + const double z = (x - mean) / stddev; + return (kInvSqrt2Pi / stddev) * std::exp(-0.5 * z * z); +} + +// log(sum_k exp(log_x[k])) computed with the standard log-sum-exp trick to +// avoid underflow when component log-likelihoods diverge. +double log_sum_exp(const std::vector& log_x) { + double m = -std::numeric_limits::infinity(); + for (double v : log_x) { + if (v > m) m = v; + } + if (!std::isfinite(m)) return m; + double s = 0.0; + for (double v : log_x) s += std::exp(v - m); + return m + std::log(s); +} + +// Pick K initial means by sampling sorted-data quantiles uniformly across (0, +// 1). Avoids the degenerate case where random init collapses all components +// together. 
+std::vector initial_means(const std::vector& sorted, int K) { + std::vector means; + means.reserve(K); + for (int k = 0; k < K; ++k) { + const double q = + (static_cast(k) + 0.5) / static_cast(K); + const std::size_t idx = std::min( + sorted.size() - 1, static_cast(q * sorted.size())); + means.push_back(sorted[idx]); + } + return means; +} + +double sample_variance(const std::vector& data, double mean) { + double sq = 0.0; + for (double x : data) { + const double d = x - mean; + sq += d * d; + } + return sq / static_cast(data.size()); +} + +} // namespace + +FittedMixture fit_gaussian_mixture(const std::vector& data, int K, + const MixtureFitOptions& options) { + FittedMixture m; + if (K <= 0 || data.size() < static_cast(K) * 2) return m; + + auto sorted = data; + std::sort(sorted.begin(), sorted.end()); + + const double total_mean = std::accumulate(data.begin(), data.end(), 0.0) / + static_cast(data.size()); + const double total_var = + std::max(sample_variance(data, total_mean), options.variance_floor); + + m.weights.assign(K, 1.0 / static_cast(K)); + m.components.resize(K); + const auto means_init = initial_means(sorted, K); + for (int k = 0; k < K; ++k) { + m.components[k].mean = means_init[k]; + m.components[k].stddev = std::sqrt(total_var); + } + + const std::size_t N = data.size(); + std::vector> resp(K, std::vector(N, 0.0)); + double prev_ll = -std::numeric_limits::infinity(); + + std::vector log_comp(K); + for (int it = 0; it < options.max_iter; ++it) { + // E-step: responsibilities via log-sum-exp. + double ll = 0.0; + for (std::size_t i = 0; i < N; ++i) { + const double x = data[i]; + for (int k = 0; k < K; ++k) { + const double p = + normal_pdf(x, m.components[k].mean, m.components[k].stddev); + log_comp[k] = (p > 0.0 && std::isfinite(p)) + ? 
std::log(m.weights[k]) + std::log(p) + : -std::numeric_limits::infinity(); + } + const double lse = log_sum_exp(log_comp); + ll += lse; + for (int k = 0; k < K; ++k) { + resp[k][i] = std::exp(log_comp[k] - lse); + } + } + + // M-step. + for (int k = 0; k < K; ++k) { + double n_k = 0.0; + for (std::size_t i = 0; i < N; ++i) n_k += resp[k][i]; + + // Guard against an empty component. + if (n_k < 1e-12) { + m.weights[k] = 0.0; + m.components[k].stddev = + std::sqrt(std::max(total_var, options.variance_floor)); + continue; + } + + double mean = 0.0; + for (std::size_t i = 0; i < N; ++i) mean += resp[k][i] * data[i]; + mean /= n_k; + + double var = 0.0; + for (std::size_t i = 0; i < N; ++i) { + const double d = data[i] - mean; + var += resp[k][i] * d * d; + } + var /= n_k; + if (var < options.variance_floor) var = options.variance_floor; + + m.weights[k] = n_k / static_cast(N); + m.components[k].mean = mean; + m.components[k].stddev = std::sqrt(var); + } + + m.iterations = it + 1; + if (std::isfinite(ll) && std::abs(ll - prev_ll) < options.tol) { + m.converged = true; + prev_ll = ll; + break; + } + prev_ll = ll; + } + + // Final log-likelihood pass (uses the converged parameters). + double ll = 0.0; + for (double x : data) { + for (int k = 0; k < K; ++k) { + const double p = + normal_pdf(x, m.components[k].mean, m.components[k].stddev); + log_comp[k] = (p > 0.0 && std::isfinite(p)) + ? 
std::log(m.weights[k]) + std::log(p)
+                          : -std::numeric_limits<double>::infinity();
+        }
+        ll += log_sum_exp(log_comp);
+    }
+    m.log_likelihood = ll;
+    m.bic = static_cast<double>(free_parameter_count(m)) *
+                std::log(static_cast<double>(N)) -
+            2.0 * ll;
+    m.valid = std::isfinite(ll);
+    return m;
+}
+
+// Number of free parameters used for the mixture's BIC: 3 * K - 1, where K
+// is the number of weights stored in `mix`.
+int free_parameter_count(const FittedMixture& mix) {
+    const int K = static_cast<int>(mix.weights.size());
+    return 3 * K - 1;
+}
+
+// Mixture density at `x`: the weight-scaled sum of the component normal
+// densities.
+double pdf(const FittedMixture& mix, double x) {
+    double p = 0.0;
+    for (std::size_t k = 0; k < mix.weights.size(); ++k) {
+        p += mix.weights[k] *
+             normal_pdf(x, mix.components[k].mean, mix.components[k].stddev);
+    }
+    return p;
+}
+
+// Mixture CDF at `x`: the weight-scaled sum of the component normal CDFs.
+double cdf(const FittedMixture& mix, double x) {
+    double c = 0.0;
+    for (std::size_t k = 0; k < mix.weights.size(); ++k) {
+        c += mix.weights[k] * bm::cdf(bm::normal(mix.components[k].mean,
+                                                 mix.components[k].stddev),
+                                      x);
+    }
+    return c;
+}
+
+// Builds a sampler for a fitted mixture: each draw picks a component index
+// with probability proportional to its weight, then samples that
+// component's normal distribution. When either bound is supplied the draw
+// is clamped to [min_bound, max_bound], a missing bound being infinite.
+//
+// Throws std::invalid_argument when `mix.valid` is false.
+Sampler make_sampler(const FittedMixture& mix, std::optional<double> min_bound,
+                     std::optional<double> max_bound) {
+    if (!mix.valid) {
+        throw std::invalid_argument(
+            "make_sampler called with invalid FittedMixture");
+    }
+
+    using Categorical = std::discrete_distribution<int>;
+
+    // Build the categorical parameters once here instead of constructing a
+    // discrete_distribution from the raw weights on every draw. The closure
+    // stays const-callable because the per-draw distribution object is a
+    // default-constructed local that only reads the captured parameters.
+    auto draw = [cat_params = Categorical::param_type(mix.weights.begin(),
+                                                      mix.weights.end()),
+                 comps = mix.components](std::mt19937_64& rng) -> double {
+        Categorical cat;
+        const int k = cat(rng, cat_params);
+        return std::normal_distribution<double>(comps[k].mean,
+                                                comps[k].stddev)(rng);
+    };
+
+    if (!min_bound && !max_bound) {
+        return [draw = std::move(draw)](std::mt19937_64& rng) {
+            return draw(rng);
+        };
+    }
+    const double lo =
+        min_bound.value_or(-std::numeric_limits<double>::infinity());
+    const double hi =
+        max_bound.value_or(std::numeric_limits<double>::infinity());
+    return [draw = std::move(draw), lo, hi](std::mt19937_64& rng) {
+        return std::clamp(draw(rng), lo, hi);
+    };
+}
+
+std::optional<ModelSelection> select_best_model(
+    const std::vector<FittedDistribution>& single_fits,
+    const std::vector<FittedMixture>& mixtures) {
+    std::optional<ModelSelection> best;
+    for (const auto& f : single_fits) {
+        if
(!f.valid) continue; + if (!best || f.bic < best->bic) { + best = ModelSelection{BestModel{f}, f.bic, + free_parameter_count(f.kind)}; + } + } + for (const auto& m : mixtures) { + if (!m.valid) continue; + if (!best || m.bic < best->bic) { + best = ModelSelection{BestModel{m}, m.bic, free_parameter_count(m)}; + } + } + return best; +} + +double pdf(const BestModel& m, double x) { + return std::visit([x](const auto& v) { return pdf(v, x); }, m); +} + +double cdf(const BestModel& m, double x) { + return std::visit([x](const auto& v) { return cdf(v, x); }, m); +} + +Sampler make_sampler(const BestModel& m, std::optional min_bound, + std::optional max_bound) { + return std::visit( + [&](const auto& v) { return make_sampler(v, min_bound, max_bound); }, + m); +} + +} // namespace dftracer::utils::utilities::common::statistics diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.cpp new file mode 100644 index 00000000..9a3ee4fe --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_runner.cpp @@ -0,0 +1,444 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC +#include +#endif + +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +namespace { + +namespace rcf = ::dftracer::utils::rocksdb::cf; +namespace idx = composites::dft::indexing; + +void write_aggregation_tracking(::dftracer::utils::rocksdb::RocksDatabase* db, + const AggregationConfig& config, + const std::vector& processed_files, + const std::string& index_path) { + indexer::IndexDatabase idx_db( + index_path, + ::dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + + auto batch = db->begin_batch(); + + AggGlobalConfig 
global_cfg; + global_cfg.time_interval_us = config.time_interval_us; + global_cfg.config_hash = 0; + db->put(batch, rcf::AGGREGATION, std::string_view(AGG_GLOBAL_CONFIG_KEY, 2), + serialize_agg_global_config(global_cfg)); + + for (const auto& file_path : processed_files) { + int file_id = idx_db.find_file(file_path); + if (file_id >= 0) { + auto key = make_agg_file_key(file_id); + db->put(batch, rcf::AGGREGATION, key, ""); + } + } + + db->commit_batch(batch); +} + +coro::CoroTask batch_index_and_aggregate( + CoroScope* scope, std::vector file_paths, + std::string index_dir, std::size_t checkpoint_size, bool force_rebuild, + std::size_t parallelism, AggregationConfig agg_config, + std::shared_ptr<::dftracer::utils::rocksdb::RocksDatabase> agg_db, + std::uint32_t config_hash) { + auto batch_config = std::make_shared(); + batch_config->file_paths = std::move(file_paths); + batch_config->index_dir = std::move(index_dir); + batch_config->checkpoint_size = checkpoint_size; + batch_config->parallelism = parallelism; + batch_config->force_rebuild = force_rebuild; + batch_config->use_batch_write = true; + + auto agg_config_ptr = + std::make_shared(std::move(agg_config)); + batch_config->dft_visitor_factory = + [agg_db, config_hash, agg_config_ptr](const std::string& file_path) + -> std::vector> { + std::vector> visitors; + visitors.push_back(std::make_unique( + agg_db, config_hash, *agg_config_ptr, file_path)); + return visitors; + }; + + co_return co_await indexer::IndexBatchBuilderUtility::process( + scope, std::move(batch_config)); +} + +PerfettoTraceWriterInput build_streaming_input( + EventAggregator* merger_ptr, const AggregationConfig* agg_config, + const std::string* output_file, bool compress_output, int compression_level, + PerfettoEventFormat event_format) { + auto global_tracker = merger_ptr->build_global_tracker(); + + PerfettoTraceWriterInput input; + input.output_path = *output_file; + input.aggregator = merger_ptr; + input.tracker = global_tracker.get(); + 
input.agg_config = agg_config; + input.owned_tracker = std::move(global_tracker); + input.root_pids = input.tracker->get_root_pids(); + input.compute_statistics = agg_config->compute_statistics; + input.compute_percentiles = agg_config->compute_percentiles; + input.percentiles = agg_config->percentiles; + input.compress = compress_output; + input.compression_level = compression_level; + input.format = event_format; + + const auto& intervals = input.tracker->get_all_intervals(); + if (!intervals.empty()) { + std::uint64_t global_min = UINT64_MAX; + std::uint64_t global_max = 0; + for (const auto& interval : intervals) { + global_min = std::min(global_min, interval.start_ts); + global_max = std::max(global_max, interval.end_ts); + auto& range = input.boundary_ranges[interval.name][interval.value]; + if (range.ts == 0 && range.te == 0) { + range.ts = interval.start_ts; + range.te = interval.end_ts; + } else { + range.ts = std::min(range.ts, interval.start_ts); + range.te = std::max(range.te, interval.end_ts); + } + } + if (global_max > global_min) { + input.trace_duration = global_max - global_min; + } + } + + return input; +} + +} // namespace + +coro::CoroTask run_aggregation( + AggregationRunInput input) { + AggregationRunResult result; + + if (!AggregationConfig::is_valid_format(input.output_format)) { + DFTRACER_UTILS_LOG_ERROR( + "Invalid output format: %s (supported: %s)", + input.output_format.c_str(), + AggregationConfig::supported_formats_str().c_str()); + co_return result; + } + + input.log_dir = fs::absolute(input.log_dir).string(); + if (input.output_file) { + *input.output_file = fs::absolute(*input.output_file).string(); + } + + if (input.verbose) { + std::printf("==========================================\n"); + std::printf("DFTracer Aggregator (Streaming Pipeline)\n"); + std::printf("==========================================\n"); + std::printf("Arguments:\n"); + std::printf(" Input directory: %s\n", input.log_dir.c_str()); + std::printf(" Output 
file: %s\n", + input.output_file ? input.output_file->c_str() : ""); + std::printf( + " Time interval: %llu us\n", + static_cast(input.agg_config.time_interval_us)); + std::printf(" Force rebuild: %s\n", + input.force_rebuild ? "true" : "false"); + std::printf( + " Checkpoint size: %zu bytes (%.2f MB)\n", input.checkpoint_size, + static_cast(input.checkpoint_size) / (1024.0 * 1024.0)); + std::printf(" Executor threads: %zu\n", + input.pipeline_config.executor_threads); + std::printf("==========================================\n\n"); + } + + constexpr std::uint32_t config_hash = 0; + + ::dftracer::utils::Timer* stages = input.stages; + ::dftracer::utils::Timer overall(true); + + auto scan_result = std::make_unique(); + { + ::dftracer::utils::ScopedTimer _t(stages, "scan_and_resolve"); + idx::IndexResolverUtility resolver; + idx::ResolverInput resolver_input; + resolver_input.directory = input.log_dir; + resolver_input.index_dir = input.index_dir; + resolver_input.require_aggregation = !input.force_rebuild; + resolver_input.aggregation_config = input.agg_config; + *scan_result = co_await resolver.process(resolver_input); + } + + auto& input_files = scan_result->all_files; + if (input_files.empty()) { + DFTRACER_UTILS_LOG_ERROR("No .pfw or .pfw.gz files found in: %s", + input.log_dir.c_str()); + co_return result; + } + + DFTRACER_UTILS_LOG_INFO("Found %zu input files", input_files.size()); + + auto& shared_index_path = scan_result->index_path; + result.index_path = shared_index_path; + result.input_file_count = input_files.size(); + + Pipeline pipeline(input.pipeline_config); + + if (input.force_rebuild && fs::exists(shared_index_path)) { + DFTRACER_UTILS_LOG_INFO("Clearing shared index store: %s", + shared_index_path.c_str()); + fs::remove_all(shared_index_path); + } + + std::shared_ptr<::dftracer::utils::rocksdb::RocksDatabase> agg_db; + std::unique_ptr merger; + { + ::dftracer::utils::ScopedTimer _t(stages, "open_rocksdb"); + agg_db = 
EventAggregator::open_with_merge_operator(shared_index_path); + merger = std::make_unique(agg_db, config_hash); + } + + const std::size_t num_needing_index = scan_result->needs_checkpoint.size(); + const std::size_t num_needing_agg_only = + input.force_rebuild ? scan_result->cached.size() + : scan_result->needs_aggregation.size(); + const std::size_t num_cached = + input.force_rebuild ? 0 : scan_result->total_cached(); + result.cached_file_count = num_cached; + + std::vector files_to_process; + files_to_process.reserve(num_needing_index + num_needing_agg_only); + for (auto& item : scan_result->needs_checkpoint) { + files_to_process.push_back(std::move(item.file_path)); + } + if (input.force_rebuild) { + for (auto& item : scan_result->cached) { + files_to_process.push_back(std::move(item.file_path)); + } + } else { + for (auto& item : scan_result->needs_aggregation) { + files_to_process.push_back(std::move(item.file_path)); + } + } + result.processed_file_count = files_to_process.size(); + + DFTRACER_UTILS_LOG_INFO( + "Files to process: %zu (%zu need indexing, %zu need aggregation only, " + "%zu cached)", + files_to_process.size(), num_needing_index, num_needing_agg_only, + num_cached); + + bool write_success = + !input.output_file.has_value(); // no output -> trivially OK + std::size_t total_keys = 0; + std::atomic perfetto_keys_written{0}; + + auto main_task = make_task( + [&](CoroScope& scope) -> coro::CoroTask { + if (!files_to_process.empty()) { + { + ::dftracer::utils::ScopedTimer _t(stages, + "index_and_aggregate"); + auto batch_result = co_await batch_index_and_aggregate( + &scope, files_to_process, input.index_dir, + input.checkpoint_size, input.force_rebuild, + input.pipeline_config.executor_threads, + input.agg_config, agg_db, config_hash); + + { + ::dftracer::utils::ScopedTimer _vd(stages, + "visitor_drain"); + for (auto& file_visitors : + batch_result.extra_visitors) { + for (auto& visitor : file_visitors) { + auto* agg_visitor = + dynamic_cast( + 
visitor.get()); + if (agg_visitor) { + for (const auto& k : + agg_visitor->observed_extra_keys()) + merger->add_observed_extra_key(k); + for (const auto& m : + agg_visitor->observed_custom_metrics()) + merger->add_observed_custom_metric(m); + auto output = agg_visitor->take_output(); + merger->merge_chunk(std::move(output)); + } + } + file_visitors.clear(); + } + } + } + + { + ::dftracer::utils::ScopedTimer _wt(stages, + "write_tracking"); + write_aggregation_tracking(agg_db.get(), input.agg_config, + files_to_process, + shared_index_path); + } + } + + ::dftracer::utils::ScopedTimer _pp(stages, "post_processing"); + +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + if (input.output_file && + input.output_format == AggregationConfig::FORMAT_ARROW) { + using namespace ::dftracer::utils::utilities::common::arrow; + + std::unique_ptr global_tracker; + { + ::dftracer::utils::ScopedTimer _bt(stages, + "build_global_tracker"); + global_tracker = merger->build_global_tracker(); + } + (void)global_tracker; + + EventAggregator::ObservedColumns obs; + { + ::dftracer::utils::ScopedTimer _oc(stages, + "observed_columns"); + obs = merger->observed_columns(); + } + auto& global_extra_key_ids = obs.extra_key_ids; + auto& global_custom_metric_names = obs.custom_metric_names; + + IpcWriter ipc; + if (co_await ipc.open(*input.output_file) != 0) { + DFTRACER_UTILS_LOG_ERROR( + "Failed to open Arrow IPC file: %s", + input.output_file->c_str()); + } else { + ::dftracer::utils::ScopedTimer _aw(stages, + "arrow_scan_write"); + constexpr std::size_t BATCH_ROWS = 10000; + AggregationBatch batch; + batch.entries.reserve(BATCH_ROWS); + batch.global_extra_key_ids = &global_extra_key_ids; + batch.global_custom_metric_names = + &global_custom_metric_names; + + std::vector pending_batches; + merger->scan([&](AggMapType, const AggregationKey& key, + AggregationMetrics& metrics) { + total_keys++; + batch.entries.emplace_back(key, std::move(metrics)); + if (batch.entries.size() >= BATCH_ROWS) { + 
pending_batches.push_back(batch.to_arrow()); + batch.entries.clear(); + } + return true; + }); + if (!batch.entries.empty()) { + pending_batches.push_back(batch.to_arrow()); + } + + write_success = true; + for (auto& ab : pending_batches) { + if (co_await ipc.write_batch(ab) != 0) { + write_success = false; + break; + } + } + if (write_success) { + write_success = (co_await ipc.close() == 0); + } else { + co_await ipc.close(); + } + } + } else +#endif + if (input.output_file) { + PerfettoTraceWriterInput streaming_input; + { + ::dftracer::utils::ScopedTimer _si(stages, + "build_streaming_input"); + streaming_input = build_streaming_input( + merger.get(), &input.agg_config, &(*input.output_file), + input.compress_output, input.compression_level, + input.event_format); + streaming_input.keys_written = &perfetto_keys_written; + streaming_input.merge_on_sharded = true; + } + { + ::dftracer::utils::ScopedTimer _pw(stages, + "perfetto_write"); + PerfettoTraceWriterUtility writer; + write_success = co_await scope.spawn( + writer, std::move(streaming_input)); + } + total_keys = perfetto_keys_written.load(); + } else { + // No output file requested: just count keys in the AGGREGATION + // CF so callers get a meaningful total_keys. 
+ merger->scan([&](AggMapType, const AggregationKey&, + AggregationMetrics&) { + total_keys++; + return true; + }); + } + }, + "AggregatorMain"); + + pipeline.set_source(main_task); + { + ::dftracer::utils::ScopedTimer _t(stages, "pipeline_execute"); + pipeline.execute(); + } + + { + ::dftracer::utils::ScopedTimer _t(stages, "close_rocksdb"); + merger.reset(); + agg_db.reset(); + } + + overall.stop(); + result.elapsed_ms = static_cast(overall.elapsed()) / 1e6; + result.total_keys = total_keys; + result.success = write_success; + + if (input.verbose) { + std::printf("\n==========================================\n"); + std::printf("Aggregation Results\n"); + std::printf("==========================================\n"); + std::printf(" Execution time: %.2f seconds\n", + result.elapsed_ms / 1000.0); + std::printf(" Files: %zu total, %zu processed, %zu cached\n", + result.input_file_count, result.processed_file_count, + result.cached_file_count); + std::printf(" Unique aggregation keys: %zu\n", result.total_keys); + if (input.output_file) { + std::printf(" Output file: %s\n", input.output_file->c_str()); + std::printf(" Write status: %s\n", + result.success ? 
// Population variance of `values` (divides by N, not N - 1).
// Returns 0.0 for an empty input so callers never see a division by zero.
double variance(const std::vector<double>& values) {
    const std::size_t n = values.size();
    if (n == 0) return 0.0;
    const double count = static_cast<double>(n);

    // First pass: arithmetic mean.
    double total = 0.0;
    for (std::size_t i = 0; i < n; ++i) total += values[i];
    const double mean = total / count;

    // Second pass: mean of squared deviations from that mean.
    double squared_deviation_sum = 0.0;
    for (std::size_t i = 0; i < n; ++i) {
        const double deviation = values[i] - mean;
        squared_deviation_sum += deviation * deviation;
    }
    return squared_deviation_sum / count;
}
all.reserve(as.size() + bs.size()); + all.insert(all.end(), as.begin(), as.end()); + all.insert(all.end(), bs.begin(), bs.end()); + std::sort(all.begin(), all.end()); + all.erase(std::unique(all.begin(), all.end()), all.end()); + + double max_diff = 0.0; + for (double v : all) { + const auto ca = static_cast( + std::upper_bound(as.begin(), as.end(), v) - as.begin()); + const auto cb = static_cast( + std::upper_bound(bs.begin(), bs.end(), v) - bs.begin()); + double diff = std::abs(ca / na - cb / nb); + if (diff > max_diff) max_diff = diff; + } + return 1.0 - max_diff; +} + +std::vector WorkerQueue::produce_batches( + double current_time, const BatchTimeSampler& sampler) { + if (worker_free_times_.empty()) { + worker_free_times_.assign(static_cast(num_workers_), + current_time); + } + + std::vector intervals; + while (ready_batches_.size() < queue_capacity_) { + const auto earliest_it = std::min_element(worker_free_times_.begin(), + worker_free_times_.end()); + const double worker_available = *earliest_it; + + auto [batch_time, preprocess_time] = sampler(); + const double batch_ready = worker_available + batch_time; + intervals.push_back({worker_available, batch_ready, preprocess_time}); + + *earliest_it = batch_ready; + ready_batches_.push_back(batch_ready); + } + std::sort(ready_batches_.begin(), ready_batches_.end()); + return intervals; +} + +double WorkerQueue::consume_batch(double current_time, double base_overhead) { + if (ready_batches_.empty()) { + ++stall_count_; + return base_overhead; + } + const double batch_ready = ready_batches_.front(); + ready_batches_.erase(ready_batches_.begin()); + + if (batch_ready <= current_time) return base_overhead; + + ++stall_count_; + return (batch_ready - current_time) + base_overhead; +} + +BarrierSimulationResult BarrierSimulator::simulate( + const BarrierSimulatorContext& ctx, std::uint64_t base_seed, + const Sampler& fetch_block_sampler, + const Sampler& preprocess_sampler) const { + BarrierSimulationResult result; + 
+ std::vector rank_rngs; + rank_rngs.reserve(static_cast(ctx.num_ranks)); + for (int rank = 0; rank < ctx.num_ranks; ++rank) { + rank_rngs.emplace_back(base_seed + static_cast(rank)); + } + + std::vector rank_times(static_cast(ctx.num_ranks), + 0.0); + std::vector rank_work_times(static_cast(ctx.num_ranks), + 0.0); + std::vector barrier_overheads; + + std::vector boundaries; + std::vector preprocess_boundaries; + std::vector fetch_iter_boundaries; + std::vector fetch_block_boundaries; + + std::vector rank_queues; + std::vector preprocess_rngs; + preprocess_rngs.reserve(static_cast(ctx.num_ranks)); + for (int rank = 0; rank < ctx.num_ranks; ++rank) { + preprocess_rngs.emplace_back(base_seed + PREPROCESS_RNG_OFFSET + + static_cast(rank)); + } + + auto worker_batch_sampler_for = [&](int rank, bool record_simulated) { + return [&, rank, record_simulated]() -> std::pair { + Rng& rrng = preprocess_rngs[static_cast(rank)]; + double sampled_preprocess = 0.0; + double sampled_getitem = 0.0; + if (preprocess_sampler && ctx.io_stats) { + sampled_preprocess = preprocess_sampler(rrng); + const double sampled_io = + ctx.io_stats->quantile(uniform01(rrng)); + sampled_getitem = sampled_preprocess + sampled_io; + } else { + sampled_getitem = ctx.getitem_stats.quantile(uniform01(rrng)); + sampled_preprocess = + ctx.preprocess_stats.quantile(uniform01(rrng)); + } + + const double io_time = + std::max(0.0, sampled_getitem - sampled_preprocess); + const double total_time = io_time + sampled_preprocess; + const double adjusted_time = + total_time * ctx.preprocess_slowdown_factor; + + if (record_simulated) { + result.simulated_preprocess.push_back(sampled_preprocess); + result.simulated_getitem.push_back(sampled_getitem); + } + + result.preprocess_metrics.accumulated_time += sampled_preprocess; + ++result.preprocess_metrics.num_samples; + result.preprocess_metrics.stats.update(sampled_preprocess); + + return {adjusted_time, sampled_preprocess}; + }; + }; + + if 
(ctx.enable_preprocess_simulation) { + for (int rank = 0; rank < ctx.num_ranks; ++rank) { + rank_queues.emplace_back(ctx.num_workers, ctx.prefetch_factor); + } + for (int rank = 0; rank < ctx.num_ranks; ++rank) { + auto sampler = + worker_batch_sampler_for(rank, /*record_simulated=*/false); + auto intervals = + rank_queues[static_cast(rank)].produce_batches( + 0.0, sampler); + for (const auto& interval : intervals) { + preprocess_boundaries.push_back( + {static_cast(interval.start_time), +1}); + preprocess_boundaries.push_back( + {static_cast(interval.end_time), -1}); + } + } + } + + Rng step_rng(base_seed + STEP_RNG_OFFSET); + + std::uint64_t total_queue_stalls = 0; + std::uint64_t total_queue_depth_samples = 0; + double sum_queue_depth = 0.0; + + for (int step = 0; step < ctx.num_steps; ++step) { + for (int rank = 0; rank < ctx.num_ranks; ++rank) { + const auto r = static_cast(rank); + double fetch_iter = 0.0; + double fetch_block = 0.0; + + const bool have_aggregated_fetch_iter = + ctx.is_aggregated_trace && !ctx.enable_preprocess_simulation && + static_cast(ctx.fetch_iter_trace.size()) > rank && + static_cast(ctx.fetch_iter_trace[r].size()) > step; + + if (have_aggregated_fetch_iter) { + fetch_iter = + ctx.fetch_iter_trace[r][static_cast(step)]; + } else if (ctx.enable_preprocess_simulation) { + auto& queue = rank_queues[r]; + + fetch_iter = queue.consume_batch(rank_times[r], + ctx.base_fetch_iter_overhead); + result.simulated_fetch_iter.push_back(fetch_iter); + if (queue.had_stall()) ++total_queue_stalls; + sum_queue_depth += static_cast(queue.queue_depth()); + ++total_queue_depth_samples; + + auto sampler = + worker_batch_sampler_for(rank, /*record_simulated=*/true); + auto intervals = + queue.produce_batches(rank_times[r] + fetch_iter, sampler); + for (const auto& interval : intervals) { + preprocess_boundaries.push_back( + {static_cast(interval.start_time), +1}); + preprocess_boundaries.push_back( + {static_cast(interval.end_time), -1}); + } + } else { + 
fetch_iter = ctx.fetch_iter_stats.quantile(uniform01(step_rng)); + if (!ctx.is_aggregated_trace) { + fetch_iter = + std::clamp(fetch_iter, ctx.fetch_iter_stats.min(), + ctx.fetch_iter_stats.max()); + } + } + + fetch_block = fetch_block_sampler(rank_rngs[r]); + if (!ctx.is_aggregated_trace) { + fetch_block = + std::clamp(fetch_block, ctx.fetch_block_stats.min(), + ctx.fetch_block_stats.max()); + } + + result.simulated_fetch_block.push_back(fetch_block); + rank_work_times[r] += fetch_block + fetch_iter; + + const double start_time = rank_times[r]; + const double fetch_iter_end_time = start_time + fetch_iter; + const double end_time = fetch_iter_end_time + fetch_block; + + result.fetch_iter_metrics.accumulated_time += fetch_iter; + ++result.fetch_iter_metrics.num_samples; + result.fetch_iter_metrics.stats.update(fetch_iter); + fetch_iter_boundaries.push_back( + {static_cast(start_time * 1e6), +1}); + fetch_iter_boundaries.push_back( + {static_cast(fetch_iter_end_time * 1e6), -1}); + + result.fetch_block_metrics.accumulated_time += fetch_block; + ++result.fetch_block_metrics.num_samples; + result.fetch_block_metrics.stats.update(fetch_block); + fetch_block_boundaries.push_back( + {static_cast(fetch_iter_end_time * 1e6), +1}); + fetch_block_boundaries.push_back( + {static_cast(end_time * 1e6), -1}); + + if (!ctx.sync_mode) { + boundaries.push_back( + {static_cast(start_time * 1e6), +1}); + boundaries.push_back( + {static_cast(end_time * 1e6), -1}); + } + + rank_times[r] = end_time; + } + + const bool is_barrier_step = + ctx.accumulate_grad_batches > 0 && + ((step + 1) % ctx.accumulate_grad_batches == 0); + if (ctx.sync_mode && is_barrier_step) { + const double max_time = + *std::max_element(rank_times.begin(), rank_times.end()); + for (int rank = 0; rank < ctx.num_ranks; ++rank) { + const auto r = static_cast(rank); + barrier_overheads.push_back(max_time - rank_times[r]); + rank_times[r] = max_time; + } + } + } + + if (ctx.sync_mode) { + result.e2e_duration = 
rank_times.empty() ? 0.0 : rank_times.front(); + } else if (!boundaries.empty()) { + result.e2e_duration = sweep_union(boundaries); + } else if (!rank_times.empty()) { + result.e2e_duration = + *std::max_element(rank_times.begin(), rank_times.end()); + } + + if (!preprocess_boundaries.empty()) + result.preprocess_metrics.union_time = + sweep_union(preprocess_boundaries); + if (!fetch_iter_boundaries.empty()) + result.fetch_iter_metrics.union_time = + sweep_union(fetch_iter_boundaries); + if (!fetch_block_boundaries.empty()) + result.fetch_block_metrics.union_time = + sweep_union(fetch_block_boundaries); + + result.trace_preprocess_metrics = ctx.trace_preprocess_metrics; + result.trace_fetch_iter_metrics = ctx.trace_fetch_iter_metrics; + result.trace_fetch_block_metrics = ctx.trace_fetch_block_metrics; + + if (!barrier_overheads.empty()) { + double sum = 0.0; + double max_ov = -std::numeric_limits::infinity(); + for (double v : barrier_overheads) { + sum += v; + if (v > max_ov) max_ov = v; + } + result.avg_barrier_overhead = + sum / static_cast(barrier_overheads.size()); + result.max_barrier_overhead = max_ov; + } + + { + std::vector trace_flat; + for (const auto& rank_data : ctx.fetch_block_trace) { + trace_flat.insert(trace_flat.end(), rank_data.begin(), + rank_data.end()); + } + result.fetch_block_cdf_similarity = + cdf_similarity(result.simulated_fetch_block, trace_flat); + } + + if (!result.simulated_fetch_iter.empty() && !ctx.fetch_iter_trace.empty()) { + std::vector trace_flat; + for (const auto& rank_data : ctx.fetch_iter_trace) { + trace_flat.insert(trace_flat.end(), rank_data.begin(), + rank_data.end()); + } + result.fetch_iter_cdf_similarity = + cdf_similarity(result.simulated_fetch_iter, trace_flat); + } + + if (!result.simulated_getitem.empty() && ctx.getitem_trace) { + std::vector trace_flat; + for (const auto& rank_data : *ctx.getitem_trace) { + trace_flat.insert(trace_flat.end(), rank_data.begin(), + rank_data.end()); + } + 
result.getitem_cdf_similarity = + cdf_similarity(result.simulated_getitem, trace_flat); + } + + if (ctx.trace_e2e_duration > 0.0) { + result.e2e_error = + std::abs(result.e2e_duration - ctx.trace_e2e_duration) / + ctx.trace_e2e_duration; + } + + result.per_rank_completion_time = rank_times; + result.rank_variance = variance(rank_work_times); + result.trace_rank_variance = ctx.trace_rank_variance; + if (ctx.trace_rank_variance > 0.0) { + result.rank_variance_error = + std::abs(result.rank_variance - ctx.trace_rank_variance) / + ctx.trace_rank_variance; + } + + if (!rank_times.empty()) { + const double min_t = + *std::min_element(rank_times.begin(), rank_times.end()); + const double max_t = + *std::max_element(rank_times.begin(), rank_times.end()); + result.load_imbalance = (max_t - min_t) / (min_t + 1e-9); + } + + if (ctx.enable_preprocess_simulation && total_queue_depth_samples > 0) { + result.avg_queue_depth = + sum_queue_depth / static_cast(total_queue_depth_samples); + result.avg_queue_stalls = + static_cast(total_queue_stalls) / + static_cast(ctx.num_ranks * ctx.num_steps); + } + + result.simulated_per_rank_throughput.reserve( + static_cast(ctx.num_ranks)); + for (int rank = 0; rank < ctx.num_ranks; ++rank) { + const auto r = static_cast(rank); + result.simulated_per_rank_throughput.push_back( + rank_times[r] > 0.0 + ? 
static_cast(ctx.num_steps) / rank_times[r] + : 0.0); + } + result.trace_per_rank_throughput = ctx.trace_per_rank_throughput; + + if (!result.simulated_per_rank_throughput.empty() && + !result.trace_per_rank_throughput.empty()) { + double sim_sum = 0.0; + for (double v : result.simulated_per_rank_throughput) sim_sum += v; + result.throughput_mean = + sim_sum / + static_cast(result.simulated_per_rank_throughput.size()); + + double tr_sum = 0.0; + for (double v : result.trace_per_rank_throughput) tr_sum += v; + result.trace_throughput_mean = + tr_sum / + static_cast(result.trace_per_rank_throughput.size()); + + if (result.trace_throughput_mean > 0.0) { + result.throughput_mean_error = + std::abs(result.throughput_mean - + result.trace_throughput_mean) / + result.trace_throughput_mean; + } + result.throughput_variance = + variance(result.simulated_per_rank_throughput); + result.trace_throughput_variance = + variance(result.trace_per_rank_throughput); + result.throughput_cdf_similarity = + cdf_similarity(result.simulated_per_rank_throughput, + result.trace_per_rank_throughput); + } + + return result; +} + +} // namespace dftracer::utils::utilities::dlio diff --git a/src/dftracer/utils/utilities/dlio/optimizer.cpp b/src/dftracer/utils/utilities/dlio/optimizer.cpp new file mode 100644 index 00000000..9c2b5d16 --- /dev/null +++ b/src/dftracer/utils/utilities/dlio/optimizer.cpp @@ -0,0 +1,91 @@ +#include + +#include +#include +#include +#include + +namespace dftracer::utils::utilities::dlio { + +namespace stats = ::dftracer::utils::utilities::common::statistics; + +double percentile(const std::vector& sorted_data, double pct) { + if (sorted_data.empty()) return 0.0; + const double p = std::clamp(pct, 0.0, 100.0) / 100.0; + const double idx = p * static_cast(sorted_data.size() - 1); + const auto lo = static_cast(std::floor(idx)); + const auto hi = static_cast(std::ceil(idx)); + if (lo == hi) return sorted_data[lo]; + const double frac = idx - static_cast(lo); + return 
sorted_data[lo] * (1.0 - frac) + sorted_data[hi] * frac; +} + +OptimizerResult optimize_max_bound_percentile( + const BarrierSimulatorContext& context, const BestModel& model, + std::vector sample_times, const OptimizerOptions& options) { + OptimizerResult out; + out.best_percentile = options.initial_percentile; + + std::sort(sample_times.begin(), sample_times.end()); + if (sample_times.empty()) return out; + + const double sample_min = sample_times.front(); + + BarrierSimulator sim; + double current_percentile = options.initial_percentile; + double velocity = 0.0; + double best_e2e_error = std::numeric_limits::infinity(); + int iterations_without_improvement = 0; + + constexpr double kImprovementThreshold = 0.001; // 0.1% relative + + for (int iter = 0; iter < options.max_iterations; ++iter) { + const double max_bound = percentile(sample_times, current_percentile); + auto sampler = stats::make_sampler(model, sample_min, max_bound); + + auto result = sim.simulate(context, options.base_seed, sampler); + out.iterations_used = iter + 1; + + // Track best result by E2E error (improvement must beat threshold). + const bool first = (iter == 0); + const bool better = + result.e2e_error < best_e2e_error * (1.0 - kImprovementThreshold); + if (first || better) { + out.best = result; + out.best_percentile = current_percentile; + best_e2e_error = result.e2e_error; + iterations_without_improvement = 0; + } else { + ++iterations_without_improvement; + } + + const bool e2e_ok = result.e2e_error < options.target_e2e_error; + const bool cdf_ok = + result.fetch_block_cdf_similarity > options.target_cdf_similarity; + if (e2e_ok && cdf_ok) { + out.converged = true; + return out; + } + if (iterations_without_improvement >= options.patience) return out; + + // Momentum-smoothed step. Overshooting -> shrink percentile; + // undershooting -> grow it; close-but-not-converged -> nudge by CDF. 
+ double step = 0.0; + if (result.e2e_duration > context.trace_e2e_duration) { + const double aggressive = result.e2e_error > 0.10 ? 2.0 : 1.0; + step = -options.epsilon * aggressive; + } else if (result.e2e_duration < context.trace_e2e_duration * 0.95) { + step = options.epsilon * 0.5; + } else if (result.fetch_block_cdf_similarity < + options.target_cdf_similarity) { + step = -options.epsilon * 0.5; + } + velocity = options.momentum * velocity + step; + current_percentile = std::clamp(current_percentile + velocity, + options.min_percentile, 100.0); + } + + return out; +} + +} // namespace dftracer::utils::utilities::dlio diff --git a/src/dftracer/utils/utilities/dlio/trace_loader.cpp b/src/dftracer/utils/utilities/dlio/trace_loader.cpp new file mode 100644 index 00000000..be1a25c0 --- /dev/null +++ b/src/dftracer/utils/utilities/dlio/trace_loader.cpp @@ -0,0 +1,380 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::dlio { + +namespace { + +namespace agg = ::dftracer::utils::utilities::composites::dft::aggregators; +namespace rdb = ::dftracer::utils::rocksdb; + +constexpr double US_TO_S = 1e-6; + +// AGGREGATION CF contains data keys (varint-encoded) and reserved system keys +// prefixed with 0xFF{FD,FE,FF}. Filter those out. +inline bool is_system_key(std::string_view key) { + return key.size() >= 2 && static_cast(key[0]) == 0xFF; +} + +inline bool matches(std::string_view a, std::string_view b) { return a == b; } + +struct ComponentAccumulator { + // Per-(pid, time_bucket) entries used to materialize per-rank sample seqs. + // Outer map sorts by pid for deterministic rank assignment; inner sorts by + // time_bucket so the resulting sample sequence is in trace order. + std::map>> + per_pid_bucket_samples; + // Boundary list (in microseconds) for sweep_union of trace-side wall clock. 
+ std::vector boundaries; + // Merged sketch across all entries; nullptr until first sketch is seen. + std::shared_ptr sketch; + // Aggregate accumulators. + double accumulated_time_s = 0.0; // sum of count * mean (s) + std::uint64_t total_count = 0; + double min_us = 0.0; + double max_us = 0.0; + bool min_max_seen = false; +}; + +void apply_minmax(ComponentAccumulator& acc, double min_us, double max_us) { + if (!acc.min_max_seen) { + acc.min_us = min_us; + acc.max_us = max_us; + acc.min_max_seen = true; + return; + } + if (min_us < acc.min_us) acc.min_us = min_us; + if (max_us > acc.max_us) acc.max_us = max_us; +} + +// Synthesize per-call durations (seconds) for a single (cat, name, pid, bucket) +// entry. When a sketch is available we draw `n` samples via inverse-CDF for +// within-bucket variance; otherwise we replicate the per-call mean. `n` is +// clamped to `max_samples` if non-zero. +void synthesize_samples(const agg::AggregationMetrics& metrics, + std::uint64_t max_samples, std::mt19937_64& rng, + std::vector& out) { + if (metrics.count == 0) return; + const auto desired = + max_samples == 0 ? metrics.count : std::min(metrics.count, max_samples); + if (desired == 0) return; + + const double mean_s = metrics.duration.mean * US_TO_S; + if (metrics.duration.sketch) { + std::uniform_real_distribution u01(0.0, 1.0); + out.reserve(out.size() + desired); + for (std::uint64_t i = 0; i < desired; ++i) { + const double q = u01(rng); + const double v_us = metrics.duration.sketch->quantile(q); + out.push_back(v_us * US_TO_S); + } + } else { + out.insert(out.end(), desired, mean_s); + } +} + +// Drains the per-(pid, bucket) sample buckets in `acc` into per-rank flat +// vectors, in pid-ascending and then bucket-ascending order. 
+std::vector> flatten_per_rank( + const ComponentAccumulator& acc, + const std::vector& rank_pids) { + std::vector> out; + out.reserve(rank_pids.size()); + for (auto pid : rank_pids) { + std::vector rank_samples; + auto it = acc.per_pid_bucket_samples.find(pid); + if (it == acc.per_pid_bucket_samples.end()) { + out.emplace_back(); + continue; + } + for (const auto& [bucket, samples] : it->second) { + (void)bucket; + rank_samples.insert(rank_samples.end(), samples.begin(), + samples.end()); + } + out.push_back(std::move(rank_samples)); + } + return out; +} + +} // namespace + +AggregatedTraces load_aggregated_traces(const std::string& db_path, + const TraceLoaderOptions& options) { + // The AGGREGATION CF was created with a merge operator; we must re-attach + // it (read-only) so RocksDB will let us iterate the CF. Without the merge + // operator, NewIterator() returns "merge_operator_ must be set". + auto db_handle = + agg::EventAggregator::open_read_only_with_merge_operator(db_path); + if (!db_handle) { + throw std::runtime_error("dlio: failed to open RocksDB at " + db_path); + } + auto& db = *db_handle; + + // The intern dictionary must be populated before any key parsing happens. + agg::load_intern_dictionary(db); + + AggregatedTraces out; + + // Global config (time_interval_us) lives at key 0xFFFE in AGGREGATION CF. 
+ { + std::string val; + const auto st = db.get(std::string_view(agg::AGG_GLOBAL_CONFIG_KEY, 2), + &val, rdb::cf::AGGREGATION); + if (st.ok() && !val.empty()) { + const auto cfg = agg::deserialize_agg_global_config(val); + out.time_interval_us = cfg.time_interval_us; + } + } + + ComponentAccumulator acc_fetch_block; + ComponentAccumulator acc_fetch_iter; + ComponentAccumulator acc_preprocess; + ComponentAccumulator acc_getitem; + + std::unordered_set pid_set; + std::mt19937_64 rng(options.seed); + + auto it = db.new_iterator(rdb::cf::AGGREGATION); + if (!it) { + throw std::runtime_error("dlio: failed to obtain AGGREGATION iterator"); + } + + for (it->SeekToFirst(); it->Valid(); it->Next()) { + const auto key_slice = it->key(); + std::string_view key_sv(key_slice.data(), key_slice.size()); + if (is_system_key(key_sv)) continue; + + agg::AggKeyView kv; + if (!agg::parse_agg_key_view(key_sv, kv)) continue; + + ComponentAccumulator* target = nullptr; + if (matches(kv.cat, CATEGORY_DATALOADER) && + matches(kv.name, EVENT_FETCH_BLOCK)) { + target = &acc_fetch_block; + } else if (matches(kv.cat, CATEGORY_DATALOADER) && + matches(kv.name, EVENT_FETCH_ITER)) { + target = &acc_fetch_iter; + } else if (matches(kv.cat, CATEGORY_DATA) && + matches(kv.name, EVENT_PREPROCESS)) { + target = &acc_preprocess; + } else if (matches(kv.cat, CATEGORY_DATA) && + matches(kv.name, EVENT_ITEM)) { + target = &acc_getitem; + } else { + continue; + } + + const auto val_slice = it->value(); + std::string_view val_sv(val_slice.data(), val_slice.size()); + auto metrics = agg::deserialize_agg_value(val_sv); + if (metrics.count == 0) continue; + + out.any_data = true; + pid_set.insert(kv.pid); + + // Synthesize per-call samples for this entry. + auto& bucket_vec = + target->per_pid_bucket_samples[kv.pid][kv.time_bucket]; + synthesize_samples(metrics, options.max_samples_per_entry, rng, + bucket_vec); + + // Accumulate component-level state. 
+ target->accumulated_time_s += + static_cast(metrics.duration.total) * US_TO_S; + target->total_count += metrics.count; + apply_minmax(*target, static_cast(metrics.duration.min), + static_cast(metrics.duration.max)); + + // Real per-entry (ts, te) interval for trace-side union time. + if (metrics.te > metrics.ts) { + target->boundaries.push_back( + {static_cast(metrics.ts), +1}); + target->boundaries.push_back( + {static_cast(metrics.te), -1}); + } + + // Merge sketch when present. + if (metrics.duration.sketch) { + out.sketches_available = true; + if (!target->sketch) { + target->sketch = + std::make_shared(*metrics.duration.sketch); + } else { + target->sketch->merge(*metrics.duration.sketch); + } + } + } + + if (!it->status().ok()) { + throw std::runtime_error( + "dlio: iteration over AGGREGATION CF failed: " + + it->status().ToString()); + } + + if (!out.any_data) { + return out; + } + + // Rank PIDs are the pids that emitted at least one fetch.block event. + // fetch_iter is intentionally not required since some traces omit it. + std::vector rank_pids; + rank_pids.reserve(acc_fetch_block.per_pid_bucket_samples.size()); + for (const auto& [pid, _] : acc_fetch_block.per_pid_bucket_samples) { + rank_pids.push_back(pid); + } + std::sort(rank_pids.begin(), rank_pids.end()); + out.rank_pids = rank_pids; + out.num_ranks = static_cast(rank_pids.size()); + + // Per-rank traces. + out.fetch_block_trace = flatten_per_rank(acc_fetch_block, rank_pids); + out.fetch_iter_trace = flatten_per_rank(acc_fetch_iter, rank_pids); + out.getitem_trace = flatten_per_rank(acc_getitem, rank_pids); + + // num_steps = min length across ranks (conservative — drop straggler + // steps). + out.num_steps = out.num_ranks > 0 + ? static_cast(out.fetch_block_trace.front().size()) + : 0; + for (const auto& r : out.fetch_block_trace) { + out.num_steps = std::min(out.num_steps, static_cast(r.size())); + } + + // Flat fitting arrays. 
+ for (const auto& r : out.fetch_block_trace) { + out.computation_times.insert(out.computation_times.end(), r.begin(), + r.end()); + } + for (const auto& [pid, buckets] : acc_preprocess.per_pid_bucket_samples) { + (void)pid; + for (const auto& [bucket, samples] : buckets) { + (void)bucket; + out.preprocess_times.insert(out.preprocess_times.end(), + samples.begin(), samples.end()); + } + } + + // Sketches + Statistic objects. + auto attach_stat = [](Statistic& stat, const ComponentAccumulator& a) { + if (!a.min_max_seen) return; + // Seed Statistic with min and max (in seconds) so its fallback quantile + // path has reasonable bounds even without a sketch. + stat.update(a.min_us * US_TO_S); + if (a.max_us != a.min_us) stat.update(a.max_us * US_TO_S); + if (a.sketch) { + stat.attach_sketch(a.sketch); + } + }; + attach_stat(out.fetch_block_stats, acc_fetch_block); + attach_stat(out.fetch_iter_stats, acc_fetch_iter); + attach_stat(out.preprocess_stats, acc_preprocess); + attach_stat(out.getitem_stats, acc_getitem); + out.fetch_block_sketch = acc_fetch_block.sketch; + out.fetch_iter_sketch = acc_fetch_iter.sketch; + out.preprocess_sketch = acc_preprocess.sketch; + out.getitem_sketch = acc_getitem.sketch; + + // Trace-side ComponentTimeMetrics: accumulated + union (in seconds). + auto fill_metrics = [](ComponentTimeMetrics& m, ComponentAccumulator& a) { + m.accumulated_time = a.accumulated_time_s; + m.num_samples = a.total_count; + m.union_time = sweep_union(a.boundaries); // returns seconds + }; + fill_metrics(out.trace_fetch_block_metrics, acc_fetch_block); + fill_metrics(out.trace_fetch_iter_metrics, acc_fetch_iter); + fill_metrics(out.trace_preprocess_metrics, acc_preprocess); + + // Overall e2e: union of fetch_block intervals across all ranks gives the + // tightest defensible estimate for "time spent in the data path". + out.trace_e2e_duration = out.trace_fetch_block_metrics.union_time; + + // Per-rank throughput = events / rank-side fetch_block wall clock. 
We + // approximate rank wall clock as sum(per-entry te-ts) for that rank's + // fetch_block keys; matches the granularity available without raw events. + out.trace_per_rank_throughput.reserve(rank_pids.size()); + std::vector per_rank_wall; + per_rank_wall.reserve(rank_pids.size()); + for (auto pid : rank_pids) { + double wall_us = 0.0; + std::uint64_t count = 0; + auto pit = acc_fetch_block.per_pid_bucket_samples.find(pid); + if (pit != acc_fetch_block.per_pid_bucket_samples.end()) { + for (const auto& [bucket, samples] : pit->second) { + (void)bucket; + count += samples.size(); + } + // Wall clock per rank = sum of sample durations as an upper bound. + for (const auto& [bucket, samples] : pit->second) { + (void)bucket; + for (double s : samples) wall_us += s / US_TO_S; + } + } + const double wall_s = wall_us * US_TO_S; + per_rank_wall.push_back(wall_s); + out.trace_per_rank_throughput.push_back( + wall_s > 0.0 ? static_cast(count) / wall_s : 0.0); + } + + // Trace rank variance = variance of per-rank wall clock. + out.trace_rank_variance = variance(per_rank_wall); + + if (!out.sketches_available) { + std::fprintf(stderr, + "dlio: warning - AGGREGATION CF has no DDSketch data. 
" + "Distribution fitting will use mean-replication samples; " + "re-run dftracer_aggregator with --compute-percentiles " + "for higher-fidelity DLIO configs.\n"); + } + + return out; +} + +BarrierSimulatorContext make_simulator_context(const AggregatedTraces& traces, + int num_workers, + int prefetch_factor) { + BarrierSimulatorContext ctx; + ctx.num_ranks = traces.num_ranks; + ctx.num_steps = traces.num_steps; + ctx.is_aggregated_trace = true; + ctx.sync_mode = false; + + ctx.fetch_block_trace = traces.fetch_block_trace; + ctx.fetch_iter_trace = traces.fetch_iter_trace; + if (!traces.getitem_trace.empty()) { + ctx.getitem_trace = traces.getitem_trace; + } + + ctx.fetch_block_stats = traces.fetch_block_stats; + ctx.fetch_iter_stats = traces.fetch_iter_stats; + ctx.preprocess_stats = traces.preprocess_stats; + ctx.getitem_stats = traces.getitem_stats; + + ctx.trace_fetch_block_metrics = traces.trace_fetch_block_metrics; + ctx.trace_fetch_iter_metrics = traces.trace_fetch_iter_metrics; + ctx.trace_preprocess_metrics = traces.trace_preprocess_metrics; + + ctx.trace_e2e_duration = traces.trace_e2e_duration; + ctx.trace_rank_variance = traces.trace_rank_variance; + ctx.trace_per_rank_throughput = traces.trace_per_rank_throughput; + + ctx.num_workers = num_workers; + ctx.prefetch_factor = prefetch_factor; + return ctx; +} + +} // namespace dftracer::utils::utilities::dlio diff --git a/src/dftracer/utils/utilities/dlio/yaml_emit.cpp b/src/dftracer/utils/utilities/dlio/yaml_emit.cpp new file mode 100644 index 00000000..5ecb9916 --- /dev/null +++ b/src/dftracer/utils/utilities/dlio/yaml_emit.cpp @@ -0,0 +1,112 @@ +#include +#include + +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::dlio { + +namespace stats = ::dftracer::utils::utilities::common::statistics; + +namespace { + +YAML::Node emit_single(const stats::FittedDistribution& f) { + YAML::Node n; + switch (f.kind) { + case stats::DistributionKind::Normal: + n["type"] = 
"normal"; + n["mean"] = f.params[0]; + n["stdev"] = f.params[1]; + break; + case stats::DistributionKind::Lognormal: + n["type"] = "lognormal"; + n["mean"] = f.params[0]; + n["sigma"] = f.params[1]; + break; + case stats::DistributionKind::Gamma: + n["type"] = "gamma"; + n["shape"] = f.params[0]; + n["scale"] = f.params[1]; + break; + case stats::DistributionKind::Exponential: + // params[0] is rate; DLIO config expects "scale" = 1/rate. + n["type"] = "exponential"; + n["scale"] = f.params[0] > 0.0 ? 1.0 / f.params[0] : 0.0; + break; + case stats::DistributionKind::Weibull: + n["type"] = "weibull"; + n["shape"] = f.params[0]; + n["scale"] = f.params[1]; + break; + } + return n; +} + +YAML::Node emit_mixture(const stats::FittedMixture& m) { + YAML::Node n; + n["type"] = "mixture"; + n["n_components"] = static_cast(m.weights.size()); + YAML::Node components(YAML::NodeType::Sequence); + for (std::size_t k = 0; k < m.weights.size(); ++k) { + YAML::Node comp; + comp["weight"] = m.weights[k]; + YAML::Node params; + params["type"] = "normal"; + params["mean"] = m.components[k].mean; + params["stdev"] = m.components[k].stddev; + comp["params"] = params; + components.push_back(comp); + } + n["components"] = components; + return n; +} + +YAML::Node emit_block(const DlioTimingBlock& block) { + YAML::Node n = std::visit( + [](const auto& v) -> YAML::Node { + using T = std::decay_t; + if constexpr (std::is_same_v) { + return emit_single(v); + } else { + return emit_mixture(v); + } + }, + block.model); + n["max_bound"] = block.max_bound; + return n; +} + +} // namespace + +std::string render_dlio_yaml(const DlioTimingBlock* computation, + const DlioTimingBlock* preprocess) { + YAML::Node root; + if (computation) { + YAML::Node train; + train["computation_time"] = emit_block(*computation); + root["train"] = train; + } + if (preprocess) { + YAML::Node reader; + reader["preprocess_time"] = emit_block(*preprocess); + root["reader"] = reader; + } + YAML::Emitter emit; + 
emit.SetIndent(2); + emit.SetMapFormat(YAML::Block); + emit.SetSeqFormat(YAML::Block); + emit << root; + return std::string(emit.c_str()); +} + +bool write_dlio_yaml(std::ostream& out, const DlioTimingBlock* computation, + const DlioTimingBlock* preprocess) { + out << render_dlio_yaml(computation, preprocess); + out << "\n"; + return static_cast(out); +} + +} // namespace dftracer::utils::utilities::dlio diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f6c2ea51..6a58eef2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -203,6 +203,7 @@ set(TEST_BINARY_SOURCES binaries/test_dftracer_stats.cpp binaries/test_dftracer_index.cpp binaries/test_dftracer_aggregator.cpp + binaries/test_dftracer_gen_dlio_config.cpp binaries/test_dftracer_organize.cpp binaries/test_dftracer_view.cpp binaries/test_dftracer_tar.cpp @@ -306,6 +307,9 @@ foreach(test_file ${TEST_BINARY_SOURCES}) elseif(bin_exec STREQUAL "binaries/test_dftracer_aggregator") set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT "DFTRACER_AGGREGATOR_PATH=$") + elseif(bin_exec STREQUAL "binaries/test_dftracer_gen_dlio_config") + set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT + "DFTRACER_GEN_DLIO_CONFIG_PATH=$") elseif(bin_exec STREQUAL "binaries/test_dftracer_organize") set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT "DFTRACER_ORGANIZE_PATH=$;DFTRACER_RECONSTRUCT_PATH=$") diff --git a/tests/binaries/test_dftracer_gen_dlio_config.cpp b/tests/binaries/test_dftracer_gen_dlio_config.cpp new file mode 100644 index 00000000..13299308 --- /dev/null +++ b/tests/binaries/test_dftracer_gen_dlio_config.cpp @@ -0,0 +1,222 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +// Produce a .pfw.gz file containing realistic DLIO trace events: a mix of +// (cat=dataloader, name=fetch.block / fetch.iter) and (cat=data, +// name=preprocess) 
duration events spread across `num_ranks` pids. Each rank +// gets `events_per_rank` of each kind. +std::string create_dlio_pfw_gz(dft_utils_test::TestEnvironment& env, int id, + int num_ranks, int events_per_rank) { + const std::string plain_path = + env.get_dir() + "/dlio_trace_" + std::to_string(id) + ".pfw"; + { + std::ofstream ofs(plain_path); + if (!ofs.is_open()) return ""; + ofs << "[\n"; + std::uint64_t event_id = 1; + const std::uint64_t base_ts = 1000000000ULL; + for (int rank = 0; rank < num_ranks; ++rank) { + const std::uint64_t pid = 1000 + static_cast(rank); + const std::uint64_t tid = 2000 + static_cast(rank); + std::uint64_t ts = base_ts + rank * 100000ULL; + for (int i = 0; i < events_per_rank; ++i) { + // fetch.block: lognormal-ish via varying durations. + const std::uint64_t fb_dur = 100 + (i * 7) % 500; + ofs << R"({"id":)" << event_id++ << R"(,"pid":)" << pid + << R"(,"tid":)" << tid + << R"(,"name":"fetch.block","cat":"dataloader")" + << R"(,"ph":"X","ts":)" << ts << R"(,"dur":)" << fb_dur + << R"(,"args":{"hhash":"h1"}})" + << "\n"; + ts += fb_dur; + + // fetch.iter: shorter durations. + const std::uint64_t fi_dur = 50 + (i * 3) % 100; + ofs << R"({"id":)" << event_id++ << R"(,"pid":)" << pid + << R"(,"tid":)" << tid + << R"(,"name":"fetch.iter","cat":"dataloader")" + << R"(,"ph":"X","ts":)" << ts << R"(,"dur":)" << fi_dur + << R"(,"args":{"hhash":"h1"}})" + << "\n"; + ts += fi_dur; + + // preprocess: emitted by a worker pid (different from main). + const std::uint64_t worker_pid = pid + 100000ULL; + const std::uint64_t pre_dur = 80 + (i * 5) % 200; + ofs << R"({"id":)" << event_id++ << R"(,"pid":)" << worker_pid + << R"(,"tid":)" << tid + << R"(,"name":"preprocess","cat":"data")" + << R"(,"ph":"X","ts":)" << ts << R"(,"dur":)" << pre_dur + << R"(,"args":{"hhash":"h1"}})" + << "\n"; + ts += pre_dur; + } + } + ofs << "]\n"; + } + + // Compress to .pfw.gz. 
// Run `binary` with `args` as a child process and report how it exited.
//
// binary  path handed to execv (no PATH lookup); also used as argv[0]
// args    remaining argv entries, in order
//
// Returns the child's exit status when it exited normally. The child exits
// with 127 when execv itself fails. Returns -1 when fork fails, when waiting
// for the child fails, or when the child did not exit normally (for example
// it was killed by a signal).
int run_binary(const std::string& binary,
               const std::vector<std::string>& args) {
    // Build argv before forking so the child only has to call execv/_exit
    // and never allocates between fork and exec.
    std::vector<const char*> argv;
    argv.reserve(args.size() + 2);
    argv.push_back(binary.c_str());
    for (const auto& arg : args) argv.push_back(arg.c_str());
    argv.push_back(nullptr);

    const pid_t pid = ::fork();
    if (pid < 0) return -1;
    if (pid == 0) {
        ::execv(binary.c_str(), const_cast<char* const*>(argv.data()));
        ::_exit(127);  // only reached when execv failed
    }

    int status = 0;
    // A failed wait leaves `status` at 0, which WIFEXITED would read as a
    // clean exit with code 0; report the failure instead.
    if (::waitpid(pid, &status, 0) != pid) return -1;
    if (WIFEXITED(status)) return WEXITSTATUS(status);
    return -1;
}
Set " + "DFTRACER_GEN_DLIO_CONFIG_PATH to locate it."); + return; + } + CHECK(!binary.empty()); + } + + TEST_CASE("--help exits 0") { + const auto binary = find_binary(); + if (binary.empty()) return; + CHECK(run_binary(binary, {"--help"}) == 0); + } + + TEST_CASE("missing --output rejected") { + const auto binary = find_binary(); + if (binary.empty()) return; + // Pointing -d at a valid empty dir but no -o; argparse should fail. + dft_utils_test::TestEnvironment env(10); + REQUIRE(env.is_valid()); + CHECK(run_binary(binary, {"-d", env.get_dir()}) != 0); + } + + TEST_CASE("directory without DLIO events fails gracefully") { + const auto binary = find_binary(); + if (binary.empty()) return; + dft_utils_test::TestEnvironment env(50); + REQUIRE(env.is_valid()); + + // Generic POSIX trace (no fetch.block/preprocess events). + auto trace_gz = env.create_dft_test_gzip_file(50); + REQUIRE(!trace_gz.empty()); + + const std::string out = env.get_dir() + "/dlio_config.yaml"; + const int rc = run_binary(binary, {"-d", env.get_dir(), "-o", out}); + // Non-zero exit and no YAML produced. + CHECK(rc != 0); + CHECK_FALSE(fs::exists(out)); + } + + TEST_CASE( + "happy path: DLIO traces produce a valid YAML with train + reader") { + const auto binary = find_binary(); + if (binary.empty()) return; + dft_utils_test::TestEnvironment env(200); + REQUIRE(env.is_valid()); + + for (int i = 0; i < 2; ++i) { + auto f = create_dlio_pfw_gz(env, i, /*num_ranks=*/2, + /*events_per_rank=*/200); + REQUIRE(!f.empty()); + } + + const std::string out = env.get_dir() + "/dlio_config.yaml"; + const int rc = run_binary(binary, {"-d", env.get_dir(), "-o", out, + "--simulation-iterations", "2"}); + CHECK(rc == 0); + REQUIRE(fs::exists(out)); + + const std::string contents = read_file(out); + // Spot-check the YAML schema. Don't pin specific distribution choice + // (fitter may pick any of single/GMM-2/GMM-3 depending on data). 
+ CHECK(contents.find("train:") != std::string::npos); + CHECK(contents.find("computation_time:") != std::string::npos); + CHECK(contents.find("reader:") != std::string::npos); + CHECK(contents.find("preprocess_time:") != std::string::npos); + CHECK(contents.find("type:") != std::string::npos); + CHECK(contents.find("max_bound:") != std::string::npos); + } + + TEST_CASE("respects --num-workers and --prefetch-factor") { + const auto binary = find_binary(); + if (binary.empty()) return; + dft_utils_test::TestEnvironment env(200); + REQUIRE(env.is_valid()); + + auto f = create_dlio_pfw_gz(env, 0, /*num_ranks=*/1, + /*events_per_rank=*/150); + REQUIRE(!f.empty()); + + const std::string out = env.get_dir() + "/dlio_config.yaml"; + const int rc = run_binary( + binary, {"-d", env.get_dir(), "-o", out, "--num-workers", "4", + "--prefetch-factor", "1", "--simulation-iterations", "2"}); + CHECK(rc == 0); + CHECK(fs::exists(out)); + } +} diff --git a/tests/utilities/CMakeLists.txt b/tests/utilities/CMakeLists.txt index c3ab418f..fbc09bbf 100644 --- a/tests/utilities/CMakeLists.txt +++ b/tests/utilities/CMakeLists.txt @@ -56,6 +56,13 @@ set(UTILITIES_TEST_SOURCES composites/dft/statistics/test_statistics_query.cpp common/statistics/test_log2_histogram.cpp common/statistics/test_timestamp_histogram.cpp + + # Distribution fitting + common/statistics/test_distributions.cpp + common/statistics/test_mixture.cpp + + # DLIO config generation + dlio/test_barrier_simulator.cpp composites/dft/statistics/test_detailed_statistics.cpp # Query language diff --git a/tests/utilities/common/statistics/test_distributions.cpp b/tests/utilities/common/statistics/test_distributions.cpp new file mode 100644 index 00000000..b2f34518 --- /dev/null +++ b/tests/utilities/common/statistics/test_distributions.cpp @@ -0,0 +1,174 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include + +#include +#include +#include +#include + +using namespace dftracer::utils::utilities::common::statistics; 
// Draw `n` values by calling `f` repeatedly on one 64-bit Mersenne Twister
// seeded with `seed`.
//
// n     number of samples to produce
// f     callable invoked once per sample, in order, with the shared engine
// seed  engine seed (defaults to 12345), so results are reproducible
//
// Returns the samples in generation order.
std::vector<double> generate_samples(
    std::size_t n, std::function<double(std::mt19937_64&)> f,
    std::uint64_t seed = 12345) {
    std::mt19937_64 engine(seed);
    std::vector<double> samples(n);
    // Slots are filled front to back, so `f` sees the engine in the same
    // sequence of states as an append loop would give it.
    for (double& slot : samples) {
        slot = f(engine);
    }
    return samples;
}
doctest::Approx(0.3).epsilon(0.05)); + CHECK(fit.ks_stat < 0.05); + } + + TEST_CASE("Weibull: recovers shape and scale within 5%") { + auto data = generate_samples(5000, [](auto& r) { + return std::weibull_distribution(1.5, 2.0)(r); + }); + const auto fit = + fit_single_distribution(DistributionKind::Weibull, data); + REQUIRE(fit.valid); + CHECK(fit.params[0] == doctest::Approx(1.5).epsilon(0.05)); + CHECK(fit.params[1] == doctest::Approx(2.0).epsilon(0.05)); + CHECK(fit.ks_stat < 0.05); + } + + TEST_CASE("Lognormal: rejects non-positive data") { + std::vector data{1.0, -2.0, 3.0, 4.0}; + const auto fit = + fit_single_distribution(DistributionKind::Lognormal, data); + CHECK_FALSE(fit.valid); + } + + TEST_CASE("Normal: rejects too few samples") { + std::vector data{1.0}; + const auto fit = + fit_single_distribution(DistributionKind::Normal, data); + CHECK_FALSE(fit.valid); + } +} + +TEST_SUITE("fit_all_single_distributions") { + TEST_CASE("ranks correct family at the top") { + auto data = generate_samples(2000, [](auto& r) { + return std::lognormal_distribution(0.0, 0.4)(r); + }); + const auto fits = fit_all_single_distributions(data); + REQUIRE_FALSE(fits.empty()); + const auto best = best_fit_by_ks(fits); + REQUIRE(best.has_value()); + CHECK(best->kind == DistributionKind::Lognormal); + } + + TEST_CASE("all valid fits sorted ascending by KS") { + auto data = generate_samples(2000, [](auto& r) { + return std::gamma_distribution(2.0, 0.5)(r); + }); + const auto fits = fit_all_single_distributions(data); + REQUIRE(fits.size() == 5); + double last_ks = -1.0; + for (const auto& f : fits) { + if (!f.valid) break; + CHECK(f.ks_stat >= last_ks); + last_ks = f.ks_stat; + } + } +} + +TEST_SUITE("FittedDistribution pdf/cdf/quantile") { + TEST_CASE("Normal cdf matches Gaussian quantiles") { + FittedDistribution fit{ + DistributionKind::Normal, {0.0, 1.0, 0.0}, 0.0, 0.0, 0.0, true}; + // 97.5th percentile of standard normal ~ 1.96. 
+ CHECK(quantile(fit, 0.975) == doctest::Approx(1.959964).epsilon(1e-3)); + CHECK(cdf(fit, 0.0) == doctest::Approx(0.5)); + } + + TEST_CASE("Lognormal quantile at median") { + FittedDistribution fit{ + DistributionKind::Lognormal, {0.0, 1.0, 0.0}, 0.0, 0.0, 0.0, true}; + CHECK(quantile(fit, 0.5) == doctest::Approx(1.0).epsilon(1e-6)); + } +} + +TEST_SUITE("make_sampler") { + TEST_CASE("Normal sampler reproduces fit mean within tolerance") { + FittedDistribution fit{ + DistributionKind::Normal, {1.0, 0.2, 0.0}, 0.0, 0.0, 0.0, true}; + auto sampler = make_sampler(fit); + Rng rng(99); + double sum = 0.0; + const int n = 4000; + for (int i = 0; i < n; ++i) sum += sampler(rng); + const double mean = sum / n; + CHECK(mean == doctest::Approx(1.0).epsilon(0.05)); + } + + TEST_CASE("clamps to provided bounds") { + FittedDistribution fit{ + DistributionKind::Normal, {0.0, 1.0, 0.0}, 0.0, 0.0, 0.0, true}; + auto sampler = make_sampler(fit, /*min_bound=*/-0.5, /*max_bound=*/0.5); + Rng rng(7); + for (int i = 0; i < 200; ++i) { + const double s = sampler(rng); + CHECK(s >= -0.5); + CHECK(s <= 0.5); + } + } + + TEST_CASE("throws on invalid fit") { + FittedDistribution fit; // valid = false + CHECK_THROWS_AS(make_sampler(fit), std::invalid_argument); + } +} diff --git a/tests/utilities/common/statistics/test_mixture.cpp b/tests/utilities/common/statistics/test_mixture.cpp new file mode 100644 index 00000000..f268beec --- /dev/null +++ b/tests/utilities/common/statistics/test_mixture.cpp @@ -0,0 +1,176 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include +#include + +#include +#include +#include +#include +#include + +using namespace dftracer::utils::utilities::common::statistics; + +namespace { + +// Sample n points from a K-component Normal mixture with given +// weights/means/stddevs. 
// Draw `n` values from a mixture of Normal components.
//
// weights  relative component weights fed to std::discrete_distribution
// means    per-component mean, indexed by the drawn component
// stddevs  per-component standard deviation, indexed the same way
// seed     seed for the 64-bit Mersenne Twister (defaults to 4242)
//
// `means` and `stddevs` are indexed without bounds checks, so they must be
// at least as long as `weights`.
std::vector<double> sample_mixture(std::size_t n,
                                   const std::vector<double>& weights,
                                   const std::vector<double>& means,
                                   const std::vector<double>& stddevs,
                                   std::uint64_t seed = 4242) {
    std::mt19937_64 engine(seed);
    std::discrete_distribution<int> pick_component(weights.begin(),
                                                   weights.end());
    std::vector<double> samples(n);
    for (double& slot : samples) {
        // Component choice first, then the Normal draw, both from the same
        // engine. A fresh normal_distribution is built for every draw.
        const int component = pick_component(engine);
        std::normal_distribution<double> gaussian(means[component],
                                                  stddevs[component]);
        slot = gaussian(engine);
    }
    return samples;
}
} + + TEST_CASE("rejects too few samples") { + std::vector data{1.0, 2.0}; + const auto fit = fit_gaussian_mixture(data, 2); + CHECK_FALSE(fit.valid); + } +} + +TEST_SUITE("FittedMixture pdf/cdf/sampler") { + TEST_CASE("pdf matches hand-computed value") { + FittedMixture m; + m.weights = {0.5, 0.5}; + m.components = {{0.0, 1.0}, {2.0, 1.0}}; + m.valid = true; + // pdf at x=1: 0.5 * N(1; 0, 1) + 0.5 * N(1; 2, 1) + // = 0.5 * 0.24197 + 0.5 * 0.24197 = 0.24197 + CHECK(pdf(m, 1.0) == doctest::Approx(0.24197).epsilon(1e-4)); + } + + TEST_CASE("cdf is monotone and bounded") { + FittedMixture m; + m.weights = {0.3, 0.7}; + m.components = {{-1.0, 0.5}, {1.0, 0.5}}; + m.valid = true; + CHECK(cdf(m, -10.0) == doctest::Approx(0.0).epsilon(1e-6)); + CHECK(cdf(m, 10.0) == doctest::Approx(1.0).epsilon(1e-6)); + CHECK(cdf(m, 0.0) > cdf(m, -1.0)); + } + + TEST_CASE("sampler reproduces mixture mean within tolerance") { + FittedMixture m; + m.weights = {0.5, 0.5}; + m.components = {{1.0, 0.2}, {3.0, 0.2}}; + m.valid = true; + auto sampler = make_sampler(m); + std::mt19937_64 rng(17); + double sum = 0.0; + const int n = 6000; + for (int i = 0; i < n; ++i) sum += sampler(rng); + // True mixture mean = 0.5*1 + 0.5*3 = 2.0 + CHECK(sum / n == doctest::Approx(2.0).epsilon(0.05)); + } +} + +TEST_SUITE("select_best_model") { + TEST_CASE("picks mixture for bimodal data") { + auto data = sample_mixture(4000, {0.5, 0.5}, {0.0, 4.0}, {0.3, 0.4}, + /*seed=*/22); + const auto singles = fit_all_single_distributions(data); + std::vector mixes{fit_gaussian_mixture(data, 2), + fit_gaussian_mixture(data, 3)}; + const auto best = select_best_model(singles, mixes); + REQUIRE(best.has_value()); + // For clearly bimodal data, no single dist should win. 
+ CHECK(std::holds_alternative(best->model)); + } + + TEST_CASE("picks single for unimodal Normal data") { + std::mt19937_64 rng(33); + std::vector data; + data.reserve(3000); + for (int i = 0; i < 3000; ++i) + data.push_back(std::normal_distribution(2.0, 0.5)(rng)); + const auto singles = fit_all_single_distributions(data); + std::vector mixes{fit_gaussian_mixture(data, 2), + fit_gaussian_mixture(data, 3)}; + const auto best = select_best_model(singles, mixes); + REQUIRE(best.has_value()); + // With BIC's parameter penalty, a single Normal should beat GMM-2/3. + CHECK(std::holds_alternative(best->model)); + } + + TEST_CASE("empty inputs return nullopt") { + const auto best = select_best_model({}, {}); + CHECK_FALSE(best.has_value()); + } +} + +TEST_SUITE("BestModel variant dispatch") { + TEST_CASE("pdf dispatches through variant") { + FittedDistribution f{ + DistributionKind::Normal, {0.0, 1.0, 0.0}, 0.0, 0.0, 0.0, true}; + BestModel bm{f}; + // pdf(std normal, 0) = 1/sqrt(2*pi) ~ 0.39894 + CHECK(pdf(bm, 0.0) == doctest::Approx(0.39894).epsilon(1e-3)); + } + + TEST_CASE("sampler dispatches through variant") { + FittedMixture m; + m.weights = {1.0}; + m.components = {{5.0, 0.1}}; + m.valid = true; + BestModel bm{m}; + auto s = make_sampler(bm); + std::mt19937_64 rng(55); + const double draw = s(rng); + CHECK(draw == doctest::Approx(5.0).epsilon(0.1)); + } +} diff --git a/tests/utilities/dlio/test_barrier_simulator.cpp b/tests/utilities/dlio/test_barrier_simulator.cpp new file mode 100644 index 00000000..385548f9 --- /dev/null +++ b/tests/utilities/dlio/test_barrier_simulator.cpp @@ -0,0 +1,241 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include +#include +#include + +#include +#include + +using namespace dftracer::utils::utilities::dlio; + +namespace { + +Statistic constant_stat(double value) { + Statistic s; + s.update(value); + return s; +} + +constexpr double kEpsilon = 1e-9; + +} // namespace + +TEST_SUITE("sweep_union") { + TEST_CASE("empty") { 
+ std::vector b; + CHECK(sweep_union(b) == doctest::Approx(0.0)); + } + + TEST_CASE("single interval - returns its width in seconds") { + // 1 second = 1,000,000 us. + std::vector b{{0, +1}, {1'000'000, -1}}; + CHECK(sweep_union(b) == doctest::Approx(1.0)); + } + + TEST_CASE("two disjoint intervals - sum of widths") { + std::vector b{ + {0, +1}, {500'000, -1}, {1'000'000, +1}, {1'500'000, -1}}; + CHECK(sweep_union(b) == doctest::Approx(1.0)); + } + + TEST_CASE("two overlapping intervals - merged width") { + // [0, 1s] and [0.5s, 1.5s] -> union is [0, 1.5s] = 1.5s. + std::vector b{ + {0, +1}, {1'000'000, -1}, {500'000, +1}, {1'500'000, -1}}; + CHECK(sweep_union(b) == doctest::Approx(1.5)); + } + + TEST_CASE("nested intervals - outer width") { + // [0, 2s] contains [0.5s, 1.5s]. + std::vector b{ + {0, +1}, {2'000'000, -1}, {500'000, +1}, {1'500'000, -1}}; + CHECK(sweep_union(b) == doctest::Approx(2.0)); + } +} + +TEST_SUITE("cdf_similarity") { + TEST_CASE("identical samples - exactly 1.0") { + std::vector a{0.1, 0.2, 0.3, 0.4}; + CHECK(cdf_similarity(a, a) == doctest::Approx(1.0)); + } + + TEST_CASE("fully disjoint samples") { + // a in [0, 0.4], b in [10, 10.4]: at every value v < 10 the trace CDF + // is 0 while sim CDF is 1 -> KS = 1, similarity = 0. + std::vector a{0.1, 0.2, 0.3, 0.4}; + std::vector b{10.0, 10.1, 10.2, 10.4}; + CHECK(cdf_similarity(a, b) == doctest::Approx(0.0)); + } + + TEST_CASE("empty inputs") { + std::vector a{1.0, 2.0}; + std::vector empty; + CHECK(cdf_similarity(a, empty) == doctest::Approx(0.0)); + CHECK(cdf_similarity(empty, a) == doctest::Approx(0.0)); + } +} + +TEST_SUITE("variance") { + TEST_CASE("empty") { CHECK(variance({}) == doctest::Approx(0.0)); } + + TEST_CASE("constant values - zero variance") { + CHECK(variance({3.0, 3.0, 3.0}) == doctest::Approx(0.0)); + } + + TEST_CASE("known sample") { + // Population variance of {1, 2, 3, 4, 5} = 2.0. 
+ CHECK(variance({1.0, 2.0, 3.0, 4.0, 5.0}) == doctest::Approx(2.0)); + } +} + +TEST_SUITE("WorkerQueue") { + TEST_CASE("produce_batches fills exactly to capacity") { + WorkerQueue q(/*num_workers=*/2, /*prefetch_factor=*/3); + auto sampler = []() -> std::pair { return {1.0, 0.5}; }; + auto intervals = q.produce_batches(0.0, sampler); + CHECK(intervals.size() == 6); + CHECK(q.queue_depth() == 6); + } + + TEST_CASE("consume_batch on ready batch returns base_overhead") { + WorkerQueue q(1, 1); + auto sampler = []() -> std::pair { return {2.0, 1.0}; }; + q.produce_batches(0.0, sampler); + const double consumed = + q.consume_batch(/*current_time=*/5.0, /*base_overhead=*/0.1); + CHECK(consumed == doctest::Approx(0.1)); + CHECK(q.queue_depth() == 0); + CHECK_FALSE(q.had_stall()); + } + + TEST_CASE("consume_batch on empty queue counts as stall") { + WorkerQueue q(1, 1); + const double consumed = q.consume_batch(0.0, 0.1); + CHECK(consumed == doctest::Approx(0.1)); + CHECK(q.had_stall()); + CHECK(q.stall_count() == 1); + } + + TEST_CASE( + "consume_batch on not-yet-ready batch returns wait + base_overhead") { + WorkerQueue q(1, 1); + auto sampler = []() -> std::pair { + return {10.0, 5.0}; + }; + q.produce_batches(0.0, sampler); + const double consumed = + q.consume_batch(/*current_time=*/3.0, /*base_overhead=*/0.2); + // Batch ready at 10.0, current 3.0 -> wait 7.0 + base 0.2 = 7.2. + CHECK(consumed == doctest::Approx(7.2)); + CHECK(q.had_stall()); + } +} + +TEST_SUITE("BarrierSimulator") { + // Build a context with deterministic constant samplers and trace data. + // Each rank runs num_steps iterations with constant fetch_iter=0.05s, + // constant fetch_block=0.1s, no preprocess simulation, aggregated mode. 
+ static BarrierSimulatorContext make_ctx(int num_ranks, int num_steps) { + BarrierSimulatorContext ctx; + ctx.num_ranks = num_ranks; + ctx.num_steps = num_steps; + ctx.is_aggregated_trace = true; // skip clamp-to-trace-bounds + ctx.sync_mode = false; + + // Pre-loaded fetch_iter trace lets the simulator skip RNG-driven path. + ctx.fetch_iter_trace.assign(num_ranks, + std::vector(num_steps, 0.05)); + // Trace fetch_block values (used by CDF similarity check). + ctx.fetch_block_trace.assign(num_ranks, + std::vector(num_steps, 0.1)); + + ctx.fetch_iter_stats = constant_stat(0.05); + ctx.fetch_block_stats = constant_stat(0.1); + ctx.preprocess_stats = constant_stat(0.01); + ctx.getitem_stats = constant_stat(0.02); + + ctx.trace_e2e_duration = num_steps * (0.05 + 0.1); + ctx.trace_rank_variance = 0.0; + ctx.trace_per_rank_throughput.assign(num_ranks, 1.0 / 0.15); + return ctx; + } + + TEST_CASE("async mode, constant work - e2e equals per-rank wall clock") { + const int num_ranks = 4; + const int num_steps = 10; + auto ctx = make_ctx(num_ranks, num_steps); + + BarrierSimulator sim; + auto result = + sim.simulate(ctx, /*base_seed=*/42, + /*fetch_block_sampler=*/[](Rng&) { return 0.1; }); + + const double expected_per_rank = num_steps * (0.05 + 0.1); // 1.5s + CHECK(result.per_rank_completion_time.size() == + static_cast(num_ranks)); + for (double t : result.per_rank_completion_time) { + CHECK(t == doctest::Approx(expected_per_rank).epsilon(kEpsilon)); + } + // All ranks run in lockstep over the same wall clock; union = per-rank + // time. + CHECK(result.e2e_duration == + doctest::Approx(expected_per_rank).epsilon(1e-6)); + CHECK(result.rank_variance == doctest::Approx(0.0)); + CHECK(result.load_imbalance == doctest::Approx(0.0).epsilon(1e-6)); + CHECK(result.e2e_error == doctest::Approx(0.0).epsilon(1e-6)); + + // Simulated fetch_block matches trace exactly -> CDF similarity == 1.0. 
+ CHECK(result.fetch_block_cdf_similarity == doctest::Approx(1.0)); + + // Component accumulated times. + CHECK(result.fetch_block_metrics.accumulated_time == + doctest::Approx(num_ranks * num_steps * 0.1)); + CHECK(result.fetch_iter_metrics.accumulated_time == + doctest::Approx(num_ranks * num_steps * 0.05)); + CHECK(result.fetch_block_metrics.num_samples == + static_cast(num_ranks * num_steps)); + } + + TEST_CASE("sync mode with barrier every step - lockstep advance") { + const int num_ranks = 3; + const int num_steps = 5; + auto ctx = make_ctx(num_ranks, num_steps); + ctx.sync_mode = true; + ctx.accumulate_grad_batches = 1; + + BarrierSimulator sim; + auto result = + sim.simulate(ctx, /*base_seed=*/123, [](Rng&) { return 0.1; }); + + const double expected = num_steps * 0.15; + // In sync mode all ranks finish at the same time; e2e = rank_times[0]. + CHECK(result.e2e_duration == doctest::Approx(expected).epsilon(1e-6)); + for (double t : result.per_rank_completion_time) { + CHECK(t == doctest::Approx(expected).epsilon(kEpsilon)); + } + // Constant work + barrier -> zero barrier overhead. 
+ CHECK(result.avg_barrier_overhead == doctest::Approx(0.0)); + CHECK(result.max_barrier_overhead == doctest::Approx(0.0)); + } + + TEST_CASE( + "throughput metrics populated when trace throughput is provided") { + auto ctx = make_ctx(/*num_ranks=*/2, /*num_steps=*/8); + BarrierSimulator sim; + auto result = sim.simulate(ctx, 7, [](Rng&) { return 0.1; }); + + const double expected_throughput = 8.0 / (8 * 0.15); + CHECK(result.simulated_per_rank_throughput.size() == 2); + for (double tp : result.simulated_per_rank_throughput) { + CHECK(tp == doctest::Approx(expected_throughput)); + } + CHECK(result.throughput_mean == doctest::Approx(expected_throughput)); + CHECK(result.throughput_variance == doctest::Approx(0.0).epsilon(1e-9)); + // CDF similarity is computed but not asserted exactly: simulated + // throughput can differ from trace throughput by ~1 ULP due to + // accumulation order, and KS is exact so a 1-ULP gap collapses + // similarity to 0. Just check it ran. + CHECK(result.trace_per_rank_throughput.size() == 2); + } +} From eafcd87412bcf35508e985ce53e51d08322d603c Mon Sep 17 00:00:00 2001 From: Ray Andrew Date: Mon, 18 May 2026 22:21:18 -0500 Subject: [PATCH 2/2] chore(cods): update DLIO generator docs --- docs/source/cli.rst | 95 ++++++++++++++++++- docs/source/pipeline.rst | 2 +- docs/source/utilities.rst | 32 ++++++- docs/source/utilities/common.rst | 101 +++++++++++++++++++- docs/source/utilities/dlio.rst | 157 +++++++++++++++++++++++++++++++ 5 files changed, 377 insertions(+), 10 deletions(-) create mode 100644 docs/source/utilities/dlio.rst diff --git a/docs/source/cli.rst b/docs/source/cli.rst index e4b74464..1b34d463 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -13,7 +13,7 @@ Most tools wire in a common set of argument schemas defined in semantics across every binary that exposes the relevant schema and are not repeated in each tool's section. 
-**Pipeline (``PipelineArgs``)** +**Pipeline** (``PipelineArgs``) - ``--executor-threads `` - Number of worker threads for parallel processing (default: number of CPU cores) @@ -21,19 +21,19 @@ repeated in each tool's section. cores) - ``--time-profiling`` - Print stage timing breakdown to stderr -**Indexing (``IndexingArgs``)** +**Indexing** (``IndexingArgs``) - ``--index-dir `` - Directory for ``.dftindex`` stores - ``--checkpoint-size `` - Checkpoint size for gzip indexing in bytes (default: 33554432 B / 32 MB) - ``-f, --force`` - Force index recreation -**Query (``QueryArgs``)** +**Query** (``QueryArgs``) - ``--query `` - Query DSL filter (e.g., ``'cat == "POSIX" and dur > 1000'``) -**Watchdog (``WatchdogArgs``)** +**Watchdog** (``WatchdogArgs``) - ``--disable-watchdog`` - Disable watchdog for hang detection - ``--watchdog-global-timeout `` - Watchdog global timeout for pipeline @@ -49,7 +49,7 @@ repeated in each tool's section. - ``--watchdog-deadlock-timeout `` - Watchdog deadlock timeout in seconds (0 = use default, default: 600) -**Inputs (``DirectoryArgs`` / ``FilesArgs``)** +**Inputs** (``DirectoryArgs`` / ``FilesArgs``) - ``-d, --directory `` - Directory containing trace files - ``--files `` - Trace files (``.pfw``, ``.pfw.gz``) @@ -516,6 +516,91 @@ The Arrow output always includes the base columns ``batch_type``, ``cat``, import duckdb result = duckdb.sql("SELECT * FROM 'agg.arrows'") +dftracer_gen_dlio_config +------------------------ + +**Description:** Generate a DLIO YAML configuration directly from a directory +of raw DFTracer traces. The tool indexes the inputs, aggregates them into the +internal ``AGGREGATION`` column family (DDSketch forced on), fits per-component +distributions, refines ``max_bound`` against an internal barrier simulator, and +emits a DLIO ``train.computation_time`` + ``reader.preprocess_time`` block. The +user does not need to run ``dftracer_aggregator`` separately. 
+ +Required input event names: ``cat=dataloader`` with ``name=fetch.block`` / +``fetch.iter``, and ``cat=data`` with ``name=preprocess`` / ``item``. The tool +exits non-zero with an explanatory message if no DLIO events are present. + +**Usage:** + +.. code-block:: bash + + dftracer_gen_dlio_config [OPTIONS] -o + +**Options:** + +- ``-d, --directory `` - Input directory containing .pfw or .pfw.gz traces (default: .) +- ``-o, --output `` - Output path for the DLIO YAML config [required] +- ``--max-bound-percentile `` - Initial max_bound percentile, 0-100 (default: 95) +- ``--simulation-iterations `` - Max simulator iterations for percentile refinement (default: 5) +- ``--target-e2e-error `` - Target relative E2E error to declare convergence (default: 0.05) +- ``--target-cdf-similarity `` - Target fetch_block CDF similarity (default: 0.90) +- ``--patience `` - Early-stop after this many iterations without improvement (default: 10) +- ``--epsilon `` - Base step size for percentile adjustment (default: 1.0) +- ``--momentum `` - Momentum factor in [0, 1) (default: 0.9) +- ``--min-percentile `` - Floor on max_bound percentile during optimization (default: 50) +- ``--num-workers `` - DataLoader worker count for the simulator (default: 8) +- ``--prefetch-factor `` - DataLoader prefetch factor (default: 2) +- ``--seed `` - Base seed for simulator and sampler (default: 42) +- ``--max-samples-per-entry `` - Cap on synthesized samples per aggregation entry; 0 disables (default: 100) +- ``-t, --time-interval `` - Aggregation time interval in ms (default: 5000) +- ``--index-dir `` - Directory for the shared index store (default: system temp dir) +- ``--checkpoint-size `` - Checkpoint size for indexing in bytes (default: 33554432 B / 32 MB) +- ``--executor-threads `` - Number of executor threads for parallel processing +- ``-f, --force`` - Force index recreation + +**Distribution pool:** Each component is fit as the lowest-BIC choice among +{Normal, Lognormal, Gamma, Exponential, 
Weibull, Gaussian Mixture (K=2), +Gaussian Mixture (K=3)}. Mixture candidates are only considered when the +sample count is at least 20. + +**Example:** + +.. code-block:: bash + + # Generate config from a directory of raw traces + dftracer_gen_dlio_config -d ./traces -o dlio_config.yaml + + # Refine harder against the simulator with a tighter convergence target + dftracer_gen_dlio_config -d ./traces -o dlio_config.yaml \ + --simulation-iterations 20 --target-e2e-error 0.02 --patience 5 + + # Reuse a shared index directory across runs to skip re-indexing + dftracer_gen_dlio_config -d ./traces -o dlio_config.yaml \ + --index-dir /var/cache/dftracer/idx + +**Output schema:** + +.. code-block:: yaml + + train: + computation_time: + type: + # single distribution: per-family params (mean/stdev, mu/sigma, + # shape/scale, rate) + # mixture: n_components + components: [{weight, params: {type, ...}}] + max_bound: + reader: + preprocess_time: + # same structure + +**Comparing against an external generator:** ``scripts/compare_dlio_yamls.py`` +diffs two DLIO YAMLs with a tolerance check on parameters and a two-sample +Kolmogorov-Smirnov check on samples drawn from each fit. Run via ``uv run +scripts/compare_dlio_yamls.py --python --cpp `` (the inline +PEP-723 metadata installs ``pyyaml`` and ``numpy`` automatically). Same model +family + small KS = the two YAMLs would produce indistinguishable DLIO sample +streams. + dftracer_organize ----------------- diff --git a/docs/source/pipeline.rst b/docs/source/pipeline.rst index 05faad8a..49019439 100644 --- a/docs/source/pipeline.rst +++ b/docs/source/pipeline.rst @@ -818,7 +818,7 @@ Control execution duration and cooperative cancellation using ``PipelineConfig`` co_return; }); -**Timing out a race with ``when_any`` + timeout:** Use ``when_any`` with a timeout awaitable to race operations: +**Timing out a race with** ``when_any`` **+ timeout:** Use ``when_any`` with a timeout awaitable to race operations: .. 
code-block:: cpp diff --git a/docs/source/utilities.rst b/docs/source/utilities.rst index f462d079..3b393220 100644 --- a/docs/source/utilities.rst +++ b/docs/source/utilities.rst @@ -17,6 +17,7 @@ dftracer-utils provides a collection of composable utilities for trace file proc utilities/indexer utilities/reader utilities/common + utilities/dlio call-tree Overview @@ -43,8 +44,9 @@ Utilities follow a consistent pattern: Hash["Hash
FNV1a, Std, MurmurHash3"] Indexer["Indexer
Checkpoint, BloomFilter"] Reader["Reader
Stream, LineProcessor"] - Common["Common
JSON, DDSketch, Log2Histogram"] + Common["Common
JSON, DDSketch, Statistic, Distributions, Mixture"] Composites["Composites
DFTracer-specific pipelines"] + Dlio["DLIO
BarrierSimulator, TraceLoader, Optimizer, YAML emit"] end Utility --> FileIO @@ -55,6 +57,7 @@ Utilities follow a consistent pattern: Utility --> Reader Utility --> Common Utility --> Composites + Utility --> Dlio File I/O -------- @@ -71,13 +74,36 @@ See :doc:`/utilities/fileio` for detailed usage. Statistics ---------- -Enhanced statistics collection for trace analysis: +Enhanced statistics collection and distribution fitting for trace analysis: - **DDSketch**: Deterministic, merge-order-independent percentile estimation with bounded relative error - **Log2Histogram**: Fixed 65-bin logarithmic histogram for duration and size distributions +- **Statistic**: Min/max/mean/count accumulator that optionally delegates to an attached DDSketch for quantile queries +- **Distributions**: MLE fitting + KS / BIC scoring for Normal, Lognormal, Gamma, Exponential, Weibull; sampler factory backed by ```` and `Boost.Math standalone `_ +- **Mixture**: Univariate Gaussian Mixture EM (K=2, K=3) with log-sum-exp responsibilities and BIC-based selection across single + mixture models - **Chunk statistics**: Per-chunk event tracking with online variance calculation and per-name duration sketches -These are used in indexing and aggregation pipelines to compute event distributions and percentiles efficiently. +These are used in indexing and aggregation pipelines to compute event distributions and percentiles efficiently, and by the DLIO config generator to fit per-component timing distributions. + +DLIO Config Generation +---------------------- + +End-to-end pipeline that converts a directory of raw DFTracer logs into a DLIO +training-loop YAML configuration: + +- **trace_loader**: pulls the ``AGGREGATION`` column family (re-attaches the + merge operator at open time) and synthesizes per-rank sample arrays from + per-(pid, time_bucket) entries. 
+- **BarrierSimulator**: simulates one DLIO training run across the captured + ranks/steps, scoring an end-to-end duration, rank variance, and ``fetch.block`` + CDF similarity against the empirical trace. +- **optimizer**: sequential momentum loop refining the ``max_bound`` percentile + on the fitted sampler to minimize simulator E2E error. +- **yaml_emit**: renders single distributions or Gaussian mixtures into the + DLIO ``train.computation_time`` / ``reader.preprocess_time`` schema. + +See :doc:`/utilities/dlio` for the API and ``dftracer_gen_dlio_config`` in +:doc:`/cli` for the user-facing binary. Indexing -------- diff --git a/docs/source/utilities/common.rst b/docs/source/utilities/common.rst index 3b571b29..cede5cfd 100644 --- a/docs/source/utilities/common.rst +++ b/docs/source/utilities/common.rst @@ -241,13 +241,19 @@ The query AST uses ``std::variant``-based nodes: Statistics ---------- -Percentile estimation and histogram utilities for trace analysis. +Percentile estimation, histogram, accumulator, and distribution-fitting +utilities for trace analysis. .. code-block:: cpp #include #include #include + #include + #include + #include + // Or use the umbrella header: + #include DDSketch ~~~~~~~~ @@ -350,6 +356,99 @@ expansions for adaptive aggregation. auto bytes = th.serialize(); auto restored = TimestampHistogram::deserialize(bytes.data(), bytes.size()); +Statistic +~~~~~~~~~ + +Lightweight min/max/mean/count accumulator with an optional DDSketch backing +for quantile queries. When a sketch is attached, ``quantile()`` consults it; +when no sketch is present, the fallback is a uniform interpolation between +observed min and max. + +.. code-block:: cpp + + Statistic stat; + for (double v : samples) stat.update(v); + + double mean = stat.mean(); + double approx_p50 = stat.quantile(0.5); // uses linear-interp without a sketch + + // Promote to DDSketch-backed quantiles by attaching a populated sketch. 
+ auto sketch = std::make_shared(0.01); + for (double v : samples) sketch->add(v); + stat.attach_sketch(std::move(sketch)); + double real_p99 = stat.quantile(0.99); // now consults the sketch + +Distributions +~~~~~~~~~~~~~ + +Maximum-likelihood fitting for five parametric families plus a +Kolmogorov-Smirnov goodness-of-fit score and BIC. Backed by `Boost.Math standalone +`_ +for CDF/PDF/quantile evaluation; samplers use ````. + +Supported families: Normal, Lognormal, Gamma, Exponential, Weibull. + +.. code-block:: cpp + + std::vector data = ...; + + // Fit one family directly. + FittedDistribution fit = fit_single_distribution( + DistributionKind::Lognormal, data); + if (fit.valid) { + printf("lognormal mu=%.4f sigma=%.4f KS=%.4f BIC=%.2f\n", + fit.params[0], fit.params[1], fit.ks_stat, fit.bic); + } + + // Fit all five and pick the lowest-KS valid fit. + auto fits = fit_all_single_distributions(data); + if (auto best = best_fit_by_ks(fits)) { + printf("best family: %s\n", + std::string(distribution_name(best->kind)).c_str()); + } + + // Build a sampler from a fit (optionally bounded). + auto sampler = make_sampler(*best, /*min_bound=*/0.0, + /*max_bound=*/0.5); + std::mt19937_64 rng(42); + double draw = sampler(rng); + +Mixture +~~~~~~~ + +Univariate Gaussian Mixture Model fitting via EM (K=2, K=3) with log-sum-exp +responsibilities, quantile-spread initial means, and a variance floor to +prevent component collapse. Plus a BIC-based selector across single +distributions and mixtures. + +.. code-block:: cpp + + // Fit a 2-component Gaussian mixture. + FittedMixture m = fit_gaussian_mixture(data, /*K=*/2); + if (m.valid && m.converged) { + for (size_t k = 0; k < m.weights.size(); ++k) { + printf("comp %zu weight=%.3f mean=%.6f stddev=%.6f\n", + k, m.weights[k], + m.components[k].mean, m.components[k].stddev); + } + } + + // Pick the lowest-BIC model across {singles, GMM-2, GMM-3}. 
+ auto singles = fit_all_single_distributions(data); + std::vector mixes{ + fit_gaussian_mixture(data, 2), + fit_gaussian_mixture(data, 3), + }; + auto selection = select_best_model(singles, mixes); + + if (selection) { + // `BestModel` is a std::variant. + // pdf / cdf / make_sampler are overloaded and dispatch through it. + auto sampler = make_sampler(selection->model); + std::mt19937_64 rng(42); + double draw = sampler(rng); + } + Arrow ----- diff --git a/docs/source/utilities/dlio.rst b/docs/source/utilities/dlio.rst new file mode 100644 index 00000000..26b19530 --- /dev/null +++ b/docs/source/utilities/dlio.rst @@ -0,0 +1,157 @@ +DLIO Config Generation +====================== + +The ``dlio`` utilities power the ``dftracer_gen_dlio_config`` binary +(see :ref:`dftracer_gen_dlio_config ` in the CLI reference). +They consume an already-populated ``AGGREGATION`` column family (produced by +:doc:`/cli` ``dftracer_aggregator`` or by the shared +``aggregation_runner`` library function) and emit a DLIO-compatible YAML +config describing per-component timing distributions. + +.. code-block:: cpp + + #include + #include + #include + #include + #include + #include + +Pipeline overview +----------------- + +End-to-end the module composes five pieces: + +1. **trace_loader** opens an existing RocksDB read-only (with the AGGREGATION + merge operator re-attached), iterates the ``AGGREGATION`` column family, + groups entries by ``(cat, name, pid, time_bucket)``, and synthesizes a flat + per-rank sample stream per component (``fetch.block``, ``fetch.iter``, + ``preprocess``, ``item``). Sketches are used for inverse-CDF sampling when + the aggregator was run with ``--compute-percentiles``; otherwise the + per-call mean is replicated. + +2. The :doc:`distribution fitter ` (under + ``common/statistics/distributions.h`` and ``mixture.h``) fits the lowest-BIC + model from {Normal, Lognormal, Gamma, Exponential, Weibull, GMM-2, GMM-3}. + +3. 
**BarrierSimulator** simulates one DLIO training run across the captured + ranks/steps using the fitted distribution as the ``fetch.block`` sampler. + It produces an end-to-end duration, rank variance, and a Kolmogorov-Smirnov + similarity between simulated and trace ``fetch.block`` samples. + +4. **optimizer** runs a sequential momentum loop tuning the ``max_bound`` + percentile used to clamp the sampler, minimizing simulator E2E error while + keeping the CDF similarity above a target. + +5. **yaml_emit** renders the final YAML. + +trace_loader +------------ + +.. code-block:: cpp + + TraceLoaderOptions opts; + opts.max_samples_per_entry = 100; // cap per (pid, bucket) entry, 0 = unlimited + opts.seed = 0xD15710; // seed for inverse-CDF sampling + + AggregatedTraces traces = load_aggregated_traces(db_path, opts); + + if (!traces.any_data) { /* no DLIO events in this DB */ } + if (!traces.sketches_available) { + // The aggregator was run without --compute-percentiles. We fell back to + // mean replication; rerun with the flag for higher-fidelity output. + } + +Returns an ``AggregatedTraces`` with: + +- ``fetch_block_trace`` / ``fetch_iter_trace`` / ``getitem_trace`` — + per-rank ``std::vector>`` of seconds, in pid-ascending + then time-bucket-ascending order. +- ``computation_times`` / ``preprocess_times`` — flat sample arrays in seconds + (the input to ``fit_all_single_distributions``). +- ``fetch_block_stats`` / ``fetch_iter_stats`` / ``preprocess_stats`` / + ``getitem_stats`` — :doc:`Statistic ` objects, with merged DDSketches + attached when available. +- ``trace_e2e_duration`` and per-component ``ComponentTimeMetrics`` with both + ``accumulated_time`` (sum of ``count × mean``) and ``union_time`` (true + wall-clock union via ``sweep_union`` over per-entry ``(ts, te)`` boundaries). + +BarrierSimulator +---------------- + +.. 
code-block:: cpp + + BarrierSimulatorContext ctx = make_simulator_context( + traces, /*num_workers=*/8, /*prefetch_factor=*/2); + + auto sampler = make_sampler(fitted_model); // from common/statistics + BarrierSimulator sim; + BarrierSimulationResult result = sim.simulate( + ctx, /*base_seed=*/42, sampler); + + printf("e2e=%.3fs error=%.2f%% fetch_block_cdf_sim=%.4f\n", + result.e2e_duration, + result.e2e_error * 100.0, + result.fetch_block_cdf_similarity); + +Free helpers exposed alongside ``BarrierSimulator``: + +- ``sweep_union(boundaries)`` — sweep-line interval union, microseconds to + seconds. +- ``cdf_similarity(a, b)`` — ``1 − KS`` between two empirical samples. +- ``variance(values)`` — population variance. + +Distribution fitting +-------------------- + +Lives under ``common/statistics`` and works on any sample array, not just DLIO +traces — see :doc:`common` for ``FittedDistribution``, ``FittedMixture``, +``BestModel`` (the ``std::variant``), ``select_best_model``, ``make_sampler``, +and free ``pdf`` / ``cdf`` / ``quantile`` overloads. + +optimizer +--------- + +.. code-block:: cpp + + OptimizerOptions opt_opts; + opt_opts.max_iterations = 5; + opt_opts.target_e2e_error = 0.05; + opt_opts.target_cdf_similarity = 0.90; + opt_opts.patience = 10; + opt_opts.epsilon = 1.0; + opt_opts.momentum = 0.9; + opt_opts.min_percentile = 50.0; + opt_opts.initial_percentile = 95.0; + opt_opts.base_seed = 42; + + OptimizerResult opt = optimize_max_bound_percentile( + ctx, fitted_model, traces.computation_times, opt_opts); + + double max_bound = percentile(sorted_samples, opt.best_percentile); + +Each iteration constructs a fresh sampler clamped at +``percentile(sample_times, current_percentile)``, runs ``simulate()``, and +adjusts ``current_percentile`` by a momentum-smoothed step proportional to the +E2E error sign. Convergence: ``e2e_error < target_e2e_error`` AND +``fetch_block_cdf_similarity > target_cdf_similarity``. 
Early-stops after +``patience`` iterations without improvement. + +yaml_emit +--------- + +.. code-block:: cpp + + DlioTimingBlock comp{best_comp_model, comp_max_bound}; + DlioTimingBlock prep{best_prep_model, prep_max_bound}; + + std::ofstream out("dlio_config.yaml"); + write_dlio_yaml(out, &comp, &prep); + + // Or render to a string: + std::string yaml = render_dlio_yaml(&comp, &prep); + +Renders both single distributions and mixtures into the DLIO schema +(``type: `` + family-specific params, or ``type: mixture`` + +``n_components`` + ``components: [{weight, params: {...}}]``). Pass ``nullptr`` +to either block argument to omit it.