From bea9ad19a8b212aeba051e9861cf1fa7e47c57e1 Mon Sep 17 00:00:00 2001 From: Jonas Rembser Date: Fri, 20 Feb 2026 22:42:29 +0100 Subject: [PATCH 1/2] [RF] Disable redundant dirty-flag propagation during minimization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a likelihood is evaluated with the new `"cpu"` backend, the `RooFit::Evaluator` fully manages dependency tracking and re-evaluation of the computation graph. In this case, RooFit’s built-in dirty flag propagation in RooAbsArg becomes redundant and introduces significant overhead for large models. This patch disables regular dirty state propagation for all non-fundamental nodes in the Evaluator's computation graph by setting their OperMode to `RooAbsArg::ADirty`. Fundamental nodes (e.g. RooRealVar, RooCategory) are excluded because they are often shared with other computation graphs outside the Evaluator (usually the original pdf in the RooWorkspace). To set the OperMode of *all* RooAbsArgs to `ADirty` during minimization, while avoiding side effects outside the minimization scope, the dirty flag propagation for the fundamental nodes is only disabled temporarily in the RooMinimizer. This commit drastically speeds up fits with AD in particular (up to 2 x for large models), because with fast gradients, the dirty flag propagation that determines which part of the compute graph needs to be recomputed becomes the bottleneck. It was also redundant with a faster "dirty state" bookkeeping mechanism in the `RooFit::Evaluator` class itself. At this point, there is no performance regression anymore when disabling recursive dirty flag propagation for all evaluated nodes, so the old comment in the code about test 14 in stressRooFit being slow doesn't apply anymore. --- roofit/roofitcore/inc/RooEvaluatorWrapper.h | 3 +- roofit/roofitcore/inc/RooFit/Evaluator.h | 2 + roofit/roofitcore/inc/RooMinimizer.h | 1 + roofit/roofitcore/src/RooEvaluatorWrapper.cxx | 5 ++ roofit/roofitcore/src/RooFit/Evaluator.cxx | 64 ++++++++++++++++--- roofit/roofitcore/src/RooMinimizer.cxx | 48 ++++++++++---- 6 files changed, 102 insertions(+), 21 deletions(-) diff --git a/roofit/roofitcore/inc/RooEvaluatorWrapper.h b/roofit/roofitcore/inc/RooEvaluatorWrapper.h index dea80a4179602..340631e0d282d 100644 --- a/roofit/roofitcore/inc/RooEvaluatorWrapper.h +++ b/roofit/roofitcore/inc/RooEvaluatorWrapper.h @@ -71,9 +71,10 @@ class RooEvaluatorWrapper final : public RooAbsReal { void generateHessian(); void setUseGeneratedFunctionCode(bool); - void writeDebugMacro(std::string const &) const; + std::stack> setOperModes(RooAbsArg::OperMode opMode); + protected: double evaluate() const override; diff --git a/roofit/roofitcore/inc/RooFit/Evaluator.h b/roofit/roofitcore/inc/RooFit/Evaluator.h index 281e4ab7213f3..18bf0e304fac2 100644 --- a/roofit/roofitcore/inc/RooFit/Evaluator.h +++ b/roofit/roofitcore/inc/RooFit/Evaluator.h @@ -45,6 +45,8 @@ class Evaluator { void setOffsetMode(RooFit::EvalContext::OffsetMode); + std::stack> setOperModes(RooAbsArg::OperMode opMode); + private: void processVariable(NodeInfo &nodeInfo); void processCategory(NodeInfo &nodeInfo); diff --git a/roofit/roofitcore/inc/RooMinimizer.h b/roofit/roofitcore/inc/RooMinimizer.h index fcf2dfe960a64..95ff438588f3e 100644 --- a/roofit/roofitcore/inc/RooMinimizer.h +++ b/roofit/roofitcore/inc/RooMinimizer.h @@ -234,6 +234,7 @@ class RooMinimizer : public TObject { void fillCorrMatrix(RooFitResult &fitRes); void updateErrors(); + RooAbsReal &_function; ROOT::Fit::FitConfig _config; ///< fitter configuration (options and parameter settings) std::unique_ptr _result; /// _minimizer; ///writeDebugMacro(filename); } +std::stack> RooEvaluatorWrapper::setOperModes(RooAbsArg::OperMode opMode) +{ + return _evaluator->setOperModes(opMode); +} + } // namespace RooFit::Experimental /// \endcond diff --git a/roofit/roofitcore/src/RooFit/Evaluator.cxx b/roofit/roofitcore/src/RooFit/Evaluator.cxx index 0ad61d2774a26..87a7f929f76f6 100644 --- a/roofit/roofitcore/src/RooFit/Evaluator.cxx +++ b/roofit/roofitcore/src/RooFit/Evaluator.cxx @@ -46,6 +46,7 @@ RooAbsPdf::fitTo() is called and gets destroyed when the fitting ends. #include #include #include +#include namespace RooFit { @@ -325,16 +326,16 @@ void Evaluator::updateOutputSizes() for (auto &info : _nodes) { info.outputSize = outputSizeMap.at(info.absArg); info.isDirty = true; - - // In principle we don't need dirty flag propagation because the driver - // takes care of deciding which node needs to be re-evaluated. However, - // disabling it also for scalar mode results in very long fitting times - // for specific models (test 14 in stressRooFit), which still needs to be - // understood. TODO. - if (!info.isScalar()) { + // We don't need dirty flag propagation because the evaluator takes care + // of deciding what needs to be re-evaluated. We can disable the regular + // dirty state propagation. However, fundamental variables like + // RooRealVars and RooCategories are usually shared with other + // computation graphs outside the evaluator, so they can't be mutated. + // See also the code of the RooMinimizer, which ensures that dirty state + // propagation is temporarily disabled during minimization to really + // eliminate any overhead from the dirty flag propagation. + if (!info.absArg->isFundamental()) { setOperMode(info.absArg, RooAbsArg::ADirty); - } else { - setOperMode(info.absArg, info.originalOperMode); } } @@ -632,6 +633,51 @@ void Evaluator::setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode) } } +// Change the operation modes of all RooAbsArgs in the computation graph. +// The changes are reset when the returned RAII object goes out of scope. +// +// We also walk transitively through value clients of the nodes to cover any +// node that RooAbsReal::doEval (the fallback scalar implementation) might +// inadvertently propagate the ADirty mode to via its recursive restore: that +// helper sets servers temporarily to AClean and then calls +// setOperMode(oldOperMode) to restore, which recurses to value clients when +// oldOperMode is ADirty. If we did not protect those clients here, any node +// outside the computation graph that shares a fundamental (e.g. a parameter +// like a RooRealVar) would be left permanently in ADirty after the first +// minimization, dramatically slowing down later scalar evaluations (for +// example on pdfs held by the legacy test statistics' internal cache). +std::stack> Evaluator::setOperModes(RooAbsArg::OperMode opMode) +{ + std::stack> out{}; + std::unordered_set visited; + + std::vector queue; + queue.reserve(_nodes.size()); + for (auto &info : _nodes) { + queue.push_back(info.absArg); + } + + while (!queue.empty()) { + RooAbsArg *node = queue.back(); + queue.pop_back(); + if (!visited.insert(node).second) + continue; + + if (opMode != node->operMode()) { + out.emplace(std::make_unique(node, opMode)); + } + + // Only follow value-client links: that is exactly the propagation path + // used by RooAbsArg::setOperMode with mode==ADirty. + if (opMode == RooAbsArg::ADirty) { + for (auto *client : node->valueClients()) { + queue.push_back(client); + } + } + } + return out; +} + void Evaluator::print(std::ostream &os) { std::cout << "--- RooFit BatchMode evaluation ---\n"; diff --git a/roofit/roofitcore/src/RooMinimizer.cxx b/roofit/roofitcore/src/RooMinimizer.cxx index 718a13880ac2b..679a7c98a0450 100644 --- a/roofit/roofitcore/src/RooMinimizer.cxx +++ b/roofit/roofitcore/src/RooMinimizer.cxx @@ -40,27 +40,30 @@ automatic PDF optimization. #include "RooMinimizer.h" #include "RooAbsMinimizerFcn.h" -#include "RooArgSet.h" -#include "RooArgList.h" #include "RooAbsReal.h" +#include "RooArgList.h" +#include "RooArgSet.h" +#include "RooCategory.h" #include "RooDataSet.h" -#include "RooRealVar.h" -#include "RooSentinel.h" +#include "RooEvaluatorWrapper.h" +#include "RooFit/TestStatistics/RooAbsL.h" +#include "RooFit/TestStatistics/RooRealL.h" +#include "RooFitResult.h" +#include "RooHelpers.h" +#include "RooMinimizerFcn.h" #include "RooMsgService.h" -#include "RooCategory.h" #include "RooMultiPdf.h" #include "RooPlot.h" -#include "RooHelpers.h" -#include "RooMinimizerFcn.h" -#include "RooFitResult.h" -#include "RooFit/TestStatistics/RooAbsL.h" -#include "RooFit/TestStatistics/RooRealL.h" +#include "RooRealVar.h" +#include "RooSentinel.h" #ifdef ROOFIT_MULTIPROCESS #include "TestStatistics/MinuitFcnGrad.h" #include "RooFit/MultiProcess/Config.h" #include "RooFit/MultiProcess/ProcessTimer.h" #endif +#include "RooFitImplHelpers.h" + #include #include #include @@ -120,6 +123,22 @@ void reorderCombinations(std::vector> &combos, const std::vecto } } +// The RooEvaluatorWrapper uses its own logic to decide what needs to be +// re-evaluated. We can therefore disable the regular dirty state propagation +// temporarily during minimization. However, some RooAbsArgs shared with other +// regular RooFit computation graphs outside the minimized likelihood, so we +// have to make sure that the operation mode is reset after the minimization. +// +// This should be called before running any routine via the _minimizer data +// member. The RAII object should only be destructed after the routine is done. +std::stack> setOperModesDirty(RooAbsReal &function) +{ + if (auto *wrapper = dynamic_cast(&function)) { + return wrapper->setOperModes(RooAbsArg::ADirty); + } + return {}; +} + } // namespace //////////////////////////////////////////////////////////////////////////////// @@ -135,7 +154,7 @@ void reorderCombinations(std::vector> &combos, const std::vecto /// value of the input function. /// Constructor that accepts all configuration in struct with RooAbsReal likelihood -RooMinimizer::RooMinimizer(RooAbsReal &function, Config const &cfg) : _cfg(cfg) +RooMinimizer::RooMinimizer(RooAbsReal &function, Config const &cfg) : _function{function}, _cfg(cfg) { initMinimizerFirstPart(); auto nll_real = dynamic_cast(&function); @@ -692,6 +711,7 @@ RooPlot *RooMinimizer::contour(RooRealVar &var1, RooRealVar &var2, double n1, do n[4] = n5; n[5] = n6; + auto operModeRAII = setOperModesDirty(_function); for (int ic = 0; ic < 6; ic++) { if (n[ic] > 0) { @@ -906,6 +926,8 @@ bool RooMinimizer::fitFCN() // fit a user provided FCN function // create fit parameter settings + auto operModeRAII = setOperModesDirty(_function); + // Check number of parameters unsigned int npar = getNPar(); if (npar == 0) { @@ -1045,6 +1067,8 @@ bool RooMinimizer::calculateHessErrors() // compute the Hesse errors according to configuration // set in the parameters and append value in fit result + auto operModeRAII = setOperModesDirty(_function); + // update minimizer (recreate if not done or if name has changed if (!updateMinimizerOptions()) { coutE(Minimization) << "RooMinimizer::calculateHessErrors() Error re-initializing the minimizer" << std::endl; @@ -1079,6 +1103,8 @@ bool RooMinimizer::calculateMinosErrors() // (in DoMinimization) aftewr minimizing if the // FitConfig::MinosErrors() flag is set + auto operModeRAII = setOperModesDirty(_function); + // update minimizer (but cannot re-create in this case). Must use an existing one if (!updateMinimizerOptions(false)) { coutE(Minimization) << "RooMinimizer::calculateHessErrors() Error re-initializing the minimizer" << std::endl; From 53aec08e8bb1ca2609e9ff807147467483e85e5c Mon Sep 17 00:00:00 2001 From: Jonas Rembser Date: Thu, 23 Apr 2026 00:03:31 +0200 Subject: [PATCH 2/2] [RF] Refactor ChangeOperModeRAII to work with groups of RooAbsArgs Several places needed to record a set of operation-mode changes and restore them later as a group, so it's better to have the ChangeOperModeRAII act on groups of RooAbsArg to not have to create one RAII object per arg. --- roofit/roofitcore/inc/RooEvaluatorWrapper.h | 4 +- roofit/roofitcore/inc/RooFit/Evaluator.h | 5 +- roofit/roofitcore/res/RooFitImplHelpers.h | 48 ++++++++++++++----- roofit/roofitcore/src/RooEvaluatorWrapper.cxx | 2 +- roofit/roofitcore/src/RooFit/Evaluator.cxx | 14 +++--- roofit/roofitcore/src/RooMinimizer.cxx | 2 +- roofit/roofitcore/src/RooRealIntegral.cxx | 4 +- 7 files changed, 52 insertions(+), 27 deletions(-) diff --git a/roofit/roofitcore/inc/RooEvaluatorWrapper.h b/roofit/roofitcore/inc/RooEvaluatorWrapper.h index 340631e0d282d..ce394fc629152 100644 --- a/roofit/roofitcore/inc/RooEvaluatorWrapper.h +++ b/roofit/roofitcore/inc/RooEvaluatorWrapper.h @@ -21,8 +21,10 @@ #include #include +#include #include +class ChangeOperModeRAII; class RooAbsArg; class RooAbsCategory; class RooAbsPdf; @@ -73,7 +75,7 @@ class RooEvaluatorWrapper final : public RooAbsReal { void setUseGeneratedFunctionCode(bool); void writeDebugMacro(std::string const &) const; - std::stack> setOperModes(RooAbsArg::OperMode opMode); + std::unique_ptr setOperModes(RooAbsArg::OperMode opMode); protected: double evaluate() const override; diff --git a/roofit/roofitcore/inc/RooFit/Evaluator.h b/roofit/roofitcore/inc/RooFit/Evaluator.h index 18bf0e304fac2..f50424c634abf 100644 --- a/roofit/roofitcore/inc/RooFit/Evaluator.h +++ b/roofit/roofitcore/inc/RooFit/Evaluator.h @@ -20,7 +20,6 @@ #include #include -#include class ChangeOperModeRAII; class RooAbsArg; @@ -45,7 +44,7 @@ class Evaluator { void setOffsetMode(RooFit::EvalContext::OffsetMode); - std::stack> setOperModes(RooAbsArg::OperMode opMode); + std::unique_ptr setOperModes(RooAbsArg::OperMode opMode); private: void processVariable(NodeInfo &nodeInfo); @@ -68,7 +67,7 @@ class Evaluator { RooFit::EvalContext _evalContextCUDA; std::vector _nodes; // the ordered computation graph std::unordered_map _nodesMap; // for quick lookup of nodes - std::stack> _changeOperModeRAIIs; + std::unique_ptr _operModeChanges; }; } // end namespace RooFit diff --git a/roofit/roofitcore/res/RooFitImplHelpers.h b/roofit/roofitcore/res/RooFitImplHelpers.h index 6d04134bc85ec..d14bc905cd21e 100644 --- a/roofit/roofitcore/res/RooFitImplHelpers.h +++ b/roofit/roofitcore/res/RooFitImplHelpers.h @@ -46,25 +46,51 @@ class DisableCachingRAII { bool _oldState; }; -/// Struct to temporarily change the operation mode of a RooAbsArg until it -/// goes out of scope. +/// Scope guard that temporarily changes the operation mode of one or more +/// RooAbsArg instances. Each call to change() records the arg's current +/// operMode before flipping it to the requested mode (non-recursively, i.e. +/// value clients are not touched). Destruction (or an explicit clear()) +/// restores every recorded mode in LIFO order. +/// +/// The class is movable but not copyable, so it can be returned from +/// functions that build up a batch of changes to hand to the caller. class ChangeOperModeRAII { public: - ChangeOperModeRAII(RooAbsArg *arg, RooAbsArg::OperMode opMode) : _arg{arg}, _oldOpMode(arg->operMode()) + ChangeOperModeRAII() = default; + + /// Convenience ctor: behaves like a scope guard for a single arg. + ChangeOperModeRAII(RooAbsArg *arg, RooAbsArg::OperMode opMode) { change(arg, opMode); } + + ~ChangeOperModeRAII() { clear(); } + + ChangeOperModeRAII(ChangeOperModeRAII &&) = default; + ChangeOperModeRAII &operator=(ChangeOperModeRAII &&) = default; + ChangeOperModeRAII(ChangeOperModeRAII const &) = delete; + ChangeOperModeRAII &operator=(ChangeOperModeRAII const &) = delete; + + /// Record arg's current operMode and flip it to opMode. If the current + /// mode already equals opMode, this is a no-op (nothing to restore). + void change(RooAbsArg *arg, RooAbsArg::OperMode opMode) { - arg->setOperMode(opMode, /*recurse=*/false); + if (opMode == arg->operMode()) + return; + _entries.emplace_back(arg, arg->operMode()); + arg->setOperMode(opMode, /*recurseADirty=*/false); } - ChangeOperModeRAII(ChangeOperModeRAII const &other) = delete; - ChangeOperModeRAII &operator=(ChangeOperModeRAII const &other) = delete; - ChangeOperModeRAII(ChangeOperModeRAII &&other) = delete; - ChangeOperModeRAII &operator=(ChangeOperModeRAII &&other) = delete; + /// Restore every recorded change right away, emptying this guard. + void clear() + { + for (auto it = _entries.rbegin(); it != _entries.rend(); ++it) { + it->first->setOperMode(it->second, /*recurseADirty=*/false); + } + _entries.clear(); + } - ~ChangeOperModeRAII() { _arg->setOperMode(_oldOpMode, /*recurse=*/false); } + bool empty() const { return _entries.empty(); } private: - RooAbsArg *_arg = nullptr; - RooAbsArg::OperMode _oldOpMode; + std::vector> _entries; }; namespace RooHelpers { diff --git a/roofit/roofitcore/src/RooEvaluatorWrapper.cxx b/roofit/roofitcore/src/RooEvaluatorWrapper.cxx index f3916c3e9e655..c0a2a66c0b1e5 100644 --- a/roofit/roofitcore/src/RooEvaluatorWrapper.cxx +++ b/roofit/roofitcore/src/RooEvaluatorWrapper.cxx @@ -741,7 +741,7 @@ void RooEvaluatorWrapper::writeDebugMacro(std::string const &filename) const return _funcWrapper->writeDebugMacro(filename); } -std::stack> RooEvaluatorWrapper::setOperModes(RooAbsArg::OperMode opMode) +std::unique_ptr RooEvaluatorWrapper::setOperModes(RooAbsArg::OperMode opMode) { return _evaluator->setOperModes(opMode); } diff --git a/roofit/roofitcore/src/RooFit/Evaluator.cxx b/roofit/roofitcore/src/RooFit/Evaluator.cxx index 87a7f929f76f6..b17fc94cbe598 100644 --- a/roofit/roofitcore/src/RooFit/Evaluator.cxx +++ b/roofit/roofitcore/src/RooFit/Evaluator.cxx @@ -628,9 +628,9 @@ void Evaluator::markGPUNodes() /// Evaluator gets deleted. void Evaluator::setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode) { - if (opMode != arg->operMode()) { - _changeOperModeRAIIs.emplace(std::make_unique(arg, opMode)); - } + if (!_operModeChanges) + _operModeChanges = std::make_unique(); + _operModeChanges->change(arg, opMode); } // Change the operation modes of all RooAbsArgs in the computation graph. @@ -646,9 +646,9 @@ void Evaluator::setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode) // like a RooRealVar) would be left permanently in ADirty after the first // minimization, dramatically slowing down later scalar evaluations (for // example on pdfs held by the legacy test statistics' internal cache). -std::stack> Evaluator::setOperModes(RooAbsArg::OperMode opMode) +std::unique_ptr Evaluator::setOperModes(RooAbsArg::OperMode opMode) { - std::stack> out{}; + auto out = std::make_unique(); std::unordered_set visited; std::vector queue; @@ -663,9 +663,7 @@ std::stack> Evaluator::setOperModes(RooAbsAr if (!visited.insert(node).second) continue; - if (opMode != node->operMode()) { - out.emplace(std::make_unique(node, opMode)); - } + out->change(node, opMode); // Only follow value-client links: that is exactly the propagation path // used by RooAbsArg::setOperMode with mode==ADirty. diff --git a/roofit/roofitcore/src/RooMinimizer.cxx b/roofit/roofitcore/src/RooMinimizer.cxx index 679a7c98a0450..af81c50102902 100644 --- a/roofit/roofitcore/src/RooMinimizer.cxx +++ b/roofit/roofitcore/src/RooMinimizer.cxx @@ -131,7 +131,7 @@ void reorderCombinations(std::vector> &combos, const std::vecto // // This should be called before running any routine via the _minimizer data // member. The RAII object should only be destructed after the routine is done. -std::stack> setOperModesDirty(RooAbsReal &function) +std::unique_ptr setOperModesDirty(RooAbsReal &function) { if (auto *wrapper = dynamic_cast(&function)) { return wrapper->setOperModes(RooAbsArg::ADirty); diff --git a/roofit/roofitcore/src/RooRealIntegral.cxx b/roofit/roofitcore/src/RooRealIntegral.cxx index c580bb0b2c10e..a1aa4bc459e54 100644 --- a/roofit/roofitcore/src/RooRealIntegral.cxx +++ b/roofit/roofitcore/src/RooRealIntegral.cxx @@ -841,12 +841,12 @@ double RooRealIntegral::evaluate() const // from caching subgraph results (e.g. for nested numeric integrals). RooArgList serverList; _function->treeNodeServerList(&serverList, nullptr, true, true, false, true); - std::list operModeRAII; + ChangeOperModeRAII operModeRAII; for (auto *arg : serverList) { arg->syncCache(); if (arg->operMode() == RooAbsArg::AClean) { - operModeRAII.emplace_back(arg, RooAbsArg::Auto); + operModeRAII.change(arg, RooAbsArg::Auto); } }