From 7d6c3a0e42aac04fad7568c2e2276090af861f8f Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Fri, 3 Apr 2026 21:15:50 -0400
Subject: [PATCH 01/25] add python script for tests

---
 narf/include/tests.hpp |  2 ++
 narf/tests.py          |  3 +++
 test/testshiftsmear.py | 11 +++++++++++
 3 files changed, 16 insertions(+)
 create mode 100644 narf/tests.py
 create mode 100644 test/testshiftsmear.py
diff --git a/narf/include/tests.hpp b/narf/include/tests.hpp
index 36b99d6..ed8ed04 100644
--- a/narf/include/tests.hpp
+++ b/narf/include/tests.hpp
@@ -1,5 +1,7 @@
 #pragma once
 
+#include "histutils.hpp"
+
 namespace narf {
   ROOT::VecOps::RVec<double> testshift() {
     boost::histogram::axis::regular a(100, 0., 1.);
diff --git a/narf/tests.py b/narf/tests.py
new file mode 100644
index 0000000..abd7731
--- /dev/null
+++ b/narf/tests.py
@@ -0,0 +1,3 @@
+import narf.clingutils
+# Importing this module hands tests.hpp to narf.clingutils.Declare (presumably making the narf::test* functions callable via ROOT -- confirm).
+narf.clingutils.Declare('#include "tests.hpp"')
diff --git a/test/testshiftsmear.py b/test/testshiftsmear.py
new file mode 100644
index 0000000..2ca1bfa
--- /dev/null
+++ b/test/testshiftsmear.py
@@ -0,0 +1,11 @@
+import ROOT
+import narf.tests
+
+res =  ROOT.narf.testshift()
+print(res)
+
+res = ROOT.narf.testshifteigen()
+print(res)
+
+res = ROOT.narf.testshiftrw()
+print(res)

From e84d73a032753d4c42a80dcfb621ca121cf98933 Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Sat, 4 Apr 2026 04:11:07 -0400
Subject: [PATCH 02/25] Add SymMatrixAtomic

---
 narf/include/matrix_utils.hpp | 53 +++++++++++++++++++++++++++++++++++
 narf/include/tests.hpp        | 46 ++++++++++++++++++++++++++----
 test/testsymmatrixatomic.py   |  6 ++++
 3 files changed, 100 insertions(+), 5 deletions(-)
 create mode 100644 narf/include/matrix_utils.hpp
 create mode 100644 test/testsymmatrixatomic.py

diff --git a/narf/include/matrix_utils.hpp b/narf/include/matrix_utils.hpp
new file mode 100644
index 0000000..a63d2c2
--- /dev/null
+++ b/narf/include/matrix_utils.hpp
@@ -0,0 +1,53 @@
+#include <atomic>
+#include <eigen3/Eigen/Dense>
+#include <algorithm>
+
+class SymMatrixAtomic {
+public:
+  SymMatrixAtomic() = default;  // no packed storage is allocated (data_ is empty)
+  SymMatrixAtomic(std::size_t n) : n_(n), data_(n*(n+1)/2) {}  // n x n symmetric matrix: only the n*(n+1)/2 unique elements are stored
+  // Atomically add val to element (iidx, jidx) and return the value held before the addition; (i, j) and (j, i) share one slot.
+  double fetch_add(std::size_t iidx, std::size_t jidx, double val) {
+    std::atomic<double>& ref = data_[packed_index(iidx, jidx)];
+    return ref.fetch_add(val);
+  }
+  // Write row `row` into rowData[0 .. n_-1]: columns left of the diagonal are set to 0 (not mirrored), the rest are copied from storage.
+  void fill_row(std::size_t row, double *rowData) {
+    const std::size_t offset = packed_index(row, row);  // start of this row's packed run (its diagonal element)
+    std::fill(rowData, rowData + row, 0.);
+    std::copy(data_.begin() + offset, data_.begin() + offset + n_ - row, rowData + row);  // the n_ - row elements (row, row) .. (row, n_-1)
+  }
+
+private:
+
+/**
+ * Converts (row, col) indices of a symmetric matrix into a linearized index
+ * for packed storage of unique elements (upper triangle, row-major).
+ *
+ * Storage layout (0-indexed, n=4 example):
+ *   (0,0)(0,1)(0,2)(0,3) | (1,1)(1,2)(1,3) | (2,2)(2,3) | (3,3)
+ *     0    1    2    3       4    5    6       7    8       9
+ *
+ * Total elements stored: n*(n+1)/2
+ * The indices may be given in either order; no bounds checking is performed.
+ * @param row  Row index (0-based)
+ * @param col  Column index (0-based)
+ * @return     Linearized index into packed storage array
+ */
+    inline std::size_t packed_index(std::size_t row, std::size_t col) {
+        // Normalize to upper triangle: ensure row <= col
+        if (row > col) {
+            std::size_t tmp = row;
+            row = col;
+            col = tmp;
+        }
+        // Unsigned arithmetic is safe below: for row == 0 the wrapped (row - 1) is multiplied by 0.
+        // Number of elements in rows 0..(row-1): row*n - row*(row-1)/2
+        // Plus offset within current row: (col - row)
+        return row * n_ - row * (row - 1) / 2 + (col - row);
+    }
+
+  std::size_t n_;
+  std::vector<std::atomic<double> > data_;
+
+};
diff --git a/narf/include/tests.hpp b/narf/include/tests.hpp
index ed8ed04..801a48f 100644
--- a/narf/include/tests.hpp
+++ b/narf/include/tests.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "histutils.hpp"
+#include "matrix_utils.hpp"
 
 namespace narf {
   ROOT::VecOps::RVec<double> testshift() {
@@ -37,18 +38,53 @@ namespace narf {
   void testshiftrw() {
     std::vector<int> a{0, 1, 2};
     std::vector<int> b{3, 4, 5};
-  
+
     std::cout << a.front() << std::endl;
     std::cout << b.front() << std::endl;
-  
+
     for (auto const &[ael, bel] : make_zip_view(a, b)) {
       ael = 0;
       bel = 1;
     }
-  
+
     std::cout << a.front() << std::endl;
     std::cout << b.front() << std::endl;
-  
-  
+
+
+  }
+
+  // Test SymMatrixAtomic: construction, fetch_add, fill_row, and index symmetry.
+  // Returns true if all checks pass.
+  bool testSymMatrixAtomic() {
+    const std::size_t n = 4;
+    SymMatrixAtomic mat(n);  // the exact-value checks below assume every element starts at 0
+
+    // Fill upper triangle: mat[i][j] = (i+1)*(j+1) for i <= j
+    for (std::size_t i = 0; i < n; ++i) {
+      for (std::size_t j = i; j < n; ++j) {
+        mat.fetch_add(i, j, double((i + 1) * (j + 1)));
+      }
+    }
+
+    // Verify via fill_row: rowData[j] == (i+1)*(j+1) for j >= i, else 0
+    std::vector<double> rowData(n);
+    for (std::size_t i = 0; i < n; ++i) {
+      mat.fill_row(i, rowData.data());
+      for (std::size_t j = 0; j < i; ++j) {
+        if (rowData[j] != 0.0) return false;  // fill_row zero-fills columns left of the diagonal
+      }
+      for (std::size_t j = i; j < n; ++j) {
+        if (rowData[j] != double((i + 1) * (j + 1))) return false;  // exact compare is fine: small integers are exact in double
+      }
+    }
+
+    // Verify symmetry: fetch_add(i,j) and fetch_add(j,i) address the same element
+    SymMatrixAtomic mat2(n);
+    mat2.fetch_add(1, 3, 5.0);  // upper triangle (1,3)
+    mat2.fetch_add(3, 1, 3.0);  // lower triangle: should map to same element
+    mat2.fill_row(1, rowData.data());
+    if (rowData[3] != 8.0) return false;  // 5.0 + 3.0 accumulated in the single shared slot
+
+    return true;
+  }
 }
diff --git a/test/testsymmatrixatomic.py b/test/testsymmatrixatomic.py
new file mode 100644
index 0000000..51d1068
--- /dev/null
+++ b/test/testsymmatrixatomic.py
@@ -0,0 +1,6 @@
+import ROOT
+import narf.tests
+# Run the C++ self-test for SymMatrixAtomic; it returns true only if every check passed.
+res = ROOT.narf.testSymMatrixAtomic()
+print(f"testSymMatrixAtomic: {res}")
+assert res, "testSymMatrixAtomic failed"

From 1a85357f3187073a8a42be8652bf3b8304f3c904 Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Sat, 4 Apr 2026 14:11:34 -0400
Subject: [PATCH 03/25] minor improvement for SymMatrixAtomic and add initial
 version of SparseMatrixAtomic

---
 narf/include/matrix_utils.hpp | 97 ++++++++++++++++++++++++++++++++++-
 narf/matrix_utils.py          |  3 ++
 2 files changed, 99 insertions(+), 1 deletion(-)
 create mode 100644 narf/matrix_utils.py

diff --git a/narf/include/matrix_utils.hpp b/narf/include/matrix_utils.hpp
index a63d2c2..16793e5 100644
--- a/narf/include/matrix_utils.hpp
+++ b/narf/include/matrix_utils.hpp
@@ -1,6 +1,10 @@
+#pragma once
+
 #include <atomic>
 #include <eigen3/Eigen/Dense>
 #include <algorithm>
+#include "oneapi/tbb.h"
+
 
 class SymMatrixAtomic {
 public:
@@ -47,7 +51,98 @@ class SymMatrixAtomic {
         return row * n_ - row * (row - 1) / 2 + (col - row);
     }
 
-  std::size_t n_;
+  std::size_t n_{};
   std::vector<std::atomic<double> > data_;
 
 };
+
+class SparseMatrixIndexValues {  // matrix entries held as three parallel arrays: (idx0, idx1, value) per entry
+public:
+  const std::vector<std::size_t> &idxs0() const { return idxs0_; }  // first index of each entry
+  const std::vector<std::size_t> &idxs1() const { return idxs1_; }  // second index of each entry
+  const std::vector<double> &vals() const { return vals_; }  // value of each entry
+  // Append one entry; all three arrays grow together, so position k in each describes the same element.
+  void emplace_back(std::size_t idx0, std::size_t idx1, double val) {
+    idxs0_.emplace_back(idx0);
+    idxs1_.emplace_back(idx1);
+    vals_.emplace_back(val);
+  }
+  // Reserve capacity for i entries in each of the three arrays.
+  void reserve(std::size_t i) {
+    idxs0_.reserve(i);
+    idxs1_.reserve(i);
+    vals_.reserve(i);
+  }
+  // Number of stored entries.
+  std::size_t size() const { return vals_.size(); }
+
+private:
+  std::vector<std::size_t> idxs0_;
+  std::vector<std::size_t> idxs1_;
+  std::vector<double> vals_;
+};
+
+class SparseMatrixAtomic {  // sparse size0 x size1 matrix of atomically updatable doubles, keyed by flattened (idx0, idx1)
+public:
+  // *FIXME* this can lock on inserts and generally has poor performance for our workloads
+  // replace with some (custom) alternative
+  using map_type = tbb::concurrent_unordered_map<std::size_t, std::atomic<double>>;
+  // The map is constructed with size0*size1/40 as its initial size hint (presumably buckets for ~2.5% fill -- TODO confirm).
+  SparseMatrixAtomic(std::size_t size0, std::size_t size1) : size0_(size0), size1_(size1), data_(size0*size1/40) {}
+  // Mutable access: inserts a 0.0 entry for (idx0, idx1) if none exists yet, then returns a reference to the stored value.
+  std::atomic<double> &operator() (std::size_t idx0, std::size_t idx1) {
+    const std::size_t i = globalidx(idx0, idx1);
+    auto res = data_.emplace(i, 0.);
+    auto &it = res.first;
+    return it->second;
+  }
+  // Read-only access: never inserts; a missing entry is reported by data_.at() (expected to throw -- confirm against TBB docs).
+  const std::atomic<double> &operator() (std::size_t idx0, std::size_t idx1) const {
+    const std::size_t i = globalidx(idx0, idx1);
+    return data_.at(i);
+  }
+  // Atomically add val to element (idx0, idx1). Adding exactly 0 is skipped so that no entry is created for it.
+  void fetch_add(std::size_t idx0, std::size_t idx1, double val) {
+    if (val != 0.) {
+      auto &elemval = operator()(idx0, idx1);
+      elemval.fetch_add(val);
+    }
+  }
+  // Export every stored entry as (idx0, idx1, value) triplets, in the map's iteration order.
+  SparseMatrixIndexValues index_values() const {
+    SparseMatrixIndexValues res;
+    res.reserve(data_.size());
+
+    for (auto &elem : data_) {
+      auto is = idxs(elem.first);
+      res.emplace_back(is[0], is[1], elem.second);
+    }
+
+    return res;
+  }
+  // Remove all stored entries.
+  void clear() { data_.clear(); }
+  // Forward a capacity hint of i entries to the underlying map.
+  void reserve(std::size_t i) { data_.reserve(i); }
+  // Number of elements the equivalent dense matrix would hold.
+  std::size_t dense_size() const { return size0_*size1_; }
+  // Direct access to the underlying map; its keys are the flattened indices produced by globalidx().
+  map_type &data() { return data_; }
+
+private:
+  std::size_t globalidx(std::size_t idx0, std::size_t idx1) const {
+    return idx0*size1_ + idx1;  // row-major: the stride is the extent of the second index (idx1 < size1_)
+  }
+  // Inverse of globalidx(); uses the same stride (size1_) so non-square matrices round-trip without collisions.
+  std::array<std::size_t, 2> idxs(std::size_t globalidx) const {
+    const std::size_t idx0 = globalidx/size1_;
+    const std::size_t idx1 = globalidx % size1_;
+
+    return std::array<std::size_t, 2>{idx0, idx1};
+  }
+
+  const std::size_t size0_;
+  const std::size_t size1_;
+  map_type data_;
+
+};
\ No newline at end of file
diff --git a/narf/matrix_utils.py b/narf/matrix_utils.py
new file mode 100644
index 0000000..47f620d
--- /dev/null
+++ b/narf/matrix_utils.py
@@ -0,0 +1,3 @@
+import narf.clingutils
+# Importing this module hands matrix_utils.hpp to narf.clingutils.Declare (presumably exposing its classes to ROOT -- confirm).
+narf.clingutils.Declare('#include "matrix_utils.hpp"')

From a5d2c6dd77884159f0c9ed9ab47e127382bcc60f Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Sat, 4 Apr 2026 23:59:08 -0400
Subject: [PATCH 04/25] fix deprecated storage_type access

---
 narf/histutils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/narf/histutils.py b/narf/histutils.py
index f08514b..5655b26 100644
--- a/narf/histutils.py
+++ b/narf/histutils.py
@@ -114,7 +114,7 @@ def make_array_interface_view(boost_hist):
         
     underflow = [axis.traits.underflow for axis in boost_hist.axes]
 
-    acc_type = convert_storage_type(boost_hist._storage_type)
+    acc_type = convert_storage_type(boost_hist.storage_type)
     arrview = ROOT.narf.array_interface_view[acc_type, len(shape)](arr, shape, strides, underflow)
 
     return arrview
@@ -132,9 +132,9 @@ def hist_to_pyroot_boost(hist_hist, tensor_rank = 0, force_atomic = False):
         scalar_type = ROOT.double
         dimensions = ROOT.Eigen.Sizes[tuple(tensor_sizes)]
 
-        if issubclass(hist_hist._storage_type, bh.storage.Double):
+        if issubclass(hist_hist.storage_type, bh.storage.Double):
             cppstoragetype = ROOT.narf.tensor_accumulator[scalar_type, dimensions]
-        elif issubclass(hist_hist._storage_type, bh.storage.Weight):
+        elif issubclass(hist_hist.storage_type, bh.storage.Weight):
             cppstoragetype = ROOT.narf.tensor_accumulator[ROOT.boost.histogram.accumulators.weighted_sum[scalar_type], dimensions]
         else:
             raise TypeError("Requested storage type is not supported with tensor weights currently")
@@ -143,7 +143,7 @@ def hist_to_pyroot_boost(hist_hist, tensor_rank = 0, force_atomic = False):
             cppstoragetype = ROOT.narf.atomic_adaptor[cppstoragetype]
     else:
         python_axes = hist_hist.axes
-        cppstoragetype = convert_storage_type(hist_hist._storage_type, force_atomic = force_atomic)
+        cppstoragetype = convert_storage_type(hist_hist.storage_type, force_atomic = force_atomic)
 
     cppaxes = [ROOT.std.move(convert_axis(axis)) for axis in python_axes]
 

From 6c27fcb3a746059f76a4c90c45a560ea0f3536ee Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Sun, 5 Apr 2026 00:00:06 -0400
Subject: [PATCH 05/25] fix constness

---
 narf/include/histutils.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/narf/include/histutils.hpp b/narf/include/histutils.hpp
index ad4fc31..41917d9 100644
--- a/narf/include/histutils.hpp
+++ b/narf/include/histutils.hpp
@@ -626,7 +626,7 @@ namespace narf {
   public:
     QuantileHelper(hist_t &&resource) : base_t(std::forward<hist_t>(resource)) {}
 
-    boost::histogram::axis::index_type operator()(const boost::histogram::axis::traits::value_type<Axes>&... args, const scalar_t &last) {
+    boost::histogram::axis::index_type operator()(const boost::histogram::axis::traits::value_type<Axes>&... args, const scalar_t &last) const {
       auto const &hist = *base_t::resourceHist_;
       auto const &edges = narf::get_value(hist, args...).data();
 
@@ -652,7 +652,7 @@ namespace narf {
 
     QuantileHelperStatic(const edge_t &edges) : edges_(edges) {}
 
-    boost::histogram::axis::index_type operator() (double val) {
+    boost::histogram::axis::index_type operator() (double val) const {
       // find the quantile bin corresponding to the last argument
       auto const upper = std::upper_bound(edges_.begin(), edges_.end(), val);
       auto const iquant = std::distance(edges_.begin(), upper);

From db0394fb1a70096feec46ac3d7f5620f69bc31a5 Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Sun, 5 Apr 2026 00:00:27 -0400
Subject: [PATCH 06/25] make wrapper more flexible/robust

---
 narf/include/rdfutils.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/narf/include/rdfutils.hpp b/narf/include/rdfutils.hpp
index eaddbbb..3ea29ef 100755
--- a/narf/include/rdfutils.hpp
+++ b/narf/include/rdfutils.hpp
@@ -8,14 +8,14 @@ namespace narf {
     private:
         Callable callable_;
 
-        using return_t = decltype(callable_(std::declval<Args>()...));
+        using return_t = std::decay_t<decltype(callable_(std::declval<Args>()...))>;
 
     public:
 
         DefineWrapper(const Callable &callable) : callable_(callable) {}
         DefineWrapper(Callable &&callable) : callable_(std::move(callable)) {}
 
-        return_t operator() (const Args&... args) const {
+        return_t operator() (const Args&... args) {
             return callable_(args...);
         }
     };

From 5a4c98d892bff02b0d3911f070c0d5fe19ad0ea6 Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Sun, 5 Apr 2026 00:00:47 -0400
Subject: [PATCH 07/25] flexible column types for quantile helpers

---
 narf/histutils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/narf/histutils.py b/narf/histutils.py
index 5655b26..c764a26 100644
--- a/narf/histutils.py
+++ b/narf/histutils.py
@@ -754,7 +754,7 @@ def define_quantile_ints(df, cols, quantile_hists):
         helper_cols = helper_cols_cond + [col]
 
         outname = f"{col}_iquant"
-        df = df.Define(outname, quanthelper, helper_cols)
+        df = narf.rdfutils.flexible_define(df, outname, quanthelper, helper_cols)
         helper_cols_cond.append(outname)
 
     quantile_axes = list(quantile_hists[-1].axes)

From 7468541faa80e952ed75b85b63b7027008d4dc4d Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Sun, 5 Apr 2026 00:08:41 -0400
Subject: [PATCH 08/25] add missing include

---
 narf/include/utils.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/narf/include/utils.hpp b/narf/include/utils.hpp
index e5eaf3b..7fa26ce 100755
--- a/narf/include/utils.hpp
+++ b/narf/include/utils.hpp
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <ranges>
+		
 namespace narf {
 
   template<typename T>

From ff899e4a20d006911018d2639ecf4b8d33dd7ce5 Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Mon, 6 Apr 2026 05:57:05 -0400
Subject: [PATCH 09/25] make range_to more flexible

---
 narf/include/utils.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/narf/include/utils.hpp b/narf/include/utils.hpp
index 7fa26ce..fc99960 100755
--- a/narf/include/utils.hpp
+++ b/narf/include/utils.hpp
@@ -136,8 +136,8 @@ namespace narf {
             | std::views::transform([](auto const &it){ return *it; });
   }
 
-  template <template <typename> class C, class T>
-  auto range_to(T &range) {
+  template <template <typename> class C=ROOT::VecOps::RVec, class T>
+  auto range_to(T &&range) {
     using value_type = std::decay_t<decltype(*range.begin())>;
 
     C<value_type> res;

From 105bed61c43b2f3deed4f23c89281ef44bba6cbc Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Tue, 7 Apr 2026 01:19:28 +0200
Subject: [PATCH 10/25] add lock-free insert-only concurrent_flat_map

A segmented open-addressing hash map for integer keys supporting
concurrent lock-free find / insert / emplace / expansion. State bits
are encoded in the two MSBs of each slot's key. Includes tests
covering single-threaded correctness, pointer stability across
expansion, and multi-threaded concurrent insert/find, plus a test
for SparseMatrixAtomic that exercises its public API under
concurrent fetch_add.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 narf/include/concurrent_flat_map.hpp | 324 +++++++++++++++++++++++++++
 narf/include/tests.hpp               | 162 ++++++++++++++
 test/testconcurrentflatmap.py        |   6 +
 3 files changed, 492 insertions(+)
 create mode 100644 narf/include/concurrent_flat_map.hpp
 create mode 100644 test/testconcurrentflatmap.py

diff --git a/narf/include/concurrent_flat_map.hpp b/narf/include/concurrent_flat_map.hpp
new file mode 100644
index 0000000..ef854c0
--- /dev/null
+++ b/narf/include/concurrent_flat_map.hpp
@@ -0,0 +1,324 @@
+#ifndef NARF_CONCURRENT_FLAT_MAP_HPP
+#define NARF_CONCURRENT_FLAT_MAP_HPP
+
+#include <algorithm>
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <new>
+#include <type_traits>
+#include <utility>
+
+namespace narf {
+
+// Lock-free, insert-only, expandable concurrent flat hash map.
+//
+// Properties:
+//   - find / insert / emplace / expansion use only atomics (no mutexes) and are
+//     safe to call concurrently; a thread may spin briefly on a slot whose
+//     value is still being constructed by another thread (see wait_not_busy).
+//   - Elements are never erased (except by clear()); until then the address of
+//     an element's value is stable, so pointers from find/emplace stay valid.
+//   - The container grows by appending geometrically larger segments; existing
+//     segments are never rehashed, so concurrent readers are never disturbed.
+//
+// Requirements on Key:
+//   - Must be an integer type.
+//   - The two most significant bits of any user-supplied key must be zero
+//     (they hold the occupied / busy slot state); encode() masks them off.
+template <typename Key, typename Value, typename Hash = std::hash<Key>>
+class concurrent_flat_map {
+  static_assert(std::is_integral_v<Key>, "Key must be an integer type");
+
+public:
+  using key_type    = Key;
+  using mapped_type = Value;
+  using hasher      = Hash;
+
+private:
+  using UKey = std::make_unsigned_t<Key>;
+
+  static constexpr unsigned kKeyBits   = sizeof(UKey) * 8;
+  static constexpr UKey kOccupiedBit   = UKey(1) << (kKeyBits - 1);
+  static constexpr UKey kBusyBit       = UKey(1) << (kKeyBits - 2);
+  static constexpr UKey kStateMask     = kOccupiedBit | kBusyBit;
+  static constexpr UKey kPayloadMask   = ~kStateMask;
+  static constexpr UKey kEmpty         = 0;
+
+  struct Slot {
+    std::atomic<UKey> key{kEmpty};
+    alignas(Value) unsigned char storage[sizeof(Value)];
+
+    Value* value_ptr() noexcept {
+      return std::launder(reinterpret_cast<Value*>(&storage));
+    }
+  };
+
+  struct Segment {
+    const std::size_t capacity;
+    const std::size_t mask;
+    std::atomic<std::size_t> size{0};
+    std::unique_ptr<Slot[]> slots;
+    std::atomic<Segment*> next{nullptr};
+
+    explicit Segment(std::size_t cap)
+      : capacity(cap), mask(cap - 1), slots(new Slot[cap]) {}
+
+    ~Segment() {
+      if constexpr (!std::is_trivially_destructible_v<Value>) {
+        for (std::size_t i = 0; i < capacity; ++i) {
+          UKey k = slots[i].key.load(std::memory_order_relaxed);
+          if ((k & kOccupiedBit) && !(k & kBusyBit)) {
+            slots[i].value_ptr()->~Value();
+          }
+        }
+      }
+    }
+  };
+
+  static constexpr std::size_t kDefaultInitialCapacity = 64;
+  static constexpr std::size_t kMaxProbe               = 32;
+
+  Segment* head_;
+  std::atomic<Segment*> tail_;
+  Hash hash_;
+
+  static UKey encode(Key key) noexcept {
+    return (static_cast<UKey>(key) & kPayloadMask) | kOccupiedBit;
+  }
+
+  static std::size_t round_up_pow2(std::size_t n) noexcept {
+    std::size_t c = 1;
+    while (c < n) c <<= 1;
+    return c;
+  }
+
+  // Spin until the slot's busy bit clears, then return the stable key.
+  static UKey wait_not_busy(Slot& slot) noexcept {
+    UKey k = slot.key.load(std::memory_order_acquire);
+    while (k & kBusyBit) {
+      k = slot.key.load(std::memory_order_acquire);
+    }
+    return k;
+  }
+
+  // Allocate (or observe) the segment that follows `current`. Multiple threads
+  // racing here will agree on a single winning segment; losers free their
+  // speculative allocation.
+  Segment* ensure_next(Segment* current) {
+    Segment* next = current->next.load(std::memory_order_acquire);
+    if (next) return next;
+    auto* fresh = new Segment(current->capacity * 2);
+    Segment* expected = nullptr;
+    if (current->next.compare_exchange_strong(
+            expected, fresh,
+            std::memory_order_acq_rel, std::memory_order_acquire)) {
+      // Best-effort advance of tail_ (newest-segment hint; emplace() starts at head_).
+      Segment* t = tail_.load(std::memory_order_acquire);
+      while (true) {
+        Segment* tn = t->next.load(std::memory_order_acquire);
+        if (!tn) break;
+        if (tail_.compare_exchange_weak(t, tn,
+                                        std::memory_order_acq_rel,
+                                        std::memory_order_acquire)) {
+          t = tn;
+        }
+      }
+      return fresh;
+    }
+    delete fresh;
+    return expected;
+  }
+
+  // Search a single segment for `target`. Returns pointer to value or nullptr.
+  Value* find_in(Segment* seg, std::size_t h, UKey target) const noexcept {
+    const std::size_t base = h & seg->mask;
+    const std::size_t probe_limit = std::min(kMaxProbe, seg->capacity);
+    for (std::size_t i = 0; i < probe_limit; ++i) {
+      Slot& slot = seg->slots[(base + i) & seg->mask];
+      UKey k = slot.key.load(std::memory_order_acquire);
+      if (k == kEmpty) return nullptr;
+      if (k & kBusyBit) k = wait_not_busy(slot);
+      if (k == target) return slot.value_ptr();
+    }
+    return nullptr;
+  }
+
+  // Try to insert `target` into a single segment. Returns
+  //   {ptr, true}  : newly inserted, value constructed from args
+  //   {ptr, false} : key was already present
+  //   {nullptr, false} : segment full along this probe sequence
+  template <typename... Args>
+  std::pair<Value*, bool> emplace_in(Segment* seg, std::size_t h, UKey target,
+                                     Args&&... args) {
+    const std::size_t base = h & seg->mask;
+    const std::size_t probe_limit = std::min(kMaxProbe, seg->capacity);
+    const UKey busy = (target & kPayloadMask) | kOccupiedBit | kBusyBit;
+    for (std::size_t i = 0; i < probe_limit; ++i) {
+      Slot& slot = seg->slots[(base + i) & seg->mask];
+      UKey k = slot.key.load(std::memory_order_acquire);
+      if (k == kEmpty) {
+        UKey expected = kEmpty;
+        if (slot.key.compare_exchange_strong(
+                expected, busy,
+                std::memory_order_acq_rel, std::memory_order_acquire)) {
+          ::new (static_cast<void*>(&slot.storage))
+              Value(std::forward<Args>(args)...);
+          slot.key.store(target, std::memory_order_release);
+          seg->size.fetch_add(1, std::memory_order_relaxed);
+          return {slot.value_ptr(), true};
+        }
+        k = expected;
+      }
+      if (k & kBusyBit) k = wait_not_busy(slot);
+      if (k == target) return {slot.value_ptr(), false};
+    }
+    return {nullptr, false};
+  }
+
+public:
+  explicit concurrent_flat_map(std::size_t initial_capacity = kDefaultInitialCapacity) {
+    const std::size_t cap = round_up_pow2(std::max<std::size_t>(initial_capacity, 2));
+    head_ = new Segment(cap);
+    tail_.store(head_, std::memory_order_release);
+  }
+
+  ~concurrent_flat_map() {
+    Segment* s = head_;
+    while (s) {
+      Segment* n = s->next.load(std::memory_order_relaxed);
+      delete s;
+      s = n;
+    }
+  }
+
+  concurrent_flat_map(const concurrent_flat_map&)            = delete;
+  concurrent_flat_map& operator=(const concurrent_flat_map&) = delete;
+
+  // Returns a pointer to the value associated with `key`, or nullptr.
+  // The returned pointer remains valid for the lifetime of the map.
+  Value* find(Key key) noexcept {
+    const UKey target  = encode(key);
+    const std::size_t h = hash_(key);
+    for (Segment* seg = head_; seg;
+         seg = seg->next.load(std::memory_order_acquire)) {
+      if (Value* v = find_in(seg, h, target)) return v;
+    }
+    return nullptr;
+  }
+
+  const Value* find(Key key) const noexcept {
+    return const_cast<concurrent_flat_map*>(this)->find(key);
+  }
+
+  bool contains(Key key) const noexcept { return find(key) != nullptr; }
+
+  // Construct a value in-place if `key` is not yet present.
+  // Returns {pointer-to-value, inserted?}.
+  //
+  // Insertion always walks the segment chain from `head_`, never from
+  // `tail_`: slots only ever go empty -> busy -> occupied, so every thread
+  // probing for the same key sees the same first segment with room on that
+  // key's probe sequence and races for the same slot, where exactly one CAS
+  // wins. Starting at `tail_` instead lets a thread holding an older tail
+  // claim a free slot in an earlier segment while another thread inserts the
+  // same key into the newer tail, leaving the key in the map twice.
+  template <typename... Args>
+  std::pair<Value*, bool> emplace(Key key, Args&&... args) {
+    const UKey target  = encode(key);
+    const std::size_t h = hash_(key);
+
+    Segment* seg = head_;
+    while (true) {
+      // emplace_in also reports an existing match, so no separate find pass
+      // is needed. `args` are only consumed when a value is constructed, in
+      // which case we return immediately, so re-forwarding is safe.
+      auto result = emplace_in(seg, h, target, std::forward<Args>(args)...);
+      if (result.first) {
+        return result;
+      }
+      // This key's probe sequence is saturated in `seg`: continue in the
+      // next segment, allocating it if it does not exist yet.
+      Segment* next = seg->next.load(std::memory_order_acquire);
+      if (!next) next = ensure_next(seg);
+      seg = next;
+    }
+  }
+
+  std::pair<Value*, bool> insert(Key key, const Value& v) {
+    return emplace(key, v);
+  }
+  std::pair<Value*, bool> insert(Key key, Value&& v) {
+    return emplace(key, std::move(v));
+  }
+
+  // Total number of inserted elements summed across all segments. Reads each
+  // segment's atomic counter; safe to call concurrently with other operations
+  // but the result reflects an instantaneous, possibly racing snapshot.
+  std::size_t size() const noexcept {
+    std::size_t total = 0;
+    for (Segment* seg = head_; seg;
+         seg = seg->next.load(std::memory_order_acquire)) {
+      total += seg->size.load(std::memory_order_acquire);
+    }
+    return total;
+  }
+
+  // Visit every (key, value) pair currently in the map. Not safe to call
+  // concurrently with insertions if the visitor relies on a stable snapshot;
+  // the visitor only sees fully-constructed slots and waits past any in-flight
+  // insert that races with iteration.
+  template <typename F>
+  void for_each(F&& f) {
+    for (Segment* seg = head_; seg;
+         seg = seg->next.load(std::memory_order_acquire)) {
+      for (std::size_t i = 0; i < seg->capacity; ++i) {
+        Slot& slot = seg->slots[i];
+        UKey k = slot.key.load(std::memory_order_acquire);
+        if (k == kEmpty) continue;
+        if (k & kBusyBit) k = wait_not_busy(slot);
+        if (!(k & kOccupiedBit)) continue;
+        f(static_cast<Key>(k & kPayloadMask), *slot.value_ptr());
+      }
+    }
+  }
+
+  template <typename F>
+  void for_each(F&& f) const {
+    const_cast<concurrent_flat_map*>(this)->for_each(std::forward<F>(f));
+  }
+
+  // Remove all elements and shrink back to a single head segment. NOT thread
+  // safe with respect to any other operation; the caller must establish
+  // exclusive access (e.g. between processing passes).
+  void clear() {
+    Segment* s = head_->next.load(std::memory_order_relaxed);
+    while (s) {
+      Segment* n = s->next.load(std::memory_order_relaxed);
+      delete s;
+      s = n;
+    }
+    head_->next.store(nullptr, std::memory_order_relaxed);
+    if constexpr (!std::is_trivially_destructible_v<Value>) {
+      for (std::size_t i = 0; i < head_->capacity; ++i) {
+        UKey k = head_->slots[i].key.load(std::memory_order_relaxed);
+        if ((k & kOccupiedBit) && !(k & kBusyBit)) {
+          head_->slots[i].value_ptr()->~Value();
+        }
+        head_->slots[i].key.store(kEmpty, std::memory_order_relaxed);
+      }
+    } else {
+      for (std::size_t i = 0; i < head_->capacity; ++i) {
+        head_->slots[i].key.store(kEmpty, std::memory_order_relaxed);
+      }
+    }
+    head_->size.store(0, std::memory_order_relaxed);
+    tail_.store(head_, std::memory_order_release);
+  }
+};
+
+} // namespace narf
+
+#endif // NARF_CONCURRENT_FLAT_MAP_HPP
diff --git a/narf/include/tests.hpp b/narf/include/tests.hpp
index 801a48f..dd9aea2 100644
--- a/narf/include/tests.hpp
+++ b/narf/include/tests.hpp
@@ -1,8 +1,15 @@
 #pragma once
 
+#include "concurrent_flat_map.hpp"
 #include "histutils.hpp"
 #include "matrix_utils.hpp"
 
+#include <atomic>
+#include <cmath>
+#include <thread>
+#include <unordered_set>
+#include <vector>
+
 namespace narf {
   ROOT::VecOps::RVec<double> testshift() {
     boost::histogram::axis::regular a(100, 0., 1.);
@@ -87,4 +94,159 @@ namespace narf {
 
     return true;
   }
+
+  // Test SparseMatrixAtomic: single-threaded fetch_add, index_values round-trip,
+  // clear + reuse, and multi-threaded concurrent fetch_add. Returns true on success.
+  bool testSparseMatrixAtomic() {
+    // ---- single-threaded: accumulate into three distinct cells ----
+    {
+      SparseMatrixAtomic mat(20, 20);
+      mat.fetch_add(1, 2, 3.0);
+      mat.fetch_add(1, 2, 4.0);
+      mat.fetch_add(5, 7, 1.5);
+      mat.fetch_add(0, 0, 2.0);
+      mat.fetch_add(0, 0, 0.0); // adds zero: the (0,0) total must stay 2.0
+      if (mat(1, 2).load() != 7.0) return false;
+      if (mat(5, 7).load() != 1.5) return false;
+      if (mat(0, 0).load() != 2.0) return false;
+
+      auto iv = mat.index_values();
+      if (iv.size() != 3) return false;
+      double sum = 0.0;
+      for (std::size_t k = 0; k < iv.size(); ++k) sum += iv.vals()[k];
+      if (sum != 10.5) return false; // 7.0 + 1.5 + 2.0, all exactly representable
+
+      mat.clear();
+      if (mat.index_values().size() != 0) return false;
+      // the matrix must be usable again after clear()
+      mat.fetch_add(3, 4, 9.0);
+      if (mat(3, 4).load() != 9.0) return false;
+      if (mat.index_values().size() != 1) return false;
+    }
+
+    // ---- multi-threaded fetch_add: each (i,j) cell receives a known total ----
+    {
+      const std::size_t N = 32;
+      SparseMatrixAtomic mat(N, N);
+      const unsigned T = 8;
+      const unsigned reps = 500;
+      std::vector<std::thread> threads;
+      threads.reserve(T);
+      for (unsigned t = 0; t < T; ++t) {
+        threads.emplace_back([&, t] {
+          for (unsigned r = 0; r < reps; ++r) {
+            for (std::size_t i = 0; i < N; ++i) {
+              // Sparse pattern: one cell per row, at column (3*i + 1) mod N.
+              std::size_t j = (i * 3 + 1) % N;
+              mat.fetch_add(i, j, 1.0 + 0.01 * t);
+            }
+          }
+        });
+      }
+      for (auto& th : threads) th.join();
+
+      double per_cell_expected = 0.0;
+      for (unsigned t = 0; t < T; ++t) per_cell_expected += reps * (1.0 + 0.01 * t);
+
+      auto iv = mat.index_values();
+      if (iv.size() != N) return false;
+      for (std::size_t k = 0; k < iv.size(); ++k) {
+        std::size_t i = iv.idxs0()[k];
+        std::size_t j = iv.idxs1()[k];
+        if (j != (i * 3 + 1) % N) return false;
+        if (std::abs(iv.vals()[k] - per_cell_expected) > 1e-9) return false;
+      }
+    }
+
+    return true;
+  }
+
+  // Test concurrent_flat_map: single-threaded correctness, pointer stability
+  // across expansion, and concurrent insert / find. Returns true on success.
+  bool testConcurrentFlatMap() {
+    // ---- single-threaded correctness: 5000 keys into a map constructed with 8 ----
+    {
+      concurrent_flat_map<std::uint64_t, std::uint64_t> map(8);
+      const std::uint64_t N = 5000;
+      for (std::uint64_t i = 1; i <= N; ++i) {
+        auto [p, inserted] = map.emplace(i, i * 7u + 3u);
+        if (!inserted || !p || *p != i * 7u + 3u) return false;
+      }
+      // re-insert: must report not-inserted and leave the existing value intact
+      for (std::uint64_t i = 1; i <= N; ++i) {
+        auto [p, inserted] = map.emplace(i, std::uint64_t(0));
+        if (inserted || !p || *p != i * 7u + 3u) return false;
+      }
+      // find every key
+      for (std::uint64_t i = 1; i <= N; ++i) {
+        auto* p = map.find(i);
+        if (!p || *p != i * 7u + 3u) return false;
+      }
+      // keys that were never inserted must not be found
+      if (map.find(0) != nullptr) return false;
+      if (map.find(N + 1) != nullptr) return false;
+    }
+
+    // ---- pointer stability: pointers from emplace must still match find ----
+    {
+      concurrent_flat_map<std::uint64_t, std::uint64_t> map(4);
+      std::vector<std::uint64_t*> ptrs;
+      const std::uint64_t N = 1000;
+      ptrs.reserve(N);
+      for (std::uint64_t i = 1; i <= N; ++i) {
+        ptrs.push_back(map.emplace(i, i).first);
+      }
+      for (std::uint64_t i = 1; i <= N; ++i) {
+        if (ptrs[i - 1] != map.find(i)) return false;
+        if (*ptrs[i - 1] != i) return false;
+      }
+    }
+
+    // ---- multi-threaded insert / find ----
+    {
+      concurrent_flat_map<std::uint64_t, std::uint64_t> map(16);
+      const unsigned T = 8;
+      const std::uint64_t per = 4000;
+      std::atomic<std::uint64_t> dup_inserts{0}; // failed unique-key inserts
+      std::atomic<std::uint64_t> bad{0}; // wrong or missing values on shared keys
+      std::vector<std::thread> threads;
+      threads.reserve(T);
+      for (unsigned t = 0; t < T; ++t) {
+        threads.emplace_back([&, t] {
+          for (std::uint64_t i = 0; i < per; ++i) {
+            // Even threads share keys 1..per/2, odd threads per/2+1..per.
+            std::uint64_t key = (i % (per / 2)) + 1 + (t % 2) * (per / 2);
+            std::uint64_t val = key * 1315423911u; // depends on key only
+            auto [p, ins] = map.emplace(key, val);
+            if (!p || *p != val) bad.fetch_add(1);
+            (void)ins;
+            auto* f = map.find(key);
+            if (!f || *f != val) bad.fetch_add(1);
+          }
+          // Each thread also inserts a key block no other thread touches.
+          for (std::uint64_t i = 0; i < per; ++i) {
+            std::uint64_t key = 1000000ull + t * per + i;
+            auto [p, ins] = map.emplace(key, key ^ 0xdeadbeefu);
+            if (!ins || !p || *p != (key ^ 0xdeadbeefu)) {
+              dup_inserts.fetch_add(1);
+            }
+          }
+        });
+      }
+      for (auto& th : threads) th.join();
+      if (bad.load() != 0) return false;
+      if (dup_inserts.load() != 0) return false;
+
+      // Verify all unique-block keys present and correct.
+      for (unsigned t = 0; t < T; ++t) {
+        for (std::uint64_t i = 0; i < per; ++i) {
+          std::uint64_t key = 1000000ull + t * per + i;
+          auto* p = map.find(key);
+          if (!p || *p != (key ^ 0xdeadbeefu)) return false;
+        }
+      }
+    }
+
+    return true;
+  }
 }
diff --git a/test/testconcurrentflatmap.py b/test/testconcurrentflatmap.py
new file mode 100644
index 0000000..2ef423f
--- /dev/null
+++ b/test/testconcurrentflatmap.py
@@ -0,0 +1,6 @@
+import ROOT
+import narf.tests  # imported for its side effect: declares tests.hpp via narf.clingutils
+
+res = ROOT.narf.testConcurrentFlatMap()  # C++ test from tests.hpp; true on success
+print(f"testConcurrentFlatMap: {res}")
+assert res, "testConcurrentFlatMap failed"

From 1c39602de6b4f25fcf140c4a18f34436249932a4 Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Tue, 7 Apr 2026 01:19:28 +0200
Subject: [PATCH 11/25] add SparseMatrixAtomic test driver

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 test/testsparsematrixatomic.py | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 test/testsparsematrixatomic.py

diff --git a/test/testsparsematrixatomic.py b/test/testsparsematrixatomic.py
new file mode 100644
index 0000000..242ed4f
--- /dev/null
+++ b/test/testsparsematrixatomic.py
@@ -0,0 +1,6 @@
+import ROOT
+import narf.tests  # imported for its side effect: declares tests.hpp via narf.clingutils
+
+res = ROOT.narf.testSparseMatrixAtomic()  # C++ test from tests.hpp; true on success
+print(f"testSparseMatrixAtomic: {res}")
+assert res, "testSparseMatrixAtomic failed"

From a8be35b88202b085ac579106ac0738bce79e5235 Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Tue, 7 Apr 2026 01:20:11 +0200
Subject: [PATCH 12/25] SparseMatrixAtomic: switch to narf::concurrent_flat_map

Replaces tbb::concurrent_unordered_map with the new lock-free
insert-only flat map, removing the FIXME about lock contention on
inserts. reserve() becomes a no-op since the new map grows on
demand.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 narf/include/matrix_utils.hpp | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/narf/include/matrix_utils.hpp b/narf/include/matrix_utils.hpp
index 16793e5..f777b93 100644
--- a/narf/include/matrix_utils.hpp
+++ b/narf/include/matrix_utils.hpp
@@ -3,7 +3,7 @@
 #include <atomic>
 #include <eigen3/Eigen/Dense>
 #include <algorithm>
-#include "oneapi/tbb.h"
+#include "concurrent_flat_map.hpp"
 
 
 class SymMatrixAtomic {
@@ -84,22 +84,22 @@ class SparseMatrixIndexValues {
 
 class SparseMatrixAtomic {
 public:
-  // *FIXME* this can lock on inserts and generally has poor performance for our workloads
-  // replace with some (custom) alternative
-  using map_type = tbb::concurrent_unordered_map<std::size_t, std::atomic<double>>;
+  using map_type = narf::concurrent_flat_map<std::size_t, std::atomic<double>>;
 
-  SparseMatrixAtomic(std::size_t size0, std::size_t size1) : size0_(size0), size1_(size1), data_(size0*size1/40) {}
+  SparseMatrixAtomic(std::size_t size0, std::size_t size1)
+    : size0_(size0), size1_(size1),
+      data_(std::max<std::size_t>(size0 * size1 / 40, 16)) {}
 
   std::atomic<double> &operator() (std::size_t idx0, std::size_t idx1) {
     const std::size_t i = globalidx(idx0, idx1);
-    auto res = data_.emplace(i, 0.);
-    auto &it = res.first;
-    return it->second;
+    auto res = data_.emplace(i);
+    return *res.first;
   }
 
   const std::atomic<double> &operator() (std::size_t idx0, std::size_t idx1) const {
     const std::size_t i = globalidx(idx0, idx1);
-    return data_.at(i);
+    if (auto* p = data_.find(i)) return *p;  // cell already populated
+    return *const_cast<map_type&>(data_).emplace(i).first;  // unset: materialize a default cell, never deref null
   }
 
   void fetch_add(std::size_t idx0, std::size_t idx1, double val) {
@@ -113,17 +113,17 @@ class SparseMatrixAtomic {
     SparseMatrixIndexValues res;
     res.reserve(data_.size());
 
-    for (auto &elem : data_) {
-      auto is = idxs(elem.first);
-      res.emplace_back(is[0], is[1], elem.second);
-    }
+    data_.for_each([&](std::size_t key, const std::atomic<double>& val) {
+      auto is = idxs(key);
+      res.emplace_back(is[0], is[1], val.load());
+    });
 
     return res;
   }
 
   void clear() { data_.clear(); }
 
-  void reserve(std::size_t i) { data_.reserve(i); }
+  void reserve(std::size_t /*i*/) { /* no-op: map grows on demand */ }
 
   std::size_t dense_size() const { return size0_*size1_; }
 

From a39c56dfc3de46f0805718f4e5b52e2a5128e4da Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Tue, 7 Apr 2026 02:15:56 +0200
Subject: [PATCH 13/25] concurrent_flat_map: add move constructor and
 assignment

Required so the map can live as a member of other movable types
(e.g. a boost::histogram storage class). The moved-from object is
left in a destroy-only state.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 narf/include/concurrent_flat_map.hpp | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/narf/include/concurrent_flat_map.hpp b/narf/include/concurrent_flat_map.hpp
index ef854c0..a022543 100644
--- a/narf/include/concurrent_flat_map.hpp
+++ b/narf/include/concurrent_flat_map.hpp
@@ -197,6 +197,34 @@ class concurrent_flat_map {
   concurrent_flat_map(const concurrent_flat_map&)            = delete;
   concurrent_flat_map& operator=(const concurrent_flat_map&) = delete;
 
+  // Move constructor: takes over `other`'s segment chain and hash functor.
+  // `other` is left with null head and tail pointers, i.e. in a destroy-only
+  // state; calling any other method on it is undefined.
+  concurrent_flat_map(concurrent_flat_map&& other) noexcept
+    : head_(std::exchange(other.head_, nullptr)),
+      tail_(other.tail_.exchange(nullptr, std::memory_order_relaxed)),
+      hash_(std::move(other.hash_)) {
+    // Nothing left to do: both source pointers were cleared while reading.
+  }
+
+  // Move assignment: frees this map's segments, then adopts `other`'s chain
+  // and hash functor. `other` ends with null head/tail (destroy-only state).
+  concurrent_flat_map& operator=(concurrent_flat_map&& other) noexcept {
+    if (this == &other) return *this;  // self-move: keep current contents
+    // Release the segments currently owned, following the `next` links.
+    for (Segment* seg = head_; seg != nullptr;) {
+      Segment* const following = seg->next.load(std::memory_order_relaxed);
+      delete seg;
+      seg = following;
+    }
+    // Adopt the source's state, nulling its pointers as they are read.
+    head_ = std::exchange(other.head_, nullptr);
+    tail_.store(other.tail_.exchange(nullptr, std::memory_order_relaxed),
+                std::memory_order_relaxed);
+    hash_ = std::move(other.hash_);
+    return *this;
+  }
+
   // Returns a pointer to the value associated with `key`, or nullptr.
   // The returned pointer remains valid for the lifetime of the map.
   Value* find(Key key) noexcept {

From 59bd2ef384f60e363b715f1e1d19f3ce97cdb3f5 Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Tue, 7 Apr 2026 02:15:56 +0200
Subject: [PATCH 14/25] HistoBoost: add SparseStorage option backed by
 concurrent_flat_map

Adds narf::concurrent_sparse_storage, a boost::histogram Storage type
backed by narf::concurrent_flat_map with has_threading_support = true,
plus a make_histogram_sparse factory and python-friendly snapshot
helpers (boost::histogram does not expose its storage_ member to
cppyy directly).

HistoBoost gains a SparseStorage marker class taking an estimated
fill_fraction (default 0.1) used to pre-size the underlying map and
avoid most on-the-fly expansions. Tensor weights are not supported in
this mode and conversion to a python hist.Hist is skipped; the raw
RResultPtr is returned. Includes an end-to-end RDataFrame test.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 narf/histutils.py                 |  35 ++++++
 narf/include/histutils.hpp        |   1 +
 narf/include/sparse_histogram.hpp | 202 ++++++++++++++++++++++++++++++
 test/testsparsehistogram.py       |  31 +++++
 4 files changed, 269 insertions(+)
 create mode 100644 narf/include/sparse_histogram.hpp
 create mode 100644 test/testsparsehistogram.py

diff --git a/narf/histutils.py b/narf/histutils.py
index c764a26..be17680 100644
--- a/narf/histutils.py
+++ b/narf/histutils.py
@@ -19,6 +19,24 @@
 narf.clingutils.Declare('#include <eigen3/Eigen/Dense>')
 narf.clingutils.Declare('#include <eigen3/unsupported/Eigen/CXX11/Tensor>')
 
+class SparseStorage:
+    """Storage option for HistoBoost selecting a narf::concurrent_sparse_storage
+    backed by a narf::concurrent_flat_map. Conversion to a python hist.Hist
+    object is not supported in this mode.
+
+    Parameters
+    ----------
+    fill_fraction : float, default 0.1
+        Estimated fraction of bins (including under/overflow) that will be
+        populated. The C++ storage pre-sizes its concurrent_flat_map to
+        fill_fraction * nbins (never less than 1) so that most fills hit the
+        initial allocation rather than triggering on-the-fly expansion.
+        Values outside (0, 1] are accepted; pass a small number for sparse fills.
+    """
+    def __init__(self, fill_fraction: float = 0.1) -> None:
+        self.fill_fraction = float(fill_fraction)  # stored as a plain Python float
+
+
 def bool_to_string(b):
     if b:
         return "true"
@@ -161,6 +179,23 @@ def _histo_boost(df, name, axes, cols, storage = bh.storage.Weight(), force_atom
     if force_atomic is None:
         force_atomic = ROOT.ROOT.IsImplicitMTEnabled()
 
+    # Sparse storage path: build a narf::sparse_histogram and skip the python
+    # hist.Hist conversion entirely.
+    if isinstance(storage, SparseStorage):
+        if tensor_axes is not None:
+            raise NotImplementedError("Tensor weights are not supported with SparseStorage")
+        coltypes = [df.GetColumnType(col) for col in cols]
+        for coltype in coltypes[len(axes):]:
+            traits = ROOT.narf.tensor_traits[coltype]
+            if traits.is_tensor:
+                raise NotImplementedError("Tensor weights are not supported with SparseStorage")
+        cppaxes = [ROOT.std.move(convert_axis(axis)) for axis in axes]
+        hfill = ROOT.narf.make_histogram_sparse[ROOT.narf.atomic_adaptor[ROOT.double]](storage.fill_fraction, *cppaxes)
+        helper = ROOT.narf.FillBoostHelperAtomic[type(hfill)](ROOT.std.move(hfill))
+        targs = tuple([type(df), type(helper)] + coltypes)
+        res = ROOT.narf.book_helper[targs](df, ROOT.std.move(helper), cols)
+        return res
+
     #TODO some of this code can be shared with root histogram version
 
     #FIXME make this more generic
diff --git a/narf/include/histutils.hpp b/narf/include/histutils.hpp
index 41917d9..12461c3 100644
--- a/narf/include/histutils.hpp
+++ b/narf/include/histutils.hpp
@@ -6,6 +6,7 @@
 #include "traits.hpp"
 #include "utils.hpp"
 #include "atomic_adaptor.hpp"
+#include "sparse_histogram.hpp"
 #include "tensorutils.hpp"
 #include "tensorevalutils.hpp"
 #include <ROOT/RResultPtr.hxx>
diff --git a/narf/include/sparse_histogram.hpp b/narf/include/sparse_histogram.hpp
new file mode 100644
index 0000000..a2e5d20
--- /dev/null
+++ b/narf/include/sparse_histogram.hpp
@@ -0,0 +1,202 @@
+#ifndef NARF_SPARSE_HISTOGRAM_HPP
+#define NARF_SPARSE_HISTOGRAM_HPP
+
+#include <atomic>
+#include <cstddef>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <boost/histogram.hpp>
+#include <boost/histogram/unsafe_access.hpp>
+
+#include "atomic_adaptor.hpp"
+#include "concurrent_flat_map.hpp"
+
+namespace narf {
+
+// A boost::histogram-compatible Storage backed by narf::concurrent_flat_map.
+//
+// Bins are addressed by their linearized index (the same scheme boost
+// histogram itself uses for dense storages). Only bins that have been touched
+// by a fill consume memory; the underlying lock-free map allows concurrent
+// fills from many threads.
+//
+// Iteration via the standard begin()/end() interface walks every virtual bin
+// (including never-touched bins, which materialize on access via the map's
+// emplace path). For sparse traversals prefer iterating data() directly with
+// for_each.
+//
+// Conversion of histograms using this storage to python hist.Hist objects is
+// not supported in the current implementation.
+template <typename T = narf::atomic_adaptor<double>>
+class concurrent_sparse_storage {
+public:
+  using value_type      = T;
+  using reference       = T&;
+  using const_reference = const T&;
+  using map_type        = concurrent_flat_map<std::size_t, T>;
+
+  static constexpr bool has_threading_support = true;
+
+  concurrent_sparse_storage() = default;
+  explicit concurrent_sparse_storage(double fill_fraction)
+    : fill_fraction_(fill_fraction) {}
+  concurrent_sparse_storage(concurrent_sparse_storage&&) = default;
+  concurrent_sparse_storage& operator=(concurrent_sparse_storage&&) = default;
+  concurrent_sparse_storage(const concurrent_sparse_storage&) = delete;
+  concurrent_sparse_storage& operator=(const concurrent_sparse_storage&) = delete;
+
+  // boost::histogram::histogram calls reset() with the total number of bins
+  // (including under/overflow) on construction and after axis growth. The map
+  // is replaced by a fresh one pre-sized to fill_fraction * n (at least 1).
+  void reset(std::size_t n) {
+    size_ = n;
+    const double cap_d = fill_fraction_ * static_cast<double>(n);
+    std::size_t cap = cap_d > 1.0 ? static_cast<std::size_t>(cap_d) : 1;
+    map_ = map_type{cap};
+  }
+
+  double fill_fraction() const noexcept { return fill_fraction_; }
+
+  std::size_t size() const noexcept { return size_; }
+
+  // Insert-on-access; safe for concurrent fills.
+  reference operator[](std::size_t i) {
+    return *map_.emplace(i).first;
+  }
+
+  const_reference operator[](std::size_t i) const {
+    if (auto* p = map_.find(i)) return *p;
+    // Not yet populated: materialize a default-constructed cell so every index
+    // in [0, size()) is addressable, matching the dense_storage contract.
+    return *const_cast<map_type&>(map_).emplace(i).first;
+  }
+
+  // Required by the boost::histogram storage concept; not used by our fill
+  // path. Storages are equal iff they have the same logical size and the same
+  // populated entries (each map is checked against the other).
+  bool operator==(const concurrent_sparse_storage& other) const {
+    bool eq = size_ == other.size_;
+    const auto check = [&eq](const map_type& a, const map_type& b) {
+      a.for_each([&](std::size_t k, const T& v) {
+        auto* p = eq ? const_cast<map_type&>(b).find(k) : nullptr;
+        if (!p || !(*p == v)) eq = false;
+      });
+    };
+    if (eq) check(map_, other.map_);
+    if (eq) check(other.map_, map_);  // catches entries present only in other
+    return eq;
+  }
+
+  // Random-access iterator over the full virtual bin range. Dereferencing
+  // materializes the bin (via operator[]). boost::histogram's fill path
+  // requires random-access semantics so it can do `begin() + idx`.
+  class iterator {
+  public:
+    using iterator_category = std::random_access_iterator_tag;
+    using value_type        = T;
+    using reference         = T&;
+    using pointer           = T*;
+    using difference_type   = std::ptrdiff_t;
+
+    iterator() = default;
+    iterator(concurrent_sparse_storage* s, std::size_t i) : s_(s), i_(i) {}
+
+    reference operator*()  const { return (*s_)[i_]; }
+    reference operator[](difference_type n) const {
+      return (*s_)[i_ + static_cast<std::size_t>(n)];
+    }
+
+    iterator& operator++() { ++i_; return *this; }
+    iterator  operator++(int) { auto t = *this; ++i_; return t; }
+    iterator& operator--() { --i_; return *this; }
+    iterator  operator--(int) { auto t = *this; --i_; return t; }
+
+    iterator& operator+=(difference_type n) {
+      i_ = static_cast<std::size_t>(static_cast<difference_type>(i_) + n);
+      return *this;
+    }
+    iterator& operator-=(difference_type n) { return *this += -n; }
+    iterator  operator+(difference_type n) const { auto t = *this; t += n; return t; }
+    iterator  operator-(difference_type n) const { auto t = *this; t -= n; return t; }
+    friend iterator operator+(difference_type n, iterator it) { return it + n; }
+    difference_type operator-(const iterator& o) const {
+      return static_cast<difference_type>(i_) - static_cast<difference_type>(o.i_);
+    }
+
+    bool operator==(const iterator& o) const { return i_ == o.i_; }
+    bool operator!=(const iterator& o) const { return i_ != o.i_; }
+    bool operator< (const iterator& o) const { return i_ <  o.i_; }
+    bool operator<=(const iterator& o) const { return i_ <= o.i_; }
+    bool operator> (const iterator& o) const { return i_ >  o.i_; }
+    bool operator>=(const iterator& o) const { return i_ >= o.i_; }
+
+  private:
+    concurrent_sparse_storage* s_ = nullptr;
+    std::size_t i_ = 0;
+  };
+  using const_iterator = iterator;
+
+  iterator begin() { return iterator(this, 0); }
+  iterator end()   { return iterator(this, size_); }
+  const_iterator begin() const {
+    return iterator(const_cast<concurrent_sparse_storage*>(this), 0);
+  }
+  const_iterator end() const {
+    return iterator(const_cast<concurrent_sparse_storage*>(this), size_);
+  }
+
+  map_type& data() noexcept { return map_; }
+  const map_type& data() const noexcept { return map_; }
+
+private:
+  double      fill_fraction_ = 1.0;
+  std::size_t size_ = 0;
+  map_type    map_;
+};
+
+// Factory: wraps the supplied axes in a boost::histogram::histogram whose
+// storage is a concurrent_sparse_storage<T> built from `fill_fraction`.
+template <typename T, typename... Axes>
+boost::histogram::histogram<std::tuple<std::decay_t<Axes>...>,
+                            concurrent_sparse_storage<T>>
+make_histogram_sparse(double fill_fraction, Axes&&... axes) {
+  concurrent_sparse_storage<T> storage{fill_fraction};
+  return boost::histogram::make_histogram_with(
+      std::move(storage), std::forward<Axes>(axes)...);
+}
+
+// Returns a mutable reference to the concurrent_flat_map held by the storage
+// of a sparse-storage boost histogram. A free function because cppyy does
+// not expose boost::histogram::histogram::storage_ directly.
+template <typename Axes, typename T>
+typename concurrent_sparse_storage<T>::map_type& sparse_histogram_data(
+    boost::histogram::histogram<Axes, concurrent_sparse_storage<T>>& h) {
+  auto& storage = boost::histogram::unsafe_access::storage(h);
+  return storage.data();
+}
+
+// Copies every populated bin of a sparse-storage boost histogram into a
+// vector of (linearized_index, value) pairs, with each value cast to double.
+// Meant for python, where handing a python callable to for_each is awkward.
+template <typename Axes, typename T>
+std::vector<std::pair<std::size_t, double>> sparse_histogram_snapshot(
+    boost::histogram::histogram<Axes, concurrent_sparse_storage<T>>& h) {
+  auto& cells = boost::histogram::unsafe_access::storage(h).data();
+  std::vector<std::pair<std::size_t, double>> entries;
+  entries.reserve(cells.size());
+  cells.for_each([&entries](std::size_t idx, const T& cell) {
+    if constexpr (requires { cell.load(); }) {
+      entries.emplace_back(idx, static_cast<double>(cell.load()));
+    } else {
+      entries.emplace_back(idx, static_cast<double>(cell));
+    }
+  });
+  return entries;
+}
+
+} // namespace narf
+
+#endif // NARF_SPARSE_HISTOGRAM_HPP
diff --git a/test/testsparsehistogram.py b/test/testsparsehistogram.py
new file mode 100644
index 0000000..6fbf4bf
--- /dev/null
+++ b/test/testsparsehistogram.py
@@ -0,0 +1,31 @@
+import ROOT
+import boost_histogram as bh
+import narf
+import narf.histutils
+
+ROOT.ROOT.EnableImplicitMT(4)
+
+N = 20000
+df = ROOT.RDataFrame(N)
+df = df.Define("x", "double((rdfentry_ % 20) + 0.5)")  # values 0.5, 1.5, ..., 19.5
+df = df.Define("w", "1.0")
+
+ax = bh.axis.Regular(20, 0.0, 20.0)
+res = df.HistoBoost("hsparse", [ax], ["x", "w"], storage=narf.histutils.SparseStorage(fill_fraction=1.0))
+hfill = res.GetValue()
+
+expected_per_bin = N // 20
+
+# Snapshot populated bins (linearized index, value).
+got = {int(k): float(v) for k, v in ROOT.narf.sparse_histogram_snapshot(hfill)}
+
+print(f"populated bins: {len(got)}")
+assert len(got) == 20, f"expected 20 populated bins, got {len(got)}"
+
+# boost::histogram linearized layout for a single regular axis with under/overflow:
+#   linearized = boost_index + 1, real bins occupy keys 1..20.
+for b in range(1, 21):
+    assert b in got, f"missing bin {b}"
+    assert abs(got[b] - expected_per_bin) < 1e-9, f"bin {b}: {got[b]} != {expected_per_bin}"
+
+print("testSparseHistogram OK")

From 57e440aed8810c73d0b5af151b84a97b858d114d Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Tue, 7 Apr 2026 02:21:03 +0200
Subject: [PATCH 15/25] HistoBoost SparseStorage: convert result to
 wums.SparseHist

The SparseStorage path now lazily converts the underlying C++
sparse histogram to a wums.sparse_hist.SparseHist on first
dereference, snapshotting the concurrent_flat_map into flat
indices/values that match the with-flow row-major layout.
Pass convert_to_hist=False to get the raw RResultPtr instead.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 narf/histutils.py           | 40 +++++++++++++++++++++++++++++++++++--
 test/testsparsehistogram.py | 22 +++++++++-----------
 2 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/narf/histutils.py b/narf/histutils.py
index be17680..8127832 100644
--- a/narf/histutils.py
+++ b/narf/histutils.py
@@ -179,8 +179,8 @@ def _histo_boost(df, name, axes, cols, storage = bh.storage.Weight(), force_atom
     if force_atomic is None:
         force_atomic = ROOT.ROOT.IsImplicitMTEnabled()
 
-    # Sparse storage path: build a narf::sparse_histogram and skip the python
-    # hist.Hist conversion entirely.
+    # Sparse storage path: build a narf::sparse_histogram backed by a
+    # concurrent_flat_map. The result is exposed as a wums.SparseHist.
     if isinstance(storage, SparseStorage):
         if tensor_axes is not None:
             raise NotImplementedError("Tensor weights are not supported with SparseStorage")
@@ -194,6 +194,42 @@ def _histo_boost(df, name, axes, cols, storage = bh.storage.Weight(), force_atom
         helper = ROOT.narf.FillBoostHelperAtomic[type(hfill)](ROOT.std.move(hfill))
         targs = tuple([type(df), type(helper)] + coltypes)
         res = ROOT.narf.book_helper[targs](df, ROOT.std.move(helper), cols)
+
+        if not convert_to_hist:
+            return res
+
+        # Lazily convert the underlying C++ sparse histogram to a wums.SparseHist
+        # the first time the result is dereferenced.
+        from wums.sparse_hist import SparseHist
+
+        res._GetPtr = res.GetPtr
+        res._sparse_hist = None
+        python_axes_sparse = list(axes)
+
+        def _build_sparse():
+            if res._sparse_hist is not None:
+                return res._sparse_hist
+            cpp_hist = res._GetPtr()
+            snapshot = ROOT.narf.sparse_histogram_snapshot(cpp_hist)
+            n = len(snapshot)
+            flat = np.empty(n, dtype=np.int64)
+            vals = np.empty(n, dtype=np.float64)
+            for i, kv in enumerate(snapshot):
+                flat[i] = int(kv.first)
+                vals[i] = float(kv.second)
+            size = 1
+            for ax in python_axes_sparse:
+                size *= int(ax.extent)
+            res._sparse_hist = SparseHist._from_flat(flat, vals, python_axes_sparse, size)
+            return res._sparse_hist
+
+        ret_null = lambda: None
+        res.__deref__ = _build_sparse
+        res.__follow__ = _build_sparse
+        res.begin = ret_null
+        res.end = ret_null
+        res.GetPtr = _build_sparse
+        res.GetValue = _build_sparse
         return res
 
     #TODO some of this code can be shared with root histogram version
diff --git a/test/testsparsehistogram.py b/test/testsparsehistogram.py
index 6fbf4bf..cf8a143 100644
--- a/test/testsparsehistogram.py
+++ b/test/testsparsehistogram.py
@@ -1,7 +1,9 @@
 import ROOT
+import numpy as np
 import boost_histogram as bh
 import narf
 import narf.histutils
+from wums.sparse_hist import SparseHist
 
 ROOT.ROOT.EnableImplicitMT(4)
 
@@ -12,20 +14,14 @@
 
 ax = bh.axis.Regular(20, 0.0, 20.0)
 res = df.HistoBoost("hsparse", [ax], ["x", "w"], storage=narf.histutils.SparseStorage(fill_fraction=1.0))
-hfill = res.GetValue()
+sh = res.GetValue()
 
-expected_per_bin = N // 20
+assert isinstance(sh, SparseHist), f"expected SparseHist, got {type(sh).__name__}"
+assert sh.nnz == 20, f"expected 20 populated bins, got {sh.nnz}"
 
-# Snapshot populated bins (linearized index, value).
-got = {int(k): float(v) for k, v in ROOT.narf.sparse_histogram_snapshot(hfill)}
-
-print(f"populated bins: {len(got)}")
-assert len(got) == 20, f"expected 20 populated bins, got {len(got)}"
-
-# boost::histogram linearized layout for a single regular axis with under/overflow:
-#   linearized = boost_index + 1, real bins occupy keys 1..20.
-for b in range(1, 21):
-    assert b in got, f"missing bin {b}"
-    assert abs(got[b] - expected_per_bin) < 1e-9, f"bin {b}: {got[b]} != {expected_per_bin}"
+# Dense round-trip with flow gives a (22,) array; with flow=False a (20,) array.
+dense_noflow = sh.toarray(flow=False)
+expected = np.full(20, N // 20, dtype=np.float64)
+assert np.allclose(dense_noflow, expected), f"mismatch: {dense_noflow} vs {expected}"
 
 print("testSparseHistogram OK")

From 97b21536e7decd73ec818f359fcb394219104fdc Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Tue, 7 Apr 2026 03:15:53 +0200
Subject: [PATCH 16/25] concurrent_flat_map: serialize segment growth via
 sentinel

Previously every thread that observed a saturated tail segment
speculatively allocated a doubled-size successor and then either
won the CAS or freed it. Under high thread contention this caused
a transient memory spike of M_threads * segment_size per growth
event, easily inflating peak RSS by an order of magnitude for
multi-GB segments and potentially fragmenting the address space.

ensure_next now CAS-publishes a "growing" sentinel into the
segment's next pointer before allocating; only the winning thread
performs the allocation while losers yield-spin until the real
successor is published. All segment walks use a new observed_next
helper that treats the sentinel as "no successor yet".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 narf/include/concurrent_flat_map.hpp | 62 +++++++++++++++++++++-------
 1 file changed, 47 insertions(+), 15 deletions(-)

diff --git a/narf/include/concurrent_flat_map.hpp b/narf/include/concurrent_flat_map.hpp
index a022543..e1c0cd2 100644
--- a/narf/include/concurrent_flat_map.hpp
+++ b/narf/include/concurrent_flat_map.hpp
@@ -8,6 +8,7 @@
 #include <functional>
 #include <memory>
 #include <new>
+#include <thread>
 #include <type_traits>
 #include <utility>
 
@@ -95,6 +96,19 @@ class concurrent_flat_map {
     return c;
   }
 
+  // Sentinel published in `Segment::next` while a thread is allocating the
+  // successor segment. Distinct from both nullptr and any valid pointer.
+  static Segment* growing_sentinel() noexcept {
+    return reinterpret_cast<Segment*>(static_cast<std::uintptr_t>(1));
+  }
+
+  // Returns the published successor of `s`, or nullptr if there isn't one
+  // yet (or if a growth is in progress and not yet visible to readers).
+  static Segment* observed_next(const Segment* s) noexcept {
+    Segment* n = s->next.load(std::memory_order_acquire);
+    return (n == growing_sentinel()) ? nullptr : n;
+  }
+
   // Spin until the slot's busy bit clears, then return the stable key.
   static UKey wait_not_busy(Slot& slot) noexcept {
     UKey k = slot.key.load(std::memory_order_acquire);
@@ -104,22 +118,34 @@ class concurrent_flat_map {
     return k;
   }
 
-  // Allocate (or observe) the segment that follows `current`. Multiple threads
-  // racing here will agree on a single winning segment; losers free their
-  // speculative allocation.
+  // Allocate (or observe) the segment that follows `current`. Only the thread
+  // that wins the right to grow performs the allocation; other threads spin
+  // briefly on a sentinel until the new segment is published. This avoids the
+  // thundering-herd of speculative allocations that doubling-segment growth
+  // would otherwise produce under high thread contention.
   Segment* ensure_next(Segment* current) {
     Segment* next = current->next.load(std::memory_order_acquire);
-    if (next) return next;
-    auto* fresh = new Segment(current->capacity * 2);
+    if (next && next != growing_sentinel()) return next;
+
     Segment* expected = nullptr;
     if (current->next.compare_exchange_strong(
-            expected, fresh,
+            expected, growing_sentinel(),
             std::memory_order_acq_rel, std::memory_order_acquire)) {
+      // Won the race: this thread is the sole allocator.
+      Segment* fresh = nullptr;
+      try {
+        fresh = new Segment(current->capacity * 2);
+      } catch (...) {
+        current->next.store(nullptr, std::memory_order_release);
+        throw;
+      }
+      current->next.store(fresh, std::memory_order_release);
+
       // Best-effort tail advance so future inserters skip filled segments.
       Segment* t = tail_.load(std::memory_order_acquire);
       while (true) {
         Segment* tn = t->next.load(std::memory_order_acquire);
-        if (!tn) break;
+        if (!tn || tn == growing_sentinel()) break;
         if (tail_.compare_exchange_weak(t, tn,
                                         std::memory_order_acq_rel,
                                         std::memory_order_acquire)) {
@@ -128,8 +154,14 @@ class concurrent_flat_map {
       }
       return fresh;
     }
-    delete fresh;
-    return expected;
+
+    // Lost the race: a growth is in flight (sentinel) or already published.
+    // NOTE(review): after a failed allocation (next reset to nullptr) this only waits, never retries.
+    while (true) {
+      Segment* obs = current->next.load(std::memory_order_acquire);
+      if (obs && obs != growing_sentinel()) return obs;
+      std::this_thread::yield();
+    }
   }
 
   // Search a single segment for `target`. Returns pointer to value or nullptr.
@@ -231,7 +263,7 @@ class concurrent_flat_map {
     const UKey target  = encode(key);
     const std::size_t h = hash_(key);
     for (Segment* seg = head_; seg;
-         seg = seg->next.load(std::memory_order_acquire)) {
+         seg = observed_next(seg)) {
       if (Value* v = find_in(seg, h, target)) return v;
     }
     return nullptr;
@@ -252,7 +284,7 @@ class concurrent_flat_map {
 
     // Look in every existing segment first to honour insert-once semantics.
     for (Segment* seg = head_; seg;
-         seg = seg->next.load(std::memory_order_acquire)) {
+         seg = observed_next(seg)) {
       if (Value* v = find_in(seg, h, target)) return {v, false};
     }
 
@@ -266,9 +298,9 @@ class concurrent_flat_map {
       // This probe sequence is saturated in `seg`; another thread may have
       // already inserted the same key into a later segment, so re-check
       // everything past `seg` before allocating.
-      Segment* next = seg->next.load(std::memory_order_acquire);
+      Segment* next = observed_next(seg);
       if (!next) next = ensure_next(seg);
-      for (Segment* s = next; s; s = s->next.load(std::memory_order_acquire)) {
+      for (Segment* s = next; s; s = observed_next(s)) {
         if (Value* v = find_in(s, h, target)) return {v, false};
       }
       seg = next;
@@ -288,7 +320,7 @@ class concurrent_flat_map {
   std::size_t size() const noexcept {
     std::size_t total = 0;
     for (Segment* seg = head_; seg;
-         seg = seg->next.load(std::memory_order_acquire)) {
+         seg = observed_next(seg)) {
       total += seg->size.load(std::memory_order_acquire);
     }
     return total;
@@ -301,7 +333,7 @@ class concurrent_flat_map {
   template <typename F>
   void for_each(F&& f) {
     for (Segment* seg = head_; seg;
-         seg = seg->next.load(std::memory_order_acquire)) {
+         seg = observed_next(seg)) {
       for (std::size_t i = 0; i < seg->capacity; ++i) {
         Slot& slot = seg->slots[i];
         UKey k = slot.key.load(std::memory_order_acquire);

From 85a3adbb0cf798f4b9067036e2374ac2ccb1c9cd Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Tue, 7 Apr 2026 13:04:59 +0200
Subject: [PATCH 17/25] SparseStorage: fix ND linearization mismatch with
 SparseHist

boost::histogram linearizes axes column-major (leftmost axis has
stride 1), but wums.SparseHist expects numpy row-major flat
indices. For ND histograms this caused entries to land in the
wrong bins (often flow bins) and silently disappear from
toarray(flow=False); 1D was unaffected and so the existing test
did not catch it.

The conversion now un-ravels each boost-linear key under F order
and re-ravels under C order before constructing the SparseHist.
Adds a 3D test that cross-checks against a dense HistoBoost.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 narf/histutils.py           | 17 ++++++++++++-----
 test/testsparsehistogram.py | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/narf/histutils.py b/narf/histutils.py
index 8127832..b73aec9 100644
--- a/narf/histutils.py
+++ b/narf/histutils.py
@@ -212,14 +212,21 @@ def _build_sparse():
             cpp_hist = res._GetPtr()
             snapshot = ROOT.narf.sparse_histogram_snapshot(cpp_hist)
             n = len(snapshot)
-            flat = np.empty(n, dtype=np.int64)
+            boost_flat = np.empty(n, dtype=np.int64)
             vals = np.empty(n, dtype=np.float64)
             for i, kv in enumerate(snapshot):
-                flat[i] = int(kv.first)
+                boost_flat[i] = int(kv.first)
                 vals[i] = float(kv.second)
-            size = 1
-            for ax in python_axes_sparse:
-                size *= int(ax.extent)
+            extents = tuple(int(ax.extent) for ax in python_axes_sparse)
+            size = int(np.prod(extents)) if extents else 1
+            # boost::histogram linearizes column-major (leftmost axis = stride 1),
+            # but wums.SparseHist expects numpy row-major (C order). Remap by
+            # un-raveling under F order and re-raveling under C order.
+            if n and len(extents) > 1:
+                multi = np.unravel_index(boost_flat, extents, order="F")
+                flat = np.ravel_multi_index(multi, extents, order="C").astype(np.int64)
+            else:
+                flat = boost_flat
             res._sparse_hist = SparseHist._from_flat(flat, vals, python_axes_sparse, size)
             return res._sparse_hist
 
diff --git a/test/testsparsehistogram.py b/test/testsparsehistogram.py
index cf8a143..8b3c7a4 100644
--- a/test/testsparsehistogram.py
+++ b/test/testsparsehistogram.py
@@ -24,4 +24,40 @@
 expected = np.full(20, N // 20, dtype=np.float64)
 assert np.allclose(dense_noflow, expected), f"mismatch: {dense_noflow} vs {expected}"
 
+
+# ---- 3D fill: verify ND linearization ----
+N3 = 24000
+df3 = ROOT.RDataFrame(N3)
+df3 = df3.Define("x", "double(rdfentry_ % 3)")        # 0, 1, 2
+df3 = df3.Define("y", "double((rdfentry_ / 3) % 4)")  # 0..3
+df3 = df3.Define("z", "double((rdfentry_ / 12) % 5)") # 0..4
+df3 = df3.Define("w", "1.0")
+
+ax_x = bh.axis.Regular(3, 0.0, 3.0, underflow=False, overflow=False)
+ax_y = bh.axis.Regular(4, 0.0, 4.0, underflow=False, overflow=False)
+ax_z = bh.axis.Regular(5, 0.0, 5.0, underflow=False, overflow=False)
+
+res3 = df3.HistoBoost(
+    "h3d",
+    [ax_x, ax_y, ax_z],
+    ["x", "y", "z", "w"],
+    storage=narf.histutils.SparseStorage(fill_fraction=1.0),
+)
+sh3 = res3.GetValue()
+
+assert isinstance(sh3, SparseHist)
+arr3 = sh3.toarray(flow=False)
+assert arr3.shape == (3, 4, 5)
+# Each (x,y,z) cell receives N3 / (3*4*5) = 400 entries.
+assert np.allclose(arr3, np.full((3, 4, 5), N3 // (3 * 4 * 5)))
+
+# Cross-check the same data through a dense fill.
+df3_dense = ROOT.RDataFrame(N3)
+df3_dense = df3_dense.Define("x", "double(rdfentry_ % 3)")
+df3_dense = df3_dense.Define("y", "double((rdfentry_ / 3) % 4)")
+df3_dense = df3_dense.Define("z", "double((rdfentry_ / 12) % 5)")
+df3_dense = df3_dense.Define("w", "1.0")
+hd = df3_dense.HistoBoost("h3d_dense", [ax_x, ax_y, ax_z], ["x", "y", "z", "w"]).GetValue()
+assert np.allclose(arr3, hd.values())
+
 print("testSparseHistogram OK")

From a2d9d74cedb471dc8f34f29787d4f834701527b4 Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Tue, 7 Apr 2026 14:39:09 +0200
Subject: [PATCH 18/25] SparseMatrixAtomic: configurable fill_fraction

Replace the hard-coded size0*size1/40 initial capacity with a
fill_fraction constructor argument (default 0.025 to match the
previous behaviour) that sizes the underlying concurrent_flat_map
to fill_fraction * size0 * size1 entries.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 narf/include/matrix_utils.hpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/narf/include/matrix_utils.hpp b/narf/include/matrix_utils.hpp
index f777b93..8a74729 100644
--- a/narf/include/matrix_utils.hpp
+++ b/narf/include/matrix_utils.hpp
@@ -86,9 +86,12 @@ class SparseMatrixAtomic {
 public:
   using map_type = narf::concurrent_flat_map<std::size_t, std::atomic<double>>;
 
-  SparseMatrixAtomic(std::size_t size0, std::size_t size1)
+  SparseMatrixAtomic(std::size_t size0, std::size_t size1,
+                     double fill_fraction = 0.025)
     : size0_(size0), size1_(size1),
-      data_(std::max<std::size_t>(size0 * size1 / 40, 16)) {}
+      data_(std::max<std::size_t>(
+          static_cast<std::size_t>(fill_fraction * static_cast<double>(size0 * size1)),
+          16)) {}
 
   std::atomic<double> &operator() (std::size_t idx0, std::size_t idx1) {
     const std::size_t i = globalidx(idx0, idx1);

From 07dc6960b657c37d0f5d3aea5a7ac6304b173e77 Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Tue, 7 Apr 2026 15:54:48 +0200
Subject: [PATCH 19/25] HistShiftHelper: guard against non-finite bin geometry

Treat continuous-axis bins with infinite width or center as flow bins
and return zero correction, preventing NaN propagation when an axis
uses np.inf as a bin edge.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 narf/include/histutils.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/narf/include/histutils.hpp b/narf/include/histutils.hpp
index 12461c3..9ceba50 100644
--- a/narf/include/histutils.hpp
+++ b/narf/include/histutils.hpp
@@ -807,9 +807,11 @@ namespace narf {
         // will probably have to return u,d,s individually with more complicated logic
         // in the calling layer to do the matrix multiplication
 
-        // Underflow / overflow: no reliable bin geometry, return no correction.
+        // Underflow / overflow or degenerate bin geometry (e.g. infinite bin
+        // edge): no reliable bin geometry, return no correction.
         // (note that a is infinity in this case such that delta and v are also zero)
-        const bool flow = bin_idx < 0 || bin_idx >= ax.size();
+        const bool degenerate = !std::isfinite(a) || !std::isfinite(x_c);
+        const bool flow = bin_idx < 0 || bin_idx >= ax.size() || degenerate;
 
         auto const u = flow ? 0.*x_orig : (x_orig - x_c)/a;
         auto const delta = (x_shifted - x_orig)/a;

From 1e280c5f6c0afd5fff58f10ce4470b568384e665 Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Fri, 10 Apr 2026 00:59:55 +0200
Subject: [PATCH 20/25] Add MapWrapper helper for element-wise application over
 container args

MapWrapper wraps an arbitrary callable so that, when invoked, any
argument satisfying narf::is_container is zipped element-wise (with
scalar arguments broadcast via make_view) and the callable is applied
to each resulting tuple via std::apply. If none of the arguments are
containers, the callable is invoked directly with the arguments as-is.

Also provide a forwarding constructor so the wrapped callable can be
constructed in place from MapWrapper's own constructor arguments, and
add a unit test exercising both the container and scalar-passthrough
code paths.
---
 narf/include/rdfutils.hpp | 32 ++++++++++++++++++++++++++++++++
 narf/include/tests.hpp    | 23 +++++++++++++++++++++++
 test/testmapwrapper.py    |  6 ++++++
 3 files changed, 61 insertions(+)
 create mode 100644 test/testmapwrapper.py

diff --git a/narf/include/rdfutils.hpp b/narf/include/rdfutils.hpp
index 3ea29ef..bf642f1 100755
--- a/narf/include/rdfutils.hpp
+++ b/narf/include/rdfutils.hpp
@@ -1,5 +1,7 @@
 #pragma once
 
+#include "utils.hpp"
+
 namespace narf {
 
     template<typename Callable, typename... Args>
@@ -19,4 +21,34 @@ namespace narf {
             return callable_(args...);
         }
     };
+
+    template<typename Callable>
+    class MapWrapper {
+
+    private:
+        Callable callable_;
+    public:
+
+        MapWrapper(const Callable &callable) : callable_(callable) {}
+        MapWrapper(Callable &&callable) : callable_(std::move(callable)) {}
+
+        template<typename... CArgs>
+            requires (sizeof...(CArgs) != 1 || (!std::is_same_v<std::decay_t<CArgs>, MapWrapper> && ...)) &&
+                     std::is_constructible_v<Callable, CArgs&&...>
+        explicit MapWrapper(CArgs&&... cargs) : callable_(std::forward<CArgs>(cargs)...) {}
+
+        template<typename... Args>
+        auto operator() (const Args&... args) {
+            if constexpr ((is_container<Args> || ...)) {
+                auto apply_elem = [this](auto const &elem_tuple) {
+                    return std::apply(callable_, elem_tuple);
+                };
+                auto const res_view = make_zip_view(make_view(args)...) | std::views::transform(apply_elem);
+                return range_to(res_view);
+            } else {
+                return callable_(args...);
+            }
+        }
+    };
+
 }
diff --git a/narf/include/tests.hpp b/narf/include/tests.hpp
index dd9aea2..804a567 100644
--- a/narf/include/tests.hpp
+++ b/narf/include/tests.hpp
@@ -3,6 +3,7 @@
 #include "concurrent_flat_map.hpp"
 #include "histutils.hpp"
 #include "matrix_utils.hpp"
+#include "rdfutils.hpp"
 
 #include <atomic>
 #include <cmath>
@@ -249,4 +250,26 @@ namespace narf {
 
     return true;
   }
+
+  // Test MapWrapper: constructs a wrapper over a simple callable and applies
+  // it over zipped input ranges.
+  bool testMapWrapper() {
+    auto add = [](int a, int b) { return a + b; };
+    MapWrapper<decltype(add)> wrapper(add);
+
+    // Scalar passthrough: no container args -> callable invoked directly.
+    auto add_scalar = [](int x, int y) { return x + y; };
+    MapWrapper<decltype(add_scalar)> scalar_wrapper(add_scalar);
+    if (scalar_wrapper(2, 5) != 7) return false;
+
+    std::vector<int> a{1, 2, 3, 4};
+    std::vector<int> b{10, 20, 30, 40};
+    auto res = wrapper(a, b);
+    if (res.size() != 4) return false;
+    if (res[0] != 11) return false;
+    if (res[1] != 22) return false;
+    if (res[2] != 33) return false;
+    if (res[3] != 44) return false;
+    return true;
+  }
 }
diff --git a/test/testmapwrapper.py b/test/testmapwrapper.py
new file mode 100644
index 0000000..4e31b02
--- /dev/null
+++ b/test/testmapwrapper.py
@@ -0,0 +1,6 @@
+import ROOT
+import narf.tests
+
+res = ROOT.narf.testMapWrapper()
+print(f"testMapWrapper: {res}")
+assert res, "testMapWrapper failed"

From 2a199e9196574f3036cfedafd03002dd8cf1d545 Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Fri, 10 Apr 2026 01:00:07 +0200
Subject: [PATCH 21/25] HistShiftHelper: delegate container broadcasting to
 MapWrapper

Rename the core class to HistShiftHelperImpl and drop its is_container_any
branch, collapsing compute and compute_impl into a single scalar-only
implementation. HistShiftHelper is now a template alias for
MapWrapper<HistShiftHelperImpl<Axes...>>, which restores the previous
element-wise behavior for container arguments while keeping the per-event
code path untouched.
---
 narf/include/histutils.hpp | 48 +++++++++-----------------------------
 1 file changed, 11 insertions(+), 37 deletions(-)

diff --git a/narf/include/histutils.hpp b/narf/include/histutils.hpp
index 9ceba50..2aad2af 100644
--- a/narf/include/histutils.hpp
+++ b/narf/include/histutils.hpp
@@ -9,6 +9,7 @@
 #include "sparse_histogram.hpp"
 #include "tensorutils.hpp"
 #include "tensorevalutils.hpp"
+#include "rdfutils.hpp"
 #include <ROOT/RResultPtr.hxx>
 #include <ROOT/TThreadExecutor.hxx>
 #include <iostream>
@@ -671,10 +672,10 @@ namespace narf {
   /// summing over the contribution from each axis
 
   template<typename... Axes>
-  class HistShiftHelper {
+  class HistShiftHelperImpl {
   public:
-    HistShiftHelper(const Axes&... axes) : axes_(axes...) {}
-    HistShiftHelper(Axes&&... axes) : axes_(std::move(axes)...) {}
+    HistShiftHelperImpl(const Axes&... axes) : axes_(axes...) {}
+    HistShiftHelperImpl(Axes&&... axes) : axes_(std::move(axes)...) {}
 
     template <typename... Args>
     auto operator()(const Args&... args) const {
@@ -715,39 +716,6 @@ namespace narf {
     /// Core computation dispatched over axis indices.
     template <typename Nominal, typename Shifted, typename SmearShifted, typename Weight, std::size_t... Is>
     auto compute(const Nominal& orig,
-                   const Shifted& shifted,
-                   const SmearShifted& smear_shifted,
-                   const Weight& nominal_weight,
-                   std::index_sequence<Is...> idxs) const {
-
-      constexpr bool is_container_any = (is_container<std::tuple_element_t<Is, Nominal>> || ...);
-
-      if constexpr(is_container_any) {
-        auto make_range_of_tuples = [](auto const&...xs){ return make_zip_view(make_view(xs)...); };
-
-        auto const orig_v = std::apply(make_range_of_tuples, orig);
-        auto const shifted_v = std::apply(make_range_of_tuples, shifted);
-        auto const smear_shifted_v = std::apply(make_range_of_tuples, smear_shifted);
-
-        auto compute_elem_impl = [this, &nominal_weight, &idxs](auto &orig_elem, auto &shifted_elem, auto&smear_shifted_elem) {
-          return compute_impl(orig_elem, shifted_elem, smear_shifted_elem, nominal_weight, idxs);
-        };
-        auto compute_elem = [&compute_elem_impl](auto const &elem_tuple) { return std::apply(compute_elem_impl, elem_tuple); };
-
-        auto const res_view = make_zip_view(orig_v, shifted_v, smear_shifted_v)
-                              | std::views::transform(compute_elem);
-
-        auto const res = range_to<ROOT::VecOps::RVec>(res_view);
-        return res;
-      }
-      else {
-        return compute_impl(orig, shifted, smear_shifted, nominal_weight, idxs);
-      }
-    }
-
-    /// Core computation dispatched over axis indices.
-    template <typename Nominal, typename Shifted, typename SmearShifted, typename Weight, std::size_t... Is>
-    auto compute_impl(const Nominal& orig,
                  const Shifted& shifted,
                  const SmearShifted& smear_shifted,
                  const Weight& nominal_weight,
@@ -841,12 +809,18 @@ namespace narf {
     const std::tuple<Axes...> axes_;
   };
 
+  /// Public helper: MapWrapper around HistShiftHelperImpl so container
+  /// arguments are automatically broadcast/zipped element-wise, while scalar
+  /// arguments are passed through directly.
+  template<typename... Axes>
+  using HistShiftHelper = MapWrapper<HistShiftHelperImpl<Axes...>>;
+
   // factory function needed because CTAD doesn't work reliably from cppyy
   // also the trailing return type is needed because cppyy has issues with auto
   // return types
   template <typename... Axes>
   HistShiftHelper<std::decay_t<Axes>...> make_hist_shift_helper(Axes&&... axes) {
-    return HistShiftHelper(std::forward<Axes>(axes)...);
+    return HistShiftHelper<std::decay_t<Axes>...>(std::forward<Axes>(axes)...);
   }
 }
 

From 527cd1ed7dd5b6eedcc68b16446b096a05db12df Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Fri, 10 Apr 2026 01:05:33 +0200
Subject: [PATCH 22/25] QuantileHelper[Static]: delegate container broadcasting
 to MapWrapper

Rename the core classes to QuantileHelperImpl and QuantileHelperStaticImpl
and expose QuantileHelper / QuantileHelperStatic as MapWrapper template
aliases over them. This gives both helpers automatic element-wise
broadcasting over container arguments while leaving their existing
scalar call paths and factory/Python entry points source-compatible.

Also add a unit test exercising the scalar and RVec call paths of
QuantileHelperStatic.
---
 narf/include/histutils.hpp | 16 ++++++++++++----
 narf/include/tests.hpp     | 19 +++++++++++++++++++
 test/testquantilehelper.py |  6 ++++++
 3 files changed, 37 insertions(+), 4 deletions(-)
 create mode 100644 test/testquantilehelper.py

diff --git a/narf/include/histutils.hpp b/narf/include/histutils.hpp
index 2aad2af..d649a27 100644
--- a/narf/include/histutils.hpp
+++ b/narf/include/histutils.hpp
@@ -619,14 +619,14 @@ namespace narf {
   // The underlying histogram holds a tensor with the bin edges for the quantiles in the last variable,
   // conditional on all the previous variables
   template <typename Storage, typename... Axes>
-  class QuantileHelper : public HistHelper<Storage, Axes...> {
+  class QuantileHelperImpl : public HistHelper<Storage, Axes...> {
     using base_t = HistHelper<Storage, Axes...>;
     using hist_t = typename base_t::hist_t;
     using scalar_t = typename Storage::value_type::tensor_t::Scalar;
     static constexpr auto nquants = Storage::value_type::size;
 
   public:
-    QuantileHelper(hist_t &&resource) : base_t(std::forward<hist_t>(resource)) {}
+    QuantileHelperImpl(hist_t &&resource) : base_t(std::forward<hist_t>(resource)) {}
 
     boost::histogram::axis::index_type operator()(const boost::histogram::axis::traits::value_type<Axes>&... args, const scalar_t &last) const {
       auto const &hist = *base_t::resourceHist_;
@@ -639,6 +639,11 @@ namespace narf {
     }
   };
 
+  // MapWrapper alias so container arguments are automatically broadcast /
+  // zipped element-wise, while scalar arguments call through directly.
+  template <typename Storage, typename... Axes>
+  using QuantileHelper = MapWrapper<QuantileHelperImpl<Storage, Axes...>>;
+
   // CTAD doesn't work reliably from cppyy so add factory function
   template <typename Storage, typename... Axes>
   QuantileHelper<Storage, Axes...> make_quantile_helper(boost::histogram::histogram<std::tuple<Axes...>, Storage> &&h) {
@@ -648,11 +653,11 @@ namespace narf {
 
   // simple version for static quantiles
   template<std::size_t N>
-  class QuantileHelperStatic {
+  class QuantileHelperStaticImpl {
   public:
     using edge_t = std::array<double, N>;
 
-    QuantileHelperStatic(const edge_t &edges) : edges_(edges) {}
+    QuantileHelperStaticImpl(const edge_t &edges) : edges_(edges) {}
 
     boost::histogram::axis::index_type operator() (double val) const {
       // find the quantile bin corresponding to the last argument
@@ -665,6 +670,9 @@ namespace narf {
     const edge_t edges_;
   };
 
+  template<std::size_t N>
+  using QuantileHelperStatic = MapWrapper<QuantileHelperStaticImpl<N>>;
+
   /// Computes the minimum-variance reweighting to approximate a shift
   /// or smearing in the underlying variables of a multidimensional histogram.
   ///
diff --git a/narf/include/tests.hpp b/narf/include/tests.hpp
index 804a567..c47e31d 100644
--- a/narf/include/tests.hpp
+++ b/narf/include/tests.hpp
@@ -251,6 +251,25 @@ namespace narf {
     return true;
   }
 
+  // Test QuantileHelperStatic: edges {0.25, 0.5, 0.75, 1.0} map to bin indices 0..3.
+  // Exercise both scalar and container (RVec) call paths via MapWrapper.
+  bool testQuantileHelperStatic() {
+    QuantileHelperStatic<4> helper(std::array<double, 4>{0.25, 0.5, 0.75, 1.0});
+
+    if (helper(0.1) != 0) return false;
+    if (helper(0.25) != 1) return false;
+    if (helper(0.4) != 1) return false;
+    if (helper(0.6) != 2) return false;
+    if (helper(0.9) != 3) return false;
+
+    ROOT::VecOps::RVec<double> vals{0.1, 0.4, 0.6, 0.9};
+    auto res = helper(vals);
+    if (res.size() != 4) return false;
+    if (res[0] != 0 || res[1] != 1 || res[2] != 2 || res[3] != 3) return false;
+
+    return true;
+  }
+
   // Test MapWrapper: constructs a wrapper over a simple callable and applies
   // it over zipped input ranges.
   bool testMapWrapper() {
diff --git a/test/testquantilehelper.py b/test/testquantilehelper.py
new file mode 100644
index 0000000..3b18001
--- /dev/null
+++ b/test/testquantilehelper.py
@@ -0,0 +1,6 @@
+import ROOT
+import narf.tests
+
+res = ROOT.narf.testQuantileHelperStatic()
+print(f"testQuantileHelperStatic: {res}")
+assert res, "testQuantileHelperStatic failed"

From b1b0d7da07ebe73536e0fa7eb4e30642f714d212 Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Fri, 10 Apr 2026 01:31:13 +0200
Subject: [PATCH 23/25] QuantileHelper[Static]: add continuous CDF-style lookup
 mode

Thread a bool Continuous template parameter through QuantileHelperImpl
and QuantileHelperStaticImpl via a shared quantile_lookup helper. In
continuous mode the helpers return a double in [0, 1] obtained by
linearly interpolating between adjacent stored edges (edges[i] maps to
i/(N-1)), with values outside [edges[0], edges[N-1]] clamped to 0 / 1.

Expose QuantileHelperContinuous / QuantileHelperStaticContinuous aliases
and a make_quantile_helper_continuous factory. Extend the unit test to
cover the scalar and RVec continuous code paths.
---
 narf/include/histutils.hpp | 58 ++++++++++++++++++++++++++++----------
 narf/include/tests.hpp     | 21 ++++++++++++++
 2 files changed, 64 insertions(+), 15 deletions(-)

diff --git a/narf/include/histutils.hpp b/narf/include/histutils.hpp
index d649a27..c5928d4 100644
--- a/narf/include/histutils.hpp
+++ b/narf/include/histutils.hpp
@@ -618,7 +618,30 @@ namespace narf {
   // Helper which facilitates conversion from value to quantile for a single variable
   // The underlying histogram holds a tensor with the bin edges for the quantiles in the last variable,
   // conditional on all the previous variables
-  template <typename Storage, typename... Axes>
+  /// Look up the quantile for `val` in the sorted edge array [begin, end),
+  /// where N = end - begin. When Continuous is false, return the upper_bound
+  /// index clamped to [0, N-1]. When Continuous is true (requires N >= 2),
+  /// return a CDF-style double in [0, 1] obtained by linearly interpolating
+  /// between adjacent edges, with edges[i] mapping to i/(N-1) and values
+  /// outside [edges[0], edges[N-1]] clamped to 0 / 1 respectively.
+  template <bool Continuous, typename It, typename T>
+  auto quantile_lookup(It begin, It end, const T &val) {
+    const auto n = std::distance(begin, end);
+    auto const upper = std::upper_bound(begin, end, val);
+    auto const iquant = std::distance(begin, upper);
+    if constexpr (Continuous) {
+      auto const i = std::clamp<std::ptrdiff_t>(iquant - 1, 0, n - 2);
+      auto const lo = *(begin + i);
+      auto const hi = *(begin + i + 1);
+      double const frac = double(val - lo) / double(hi - lo);
+      double const res = (double(i) + frac) / double(n - 1);
+      return std::clamp(res, 0.0, 1.0);
+    } else {
+      return std::clamp<boost::histogram::axis::index_type>(iquant, 0, n - 1);
+    }
+  }
+
+  template <typename Storage, bool Continuous, typename... Axes>
   class QuantileHelperImpl : public HistHelper<Storage, Axes...> {
     using base_t = HistHelper<Storage, Axes...>;
     using hist_t = typename base_t::hist_t;
@@ -628,21 +651,20 @@ namespace narf {
   public:
     QuantileHelperImpl(hist_t &&resource) : base_t(std::forward<hist_t>(resource)) {}
 
-    boost::histogram::axis::index_type operator()(const boost::histogram::axis::traits::value_type<Axes>&... args, const scalar_t &last) const {
+    auto operator()(const boost::histogram::axis::traits::value_type<Axes>&... args, const scalar_t &last) const {
       auto const &hist = *base_t::resourceHist_;
       auto const &edges = narf::get_value(hist, args...).data();
-
-      // find the quantile bin corresponding to the last argument
-      auto const upper = std::upper_bound(edges.data(), edges.data()+nquants, last);
-      auto const iquant = std::distance(edges.data(), upper);
-      return std::clamp<boost::histogram::axis::index_type>(iquant, 0, nquants-1);
+      return quantile_lookup<Continuous>(edges.data(), edges.data() + nquants, last);
     }
   };
 
   // MapWrapper alias so container arguments are automatically broadcast /
   // zipped element-wise, while scalar arguments call through directly.
   template <typename Storage, typename... Axes>
-  using QuantileHelper = MapWrapper<QuantileHelperImpl<Storage, Axes...>>;
+  using QuantileHelper = MapWrapper<QuantileHelperImpl<Storage, false, Axes...>>;
+
+  template <typename Storage, typename... Axes>
+  using QuantileHelperContinuous = MapWrapper<QuantileHelperImpl<Storage, true, Axes...>>;
 
   // CTAD doesn't work reliably from cppyy so add factory function
   template <typename Storage, typename... Axes>
@@ -651,19 +673,22 @@ namespace narf {
     return QuantileHelper<Storage, Axes...>(std::forward<hist_t>(h));
   }
 
+  template <typename Storage, typename... Axes>
+  QuantileHelperContinuous<Storage, Axes...> make_quantile_helper_continuous(boost::histogram::histogram<std::tuple<Axes...>, Storage> &&h) {
+    using hist_t = boost::histogram::histogram<std::tuple<Axes...>, Storage>;
+    return QuantileHelperContinuous<Storage, Axes...>(std::forward<hist_t>(h));
+  }
+
   // simple version for static quantiles
-  template<std::size_t N>
+  template<std::size_t N, bool Continuous = false>
   class QuantileHelperStaticImpl {
   public:
     using edge_t = std::array<double, N>;
 
     QuantileHelperStaticImpl(const edge_t &edges) : edges_(edges) {}
 
-    boost::histogram::axis::index_type operator() (double val) const {
-      // find the quantile bin corresponding to the last argument
-      auto const upper = std::upper_bound(edges_.begin(), edges_.end(), val);
-      auto const iquant = std::distance(edges_.begin(), upper);
-      return std::clamp<boost::histogram::axis::index_type>(iquant, 0, N-1);
+    auto operator() (double val) const {
+      return quantile_lookup<Continuous>(edges_.begin(), edges_.end(), val);
     }
 
   private:
@@ -671,7 +696,10 @@ namespace narf {
   };
 
   template<std::size_t N>
-  using QuantileHelperStatic = MapWrapper<QuantileHelperStaticImpl<N>>;
+  using QuantileHelperStatic = MapWrapper<QuantileHelperStaticImpl<N, false>>;
+
+  template<std::size_t N>
+  using QuantileHelperStaticContinuous = MapWrapper<QuantileHelperStaticImpl<N, true>>;
 
   /// Computes the minimum-variance reweighting to approximate a shift
   /// or smearing in the underlying variables of a multidimensional histogram.
diff --git a/narf/include/tests.hpp b/narf/include/tests.hpp
index c47e31d..c88dc0d 100644
--- a/narf/include/tests.hpp
+++ b/narf/include/tests.hpp
@@ -267,6 +267,27 @@ namespace narf {
     if (res.size() != 4) return false;
     if (res[0] != 0 || res[1] != 1 || res[2] != 2 || res[3] != 3) return false;
 
+    // Continuous mode: CDF in [0, 1], edges[i] -> i/(N-1).
+    QuantileHelperStaticContinuous<4> helper_c(std::array<double, 4>{0.25, 0.5, 0.75, 1.0});
+    auto const eps = 1e-9;
+    if (std::abs(helper_c(0.25) - 0.0) > eps) return false;
+    if (std::abs(helper_c(0.5) - 1.0/3.0) > eps) return false;
+    if (std::abs(helper_c(1.0) - 1.0) > eps) return false;
+    // val below first edge clamps to 0
+    if (std::abs(helper_c(0.0) - 0.0) > eps) return false;
+    // val above last edge clamps to 1
+    if (std::abs(helper_c(2.0) - 1.0) > eps) return false;
+    // midpoint of first interval
+    if (std::abs(helper_c(0.375) - (1.0/6.0)) > eps) return false;
+
+    // Container path through MapWrapper
+    auto res_c = helper_c(ROOT::VecOps::RVec<double>{0.25, 0.5, 0.75, 1.0});
+    if (res_c.size() != 4) return false;
+    if (std::abs(res_c[0] - 0.0) > eps) return false;
+    if (std::abs(res_c[1] - 1.0/3.0) > eps) return false;
+    if (std::abs(res_c[2] - 2.0/3.0) > eps) return false;
+    if (std::abs(res_c[3] - 1.0) > eps) return false;
+
     return true;
   }
 

From f29f771acbdfaa04536168906adb2cd80af5e839 Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Fri, 10 Apr 2026 01:31:24 +0200
Subject: [PATCH 24/25] define_quantile_ints: support continuous quantile mode

Add a continuous option (default False) to build_quantile_hists. When
continuous=True, the original (Regular / Variable) quantile axes are
preserved in the returned helper histograms instead of being replaced
with Integer axes. define_quantile_ints auto-detects the mode from the
type of the trailing axis of the first helper histogram and, in
continuous mode, dispatches to the continuous quantile helpers, feeding
the resulting CDF-style columns (named _quant instead of _iquant) to
subsequent helpers in the chain.
---
 narf/histutils.py | 48 ++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/narf/histutils.py b/narf/histutils.py
index b73aec9..780b3e7 100644
--- a/narf/histutils.py
+++ b/narf/histutils.py
@@ -719,8 +719,15 @@ def pythonize_rdataframe(klass):
     klass.HistoNDWithBoost = _histond_with_boost
     klass.SumAndCount = _sum_and_count
 
-def build_quantile_hists(df, cols, condaxes, quantaxes):
-    """Build histograms which encode conditional quantiles for the provided variables, to be used with define_quantile_ints"""
+def build_quantile_hists(df, cols, condaxes, quantaxes, continuous=False):
+    """Build histograms which encode conditional quantiles for the provided variables, to be used with define_quantile_ints.
+
+    When ``continuous=True`` the original quantile axes are kept as-is in the
+    returned helper histograms (instead of being replaced by ``Integer`` axes).
+    ``define_quantile_ints`` detects this from the axis type and uses the
+    continuous quantile helpers, which return CDF-style values in ``[0, 1]``
+    obtained by linearly interpolating between the stored quantile edges.
+    """
 
     arraxes = condaxes + quantaxes
 
@@ -793,7 +800,13 @@ def build_quantile_hists(df, cols, condaxes, quantaxes):
 
             iquantax = iax - ncond
 
-            quantile_integer_axis = hist.axis.Integer(0, axis.size, underflow=False, overflow=False, name=f"{axis.name}_int")
+            if continuous:
+                # Keep the original (Regular / Variable) quantile axis so that
+                # subsequent helpers in the chain can be indexed by the
+                # continuous CDF-style output of the previous helper.
+                quantile_integer_axis = axis
+            else:
+                quantile_integer_axis = hist.axis.Integer(0, axis.size, underflow=False, overflow=False, name=f"{axis.name}_int")
             quantile_integer_axes.append(quantile_integer_axis)
 
             helper_axes = condaxes[:iax+1] + quantile_integer_axes
@@ -807,7 +820,14 @@ def build_quantile_hists(df, cols, condaxes, quantaxes):
 
 
 def define_quantile_ints(df, cols, quantile_hists):
-    """Define transformed columns corresponding to conditional quantile bins (integers)"""
+    """Define transformed columns corresponding to conditional quantiles.
+
+    By default the helpers return the integer quantile bin index. If the
+    helper histograms produced by :func:`build_quantile_hists` were built in
+    continuous mode (their trailing quantile axis is not a plain ``Integer``
+    axis), the continuous quantile helpers are used instead, returning a
+    CDF-style value in ``[0, 1]``.
+    """
 
     ncols = len(cols)
     nquant = len(quantile_hists)
@@ -816,22 +836,36 @@ def define_quantile_ints(df, cols, quantile_hists):
     condcols = cols[:ncond]
     quantcols = cols[ncond:]
 
+    # Detect continuous mode from the trailing (quantile) axis of the first
+    # helper histogram: continuous-mode helper histograms preserve the
+    # original (Regular / Variable) quantile axis, integer-mode ones use a
+    # generated Integer axis.
+    continuous = not isinstance(quantile_hists[0].axes[-1], hist.axis.Integer)
+
     helper_cols_cond = condcols.copy()
 
+    suffix = "_quant" if continuous else "_iquant"
+
     for col, quantile_hist in zip(quantcols, quantile_hists):
 
         if len(quantile_hist.axes) > 1:
             helper_hist = narf.hist_to_pyroot_boost(quantile_hist, tensor_rank=1)
-            quanthelper = ROOT.narf.make_quantile_helper(ROOT.std.move(helper_hist))
+            if continuous:
+                quanthelper = ROOT.narf.make_quantile_helper_continuous(ROOT.std.move(helper_hist))
+            else:
+                quanthelper = ROOT.narf.make_quantile_helper(ROOT.std.move(helper_hist))
         else:
             # special case for static quantiles with no conditional variables
             vals = quantile_hist.values()
             arr = ROOT.std.array["double", vals.size](vals)
-            quanthelper = ROOT.narf.QuantileHelperStatic[vals.size](arr)
+            if continuous:
+                quanthelper = ROOT.narf.QuantileHelperStaticContinuous[vals.size](arr)
+            else:
+                quanthelper = ROOT.narf.QuantileHelperStatic[vals.size](arr)
 
         helper_cols = helper_cols_cond + [col]
 
-        outname = f"{col}_iquant"
+        outname = f"{col}{suffix}"
         df = narf.rdfutils.flexible_define(df, outname, quanthelper, helper_cols)
         helper_cols_cond.append(outname)
 

From c42e75cd04bc5ba00565557c1cc34a1ac3127933 Mon Sep 17 00:00:00 2001
From: Josh Bendavid <Josh.Bendavid@cern.ch>
Date: Fri, 10 Apr 2026 01:50:28 +0200
Subject: [PATCH 25/25] build_quantile_hists: return bin centers and volumes

Compute per-bin minima (via ak.min) alongside the existing maxima so
that per-dimension widths and centers of the final transformed quantile
bins can be derived. build_quantile_hists now returns a tuple
(quantile_hists, centers_hist, volume_hist) with two additional
histograms:

- centers_hist: multidimensional bin centers stored along an extra
  StrCategory "coord" axis labelled with the input quantile axis names
  (with a quant_i placeholder for any unnamed or duplicated name).
- volume_hist: product of the per-dimension widths of the same bin.

Both are indexed by the full set of conditional and quantile axes
matching the last helper histogram. Update the existing call site in
test/testquantiles.py to unpack the new return tuple.
---
 narf/histutils.py     | 66 ++++++++++++++++++++++++++++++++++++++++++-
 test/testquantiles.py |  4 ++-
 2 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/narf/histutils.py b/narf/histutils.py
index 780b3e7..5d3fd80 100644
--- a/narf/histutils.py
+++ b/narf/histutils.py
@@ -727,6 +727,14 @@ def build_quantile_hists(df, cols, condaxes, quantaxes, continuous=False):
     ``define_quantile_ints`` detects this from the axis type and uses the
     continuous quantile helpers, which return CDF-style values in ``[0, 1]``
     obtained by linearly interpolating between the stored quantile edges.
+
+    Returns a tuple ``(quantile_hists, centers_hist, volume_hist)`` where
+    ``centers_hist`` gives the multidimensional center (one component per
+    quantile variable, along an extra ``coord`` ``StrCategory`` axis labelled
+    with the input quantile axis names when available) of each final
+    transformed quantile bin, and ``volume_hist`` gives the product of the
+    per-dimension widths of the same bin, both indexed by the full set of
+    conditional and (integer or original) quantile axes.
     """
 
     arraxes = condaxes + quantaxes
@@ -763,6 +771,8 @@ def build_quantile_hists(df, cols, condaxes, quantaxes, continuous=False):
 
     quantile_integer_axes = []
     quantile_hists = []
+    quantile_mins_list = []
+    quantile_maxs_list = []
 
 
     for iax, (col, axis) in enumerate(zip(cols, arraxes)):
@@ -792,12 +802,27 @@ def build_quantile_hists(df, cols, condaxes, quantaxes, continuous=False):
             quantile_edges = np.reshape(quantile_edges, shape_final[:iax+1])
             quantile_edges = ak.to_numpy(quantile_edges)
 
+            quantile_mins = ak.min(arr[..., col], axis=-1, mask_identity=False)
+            quantile_mins = np.reshape(quantile_mins, shape_final[:iax+1])
+            quantile_mins = ak.to_numpy(quantile_mins)
+
             # replace -infinity from empty values with the previous bin edge
             # so that the quantile edges are at least still monotonic
             nquants = quantile_edges.shape[-1]
             for iquant in range(1, nquants):
                 quantile_edges[..., iquant] = np.where(quantile_edges[..., iquant]==-np.inf, quantile_edges[..., iquant-1], quantile_edges[..., iquant])
 
+            # replace +infinity from empty values with the next bin's min so
+            # the mins stay monotonic; any remaining +infinities (trailing
+            # empty bins) fall back to the corresponding max (zero width).
+            # NOTE(review): interior empty bins take the previous max and the next min, so widths may be <= 0 -- confirm.
+            for iquant in range(nquants - 2, -1, -1):
+                quantile_mins[..., iquant] = np.where(quantile_mins[..., iquant]==np.inf, quantile_mins[..., iquant+1], quantile_mins[..., iquant])
+            quantile_mins = np.where(quantile_mins==np.inf, quantile_edges, quantile_mins)
+
+            quantile_mins_list.append(quantile_mins)
+            quantile_maxs_list.append(quantile_edges)
+
             iquantax = iax - ncond
 
             if continuous:
@@ -816,7 +841,46 @@ def build_quantile_hists(df, cols, condaxes, quantaxes, continuous=False):
 
             quantile_hists.append(helper_hist)
 
-    return quantile_hists
+    # Compute multidim centers and volumes for the final transformed bins.
+    # Each per-axis widths/centers array has shape covering only the
+    # conditional dims plus the quantile dims up to that axis; broadcast to
+    # the full (cond + all quant) shape and combine.
+    full_shape = tuple(shape_final)
+    per_dim_widths = []
+    per_dim_centers = []
+    for i in range(nquant):
+        mins_i = quantile_mins_list[i]
+        maxs_i = quantile_maxs_list[i]
+        widths_i = maxs_i - mins_i
+        centers_i = 0.5 * (maxs_i + mins_i)
+        ndim_extra = nquant - i - 1
+        new_shape = widths_i.shape + (1,) * ndim_extra
+        per_dim_widths.append(np.broadcast_to(widths_i.reshape(new_shape), full_shape))
+        per_dim_centers.append(np.broadcast_to(centers_i.reshape(new_shape), full_shape))
+
+    final_axes = condaxes + quantile_integer_axes
+    volume_hist = hist.Hist(*final_axes)
+    if nquant > 0:
+        volume = per_dim_widths[0].copy()
+        for w in per_dim_widths[1:]:
+            volume = volume * w
+        volume_hist.values(flow=True)[...] = volume
+
+        # Use the input quantile axis names as labels on the coord axis, with
+        # a quant_<i> placeholder for any unnamed or duplicated name.
+        coord_names = []
+        for i, ax in enumerate(quantaxes):
+            name = getattr(ax, "name", "") or ""
+            if not name or name in coord_names:
+                name = f"quant_{i}"
+            coord_names.append(name)
+        coord_axis = hist.axis.StrCategory(coord_names, name="coord", overflow=False)
+        centers_hist = hist.Hist(*final_axes, coord_axis)
+        centers_hist.values(flow=True)[...] = np.stack(per_dim_centers, axis=-1)
+    else:
+        centers_hist = hist.Hist(*final_axes)
+
+    return quantile_hists, centers_hist, volume_hist
 
 
 def define_quantile_ints(df, cols, quantile_hists):
diff --git a/test/testquantiles.py b/test/testquantiles.py
index 849adf0..e1ca3b6 100755
--- a/test/testquantiles.py
+++ b/test/testquantiles.py
@@ -59,7 +59,9 @@
 quantaxes = [hist.axis.Regular(nbins, 0., 1., name=f"quantile_{i}", underflow=False, overflow=False) for i in range(nquant)]
 
 # build quantile hists (triggers an event loop)
-quantile_hists = narf.histutils.build_quantile_hists(df, cols, condaxes, quantaxes)
+quantile_hists, centers_hist, volume_hist = narf.histutils.build_quantile_hists(df, cols, condaxes, quantaxes)
+print("centers:", centers_hist)
+print("volumes:", volume_hist)
 
 # define transformed variables with quantile bin indexes
 df, quantile_axes, quantile_cols = narf.histutils.define_quantile_ints(df, cols=condcols+quantcols, quantile_hists=quantile_hists)