Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
7d6c3a0
add python script for tests
bendavid Apr 4, 2026
e84d73a
Add SymMatrixAtomic
bendavid Apr 4, 2026
1a85357
minor improvement for SymMatrixAtomic and add initial version of Spar…
bendavid Apr 4, 2026
a5d2c6d
fix deprecated storage_type access
bendavid Apr 5, 2026
6c27fcb
fix constness
bendavid Apr 5, 2026
db0394f
make wrapper more flexible/robust
bendavid Apr 5, 2026
5a4c98d
flexible column types for quantile helpers
bendavid Apr 5, 2026
7468541
add missing include
bendavid Apr 5, 2026
ff899e4
make range_to more flexible
bendavid Apr 6, 2026
105bed6
add lock-free insert-only concurrent_flat_map
bendavid Apr 6, 2026
1c39602
add SparseMatrixAtomic test driver
bendavid Apr 6, 2026
a8be35b
SparseMatrixAtomic: switch to narf::concurrent_flat_map
bendavid Apr 6, 2026
a39c56d
concurrent_flat_map: add move constructor and assignment
bendavid Apr 7, 2026
59bd2ef
HistoBoost: add SparseStorage option backed by concurrent_flat_map
bendavid Apr 7, 2026
57e440a
HistoBoost SparseStorage: convert result to wums.SparseHist
bendavid Apr 7, 2026
97b2153
concurrent_flat_map: serialize segment growth via sentinel
bendavid Apr 7, 2026
85a3adb
SparseStorage: fix ND linearization mismatch with SparseHist
bendavid Apr 7, 2026
a2d9d74
SparseMatrixAtomic: configurable fill_fraction
bendavid Apr 7, 2026
07dc696
HistShiftHelper: guard against non-finite bin geometry
bendavid Apr 7, 2026
1e280c5
Add MapWrapper helper for element-wise application over container args
bendavid Apr 9, 2026
2a199e9
HistShiftHelper: delegate container broadcasting to MapWrapper
bendavid Apr 9, 2026
527cd1e
QuantileHelper[Static]: delegate container broadcasting to MapWrapper
bendavid Apr 9, 2026
b1b0d7d
QuantileHelper[Static]: add continuous CDF-style lookup mode
bendavid Apr 9, 2026
f29f771
define_quantile_ints: support continuous quantile mode
bendavid Apr 9, 2026
c42e75c
build_quantile_hists: return bin centers and volumes
bendavid Apr 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 189 additions & 13 deletions narf/histutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,24 @@
narf.clingutils.Declare('#include <eigen3/Eigen/Dense>')
narf.clingutils.Declare('#include <eigen3/unsupported/Eigen/CXX11/Tensor>')

class SparseStorage:
    """Storage option for HistoBoost selecting a narf::concurrent_sparse_storage
    backed by a narf::concurrent_flat_map. Conversion to a python hist.Hist
    object is not supported in this mode.

    Parameters
    ----------
    fill_fraction : float
        Estimated fraction of bins (including under/overflow) that will be
        populated. Used to size the underlying concurrent_flat_map so that
        most fills hit the initial allocation rather than triggering
        on-the-fly expansion. Values outside (0, 1] are accepted; pass a
        small number for very sparse fills.
    """
    def __init__(self, fill_fraction=0.1):
        # Coerce eagerly so a non-numeric argument fails here, at
        # construction, rather than later inside the C++ booking path.
        self.fill_fraction = float(fill_fraction)

    def __repr__(self):
        # Debug-friendly representation mirroring the constructor call.
        return f"{type(self).__name__}(fill_fraction={self.fill_fraction})"


def bool_to_string(b):
if b:
return "true"
Expand Down Expand Up @@ -114,7 +132,7 @@ def make_array_interface_view(boost_hist):

underflow = [axis.traits.underflow for axis in boost_hist.axes]

acc_type = convert_storage_type(boost_hist._storage_type)
acc_type = convert_storage_type(boost_hist.storage_type)
arrview = ROOT.narf.array_interface_view[acc_type, len(shape)](arr, shape, strides, underflow)

return arrview
Expand All @@ -132,9 +150,9 @@ def hist_to_pyroot_boost(hist_hist, tensor_rank = 0, force_atomic = False):
scalar_type = ROOT.double
dimensions = ROOT.Eigen.Sizes[tuple(tensor_sizes)]

if issubclass(hist_hist._storage_type, bh.storage.Double):
if issubclass(hist_hist.storage_type, bh.storage.Double):
cppstoragetype = ROOT.narf.tensor_accumulator[scalar_type, dimensions]
elif issubclass(hist_hist._storage_type, bh.storage.Weight):
elif issubclass(hist_hist.storage_type, bh.storage.Weight):
cppstoragetype = ROOT.narf.tensor_accumulator[ROOT.boost.histogram.accumulators.weighted_sum[scalar_type], dimensions]
else:
raise TypeError("Requested storage type is not supported with tensor weights currently")
Expand All @@ -143,7 +161,7 @@ def hist_to_pyroot_boost(hist_hist, tensor_rank = 0, force_atomic = False):
cppstoragetype = ROOT.narf.atomic_adaptor[cppstoragetype]
else:
python_axes = hist_hist.axes
cppstoragetype = convert_storage_type(hist_hist._storage_type, force_atomic = force_atomic)
cppstoragetype = convert_storage_type(hist_hist.storage_type, force_atomic = force_atomic)

cppaxes = [ROOT.std.move(convert_axis(axis)) for axis in python_axes]

Expand All @@ -161,6 +179,66 @@ def _histo_boost(df, name, axes, cols, storage = bh.storage.Weight(), force_atom
if force_atomic is None:
force_atomic = ROOT.ROOT.IsImplicitMTEnabled()

# Sparse storage path: build a narf::sparse_histogram backed by a
# concurrent_flat_map. The result is exposed as a wums.SparseHist.
if isinstance(storage, SparseStorage):
if tensor_axes is not None:
raise NotImplementedError("Tensor weights are not supported with SparseStorage")
coltypes = [df.GetColumnType(col) for col in cols]
for coltype in coltypes[len(axes):]:
traits = ROOT.narf.tensor_traits[coltype]
if traits.is_tensor:
raise NotImplementedError("Tensor weights are not supported with SparseStorage")
cppaxes = [ROOT.std.move(convert_axis(axis)) for axis in axes]
hfill = ROOT.narf.make_histogram_sparse[ROOT.narf.atomic_adaptor[ROOT.double]](storage.fill_fraction, *cppaxes)
helper = ROOT.narf.FillBoostHelperAtomic[type(hfill)](ROOT.std.move(hfill))
targs = tuple([type(df), type(helper)] + coltypes)
res = ROOT.narf.book_helper[targs](df, ROOT.std.move(helper), cols)

if not convert_to_hist:
return res

# Lazily convert the underlying C++ sparse histogram to a wums.SparseHist
# the first time the result is dereferenced.
from wums.sparse_hist import SparseHist

res._GetPtr = res.GetPtr
res._sparse_hist = None
python_axes_sparse = list(axes)

def _build_sparse():
    """Convert the C++ sparse histogram behind *res* to a wums.SparseHist.

    Memoized: the conversion runs once and the result is cached on the
    RDataFrame result handle as ``res._sparse_hist``.
    """
    if res._sparse_hist is not None:
        return res._sparse_hist
    # Dereference via the saved original GetPtr (GetPtr itself is
    # rebound to this function on the handle), which triggers the event
    # loop if it has not run yet.
    cpp_hist = res._GetPtr()
    snapshot = ROOT.narf.sparse_histogram_snapshot(cpp_hist)
    n = len(snapshot)
    # Copy (flat bin index, value) pairs out of the C++ snapshot into
    # plain numpy arrays. NOTE(review): assumes snapshot iterates
    # key/value pairs with integral .first and float-convertible
    # .second — confirm against sparse_histogram_snapshot.
    boost_flat = np.empty(n, dtype=np.int64)
    vals = np.empty(n, dtype=np.float64)
    for i, kv in enumerate(snapshot):
        boost_flat[i] = int(kv.first)
        vals[i] = float(kv.second)
    # Total flat size including flow bins; 1 for a zero-dimensional hist.
    extents = tuple(int(ax.extent) for ax in python_axes_sparse)
    size = int(np.prod(extents)) if extents else 1
    # boost::histogram linearizes column-major (leftmost axis = stride 1),
    # but wums.SparseHist expects numpy row-major (C order). Remap by
    # un-raveling under F order and re-raveling under C order.
    if n and len(extents) > 1:
        multi = np.unravel_index(boost_flat, extents, order="F")
        flat = np.ravel_multi_index(multi, extents, order="C").astype(np.int64)
    else:
        # 0-D/1-D hists (or an empty snapshot) need no remapping.
        flat = boost_flat
    res._sparse_hist = SparseHist._from_flat(flat, vals, python_axes_sparse, size)
    return res._sparse_hist

ret_null = lambda: None
res.__deref__ = _build_sparse
res.__follow__ = _build_sparse
res.begin = ret_null
res.end = ret_null
res.GetPtr = _build_sparse
res.GetValue = _build_sparse
return res

#TODO some of this code can be shared with root histogram version

#FIXME make this more generic
Expand Down Expand Up @@ -641,8 +719,23 @@ def pythonize_rdataframe(klass):
klass.HistoNDWithBoost = _histond_with_boost
klass.SumAndCount = _sum_and_count

def build_quantile_hists(df, cols, condaxes, quantaxes):
"""Build histograms which encode conditional quantiles for the provided variables, to be used with define_quantile_ints"""
def build_quantile_hists(df, cols, condaxes, quantaxes, continuous=False):
"""Build histograms which encode conditional quantiles for the provided variables, to be used with define_quantile_ints.

When ``continuous=True`` the original quantile axes are kept as-is in the
returned helper histograms (instead of being replaced by ``Integer`` axes).
``define_quantile_ints`` detects this from the axis type and uses the
continuous quantile helpers, which return CDF-style values in ``[0, 1]``
obtained by linearly interpolating between the stored quantile edges.

Returns a tuple ``(quantile_hists, centers_hist, volume_hist)`` where
``centers_hist`` gives the multidimensional center (one component per
quantile variable, along an extra ``coord`` ``StrCategory`` axis labelled
with the input quantile axis names when available) of each final
transformed quantile bin, and ``volume_hist`` gives the product of the
per-dimension widths of the same bin, both indexed by the full set of
conditional and (integer or original) quantile axes.
"""

arraxes = condaxes + quantaxes

Expand Down Expand Up @@ -678,6 +771,8 @@ def build_quantile_hists(df, cols, condaxes, quantaxes):

quantile_integer_axes = []
quantile_hists = []
quantile_mins_list = []
quantile_maxs_list = []


for iax, (col, axis) in enumerate(zip(cols, arraxes)):
Expand Down Expand Up @@ -707,15 +802,36 @@ def build_quantile_hists(df, cols, condaxes, quantaxes):
quantile_edges = np.reshape(quantile_edges, shape_final[:iax+1])
quantile_edges = ak.to_numpy(quantile_edges)

quantile_mins = ak.min(arr[..., col], axis=-1, mask_identity=False)
quantile_mins = np.reshape(quantile_mins, shape_final[:iax+1])
quantile_mins = ak.to_numpy(quantile_mins)

# replace -infinity from empty values with the previous bin edge
# so that the quantile edges are at least still monotonic
nquants = quantile_edges.shape[-1]
for iquant in range(1, nquants):
quantile_edges[..., iquant] = np.where(quantile_edges[..., iquant]==-np.inf, quantile_edges[..., iquant-1], quantile_edges[..., iquant])

# replace +infinity from empty values with the next bin's min,
# keeping mins monotonic and avoiding negative bin widths; any
# remaining +infinities (trailing empty bins) fall back to the
# corresponding max so the bin collapses to zero width.
for iquant in range(nquants - 2, -1, -1):
quantile_mins[..., iquant] = np.where(quantile_mins[..., iquant]==np.inf, quantile_mins[..., iquant+1], quantile_mins[..., iquant])
quantile_mins = np.where(quantile_mins==np.inf, quantile_edges, quantile_mins)

quantile_mins_list.append(quantile_mins)
quantile_maxs_list.append(quantile_edges)

iquantax = iax - ncond

quantile_integer_axis = hist.axis.Integer(0, axis.size, underflow=False, overflow=False, name=f"{axis.name}_int")
if continuous:
# Keep the original (Regular / Variable) quantile axis so that
# subsequent helpers in the chain can be indexed by the
# continuous CDF-style output of the previous helper.
quantile_integer_axis = axis
else:
quantile_integer_axis = hist.axis.Integer(0, axis.size, underflow=False, overflow=False, name=f"{axis.name}_int")
quantile_integer_axes.append(quantile_integer_axis)

helper_axes = condaxes[:iax+1] + quantile_integer_axes
Expand All @@ -725,11 +841,57 @@ def build_quantile_hists(df, cols, condaxes, quantaxes):

quantile_hists.append(helper_hist)

return quantile_hists
# Compute multidim centers and volumes for the final transformed bins.
# Each per-axis widths/centers array has shape covering only the
# conditional dims plus the quantile dims up to that axis; broadcast to
# the full (cond + all quant) shape and combine.
full_shape = tuple(shape_final)
per_dim_widths = []
per_dim_centers = []
for i in range(nquant):
mins_i = quantile_mins_list[i]
maxs_i = quantile_maxs_list[i]
widths_i = maxs_i - mins_i
centers_i = 0.5 * (maxs_i + mins_i)
ndim_extra = nquant - i - 1
new_shape = widths_i.shape + (1,) * ndim_extra
per_dim_widths.append(np.broadcast_to(widths_i.reshape(new_shape), full_shape))
per_dim_centers.append(np.broadcast_to(centers_i.reshape(new_shape), full_shape))

final_axes = condaxes + quantile_integer_axes
volume_hist = hist.Hist(*final_axes)
if nquant > 0:
volume = per_dim_widths[0].copy()
for w in per_dim_widths[1:]:
volume = volume * w
volume_hist.values(flow=True)[...] = volume

# Use the input quantile axis names as labels on the coord axis, with
# a unique placeholder for any unnamed axis.
coord_names = []
for i, ax in enumerate(quantaxes):
name = getattr(ax, "name", "") or ""
if not name or name in coord_names:
name = f"quant_{i}"
coord_names.append(name)
coord_axis = hist.axis.StrCategory(coord_names, name="coord", overflow=False)
centers_hist = hist.Hist(*final_axes, coord_axis)
centers_hist.values(flow=True)[...] = np.stack(per_dim_centers, axis=-1)
else:
centers_hist = hist.Hist(*final_axes)

return quantile_hists, centers_hist, volume_hist


def define_quantile_ints(df, cols, quantile_hists):
"""Define transformed columns corresponding to conditional quantile bins (integers)"""
"""Define transformed columns corresponding to conditional quantiles.

By default the helpers return the integer quantile bin index. If the
helper histograms produced by :func:`build_quantile_hists` were built in
continuous mode (their trailing quantile axis is not a plain ``Integer``
axis), the continuous quantile helpers are used instead, returning a
CDF-style value in ``[0, 1]``.
"""

ncols = len(cols)
nquant = len(quantile_hists)
Expand All @@ -738,23 +900,37 @@ def define_quantile_ints(df, cols, quantile_hists):
condcols = cols[:ncond]
quantcols = cols[ncond:]

# Detect continuous mode from the trailing (quantile) axis of the first
# helper histogram: continuous-mode helper histograms preserve the
# original (Regular / Variable) quantile axis, integer-mode ones use a
# generated Integer axis.
continuous = not isinstance(quantile_hists[0].axes[-1], hist.axis.Integer)

helper_cols_cond = condcols.copy()

suffix = "_quant" if continuous else "_iquant"

for col, quantile_hist in zip(quantcols, quantile_hists):

if len(quantile_hist.axes) > 1:
helper_hist = narf.hist_to_pyroot_boost(quantile_hist, tensor_rank=1)
quanthelper = ROOT.narf.make_quantile_helper(ROOT.std.move(helper_hist))
if continuous:
quanthelper = ROOT.narf.make_quantile_helper_continuous(ROOT.std.move(helper_hist))
else:
quanthelper = ROOT.narf.make_quantile_helper(ROOT.std.move(helper_hist))
else:
# special case for static quantiles with no conditional variables
vals = quantile_hist.values()
arr = ROOT.std.array["double", vals.size](vals)
quanthelper = ROOT.narf.QuantileHelperStatic[vals.size](arr)
if continuous:
quanthelper = ROOT.narf.QuantileHelperStaticContinuous[vals.size](arr)
else:
quanthelper = ROOT.narf.QuantileHelperStatic[vals.size](arr)

helper_cols = helper_cols_cond + [col]

outname = f"{col}_iquant"
df = df.Define(outname, quanthelper, helper_cols)
outname = f"{col}{suffix}"
df = narf.rdfutils.flexible_define(df, outname, quanthelper, helper_cols)
helper_cols_cond.append(outname)

quantile_axes = list(quantile_hists[-1].axes)
Expand Down
Loading