From 11043f851f0c39af99d4a447a65d26846280e392 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sat, 14 Mar 2026 10:56:06 -0300 Subject: [PATCH 01/35] perf: accumulate moments with multithreading --- pandas/_libs/algos.pxd | 102 +++++++------------- pandas/_libs/algos.pyx | 98 +++++++------------ pandas/_libs/groupby.pyx | 43 ++++----- pandas/_libs/include/pandas/moments.h | 121 ++++++++++++++++++++++++ pandas/_libs/meson.build | 10 +- pandas/_libs/src/moments.c | 40 ++++++++ pandas/_libs/window/aggregations.pyx | 131 +++++++++++++------------- pandas/_libs/window/meson.build | 1 + 8 files changed, 318 insertions(+), 228 deletions(-) create mode 100644 pandas/_libs/include/pandas/moments.h create mode 100644 pandas/_libs/src/moments.c diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index e47f7d56ba343..4d80831b603c4 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -6,6 +6,7 @@ from libc.math cimport ( from numpy cimport ( float64_t, int64_t, + uint8_t, ) from pandas._libs.dtypes cimport ( @@ -17,76 +18,37 @@ from pandas._libs.dtypes cimport ( cdef numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) noexcept nogil -@cython.cdivision(True) -cdef inline void moments_add_value( - float64_t val, - int64_t* nobs, - float64_t* mean, - float64_t* m2, - float64_t* m3, - float64_t* m4, - int max_moment, -) noexcept nogil: - cdef: - float64_t n - float64_t delta = val - mean[0] - float64_t delta_n, term1 - - nobs[0] += 1 - n = nobs[0] - delta_n = delta / n - term1 = delta * delta_n * (n - 1.0) - - if max_moment >= 4: - m4[0] += delta_n * ( - -4.0 * m3[0] - + delta_n * ( - 6.0 * m2[0] + term1 * (n * n - 3.0 * n + 3.0) - ) - ) - if max_moment >= 3: - m3[0] += delta_n * (term1 * (n - 2.0) - 3.0 * m2[0]) - m2[0] += term1 - mean[0] += delta_n - - -@cython.cdivision(True) -cdef inline float64_t calc_skew( - int64_t nobs, float64_t m2, float64_t m3 -) noexcept nogil: - cdef: - float64_t moments_ratio, correction, dnobs - - if nobs < 3: - return NAN - - dnobs = nobs - - moments_ratio = m3 / (m2 * sqrt(m2)) - correction = (dnobs * sqrt(dnobs - 1.0)) / (dnobs - 2.0) - return moments_ratio * correction - - -@cython.cdivision(True) -cdef inline float64_t calc_kurt( - int64_t nobs, float64_t m2, float64_t m4 -) noexcept nogil: - cdef: - float64_t result, dnobs, term1, term2, inner, correction - float64_t moments_ratio - - if nobs < 4: - return NAN - dnobs = nobs - moments_ratio = m4 / (m2 * m2) - term1 = dnobs * (dnobs + 1.0) * moments_ratio - term2 = 3.0 * (dnobs - 1.0) - inner = term1 - term2 - - correction = (dnobs - 1.0) / ((dnobs - 2.0) * (dnobs - 3.0)) - result = correction * inner - - return result +cdef extern from "pandas/moments.h": + ctypedef struct Moments: + int64_t n + float64_t mean + float64_t m2 + float64_t m3 + float64_t m4 + + void moments_add_value( + Moments* moments, + float64_t val, + int max_moment, + ) nogil + + Moments moments_merge( + Moments a, + Moments b, + int max_moment, + ) nogil + + Moments accumulate_moments_scalar( + const float64_t* values, + int64_t n, + bint skipna, + const uint8_t* mask, + int max_moment, + ) nogil + + float64_t calc_skew(Moments) nogil + + float64_t calc_kurt(Moments) nogil cdef enum TiebreakEnumType: diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 97aa64593a3fc..2e4585566c486 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -5,6 +5,7 @@ from libc.math cimport ( sqrt, ) from libc.stdlib cimport ( + calloc, free, malloc, ) @@ -1600,44 +1601,13 @@ def diff_2d( # ---------------------------------------------------------------------- -@cython.boundscheck(False) -@cython.wraparound(False) -cdef void accumulate_moments_scalar( - const float64_t[:] values, - bint skipna, - const uint8_t[:] mask, - int64_t* nobs, - float64_t* mean, - float64_t* m2, - float64_t* m3, - float64_t* m4, - int max_moment, -) noexcept nogil: - cdef: - Py_ssize_t i, n = len(values) - bint uses_mask = mask is not None - float64_t val - - for i in range(n): - val = values[i] - if uses_mask and mask[i]: - val = NaN - if skipna and isnan(val): - continue - moments_add_value(val, nobs, mean, m2, m3, m4, max_moment) - - @cython.boundscheck(False) @cython.wraparound(False) cdef void accumulate_moments_axis( const float64_t[:, :] values, bint skipna, const uint8_t[:, :] mask, - int64_t[:] nobs, - float64_t[:] mean, - float64_t[:] m2, - float64_t[:] m3, - float64_t[:] m4, + Moments *moments, int axis, int max_moment, ) noexcept nogil: @@ -1646,8 +1616,6 @@ cdef void accumulate_moments_axis( Py_ssize_t nouter, ninner bint uses_mask = mask is not None float64_t val - float64_t* m3_ptr = NULL - float64_t* m4_ptr = NULL if axis == 0: # Assumes F-contiguous @@ -1659,52 +1627,51 @@ cdef void accumulate_moments_axis( ninner = ncols for i in range(nouter): - if max_moment >= 3: - m3_ptr = &m3[i] - if max_moment >= 4: - m4_ptr = &m4[i] for j in range(ninner): val = values[j, i] if axis == 0 else values[i, j] if uses_mask and (mask[j, i] if axis == 0 else mask[i, j]): val = NaN if skipna and isnan(val): continue - moments_add_value(val, &nobs[i], &mean[i], &m2[i], m3_ptr, m4_ptr, - max_moment) + moments_add_value(&moments[i], val, max_moment) @cython.boundscheck(False) @cython.wraparound(False) def scalar_skew( - const float64_t[:] values, + const float64_t[::1] values, bint skipna, - const uint8_t[:] mask, + const uint8_t[::1] mask, ) -> float: cdef: - int64_t nobs = 0 - float64_t mean = 0.0, m2 = 0.0, m3 = 0.0 + Moments moments + Py_ssize_t n = values.shape[0] + const float64_t* p_values = &values[0] + const uint8_t* p_mask = &mask[0] if mask is not None else NULL with nogil: - accumulate_moments_scalar(values, skipna, mask, &nobs, &mean, &m2, &m3, NULL, 3) + moments = accumulate_moments_scalar(p_values, n, skipna, p_mask, 3) - return calc_skew(nobs, m2, m3) + return calc_skew(moments) @cython.boundscheck(False) @cython.wraparound(False) def scalar_kurt( - const float64_t[:] values, + const float64_t[::1] values, bint skipna, - const uint8_t[:] mask, + const uint8_t[::1] mask, ) -> float: cdef: - int64_t nobs = 0 - float64_t mean = 0.0, m2 = 0.0, m3 = 0.0, m4 = 0.0 + Moments moments + Py_ssize_t n = values.shape[0] + const float64_t* p_values = &values[0] + const uint8_t* p_mask = &mask[0] if mask is not None else NULL with nogil: - accumulate_moments_scalar(values, skipna, mask, &nobs, &mean, &m2, &m3, &m4, 4) + moments = accumulate_moments_scalar(p_values, n, skipna, p_mask, 4) - return calc_kurt(nobs, m2, m4) + return calc_kurt(moments) @cython.boundscheck(False) @@ -1718,18 +1685,19 @@ def axis_skew( cdef: Py_ssize_t i Py_ssize_t nouter = values.shape[1] if axis == 0 else values.shape[0] - int64_t[::1] nobs = np.zeros(nouter, dtype=np.int64) - float64_t[::1] mean = np.zeros(nouter) - float64_t[::1] m2 = np.zeros(nouter) - float64_t[::1] m3 = np.zeros(nouter) + Moments *moments = calloc(nouter, sizeof(Moments)) ndarray result_arr = np.empty(nouter, dtype=np.float64) float64_t[:] result = result_arr + if moments == NULL: + raise MemoryError() + with nogil: - accumulate_moments_axis(values, skipna, mask, nobs, mean, m2, m3, None, axis, 3) + accumulate_moments_axis(values, skipna, mask, moments, axis, 3) for i in range(nouter): - result[i] = calc_skew(nobs[i], m2[i], m3[i]) + result[i] = calc_skew(moments[i]) + free(moments) return result_arr @@ -1744,19 +1712,19 @@ def axis_kurt( cdef: Py_ssize_t i Py_ssize_t nouter = values.shape[1] if axis == 0 else values.shape[0] - int64_t[::1] nobs = np.zeros(nouter, dtype=np.int64) - float64_t[::1] mean = np.zeros(nouter) - float64_t[::1] m2 = np.zeros(nouter) - float64_t[::1] m3 = np.zeros(nouter) - float64_t[::1] m4 = np.zeros(nouter) + Moments *moments = calloc(nouter, sizeof(Moments)) ndarray result_arr = np.empty(nouter, dtype=np.float64) float64_t[:] result = result_arr + if moments == NULL: + raise MemoryError() + with nogil: - accumulate_moments_axis(values, skipna, mask, nobs, mean, m2, m3, m4, axis, 4) + accumulate_moments_axis(values, skipna, mask, moments, axis, 4) for i in range(nouter): - result[i] = calc_kurt(nobs[i], m2[i], m4[i]) + result[i] = calc_kurt(moments[i]) + free(moments) return result_arr diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 15abebef987ed..14b8320e7b044 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -10,6 +10,7 @@ from libc.math cimport ( sqrt, ) from libc.stdlib cimport ( + calloc, free, malloc, ) @@ -34,6 +35,7 @@ cnp.import_array() from pandas._libs cimport util from pandas._libs.algos cimport ( + Moments, calc_kurt, calc_skew, get_rank_nan_fill_val, @@ -1023,23 +1025,18 @@ def group_skew( ) -> None: cdef: Py_ssize_t i, j, N, K, lab, ngroups = len(counts) - int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) bint uses_mask = mask is not None - float64_t[:, ::1] mean, M2, M3 + Moments* ms float64_t val if len_values != len_labels: raise ValueError("len(index) != len(labels)") - nobs = np.zeros((out).shape, dtype=np.int64) - - mean = np.zeros((out).shape, dtype=np.float64) - # M2, and M3 correspond to 2nd and 3rd central moments - M2 = np.zeros((out).shape, dtype=np.float64) - M3 = np.zeros((out).shape, dtype=np.float64) - N, K = (values).shape + ms = calloc(ngroups * K, sizeof(Moments)) + if ms is NULL: + raise MemoryError() out[:, :] = 0.0 @@ -1060,15 +1057,16 @@ def group_skew( if skipna and (isnan(val) or _treat_as_na(val, False)): continue - moments_add_value(val, &nobs[lab, j], &mean[lab, j], &M2[lab, j], - &M3[lab, j], NULL, 3) + moments_add_value(&ms[lab * K + j], val, 3) for i in range(ngroups): for j in range(K): - out[i, j] = calc_skew(nobs[i, j], M2[i, j], M3[i, j]) + out[i, j] = calc_skew(ms[i * K + j]) if result_mask is not None and isnan(out[i, j]): result_mask[i, j] = 1 + free(ms) + @cython.wraparound(False) @cython.boundscheck(False) @@ -1085,24 +1083,18 @@ def group_kurt( ) -> None: cdef: Py_ssize_t i, j, N, K, lab, ngroups = len(counts) - int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) bint uses_mask = mask is not None - float64_t[:, ::1] mean, M2, M3, M4 + Moments* ms float64_t val if len_values != len_labels: raise ValueError("len(index) != len(labels)") - nobs = np.zeros((out).shape, dtype=np.int64) - - mean = np.zeros((out).shape, dtype=np.float64) - # M2, M3 and M4 correspond to 2nd, 3rd and 4th Central Moments - M2 = np.zeros((out).shape, dtype=np.float64) - M3 = np.zeros((out).shape, dtype=np.float64) - M4 = np.zeros((out).shape, dtype=np.float64) - N, K = (values).shape + ms = calloc(ngroups * K, sizeof(Moments)) + if ms is NULL: + raise MemoryError() out[:, :] = 0.0 @@ -1123,15 +1115,16 @@ def group_kurt( if skipna and (isnan(val) or _treat_as_na(val, False)): continue - moments_add_value(val, &nobs[lab, j], &mean[lab, j], &M2[lab, j], - &M3[lab, j], &M4[lab, j], 4) + moments_add_value(&ms[lab * K + j], val, 4) for i in range(ngroups): for j in range(K): - out[i, j] = calc_kurt(nobs[i, j], M2[i, j], M4[i, j]) + out[i, j] = calc_kurt(ms[i * K + j]) if result_mask is not None and isnan(out[i, j]): result_mask[i, j] = 1 + free(ms) + @cython.wraparound(False) @cython.boundscheck(False) diff --git a/pandas/_libs/include/pandas/moments.h b/pandas/_libs/include/pandas/moments.h new file mode 100644 index 0000000000000..2a56c7e19e169 --- /dev/null +++ b/pandas/_libs/include/pandas/moments.h @@ -0,0 +1,121 @@ +/* +Copyright (c) 2026, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. +*/ + +#pragma once + +#include +#include +#include + +#ifdef __cplusplus +// prevent name mangling +extern "C" { +#endif + +typedef struct { + int64_t n; + double mean; + double m2; + double m3; + double m4; +} Moments; + +static inline void moments_add_value(Moments *moments, double val, + int max_moment) { + double delta = val - moments->mean; + moments->n++; + double n = (double)moments->n; + double delta_n = delta / n; + double term1 = delta * delta_n * (n - 1.0); + + if (max_moment >= 4) { + moments->m4 += + delta_n * + (-4.0 * moments->m3 + + delta_n * (6.0 * moments->m2 + term1 * (n * n - 3.0 * n + 3.0))); + } + if (max_moment >= 3) { + moments->m3 += delta_n * (term1 * (n - 2.0) - 3.0 * moments->m2); + } + moments->m2 += term1; + moments->mean += delta_n; +} + +static inline Moments moments_merge(Moments a, Moments b, int max_moment) { + + if (a.n == 0) { + return b; + } + if (b.n == 0) { + return a; + } + + Moments out; + + out.n = a.n + b.n; + double n_a = (double)a.n; + double n_b = (double)b.n; + double delta = b.mean - a.mean; + double n = (double)out.n; + double delta_n = delta / n; + double term1 = delta * delta_n * n_a * n_b; + + if (max_moment >= 4) { + out.m4 = + a.m4 + b.m4 + + delta_n * (4.0 * (n_a * b.m3 - n_b * a.m3) + + delta_n * (6.0 * (n_a * n_a * b.m2 + n_b * n_b * a.m2) + + term1 * (n_a * n_a - n_a * n_b + n_b * n_b))); + } else { + out.m4 = 0.0; + } + if (max_moment >= 3) { + out.m3 = a.m3 + b.m3 + + delta_n * (3.0 * (n_a * b.m2 - n_b * a.m2) + term1 * (n_a - n_b)); + } else { + out.m3 = 0.0; + } + out.m2 = a.m2 + b.m2 + term1; + out.mean = a.mean + delta_n * n_b; + + return out; +} + +/// Compute central moments until `max_moment` using `n` elements from `values`. +/// The size is represented as signed integer for MSVC compatibility with +/// OpenMP. +Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna, + const uint8_t *mask, int max_moment); + +static inline double calc_skew(Moments moments) { + if (moments.n < 3) { + return NAN; + } + double dnobs = (double)moments.n; + double moments_ratio = moments.m3 / (moments.m2 * sqrt(moments.m2)); + double correction = (dnobs * sqrt(dnobs - 1.0)) / (dnobs - 2.0); + return moments_ratio * correction; +} + +static inline double calc_kurt(Moments moments) { + if (moments.n < 4) { + return NAN; + } + double dnobs = (double)moments.n; + double moments_ratio = moments.m4 / (moments.m2 * moments.m2); + double term1 = dnobs * (dnobs + 1.0) * moments_ratio; + double term2 = 3.0 * (dnobs - 1.0); + double inner = term1 - term2; + double correction = (dnobs - 1.0) / ((dnobs - 2.0) * (dnobs - 3.0)); + return correction * inner; +} + +#ifdef __cplusplus +} +#endif diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index 56ff2a01b450c..2c2122613d3e3 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -55,13 +55,19 @@ fast_float = subproject('fast_float') fast_float_dep = fast_float.get_variable('fast_float_dep') subdir('tslibs') +openmp_dep = dependency('OpenMP', required: false) libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} # numpy include dir is implicitly included 'algos': { - 'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], - 'deps': [_khash_primitive_helper_dep, m_dep], + 'sources': [ + 'algos.pyx', + 'src/moments.c', + _algos_common_helper, + _algos_take_helper, + ], + 'deps': [_khash_primitive_helper_dep, m_dep, openmp_dep], }, 'arrays': {'sources': ['arrays.pyx']}, 'groupby': {'sources': ['groupby.pyx'], 'deps': [m_dep]}, diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c new file mode 100644 index 0000000000000..fd31a9c1c1d11 --- /dev/null +++ b/pandas/_libs/src/moments.c @@ -0,0 +1,40 @@ +/* +Copyright (c) 2026, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. +*/ + +#include "pandas/moments.h" +#include + +Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna, + const uint8_t *mask, int max_moment) { + Moments result = {0}; + +#pragma omp parallel + { + Moments moments_local = {0}; + +#pragma omp for nowait + for (int64_t i = 0; i < n; i++) { + double val = values[i]; + if (mask && mask[i]) { + val = NAN; + } + if (skipna && isnan(val)) { + continue; + } + moments_add_value(&moments_local, val, max_moment); + } + +#pragma omp critical + { + result = moments_merge(result, moments_local, max_moment); + } + } + + return result; +} diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 619d5590cc207..3ffc14c1b4cbc 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -9,6 +9,7 @@ from libcpp.stack cimport stack from libcpp.unordered_map cimport unordered_map from pandas._libs.algos cimport ( + Moments, TiebreakEnumType, calc_kurt, calc_skew, @@ -492,26 +493,22 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, # Rolling skewness -cdef void add_skew(float64_t val, int64_t *nobs, - float64_t *mean, float64_t *m2, - float64_t *m3, +cdef void add_skew(float64_t val, Moments *m, bint *numerically_unstable, ) noexcept nogil: """ add a value from the skew calc """ cdef: - float64_t old_m3 = m3[0] + float64_t old_m3 = m.m3 # Not NaN if val == val: - moments_add_value(val, nobs, mean, m2, m3, NULL, 3) - if fabs(old_m3) * InvCondTol > fabs(m3[0]): + moments_add_value(m, val, 3) + if fabs(old_m3) * InvCondTol > fabs(m.m3): # possible catastrophic cancellation numerically_unstable[0] = True -cdef void remove_skew(float64_t val, int64_t *nobs, - float64_t *mean, float64_t *m2, - float64_t *m3, +cdef void remove_skew(float64_t val, Moments *m, bint *numerically_unstable) noexcept nogil: """ remove a value from the skew calc """ cdef: @@ -529,22 +526,22 @@ cdef void remove_skew(float64_t val, int64_t *nobs, # Not NaN if val == val: - nobs[0] -= 1 - n = (nobs[0]) - delta = val - mean[0] + m.n -= 1 + n = (m.n) + delta = val - m.mean delta_n = delta / n term1 = delta_n * delta * (n + 1.0) - m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * m2[0]) - new_m3 = m3[0] - m3_update + m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * m.m2) + new_m3 = m.m3 - m3_update - if (fabs(m3_update) + fabs(m3[0])) * InvCondTol > fabs(new_m3): + if (fabs(m3_update) + fabs(m.m3)) * InvCondTol > fabs(new_m3): # possible catastrophic cancellation numerically_unstable[0] = True - m3[0] = new_m3 - m2[0] -= term1 - mean[0] -= delta_n + m.m3 = new_m3 + m.m2 -= term1 + m.mean -= delta_n def roll_skew(const float64_t[:] values, ndarray[int64_t] start, @@ -552,8 +549,8 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start, cdef: Py_ssize_t i, j float64_t val - float64_t mean, m2, m3 - int64_t nobs = 0, N = len(start) + Moments m + int64_t N = len(start) int64_t s, e ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -585,31 +582,35 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start, # calculate deletes for j in range(start[i - 1], s): val = values[j] - remove_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) + remove_skew(val, &m, &numerically_unstable) # calculate adds for j in range(end[i - 1], e): val = values[j] - add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) + add_skew(val, &m, &numerically_unstable) if requires_recompute or numerically_unstable: - mean = m2 = m3 = 0.0 - nobs = 0 + m.n = 0 + m.mean = 0.0 + m.m2 = 0.0 + m.m3 = 0.0 + m.m4 = 0.0 for j in range(s, e): val = values[j] - add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) + add_skew(val, &m, &numerically_unstable) numerically_unstable = False - output[i] = NaN if nobs < minp else calc_skew(nobs, m2, m3) + output[i] = NaN if m.n < minp else calc_skew(m) if not is_monotonic_increasing_bounds: - nobs = 0 - mean = 0.0 - m2 = 0.0 - m3 = 0.0 + m.n = 0 + m.mean = 0.0 + m.m2 = 0.0 + m.m3 = 0.0 + m.m4 = 0.0 return output @@ -617,26 +618,22 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start, # Rolling kurtosis -cdef void add_kurt(float64_t val, int64_t *nobs, - float64_t *mean, float64_t *m2, - float64_t *m3, float64_t *m4, +cdef void add_kurt(float64_t val, Moments *m, bint *numerically_unstable, ) noexcept nogil: """ add a value from the kurotic calc """ cdef: - float64_t old_m4 = m4[0] + float64_t old_m4 = m.m4 # Not NaN if val == val: - moments_add_value(val, nobs, mean, m2, m3, m4, 4) - if fabs(old_m4) * InvCondTol > fabs(m4[0]): + moments_add_value(m, val, 4) + if fabs(old_m4) * InvCondTol > fabs(m.m4): # possible catastrophic cancellation numerically_unstable[0] = True -cdef void remove_kurt(float64_t val, int64_t *nobs, - float64_t *mean, float64_t *m2, - float64_t *m3, float64_t *m4, +cdef void remove_kurt(float64_t val, Moments *m, bint *numerically_unstable, ) noexcept nogil: """ remove a value from the kurotic calc """ @@ -645,37 +642,37 @@ cdef void remove_kurt(float64_t val, int64_t *nobs, # Not NaN if val == val: - nobs[0] -= 1 - n = (nobs[0]) - delta = val - mean[0] + m.n -= 1 + n = (m.n) + delta = val - m.mean delta_n = delta / n term1 = delta_n * delta * (n + 1.0) m4_update = delta_n * ( - 4.0 * m3[0] + 4.0 * m.m3 + delta_n * ( - 6.0 * m2[0] + 6.0 * m.m2 - term1 * (n * n + 3.0 * n + 3.0) ) ) - new_m4 = m4[0] + m4_update + new_m4 = m.m4 + m4_update - if (fabs(m4_update) + fabs(m4[0])) * InvCondTol > fabs(new_m4): + if (fabs(m4_update) + fabs(m.m4)) * InvCondTol > fabs(new_m4): # possible catastrophic cancellation numerically_unstable[0] = True - m4[0] = new_m4 - m3[0] -= delta_n * (term1 * (n + 2.0) - 3.0 * m2[0]) - m2[0] -= term1 - mean[0] -= delta_n + m.m4 = new_m4 + m.m3 -= delta_n * (term1 * (n + 2.0) - 3.0 * m.m2) + m.m2 -= term1 + m.mean -= delta_n def roll_kurt(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp) -> np.ndarray: cdef: Py_ssize_t i, j - float64_t mean, m2, m3, m4 - int64_t nobs, s, e + Moments m + int64_t s, e int64_t N = len(start) ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -707,30 +704,32 @@ def roll_kurt(const float64_t[:] values, ndarray[int64_t] start, # and removed # calculate deletes for j in range(start[i - 1], s): - remove_kurt(values[j], &nobs, &mean, &m2, &m3, &m4, - &numerically_unstable) + remove_kurt(values[j], &m, &numerically_unstable) # calculate adds for j in range(end[i - 1], e): - add_kurt(values[j], &nobs, &mean, &m2, &m3, &m4, - &numerically_unstable) + add_kurt(values[j], &m, &numerically_unstable) if requires_recompute or numerically_unstable: - mean = m2 = m3 = m4 = 0.0 - nobs = 0 + m.n = 0 + m.mean = 0.0 + m.m2 = 0.0 + m.m3 = 0.0 + m.m4 = 0.0 for j in range(s, e): - add_kurt(values[j], &nobs, &mean, &m2, &m3, &m4, - &numerically_unstable) + add_kurt(values[j], &m, &numerically_unstable) + + numerically_unstable = False - output[i] = NaN if nobs < minp else calc_kurt(nobs, m2, m4) + output[i] = NaN if m.n < minp else calc_kurt(m) if not is_monotonic_increasing_bounds: - nobs = 0 - mean = 0.0 - m2 = 0.0 - m3 = 0.0 - m4 = 0.0 + m.n = 0 + m.mean = 0.0 + m.m2 = 0.0 + m.m3 = 0.0 + m.m4 = 0.0 return output diff --git a/pandas/_libs/window/meson.build b/pandas/_libs/window/meson.build index 8c00a98b1241a..3e3e25b626914 100644 --- a/pandas/_libs/window/meson.build +++ b/pandas/_libs/window/meson.build @@ -10,6 +10,7 @@ py.extension_module( ['aggregations.pyx'], cython_args: cy_args, include_directories: [inc_np, inc_pd], + dependencies: [m_dep], subdir: 'pandas/_libs/window', override_options: ['cython_language=cpp'], install: true, From 2c9de5c6be4238699630dffe8334858d7e36a8b9 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sat, 14 Mar 2026 12:48:09 -0300 Subject: [PATCH 02/35] fix: omit warnings when OpenMP not used --- pandas/_libs/src/moments.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index fd31a9c1c1d11..dc63ea6412276 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -14,11 +14,15 @@ Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna, const uint8_t *mask, int max_moment) { Moments result = {0}; -#pragma omp parallel +#ifdef _OPENMP +# pragma omp parallel +#endif { Moments moments_local = {0}; -#pragma omp for nowait +#ifdef _OPENMP +# pragma omp for nowait +#endif for (int64_t i = 0; i < n; i++) { double val = values[i]; if (mask && mask[i]) { @@ -30,7 +34,9 @@ Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna, moments_add_value(&moments_local, val, max_moment); } -#pragma omp critical +#ifdef _OPENMP +# pragma omp critical +#endif { result = moments_merge(result, moments_local, max_moment); } From 69786e80a279faaaafa811172f5263516823cb0d Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sat, 14 Mar 2026 13:27:49 -0300 Subject: [PATCH 03/35] fix: try to fix msvc error --- pandas/_libs/src/moments.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index dc63ea6412276..009790951ae6a 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -9,10 +9,12 @@ The full license is in the LICENSE file, distributed with this software. #include "pandas/moments.h" #include +#include Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna, const uint8_t *mask, int max_moment) { Moments result = {0}; + int64_t i; #ifdef _OPENMP # pragma omp parallel @@ -23,7 +25,7 @@ Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna, #ifdef _OPENMP # pragma omp for nowait #endif - for (int64_t i = 0; i < n; i++) { + for (i = 0; i < n; i++) { double val = values[i]; if (mask && mask[i]) { val = NAN; From 3464b248c4b92e8834660062054193091b3528ef Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sat, 14 Mar 2026 15:00:02 -0300 Subject: [PATCH 04/35] chore: rename output and don't assign 0.0 on else --- pandas/_libs/include/pandas/moments.h | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/include/pandas/moments.h b/pandas/_libs/include/pandas/moments.h index 2a56c7e19e169..e492e8200c82b 100644 --- a/pandas/_libs/include/pandas/moments.h +++ b/pandas/_libs/include/pandas/moments.h @@ -56,35 +56,31 @@ static inline Moments moments_merge(Moments a, Moments b, int max_moment) { return a; } - Moments out; + Moments result; - out.n = a.n + b.n; + result.n = a.n + b.n; double n_a = (double)a.n; double n_b = (double)b.n; double delta = b.mean - a.mean; - double n = (double)out.n; - double delta_n = delta / n; + double delta_n = delta / (double)result.n; double term1 = delta * delta_n * n_a * n_b; if (max_moment >= 4) { - out.m4 = + result.m4 = a.m4 + b.m4 + delta_n * (4.0 * (n_a * b.m3 - n_b * a.m3) + delta_n * (6.0 * (n_a * n_a * b.m2 + n_b * n_b * a.m2) + term1 * (n_a * n_a - n_a * n_b + n_b * n_b))); - } else { - out.m4 = 0.0; } if (max_moment >= 3) { - out.m3 = a.m3 + b.m3 + - delta_n * (3.0 * (n_a * b.m2 - n_b * a.m2) + term1 * (n_a - n_b)); - } else { - out.m3 = 0.0; + result.m3 = + a.m3 + b.m3 + + delta_n * (3.0 * (n_a * b.m2 - n_b * a.m2) + term1 * (n_a - n_b)); } - out.m2 = a.m2 + b.m2 + term1; - out.mean = a.mean + delta_n * n_b; + result.m2 = a.m2 + b.m2 + term1; + result.mean = a.mean + delta_n * n_b; - return out; + return result; } /// Compute central moments until `max_moment` using `n` elements from `values`. From a86820e9b4b879615ca700a5f1e84f93f3e49b13 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sat, 14 Mar 2026 15:48:57 -0300 Subject: [PATCH 05/35] test: use almost_equal instead of strict equality Depending on how the job is distributed among the threads, the tests may compute different results. So, assert almost equality instead of strict equality. --- pandas/tests/apply/test_str.py | 3 ++- pandas/tests/arrays/floating/test_function.py | 3 ++- pandas/tests/arrays/integer/test_function.py | 3 ++- pandas/tests/test_nanops.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index e5a9492630b13..9589db07dbcee 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -41,7 +41,8 @@ def test_apply_with_string_funcs(float_frame, func, kwds, how): def test_with_string_args(datetime_series, all_numeric_reductions): result = datetime_series.apply(all_numeric_reductions) expected = getattr(datetime_series, all_numeric_reductions)() - assert result == expected + # reductions computed in parallel may yield different results + tm.assert_almost_equal(result, expected) @pytest.mark.parametrize("op", ["mean", "median", "std", "var"]) diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py index a5938b2aa93e8..2fcdcd5a9e823 100644 --- a/pandas/tests/arrays/floating/test_function.py +++ b/pandas/tests/arrays/floating/test_function.py @@ -116,7 +116,8 @@ def test_stat_method(pandasmethname, kwargs): s2 = pd.Series(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype="float64") pandasmeth = getattr(s2, pandasmethname) expected = pandasmeth(**kwargs) - assert expected == result + # reductions computed in parallel may yield different results + tm.assert_almost_equal(result, expected) def test_value_counts_na(): diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 6b3308baf3ee7..a288ab631ae3b 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -134,7 +134,8 @@ def test_stat_method(pandasmethname, kwargs): s2 = pd.Series(data=[1, 2, 3, 4, 5, 6], dtype="Int64") pandasmeth = getattr(s2, pandasmethname) expected = pandasmeth(**kwargs) - assert expected == result + # reductions computed in parallel may yield different results + tm.assert_almost_equal(result, expected) def test_value_counts_na(): diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index acd92d916bdcc..b83e2d6e0e7f2 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1243,7 +1243,8 @@ def test_nanops_independent_of_mask_param(operation): mask = ser.isna() median_expected = operation(ser._values) median_result = operation(ser._values, mask=mask._values) - assert median_expected == median_result + # reductions computed in parallel may yield different results + tm.assert_almost_equal(median_result, median_expected) @pytest.mark.parametrize("min_count", [-1, 0]) From 3485af7cb12146ff4b7f2e4fda1714bc4fc10a8a Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Mon, 16 Mar 2026 23:28:11 -0300 Subject: [PATCH 06/35] perf: use AVX2 instructions --- pandas/_libs/src/moments.c | 204 ++++++++++++++++++++++++++++++++++--- 1 file changed, 188 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index 009790951ae6a..59b26283244c9 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -10,31 +10,203 @@ The full license is in the LICENSE file, distributed with this software. #include "pandas/moments.h" #include #include +#include + +#ifdef _OPENMP +# include +#else +static inline int omp_get_thread_num() { return 0; } +static inline int omp_get_num_threads() { return 1; } +#endif + +/* --- AVX2 SIMD Implementation --- */ + +#ifdef __x86_64__ +# if defined(__clang__) +# pragma clang attribute push(__attribute__((target("avx2"))), \ + apply_to = function) +# define PANDAS_HAS_AVX2 1 +# elif defined(__GNUC__) +# pragma GCC push_options +# pragma GCC target("avx2") +# define PANDAS_HAS_AVX2 1 +# endif // defined(__GNUC__) +#endif // ifdef __x86_64__ + +#ifdef PANDAS_HAS_AVX2 +# include + +// Vectorization of [moments_add_value] +static inline Moments accumulate_moments_avx2(const double *values, int64_t n, + int skipna, const uint8_t *mask, + int max_moment) { + __m256d v_mean = _mm256_setzero_pd(); + __m256d v_m2 = _mm256_setzero_pd(); + __m256d v_m3 = _mm256_setzero_pd(); + __m256d v_m4 = _mm256_setzero_pd(); + // Using double for v_n to avoid epi64 -> pd conversions. + // The results are exact up to n < 2^53. + __m256d v_n = _mm256_setzero_pd(); + + __m256d v_one = _mm256_set1_pd(1.0); + __m256d v_zero = _mm256_setzero_pd(); + __m256d v_nan = _mm256_set1_pd(NAN); + + int64_t i = 0; + for (; i < n - 4; i += 4) { + __m256d v_val = _mm256_loadu_pd(values + i); + + if (mask) { + uint32_t m32; + memcpy(&m32, mask + i, sizeof(uint32_t)); + __m128i v_mask_bytes = _mm_cvtsi32_si128((int)m32); + __m256i v_mask_epi64 = _mm256_cvtepu8_epi64(v_mask_bytes); + __m256i v_mask_is_zero = + _mm256_cmpeq_epi64(v_mask_epi64, _mm256_setzero_si256()); + // replace val with NAN where mask is 1. + v_val = + _mm256_blendv_pd(v_nan, v_val, _mm256_castsi256_pd(v_mask_is_zero)); + } + + __m256d v_skip_mask = + skipna ? _mm256_cmp_pd(v_val, v_val, _CMP_UNORD_Q) : v_zero; + // Increment 1 where we do not skip + __m256d v_n_increment = _mm256_andnot_pd(v_skip_mask, v_one); + v_n = _mm256_add_pd(v_n, v_n_increment); + __m256d v_n_nonzero = _mm256_max_pd(v_n, v_one); + + __m256d v_delta = _mm256_sub_pd(v_val, v_mean); + + // replace delta with zero when skipping to don't update moments + v_delta = _mm256_blendv_pd(v_delta, v_zero, v_skip_mask); + __m256d v_delta_n = _mm256_div_pd(v_delta, v_n_nonzero); + + __m256d v_delta_n2 = _mm256_mul_pd(v_delta, v_delta_n); + __m256d v_term1 = _mm256_mul_pd(v_delta_n2, _mm256_sub_pd(v_n, v_one)); + + if (max_moment >= 4) { + __m256d v_n2 = _mm256_mul_pd(v_n, v_n); + // n * n - 3.0 * n + 3.0 + __m256d v_n_term = _mm256_add_pd( + _mm256_sub_pd(v_n2, _mm256_mul_pd(_mm256_set1_pd(3.0), v_n)), + _mm256_set1_pd(3.0)); + __m256d v_m4_update = _mm256_mul_pd( + v_delta_n, + _mm256_add_pd( + _mm256_mul_pd(_mm256_set1_pd(-4.0), v_m3), + _mm256_mul_pd( + v_delta_n, + _mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(6.0), v_m2), + _mm256_mul_pd(v_term1, v_n_term))))); + v_m4 = _mm256_add_pd(v_m4, v_m4_update); + } + + if (max_moment >= 3) { + __m256d v_m3_term = _mm256_mul_pd( + v_delta_n, + _mm256_sub_pd( + _mm256_mul_pd(v_term1, _mm256_sub_pd(v_n, _mm256_set1_pd(2.0))), + _mm256_mul_pd(_mm256_set1_pd(3.0), v_m2))); + v_m3 = _mm256_add_pd(v_m3, v_m3_term); + } + + v_m2 = _mm256_add_pd(v_m2, v_term1); + v_mean = _mm256_add_pd(v_mean, v_delta_n); + } + + double n_arr[4], mean_arr[4], m2_arr[4], m3_arr[4], m4_arr[4]; + _mm256_storeu_pd(n_arr, v_n); + _mm256_storeu_pd(mean_arr, v_mean); + _mm256_storeu_pd(m2_arr, v_m2); + _mm256_storeu_pd(m3_arr, v_m3); + _mm256_storeu_pd(m4_arr, v_m4); + + Moments moments_arr[4]; + for (int j = 0; j < 4; j++) { + moments_arr[j] = (Moments){(int64_t)round(n_arr[j]), mean_arr[j], m2_arr[j], + m3_arr[j], m4_arr[j]}; + } + + // Distribute remaining values across chunks + for (; i < n; i++) { + double val = values[i]; + if (mask && mask[i]) + val = NAN; + if (skipna && isnan(val)) + continue; + moments_add_value(&moments_arr[i % 4], val, max_moment); + } + + // pairwise merge for numerical stability + Moments m_01 = moments_merge(moments_arr[0], moments_arr[1], max_moment); + Moments m_23 = moments_merge(moments_arr[2], moments_arr[3], max_moment); + + return moments_merge(m_01, m_23, max_moment); +} +#endif + +#ifdef __x86_64__ +# if defined(__clang__) +# pragma clang attribute pop +# elif defined(__GNUC__) +# pragma GCC pop_options +# endif +#endif // ifdef __x86_64__ + +/* --- Scalar Fallback Implementation --- */ + +static inline Moments accumulate_moments_scalar_block(const double *values, + int64_t n, int skipna, + const uint8_t *mask, + int max_moment) { + Moments moments = {0}; + for (int64_t i = 0; i < n; i++) { + double val = values[i]; + if (mask && mask[i]) + val = NAN; + if (skipna && isnan(val)) + continue; + moments_add_value(&moments, val, max_moment); + } + return moments; +} + +/* --- Accumulation Dispatch (Choose AVX2 or Scalar) --- */ + +static inline Moments accumulate_moments_dispatch(const double *values, + int64_t n, int skipna, + const uint8_t *mask, + int max_moment) { +#if defined(PANDAS_HAS_AVX2) && (defined(__GNUC__) || defined(__clang__)) + if (__builtin_cpu_supports("avx2")) { + return accumulate_moments_avx2(values, n, skipna, mask, max_moment); + } +#endif + return accumulate_moments_scalar_block(values, n, skipna, mask, max_moment); +} +#undef PANDAS_HAS_AVX2 + +/* --- Public API (Orchestrates OpenMP Parallelism) --- */ Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna, const uint8_t *mask, int max_moment) { Moments result = {0}; - int64_t i; #ifdef _OPENMP -# pragma omp parallel + // parallel threshold chosen based on `perf report` where libgomp wasn't the + // overhead +# pragma omp parallel if (n > 10000) #endif { - Moments moments_local = {0}; + int tid = omp_get_thread_num(); + int num_threads = omp_get_num_threads(); -#ifdef _OPENMP -# pragma omp for nowait -#endif - for (i = 0; i < n; i++) { - double val = values[i]; - if (mask && mask[i]) { - val = NAN; - } - if (skipna && isnan(val)) { - continue; - } - moments_add_value(&moments_local, val, max_moment); - } + int64_t start = (tid * n) / num_threads; + int64_t end = tid == num_threads - 1 ? n : ((tid + 1) * n) / num_threads; + + Moments moments_local = + accumulate_moments_dispatch(values + start, end - start, skipna, + mask ? mask + start : NULL, max_moment); #ifdef _OPENMP # pragma omp critical From f7c0d8558152e6381fc691d3d5342a9308e818bc Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Wed, 18 Mar 2026 21:05:45 -0300 Subject: [PATCH 07/35] wip: neon SIMD --- pandas/_libs/src/moments.c | 138 ++++++++++++++++++++++++++++++++++++- 1 file changed, 137 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index 59b26283244c9..91b3fb810ac49 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -153,6 +153,138 @@ static inline Moments accumulate_moments_avx2(const double *values, int64_t n, # endif #endif // ifdef __x86_64__ +/* --- NEON SIMD Implementation --- */ + +#ifdef __aarch64__ +# if defined(__clang__) +# pragma clang attribute push(__attribute__((target("simd"))), \ + apply_to = function) +# define PANDAS_HAS_NEON 1 +# elif defined(__GNUC__) +# pragma GCC push_options +# pragma GCC target("+simd") +# define PANDAS_HAS_NEON 1 +# endif +#endif + +#ifdef PANDAS_HAS_NEON +# include + +// Vectorization of [moments_add_value] +static inline Moments accumulate_moments_neon(const double *values, int64_t n, + int skipna, const uint8_t *mask, + int max_moment) { + float64x2_t v_mean = vdupq_n_f64(0.0); + float64x2_t v_m2 = vdupq_n_f64(0.0); + float64x2_t v_m3 = vdupq_n_f64(0.0); + float64x2_t v_m4 = vdupq_n_f64(0.0); + // Using double for v_n to avoid conversions. + float64x2_t v_n = vdupq_n_f64(0.0); + + float64x2_t v_one = vdupq_n_f64(1.0); + float64x2_t v_zero = vdupq_n_f64(0.0); + float64x2_t v_nan = vdupq_n_f64(NAN); + + int64_t i = 0; + for (; i < n - 2; i += 2) { + float64x2_t v_val = vld1q_f64(values + i); + + uint64x2_t v_keep_mask = vdupq_n_u64(-1ULL); + + if (mask) { + uint16_t m16; + memcpy(&m16, mask + i, sizeof(uint16_t)); + uint8x8_t v_mask_bytes = vreinterpret_u8_u16(vdup_n_u16(m16)); + // Expand 2 bytes to 2x 64-bit. + uint16x8_t v_m16 = vmovl_u8(v_mask_bytes); + uint32x4_t v_m32 = vmovl_u16(vget_low_u16(v_m16)); + uint64x2_t v_m64 = vmovl_u32(vget_low_u32(v_m32)); + + uint64x2_t v_mask_is_zero = vceqq_u64(v_m64, vdupq_n_u64(0)); + // replace val with NAN where mask is != 0. + v_val = vbslq_f64(v_mask_is_zero, v_val, v_nan); + } + + if (skipna) { + uint64x2_t v_is_not_nan = vceqq_f64(v_val, v_val); + v_keep_mask = vandq_u64(v_keep_mask, v_is_not_nan); + } + + // Increment 1 where we do not skip + float64x2_t v_n_increment = vbslq_f64(v_keep_mask, v_one, v_zero); + v_n = vaddq_f64(v_n, v_n_increment); + float64x2_t v_n_nonzero = vmaxq_f64(v_n, v_one); + + float64x2_t v_delta = vsubq_f64(v_val, v_mean); + + // replace delta with zero when skipping to don't update moments + v_delta = vbslq_f64(v_keep_mask, v_delta, v_zero); + float64x2_t v_delta_n = vdivq_f64(v_delta, v_n_nonzero); + + float64x2_t v_delta_n2 = vmulq_f64(v_delta, v_delta_n); + float64x2_t v_term1 = vmulq_f64(v_delta_n2, vsubq_f64(v_n, v_one)); + + if (max_moment >= 4) { + float64x2_t v_n2 = vmulq_f64(v_n, v_n); + // n * n - 3.0 * n + 3.0 + float64x2_t v_n_term = vaddq_f64( + vsubq_f64(v_n2, vmulq_f64(vdupq_n_f64(3.0), v_n)), vdupq_n_f64(3.0)); + float64x2_t v_m4_update = vmulq_f64( + v_delta_n, + vaddq_f64( + vmulq_f64(vdupq_n_f64(-4.0), v_m3), + vmulq_f64(v_delta_n, vaddq_f64(vmulq_f64(vdupq_n_f64(6.0), v_m2), + vmulq_f64(v_term1, v_n_term))))); + v_m4 = vaddq_f64(v_m4, v_m4_update); + } + + if (max_moment >= 3) { + float64x2_t v_m3_term = vmulq_f64( + v_delta_n, + vsubq_f64(vmulq_f64(v_term1, vsubq_f64(v_n, vdupq_n_f64(2.0))), + vmulq_f64(vdupq_n_f64(3.0), v_m2))); + v_m3 = vaddq_f64(v_m3, v_m3_term); + } + + v_m2 = vaddq_f64(v_m2, v_term1); + v_mean = vaddq_f64(v_mean, v_delta_n); + } + + double n_arr[2], mean_arr[2], m2_arr[2], m3_arr[2], m4_arr[2]; + vst1q_f64(n_arr, v_n); + vst1q_f64(mean_arr, v_mean); + vst1q_f64(m2_arr, v_m2); + vst1q_f64(m3_arr, v_m3); + vst1q_f64(m4_arr, v_m4); + + Moments moments_arr[2]; + for (int j = 0; j < 2; j++) { + moments_arr[j] = (Moments){(int64_t)round(n_arr[j]), mean_arr[j], m2_arr[j], + m3_arr[j], m4_arr[j]}; + } + + // Distribute remaining values across chunks + for (; i < n; i++) { + double val = values[i]; + if (mask && mask[i]) + val = NAN; + if (skipna && isnan(val)) + continue; + moments_add_value(&moments_arr[i % 2], val, max_moment); + } + + return moments_merge(moments_arr[0], moments_arr[1], max_moment); +} +#endif + +#ifdef __aarch64__ +# if defined(__clang__) +# pragma clang attribute pop +# elif defined(__GNUC__) +# pragma GCC pop_options +# endif +#endif // ifdef __aarch64__ + /* --- Scalar Fallback Implementation --- */ static inline Moments accumulate_moments_scalar_block(const double *values, @@ -171,7 +303,7 @@ static inline Moments accumulate_moments_scalar_block(const double *values, return moments; } -/* --- Accumulation Dispatch (Choose AVX2 or Scalar) --- */ +/* --- Accumulation Dispatch (Choose AVX2, NEON or Scalar) --- */ static inline Moments accumulate_moments_dispatch(const double *values, int64_t n, int skipna, @@ -181,10 +313,14 @@ static inline Moments accumulate_moments_dispatch(const double *values, if (__builtin_cpu_supports("avx2")) { return accumulate_moments_avx2(values, n, skipna, mask, max_moment); } +#endif +#ifdef PANDAS_HAS_NEON + return accumulate_moments_neon(values, n, skipna, mask, max_moment); #endif return accumulate_moments_scalar_block(values, n, skipna, mask, max_moment); } #undef PANDAS_HAS_AVX2 +#undef PANDAS_HAS_NEON /* --- Public API (Orchestrates OpenMP Parallelism) --- */ From c508a10a2b5fb25c4232d17362ceeca7479790b4 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Fri, 20 Mar 2026 11:43:27 -0300 Subject: [PATCH 08/35] refactor: use vector extensions --- pandas/_libs/src/moments.c | 301 ++++++++++--------------------------- 1 file changed, 78 insertions(+), 223 deletions(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index 91b3fb810ac49..6e72885d6a298 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -19,107 +19,106 @@ static inline int omp_get_thread_num() { return 0; } static inline int omp_get_num_threads() { return 1; } #endif -/* --- AVX2 SIMD Implementation --- */ +/* --- SIMD Implementation --- */ +#if defined(__clang__) +typedef double v4d __attribute__((ext_vector_type(4), aligned(1))); +typedef long long v4si __attribute__((ext_vector_type(4), aligned(1))); +# define PANDAS_HAS_SIMD 1 +# if defined(__x86_64__) +# define PANDAS_SIMD_TARGETS \ + __attribute__((target_clones("avx2", "default"))) +# else +# define PANDAS_SIMD_TARGETS +# endif +#elif defined(__GNUC__) +typedef double v4d __attribute__((vector_size(32), aligned(1))); +typedef long long v4si __attribute__((vector_size(32), aligned(1))); +# define PANDAS_HAS_SIMD 1 +# if defined(__x86_64__) +# define PANDAS_SIMD_TARGETS \ + __attribute__((target_clones("avx2", "default"))) +# else +# define PANDAS_SIMD_TARGETS +# endif +#endif -#ifdef __x86_64__ +#ifdef PANDAS_HAS_SIMD +/* Vector select: returns (mask ? a : b) where mask is all-ones or all-zeros */ # if defined(__clang__) -# pragma clang attribute push(__attribute__((target("avx2"))), \ - apply_to = function) -# define PANDAS_HAS_AVX2 1 -# elif defined(__GNUC__) -# pragma GCC push_options -# pragma GCC target("avx2") -# define PANDAS_HAS_AVX2 1 -# endif // defined(__GNUC__) -#endif // ifdef __x86_64__ - -#ifdef PANDAS_HAS_AVX2 -# include - -// Vectorization of [moments_add_value] -static inline Moments accumulate_moments_avx2(const double *values, int64_t n, - int skipna, const uint8_t *mask, - int max_moment) { - __m256d v_mean = _mm256_setzero_pd(); - __m256d v_m2 = _mm256_setzero_pd(); - __m256d v_m3 = _mm256_setzero_pd(); - __m256d v_m4 = _mm256_setzero_pd(); +# define v_select(mask, a, b) ((mask) ? (a) : (b)) +# else +// gcc doesn't have ternary operators when compiling C files. +# define v_select(mask, a, b) \ + ((v4d)(((v4si)(mask) & (v4si)(a)) | (~(v4si)(mask) & (v4si)(b)))) +# endif + +PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values, + int64_t n, int skipna, + const uint8_t *mask, + int max_moment) { + v4d v_mean = {0.0, 0.0, 0.0, 0.0}; + v4d v_m2 = {0.0, 0.0, 0.0, 0.0}; + v4d v_m3 = {0.0, 0.0, 0.0, 0.0}; + v4d v_m4 = {0.0, 0.0, 0.0, 0.0}; // Using double for v_n to avoid epi64 -> pd conversions. // The results are exact up to n < 2^53. - __m256d v_n = _mm256_setzero_pd(); + v4d v_n = {0.0, 0.0, 0.0, 0.0}; - __m256d v_one = _mm256_set1_pd(1.0); - __m256d v_zero = _mm256_setzero_pd(); - __m256d v_nan = _mm256_set1_pd(NAN); + v4d v_one = {1.0, 1.0, 1.0, 1.0}; + v4d v_zero = {0.0, 0.0, 0.0, 0.0}; + v4d v_nan = {NAN, NAN, NAN, NAN}; int64_t i = 0; for (; i < n - 4; i += 4) { - __m256d v_val = _mm256_loadu_pd(values + i); + v4d v_val = *(v4d *)(values + i); if (mask) { - uint32_t m32; - memcpy(&m32, mask + i, sizeof(uint32_t)); - __m128i v_mask_bytes = _mm_cvtsi32_si128((int)m32); - __m256i v_mask_epi64 = _mm256_cvtepu8_epi64(v_mask_bytes); - __m256i v_mask_is_zero = - _mm256_cmpeq_epi64(v_mask_epi64, _mm256_setzero_si256()); + v4d v_m_val = {mask[i + 0] ? 1.0 : 0.0, mask[i + 1] ? 1.0 : 0.0, + mask[i + 2] ? 1.0 : 0.0, mask[i + 3] ? 1.0 : 0.0}; // replace val with NAN where mask is 1. - v_val = - _mm256_blendv_pd(v_nan, v_val, _mm256_castsi256_pd(v_mask_is_zero)); + v_val = v_select(v_m_val == v_one, v_nan, v_val); } - __m256d v_skip_mask = - skipna ? _mm256_cmp_pd(v_val, v_val, _CMP_UNORD_Q) : v_zero; - // Increment 1 where we do not skip - __m256d v_n_increment = _mm256_andnot_pd(v_skip_mask, v_one); - v_n = _mm256_add_pd(v_n, v_n_increment); - __m256d v_n_nonzero = _mm256_max_pd(v_n, v_one); + // skip_mask is 1.0 if we should skip, 0.0 otherwise + v4d v_is_nan = v_select(v_val == v_val, v_zero, v_one); + v4d v_skip_mask = skipna ? v_is_nan : v_zero; + + // Increment 1 where we do NOT skip + v4d v_n_increment = v_one - v_skip_mask; + v_n += v_n_increment; + v4d v_n_nonzero = v_select(v_n < v_one, v_one, v_n); - __m256d v_delta = _mm256_sub_pd(v_val, v_mean); + v4d v_delta = v_val - v_mean; // replace delta with zero when skipping to don't update moments - v_delta = _mm256_blendv_pd(v_delta, v_zero, v_skip_mask); - __m256d v_delta_n = _mm256_div_pd(v_delta, v_n_nonzero); + v_delta = v_select(v_skip_mask == v_zero, v_delta, v_zero); + v4d v_delta_n = v_delta / v_n_nonzero; - __m256d v_delta_n2 = _mm256_mul_pd(v_delta, v_delta_n); - __m256d v_term1 = _mm256_mul_pd(v_delta_n2, _mm256_sub_pd(v_n, v_one)); + v4d v_delta_n2 = v_delta * v_delta_n; + v4d v_term1 = v_delta_n2 * (v_n - v_one); if (max_moment >= 4) { - __m256d v_n2 = _mm256_mul_pd(v_n, v_n); + v4d v_n2 = v_n * v_n; // n * n - 3.0 * n + 3.0 - __m256d v_n_term = _mm256_add_pd( - _mm256_sub_pd(v_n2, _mm256_mul_pd(_mm256_set1_pd(3.0), v_n)), - _mm256_set1_pd(3.0)); - __m256d v_m4_update = _mm256_mul_pd( - v_delta_n, - _mm256_add_pd( - _mm256_mul_pd(_mm256_set1_pd(-4.0), v_m3), - _mm256_mul_pd( - v_delta_n, - _mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(6.0), v_m2), - _mm256_mul_pd(v_term1, v_n_term))))); - v_m4 = _mm256_add_pd(v_m4, v_m4_update); + v4d v_n_term = (v_n2 - (3.0 * v_n)) + 3.0; + v_m4 += v_delta_n * ((-4.0 * v_m3) + + (v_delta_n * ((6.0 * v_m2) + (v_term1 * v_n_term)))); } if (max_moment >= 3) { - __m256d v_m3_term = _mm256_mul_pd( - v_delta_n, - _mm256_sub_pd( - _mm256_mul_pd(v_term1, _mm256_sub_pd(v_n, _mm256_set1_pd(2.0))), - _mm256_mul_pd(_mm256_set1_pd(3.0), v_m2))); - v_m3 = _mm256_add_pd(v_m3, v_m3_term); + v_m3 += v_delta_n * ((v_term1 * (v_n - 2.0)) - (3.0 * v_m2)); } - v_m2 = _mm256_add_pd(v_m2, v_term1); - v_mean = _mm256_add_pd(v_mean, v_delta_n); + v_m2 += v_term1; + v_mean += v_delta_n; } double n_arr[4], mean_arr[4], m2_arr[4], m3_arr[4], m4_arr[4]; - _mm256_storeu_pd(n_arr, v_n); - _mm256_storeu_pd(mean_arr, v_mean); - _mm256_storeu_pd(m2_arr, v_m2); - _mm256_storeu_pd(m3_arr, v_m3); - _mm256_storeu_pd(m4_arr, v_m4); + *(v4d *)n_arr = v_n; + *(v4d *)mean_arr = v_mean; + *(v4d *)m2_arr = v_m2; + *(v4d *)m3_arr = v_m3; + *(v4d *)m4_arr = v_m4; Moments moments_arr[4]; for (int j = 0; j < 4; j++) { @@ -145,146 +144,6 @@ static inline Moments accumulate_moments_avx2(const double *values, int64_t n, } #endif -#ifdef __x86_64__ -# if defined(__clang__) -# pragma clang attribute pop -# elif defined(__GNUC__) -# pragma GCC pop_options -# endif -#endif // ifdef __x86_64__ - -/* --- NEON SIMD Implementation --- */ - -#ifdef __aarch64__ -# if defined(__clang__) -# pragma clang attribute push(__attribute__((target("simd"))), \ - apply_to = function) -# define PANDAS_HAS_NEON 1 -# elif defined(__GNUC__) -# pragma GCC push_options -# pragma GCC target("+simd") -# define PANDAS_HAS_NEON 1 -# endif -#endif - -#ifdef PANDAS_HAS_NEON -# include - -// Vectorization of [moments_add_value] -static inline Moments accumulate_moments_neon(const double *values, int64_t n, - int skipna, const uint8_t *mask, - int max_moment) { - float64x2_t v_mean = vdupq_n_f64(0.0); - float64x2_t v_m2 = vdupq_n_f64(0.0); - float64x2_t v_m3 = vdupq_n_f64(0.0); - float64x2_t v_m4 = vdupq_n_f64(0.0); - // Using double for v_n to avoid conversions. - float64x2_t v_n = vdupq_n_f64(0.0); - - float64x2_t v_one = vdupq_n_f64(1.0); - float64x2_t v_zero = vdupq_n_f64(0.0); - float64x2_t v_nan = vdupq_n_f64(NAN); - - int64_t i = 0; - for (; i < n - 2; i += 2) { - float64x2_t v_val = vld1q_f64(values + i); - - uint64x2_t v_keep_mask = vdupq_n_u64(-1ULL); - - if (mask) { - uint16_t m16; - memcpy(&m16, mask + i, sizeof(uint16_t)); - uint8x8_t v_mask_bytes = vreinterpret_u8_u16(vdup_n_u16(m16)); - // Expand 2 bytes to 2x 64-bit. - uint16x8_t v_m16 = vmovl_u8(v_mask_bytes); - uint32x4_t v_m32 = vmovl_u16(vget_low_u16(v_m16)); - uint64x2_t v_m64 = vmovl_u32(vget_low_u32(v_m32)); - - uint64x2_t v_mask_is_zero = vceqq_u64(v_m64, vdupq_n_u64(0)); - // replace val with NAN where mask is != 0. - v_val = vbslq_f64(v_mask_is_zero, v_val, v_nan); - } - - if (skipna) { - uint64x2_t v_is_not_nan = vceqq_f64(v_val, v_val); - v_keep_mask = vandq_u64(v_keep_mask, v_is_not_nan); - } - - // Increment 1 where we do not skip - float64x2_t v_n_increment = vbslq_f64(v_keep_mask, v_one, v_zero); - v_n = vaddq_f64(v_n, v_n_increment); - float64x2_t v_n_nonzero = vmaxq_f64(v_n, v_one); - - float64x2_t v_delta = vsubq_f64(v_val, v_mean); - - // replace delta with zero when skipping to don't update moments - v_delta = vbslq_f64(v_keep_mask, v_delta, v_zero); - float64x2_t v_delta_n = vdivq_f64(v_delta, v_n_nonzero); - - float64x2_t v_delta_n2 = vmulq_f64(v_delta, v_delta_n); - float64x2_t v_term1 = vmulq_f64(v_delta_n2, vsubq_f64(v_n, v_one)); - - if (max_moment >= 4) { - float64x2_t v_n2 = vmulq_f64(v_n, v_n); - // n * n - 3.0 * n + 3.0 - float64x2_t v_n_term = vaddq_f64( - vsubq_f64(v_n2, vmulq_f64(vdupq_n_f64(3.0), v_n)), vdupq_n_f64(3.0)); - float64x2_t v_m4_update = vmulq_f64( - v_delta_n, - vaddq_f64( - vmulq_f64(vdupq_n_f64(-4.0), v_m3), - vmulq_f64(v_delta_n, vaddq_f64(vmulq_f64(vdupq_n_f64(6.0), v_m2), - vmulq_f64(v_term1, v_n_term))))); - v_m4 = vaddq_f64(v_m4, v_m4_update); - } - - if (max_moment >= 3) { - float64x2_t v_m3_term = vmulq_f64( - v_delta_n, - vsubq_f64(vmulq_f64(v_term1, vsubq_f64(v_n, vdupq_n_f64(2.0))), - vmulq_f64(vdupq_n_f64(3.0), v_m2))); - v_m3 = vaddq_f64(v_m3, v_m3_term); - } - - v_m2 = vaddq_f64(v_m2, v_term1); - v_mean = vaddq_f64(v_mean, v_delta_n); - } - - double n_arr[2], mean_arr[2], m2_arr[2], m3_arr[2], m4_arr[2]; - vst1q_f64(n_arr, v_n); - vst1q_f64(mean_arr, v_mean); - vst1q_f64(m2_arr, v_m2); - vst1q_f64(m3_arr, v_m3); - vst1q_f64(m4_arr, v_m4); - - Moments moments_arr[2]; - for (int j = 0; j < 2; j++) { - moments_arr[j] = (Moments){(int64_t)round(n_arr[j]), mean_arr[j], m2_arr[j], - m3_arr[j], m4_arr[j]}; - } - - // Distribute remaining values across chunks - for (; i < n; i++) { - double val = values[i]; - if (mask && mask[i]) - val = NAN; - if (skipna && isnan(val)) - continue; - moments_add_value(&moments_arr[i % 2], val, max_moment); - } - - return moments_merge(moments_arr[0], moments_arr[1], max_moment); -} -#endif - -#ifdef __aarch64__ -# if defined(__clang__) -# pragma clang attribute pop -# elif defined(__GNUC__) -# pragma GCC pop_options -# endif -#endif // ifdef __aarch64__ - /* --- Scalar Fallback Implementation --- */ static inline Moments accumulate_moments_scalar_block(const double *values, @@ -303,24 +162,20 @@ static inline Moments accumulate_moments_scalar_block(const double *values, return moments; } -/* --- Accumulation Dispatch (Choose AVX2, NEON or Scalar) --- */ +/* --- Accumulation Dispatch (Choose SIMD or Scalar) --- */ static inline Moments accumulate_moments_dispatch(const double *values, int64_t n, int skipna, const uint8_t *mask, int max_moment) { -#if defined(PANDAS_HAS_AVX2) && (defined(__GNUC__) || defined(__clang__)) - if (__builtin_cpu_supports("avx2")) { - return accumulate_moments_avx2(values, n, skipna, mask, max_moment); - } -#endif -#ifdef PANDAS_HAS_NEON - return accumulate_moments_neon(values, n, skipna, mask, max_moment); +#if defined(PANDAS_HAS_SIMD) + return accumulate_moments_simd(values, n, skipna, mask, max_moment); #endif return accumulate_moments_scalar_block(values, n, skipna, mask, max_moment); } -#undef PANDAS_HAS_AVX2 -#undef PANDAS_HAS_NEON +#undef PANDAS_HAS_SIMD +#undef PANDAS_SIMD_TARGETS +#undef v_select /* --- Public API (Orchestrates OpenMP Parallelism) --- */ From e35e33785410340cc25519a221964698ac6a2a9f Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Fri, 20 Mar 2026 14:44:14 -0300 Subject: [PATCH 09/35] fix: fix musl build --- pandas/_libs/src/moments.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index 6e72885d6a298..04dda82f2053a 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -19,27 +19,25 @@ static inline int omp_get_thread_num() { return 0; } static inline int omp_get_num_threads() { return 1; } #endif +#ifndef __has_attribute +# define __has_attribute(x) 0 +#endif + +#if defined(__x86_64__) && __has_attribute(target_clones) && defined(__GLIBC__) +# define PANDAS_SIMD_TARGETS __attribute__((target_clones("avx2", "default"))) +#else +# define PANDAS_SIMD_TARGETS +#endif + /* --- SIMD Implementation --- */ #if defined(__clang__) typedef double v4d __attribute__((ext_vector_type(4), aligned(1))); typedef long long v4si __attribute__((ext_vector_type(4), aligned(1))); # define PANDAS_HAS_SIMD 1 -# if defined(__x86_64__) -# define PANDAS_SIMD_TARGETS \ - __attribute__((target_clones("avx2", "default"))) -# else -# define PANDAS_SIMD_TARGETS -# endif #elif defined(__GNUC__) typedef double v4d __attribute__((vector_size(32), aligned(1))); typedef long long v4si __attribute__((vector_size(32), aligned(1))); # define PANDAS_HAS_SIMD 1 -# if defined(__x86_64__) -# define PANDAS_SIMD_TARGETS \ - __attribute__((target_clones("avx2", "default"))) -# else -# define PANDAS_SIMD_TARGETS -# endif #endif #ifdef PANDAS_HAS_SIMD From 7c5b6e955f55836e9de5448d4a800ca27ae6eb9c Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Fri, 20 Mar 2026 14:48:37 -0300 Subject: [PATCH 10/35] fix: remove unnecessary undef --- pandas/_libs/src/moments.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index 04dda82f2053a..d98896a50ee80 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -171,9 +171,6 @@ static inline Moments accumulate_moments_dispatch(const double *values, #endif return accumulate_moments_scalar_block(values, n, skipna, mask, max_moment); } -#undef PANDAS_HAS_SIMD -#undef PANDAS_SIMD_TARGETS -#undef v_select /* --- Public API (Orchestrates OpenMP Parallelism) --- */ From 39fcf5f9849de81d96bf1d77670274b50a542606 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Fri, 20 Mar 2026 15:22:21 -0300 Subject: [PATCH 11/35] chore: remove unused include --- pandas/_libs/src/moments.c | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index d98896a50ee80..4e436abfacaf5 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -10,7 +10,6 @@ The full license is in the LICENSE file, distributed with this software. #include "pandas/moments.h" #include #include -#include #ifdef _OPENMP # include From 040656512d230ae25e2c311214acef199d26b15e Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Fri, 20 Mar 2026 15:26:58 -0300 Subject: [PATCH 12/35] fix: check for vector attribute --- pandas/_libs/src/moments.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index 4e436abfacaf5..5bd81479b40a8 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -29,11 +29,11 @@ static inline int omp_get_num_threads() { return 1; } #endif /* --- SIMD Implementation --- */ -#if defined(__clang__) +#if defined(__clang__) && __has_attribute(ext_vector_type) typedef double v4d __attribute__((ext_vector_type(4), aligned(1))); typedef long long v4si __attribute__((ext_vector_type(4), aligned(1))); # define PANDAS_HAS_SIMD 1 -#elif defined(__GNUC__) +#elif defined(__GNUC__) && __has_attribute(vector_size) typedef double v4d __attribute__((vector_size(32), aligned(1))); typedef long long v4si __attribute__((vector_size(32), aligned(1))); # define PANDAS_HAS_SIMD 1 From bc951a40a54de0363faa8b2447f96459a2eba1c0 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Fri, 20 Mar 2026 15:30:04 -0300 Subject: [PATCH 13/35] fix: improve typedef for different platforms --- pandas/_libs/src/moments.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index 5bd81479b40a8..ea5738f742e8b 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -34,8 +34,9 @@ typedef double v4d __attribute__((ext_vector_type(4), aligned(1))); typedef long long v4si __attribute__((ext_vector_type(4), aligned(1))); # define PANDAS_HAS_SIMD 1 #elif defined(__GNUC__) && __has_attribute(vector_size) -typedef double v4d __attribute__((vector_size(32), aligned(1))); -typedef long long v4si __attribute__((vector_size(32), aligned(1))); +typedef double v4d __attribute__((vector_size(4 * sizeof(double)), aligned(1))); +typedef long long v4si + __attribute__((vector_size(4 * sizeof(long long)), aligned(1))); # define PANDAS_HAS_SIMD 1 #endif From c2c832af9ef88f83b77030bc93866f7e72137a8a Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Fri, 20 Mar 2026 15:37:40 -0300 Subject: [PATCH 14/35] chore: clarify vectorized in gcc and undefine v_select --- pandas/_libs/src/moments.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index ea5738f742e8b..a0723f87bbbb1 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -45,7 +45,7 @@ typedef long long v4si # if defined(__clang__) # define v_select(mask, a, b) ((mask) ? (a) : (b)) # else -// gcc doesn't have ternary operators when compiling C files. +// gcc doesn't have vectorized ternary operators when compiling C files. # define v_select(mask, a, b) \ ((v4d)(((v4si)(mask) & (v4si)(a)) | (~(v4si)(mask) & (v4si)(b)))) # endif @@ -140,6 +140,7 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values, return moments_merge(m_01, m_23, max_moment); } +# undef v_select #endif /* --- Scalar Fallback Implementation --- */ From 1d7e1085dfc5c7070fa0ef7e62cfbcc9065aea78 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Fri, 20 Mar 2026 17:31:39 -0300 Subject: [PATCH 15/35] perf: vectorize mask handling --- pandas/_libs/src/moments.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index a0723f87bbbb1..ce0869c6d9c03 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -32,11 +32,14 @@ static inline int omp_get_num_threads() { return 1; } #if defined(__clang__) && __has_attribute(ext_vector_type) typedef double v4d __attribute__((ext_vector_type(4), aligned(1))); typedef long long v4si __attribute__((ext_vector_type(4), aligned(1))); +typedef uint8_t v4u8 __attribute__((ext_vector_type(4), aligned(1))); # define PANDAS_HAS_SIMD 1 #elif defined(__GNUC__) && __has_attribute(vector_size) typedef double v4d __attribute__((vector_size(4 * sizeof(double)), aligned(1))); typedef long long v4si __attribute__((vector_size(4 * sizeof(long long)), aligned(1))); +typedef uint8_t v4u8 + __attribute__((vector_size(4 * sizeof(uint8_t)), aligned(1))); # define PANDAS_HAS_SIMD 1 #endif @@ -71,10 +74,10 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values, v4d v_val = *(v4d *)(values + i); if (mask) { - v4d v_m_val = {mask[i + 0] ? 1.0 : 0.0, mask[i + 1] ? 1.0 : 0.0, - mask[i + 2] ? 1.0 : 0.0, mask[i + 3] ? 1.0 : 0.0}; + v4u8 mask_vec = *(v4u8 *)(mask + i); + v4si mask_si = __builtin_convertvector(mask_vec, v4si); // replace val with NAN where mask is 1. - v_val = v_select(v_m_val == v_one, v_nan, v_val); + v_val = v_select(mask_si != 0, v_nan, v_val); } // skip_mask is 1.0 if we should skip, 0.0 otherwise From 40fd15349620212091ce2a0160148c2d2889c61d Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Fri, 20 Mar 2026 17:34:40 -0300 Subject: [PATCH 16/35] chore: remove inlines in favor of compiler discretion --- pandas/_libs/src/moments.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index ce0869c6d9c03..cb4d5d832b196 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -148,10 +148,9 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values, /* --- Scalar Fallback Implementation --- */ -static inline Moments accumulate_moments_scalar_block(const double *values, - int64_t n, int skipna, - const uint8_t *mask, - int max_moment) { +Moments accumulate_moments_scalar_block(const double *values, int64_t n, + int skipna, const uint8_t *mask, + int max_moment) { Moments moments = {0}; for (int64_t i = 0; i < n; i++) { double val = values[i]; @@ -166,10 +165,8 @@ static inline Moments accumulate_moments_scalar_block(const double *values, /* --- Accumulation Dispatch (Choose SIMD or Scalar) --- */ -static inline Moments accumulate_moments_dispatch(const double *values, - int64_t n, int skipna, - const uint8_t *mask, - int max_moment) { +Moments accumulate_moments_dispatch(const double *values, int64_t n, int skipna, + const uint8_t *mask, int max_moment) { #if defined(PANDAS_HAS_SIMD) return accumulate_moments_simd(values, n, skipna, mask, max_moment); #endif From 7d3f2860c9d01f0248b209c9e1f23510a4b3f356 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Fri, 20 Mar 2026 18:03:30 -0300 Subject: [PATCH 17/35] chore: comments --- pandas/_libs/src/moments.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index cb4d5d832b196..0c7ef0d17b85b 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -173,15 +173,13 @@ Moments accumulate_moments_dispatch(const double *values, int64_t n, int skipna, return accumulate_moments_scalar_block(values, n, skipna, mask, max_moment); } -/* --- Public API (Orchestrates OpenMP Parallelism) --- */ +/* --- Moments 1D Accumulator Implementation --- */ Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna, const uint8_t *mask, int max_moment) { Moments result = {0}; #ifdef _OPENMP - // parallel threshold chosen based on `perf report` where libgomp wasn't the - // overhead # pragma omp parallel if (n > 10000) #endif { From 6f9224e5b00e691c6db10b2f823748b6fb08be95 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sat, 21 Mar 2026 00:20:02 -0300 Subject: [PATCH 18/35] fix: fix loop bounds and avoid unnecessary round --- pandas/_libs/src/moments.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index 0c7ef0d17b85b..e9ec4f84fccfe 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -70,7 +70,7 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values, v4d v_nan = {NAN, NAN, NAN, NAN}; int64_t i = 0; - for (; i < n - 4; i += 4) { + for (; i < n - 3; i += 4) { v4d v_val = *(v4d *)(values + i); if (mask) { @@ -123,7 +123,7 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values, Moments moments_arr[4]; for (int j = 0; j < 4; j++) { - moments_arr[j] = (Moments){(int64_t)round(n_arr[j]), mean_arr[j], m2_arr[j], + moments_arr[j] = (Moments){(int64_t)n_arr[j], mean_arr[j], m2_arr[j], m3_arr[j], m4_arr[j]}; } From f3ba6abb201628ccd69fc781bcc874eed3c5223b Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sat, 21 Mar 2026 17:33:22 -0300 Subject: [PATCH 19/35] chore: add comments after endif --- pandas/_libs/src/moments.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index e9ec4f84fccfe..2f69da965c24a 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -14,19 +14,19 @@ The full license is in the LICENSE file, distributed with this software. #ifdef _OPENMP # include #else -static inline int omp_get_thread_num() { return 0; } -static inline int omp_get_num_threads() { return 1; } -#endif +static inline int omp_get_thread_num(void) { return 0; } +static inline int omp_get_num_threads(void) { return 1; } +#endif // _OPENMP #ifndef __has_attribute # define __has_attribute(x) 0 -#endif +#endif // __has_attribute #if defined(__x86_64__) && __has_attribute(target_clones) && defined(__GLIBC__) # define PANDAS_SIMD_TARGETS __attribute__((target_clones("avx2", "default"))) #else # define PANDAS_SIMD_TARGETS -#endif +#endif // x86_64 + glibc + target_clones /* --- SIMD Implementation --- */ #if defined(__clang__) && __has_attribute(ext_vector_type) @@ -41,7 +41,7 @@ typedef long long v4si typedef uint8_t v4u8 __attribute__((vector_size(4 * sizeof(uint8_t)), aligned(1))); # define PANDAS_HAS_SIMD 1 -#endif +#endif // __clang__ or __GNUC__ with vector extension #ifdef PANDAS_HAS_SIMD /* Vector select: returns (mask ? a : b) where mask is all-ones or all-zeros */ @@ -51,7 +51,7 @@ typedef uint8_t v4u8 // gcc doesn't have vectorized ternary operators when compiling C files. # define v_select(mask, a, b) \ ((v4d)(((v4si)(mask) & (v4si)(a)) | (~(v4si)(mask) & (v4si)(b)))) -# endif +# endif // defined(__clang__) PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values, int64_t n, int skipna, @@ -144,7 +144,7 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values, return moments_merge(m_01, m_23, max_moment); } # undef v_select -#endif +#endif // PANDAS_HAS_SIMD /* --- Scalar Fallback Implementation --- */ @@ -169,7 +169,7 @@ Moments accumulate_moments_dispatch(const double *values, int64_t n, int skipna, const uint8_t *mask, int max_moment) { #if defined(PANDAS_HAS_SIMD) return accumulate_moments_simd(values, n, skipna, mask, max_moment); -#endif +#endif // defined(PANDAS_HAS_SIMD) return accumulate_moments_scalar_block(values, n, skipna, mask, max_moment); } From 18566d2fad50cfce7cdf64cca75555dde4572d09 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sat, 21 Mar 2026 23:48:27 -0300 Subject: [PATCH 20/35] refactor: use `size_t` instead of signed integer for array size --- pandas/_libs/algos.pxd | 2 +- pandas/_libs/algos.pyx | 4 ++-- pandas/_libs/include/pandas/moments.h | 2 +- pandas/_libs/src/moments.c | 22 +++++++++++----------- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index 4d80831b603c4..8ae078f4363de 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -40,7 +40,7 @@ cdef extern from "pandas/moments.h": Moments accumulate_moments_scalar( const float64_t* values, - int64_t n, + size_t n, bint skipna, const uint8_t* mask, int max_moment, diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 2e4585566c486..f25df20c63f76 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1645,7 +1645,7 @@ def scalar_skew( ) -> float: cdef: Moments moments - Py_ssize_t n = values.shape[0] + size_t n = values.shape[0] const float64_t* p_values = &values[0] const uint8_t* p_mask = &mask[0] if mask is not None else NULL @@ -1664,7 +1664,7 @@ def scalar_kurt( ) -> float: cdef: Moments moments - Py_ssize_t n = values.shape[0] + size_t n = values.shape[0] const float64_t* p_values = &values[0] const uint8_t* p_mask = &mask[0] if mask is not None else NULL diff --git a/pandas/_libs/include/pandas/moments.h b/pandas/_libs/include/pandas/moments.h index e492e8200c82b..166ace232fb5a 100644 --- a/pandas/_libs/include/pandas/moments.h +++ b/pandas/_libs/include/pandas/moments.h @@ -86,7 +86,7 @@ static inline Moments moments_merge(Moments a, Moments b, int max_moment) { /// Compute central moments until `max_moment` using `n` elements from `values`. /// The size is represented as signed integer for MSVC compatibility with /// OpenMP. -Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna, +Moments accumulate_moments_scalar(const double *values, size_t n, int skipna, const uint8_t *mask, int max_moment); static inline double calc_skew(Moments moments) { diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index 2f69da965c24a..2276db43dc600 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -54,7 +54,7 @@ typedef uint8_t v4u8 # endif // defined(__clang__) PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values, - int64_t n, int skipna, + size_t n, int skipna, const uint8_t *mask, int max_moment) { v4d v_mean = {0.0, 0.0, 0.0, 0.0}; @@ -69,8 +69,8 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values, v4d v_zero = {0.0, 0.0, 0.0, 0.0}; v4d v_nan = {NAN, NAN, NAN, NAN}; - int64_t i = 0; - for (; i < n - 3; i += 4) { + size_t i = 0; + for (; i + 3 < n; i += 4) { v4d v_val = *(v4d *)(values + i); if (mask) { @@ -148,11 +148,11 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values, /* --- Scalar Fallback Implementation --- */ -Moments accumulate_moments_scalar_block(const double *values, int64_t n, +Moments accumulate_moments_scalar_block(const double *values, size_t n, int skipna, const uint8_t *mask, int max_moment) { Moments moments = {0}; - for (int64_t i = 0; i < n; i++) { + for (size_t i = 0; i < n; i++) { double val = values[i]; if (mask && mask[i]) val = NAN; @@ -165,7 +165,7 @@ Moments accumulate_moments_scalar_block(const double *values, int64_t n, /* --- Accumulation Dispatch (Choose SIMD or Scalar) --- */ -Moments accumulate_moments_dispatch(const double *values, int64_t n, int skipna, +Moments accumulate_moments_dispatch(const double *values, size_t n, int skipna, const uint8_t *mask, int max_moment) { #if defined(PANDAS_HAS_SIMD) return accumulate_moments_simd(values, n, skipna, mask, max_moment); @@ -175,7 +175,7 @@ Moments accumulate_moments_dispatch(const double *values, int64_t n, int skipna, /* --- Moments 1D Accumulator Implementation --- */ -Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna, +Moments accumulate_moments_scalar(const double *values, size_t n, int skipna, const uint8_t *mask, int max_moment) { Moments result = {0}; @@ -183,11 +183,11 @@ Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna, # pragma omp parallel if (n > 10000) #endif { - int tid = omp_get_thread_num(); - int num_threads = omp_get_num_threads(); + size_t tid = (size_t)omp_get_thread_num(); + size_t num_threads = (size_t)omp_get_num_threads(); - int64_t start = (tid * n) / num_threads; - int64_t end = tid == num_threads - 1 ? n : ((tid + 1) * n) / num_threads; + size_t start = (tid * n) / num_threads; + size_t end = tid == num_threads - 1 ? n : ((tid + 1) * n) / num_threads; Moments moments_local = accumulate_moments_dispatch(values + start, end - start, skipna, From 9254c3c24df0ae5e1c7beafe45d65f576779b465 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sun, 22 Mar 2026 00:01:54 -0300 Subject: [PATCH 21/35] fix: cast type to size_t to avoid conversion warnings --- pandas/_libs/algos.pyx | 4 ++-- pandas/_libs/groupby.pyx | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index f25df20c63f76..839bf31904a87 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1685,7 +1685,7 @@ def axis_skew( cdef: Py_ssize_t i Py_ssize_t nouter = values.shape[1] if axis == 0 else values.shape[0] - Moments *moments = calloc(nouter, sizeof(Moments)) + Moments *moments = calloc(nouter, sizeof(Moments)) ndarray result_arr = np.empty(nouter, dtype=np.float64) float64_t[:] result = result_arr @@ -1712,7 +1712,7 @@ def axis_kurt( cdef: Py_ssize_t i Py_ssize_t nouter = values.shape[1] if axis == 0 else values.shape[0] - Moments *moments = calloc(nouter, sizeof(Moments)) + Moments *moments = calloc(nouter, sizeof(Moments)) ndarray result_arr = np.empty(nouter, dtype=np.float64) float64_t[:] result = result_arr diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 14b8320e7b044..d6529623e41de 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1034,7 +1034,7 @@ def group_skew( raise ValueError("len(index) != len(labels)") N, K = (values).shape - ms = calloc(ngroups * K, sizeof(Moments)) + ms = calloc(ngroups * K, sizeof(Moments)) if ms is NULL: raise MemoryError() @@ -1092,7 +1092,7 @@ def group_kurt( raise ValueError("len(index) != len(labels)") N, K = (values).shape - ms = calloc(ngroups * K, sizeof(Moments)) + ms = calloc(ngroups * K, sizeof(Moments)) if ms is NULL: raise MemoryError() From 933db467972fb3b4d4b246a84aabcdc3c369fbbc Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sun, 22 Mar 2026 00:19:51 -0300 Subject: [PATCH 22/35] add note about performance --- pandas/_libs/algos.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 839bf31904a87..af06f89d36849 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1626,7 +1626,13 @@ cdef void accumulate_moments_axis( nouter = nrows ninner = ncols + # PERF: The outer dimension is embarrassingly parallel for i in range(nouter): + # PERF: If it's possible to enforce contiguity in the last dimension, + # it's possible to use accumulate_moments_scalar with SIMD instructions. + # It may create more threads if OMP_NESTED is enabled and the outer loop + # is run in parallel. + # If it doesn't run in parallel, it will at least use SIMD instructions. for j in range(ninner): val = values[j, i] if axis == 0 else values[i, j] if uses_mask and (mask[j, i] if axis == 0 else mask[i, j]): From 5acaa4c498162e778cf5a80d1a34bdbdf14b4401 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sun, 22 Mar 2026 12:08:45 -0300 Subject: [PATCH 23/35] feat: make OpenMP optional and disabled by default --- doc/source/development/contributing_environment.rst | 3 ++- meson_options.txt | 1 + pandas/_libs/meson.build | 7 ++++++- 3 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 meson_options.txt diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index ce0430bd9e876..b5034dee446ca 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -169,12 +169,13 @@ To compile and install pandas in editable mode, run: python -m pip install --verbose --editable . --no-build-isolation -Additional Meson options can be passed to the ``pip install`` command to modify the installation. +Additional Meson and Pandas options can be passed to the ``pip install`` command to modify the installation. Helpful options include: * ``-Ceditable-verbose=true``: Print verbose logs during rebuild, even during ``import``. * ``-Cbuilddir="your builddir here"``: Specify a different build directory for the C extensions. * ``-Csetup-args="-Ddebug=true"``: Compile the C extensions with debug symbols. +* ``-Csetup-args=-Duse_openmp=true``: Compile Pandas with parallelization support through OpenMP. .. note:: When pandas is installed in ``--editable`` mode, pandas will automatically rebuild the library upon ``import``, diff --git a/meson_options.txt b/meson_options.txt new file mode 100644 index 0000000000000..536c1cadeb718 --- /dev/null +++ b/meson_options.txt @@ -0,0 +1 @@ +option('use_openmp', type: 'boolean', value: false, description: 'Whether to enable OpenMP support') diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index 2c2122613d3e3..694f792fdb72d 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -55,7 +55,12 @@ fast_float = subproject('fast_float') fast_float_dep = fast_float.get_variable('fast_float_dep') subdir('tslibs') -openmp_dep = dependency('OpenMP', required: false) + +if get_option('use_openmp') + openmp_dep = dependency('OpenMP', required: true) +else + openmp_dep = declare_dependency() +endif libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} From d62ac23899b2736cc42f73411b8762f25b6433f6 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sun, 22 Mar 2026 19:12:12 -0300 Subject: [PATCH 24/35] refactor: use feature instead of boolean for openmp --- doc/source/development/contributing_environment.rst | 2 +- meson_options.txt | 2 +- pandas/_libs/meson.build | 6 +----- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index b5034dee446ca..012304b3da3fc 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -175,7 +175,7 @@ Helpful options include: * ``-Ceditable-verbose=true``: Print verbose logs during rebuild, even during ``import``. * ``-Cbuilddir="your builddir here"``: Specify a different build directory for the C extensions. * ``-Csetup-args="-Ddebug=true"``: Compile the C extensions with debug symbols. -* ``-Csetup-args=-Duse_openmp=true``: Compile Pandas with parallelization support through OpenMP. +* ``-Csetup-args=-Dopenmp=enabled``: Compile Pandas with parallelization support through OpenMP. .. note:: When pandas is installed in ``--editable`` mode, pandas will automatically rebuild the library upon ``import``, diff --git a/meson_options.txt b/meson_options.txt index 536c1cadeb718..128bb55b2f3f6 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -1 +1 @@ -option('use_openmp', type: 'boolean', value: false, description: 'Whether to enable OpenMP support') +option('openmp', type: 'feature', value: 'disabled', description: 'Parallelization with OpenMP') diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index 694f792fdb72d..e4db9972dd44d 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -56,11 +56,7 @@ fast_float_dep = fast_float.get_variable('fast_float_dep') subdir('tslibs') -if get_option('use_openmp') - openmp_dep = dependency('OpenMP', required: true) -else - openmp_dep = declare_dependency() -endif +openmp_dep = dependency('OpenMP', required: get_option('openmp')) libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} From bc43aaad0d4d75e8425969e049ae1b8abde1c0e0 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Mon, 23 Mar 2026 17:57:34 -0300 Subject: [PATCH 25/35] perf: use OpenMP automatically --- doc/source/development/contributing_environment.rst | 2 +- meson_options.txt | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index 012304b3da3fc..b814e1ca96020 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -175,7 +175,7 @@ Helpful options include: * ``-Ceditable-verbose=true``: Print verbose logs during rebuild, even during ``import``. * ``-Cbuilddir="your builddir here"``: Specify a different build directory for the C extensions. * ``-Csetup-args="-Ddebug=true"``: Compile the C extensions with debug symbols. -* ``-Csetup-args=-Dopenmp=enabled``: Compile Pandas with parallelization support through OpenMP. +* ``-Csetup-args=-Dopenmp=disabled``: Compile Pandas without parallelization support through OpenMP. .. note:: When pandas is installed in ``--editable`` mode, pandas will automatically rebuild the library upon ``import``, diff --git a/meson_options.txt b/meson_options.txt index 128bb55b2f3f6..22fc68252de52 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -1 +1,5 @@ -option('openmp', type: 'feature', value: 'disabled', description: 'Parallelization with OpenMP') +option( + 'openmp', + type: 'feature', + description: 'Parallelization with OpenMP', +) From baf66c4834666fb8026f221dd04362c6454eed73 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Mon, 23 Mar 2026 19:04:06 -0300 Subject: [PATCH 26/35] fix: use meson.options to fix the build error --- meson_options.txt => meson.options | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename meson_options.txt => meson.options (100%) diff --git a/meson_options.txt b/meson.options similarity index 100% rename from meson_options.txt rename to meson.options From 893796fc65e251da3e79afc79b54436ca0e36084 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Mon, 23 Mar 2026 21:24:26 -0300 Subject: [PATCH 27/35] docs(moments): remove outdated comment about MSVC --- pandas/_libs/include/pandas/moments.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/include/pandas/moments.h b/pandas/_libs/include/pandas/moments.h index 166ace232fb5a..e9516acf4a881 100644 --- a/pandas/_libs/include/pandas/moments.h +++ b/pandas/_libs/include/pandas/moments.h @@ -84,8 +84,6 @@ static inline Moments moments_merge(Moments a, Moments b, int max_moment) { } /// Compute central moments until `max_moment` using `n` elements from `values`. -/// The size is represented as signed integer for MSVC compatibility with -/// OpenMP. Moments accumulate_moments_scalar(const double *values, size_t n, int skipna, const uint8_t *mask, int max_moment); From 57d3cb1a3b2b8f4b6349b2c6c13164e3132833b5 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Mon, 23 Mar 2026 21:26:53 -0300 Subject: [PATCH 28/35] fix: don't force compiler for vector extension --- pandas/_libs/src/moments.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index 2276db43dc600..b80dadf350d04 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -29,19 +29,19 @@ static inline int omp_get_num_threads(void) { return 1; } #endif // x86_64 + glibc + target_clones /* --- SIMD Implementation --- */ -#if defined(__clang__) && __has_attribute(ext_vector_type) +#if __has_attribute(ext_vector_type) typedef double v4d __attribute__((ext_vector_type(4), aligned(1))); typedef long long v4si __attribute__((ext_vector_type(4), aligned(1))); typedef uint8_t v4u8 __attribute__((ext_vector_type(4), aligned(1))); # define PANDAS_HAS_SIMD 1 -#elif defined(__GNUC__) && __has_attribute(vector_size) +#elif __has_attribute(vector_size) typedef double v4d __attribute__((vector_size(4 * sizeof(double)), aligned(1))); typedef long long v4si __attribute__((vector_size(4 * sizeof(long long)), aligned(1))); typedef uint8_t v4u8 __attribute__((vector_size(4 * sizeof(uint8_t)), aligned(1))); # define PANDAS_HAS_SIMD 1 -#endif // __clang__ or __GNUC__ with vector extension +#endif // __has_attribute(ext_vector_type) #ifdef PANDAS_HAS_SIMD /* Vector select: returns (mask ? a : b) where mask is all-ones or all-zeros */ From 10e14e57237cc11f96e915324c7f858c9725b9dc Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Mon, 23 Mar 2026 21:42:29 -0300 Subject: [PATCH 29/35] chore: better variable names in aggregations.pyx --- pandas/_libs/window/aggregations.pyx | 120 +++++++++++++-------------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 3ffc14c1b4cbc..45a765c468aaa 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -493,22 +493,22 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, # Rolling skewness -cdef void add_skew(float64_t val, Moments *m, +cdef void add_skew(float64_t val, Moments *moments, bint *numerically_unstable, ) noexcept nogil: """ add a value from the skew calc """ cdef: - float64_t old_m3 = m.m3 + float64_t old_m3 = moments.m3 # Not NaN if val == val: - moments_add_value(m, val, 3) - if fabs(old_m3) * InvCondTol > fabs(m.m3): + moments_add_value(moments, val, 3) + if fabs(old_m3) * InvCondTol > fabs(moments.m3): # possible catastrophic cancellation numerically_unstable[0] = True -cdef void remove_skew(float64_t val, Moments *m, +cdef void remove_skew(float64_t val, Moments *moments, bint *numerically_unstable) noexcept nogil: """ remove a value from the skew calc """ cdef: @@ -526,22 +526,22 @@ cdef void remove_skew(float64_t val, Moments *m, # Not NaN if val == val: - m.n -= 1 - n = (m.n) - delta = val - m.mean + moments.n -= 1 + n = (moments.n) + delta = val - moments.mean delta_n = delta / n term1 = delta_n * delta * (n + 1.0) - m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * m.m2) - new_m3 = m.m3 - m3_update + m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * moments.m2) + new_m3 = moments.m3 - m3_update - if (fabs(m3_update) + fabs(m.m3)) * InvCondTol > fabs(new_m3): + if (fabs(m3_update) + fabs(moments.m3)) * InvCondTol > fabs(new_m3): # possible catastrophic cancellation numerically_unstable[0] = True - m.m3 = new_m3 - m.m2 -= term1 - m.mean -= delta_n + moments.m3 = new_m3 + moments.m2 -= term1 + moments.mean -= delta_n def roll_skew(const float64_t[:] values, ndarray[int64_t] start, @@ -549,7 +549,7 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start, cdef: Py_ssize_t i, j float64_t val - Moments m + Moments moments int64_t N = len(start) int64_t s, e ndarray[float64_t] output @@ -582,35 +582,35 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start, # calculate deletes for j in range(start[i - 1], s): val = values[j] - remove_skew(val, &m, &numerically_unstable) + remove_skew(val, &moments, &numerically_unstable) # calculate adds for j in range(end[i - 1], e): val = values[j] - add_skew(val, &m, &numerically_unstable) + add_skew(val, &moments, &numerically_unstable) if requires_recompute or numerically_unstable: - m.n = 0 - m.mean = 0.0 - m.m2 = 0.0 - m.m3 = 0.0 - m.m4 = 0.0 + moments.n = 0 + moments.mean = 0.0 + moments.m2 = 0.0 + moments.m3 = 0.0 + moments.m4 = 0.0 for j in range(s, e): val = values[j] - add_skew(val, &m, &numerically_unstable) + add_skew(val, &moments, &numerically_unstable) numerically_unstable = False - output[i] = NaN if m.n < minp else calc_skew(m) + output[i] = NaN if moments.n < minp else calc_skew(moments) if not is_monotonic_increasing_bounds: - m.n = 0 - m.mean = 0.0 - m.m2 = 0.0 - m.m3 = 0.0 - m.m4 = 0.0 + moments.n = 0 + moments.mean = 0.0 + moments.m2 = 0.0 + moments.m3 = 0.0 + moments.m4 = 0.0 return output @@ -618,22 +618,22 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start, # Rolling kurtosis -cdef void add_kurt(float64_t val, Moments *m, +cdef void add_kurt(float64_t val, Moments *moments, bint *numerically_unstable, ) noexcept nogil: """ add a value from the kurotic calc """ cdef: - float64_t old_m4 = m.m4 + float64_t old_m4 = moments.m4 # Not NaN if val == val: - moments_add_value(m, val, 4) - if fabs(old_m4) * InvCondTol > fabs(m.m4): + moments_add_value(moments, val, 4) + if fabs(old_m4) * InvCondTol > fabs(moments.m4): # possible catastrophic cancellation numerically_unstable[0] = True -cdef void remove_kurt(float64_t val, Moments *m, +cdef void remove_kurt(float64_t val, Moments *moments, bint *numerically_unstable, ) noexcept nogil: """ remove a value from the kurotic calc """ @@ -642,36 +642,36 @@ cdef void remove_kurt(float64_t val, Moments *m, # Not NaN if val == val: - m.n -= 1 - n = (m.n) - delta = val - m.mean + moments.n -= 1 + n = (moments.n) + delta = val - moments.mean delta_n = delta / n term1 = delta_n * delta * (n + 1.0) m4_update = delta_n * ( - 4.0 * m.m3 + 4.0 * moments.m3 + delta_n * ( - 6.0 * m.m2 + 6.0 * moments.m2 - term1 * (n * n + 3.0 * n + 3.0) ) ) - new_m4 = m.m4 + m4_update + new_m4 = moments.m4 + m4_update - if (fabs(m4_update) + fabs(m.m4)) * InvCondTol > fabs(new_m4): + if (fabs(m4_update) + fabs(moments.m4)) * InvCondTol > fabs(new_m4): # possible catastrophic cancellation numerically_unstable[0] = True - m.m4 = new_m4 - m.m3 -= delta_n * (term1 * (n + 2.0) - 3.0 * m.m2) - m.m2 -= term1 - m.mean -= delta_n + moments.m4 = new_m4 + moments.m3 -= delta_n * (term1 * (n + 2.0) - 3.0 * moments.m2) + moments.m2 -= term1 + moments.mean -= delta_n def roll_kurt(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp) -> np.ndarray: cdef: Py_ssize_t i, j - Moments m + Moments moments int64_t s, e int64_t N = len(start) ndarray[float64_t] output @@ -704,32 +704,32 @@ def roll_kurt(const float64_t[:] values, ndarray[int64_t] start, # and removed # calculate deletes for j in range(start[i - 1], s): - remove_kurt(values[j], &m, &numerically_unstable) + remove_kurt(values[j], &moments, &numerically_unstable) # calculate adds for j in range(end[i - 1], e): - add_kurt(values[j], &m, &numerically_unstable) + add_kurt(values[j], &moments, &numerically_unstable) if requires_recompute or numerically_unstable: - m.n = 0 - m.mean = 0.0 - m.m2 = 0.0 - m.m3 = 0.0 - m.m4 = 0.0 + moments.n = 0 + moments.mean = 0.0 + moments.m2 = 0.0 + moments.m3 = 0.0 + moments.m4 = 0.0 for j in range(s, e): - add_kurt(values[j], &m, &numerically_unstable) + add_kurt(values[j], &moments, &numerically_unstable) numerically_unstable = False - output[i] = NaN if m.n < minp else calc_kurt(m) + output[i] = NaN if moments.n < minp else calc_kurt(moments) if not is_monotonic_increasing_bounds: - m.n = 0 - m.mean = 0.0 - m.m2 = 0.0 - m.m3 = 0.0 - m.m4 = 0.0 + moments.n = 0 + moments.mean = 0.0 + moments.m2 = 0.0 + moments.m3 = 0.0 + moments.m4 = 0.0 return output From 1f0953c43e0f96e926d859b1fc0de1ca049494d4 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Thu, 2 Apr 2026 22:51:41 -0300 Subject: [PATCH 30/35] refactor: simplify diff --- pandas/_libs/algos.pxd | 75 ++++++++++----- pandas/_libs/algos.pyx | 59 +++++++----- pandas/_libs/groupby.pyx | 43 +++++---- pandas/_libs/include/pandas/moments.h | 92 ++++-------------- pandas/_libs/src/moments.c | 126 ++++++++++++++++++++----- pandas/_libs/window/aggregations.pyx | 131 +++++++++++++------------- 6 files changed, 304 insertions(+), 222 deletions(-) diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index 8ae078f4363de..b5859ba2af3e8 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -19,36 +19,67 @@ cdef numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) noexce cdef extern from "pandas/moments.h": - ctypedef struct Moments: - int64_t n - float64_t mean - float64_t m2 - float64_t m3 - float64_t m4 - void moments_add_value( - Moments* moments, - float64_t val, - int max_moment, - ) nogil - - Moments moments_merge( - Moments a, - Moments b, - int max_moment, - ) nogil + float64_t val, + int64_t* nobs, + float64_t* mean, + float64_t* m2, + float64_t* m3, + float64_t* m4, + int max_moment, + ) noexcept nogil - Moments accumulate_moments_scalar( - const float64_t* values, + void accumulate_moments_scalar( size_t n, + const float64_t* values, bint skipna, const uint8_t* mask, + int64_t* nobs, + float64_t* mean, + float64_t* m2, + float64_t* m3, + float64_t* m4, int max_moment, - ) nogil + ) noexcept nogil + + +@cython.cdivision(True) +cdef inline float64_t calc_skew( + int64_t nobs, float64_t m2, float64_t m3 +) noexcept nogil: + cdef: + float64_t moments_ratio, correction, dnobs + + if nobs < 3: + return NAN + + dnobs = nobs + + moments_ratio = m3 / (m2 * sqrt(m2)) + correction = (dnobs * sqrt(dnobs - 1.0)) / (dnobs - 2.0) + return moments_ratio * correction + + +@cython.cdivision(True) +cdef inline float64_t calc_kurt( + int64_t nobs, float64_t m2, float64_t m4 +) noexcept nogil: + cdef: + float64_t result, dnobs, term1, term2, inner, correction + float64_t moments_ratio + + if nobs < 4: + return NAN + dnobs = nobs + moments_ratio = m4 / (m2 * m2) + term1 = dnobs * (dnobs + 1.0) * moments_ratio + term2 = 3.0 * (dnobs - 1.0) + inner = term1 - term2 - float64_t calc_skew(Moments) nogil + correction = (dnobs - 1.0) / ((dnobs - 2.0) * (dnobs - 3.0)) + result = correction * inner - float64_t calc_kurt(Moments) nogil + return result cdef enum TiebreakEnumType: diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index af06f89d36849..f05fa3e1819b1 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -5,7 +5,6 @@ from libc.math cimport ( sqrt, ) from libc.stdlib cimport ( - calloc, free, malloc, ) @@ -1607,7 +1606,11 @@ cdef void accumulate_moments_axis( const float64_t[:, :] values, bint skipna, const uint8_t[:, :] mask, - Moments *moments, + int64_t[:] nobs, + float64_t[:] mean, + float64_t[:] m2, + float64_t[:] m3, + float64_t[:] m4, int axis, int max_moment, ) noexcept nogil: @@ -1616,6 +1619,8 @@ cdef void accumulate_moments_axis( Py_ssize_t nouter, ninner bint uses_mask = mask is not None float64_t val + float64_t* m3_ptr = NULL + float64_t* m4_ptr = NULL if axis == 0: # Assumes F-contiguous @@ -1628,6 +1633,10 @@ cdef void accumulate_moments_axis( # PERF: The outer dimension is embarrassingly parallel for i in range(nouter): + if max_moment >= 3: + m3_ptr = &m3[i] + if max_moment >= 4: + m4_ptr = &m4[i] # PERF: If it's possible to enforce contiguity in the last dimension, # it's possible to use accumulate_moments_scalar with SIMD instructions. # It may create more threads if OMP_NESTED is enabled and the outer loop @@ -1639,7 +1648,8 @@ cdef void accumulate_moments_axis( val = NaN if skipna and isnan(val): continue - moments_add_value(&moments[i], val, max_moment) + moments_add_value(val, &nobs[i], &mean[i], &m2[i], m3_ptr, m4_ptr, + max_moment) @cython.boundscheck(False) @@ -1650,15 +1660,17 @@ def scalar_skew( const uint8_t[::1] mask, ) -> float: cdef: - Moments moments + int64_t nobs = 0 + float64_t mean = 0.0, m2 = 0.0, m3 = 0.0 size_t n = values.shape[0] const float64_t* p_values = &values[0] const uint8_t* p_mask = &mask[0] if mask is not None else NULL with nogil: - moments = accumulate_moments_scalar(p_values, n, skipna, p_mask, 3) + accumulate_moments_scalar(n, p_values, skipna, p_mask, + &nobs, &mean, &m2, &m3, NULL, 3) - return calc_skew(moments) + return calc_skew(nobs, m2, m3) @cython.boundscheck(False) @@ -1669,15 +1681,17 @@ def scalar_kurt( const uint8_t[::1] mask, ) -> float: cdef: - Moments moments + int64_t nobs = 0 + float64_t mean = 0.0, m2 = 0.0, m3 = 0.0, m4 = 0.0 size_t n = values.shape[0] const float64_t* p_values = &values[0] const uint8_t* p_mask = &mask[0] if mask is not None else NULL with nogil: - moments = accumulate_moments_scalar(p_values, n, skipna, p_mask, 4) + accumulate_moments_scalar(n, p_values, skipna, p_mask, + &nobs, &mean, &m2, &m3, &m4, 4) - return calc_kurt(moments) + return calc_kurt(nobs, m2, m4) @cython.boundscheck(False) @@ -1691,19 +1705,18 @@ def axis_skew( cdef: Py_ssize_t i Py_ssize_t nouter = values.shape[1] if axis == 0 else values.shape[0] - Moments *moments = calloc(nouter, sizeof(Moments)) + int64_t[::1] nobs = np.zeros(nouter, dtype=np.int64) + float64_t[::1] mean = np.zeros(nouter) + float64_t[::1] m2 = np.zeros(nouter) + float64_t[::1] m3 = np.zeros(nouter) ndarray result_arr = np.empty(nouter, dtype=np.float64) float64_t[:] result = result_arr - if moments == NULL: - raise MemoryError() - with nogil: - accumulate_moments_axis(values, skipna, mask, moments, axis, 3) + accumulate_moments_axis(values, skipna, mask, nobs, mean, m2, m3, None, axis, 3) for i in range(nouter): - result[i] = calc_skew(moments[i]) + result[i] = calc_skew(nobs[i], m2[i], m3[i]) - free(moments) return result_arr @@ -1718,19 +1731,19 @@ def axis_kurt( cdef: Py_ssize_t i Py_ssize_t nouter = values.shape[1] if axis == 0 else values.shape[0] - Moments *moments = calloc(nouter, sizeof(Moments)) + int64_t[::1] nobs = np.zeros(nouter, dtype=np.int64) + float64_t[::1] mean = np.zeros(nouter) + float64_t[::1] m2 = np.zeros(nouter) + float64_t[::1] m3 = np.zeros(nouter) + float64_t[::1] m4 = np.zeros(nouter) ndarray result_arr = np.empty(nouter, dtype=np.float64) float64_t[:] result = result_arr - if moments == NULL: - raise MemoryError() - with nogil: - accumulate_moments_axis(values, skipna, mask, moments, axis, 4) + accumulate_moments_axis(values, skipna, mask, nobs, mean, m2, m3, m4, axis, 4) for i in range(nouter): - result[i] = calc_kurt(moments[i]) + result[i] = calc_kurt(nobs[i], m2[i], m4[i]) - free(moments) return result_arr diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d6529623e41de..15abebef987ed 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -10,7 +10,6 @@ from libc.math cimport ( sqrt, ) from libc.stdlib cimport ( - calloc, free, malloc, ) @@ -35,7 +34,6 @@ cnp.import_array() from pandas._libs cimport util from pandas._libs.algos cimport ( - Moments, calc_kurt, calc_skew, get_rank_nan_fill_val, @@ -1025,18 +1023,23 @@ def group_skew( ) -> None: cdef: Py_ssize_t i, j, N, K, lab, ngroups = len(counts) + int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) bint uses_mask = mask is not None - Moments* ms + float64_t[:, ::1] mean, M2, M3 float64_t val if len_values != len_labels: raise ValueError("len(index) != len(labels)") + nobs = np.zeros((out).shape, dtype=np.int64) + + mean = np.zeros((out).shape, dtype=np.float64) + # M2, and M3 correspond to 2nd and 3rd central moments + M2 = np.zeros((out).shape, dtype=np.float64) + M3 = np.zeros((out).shape, dtype=np.float64) + N, K = (values).shape - ms = calloc(ngroups * K, sizeof(Moments)) - if ms is NULL: - raise MemoryError() out[:, :] = 0.0 @@ -1057,16 +1060,15 @@ def group_skew( if skipna and (isnan(val) or _treat_as_na(val, False)): continue - moments_add_value(&ms[lab * K + j], val, 3) + moments_add_value(val, &nobs[lab, j], &mean[lab, j], &M2[lab, j], + &M3[lab, j], NULL, 3) for i in range(ngroups): for j in range(K): - out[i, j] = calc_skew(ms[i * K + j]) + out[i, j] = calc_skew(nobs[i, j], M2[i, j], M3[i, j]) if result_mask is not None and isnan(out[i, j]): result_mask[i, j] = 1 - free(ms) - @cython.wraparound(False) @cython.boundscheck(False) @@ -1083,18 +1085,24 @@ def group_kurt( ) -> None: cdef: Py_ssize_t i, j, N, K, lab, ngroups = len(counts) + int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) bint uses_mask = mask is not None - Moments* ms + float64_t[:, ::1] mean, M2, M3, M4 float64_t val if len_values != len_labels: raise ValueError("len(index) != len(labels)") + nobs = np.zeros((out).shape, dtype=np.int64) + + mean = np.zeros((out).shape, dtype=np.float64) + # M2, M3 and M4 correspond to 2nd, 3rd and 4th Central Moments + M2 = np.zeros((out).shape, dtype=np.float64) + M3 = np.zeros((out).shape, dtype=np.float64) + M4 = np.zeros((out).shape, dtype=np.float64) + N, K = (values).shape - ms = calloc(ngroups * K, sizeof(Moments)) - if ms is NULL: - raise MemoryError() out[:, :] = 0.0 @@ -1115,16 +1123,15 @@ def group_kurt( if skipna and (isnan(val) or _treat_as_na(val, False)): continue - moments_add_value(&ms[lab * K + j], val, 4) + moments_add_value(val, &nobs[lab, j], &mean[lab, j], &M2[lab, j], + &M3[lab, j], &M4[lab, j], 4) for i in range(ngroups): for j in range(K): - out[i, j] = calc_kurt(ms[i * K + j]) + out[i, j] = calc_kurt(nobs[i, j], M2[i, j], M4[i, j]) if result_mask is not None and isnan(out[i, j]): result_mask[i, j] = 1 - free(ms) - @cython.wraparound(False) @cython.boundscheck(False) diff --git a/pandas/_libs/include/pandas/moments.h b/pandas/_libs/include/pandas/moments.h index e9516acf4a881..51d62486c191b 100644 --- a/pandas/_libs/include/pandas/moments.h +++ b/pandas/_libs/include/pandas/moments.h @@ -9,7 +9,7 @@ The full license is in the LICENSE file, distributed with this software. #pragma once -#include +#include #include #include @@ -18,6 +18,9 @@ The full license is in the LICENSE file, distributed with this software. extern "C" { #endif +// NOTE: It may be better to use the Moments struct +// instead of using multiple double references +// and a signed value to nobs. typedef struct { int64_t n; double mean; @@ -26,90 +29,33 @@ typedef struct { double m4; } Moments; -static inline void moments_add_value(Moments *moments, double val, +static inline void moments_add_value(double val, int64_t *nobs, double *mean, + double *m2, double *m3, double *m4, int max_moment) { - double delta = val - moments->mean; - moments->n++; - double n = (double)moments->n; + double delta = val - *mean; + (*nobs)++; + double n = (double)(*nobs); double delta_n = delta / n; double term1 = delta * delta_n * (n - 1.0); if (max_moment >= 4) { - moments->m4 += - delta_n * - (-4.0 * moments->m3 + - delta_n * (6.0 * moments->m2 + term1 * (n * n - 3.0 * n + 3.0))); + *m4 += delta_n * (-4.0 * *m3 + + delta_n * (6.0 * *m2 + term1 * (n * n - 3.0 * n + 3.0))); } if (max_moment >= 3) { - moments->m3 += delta_n * (term1 * (n - 2.0) - 3.0 * moments->m2); + *m3 += delta_n * (term1 * (n - 2.0) - 3.0 * *m2); } - moments->m2 += term1; - moments->mean += delta_n; + *m2 += term1; + *mean += delta_n; } -static inline Moments moments_merge(Moments a, Moments b, int max_moment) { - - if (a.n == 0) { - return b; - } - if (b.n == 0) { - return a; - } - - Moments result; - - result.n = a.n + b.n; - double n_a = (double)a.n; - double n_b = (double)b.n; - double delta = b.mean - a.mean; - double delta_n = delta / (double)result.n; - double term1 = delta * delta_n * n_a * n_b; - - if (max_moment >= 4) { - result.m4 = - a.m4 + b.m4 + - delta_n * (4.0 * (n_a * b.m3 - n_b * a.m3) + - delta_n * (6.0 * (n_a * n_a * b.m2 + n_b * n_b * a.m2) + - term1 * (n_a * n_a - n_a * n_b + n_b * n_b))); - } - if (max_moment >= 3) { - result.m3 = - a.m3 + b.m3 + - delta_n * (3.0 * (n_a * b.m2 - n_b * a.m2) + term1 * (n_a - n_b)); - } - result.m2 = a.m2 + b.m2 + term1; - result.mean = a.mean + delta_n * n_b; - - return result; -} +Moments moments_merge(Moments a, Moments b, int max_moment); /// Compute central moments until `max_moment` using `n` elements from `values`. -Moments accumulate_moments_scalar(const double *values, size_t n, int skipna, - const uint8_t *mask, int max_moment); - -static inline double calc_skew(Moments moments) { - if (moments.n < 3) { - return NAN; - } - double dnobs = (double)moments.n; - double moments_ratio = moments.m3 / (moments.m2 * sqrt(moments.m2)); - double correction = (dnobs * sqrt(dnobs - 1.0)) / (dnobs - 2.0); - return moments_ratio * correction; -} - -static inline double calc_kurt(Moments moments) { - if (moments.n < 4) { - return NAN; - } - double dnobs = (double)moments.n; - double moments_ratio = moments.m4 / (moments.m2 * moments.m2); - double term1 = dnobs * (dnobs + 1.0) * moments_ratio; - double term2 = 3.0 * (dnobs - 1.0); - double inner = term1 - term2; - double correction = (dnobs - 1.0) / ((dnobs - 2.0) * (dnobs - 3.0)); - return correction * inner; -} - +void accumulate_moments_scalar(size_t n, const double *values, bool skipna, + const uint8_t *mask, int64_t *nobs, double *mean, + double *m2, double *m3, double *m4, + int max_moment); #ifdef __cplusplus } #endif diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index b80dadf350d04..144aeae5eae9f 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -18,6 +18,63 @@ static inline int omp_get_thread_num(void) { return 0; } static inline int omp_get_num_threads(void) { return 1; } #endif // _OPENMP +static inline void moments_add_valuem(Moments *moments, double val, + int max_moment) { + double delta = val - moments->mean; + moments->n++; + double n = (double)moments->n; + double delta_n = delta / n; + double term1 = delta * delta_n * (n - 1.0); + + if (max_moment >= 4) { + moments->m4 += + delta_n * + (-4.0 * moments->m3 + + delta_n * (6.0 * moments->m2 + term1 * (n * n - 3.0 * n + 3.0))); + } + if (max_moment >= 3) { + moments->m3 += delta_n * (term1 * (n - 2.0) - 3.0 * moments->m2); + } + moments->m2 += term1; + moments->mean += delta_n; +} + +Moments moments_merge(Moments a, Moments b, int max_moment) { + + if (a.n == 0) { + return b; + } + if (b.n == 0) { + return a; + } + + Moments result; + + result.n = a.n + b.n; + double n_a = (double)a.n; + double n_b = (double)b.n; + double delta = b.mean - a.mean; + double delta_n = delta / (double)result.n; + double term1 = delta * delta_n * n_a * n_b; + + if (max_moment >= 4) { + result.m4 = + a.m4 + b.m4 + + delta_n * (4.0 * (n_a * b.m3 - n_b * a.m3) + + delta_n * (6.0 * (n_a * n_a * b.m2 + n_b * n_b * a.m2) + + term1 * (n_a * n_a - n_a * n_b + n_b * n_b))); + } + if (max_moment >= 3) { + result.m3 = + a.m3 + b.m3 + + delta_n * (3.0 * (n_a * b.m2 - n_b * a.m2) + term1 * (n_a - n_b)); + } + result.m2 = a.m2 + b.m2 + term1; + result.mean = a.mean + delta_n * n_b; + + return result; +} + #ifndef __has_attribute # define __has_attribute(x) 0 #endif // __has_attribute @@ -53,10 +110,9 @@ typedef uint8_t v4u8 ((v4d)(((v4si)(mask) & (v4si)(a)) | (~(v4si)(mask) & (v4si)(b)))) # endif // defined(__clang__) -PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values, - size_t n, int skipna, - const uint8_t *mask, - int max_moment) { +PANDAS_SIMD_TARGETS +Moments accumulate_moments_simd(size_t n, const double *values, int skipna, + const uint8_t *mask, int max_moment) { v4d v_mean = {0.0, 0.0, 0.0, 0.0}; v4d v_m2 = {0.0, 0.0, 0.0, 0.0}; v4d v_m3 = {0.0, 0.0, 0.0, 0.0}; @@ -114,17 +170,16 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values, v_mean += v_delta_n; } - double n_arr[4], mean_arr[4], m2_arr[4], m3_arr[4], m4_arr[4]; - *(v4d *)n_arr = v_n; + double n_arrd[4], mean_arr[4], m2_arr[4], m3_arr[4], m4_arr[4]; + *(v4d *)n_arrd = v_n; *(v4d *)mean_arr = v_mean; *(v4d *)m2_arr = v_m2; *(v4d *)m3_arr = v_m3; *(v4d *)m4_arr = v_m4; - Moments moments_arr[4]; + int64_t n_arr[4]; for (int j = 0; j < 4; j++) { - moments_arr[j] = (Moments){(int64_t)n_arr[j], mean_arr[j], m2_arr[j], - m3_arr[j], m4_arr[j]}; + n_arr[j] = (int64_t)n_arrd[j]; } // Distribute remaining values across chunks @@ -134,7 +189,16 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values, val = NAN; if (skipna && isnan(val)) continue; - moments_add_value(&moments_arr[i % 4], val, max_moment); + moments_add_value(val, &n_arr[i % 4], &mean_arr[i % 4], &m2_arr[i % 4], + &m3_arr[i % 4], &m4_arr[i % 4], max_moment); + } + Moments moments_arr[4]; + for (int j = 0; j < 4; j++) { + moments_arr[j] = (Moments){.n = n_arr[j], + .mean = mean_arr[j], + .m2 = m2_arr[j], + .m3 = m3_arr[j], + .m4 = m4_arr[j]}; } // pairwise merge for numerical stability @@ -148,7 +212,7 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values, /* --- Scalar Fallback Implementation --- */ -Moments accumulate_moments_scalar_block(const double *values, size_t n, +Moments accumulate_moments_scalar_block(size_t n, const double *values, int skipna, const uint8_t *mask, int max_moment) { Moments moments = {0}; @@ -158,26 +222,28 @@ Moments accumulate_moments_scalar_block(const double *values, size_t n, val = NAN; if (skipna && isnan(val)) continue; - moments_add_value(&moments, val, max_moment); + moments_add_valuem(&moments, val, max_moment); } return moments; } /* --- Accumulation Dispatch (Choose SIMD or Scalar) --- */ -Moments accumulate_moments_dispatch(const double *values, size_t n, int skipna, +Moments accumulate_moments_dispatch(size_t n, const double *values, int skipna, const uint8_t *mask, int max_moment) { #if defined(PANDAS_HAS_SIMD) - return accumulate_moments_simd(values, n, skipna, mask, max_moment); + return accumulate_moments_simd(n, values, skipna, mask, max_moment); #endif // defined(PANDAS_HAS_SIMD) - return accumulate_moments_scalar_block(values, n, skipna, mask, max_moment); + return accumulate_moments_scalar_block(n, values, skipna, mask, max_moment); } /* --- Moments 1D Accumulator Implementation --- */ -Moments accumulate_moments_scalar(const double *values, size_t n, int skipna, - const uint8_t *mask, int max_moment) { - Moments result = {0}; +void accumulate_moments_scalar(size_t n, const double *values, bool skipna, + const uint8_t *mask, int64_t *nobs, double *mean, + double *m2, double *m3, double *m4, + int max_moment) { + Moments acc = {0}; #ifdef _OPENMP # pragma omp parallel if (n > 10000) @@ -190,16 +256,34 @@ Moments accumulate_moments_scalar(const double *values, size_t n, int skipna, size_t end = tid == num_threads - 1 ? n : ((tid + 1) * n) / num_threads; Moments moments_local = - accumulate_moments_dispatch(values + start, end - start, skipna, + accumulate_moments_dispatch(end - start, values + start, skipna, mask ? mask + start : NULL, max_moment); #ifdef _OPENMP # pragma omp critical #endif { - result = moments_merge(result, moments_local, max_moment); + acc = moments_merge(acc, moments_local, max_moment); } } - return result; + Moments tmp = + (Moments){.n = *nobs, .mean = *mean, .m2 = *m2, .m3 = 0.0, .m4 = 0.0}; + if (max_moment >= 4) { + tmp.m4 = *m4; + } + if (max_moment >= 3) { + tmp.m3 = *m3; + } + Moments result = moments_merge(tmp, acc, max_moment); + + if (max_moment >= 4) { + *m4 = result.m4; + } + if (max_moment >= 3) { + *m3 = result.m3; + } + *m2 = result.m2; + *mean = result.mean; + *nobs = result.n; } diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 45a765c468aaa..619d5590cc207 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -9,7 +9,6 @@ from libcpp.stack cimport stack from libcpp.unordered_map cimport unordered_map from pandas._libs.algos cimport ( - Moments, TiebreakEnumType, calc_kurt, calc_skew, @@ -493,22 +492,26 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, # Rolling skewness -cdef void add_skew(float64_t val, Moments *moments, +cdef void add_skew(float64_t val, int64_t *nobs, + float64_t *mean, float64_t *m2, + float64_t *m3, bint *numerically_unstable, ) noexcept nogil: """ add a value from the skew calc """ cdef: - float64_t old_m3 = moments.m3 + float64_t old_m3 = m3[0] # Not NaN if val == val: - moments_add_value(moments, val, 3) - if fabs(old_m3) * InvCondTol > fabs(moments.m3): + moments_add_value(val, nobs, mean, m2, m3, NULL, 3) + if fabs(old_m3) * InvCondTol > fabs(m3[0]): # possible catastrophic cancellation numerically_unstable[0] = True -cdef void remove_skew(float64_t val, Moments *moments, +cdef void remove_skew(float64_t val, int64_t *nobs, + float64_t *mean, float64_t *m2, + float64_t *m3, bint *numerically_unstable) noexcept nogil: """ remove a value from the skew calc """ cdef: @@ -526,22 +529,22 @@ cdef void remove_skew(float64_t val, Moments *moments, # Not NaN if val == val: - moments.n -= 1 - n = (moments.n) - delta = val - moments.mean + nobs[0] -= 1 + n = (nobs[0]) + delta = val - mean[0] delta_n = delta / n term1 = delta_n * delta * (n + 1.0) - m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * moments.m2) - new_m3 = moments.m3 - m3_update + m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * m2[0]) + new_m3 = m3[0] - m3_update - if (fabs(m3_update) + fabs(moments.m3)) * InvCondTol > fabs(new_m3): + if (fabs(m3_update) + fabs(m3[0])) * InvCondTol > fabs(new_m3): # possible catastrophic cancellation numerically_unstable[0] = True - moments.m3 = new_m3 - moments.m2 -= term1 - moments.mean -= delta_n + m3[0] = new_m3 + m2[0] -= term1 + mean[0] -= delta_n def roll_skew(const float64_t[:] values, ndarray[int64_t] start, @@ -549,8 +552,8 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start, cdef: Py_ssize_t i, j float64_t val - Moments moments - int64_t N = len(start) + float64_t mean, m2, m3 + int64_t nobs = 0, N = len(start) int64_t s, e ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -582,35 +585,31 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start, # calculate deletes for j in range(start[i - 1], s): val = values[j] - remove_skew(val, &moments, &numerically_unstable) + remove_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) # calculate adds for j in range(end[i - 1], e): val = values[j] - add_skew(val, &moments, &numerically_unstable) + add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) if requires_recompute or numerically_unstable: - moments.n = 0 - moments.mean = 0.0 - moments.m2 = 0.0 - moments.m3 = 0.0 - moments.m4 = 0.0 + mean = m2 = m3 = 0.0 + nobs = 0 for j in range(s, e): val = values[j] - add_skew(val, &moments, &numerically_unstable) + add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) numerically_unstable = False - output[i] = NaN if moments.n < minp else calc_skew(moments) + output[i] = NaN if nobs < minp else calc_skew(nobs, m2, m3) if not is_monotonic_increasing_bounds: - moments.n = 0 - moments.mean = 0.0 - moments.m2 = 0.0 - moments.m3 = 0.0 - moments.m4 = 0.0 + nobs = 0 + mean = 0.0 + m2 = 0.0 + m3 = 0.0 return output @@ -618,22 +617,26 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start, # Rolling kurtosis -cdef void add_kurt(float64_t val, Moments *moments, +cdef void add_kurt(float64_t val, int64_t *nobs, + float64_t *mean, float64_t *m2, + float64_t *m3, float64_t *m4, bint *numerically_unstable, ) noexcept nogil: """ add a value from the kurotic calc """ cdef: - float64_t old_m4 = moments.m4 + float64_t old_m4 = m4[0] # Not NaN if val == val: - moments_add_value(moments, val, 4) - if fabs(old_m4) * InvCondTol > fabs(moments.m4): + moments_add_value(val, nobs, mean, m2, m3, m4, 4) + if fabs(old_m4) * InvCondTol > fabs(m4[0]): # possible catastrophic cancellation numerically_unstable[0] = True -cdef void remove_kurt(float64_t val, Moments *moments, +cdef void remove_kurt(float64_t val, int64_t *nobs, + float64_t *mean, float64_t *m2, + float64_t *m3, float64_t *m4, bint *numerically_unstable, ) noexcept nogil: """ remove a value from the kurotic calc """ @@ -642,37 +645,37 @@ cdef void remove_kurt(float64_t val, Moments *moments, # Not NaN if val == val: - moments.n -= 1 - n = (moments.n) - delta = val - moments.mean + nobs[0] -= 1 + n = (nobs[0]) + delta = val - mean[0] delta_n = delta / n term1 = delta_n * delta * (n + 1.0) m4_update = delta_n * ( - 4.0 * moments.m3 + 4.0 * m3[0] + delta_n * ( - 6.0 * moments.m2 + 6.0 * m2[0] - term1 * (n * n + 3.0 * n + 3.0) ) ) - new_m4 = moments.m4 + m4_update + new_m4 = m4[0] + m4_update - if (fabs(m4_update) + fabs(moments.m4)) * InvCondTol > fabs(new_m4): + if (fabs(m4_update) + fabs(m4[0])) * InvCondTol > fabs(new_m4): # possible catastrophic cancellation numerically_unstable[0] = True - moments.m4 = new_m4 - moments.m3 -= delta_n * (term1 * (n + 2.0) - 3.0 * moments.m2) - moments.m2 -= term1 - moments.mean -= delta_n + m4[0] = new_m4 + m3[0] -= delta_n * (term1 * (n + 2.0) - 3.0 * m2[0]) + m2[0] -= term1 + mean[0] -= delta_n def roll_kurt(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp) -> np.ndarray: cdef: Py_ssize_t i, j - Moments moments - int64_t s, e + float64_t mean, m2, m3, m4 + int64_t nobs, s, e int64_t N = len(start) ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -704,32 +707,30 @@ def roll_kurt(const float64_t[:] values, ndarray[int64_t] start, # and removed # calculate deletes for j in range(start[i - 1], s): - remove_kurt(values[j], &moments, &numerically_unstable) + remove_kurt(values[j], &nobs, &mean, &m2, &m3, &m4, + &numerically_unstable) # calculate adds for j in range(end[i - 1], e): - add_kurt(values[j], &moments, &numerically_unstable) + add_kurt(values[j], &nobs, &mean, &m2, &m3, &m4, + &numerically_unstable) if requires_recompute or numerically_unstable: - moments.n = 0 - moments.mean = 0.0 - moments.m2 = 0.0 - moments.m3 = 0.0 - moments.m4 = 0.0 + mean = m2 = m3 = m4 = 0.0 + nobs = 0 for j in range(s, e): - add_kurt(values[j], &moments, &numerically_unstable) - - numerically_unstable = False + add_kurt(values[j], &nobs, &mean, &m2, &m3, &m4, + &numerically_unstable) - output[i] = NaN if moments.n < minp else calc_kurt(moments) + output[i] = NaN if nobs < minp else calc_kurt(nobs, m2, m4) if not is_monotonic_increasing_bounds: - moments.n = 0 - moments.mean = 0.0 - moments.m2 = 0.0 - moments.m3 = 0.0 - moments.m4 = 0.0 + nobs = 0 + mean = 0.0 + m2 = 0.0 + m3 = 0.0 + m4 = 0.0 return output From f6c2e2d73aa02cf74ab585c89cbd6779c5be1b84 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Thu, 2 Apr 2026 23:12:16 -0300 Subject: [PATCH 31/35] perf: remove OpenMP --- .../development/contributing_environment.rst | 3 +- meson.options | 5 --- pandas/_libs/meson.build | 4 +-- pandas/_libs/src/moments.c | 34 +++---------------- 4 files changed, 6 insertions(+), 40 deletions(-) delete mode 100644 meson.options diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index b814e1ca96020..ce0430bd9e876 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -169,13 +169,12 @@ To compile and install pandas in editable mode, run: python -m pip install --verbose --editable . --no-build-isolation -Additional Meson and Pandas options can be passed to the ``pip install`` command to modify the installation. +Additional Meson options can be passed to the ``pip install`` command to modify the installation. Helpful options include: * ``-Ceditable-verbose=true``: Print verbose logs during rebuild, even during ``import``. * ``-Cbuilddir="your builddir here"``: Specify a different build directory for the C extensions. * ``-Csetup-args="-Ddebug=true"``: Compile the C extensions with debug symbols. -* ``-Csetup-args=-Dopenmp=disabled``: Compile Pandas without parallelization support through OpenMP. .. note:: When pandas is installed in ``--editable`` mode, pandas will automatically rebuild the library upon ``import``, diff --git a/meson.options b/meson.options deleted file mode 100644 index 22fc68252de52..0000000000000 --- a/meson.options +++ /dev/null @@ -1,5 +0,0 @@ -option( - 'openmp', - type: 'feature', - description: 'Parallelization with OpenMP', -) diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index e4db9972dd44d..2f20136b375c8 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -56,8 +56,6 @@ fast_float_dep = fast_float.get_variable('fast_float_dep') subdir('tslibs') -openmp_dep = dependency('OpenMP', required: get_option('openmp')) - libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} # numpy include dir is implicitly included @@ -68,7 +66,7 @@ libs_sources = { _algos_common_helper, _algos_take_helper, ], - 'deps': [_khash_primitive_helper_dep, m_dep, openmp_dep], + 'deps': [_khash_primitive_helper_dep, m_dep], }, 'arrays': {'sources': ['arrays.pyx']}, 'groupby': {'sources': ['groupby.pyx'], 'deps': [m_dep]}, diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index 144aeae5eae9f..1b71083ede5c7 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -11,13 +11,6 @@ The full license is in the LICENSE file, distributed with this software. #include #include -#ifdef _OPENMP -# include -#else -static inline int omp_get_thread_num(void) { return 0; } -static inline int omp_get_num_threads(void) { return 1; } -#endif // _OPENMP - static inline void moments_add_valuem(Moments *moments, double val, int max_moment) { double delta = val - moments->mean; @@ -243,29 +236,10 @@ void accumulate_moments_scalar(size_t n, const double *values, bool skipna, const uint8_t *mask, int64_t *nobs, double *mean, double *m2, double *m3, double *m4, int max_moment) { - Moments acc = {0}; - -#ifdef _OPENMP -# pragma omp parallel if (n > 10000) -#endif - { - size_t tid = (size_t)omp_get_thread_num(); - size_t num_threads = (size_t)omp_get_num_threads(); - - size_t start = (tid * n) / num_threads; - size_t end = tid == num_threads - 1 ? n : ((tid + 1) * n) / num_threads; - - Moments moments_local = - accumulate_moments_dispatch(end - start, values + start, skipna, - mask ? mask + start : NULL, max_moment); - -#ifdef _OPENMP -# pragma omp critical -#endif - { - acc = moments_merge(acc, moments_local, max_moment); - } - } + // PERF: It's possible to parallelize moment reductions + // and call `moments_merge` to join the results. + Moments acc = + accumulate_moments_dispatch(n, values, skipna, mask, max_moment); Moments tmp = (Moments){.n = *nobs, .mean = *mean, .m2 = *m2, .m3 = 0.0, .m4 = 0.0}; From 58ba9473967c676255053aeb2ef0db266dbffe51 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Fri, 3 Apr 2026 00:32:28 -0300 Subject: [PATCH 32/35] fix: use long instead of int64_t for apple clang --- pandas/_libs/include/pandas/moments.h | 6 +++--- pandas/_libs/src/moments.c | 17 ++++++++++------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/include/pandas/moments.h b/pandas/_libs/include/pandas/moments.h index 51d62486c191b..e608b526e786e 100644 --- a/pandas/_libs/include/pandas/moments.h +++ b/pandas/_libs/include/pandas/moments.h @@ -22,14 +22,14 @@ extern "C" { // instead of using multiple double references // and a signed value to nobs. typedef struct { - int64_t n; + uint64_t n; double mean; double m2; double m3; double m4; } Moments; -static inline void moments_add_value(double val, int64_t *nobs, double *mean, +static inline void moments_add_value(double val, long *nobs, double *mean, double *m2, double *m3, double *m4, int max_moment) { double delta = val - *mean; @@ -53,7 +53,7 @@ Moments moments_merge(Moments a, Moments b, int max_moment); /// Compute central moments until `max_moment` using `n` elements from `values`. void accumulate_moments_scalar(size_t n, const double *values, bool skipna, - const uint8_t *mask, int64_t *nobs, double *mean, + const uint8_t *mask, long *nobs, double *mean, double *m2, double *m3, double *m4, int max_moment); #ifdef __cplusplus diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index 1b71083ede5c7..ccb156059f3ef 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -170,9 +170,9 @@ Moments accumulate_moments_simd(size_t n, const double *values, int skipna, *(v4d *)m3_arr = v_m3; *(v4d *)m4_arr = v_m4; - int64_t n_arr[4]; + long n_arr[4]; for (int j = 0; j < 4; j++) { - n_arr[j] = (int64_t)n_arrd[j]; + n_arr[j] = (long)n_arrd[j]; } // Distribute remaining values across chunks @@ -187,7 +187,7 @@ Moments accumulate_moments_simd(size_t n, const double *values, int skipna, } Moments moments_arr[4]; for (int j = 0; j < 4; j++) { - moments_arr[j] = (Moments){.n = n_arr[j], + moments_arr[j] = (Moments){.n = (uint64_t)n_arr[j], .mean = mean_arr[j], .m2 = m2_arr[j], .m3 = m3_arr[j], @@ -233,7 +233,7 @@ Moments accumulate_moments_dispatch(size_t n, const double *values, int skipna, /* --- Moments 1D Accumulator Implementation --- */ void accumulate_moments_scalar(size_t n, const double *values, bool skipna, - const uint8_t *mask, int64_t *nobs, double *mean, + const uint8_t *mask, long *nobs, double *mean, double *m2, double *m3, double *m4, int max_moment) { // PERF: It's possible to parallelize moment reductions @@ -241,8 +241,8 @@ void accumulate_moments_scalar(size_t n, const double *values, bool skipna, Moments acc = accumulate_moments_dispatch(n, values, skipna, mask, max_moment); - Moments tmp = - (Moments){.n = *nobs, .mean = *mean, .m2 = *m2, .m3 = 0.0, .m4 = 0.0}; + Moments tmp = (Moments){ + .n = (uint64_t)*nobs, .mean = *mean, .m2 = *m2, .m3 = 0.0, .m4 = 0.0}; if (max_moment >= 4) { tmp.m4 = *m4; } @@ -259,5 +259,8 @@ void accumulate_moments_scalar(size_t n, const double *values, bool skipna, } *m2 = result.m2; *mean = result.mean; - *nobs = result.n; + // FIXME: This conversion may overflow. + // Using `long` because the type used in Cython is `np.int64_t`, + // which is aliased to `long`. + *nobs = (long)result.n; } From 7c2e83e592210ff84e7af936c0ff060c331ea130 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Fri, 3 Apr 2026 08:37:50 -0300 Subject: [PATCH 33/35] fix: fix some -Wconversion warnings --- pandas/_libs/src/moments.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index ccb156059f3ef..a737210edf7f5 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -95,13 +95,8 @@ typedef uint8_t v4u8 #ifdef PANDAS_HAS_SIMD /* Vector select: returns (mask ? a : b) where mask is all-ones or all-zeros */ -# if defined(__clang__) -# define v_select(mask, a, b) ((mask) ? (a) : (b)) -# else -// gcc doesn't have vectorized ternary operators when compiling C files. -# define v_select(mask, a, b) \ - ((v4d)(((v4si)(mask) & (v4si)(a)) | (~(v4si)(mask) & (v4si)(b)))) -# endif // defined(__clang__) +# define v_select(mask, a, b) \ + ((v4d)(((v4si)(mask) & (v4si)(a)) | (~(v4si)(mask) & (v4si)(b)))) PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(size_t n, const double *values, int skipna, @@ -116,6 +111,7 @@ Moments accumulate_moments_simd(size_t n, const double *values, int skipna, v4d v_one = {1.0, 1.0, 1.0, 1.0}; v4d v_zero = {0.0, 0.0, 0.0, 0.0}; + v4si v_zerosi = {0, 0, 0, 0}; v4d v_nan = {NAN, NAN, NAN, NAN}; size_t i = 0; @@ -126,7 +122,7 @@ Moments accumulate_moments_simd(size_t n, const double *values, int skipna, v4u8 mask_vec = *(v4u8 *)(mask + i); v4si mask_si = __builtin_convertvector(mask_vec, v4si); // replace val with NAN where mask is 1. - v_val = v_select(mask_si != 0, v_nan, v_val); + v_val = v_select(mask_si != v_zerosi, v_nan, v_val); } // skip_mask is 1.0 if we should skip, 0.0 otherwise From 6269bdc26d20a64ccd382f243ba4fefff7cad89c Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Fri, 3 Apr 2026 09:18:54 -0300 Subject: [PATCH 34/35] fix: fix types for 32 bit --- pandas/_libs/include/pandas/moments.h | 4 ++-- pandas/_libs/src/moments.c | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/include/pandas/moments.h b/pandas/_libs/include/pandas/moments.h index e608b526e786e..9723761f64ef3 100644 --- a/pandas/_libs/include/pandas/moments.h +++ b/pandas/_libs/include/pandas/moments.h @@ -29,7 +29,7 @@ typedef struct { double m4; } Moments; -static inline void moments_add_value(double val, long *nobs, double *mean, +static inline void moments_add_value(double val, int64_t *nobs, double *mean, double *m2, double *m3, double *m4, int max_moment) { double delta = val - *mean; @@ -53,7 +53,7 @@ Moments moments_merge(Moments a, Moments b, int max_moment); /// Compute central moments until `max_moment` using `n` elements from `values`. void accumulate_moments_scalar(size_t n, const double *values, bool skipna, - const uint8_t *mask, long *nobs, double *mean, + const uint8_t *mask, int64_t *nobs, double *mean, double *m2, double *m3, double *m4, int max_moment); #ifdef __cplusplus diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c index a737210edf7f5..755fbec477505 100644 --- a/pandas/_libs/src/moments.c +++ b/pandas/_libs/src/moments.c @@ -166,9 +166,9 @@ Moments accumulate_moments_simd(size_t n, const double *values, int skipna, *(v4d *)m3_arr = v_m3; *(v4d *)m4_arr = v_m4; - long n_arr[4]; + int64_t n_arr[4]; for (int j = 0; j < 4; j++) { - n_arr[j] = (long)n_arrd[j]; + n_arr[j] = (int64_t)n_arrd[j]; } // Distribute remaining values across chunks @@ -229,7 +229,7 @@ Moments accumulate_moments_dispatch(size_t n, const double *values, int skipna, /* --- Moments 1D Accumulator Implementation --- */ void accumulate_moments_scalar(size_t n, const double *values, bool skipna, - const uint8_t *mask, long *nobs, double *mean, + const uint8_t *mask, int64_t *nobs, double *mean, double *m2, double *m3, double *m4, int max_moment) { // PERF: It's possible to parallelize moment reductions @@ -255,8 +255,5 @@ void accumulate_moments_scalar(size_t n, const double *values, bool skipna, } *m2 = result.m2; *mean = result.mean; - // FIXME: This conversion may overflow. - // Using `long` because the type used in Cython is `np.int64_t`, - // which is aliased to `long`. - *nobs = (long)result.n; + *nobs = (int64_t)result.n; } From 25216651ed17c2f2926f15c08421cdee5a702d76 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Wed, 29 Apr 2026 18:44:35 -0300 Subject: [PATCH 35/35] revert test changes --- pandas/tests/apply/test_str.py | 3 +-- pandas/tests/arrays/floating/test_function.py | 3 +-- pandas/tests/arrays/integer/test_function.py | 3 +-- pandas/tests/test_nanops.py | 3 +-- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 9589db07dbcee..e5a9492630b13 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -41,8 +41,7 @@ def test_apply_with_string_funcs(float_frame, func, kwds, how): def test_with_string_args(datetime_series, all_numeric_reductions): result = datetime_series.apply(all_numeric_reductions) expected = getattr(datetime_series, all_numeric_reductions)() - # reductions computed in parallel may yield different results - tm.assert_almost_equal(result, expected) + assert result == expected @pytest.mark.parametrize("op", ["mean", "median", "std", "var"]) diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py index 2fcdcd5a9e823..a5938b2aa93e8 100644 --- a/pandas/tests/arrays/floating/test_function.py +++ b/pandas/tests/arrays/floating/test_function.py @@ -116,8 +116,7 @@ def test_stat_method(pandasmethname, kwargs): s2 = pd.Series(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype="float64") pandasmeth = getattr(s2, pandasmethname) expected = pandasmeth(**kwargs) - # reductions computed in parallel may yield different results - tm.assert_almost_equal(result, expected) + assert expected == result def test_value_counts_na(): diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index a288ab631ae3b..6b3308baf3ee7 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -134,8 +134,7 @@ def test_stat_method(pandasmethname, kwargs): s2 = pd.Series(data=[1, 2, 3, 4, 5, 6], dtype="Int64") pandasmeth = getattr(s2, pandasmethname) expected = pandasmeth(**kwargs) - # reductions computed in parallel may yield different results - tm.assert_almost_equal(result, expected) + assert expected == result def test_value_counts_na(): diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index b83e2d6e0e7f2..acd92d916bdcc 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1243,8 +1243,7 @@ def test_nanops_independent_of_mask_param(operation): mask = ser.isna() median_expected = operation(ser._values) median_result = operation(ser._values, mask=mask._values) - # reductions computed in parallel may yield different results - tm.assert_almost_equal(median_result, median_expected) + assert median_expected == median_result @pytest.mark.parametrize("min_count", [-1, 0])