diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index e47f7d56ba343..b5859ba2af3e8 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -6,6 +6,7 @@ from libc.math cimport ( from numpy cimport ( float64_t, int64_t, + uint8_t, ) from pandas._libs.dtypes cimport ( @@ -17,37 +18,29 @@ from pandas._libs.dtypes cimport ( cdef numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) noexcept nogil -@cython.cdivision(True) -cdef inline void moments_add_value( - float64_t val, - int64_t* nobs, - float64_t* mean, - float64_t* m2, - float64_t* m3, - float64_t* m4, - int max_moment, -) noexcept nogil: - cdef: - float64_t n - float64_t delta = val - mean[0] - float64_t delta_n, term1 - - nobs[0] += 1 - n = nobs[0] - delta_n = delta / n - term1 = delta * delta_n * (n - 1.0) - - if max_moment >= 4: - m4[0] += delta_n * ( - -4.0 * m3[0] - + delta_n * ( - 6.0 * m2[0] + term1 * (n * n - 3.0 * n + 3.0) - ) - ) - if max_moment >= 3: - m3[0] += delta_n * (term1 * (n - 2.0) - 3.0 * m2[0]) - m2[0] += term1 - mean[0] += delta_n +cdef extern from "pandas/moments.h": + void moments_add_value( + float64_t val, + int64_t* nobs, + float64_t* mean, + float64_t* m2, + float64_t* m3, + float64_t* m4, + int max_moment, + ) noexcept nogil + + void accumulate_moments_scalar( + size_t n, + const float64_t* values, + bint skipna, + const uint8_t* mask, + int64_t* nobs, + float64_t* mean, + float64_t* m2, + float64_t* m3, + float64_t* m4, + int max_moment, + ) noexcept nogil @cython.cdivision(True) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 97aa64593a3fc..f05fa3e1819b1 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1600,33 +1600,6 @@ def diff_2d( # ---------------------------------------------------------------------- -@cython.boundscheck(False) -@cython.wraparound(False) -cdef void accumulate_moments_scalar( - const float64_t[:] values, - bint skipna, - const uint8_t[:] mask, - int64_t* nobs, - float64_t* mean, - float64_t* m2, - float64_t* m3, - float64_t* m4, - int max_moment, -) noexcept nogil: - cdef: - Py_ssize_t i, n = len(values) - bint uses_mask = mask is not None - float64_t val - - for i in range(n): - val = values[i] - if uses_mask and mask[i]: - val = NaN - if skipna and isnan(val): - continue - moments_add_value(val, nobs, mean, m2, m3, m4, max_moment) - - @cython.boundscheck(False) @cython.wraparound(False) cdef void accumulate_moments_axis( @@ -1658,11 +1631,17 @@ cdef void accumulate_moments_axis( nouter = nrows ninner = ncols + # PERF: The outer dimension is embarrassingly parallel for i in range(nouter): if max_moment >= 3: m3_ptr = &m3[i] if max_moment >= 4: m4_ptr = &m4[i] + # PERF: If it's possible to enforce contiguity in the last dimension, + # it's possible to use accumulate_moments_scalar with SIMD instructions. + # It may create more threads if OMP_NESTED is enabled and the outer loop + # is run in parallel. + # If it doesn't run in parallel, it will at least use SIMD instructions. for j in range(ninner): val = values[j, i] if axis == 0 else values[i, j] if uses_mask and (mask[j, i] if axis == 0 else mask[i, j]): @@ -1676,16 +1655,20 @@ cdef void accumulate_moments_axis( @cython.boundscheck(False) @cython.wraparound(False) def scalar_skew( - const float64_t[:] values, + const float64_t[::1] values, bint skipna, - const uint8_t[:] mask, + const uint8_t[::1] mask, ) -> float: cdef: int64_t nobs = 0 float64_t mean = 0.0, m2 = 0.0, m3 = 0.0 + size_t n = values.shape[0] + const float64_t* p_values = &values[0] + const uint8_t* p_mask = &mask[0] if mask is not None else NULL with nogil: - accumulate_moments_scalar(values, skipna, mask, &nobs, &mean, &m2, &m3, NULL, 3) + accumulate_moments_scalar(n, p_values, skipna, p_mask, + &nobs, &mean, &m2, &m3, NULL, 3) return calc_skew(nobs, m2, m3) @@ -1693,16 +1676,20 @@ def scalar_skew( @cython.boundscheck(False) @cython.wraparound(False) def scalar_kurt( - const float64_t[:] values, + const float64_t[::1] values, bint skipna, - const uint8_t[:] mask, + const uint8_t[::1] mask, ) -> float: cdef: int64_t nobs = 0 float64_t mean = 0.0, m2 = 0.0, m3 = 0.0, m4 = 0.0 + size_t n = values.shape[0] + const float64_t* p_values = &values[0] + const uint8_t* p_mask = &mask[0] if mask is not None else NULL with nogil: - accumulate_moments_scalar(values, skipna, mask, &nobs, &mean, &m2, &m3, &m4, 4) + accumulate_moments_scalar(n, p_values, skipna, p_mask, + &nobs, &mean, &m2, &m3, &m4, 4) return calc_kurt(nobs, m2, m4) diff --git a/pandas/_libs/include/pandas/moments.h b/pandas/_libs/include/pandas/moments.h new file mode 100644 index 0000000000000..9723761f64ef3 --- /dev/null +++ b/pandas/_libs/include/pandas/moments.h @@ -0,0 +1,61 @@ +/* +Copyright (c) 2026, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. +*/ + +#pragma once + +#include +#include +#include + +#ifdef __cplusplus +// prevent name mangling +extern "C" { +#endif + +// NOTE: It may be better to use the Moments struct +// instead of using multiple double references +// and a signed value to nobs. +typedef struct { + uint64_t n; + double mean; + double m2; + double m3; + double m4; +} Moments; + +static inline void moments_add_value(double val, int64_t *nobs, double *mean, + double *m2, double *m3, double *m4, + int max_moment) { + double delta = val - *mean; + (*nobs)++; + double n = (double)(*nobs); + double delta_n = delta / n; + double term1 = delta * delta_n * (n - 1.0); + + if (max_moment >= 4) { + *m4 += delta_n * (-4.0 * *m3 + + delta_n * (6.0 * *m2 + term1 * (n * n - 3.0 * n + 3.0))); + } + if (max_moment >= 3) { + *m3 += delta_n * (term1 * (n - 2.0) - 3.0 * *m2); + } + *m2 += term1; + *mean += delta_n; +} + +Moments moments_merge(Moments a, Moments b, int max_moment); + +/// Compute central moments until `max_moment` using `n` elements from `values`. +void accumulate_moments_scalar(size_t n, const double *values, bool skipna, + const uint8_t *mask, int64_t *nobs, double *mean, + double *m2, double *m3, double *m4, + int max_moment); +#ifdef __cplusplus +} +#endif diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index 56ff2a01b450c..2f20136b375c8 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -60,7 +60,12 @@ libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} # numpy include dir is implicitly included 'algos': { - 'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], + 'sources': [ + 'algos.pyx', + 'src/moments.c', + _algos_common_helper, + _algos_take_helper, + ], 'deps': [_khash_primitive_helper_dep, m_dep], }, 'arrays': {'sources': ['arrays.pyx']}, diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c new file mode 100644 index 0000000000000..755fbec477505 --- /dev/null +++ b/pandas/_libs/src/moments.c @@ -0,0 +1,259 @@ +/* +Copyright (c) 2026, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. +*/ + +#include "pandas/moments.h" +#include +#include + +static inline void moments_add_valuem(Moments *moments, double val, + int max_moment) { + double delta = val - moments->mean; + moments->n++; + double n = (double)moments->n; + double delta_n = delta / n; + double term1 = delta * delta_n * (n - 1.0); + + if (max_moment >= 4) { + moments->m4 += + delta_n * + (-4.0 * moments->m3 + + delta_n * (6.0 * moments->m2 + term1 * (n * n - 3.0 * n + 3.0))); + } + if (max_moment >= 3) { + moments->m3 += delta_n * (term1 * (n - 2.0) - 3.0 * moments->m2); + } + moments->m2 += term1; + moments->mean += delta_n; +} + +Moments moments_merge(Moments a, Moments b, int max_moment) { + + if (a.n == 0) { + return b; + } + if (b.n == 0) { + return a; + } + + Moments result; + + result.n = a.n + b.n; + double n_a = (double)a.n; + double n_b = (double)b.n; + double delta = b.mean - a.mean; + double delta_n = delta / (double)result.n; + double term1 = delta * delta_n * n_a * n_b; + + if (max_moment >= 4) { + result.m4 = + a.m4 + b.m4 + + delta_n * (4.0 * (n_a * b.m3 - n_b * a.m3) + + delta_n * (6.0 * (n_a * n_a * b.m2 + n_b * n_b * a.m2) + + term1 * (n_a * n_a - n_a * n_b + n_b * n_b))); + } + if (max_moment >= 3) { + result.m3 = + a.m3 + b.m3 + + delta_n * (3.0 * (n_a * b.m2 - n_b * a.m2) + term1 * (n_a - n_b)); + } + result.m2 = a.m2 + b.m2 + term1; + result.mean = a.mean + delta_n * n_b; + + return result; +} + +#ifndef __has_attribute +# define __has_attribute(x) 0 +#endif // __has_attribute + +#if defined(__x86_64__) && __has_attribute(target_clones) && defined(__GLIBC__) +# define PANDAS_SIMD_TARGETS __attribute__((target_clones("avx2", "default"))) +#else +# define PANDAS_SIMD_TARGETS +#endif // x86_64 + glibc + target_clones + +/* --- SIMD Implementation --- */ +#if __has_attribute(ext_vector_type) +typedef double v4d __attribute__((ext_vector_type(4), aligned(1))); +typedef long long v4si __attribute__((ext_vector_type(4), aligned(1))); +typedef uint8_t v4u8 __attribute__((ext_vector_type(4), aligned(1))); +# define PANDAS_HAS_SIMD 1 +#elif __has_attribute(vector_size) +typedef double v4d __attribute__((vector_size(4 * sizeof(double)), aligned(1))); +typedef long long v4si + __attribute__((vector_size(4 * sizeof(long long)), aligned(1))); +typedef uint8_t v4u8 + __attribute__((vector_size(4 * sizeof(uint8_t)), aligned(1))); +# define PANDAS_HAS_SIMD 1 +#endif // __has_attribute(ext_vector_type) + +#ifdef PANDAS_HAS_SIMD +/* Vector select: returns (mask ? a : b) where mask is all-ones or all-zeros */ +# define v_select(mask, a, b) \ + ((v4d)(((v4si)(mask) & (v4si)(a)) | (~(v4si)(mask) & (v4si)(b)))) + +PANDAS_SIMD_TARGETS +Moments accumulate_moments_simd(size_t n, const double *values, int skipna, + const uint8_t *mask, int max_moment) { + v4d v_mean = {0.0, 0.0, 0.0, 0.0}; + v4d v_m2 = {0.0, 0.0, 0.0, 0.0}; + v4d v_m3 = {0.0, 0.0, 0.0, 0.0}; + v4d v_m4 = {0.0, 0.0, 0.0, 0.0}; + // Using double for v_n to avoid epi64 -> pd conversions. + // The results are exact up to n < 2^53. + v4d v_n = {0.0, 0.0, 0.0, 0.0}; + + v4d v_one = {1.0, 1.0, 1.0, 1.0}; + v4d v_zero = {0.0, 0.0, 0.0, 0.0}; + v4si v_zerosi = {0, 0, 0, 0}; + v4d v_nan = {NAN, NAN, NAN, NAN}; + + size_t i = 0; + for (; i + 3 < n; i += 4) { + v4d v_val = *(v4d *)(values + i); + + if (mask) { + v4u8 mask_vec = *(v4u8 *)(mask + i); + v4si mask_si = __builtin_convertvector(mask_vec, v4si); + // replace val with NAN where mask is 1. + v_val = v_select(mask_si != v_zerosi, v_nan, v_val); + } + + // skip_mask is 1.0 if we should skip, 0.0 otherwise + v4d v_is_nan = v_select(v_val == v_val, v_zero, v_one); + v4d v_skip_mask = skipna ? v_is_nan : v_zero; + + // Increment 1 where we do NOT skip + v4d v_n_increment = v_one - v_skip_mask; + v_n += v_n_increment; + v4d v_n_nonzero = v_select(v_n < v_one, v_one, v_n); + + v4d v_delta = v_val - v_mean; + + // replace delta with zero when skipping to don't update moments + v_delta = v_select(v_skip_mask == v_zero, v_delta, v_zero); + v4d v_delta_n = v_delta / v_n_nonzero; + + v4d v_delta_n2 = v_delta * v_delta_n; + v4d v_term1 = v_delta_n2 * (v_n - v_one); + + if (max_moment >= 4) { + v4d v_n2 = v_n * v_n; + // n * n - 3.0 * n + 3.0 + v4d v_n_term = (v_n2 - (3.0 * v_n)) + 3.0; + v_m4 += v_delta_n * ((-4.0 * v_m3) + + (v_delta_n * ((6.0 * v_m2) + (v_term1 * v_n_term)))); + } + + if (max_moment >= 3) { + v_m3 += v_delta_n * ((v_term1 * (v_n - 2.0)) - (3.0 * v_m2)); + } + + v_m2 += v_term1; + v_mean += v_delta_n; + } + + double n_arrd[4], mean_arr[4], m2_arr[4], m3_arr[4], m4_arr[4]; + *(v4d *)n_arrd = v_n; + *(v4d *)mean_arr = v_mean; + *(v4d *)m2_arr = v_m2; + *(v4d *)m3_arr = v_m3; + *(v4d *)m4_arr = v_m4; + + int64_t n_arr[4]; + for (int j = 0; j < 4; j++) { + n_arr[j] = (int64_t)n_arrd[j]; + } + + // Distribute remaining values across chunks + for (; i < n; i++) { + double val = values[i]; + if (mask && mask[i]) + val = NAN; + if (skipna && isnan(val)) + continue; + moments_add_value(val, &n_arr[i % 4], &mean_arr[i % 4], &m2_arr[i % 4], + &m3_arr[i % 4], &m4_arr[i % 4], max_moment); + } + Moments moments_arr[4]; + for (int j = 0; j < 4; j++) { + moments_arr[j] = (Moments){.n = (uint64_t)n_arr[j], + .mean = mean_arr[j], + .m2 = m2_arr[j], + .m3 = m3_arr[j], + .m4 = m4_arr[j]}; + } + + // pairwise merge for numerical stability + Moments m_01 = moments_merge(moments_arr[0], moments_arr[1], max_moment); + Moments m_23 = moments_merge(moments_arr[2], moments_arr[3], max_moment); + + return moments_merge(m_01, m_23, max_moment); +} +# undef v_select +#endif // PANDAS_HAS_SIMD + +/* --- Scalar Fallback Implementation --- */ + +Moments accumulate_moments_scalar_block(size_t n, const double *values, + int skipna, const uint8_t *mask, + int max_moment) { + Moments moments = {0}; + for (size_t i = 0; i < n; i++) { + double val = values[i]; + if (mask && mask[i]) + val = NAN; + if (skipna && isnan(val)) + continue; + moments_add_valuem(&moments, val, max_moment); + } + return moments; +} + +/* --- Accumulation Dispatch (Choose SIMD or Scalar) --- */ + +Moments accumulate_moments_dispatch(size_t n, const double *values, int skipna, + const uint8_t *mask, int max_moment) { +#if defined(PANDAS_HAS_SIMD) + return accumulate_moments_simd(n, values, skipna, mask, max_moment); +#endif // defined(PANDAS_HAS_SIMD) + return accumulate_moments_scalar_block(n, values, skipna, mask, max_moment); +} + +/* --- Moments 1D Accumulator Implementation --- */ + +void accumulate_moments_scalar(size_t n, const double *values, bool skipna, + const uint8_t *mask, int64_t *nobs, double *mean, + double *m2, double *m3, double *m4, + int max_moment) { + // PERF: It's possible to parallelize moment reductions + // and call `moments_merge` to join the results. + Moments acc = + accumulate_moments_dispatch(n, values, skipna, mask, max_moment); + + Moments tmp = (Moments){ + .n = (uint64_t)*nobs, .mean = *mean, .m2 = *m2, .m3 = 0.0, .m4 = 0.0}; + if (max_moment >= 4) { + tmp.m4 = *m4; + } + if (max_moment >= 3) { + tmp.m3 = *m3; + } + Moments result = moments_merge(tmp, acc, max_moment); + + if (max_moment >= 4) { + *m4 = result.m4; + } + if (max_moment >= 3) { + *m3 = result.m3; + } + *m2 = result.m2; + *mean = result.mean; + *nobs = (int64_t)result.n; +} diff --git a/pandas/_libs/window/meson.build b/pandas/_libs/window/meson.build index 8c00a98b1241a..3e3e25b626914 100644 --- a/pandas/_libs/window/meson.build +++ b/pandas/_libs/window/meson.build @@ -10,6 +10,7 @@ py.extension_module( ['aggregations.pyx'], cython_args: cy_args, include_directories: [inc_np, inc_pd], + dependencies: [m_dep], subdir: 'pandas/_libs/window', override_options: ['cython_language=cpp'], install: true,