From 11043f851f0c39af99d4a447a65d26846280e392 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Sat, 14 Mar 2026 10:56:06 -0300
Subject: [PATCH 01/35] perf: accumulate moments with multithreading

---
 pandas/_libs/algos.pxd                | 102 +++++++-------------
 pandas/_libs/algos.pyx                |  98 +++++++------------
 pandas/_libs/groupby.pyx              |  43 ++++-----
 pandas/_libs/include/pandas/moments.h | 121 ++++++++++++++++++++++++
 pandas/_libs/meson.build              |  10 +-
 pandas/_libs/src/moments.c            |  40 ++++++++
 pandas/_libs/window/aggregations.pyx  | 131 +++++++++++++-------------
 pandas/_libs/window/meson.build       |   1 +
 8 files changed, 318 insertions(+), 228 deletions(-)
 create mode 100644 pandas/_libs/include/pandas/moments.h
 create mode 100644 pandas/_libs/src/moments.c

diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd
index e47f7d56ba343..4d80831b603c4 100644
--- a/pandas/_libs/algos.pxd
+++ b/pandas/_libs/algos.pxd
@@ -6,6 +6,7 @@ from libc.math cimport (
 from numpy cimport (
     float64_t,
     int64_t,
+    uint8_t,
 )
 
 from pandas._libs.dtypes cimport (
@@ -17,76 +18,37 @@ from pandas._libs.dtypes cimport (
 cdef numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) noexcept nogil
 
 
-@cython.cdivision(True)
-cdef inline void moments_add_value(
-    float64_t val,
-    int64_t* nobs,
-    float64_t* mean,
-    float64_t* m2,
-    float64_t* m3,
-    float64_t* m4,
-    int max_moment,
-) noexcept nogil:
-    cdef:
-        float64_t n
-        float64_t delta = val - mean[0]
-        float64_t delta_n, term1
-
-    nobs[0] += 1
-    n = <float64_t>nobs[0]
-    delta_n = delta / n
-    term1 = delta * delta_n * (n - 1.0)
-
-    if max_moment >= 4:
-        m4[0] += delta_n * (
-                -4.0 * m3[0]
-                + delta_n * (
-                    6.0 * m2[0] + term1 * (n * n - 3.0 * n + 3.0)
-                    )
-                )
-    if max_moment >= 3:
-        m3[0] += delta_n * (term1 * (n - 2.0) - 3.0 * m2[0])
-    m2[0] += term1
-    mean[0] += delta_n
-
-
-@cython.cdivision(True)
-cdef inline float64_t calc_skew(
-    int64_t nobs, float64_t m2, float64_t m3
-) noexcept nogil:
-    cdef:
-        float64_t moments_ratio, correction, dnobs
-
-    if nobs < 3:
-        return NAN
-
-    dnobs = <float64_t>nobs
-
-    moments_ratio = m3 / (m2 * sqrt(m2))
-    correction = (dnobs * sqrt(dnobs - 1.0)) / (dnobs - 2.0)
-    return moments_ratio * correction
-
-
-@cython.cdivision(True)
-cdef inline float64_t calc_kurt(
-    int64_t nobs, float64_t m2, float64_t m4
-) noexcept nogil:
-    cdef:
-        float64_t result, dnobs, term1, term2, inner, correction
-        float64_t moments_ratio
-
-    if nobs < 4:
-        return NAN
-    dnobs = <float64_t>nobs
-    moments_ratio = m4 / (m2 * m2)
-    term1 = dnobs * (dnobs + 1.0) * moments_ratio
-    term2 = 3.0 * (dnobs - 1.0)
-    inner = term1 - term2
-
-    correction = (dnobs - 1.0) / ((dnobs - 2.0) * (dnobs - 3.0))
-    result = correction * inner
-
-    return result
+cdef extern from "pandas/moments.h":
+    ctypedef struct Moments:
+        int64_t n
+        float64_t mean
+        float64_t m2
+        float64_t m3
+        float64_t m4
+
+    void moments_add_value(
+        Moments* moments,
+        float64_t val,
+        int max_moment,
+    ) nogil
+
+    Moments moments_merge(
+        Moments a,
+        Moments b,
+        int max_moment,
+    ) nogil
+
+    Moments accumulate_moments_scalar(
+        const float64_t* values,
+        int64_t n,
+        bint skipna,
+        const uint8_t* mask,
+        int max_moment,
+    ) nogil
+
+    float64_t calc_skew(Moments) nogil
+
+    float64_t calc_kurt(Moments) nogil
 
 
 cdef enum TiebreakEnumType:
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 97aa64593a3fc..2e4585566c486 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -5,6 +5,7 @@ from libc.math cimport (
     sqrt,
 )
 from libc.stdlib cimport (
+    calloc,
     free,
     malloc,
 )
@@ -1600,44 +1601,13 @@ def diff_2d(
 # ----------------------------------------------------------------------
 
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-cdef void accumulate_moments_scalar(
-    const float64_t[:] values,
-    bint skipna,
-    const uint8_t[:] mask,
-    int64_t* nobs,
-    float64_t* mean,
-    float64_t* m2,
-    float64_t* m3,
-    float64_t* m4,
-    int max_moment,
-) noexcept nogil:
-    cdef:
-        Py_ssize_t i, n = len(values)
-        bint uses_mask = mask is not None
-        float64_t val
-
-    for i in range(n):
-        val = values[i]
-        if uses_mask and mask[i]:
-            val = NaN
-        if skipna and isnan(val):
-            continue
-        moments_add_value(val, nobs, mean, m2, m3, m4, max_moment)
-
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
 cdef void accumulate_moments_axis(
     const float64_t[:, :] values,
     bint skipna,
     const uint8_t[:, :] mask,
-    int64_t[:] nobs,
-    float64_t[:] mean,
-    float64_t[:] m2,
-    float64_t[:] m3,
-    float64_t[:] m4,
+    Moments *moments,
     int axis,
     int max_moment,
 ) noexcept nogil:
@@ -1646,8 +1616,6 @@ cdef void accumulate_moments_axis(
         Py_ssize_t nouter, ninner
         bint uses_mask = mask is not None
         float64_t val
-        float64_t* m3_ptr = NULL
-        float64_t* m4_ptr = NULL
 
     if axis == 0:
         # Assumes F-contiguous
@@ -1659,52 +1627,51 @@ cdef void accumulate_moments_axis(
         ninner = ncols
 
     for i in range(nouter):
-        if max_moment >= 3:
-            m3_ptr = &m3[i]
-        if max_moment >= 4:
-            m4_ptr = &m4[i]
         for j in range(ninner):
             val = values[j, i] if axis == 0 else values[i, j]
             if uses_mask and (mask[j, i] if axis == 0 else mask[i, j]):
                 val = NaN
             if skipna and isnan(val):
                 continue
-            moments_add_value(val, &nobs[i], &mean[i], &m2[i], m3_ptr, m4_ptr,
-                              max_moment)
+            moments_add_value(&moments[i], val, max_moment)
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def scalar_skew(
-    const float64_t[:] values,
+    const float64_t[::1] values,
     bint skipna,
-    const uint8_t[:] mask,
+    const uint8_t[::1] mask,
 ) -> float:
     cdef:
-        int64_t nobs = 0
-        float64_t mean = 0.0, m2 = 0.0, m3 = 0.0
+        Moments moments
+        Py_ssize_t n = values.shape[0]
+        const float64_t* p_values = &values[0]
+        const uint8_t* p_mask = &mask[0] if mask is not None else NULL
 
     with nogil:
-        accumulate_moments_scalar(values, skipna, mask, &nobs, &mean, &m2, &m3, NULL, 3)
+        moments = accumulate_moments_scalar(p_values, n, skipna, p_mask, 3)
 
-    return calc_skew(nobs, m2, m3)
+    return calc_skew(moments)
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def scalar_kurt(
-    const float64_t[:] values,
+    const float64_t[::1] values,
     bint skipna,
-    const uint8_t[:] mask,
+    const uint8_t[::1] mask,
 ) -> float:
     cdef:
-        int64_t nobs = 0
-        float64_t mean = 0.0, m2 = 0.0, m3 = 0.0, m4 = 0.0
+        Moments moments
+        Py_ssize_t n = values.shape[0]
+        const float64_t* p_values = &values[0]
+        const uint8_t* p_mask = &mask[0] if mask is not None else NULL
 
     with nogil:
-        accumulate_moments_scalar(values, skipna, mask, &nobs, &mean, &m2, &m3, &m4, 4)
+        moments = accumulate_moments_scalar(p_values, n, skipna, p_mask, 4)
 
-    return calc_kurt(nobs, m2, m4)
+    return calc_kurt(moments)
 
 
 @cython.boundscheck(False)
@@ -1718,18 +1685,19 @@ def axis_skew(
     cdef:
         Py_ssize_t i
         Py_ssize_t nouter = values.shape[1] if axis == 0 else values.shape[0]
-        int64_t[::1] nobs = np.zeros(nouter, dtype=np.int64)
-        float64_t[::1] mean = np.zeros(nouter)
-        float64_t[::1] m2 = np.zeros(nouter)
-        float64_t[::1] m3 = np.zeros(nouter)
+        Moments *moments = <Moments *>calloc(nouter, sizeof(Moments))
         ndarray result_arr = np.empty(nouter, dtype=np.float64)
         float64_t[:] result = result_arr
 
+    if moments == NULL:
+        raise MemoryError()
+
     with nogil:
-        accumulate_moments_axis(values, skipna, mask, nobs, mean, m2, m3, None, axis, 3)
+        accumulate_moments_axis(values, skipna, mask, moments, axis, 3)
         for i in range(nouter):
-            result[i] = calc_skew(nobs[i], m2[i], m3[i])
+            result[i] = calc_skew(moments[i])
 
+    free(moments)
     return result_arr
 
 
@@ -1744,19 +1712,19 @@ def axis_kurt(
     cdef:
         Py_ssize_t i
         Py_ssize_t nouter = values.shape[1] if axis == 0 else values.shape[0]
-        int64_t[::1] nobs = np.zeros(nouter, dtype=np.int64)
-        float64_t[::1] mean = np.zeros(nouter)
-        float64_t[::1] m2 = np.zeros(nouter)
-        float64_t[::1] m3 = np.zeros(nouter)
-        float64_t[::1] m4 = np.zeros(nouter)
+        Moments *moments = <Moments *>calloc(nouter, sizeof(Moments))
         ndarray result_arr = np.empty(nouter, dtype=np.float64)
         float64_t[:] result = result_arr
 
+    if moments == NULL:
+        raise MemoryError()
+
     with nogil:
-        accumulate_moments_axis(values, skipna, mask, nobs, mean, m2, m3, m4, axis, 4)
+        accumulate_moments_axis(values, skipna, mask, moments, axis, 4)
         for i in range(nouter):
-            result[i] = calc_kurt(nobs[i], m2[i], m4[i])
+            result[i] = calc_kurt(moments[i])
 
+    free(moments)
     return result_arr
 
 
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 15abebef987ed..14b8320e7b044 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -10,6 +10,7 @@ from libc.math cimport (
     sqrt,
 )
 from libc.stdlib cimport (
+    calloc,
     free,
     malloc,
 )
@@ -34,6 +35,7 @@ cnp.import_array()
 
 from pandas._libs cimport util
 from pandas._libs.algos cimport (
+    Moments,
     calc_kurt,
     calc_skew,
     get_rank_nan_fill_val,
@@ -1023,23 +1025,18 @@ def group_skew(
 ) -> None:
     cdef:
         Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
-        int64_t[:, ::1] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
         bint uses_mask = mask is not None
-        float64_t[:, ::1] mean, M2, M3
+        Moments* ms
         float64_t val
 
     if len_values != len_labels:
         raise ValueError("len(index) != len(labels)")
 
-    nobs = np.zeros((<object>out).shape, dtype=np.int64)
-
-    mean = np.zeros((<object>out).shape, dtype=np.float64)
-    # M2, and M3 correspond to 2nd and 3rd central moments
-    M2 = np.zeros((<object>out).shape, dtype=np.float64)
-    M3 = np.zeros((<object>out).shape, dtype=np.float64)
-
     N, K = (<object>values).shape
+    ms = <Moments *>calloc(ngroups * K, sizeof(Moments))
+    if ms is NULL:
+        raise MemoryError()
 
     out[:, :] = 0.0
 
@@ -1060,15 +1057,16 @@ def group_skew(
                 if skipna and (isnan(val) or _treat_as_na(val, False)):
                     continue
 
-                moments_add_value(val, &nobs[lab, j], &mean[lab, j], &M2[lab, j],
-                                  &M3[lab, j], NULL, 3)
+                moments_add_value(&ms[lab * K + j], val, 3)
 
         for i in range(ngroups):
             for j in range(K):
-                out[i, j] = calc_skew(nobs[i, j], M2[i, j], M3[i, j])
+                out[i, j] = calc_skew(ms[i * K + j])
                 if result_mask is not None and isnan(out[i, j]):
                     result_mask[i, j] = 1
 
+    free(ms)
+
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
@@ -1085,24 +1083,18 @@ def group_kurt(
 ) -> None:
     cdef:
         Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
-        int64_t[:, ::1] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
         bint uses_mask = mask is not None
-        float64_t[:, ::1] mean, M2, M3, M4
+        Moments* ms
         float64_t val
 
     if len_values != len_labels:
         raise ValueError("len(index) != len(labels)")
 
-    nobs = np.zeros((<object>out).shape, dtype=np.int64)
-
-    mean = np.zeros((<object>out).shape, dtype=np.float64)
-    # M2, M3 and M4 correspond to 2nd, 3rd and 4th Central Moments
-    M2 = np.zeros((<object>out).shape, dtype=np.float64)
-    M3 = np.zeros((<object>out).shape, dtype=np.float64)
-    M4 = np.zeros((<object>out).shape, dtype=np.float64)
-
     N, K = (<object>values).shape
+    ms = <Moments *>calloc(ngroups * K, sizeof(Moments))
+    if ms is NULL:
+        raise MemoryError()
 
     out[:, :] = 0.0
 
@@ -1123,15 +1115,16 @@ def group_kurt(
                 if skipna and (isnan(val) or _treat_as_na(val, False)):
                     continue
 
-                moments_add_value(val, &nobs[lab, j], &mean[lab, j], &M2[lab, j],
-                                  &M3[lab, j], &M4[lab, j], 4)
+                moments_add_value(&ms[lab * K + j], val, 4)
 
         for i in range(ngroups):
             for j in range(K):
-                out[i, j] = calc_kurt(nobs[i, j], M2[i, j], M4[i, j])
+                out[i, j] = calc_kurt(ms[i * K + j])
                 if result_mask is not None and isnan(out[i, j]):
                     result_mask[i, j] = 1
 
+    free(ms)
+
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
diff --git a/pandas/_libs/include/pandas/moments.h b/pandas/_libs/include/pandas/moments.h
new file mode 100644
index 0000000000000..2a56c7e19e169
--- /dev/null
+++ b/pandas/_libs/include/pandas/moments.h
@@ -0,0 +1,121 @@
+/*
+Copyright (c) 2026, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+*/
+
+#pragma once
+
+#include <math.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+// prevent name mangling
+extern "C" {
+#endif
+
+typedef struct {
+  int64_t n;
+  double mean;
+  double m2;
+  double m3;
+  double m4;
+} Moments;
+
+static inline void moments_add_value(Moments *moments, double val,
+                                     int max_moment) {
+  double delta = val - moments->mean;
+  moments->n++;
+  double n = (double)moments->n;
+  double delta_n = delta / n;
+  double term1 = delta * delta_n * (n - 1.0);
+
+  if (max_moment >= 4) {
+    moments->m4 +=
+        delta_n *
+        (-4.0 * moments->m3 +
+         delta_n * (6.0 * moments->m2 + term1 * (n * n - 3.0 * n + 3.0)));
+  }
+  if (max_moment >= 3) {
+    moments->m3 += delta_n * (term1 * (n - 2.0) - 3.0 * moments->m2);
+  }
+  moments->m2 += term1;
+  moments->mean += delta_n;
+}
+
+static inline Moments moments_merge(Moments a, Moments b, int max_moment) {
+
+  if (a.n == 0) {
+    return b;
+  }
+  if (b.n == 0) {
+    return a;
+  }
+
+  Moments out;
+
+  out.n = a.n + b.n;
+  double n_a = (double)a.n;
+  double n_b = (double)b.n;
+  double delta = b.mean - a.mean;
+  double n = (double)out.n;
+  double delta_n = delta / n;
+  double term1 = delta * delta_n * n_a * n_b;
+
+  if (max_moment >= 4) {
+    out.m4 =
+        a.m4 + b.m4 +
+        delta_n * (4.0 * (n_a * b.m3 - n_b * a.m3) +
+                   delta_n * (6.0 * (n_a * n_a * b.m2 + n_b * n_b * a.m2) +
+                              term1 * (n_a * n_a - n_a * n_b + n_b * n_b)));
+  } else {
+    out.m4 = 0.0;
+  }
+  if (max_moment >= 3) {
+    out.m3 = a.m3 + b.m3 +
+             delta_n * (3.0 * (n_a * b.m2 - n_b * a.m2) + term1 * (n_a - n_b));
+  } else {
+    out.m3 = 0.0;
+  }
+  out.m2 = a.m2 + b.m2 + term1;
+  out.mean = a.mean + delta_n * n_b;
+
+  return out;
+}
+
+/// Compute central moments until `max_moment` using `n` elements from `values`.
+/// The size is represented as signed integer for MSVC compatibility with
+/// OpenMP.
+Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna,
+                                  const uint8_t *mask, int max_moment);
+
+static inline double calc_skew(Moments moments) {
+  if (moments.n < 3) {
+    return NAN;
+  }
+  double dnobs = (double)moments.n;
+  double moments_ratio = moments.m3 / (moments.m2 * sqrt(moments.m2));
+  double correction = (dnobs * sqrt(dnobs - 1.0)) / (dnobs - 2.0);
+  return moments_ratio * correction;
+}
+
+static inline double calc_kurt(Moments moments) {
+  if (moments.n < 4) {
+    return NAN;
+  }
+  double dnobs = (double)moments.n;
+  double moments_ratio = moments.m4 / (moments.m2 * moments.m2);
+  double term1 = dnobs * (dnobs + 1.0) * moments_ratio;
+  double term2 = 3.0 * (dnobs - 1.0);
+  double inner = term1 - term2;
+  double correction = (dnobs - 1.0) / ((dnobs - 2.0) * (dnobs - 3.0));
+  return correction * inner;
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build
index 56ff2a01b450c..2c2122613d3e3 100644
--- a/pandas/_libs/meson.build
+++ b/pandas/_libs/meson.build
@@ -55,13 +55,19 @@ fast_float = subproject('fast_float')
 fast_float_dep = fast_float.get_variable('fast_float_dep')
 
 subdir('tslibs')
+openmp_dep = dependency('OpenMP', required: false)
 
 libs_sources = {
     # Dict of extension name -> dict of {sources, include_dirs, and deps}
     # numpy include dir is implicitly included
     'algos': {
-        'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper],
-        'deps': [_khash_primitive_helper_dep, m_dep],
+        'sources': [
+            'algos.pyx',
+            'src/moments.c',
+            _algos_common_helper,
+            _algos_take_helper,
+        ],
+        'deps': [_khash_primitive_helper_dep, m_dep, openmp_dep],
     },
     'arrays': {'sources': ['arrays.pyx']},
     'groupby': {'sources': ['groupby.pyx'], 'deps': [m_dep]},
diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
new file mode 100644
index 0000000000000..fd31a9c1c1d11
--- /dev/null
+++ b/pandas/_libs/src/moments.c
@@ -0,0 +1,40 @@
+/*
+Copyright (c) 2026, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+*/
+
+#include "pandas/moments.h"
+#include <math.h>
+
+Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna,
+                                  const uint8_t *mask, int max_moment) {
+  Moments result = {0};
+
+#pragma omp parallel
+  {
+    Moments moments_local = {0};
+
+#pragma omp for nowait
+    for (int64_t i = 0; i < n; i++) {
+      double val = values[i];
+      if (mask && mask[i]) {
+        val = NAN;
+      }
+      if (skipna && isnan(val)) {
+        continue;
+      }
+      moments_add_value(&moments_local, val, max_moment);
+    }
+
+#pragma omp critical
+    {
+      result = moments_merge(result, moments_local, max_moment);
+    }
+  }
+
+  return result;
+}
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
index 619d5590cc207..3ffc14c1b4cbc 100644
--- a/pandas/_libs/window/aggregations.pyx
+++ b/pandas/_libs/window/aggregations.pyx
@@ -9,6 +9,7 @@ from libcpp.stack cimport stack
 from libcpp.unordered_map cimport unordered_map
 
 from pandas._libs.algos cimport (
+    Moments,
     TiebreakEnumType,
     calc_kurt,
     calc_skew,
@@ -492,26 +493,22 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
 # Rolling skewness
 
 
-cdef void add_skew(float64_t val, int64_t *nobs,
-                   float64_t *mean, float64_t *m2,
-                   float64_t *m3,
+cdef void add_skew(float64_t val, Moments *m,
                    bint *numerically_unstable,
                    ) noexcept nogil:
     """ add a value from the skew calc """
     cdef:
-        float64_t old_m3 = m3[0]
+        float64_t old_m3 = m.m3
 
     # Not NaN
     if val == val:
-        moments_add_value(val, nobs, mean, m2, m3, NULL, 3)
-        if fabs(old_m3) * InvCondTol > fabs(m3[0]):
+        moments_add_value(m, val, 3)
+        if fabs(old_m3) * InvCondTol > fabs(m.m3):
             # possible catastrophic cancellation
             numerically_unstable[0] = True
 
 
-cdef void remove_skew(float64_t val, int64_t *nobs,
-                      float64_t *mean, float64_t *m2,
-                      float64_t *m3,
+cdef void remove_skew(float64_t val, Moments *m,
                       bint *numerically_unstable) noexcept nogil:
     """ remove a value from the skew calc """
     cdef:
@@ -529,22 +526,22 @@ cdef void remove_skew(float64_t val, int64_t *nobs,
 
     # Not NaN
     if val == val:
-        nobs[0] -= 1
-        n = <float64_t>(nobs[0])
-        delta = val - mean[0]
+        m.n -= 1
+        n = <float64_t>(m.n)
+        delta = val - m.mean
         delta_n = delta / n
         term1 = delta_n * delta * (n + 1.0)
 
-        m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * m2[0])
-        new_m3 = m3[0] - m3_update
+        m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * m.m2)
+        new_m3 = m.m3 - m3_update
 
-        if (fabs(m3_update) + fabs(m3[0])) * InvCondTol > fabs(new_m3):
+        if (fabs(m3_update) + fabs(m.m3)) * InvCondTol > fabs(new_m3):
             # possible catastrophic cancellation
             numerically_unstable[0] = True
 
-        m3[0] = new_m3
-        m2[0] -= term1
-        mean[0] -= delta_n
+        m.m3 = new_m3
+        m.m2 -= term1
+        m.mean -= delta_n
 
 
 def roll_skew(const float64_t[:] values, ndarray[int64_t] start,
@@ -552,8 +549,8 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start,
     cdef:
         Py_ssize_t i, j
         float64_t val
-        float64_t mean, m2, m3
-        int64_t nobs = 0, N = len(start)
+        Moments m
+        int64_t N = len(start)
         int64_t s, e
         ndarray[float64_t] output
         bint is_monotonic_increasing_bounds
@@ -585,31 +582,35 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start,
                 # calculate deletes
                 for j in range(start[i - 1], s):
                     val = values[j]
-                    remove_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable)
+                    remove_skew(val, &m, &numerically_unstable)
 
                 # calculate adds
                 for j in range(end[i - 1], e):
                     val = values[j]
-                    add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable)
+                    add_skew(val, &m, &numerically_unstable)
 
             if requires_recompute or numerically_unstable:
 
-                mean = m2 = m3 = 0.0
-                nobs = 0
+                m.n = 0
+                m.mean = 0.0
+                m.m2 = 0.0
+                m.m3 = 0.0
+                m.m4 = 0.0
 
                 for j in range(s, e):
                     val = values[j]
-                    add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable)
+                    add_skew(val, &m, &numerically_unstable)
 
                 numerically_unstable = False
 
-            output[i] = NaN if nobs < minp else calc_skew(nobs, m2, m3)
+            output[i] = NaN if m.n < minp else calc_skew(m)
 
             if not is_monotonic_increasing_bounds:
-                nobs = 0
-                mean = 0.0
-                m2 = 0.0
-                m3 = 0.0
+                m.n = 0
+                m.mean = 0.0
+                m.m2 = 0.0
+                m.m3 = 0.0
+                m.m4 = 0.0
 
     return output
 
@@ -617,26 +618,22 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start,
 # Rolling kurtosis
 
 
-cdef void add_kurt(float64_t val, int64_t *nobs,
-                   float64_t *mean, float64_t *m2,
-                   float64_t *m3, float64_t *m4,
+cdef void add_kurt(float64_t val, Moments *m,
                    bint *numerically_unstable,
                    ) noexcept nogil:
     """ add a value from the kurotic calc """
     cdef:
-        float64_t old_m4 = m4[0]
+        float64_t old_m4 = m.m4
 
     # Not NaN
     if val == val:
-        moments_add_value(val, nobs, mean, m2, m3, m4, 4)
-        if fabs(old_m4) * InvCondTol > fabs(m4[0]):
+        moments_add_value(m, val, 4)
+        if fabs(old_m4) * InvCondTol > fabs(m.m4):
             # possible catastrophic cancellation
             numerically_unstable[0] = True
 
 
-cdef void remove_kurt(float64_t val, int64_t *nobs,
-                      float64_t *mean, float64_t *m2,
-                      float64_t *m3, float64_t *m4,
+cdef void remove_kurt(float64_t val, Moments *m,
                       bint *numerically_unstable,
                       ) noexcept nogil:
     """ remove a value from the kurotic calc """
@@ -645,37 +642,37 @@ cdef void remove_kurt(float64_t val, int64_t *nobs,
 
     # Not NaN
     if val == val:
-        nobs[0] -= 1
-        n = <float64_t>(nobs[0])
-        delta = val - mean[0]
+        m.n -= 1
+        n = <float64_t>(m.n)
+        delta = val - m.mean
         delta_n = delta / n
         term1 = delta_n * delta * (n + 1.0)
 
         m4_update = delta_n * (
-                4.0 * m3[0]
+                4.0 * m.m3
                 + delta_n * (
-                    6.0 * m2[0]
+                    6.0 * m.m2
                     - term1 * (n * n + 3.0 * n + 3.0)
                     )
                 )
-        new_m4 = m4[0] + m4_update
+        new_m4 = m.m4 + m4_update
 
-        if (fabs(m4_update) + fabs(m4[0])) * InvCondTol > fabs(new_m4):
+        if (fabs(m4_update) + fabs(m.m4)) * InvCondTol > fabs(new_m4):
             # possible catastrophic cancellation
             numerically_unstable[0] = True
 
-        m4[0] = new_m4
-        m3[0] -= delta_n * (term1 * (n + 2.0) - 3.0 * m2[0])
-        m2[0] -= term1
-        mean[0] -= delta_n
+        m.m4 = new_m4
+        m.m3 -= delta_n * (term1 * (n + 2.0) - 3.0 * m.m2)
+        m.m2 -= term1
+        m.mean -= delta_n
 
 
 def roll_kurt(const float64_t[:] values, ndarray[int64_t] start,
               ndarray[int64_t] end, int64_t minp) -> np.ndarray:
     cdef:
         Py_ssize_t i, j
-        float64_t mean, m2, m3, m4
-        int64_t nobs, s, e
+        Moments m
+        int64_t s, e
         int64_t N = len(start)
         ndarray[float64_t] output
         bint is_monotonic_increasing_bounds
@@ -707,30 +704,32 @@ def roll_kurt(const float64_t[:] values, ndarray[int64_t] start,
                 # and removed
                 # calculate deletes
                 for j in range(start[i - 1], s):
-                    remove_kurt(values[j], &nobs, &mean, &m2, &m3, &m4,
-                                &numerically_unstable)
+                    remove_kurt(values[j], &m, &numerically_unstable)
 
                 # calculate adds
                 for j in range(end[i - 1], e):
-                    add_kurt(values[j], &nobs, &mean, &m2, &m3, &m4,
-                             &numerically_unstable)
+                    add_kurt(values[j], &m, &numerically_unstable)
 
             if requires_recompute or numerically_unstable:
 
-                mean = m2 = m3 = m4 = 0.0
-                nobs = 0
+                m.n = 0
+                m.mean = 0.0
+                m.m2 = 0.0
+                m.m3 = 0.0
+                m.m4 = 0.0
                 for j in range(s, e):
-                    add_kurt(values[j], &nobs, &mean, &m2, &m3, &m4,
-                             &numerically_unstable)
+                    add_kurt(values[j], &m, &numerically_unstable)
+
+                numerically_unstable = False
 
-            output[i] = NaN if nobs < minp else calc_kurt(nobs, m2, m4)
+            output[i] = NaN if m.n < minp else calc_kurt(m)
 
             if not is_monotonic_increasing_bounds:
-                nobs = 0
-                mean = 0.0
-                m2 = 0.0
-                m3 = 0.0
-                m4 = 0.0
+                m.n = 0
+                m.mean = 0.0
+                m.m2 = 0.0
+                m.m3 = 0.0
+                m.m4 = 0.0
 
     return output
 
diff --git a/pandas/_libs/window/meson.build b/pandas/_libs/window/meson.build
index 8c00a98b1241a..3e3e25b626914 100644
--- a/pandas/_libs/window/meson.build
+++ b/pandas/_libs/window/meson.build
@@ -10,6 +10,7 @@ py.extension_module(
     ['aggregations.pyx'],
     cython_args: cy_args,
     include_directories: [inc_np, inc_pd],
+    dependencies: [m_dep],
     subdir: 'pandas/_libs/window',
     override_options: ['cython_language=cpp'],
     install: true,

From 2c9de5c6be4238699630dffe8334858d7e36a8b9 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Sat, 14 Mar 2026 12:48:09 -0300
Subject: [PATCH 02/35] fix: omit warnings when OpenMP not used

---
 pandas/_libs/src/moments.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index fd31a9c1c1d11..dc63ea6412276 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -14,11 +14,15 @@ Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna,
                                   const uint8_t *mask, int max_moment) {
   Moments result = {0};
 
-#pragma omp parallel
+#ifdef _OPENMP
+#  pragma omp parallel
+#endif
   {
     Moments moments_local = {0};
 
-#pragma omp for nowait
+#ifdef _OPENMP
+#  pragma omp for nowait
+#endif
     for (int64_t i = 0; i < n; i++) {
       double val = values[i];
       if (mask && mask[i]) {
@@ -30,7 +34,9 @@ Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna,
       moments_add_value(&moments_local, val, max_moment);
     }
 
-#pragma omp critical
+#ifdef _OPENMP
+#  pragma omp critical
+#endif
     {
       result = moments_merge(result, moments_local, max_moment);
     }

From 69786e80a279faaaafa811172f5263516823cb0d Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Sat, 14 Mar 2026 13:27:49 -0300
Subject: [PATCH 03/35] fix: try to fix msvc error

---
 pandas/_libs/src/moments.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index dc63ea6412276..009790951ae6a 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -9,10 +9,12 @@ The full license is in the LICENSE file, distributed with this software.
 
 #include "pandas/moments.h"
 #include <math.h>
+#include <stdint.h>
 
 Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna,
                                   const uint8_t *mask, int max_moment) {
   Moments result = {0};
+  int64_t i;
 
 #ifdef _OPENMP
 #  pragma omp parallel
@@ -23,7 +25,7 @@ Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna,
 #ifdef _OPENMP
 #  pragma omp for nowait
 #endif
-    for (int64_t i = 0; i < n; i++) {
+    for (i = 0; i < n; i++) {
       double val = values[i];
       if (mask && mask[i]) {
         val = NAN;

From 3464b248c4b92e8834660062054193091b3528ef Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Sat, 14 Mar 2026 15:00:02 -0300
Subject: [PATCH 04/35] chore: rename output and don't assign 0.0 on else

---
 pandas/_libs/include/pandas/moments.h | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/pandas/_libs/include/pandas/moments.h b/pandas/_libs/include/pandas/moments.h
index 2a56c7e19e169..e492e8200c82b 100644
--- a/pandas/_libs/include/pandas/moments.h
+++ b/pandas/_libs/include/pandas/moments.h
@@ -56,35 +56,31 @@ static inline Moments moments_merge(Moments a, Moments b, int max_moment) {
     return a;
   }
 
-  Moments out;
+  Moments result;
 
-  out.n = a.n + b.n;
+  result.n = a.n + b.n;
   double n_a = (double)a.n;
   double n_b = (double)b.n;
   double delta = b.mean - a.mean;
-  double n = (double)out.n;
-  double delta_n = delta / n;
+  double delta_n = delta / (double)result.n;
   double term1 = delta * delta_n * n_a * n_b;
 
   if (max_moment >= 4) {
-    out.m4 =
+    result.m4 =
         a.m4 + b.m4 +
         delta_n * (4.0 * (n_a * b.m3 - n_b * a.m3) +
                    delta_n * (6.0 * (n_a * n_a * b.m2 + n_b * n_b * a.m2) +
                               term1 * (n_a * n_a - n_a * n_b + n_b * n_b)));
-  } else {
-    out.m4 = 0.0;
   }
   if (max_moment >= 3) {
-    out.m3 = a.m3 + b.m3 +
-             delta_n * (3.0 * (n_a * b.m2 - n_b * a.m2) + term1 * (n_a - n_b));
-  } else {
-    out.m3 = 0.0;
+    result.m3 =
+        a.m3 + b.m3 +
+        delta_n * (3.0 * (n_a * b.m2 - n_b * a.m2) + term1 * (n_a - n_b));
   }
-  out.m2 = a.m2 + b.m2 + term1;
-  out.mean = a.mean + delta_n * n_b;
+  result.m2 = a.m2 + b.m2 + term1;
+  result.mean = a.mean + delta_n * n_b;
 
-  return out;
+  return result;
 }
 
 /// Compute central moments until `max_moment` using `n` elements from `values`.

From a86820e9b4b879615ca700a5f1e84f93f3e49b13 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Sat, 14 Mar 2026 15:48:57 -0300
Subject: [PATCH 05/35] test: use almost_equal instead of strict equality

Depending on how the job is distributed among the threads, the tests
may compute different results. So, assert almost equality instead of
strict equality.
---
 pandas/tests/apply/test_str.py                | 3 ++-
 pandas/tests/arrays/floating/test_function.py | 3 ++-
 pandas/tests/arrays/integer/test_function.py  | 3 ++-
 pandas/tests/test_nanops.py                   | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py
index e5a9492630b13..9589db07dbcee 100644
--- a/pandas/tests/apply/test_str.py
+++ b/pandas/tests/apply/test_str.py
@@ -41,7 +41,8 @@ def test_apply_with_string_funcs(float_frame, func, kwds, how):
 def test_with_string_args(datetime_series, all_numeric_reductions):
     result = datetime_series.apply(all_numeric_reductions)
     expected = getattr(datetime_series, all_numeric_reductions)()
-    assert result == expected
+    # reductions computed in parallel may yield different results
+    tm.assert_almost_equal(result, expected)
 
 
 @pytest.mark.parametrize("op", ["mean", "median", "std", "var"])
diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py
index a5938b2aa93e8..2fcdcd5a9e823 100644
--- a/pandas/tests/arrays/floating/test_function.py
+++ b/pandas/tests/arrays/floating/test_function.py
@@ -116,7 +116,8 @@ def test_stat_method(pandasmethname, kwargs):
     s2 = pd.Series(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype="float64")
     pandasmeth = getattr(s2, pandasmethname)
     expected = pandasmeth(**kwargs)
-    assert expected == result
+    # reductions computed in parallel may yield different results
+    tm.assert_almost_equal(result, expected)
 
 
 def test_value_counts_na():
diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py
index 6b3308baf3ee7..a288ab631ae3b 100644
--- a/pandas/tests/arrays/integer/test_function.py
+++ b/pandas/tests/arrays/integer/test_function.py
@@ -134,7 +134,8 @@ def test_stat_method(pandasmethname, kwargs):
     s2 = pd.Series(data=[1, 2, 3, 4, 5, 6], dtype="Int64")
     pandasmeth = getattr(s2, pandasmethname)
     expected = pandasmeth(**kwargs)
-    assert expected == result
+    # reductions computed in parallel may yield different results
+    tm.assert_almost_equal(result, expected)
 
 
 def test_value_counts_na():
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index acd92d916bdcc..b83e2d6e0e7f2 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -1243,7 +1243,8 @@ def test_nanops_independent_of_mask_param(operation):
     mask = ser.isna()
     median_expected = operation(ser._values)
     median_result = operation(ser._values, mask=mask._values)
-    assert median_expected == median_result
+    # reductions computed in parallel may yield different results
+    tm.assert_almost_equal(median_result, median_expected)
 
 
 @pytest.mark.parametrize("min_count", [-1, 0])

From 3485af7cb12146ff4b7f2e4fda1714bc4fc10a8a Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Mon, 16 Mar 2026 23:28:11 -0300
Subject: [PATCH 06/35] perf: use AVX2 instructions

---
 pandas/_libs/src/moments.c | 204 ++++++++++++++++++++++++++++++++++---
 1 file changed, 188 insertions(+), 16 deletions(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index 009790951ae6a..59b26283244c9 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -10,31 +10,203 @@ The full license is in the LICENSE file, distributed with this software.
 #include "pandas/moments.h"
 #include <math.h>
 #include <stdint.h>
+#include <string.h>
+
+#ifdef _OPENMP
+#  include <omp.h>
+#else
+static inline int omp_get_thread_num() { return 0; }
+static inline int omp_get_num_threads() { return 1; }
+#endif
+
+/* --- AVX2 SIMD Implementation --- */
+
+#ifdef __x86_64__
+#  if defined(__clang__)
+#    pragma clang attribute push(__attribute__((target("avx2"))),              \
+                                 apply_to = function)
+#    define PANDAS_HAS_AVX2 1
+#  elif defined(__GNUC__)
+#    pragma GCC push_options
+#    pragma GCC target("avx2")
+#    define PANDAS_HAS_AVX2 1
+#  endif // defined(__GNUC__)
+#endif   // ifdef __x86_64__
+
+#ifdef PANDAS_HAS_AVX2
+#  include <immintrin.h>
+
+// Vectorization of [moments_add_value]
+static inline Moments accumulate_moments_avx2(const double *values, int64_t n,
+                                              int skipna, const uint8_t *mask,
+                                              int max_moment) {
+  __m256d v_mean = _mm256_setzero_pd();
+  __m256d v_m2 = _mm256_setzero_pd();
+  __m256d v_m3 = _mm256_setzero_pd();
+  __m256d v_m4 = _mm256_setzero_pd();
+  // Using double for v_n to avoid epi64 -> pd conversions.
+  // The results are exact up to n < 2^53.
+  __m256d v_n = _mm256_setzero_pd();
+
+  __m256d v_one = _mm256_set1_pd(1.0);
+  __m256d v_zero = _mm256_setzero_pd();
+  __m256d v_nan = _mm256_set1_pd(NAN);
+
+  int64_t i = 0;
+  for (; i < n - 4; i += 4) {
+    __m256d v_val = _mm256_loadu_pd(values + i);
+
+    if (mask) {
+      uint32_t m32;
+      memcpy(&m32, mask + i, sizeof(uint32_t));
+      __m128i v_mask_bytes = _mm_cvtsi32_si128((int)m32);
+      __m256i v_mask_epi64 = _mm256_cvtepu8_epi64(v_mask_bytes);
+      __m256i v_mask_is_zero =
+          _mm256_cmpeq_epi64(v_mask_epi64, _mm256_setzero_si256());
+      // replace val with NAN where mask is 1.
+      v_val =
+          _mm256_blendv_pd(v_nan, v_val, _mm256_castsi256_pd(v_mask_is_zero));
+    }
+
+    __m256d v_skip_mask =
+        skipna ? _mm256_cmp_pd(v_val, v_val, _CMP_UNORD_Q) : v_zero;
+    // Increment 1 where we do not skip
+    __m256d v_n_increment = _mm256_andnot_pd(v_skip_mask, v_one);
+    v_n = _mm256_add_pd(v_n, v_n_increment);
+    __m256d v_n_nonzero = _mm256_max_pd(v_n, v_one);
+
+    __m256d v_delta = _mm256_sub_pd(v_val, v_mean);
+
+    // replace delta with zero when skipping to don't update moments
+    v_delta = _mm256_blendv_pd(v_delta, v_zero, v_skip_mask);
+    __m256d v_delta_n = _mm256_div_pd(v_delta, v_n_nonzero);
+
+    __m256d v_delta_n2 = _mm256_mul_pd(v_delta, v_delta_n);
+    __m256d v_term1 = _mm256_mul_pd(v_delta_n2, _mm256_sub_pd(v_n, v_one));
+
+    if (max_moment >= 4) {
+      __m256d v_n2 = _mm256_mul_pd(v_n, v_n);
+      // n * n - 3.0 * n + 3.0
+      __m256d v_n_term = _mm256_add_pd(
+          _mm256_sub_pd(v_n2, _mm256_mul_pd(_mm256_set1_pd(3.0), v_n)),
+          _mm256_set1_pd(3.0));
+      __m256d v_m4_update = _mm256_mul_pd(
+          v_delta_n,
+          _mm256_add_pd(
+              _mm256_mul_pd(_mm256_set1_pd(-4.0), v_m3),
+              _mm256_mul_pd(
+                  v_delta_n,
+                  _mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(6.0), v_m2),
+                                _mm256_mul_pd(v_term1, v_n_term)))));
+      v_m4 = _mm256_add_pd(v_m4, v_m4_update);
+    }
+
+    if (max_moment >= 3) {
+      __m256d v_m3_term = _mm256_mul_pd(
+          v_delta_n,
+          _mm256_sub_pd(
+              _mm256_mul_pd(v_term1, _mm256_sub_pd(v_n, _mm256_set1_pd(2.0))),
+              _mm256_mul_pd(_mm256_set1_pd(3.0), v_m2)));
+      v_m3 = _mm256_add_pd(v_m3, v_m3_term);
+    }
+
+    v_m2 = _mm256_add_pd(v_m2, v_term1);
+    v_mean = _mm256_add_pd(v_mean, v_delta_n);
+  }
+
+  double n_arr[4], mean_arr[4], m2_arr[4], m3_arr[4], m4_arr[4];
+  _mm256_storeu_pd(n_arr, v_n);
+  _mm256_storeu_pd(mean_arr, v_mean);
+  _mm256_storeu_pd(m2_arr, v_m2);
+  _mm256_storeu_pd(m3_arr, v_m3);
+  _mm256_storeu_pd(m4_arr, v_m4);
+
+  Moments moments_arr[4];
+  for (int j = 0; j < 4; j++) {
+    moments_arr[j] = (Moments){(int64_t)round(n_arr[j]), mean_arr[j], m2_arr[j],
+                               m3_arr[j], m4_arr[j]};
+  }
+
+  // Distribute remaining values across chunks
+  for (; i < n; i++) {
+    double val = values[i];
+    if (mask && mask[i])
+      val = NAN;
+    if (skipna && isnan(val))
+      continue;
+    moments_add_value(&moments_arr[i % 4], val, max_moment);
+  }
+
+  // pairwise merge for numerical stability
+  Moments m_01 = moments_merge(moments_arr[0], moments_arr[1], max_moment);
+  Moments m_23 = moments_merge(moments_arr[2], moments_arr[3], max_moment);
+
+  return moments_merge(m_01, m_23, max_moment);
+}
+#endif
+
+#ifdef __x86_64__
+#  if defined(__clang__)
+#    pragma clang attribute pop
+#  elif defined(__GNUC__)
+#    pragma GCC pop_options
+#  endif
+#endif // ifdef __x86_64__
+
+/* --- Scalar Fallback Implementation --- */
+
+static inline Moments accumulate_moments_scalar_block(const double *values,
+                                                      int64_t n, int skipna,
+                                                      const uint8_t *mask,
+                                                      int max_moment) {
+  Moments moments = {0};
+  for (int64_t i = 0; i < n; i++) {
+    double val = values[i];
+    if (mask && mask[i])
+      val = NAN;
+    if (skipna && isnan(val))
+      continue;
+    moments_add_value(&moments, val, max_moment);
+  }
+  return moments;
+}
+
+/* --- Accumulation Dispatch (Choose AVX2 or Scalar) --- */
+
+static inline Moments accumulate_moments_dispatch(const double *values,
+                                                  int64_t n, int skipna,
+                                                  const uint8_t *mask,
+                                                  int max_moment) {
+#if defined(PANDAS_HAS_AVX2) && (defined(__GNUC__) || defined(__clang__))
+  if (__builtin_cpu_supports("avx2")) {
+    return accumulate_moments_avx2(values, n, skipna, mask, max_moment);
+  }
+#endif
+  return accumulate_moments_scalar_block(values, n, skipna, mask, max_moment);
+}
+#undef PANDAS_HAS_AVX2
+
+/* --- Public API (Orchestrates OpenMP Parallelism) --- */
 
 Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna,
                                   const uint8_t *mask, int max_moment) {
   Moments result = {0};
-  int64_t i;
 
 #ifdef _OPENMP
-#  pragma omp parallel
+  // parallel threshold chosen based on `perf report` where libgomp wasn't the
+  // overhead
+#  pragma omp parallel if (n > 10000)
 #endif
   {
-    Moments moments_local = {0};
+    int tid = omp_get_thread_num();
+    int num_threads = omp_get_num_threads();
 
-#ifdef _OPENMP
-#  pragma omp for nowait
-#endif
-    for (i = 0; i < n; i++) {
-      double val = values[i];
-      if (mask && mask[i]) {
-        val = NAN;
-      }
-      if (skipna && isnan(val)) {
-        continue;
-      }
-      moments_add_value(&moments_local, val, max_moment);
-    }
+    int64_t start = (tid * n) / num_threads;
+    int64_t end = tid == num_threads - 1 ? n : ((tid + 1) * n) / num_threads;
+
+    Moments moments_local =
+        accumulate_moments_dispatch(values + start, end - start, skipna,
+                                    mask ? mask + start : NULL, max_moment);
 
 #ifdef _OPENMP
 #  pragma omp critical

From f7c0d8558152e6381fc691d3d5342a9308e818bc Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Wed, 18 Mar 2026 21:05:45 -0300
Subject: [PATCH 07/35] wip: neon SIMD

---
 pandas/_libs/src/moments.c | 138 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 137 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index 59b26283244c9..91b3fb810ac49 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -153,6 +153,138 @@ static inline Moments accumulate_moments_avx2(const double *values, int64_t n,
 #  endif
 #endif // ifdef __x86_64__
 
+/* --- NEON SIMD Implementation --- */
+
+#ifdef __aarch64__
+#  if defined(__clang__)
+#    pragma clang attribute push(__attribute__((target("simd"))),              \
+                                 apply_to = function)
+#    define PANDAS_HAS_NEON 1
+#  elif defined(__GNUC__)
+#    pragma GCC push_options
+#    pragma GCC target("+simd")
+#    define PANDAS_HAS_NEON 1
+#  endif
+#endif
+
+#ifdef PANDAS_HAS_NEON
+#  include <arm_neon.h>
+
+// Vectorization of [moments_add_value]
+static inline Moments accumulate_moments_neon(const double *values, int64_t n,
+                                              int skipna, const uint8_t *mask,
+                                              int max_moment) {
+  float64x2_t v_mean = vdupq_n_f64(0.0);
+  float64x2_t v_m2 = vdupq_n_f64(0.0);
+  float64x2_t v_m3 = vdupq_n_f64(0.0);
+  float64x2_t v_m4 = vdupq_n_f64(0.0);
+  // Using double for v_n to avoid conversions.
+  float64x2_t v_n = vdupq_n_f64(0.0);
+
+  float64x2_t v_one = vdupq_n_f64(1.0);
+  float64x2_t v_zero = vdupq_n_f64(0.0);
+  float64x2_t v_nan = vdupq_n_f64(NAN);
+
+  int64_t i = 0;
+  for (; i < n - 2; i += 2) {
+    float64x2_t v_val = vld1q_f64(values + i);
+
+    uint64x2_t v_keep_mask = vdupq_n_u64(-1ULL);
+
+    if (mask) {
+      uint16_t m16;
+      memcpy(&m16, mask + i, sizeof(uint16_t));
+      uint8x8_t v_mask_bytes = vreinterpret_u8_u16(vdup_n_u16(m16));
+      // Expand 2 bytes to 2x 64-bit.
+      uint16x8_t v_m16 = vmovl_u8(v_mask_bytes);
+      uint32x4_t v_m32 = vmovl_u16(vget_low_u16(v_m16));
+      uint64x2_t v_m64 = vmovl_u32(vget_low_u32(v_m32));
+
+      uint64x2_t v_mask_is_zero = vceqq_u64(v_m64, vdupq_n_u64(0));
+      // replace val with NAN where mask is != 0.
+      v_val = vbslq_f64(v_mask_is_zero, v_val, v_nan);
+    }
+
+    if (skipna) {
+      uint64x2_t v_is_not_nan = vceqq_f64(v_val, v_val);
+      v_keep_mask = vandq_u64(v_keep_mask, v_is_not_nan);
+    }
+
+    // Increment 1 where we do not skip
+    float64x2_t v_n_increment = vbslq_f64(v_keep_mask, v_one, v_zero);
+    v_n = vaddq_f64(v_n, v_n_increment);
+    float64x2_t v_n_nonzero = vmaxq_f64(v_n, v_one);
+
+    float64x2_t v_delta = vsubq_f64(v_val, v_mean);
+
+    // replace delta with zero when skipping to don't update moments
+    v_delta = vbslq_f64(v_keep_mask, v_delta, v_zero);
+    float64x2_t v_delta_n = vdivq_f64(v_delta, v_n_nonzero);
+
+    float64x2_t v_delta_n2 = vmulq_f64(v_delta, v_delta_n);
+    float64x2_t v_term1 = vmulq_f64(v_delta_n2, vsubq_f64(v_n, v_one));
+
+    if (max_moment >= 4) {
+      float64x2_t v_n2 = vmulq_f64(v_n, v_n);
+      // n * n - 3.0 * n + 3.0
+      float64x2_t v_n_term = vaddq_f64(
+          vsubq_f64(v_n2, vmulq_f64(vdupq_n_f64(3.0), v_n)), vdupq_n_f64(3.0));
+      float64x2_t v_m4_update = vmulq_f64(
+          v_delta_n,
+          vaddq_f64(
+              vmulq_f64(vdupq_n_f64(-4.0), v_m3),
+              vmulq_f64(v_delta_n, vaddq_f64(vmulq_f64(vdupq_n_f64(6.0), v_m2),
+                                             vmulq_f64(v_term1, v_n_term)))));
+      v_m4 = vaddq_f64(v_m4, v_m4_update);
+    }
+
+    if (max_moment >= 3) {
+      float64x2_t v_m3_term = vmulq_f64(
+          v_delta_n,
+          vsubq_f64(vmulq_f64(v_term1, vsubq_f64(v_n, vdupq_n_f64(2.0))),
+                    vmulq_f64(vdupq_n_f64(3.0), v_m2)));
+      v_m3 = vaddq_f64(v_m3, v_m3_term);
+    }
+
+    v_m2 = vaddq_f64(v_m2, v_term1);
+    v_mean = vaddq_f64(v_mean, v_delta_n);
+  }
+
+  double n_arr[2], mean_arr[2], m2_arr[2], m3_arr[2], m4_arr[2];
+  vst1q_f64(n_arr, v_n);
+  vst1q_f64(mean_arr, v_mean);
+  vst1q_f64(m2_arr, v_m2);
+  vst1q_f64(m3_arr, v_m3);
+  vst1q_f64(m4_arr, v_m4);
+
+  Moments moments_arr[2];
+  for (int j = 0; j < 2; j++) {
+    moments_arr[j] = (Moments){(int64_t)round(n_arr[j]), mean_arr[j], m2_arr[j],
+                               m3_arr[j], m4_arr[j]};
+  }
+
+  // Distribute remaining values across chunks
+  for (; i < n; i++) {
+    double val = values[i];
+    if (mask && mask[i])
+      val = NAN;
+    if (skipna && isnan(val))
+      continue;
+    moments_add_value(&moments_arr[i % 2], val, max_moment);
+  }
+
+  return moments_merge(moments_arr[0], moments_arr[1], max_moment);
+}
+#endif
+
+#ifdef __aarch64__
+#  if defined(__clang__)
+#    pragma clang attribute pop
+#  elif defined(__GNUC__)
+#    pragma GCC pop_options
+#  endif
+#endif // ifdef __aarch64__
+
 /* --- Scalar Fallback Implementation --- */
 
 static inline Moments accumulate_moments_scalar_block(const double *values,
@@ -171,7 +303,7 @@ static inline Moments accumulate_moments_scalar_block(const double *values,
   return moments;
 }
 
-/* --- Accumulation Dispatch (Choose AVX2 or Scalar) --- */
+/* --- Accumulation Dispatch (Choose AVX2, NEON or Scalar) --- */
 
 static inline Moments accumulate_moments_dispatch(const double *values,
                                                   int64_t n, int skipna,
@@ -181,10 +313,14 @@ static inline Moments accumulate_moments_dispatch(const double *values,
   if (__builtin_cpu_supports("avx2")) {
     return accumulate_moments_avx2(values, n, skipna, mask, max_moment);
   }
+#endif
+#ifdef PANDAS_HAS_NEON
+  return accumulate_moments_neon(values, n, skipna, mask, max_moment);
 #endif
   return accumulate_moments_scalar_block(values, n, skipna, mask, max_moment);
 }
 #undef PANDAS_HAS_AVX2
+#undef PANDAS_HAS_NEON
 
 /* --- Public API (Orchestrates OpenMP Parallelism) --- */
 

From c508a10a2b5fb25c4232d17362ceeca7479790b4 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Fri, 20 Mar 2026 11:43:27 -0300
Subject: [PATCH 08/35] refactor: use vector extensions

---
 pandas/_libs/src/moments.c | 301 ++++++++++---------------------------
 1 file changed, 78 insertions(+), 223 deletions(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index 91b3fb810ac49..6e72885d6a298 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -19,107 +19,106 @@ static inline int omp_get_thread_num() { return 0; }
 static inline int omp_get_num_threads() { return 1; }
 #endif
 
-/* --- AVX2 SIMD Implementation --- */
+/* --- SIMD Implementation --- */
+#if defined(__clang__)
+typedef double v4d __attribute__((ext_vector_type(4), aligned(1)));
+typedef long long v4si __attribute__((ext_vector_type(4), aligned(1)));
+#  define PANDAS_HAS_SIMD 1
+#  if defined(__x86_64__)
+#    define PANDAS_SIMD_TARGETS                                                \
+      __attribute__((target_clones("avx2", "default")))
+#  else
+#    define PANDAS_SIMD_TARGETS
+#  endif
+#elif defined(__GNUC__)
+typedef double v4d __attribute__((vector_size(32), aligned(1)));
+typedef long long v4si __attribute__((vector_size(32), aligned(1)));
+#  define PANDAS_HAS_SIMD 1
+#  if defined(__x86_64__)
+#    define PANDAS_SIMD_TARGETS                                                \
+      __attribute__((target_clones("avx2", "default")))
+#  else
+#    define PANDAS_SIMD_TARGETS
+#  endif
+#endif
 
-#ifdef __x86_64__
+#ifdef PANDAS_HAS_SIMD
+/* Vector select: returns (mask ? a : b) where mask is all-ones or all-zeros */
 #  if defined(__clang__)
-#    pragma clang attribute push(__attribute__((target("avx2"))),              \
-                                 apply_to = function)
-#    define PANDAS_HAS_AVX2 1
-#  elif defined(__GNUC__)
-#    pragma GCC push_options
-#    pragma GCC target("avx2")
-#    define PANDAS_HAS_AVX2 1
-#  endif // defined(__GNUC__)
-#endif   // ifdef __x86_64__
-
-#ifdef PANDAS_HAS_AVX2
-#  include <immintrin.h>
-
-// Vectorization of [moments_add_value]
-static inline Moments accumulate_moments_avx2(const double *values, int64_t n,
-                                              int skipna, const uint8_t *mask,
-                                              int max_moment) {
-  __m256d v_mean = _mm256_setzero_pd();
-  __m256d v_m2 = _mm256_setzero_pd();
-  __m256d v_m3 = _mm256_setzero_pd();
-  __m256d v_m4 = _mm256_setzero_pd();
+#    define v_select(mask, a, b) ((mask) ? (a) : (b))
+#  else
+// gcc doesn't have ternary operators when compiling C files.
+#    define v_select(mask, a, b)                                               \
+      ((v4d)(((v4si)(mask) & (v4si)(a)) | (~(v4si)(mask) & (v4si)(b))))
+#  endif
+
+PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values,
+                                                    int64_t n, int skipna,
+                                                    const uint8_t *mask,
+                                                    int max_moment) {
+  v4d v_mean = {0.0, 0.0, 0.0, 0.0};
+  v4d v_m2 = {0.0, 0.0, 0.0, 0.0};
+  v4d v_m3 = {0.0, 0.0, 0.0, 0.0};
+  v4d v_m4 = {0.0, 0.0, 0.0, 0.0};
   // Using double for v_n to avoid epi64 -> pd conversions.
   // The results are exact up to n < 2^53.
-  __m256d v_n = _mm256_setzero_pd();
+  v4d v_n = {0.0, 0.0, 0.0, 0.0};
 
-  __m256d v_one = _mm256_set1_pd(1.0);
-  __m256d v_zero = _mm256_setzero_pd();
-  __m256d v_nan = _mm256_set1_pd(NAN);
+  v4d v_one = {1.0, 1.0, 1.0, 1.0};
+  v4d v_zero = {0.0, 0.0, 0.0, 0.0};
+  v4d v_nan = {NAN, NAN, NAN, NAN};
 
   int64_t i = 0;
   for (; i < n - 4; i += 4) {
-    __m256d v_val = _mm256_loadu_pd(values + i);
+    v4d v_val = *(v4d *)(values + i);
 
     if (mask) {
-      uint32_t m32;
-      memcpy(&m32, mask + i, sizeof(uint32_t));
-      __m128i v_mask_bytes = _mm_cvtsi32_si128((int)m32);
-      __m256i v_mask_epi64 = _mm256_cvtepu8_epi64(v_mask_bytes);
-      __m256i v_mask_is_zero =
-          _mm256_cmpeq_epi64(v_mask_epi64, _mm256_setzero_si256());
+      v4d v_m_val = {mask[i + 0] ? 1.0 : 0.0, mask[i + 1] ? 1.0 : 0.0,
+                     mask[i + 2] ? 1.0 : 0.0, mask[i + 3] ? 1.0 : 0.0};
       // replace val with NAN where mask is 1.
-      v_val =
-          _mm256_blendv_pd(v_nan, v_val, _mm256_castsi256_pd(v_mask_is_zero));
+      v_val = v_select(v_m_val == v_one, v_nan, v_val);
     }
 
-    __m256d v_skip_mask =
-        skipna ? _mm256_cmp_pd(v_val, v_val, _CMP_UNORD_Q) : v_zero;
-    // Increment 1 where we do not skip
-    __m256d v_n_increment = _mm256_andnot_pd(v_skip_mask, v_one);
-    v_n = _mm256_add_pd(v_n, v_n_increment);
-    __m256d v_n_nonzero = _mm256_max_pd(v_n, v_one);
+    // skip_mask is 1.0 if we should skip, 0.0 otherwise
+    v4d v_is_nan = v_select(v_val == v_val, v_zero, v_one);
+    v4d v_skip_mask = skipna ? v_is_nan : v_zero;
+
+    // Increment 1 where we do NOT skip
+    v4d v_n_increment = v_one - v_skip_mask;
+    v_n += v_n_increment;
+    v4d v_n_nonzero = v_select(v_n < v_one, v_one, v_n);
 
-    __m256d v_delta = _mm256_sub_pd(v_val, v_mean);
+    v4d v_delta = v_val - v_mean;
 
     // replace delta with zero when skipping to don't update moments
-    v_delta = _mm256_blendv_pd(v_delta, v_zero, v_skip_mask);
-    __m256d v_delta_n = _mm256_div_pd(v_delta, v_n_nonzero);
+    v_delta = v_select(v_skip_mask == v_zero, v_delta, v_zero);
+    v4d v_delta_n = v_delta / v_n_nonzero;
 
-    __m256d v_delta_n2 = _mm256_mul_pd(v_delta, v_delta_n);
-    __m256d v_term1 = _mm256_mul_pd(v_delta_n2, _mm256_sub_pd(v_n, v_one));
+    v4d v_delta_n2 = v_delta * v_delta_n;
+    v4d v_term1 = v_delta_n2 * (v_n - v_one);
 
     if (max_moment >= 4) {
-      __m256d v_n2 = _mm256_mul_pd(v_n, v_n);
+      v4d v_n2 = v_n * v_n;
       // n * n - 3.0 * n + 3.0
-      __m256d v_n_term = _mm256_add_pd(
-          _mm256_sub_pd(v_n2, _mm256_mul_pd(_mm256_set1_pd(3.0), v_n)),
-          _mm256_set1_pd(3.0));
-      __m256d v_m4_update = _mm256_mul_pd(
-          v_delta_n,
-          _mm256_add_pd(
-              _mm256_mul_pd(_mm256_set1_pd(-4.0), v_m3),
-              _mm256_mul_pd(
-                  v_delta_n,
-                  _mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(6.0), v_m2),
-                                _mm256_mul_pd(v_term1, v_n_term)))));
-      v_m4 = _mm256_add_pd(v_m4, v_m4_update);
+      v4d v_n_term = (v_n2 - (3.0 * v_n)) + 3.0;
+      v_m4 += v_delta_n * ((-4.0 * v_m3) +
+                           (v_delta_n * ((6.0 * v_m2) + (v_term1 * v_n_term))));
     }
 
     if (max_moment >= 3) {
-      __m256d v_m3_term = _mm256_mul_pd(
-          v_delta_n,
-          _mm256_sub_pd(
-              _mm256_mul_pd(v_term1, _mm256_sub_pd(v_n, _mm256_set1_pd(2.0))),
-              _mm256_mul_pd(_mm256_set1_pd(3.0), v_m2)));
-      v_m3 = _mm256_add_pd(v_m3, v_m3_term);
+      v_m3 += v_delta_n * ((v_term1 * (v_n - 2.0)) - (3.0 * v_m2));
     }
 
-    v_m2 = _mm256_add_pd(v_m2, v_term1);
-    v_mean = _mm256_add_pd(v_mean, v_delta_n);
+    v_m2 += v_term1;
+    v_mean += v_delta_n;
   }
 
   double n_arr[4], mean_arr[4], m2_arr[4], m3_arr[4], m4_arr[4];
-  _mm256_storeu_pd(n_arr, v_n);
-  _mm256_storeu_pd(mean_arr, v_mean);
-  _mm256_storeu_pd(m2_arr, v_m2);
-  _mm256_storeu_pd(m3_arr, v_m3);
-  _mm256_storeu_pd(m4_arr, v_m4);
+  *(v4d *)n_arr = v_n;
+  *(v4d *)mean_arr = v_mean;
+  *(v4d *)m2_arr = v_m2;
+  *(v4d *)m3_arr = v_m3;
+  *(v4d *)m4_arr = v_m4;
 
   Moments moments_arr[4];
   for (int j = 0; j < 4; j++) {
@@ -145,146 +144,6 @@ static inline Moments accumulate_moments_avx2(const double *values, int64_t n,
 }
 #endif
 
-#ifdef __x86_64__
-#  if defined(__clang__)
-#    pragma clang attribute pop
-#  elif defined(__GNUC__)
-#    pragma GCC pop_options
-#  endif
-#endif // ifdef __x86_64__
-
-/* --- NEON SIMD Implementation --- */
-
-#ifdef __aarch64__
-#  if defined(__clang__)
-#    pragma clang attribute push(__attribute__((target("simd"))),              \
-                                 apply_to = function)
-#    define PANDAS_HAS_NEON 1
-#  elif defined(__GNUC__)
-#    pragma GCC push_options
-#    pragma GCC target("+simd")
-#    define PANDAS_HAS_NEON 1
-#  endif
-#endif
-
-#ifdef PANDAS_HAS_NEON
-#  include <arm_neon.h>
-
-// Vectorization of [moments_add_value]
-static inline Moments accumulate_moments_neon(const double *values, int64_t n,
-                                              int skipna, const uint8_t *mask,
-                                              int max_moment) {
-  float64x2_t v_mean = vdupq_n_f64(0.0);
-  float64x2_t v_m2 = vdupq_n_f64(0.0);
-  float64x2_t v_m3 = vdupq_n_f64(0.0);
-  float64x2_t v_m4 = vdupq_n_f64(0.0);
-  // Using double for v_n to avoid conversions.
-  float64x2_t v_n = vdupq_n_f64(0.0);
-
-  float64x2_t v_one = vdupq_n_f64(1.0);
-  float64x2_t v_zero = vdupq_n_f64(0.0);
-  float64x2_t v_nan = vdupq_n_f64(NAN);
-
-  int64_t i = 0;
-  for (; i < n - 2; i += 2) {
-    float64x2_t v_val = vld1q_f64(values + i);
-
-    uint64x2_t v_keep_mask = vdupq_n_u64(-1ULL);
-
-    if (mask) {
-      uint16_t m16;
-      memcpy(&m16, mask + i, sizeof(uint16_t));
-      uint8x8_t v_mask_bytes = vreinterpret_u8_u16(vdup_n_u16(m16));
-      // Expand 2 bytes to 2x 64-bit.
-      uint16x8_t v_m16 = vmovl_u8(v_mask_bytes);
-      uint32x4_t v_m32 = vmovl_u16(vget_low_u16(v_m16));
-      uint64x2_t v_m64 = vmovl_u32(vget_low_u32(v_m32));
-
-      uint64x2_t v_mask_is_zero = vceqq_u64(v_m64, vdupq_n_u64(0));
-      // replace val with NAN where mask is != 0.
-      v_val = vbslq_f64(v_mask_is_zero, v_val, v_nan);
-    }
-
-    if (skipna) {
-      uint64x2_t v_is_not_nan = vceqq_f64(v_val, v_val);
-      v_keep_mask = vandq_u64(v_keep_mask, v_is_not_nan);
-    }
-
-    // Increment 1 where we do not skip
-    float64x2_t v_n_increment = vbslq_f64(v_keep_mask, v_one, v_zero);
-    v_n = vaddq_f64(v_n, v_n_increment);
-    float64x2_t v_n_nonzero = vmaxq_f64(v_n, v_one);
-
-    float64x2_t v_delta = vsubq_f64(v_val, v_mean);
-
-    // replace delta with zero when skipping to don't update moments
-    v_delta = vbslq_f64(v_keep_mask, v_delta, v_zero);
-    float64x2_t v_delta_n = vdivq_f64(v_delta, v_n_nonzero);
-
-    float64x2_t v_delta_n2 = vmulq_f64(v_delta, v_delta_n);
-    float64x2_t v_term1 = vmulq_f64(v_delta_n2, vsubq_f64(v_n, v_one));
-
-    if (max_moment >= 4) {
-      float64x2_t v_n2 = vmulq_f64(v_n, v_n);
-      // n * n - 3.0 * n + 3.0
-      float64x2_t v_n_term = vaddq_f64(
-          vsubq_f64(v_n2, vmulq_f64(vdupq_n_f64(3.0), v_n)), vdupq_n_f64(3.0));
-      float64x2_t v_m4_update = vmulq_f64(
-          v_delta_n,
-          vaddq_f64(
-              vmulq_f64(vdupq_n_f64(-4.0), v_m3),
-              vmulq_f64(v_delta_n, vaddq_f64(vmulq_f64(vdupq_n_f64(6.0), v_m2),
-                                             vmulq_f64(v_term1, v_n_term)))));
-      v_m4 = vaddq_f64(v_m4, v_m4_update);
-    }
-
-    if (max_moment >= 3) {
-      float64x2_t v_m3_term = vmulq_f64(
-          v_delta_n,
-          vsubq_f64(vmulq_f64(v_term1, vsubq_f64(v_n, vdupq_n_f64(2.0))),
-                    vmulq_f64(vdupq_n_f64(3.0), v_m2)));
-      v_m3 = vaddq_f64(v_m3, v_m3_term);
-    }
-
-    v_m2 = vaddq_f64(v_m2, v_term1);
-    v_mean = vaddq_f64(v_mean, v_delta_n);
-  }
-
-  double n_arr[2], mean_arr[2], m2_arr[2], m3_arr[2], m4_arr[2];
-  vst1q_f64(n_arr, v_n);
-  vst1q_f64(mean_arr, v_mean);
-  vst1q_f64(m2_arr, v_m2);
-  vst1q_f64(m3_arr, v_m3);
-  vst1q_f64(m4_arr, v_m4);
-
-  Moments moments_arr[2];
-  for (int j = 0; j < 2; j++) {
-    moments_arr[j] = (Moments){(int64_t)round(n_arr[j]), mean_arr[j], m2_arr[j],
-                               m3_arr[j], m4_arr[j]};
-  }
-
-  // Distribute remaining values across chunks
-  for (; i < n; i++) {
-    double val = values[i];
-    if (mask && mask[i])
-      val = NAN;
-    if (skipna && isnan(val))
-      continue;
-    moments_add_value(&moments_arr[i % 2], val, max_moment);
-  }
-
-  return moments_merge(moments_arr[0], moments_arr[1], max_moment);
-}
-#endif
-
-#ifdef __aarch64__
-#  if defined(__clang__)
-#    pragma clang attribute pop
-#  elif defined(__GNUC__)
-#    pragma GCC pop_options
-#  endif
-#endif // ifdef __aarch64__
-
 /* --- Scalar Fallback Implementation --- */
 
 static inline Moments accumulate_moments_scalar_block(const double *values,
@@ -303,24 +162,20 @@ static inline Moments accumulate_moments_scalar_block(const double *values,
   return moments;
 }
 
-/* --- Accumulation Dispatch (Choose AVX2, NEON or Scalar) --- */
+/* --- Accumulation Dispatch (Choose SIMD or Scalar) --- */
 
 static inline Moments accumulate_moments_dispatch(const double *values,
                                                   int64_t n, int skipna,
                                                   const uint8_t *mask,
                                                   int max_moment) {
-#if defined(PANDAS_HAS_AVX2) && (defined(__GNUC__) || defined(__clang__))
-  if (__builtin_cpu_supports("avx2")) {
-    return accumulate_moments_avx2(values, n, skipna, mask, max_moment);
-  }
-#endif
-#ifdef PANDAS_HAS_NEON
-  return accumulate_moments_neon(values, n, skipna, mask, max_moment);
+#if defined(PANDAS_HAS_SIMD)
+  return accumulate_moments_simd(values, n, skipna, mask, max_moment);
 #endif
   return accumulate_moments_scalar_block(values, n, skipna, mask, max_moment);
 }
-#undef PANDAS_HAS_AVX2
-#undef PANDAS_HAS_NEON
+#undef PANDAS_HAS_SIMD
+#undef PANDAS_SIMD_TARGETS
+#undef v_select
 
 /* --- Public API (Orchestrates OpenMP Parallelism) --- */
 

From e35e33785410340cc25519a221964698ac6a2a9f Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Fri, 20 Mar 2026 14:44:14 -0300
Subject: [PATCH 09/35] fix: fix musl build

---
 pandas/_libs/src/moments.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index 6e72885d6a298..04dda82f2053a 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -19,27 +19,25 @@ static inline int omp_get_thread_num() { return 0; }
 static inline int omp_get_num_threads() { return 1; }
 #endif
 
+#ifndef __has_attribute
+#  define __has_attribute(x) 0
+#endif
+
+#if defined(__x86_64__) && __has_attribute(target_clones) && defined(__GLIBC__)
+#  define PANDAS_SIMD_TARGETS __attribute__((target_clones("avx2", "default")))
+#else
+#  define PANDAS_SIMD_TARGETS
+#endif
+
 /* --- SIMD Implementation --- */
 #if defined(__clang__)
 typedef double v4d __attribute__((ext_vector_type(4), aligned(1)));
 typedef long long v4si __attribute__((ext_vector_type(4), aligned(1)));
 #  define PANDAS_HAS_SIMD 1
-#  if defined(__x86_64__)
-#    define PANDAS_SIMD_TARGETS                                                \
-      __attribute__((target_clones("avx2", "default")))
-#  else
-#    define PANDAS_SIMD_TARGETS
-#  endif
 #elif defined(__GNUC__)
 typedef double v4d __attribute__((vector_size(32), aligned(1)));
 typedef long long v4si __attribute__((vector_size(32), aligned(1)));
 #  define PANDAS_HAS_SIMD 1
-#  if defined(__x86_64__)
-#    define PANDAS_SIMD_TARGETS                                                \
-      __attribute__((target_clones("avx2", "default")))
-#  else
-#    define PANDAS_SIMD_TARGETS
-#  endif
 #endif
 
 #ifdef PANDAS_HAS_SIMD

From 7c5b6e955f55836e9de5448d4a800ca27ae6eb9c Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Fri, 20 Mar 2026 14:48:37 -0300
Subject: [PATCH 10/35] fix: remove unnecessary undef

---
 pandas/_libs/src/moments.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index 04dda82f2053a..d98896a50ee80 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -171,9 +171,6 @@ static inline Moments accumulate_moments_dispatch(const double *values,
 #endif
   return accumulate_moments_scalar_block(values, n, skipna, mask, max_moment);
 }
-#undef PANDAS_HAS_SIMD
-#undef PANDAS_SIMD_TARGETS
-#undef v_select
 
 /* --- Public API (Orchestrates OpenMP Parallelism) --- */
 

From 39fcf5f9849de81d96bf1d77670274b50a542606 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Fri, 20 Mar 2026 15:22:21 -0300
Subject: [PATCH 11/35] chore: remove unused include

---
 pandas/_libs/src/moments.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index d98896a50ee80..4e436abfacaf5 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -10,7 +10,6 @@ The full license is in the LICENSE file, distributed with this software.
 #include "pandas/moments.h"
 #include <math.h>
 #include <stdint.h>
-#include <string.h>
 
 #ifdef _OPENMP
 #  include <omp.h>

From 040656512d230ae25e2c311214acef199d26b15e Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Fri, 20 Mar 2026 15:26:58 -0300
Subject: [PATCH 12/35] fix: check for vector attribute

---
 pandas/_libs/src/moments.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index 4e436abfacaf5..5bd81479b40a8 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -29,11 +29,11 @@ static inline int omp_get_num_threads() { return 1; }
 #endif
 
 /* --- SIMD Implementation --- */
-#if defined(__clang__)
+#if defined(__clang__) && __has_attribute(ext_vector_type)
 typedef double v4d __attribute__((ext_vector_type(4), aligned(1)));
 typedef long long v4si __attribute__((ext_vector_type(4), aligned(1)));
 #  define PANDAS_HAS_SIMD 1
-#elif defined(__GNUC__)
+#elif defined(__GNUC__) && __has_attribute(vector_size)
 typedef double v4d __attribute__((vector_size(32), aligned(1)));
 typedef long long v4si __attribute__((vector_size(32), aligned(1)));
 #  define PANDAS_HAS_SIMD 1

From bc951a40a54de0363faa8b2447f96459a2eba1c0 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Fri, 20 Mar 2026 15:30:04 -0300
Subject: [PATCH 13/35] fix: improve typedef for different platforms

---
 pandas/_libs/src/moments.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index 5bd81479b40a8..ea5738f742e8b 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -34,8 +34,9 @@ typedef double v4d __attribute__((ext_vector_type(4), aligned(1)));
 typedef long long v4si __attribute__((ext_vector_type(4), aligned(1)));
 #  define PANDAS_HAS_SIMD 1
 #elif defined(__GNUC__) && __has_attribute(vector_size)
-typedef double v4d __attribute__((vector_size(32), aligned(1)));
-typedef long long v4si __attribute__((vector_size(32), aligned(1)));
+typedef double v4d __attribute__((vector_size(4 * sizeof(double)), aligned(1)));
+typedef long long v4si
+    __attribute__((vector_size(4 * sizeof(long long)), aligned(1)));
 #  define PANDAS_HAS_SIMD 1
 #endif
 

From c2c832af9ef88f83b77030bc93866f7e72137a8a Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Fri, 20 Mar 2026 15:37:40 -0300
Subject: [PATCH 14/35] chore: clarify vectorized in gcc and undefine v_select

---
 pandas/_libs/src/moments.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index ea5738f742e8b..a0723f87bbbb1 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -45,7 +45,7 @@ typedef long long v4si
 #  if defined(__clang__)
 #    define v_select(mask, a, b) ((mask) ? (a) : (b))
 #  else
-// gcc doesn't have ternary operators when compiling C files.
+// gcc doesn't have vectorized ternary operators when compiling C files.
 #    define v_select(mask, a, b)                                               \
       ((v4d)(((v4si)(mask) & (v4si)(a)) | (~(v4si)(mask) & (v4si)(b))))
 #  endif
@@ -140,6 +140,7 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values,
 
   return moments_merge(m_01, m_23, max_moment);
 }
+#  undef v_select
 #endif
 
 /* --- Scalar Fallback Implementation --- */

From 1d7e1085dfc5c7070fa0ef7e62cfbcc9065aea78 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Fri, 20 Mar 2026 17:31:39 -0300
Subject: [PATCH 15/35] perf: vectorize mask handling

---
 pandas/_libs/src/moments.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index a0723f87bbbb1..ce0869c6d9c03 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -32,11 +32,14 @@ static inline int omp_get_num_threads() { return 1; }
 #if defined(__clang__) && __has_attribute(ext_vector_type)
 typedef double v4d __attribute__((ext_vector_type(4), aligned(1)));
 typedef long long v4si __attribute__((ext_vector_type(4), aligned(1)));
+typedef uint8_t v4u8 __attribute__((ext_vector_type(4), aligned(1)));
 #  define PANDAS_HAS_SIMD 1
 #elif defined(__GNUC__) && __has_attribute(vector_size)
 typedef double v4d __attribute__((vector_size(4 * sizeof(double)), aligned(1)));
 typedef long long v4si
     __attribute__((vector_size(4 * sizeof(long long)), aligned(1)));
+typedef uint8_t v4u8
+    __attribute__((vector_size(4 * sizeof(uint8_t)), aligned(1)));
 #  define PANDAS_HAS_SIMD 1
 #endif
 
@@ -71,10 +74,10 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values,
     v4d v_val = *(v4d *)(values + i);
 
     if (mask) {
-      v4d v_m_val = {mask[i + 0] ? 1.0 : 0.0, mask[i + 1] ? 1.0 : 0.0,
-                     mask[i + 2] ? 1.0 : 0.0, mask[i + 3] ? 1.0 : 0.0};
+      v4u8 mask_vec = *(v4u8 *)(mask + i);
+      v4si mask_si = __builtin_convertvector(mask_vec, v4si);
       // replace val with NAN where mask is 1.
-      v_val = v_select(v_m_val == v_one, v_nan, v_val);
+      v_val = v_select(mask_si != 0, v_nan, v_val);
     }
 
     // skip_mask is 1.0 if we should skip, 0.0 otherwise

From 40fd15349620212091ce2a0160148c2d2889c61d Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Fri, 20 Mar 2026 17:34:40 -0300
Subject: [PATCH 16/35] chore: remove inlines in favor of compiler discretion

---
 pandas/_libs/src/moments.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index ce0869c6d9c03..cb4d5d832b196 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -148,10 +148,9 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values,
 
 /* --- Scalar Fallback Implementation --- */
 
-static inline Moments accumulate_moments_scalar_block(const double *values,
-                                                      int64_t n, int skipna,
-                                                      const uint8_t *mask,
-                                                      int max_moment) {
+Moments accumulate_moments_scalar_block(const double *values, int64_t n,
+                                        int skipna, const uint8_t *mask,
+                                        int max_moment) {
   Moments moments = {0};
   for (int64_t i = 0; i < n; i++) {
     double val = values[i];
@@ -166,10 +165,8 @@ static inline Moments accumulate_moments_scalar_block(const double *values,
 
 /* --- Accumulation Dispatch (Choose SIMD or Scalar) --- */
 
-static inline Moments accumulate_moments_dispatch(const double *values,
-                                                  int64_t n, int skipna,
-                                                  const uint8_t *mask,
-                                                  int max_moment) {
+Moments accumulate_moments_dispatch(const double *values, int64_t n, int skipna,
+                                    const uint8_t *mask, int max_moment) {
 #if defined(PANDAS_HAS_SIMD)
   return accumulate_moments_simd(values, n, skipna, mask, max_moment);
 #endif

From 7d3f2860c9d01f0248b209c9e1f23510a4b3f356 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Fri, 20 Mar 2026 18:03:30 -0300
Subject: [PATCH 17/35] chore: comments

---
 pandas/_libs/src/moments.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index cb4d5d832b196..0c7ef0d17b85b 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -173,15 +173,13 @@ Moments accumulate_moments_dispatch(const double *values, int64_t n, int skipna,
   return accumulate_moments_scalar_block(values, n, skipna, mask, max_moment);
 }
 
-/* --- Public API (Orchestrates OpenMP Parallelism) --- */
+/* --- Moments 1D Accumulator Implementation --- */
 
 Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna,
                                   const uint8_t *mask, int max_moment) {
   Moments result = {0};
 
 #ifdef _OPENMP
-  // parallel threshold chosen based on `perf report` where libgomp wasn't the
-  // overhead
 #  pragma omp parallel if (n > 10000)
 #endif
   {

From 6f9224e5b00e691c6db10b2f823748b6fb08be95 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Sat, 21 Mar 2026 00:20:02 -0300
Subject: [PATCH 18/35] fix: fix loop bounds and avoid unnecessary round

---
 pandas/_libs/src/moments.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index 0c7ef0d17b85b..e9ec4f84fccfe 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -70,7 +70,7 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values,
   v4d v_nan = {NAN, NAN, NAN, NAN};
 
   int64_t i = 0;
-  for (; i < n - 4; i += 4) {
+  for (; i < n - 3; i += 4) {
     v4d v_val = *(v4d *)(values + i);
 
     if (mask) {
@@ -123,7 +123,7 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values,
 
   Moments moments_arr[4];
   for (int j = 0; j < 4; j++) {
-    moments_arr[j] = (Moments){(int64_t)round(n_arr[j]), mean_arr[j], m2_arr[j],
+    moments_arr[j] = (Moments){(int64_t)n_arr[j], mean_arr[j], m2_arr[j],
                                m3_arr[j], m4_arr[j]};
   }
 

From f3ba6abb201628ccd69fc781bcc874eed3c5223b Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Sat, 21 Mar 2026 17:33:22 -0300
Subject: [PATCH 19/35] chore: add comments after endif

---
 pandas/_libs/src/moments.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index e9ec4f84fccfe..2f69da965c24a 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -14,19 +14,19 @@ The full license is in the LICENSE file, distributed with this software.
 #ifdef _OPENMP
 #  include <omp.h>
 #else
-static inline int omp_get_thread_num() { return 0; }
-static inline int omp_get_num_threads() { return 1; }
-#endif
+static inline int omp_get_thread_num(void) { return 0; }
+static inline int omp_get_num_threads(void) { return 1; }
+#endif // _OPENMP
 
 #ifndef __has_attribute
 #  define __has_attribute(x) 0
-#endif
+#endif // __has_attribute
 
 #if defined(__x86_64__) && __has_attribute(target_clones) && defined(__GLIBC__)
 #  define PANDAS_SIMD_TARGETS __attribute__((target_clones("avx2", "default")))
 #else
 #  define PANDAS_SIMD_TARGETS
-#endif
+#endif // x86_64 + glibc + target_clones
 
 /* --- SIMD Implementation --- */
 #if defined(__clang__) && __has_attribute(ext_vector_type)
@@ -41,7 +41,7 @@ typedef long long v4si
 typedef uint8_t v4u8
     __attribute__((vector_size(4 * sizeof(uint8_t)), aligned(1)));
 #  define PANDAS_HAS_SIMD 1
-#endif
+#endif // __clang__ or __GNUC__ with vector extension
 
 #ifdef PANDAS_HAS_SIMD
 /* Vector select: returns (mask ? a : b) where mask is all-ones or all-zeros */
@@ -51,7 +51,7 @@ typedef uint8_t v4u8
 // gcc doesn't have vectorized ternary operators when compiling C files.
 #    define v_select(mask, a, b)                                               \
       ((v4d)(((v4si)(mask) & (v4si)(a)) | (~(v4si)(mask) & (v4si)(b))))
-#  endif
+#  endif // defined(__clang__)
 
 PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values,
                                                     int64_t n, int skipna,
@@ -144,7 +144,7 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values,
   return moments_merge(m_01, m_23, max_moment);
 }
 #  undef v_select
-#endif
+#endif // PANDAS_HAS_SIMD
 
 /* --- Scalar Fallback Implementation --- */
 
@@ -169,7 +169,7 @@ Moments accumulate_moments_dispatch(const double *values, int64_t n, int skipna,
                                     const uint8_t *mask, int max_moment) {
 #if defined(PANDAS_HAS_SIMD)
   return accumulate_moments_simd(values, n, skipna, mask, max_moment);
-#endif
+#endif // defined(PANDAS_HAS_SIMD)
   return accumulate_moments_scalar_block(values, n, skipna, mask, max_moment);
 }
 

From 18566d2fad50cfce7cdf64cca75555dde4572d09 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Sat, 21 Mar 2026 23:48:27 -0300
Subject: [PATCH 20/35] refactor: use `size_t` instead of signed integer for
 array size

---
 pandas/_libs/algos.pxd                |  2 +-
 pandas/_libs/algos.pyx                |  4 ++--
 pandas/_libs/include/pandas/moments.h |  2 +-
 pandas/_libs/src/moments.c            | 22 +++++++++++-----------
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd
index 4d80831b603c4..8ae078f4363de 100644
--- a/pandas/_libs/algos.pxd
+++ b/pandas/_libs/algos.pxd
@@ -40,7 +40,7 @@ cdef extern from "pandas/moments.h":
 
     Moments accumulate_moments_scalar(
         const float64_t* values,
-        int64_t n,
+        size_t n,
         bint skipna,
         const uint8_t* mask,
         int max_moment,
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 2e4585566c486..f25df20c63f76 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -1645,7 +1645,7 @@ def scalar_skew(
 ) -> float:
     cdef:
         Moments moments
-        Py_ssize_t n = values.shape[0]
+        size_t n = <size_t>values.shape[0]
         const float64_t* p_values = &values[0]
         const uint8_t* p_mask = &mask[0] if mask is not None else NULL
 
@@ -1664,7 +1664,7 @@ def scalar_kurt(
 ) -> float:
     cdef:
         Moments moments
-        Py_ssize_t n = values.shape[0]
+        size_t n = <size_t>values.shape[0]
         const float64_t* p_values = &values[0]
         const uint8_t* p_mask = &mask[0] if mask is not None else NULL
 
diff --git a/pandas/_libs/include/pandas/moments.h b/pandas/_libs/include/pandas/moments.h
index e492e8200c82b..166ace232fb5a 100644
--- a/pandas/_libs/include/pandas/moments.h
+++ b/pandas/_libs/include/pandas/moments.h
@@ -86,7 +86,7 @@ static inline Moments moments_merge(Moments a, Moments b, int max_moment) {
 /// Compute central moments until `max_moment` using `n` elements from `values`.
 /// The size is represented as signed integer for MSVC compatibility with
 /// OpenMP.
-Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna,
+Moments accumulate_moments_scalar(const double *values, size_t n, int skipna,
                                   const uint8_t *mask, int max_moment);
 
 static inline double calc_skew(Moments moments) {
diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index 2f69da965c24a..2276db43dc600 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -54,7 +54,7 @@ typedef uint8_t v4u8
 #  endif // defined(__clang__)
 
 PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values,
-                                                    int64_t n, int skipna,
+                                                    size_t n, int skipna,
                                                     const uint8_t *mask,
                                                     int max_moment) {
   v4d v_mean = {0.0, 0.0, 0.0, 0.0};
@@ -69,8 +69,8 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values,
   v4d v_zero = {0.0, 0.0, 0.0, 0.0};
   v4d v_nan = {NAN, NAN, NAN, NAN};
 
-  int64_t i = 0;
-  for (; i < n - 3; i += 4) {
+  size_t i = 0;
+  for (; i + 3 < n; i += 4) {
     v4d v_val = *(v4d *)(values + i);
 
     if (mask) {
@@ -148,11 +148,11 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values,
 
 /* --- Scalar Fallback Implementation --- */
 
-Moments accumulate_moments_scalar_block(const double *values, int64_t n,
+Moments accumulate_moments_scalar_block(const double *values, size_t n,
                                         int skipna, const uint8_t *mask,
                                         int max_moment) {
   Moments moments = {0};
-  for (int64_t i = 0; i < n; i++) {
+  for (size_t i = 0; i < n; i++) {
     double val = values[i];
     if (mask && mask[i])
       val = NAN;
@@ -165,7 +165,7 @@ Moments accumulate_moments_scalar_block(const double *values, int64_t n,
 
 /* --- Accumulation Dispatch (Choose SIMD or Scalar) --- */
 
-Moments accumulate_moments_dispatch(const double *values, int64_t n, int skipna,
+Moments accumulate_moments_dispatch(const double *values, size_t n, int skipna,
                                     const uint8_t *mask, int max_moment) {
 #if defined(PANDAS_HAS_SIMD)
   return accumulate_moments_simd(values, n, skipna, mask, max_moment);
@@ -175,7 +175,7 @@ Moments accumulate_moments_dispatch(const double *values, int64_t n, int skipna,
 
 /* --- Moments 1D Accumulator Implementation --- */
 
-Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna,
+Moments accumulate_moments_scalar(const double *values, size_t n, int skipna,
                                   const uint8_t *mask, int max_moment) {
   Moments result = {0};
 
@@ -183,11 +183,11 @@ Moments accumulate_moments_scalar(const double *values, int64_t n, int skipna,
 #  pragma omp parallel if (n > 10000)
 #endif
   {
-    int tid = omp_get_thread_num();
-    int num_threads = omp_get_num_threads();
+    size_t tid = (size_t)omp_get_thread_num();
+    size_t num_threads = (size_t)omp_get_num_threads();
 
-    int64_t start = (tid * n) / num_threads;
-    int64_t end = tid == num_threads - 1 ? n : ((tid + 1) * n) / num_threads;
+    size_t start = (tid * n) / num_threads;
+    size_t end = tid == num_threads - 1 ? n : ((tid + 1) * n) / num_threads;
 
     Moments moments_local =
         accumulate_moments_dispatch(values + start, end - start, skipna,

From 9254c3c24df0ae5e1c7beafe45d65f576779b465 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Sun, 22 Mar 2026 00:01:54 -0300
Subject: [PATCH 21/35] fix: cast type to size_t to avoid conversion warnings

---
 pandas/_libs/algos.pyx   | 4 ++--
 pandas/_libs/groupby.pyx | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index f25df20c63f76..839bf31904a87 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -1685,7 +1685,7 @@ def axis_skew(
     cdef:
         Py_ssize_t i
         Py_ssize_t nouter = values.shape[1] if axis == 0 else values.shape[0]
-        Moments *moments = <Moments *>calloc(nouter, sizeof(Moments))
+        Moments *moments = <Moments *>calloc(<size_t>nouter, sizeof(Moments))
         ndarray result_arr = np.empty(nouter, dtype=np.float64)
         float64_t[:] result = result_arr
 
@@ -1712,7 +1712,7 @@ def axis_kurt(
     cdef:
         Py_ssize_t i
         Py_ssize_t nouter = values.shape[1] if axis == 0 else values.shape[0]
-        Moments *moments = <Moments *>calloc(nouter, sizeof(Moments))
+        Moments *moments = <Moments *>calloc(<size_t>nouter, sizeof(Moments))
         ndarray result_arr = np.empty(nouter, dtype=np.float64)
         float64_t[:] result = result_arr
 
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 14b8320e7b044..d6529623e41de 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1034,7 +1034,7 @@ def group_skew(
         raise ValueError("len(index) != len(labels)")
 
     N, K = (<object>values).shape
-    ms = <Moments *>calloc(ngroups * K, sizeof(Moments))
+    ms = <Moments *>calloc(<size_t>ngroups * <size_t>K, sizeof(Moments))
     if ms is NULL:
         raise MemoryError()
 
@@ -1092,7 +1092,7 @@ def group_kurt(
         raise ValueError("len(index) != len(labels)")
 
     N, K = (<object>values).shape
-    ms = <Moments *>calloc(ngroups * K, sizeof(Moments))
+    ms = <Moments *>calloc(<size_t>ngroups * <size_t>K, sizeof(Moments))
     if ms is NULL:
         raise MemoryError()
 

From 933db467972fb3b4d4b246a84aabcdc3c369fbbc Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Sun, 22 Mar 2026 00:19:51 -0300
Subject: [PATCH 22/35] add note about performance

---
 pandas/_libs/algos.pyx | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 839bf31904a87..af06f89d36849 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -1626,7 +1626,13 @@ cdef void accumulate_moments_axis(
         nouter = nrows
         ninner = ncols
 
+    # PERF: The outer dimension is embarrassingly parallel
     for i in range(nouter):
+        # PERF: If it's possible to enforce contiguity in the last dimension,
+        # it's possible to use accumulate_moments_scalar with SIMD instructions.
+        # It may create more threads if OMP_NESTED is enabled and the outer loop
+        # is run in parallel.
+        # If it doesn't run in parallel, it will at least use SIMD instructions.
         for j in range(ninner):
             val = values[j, i] if axis == 0 else values[i, j]
             if uses_mask and (mask[j, i] if axis == 0 else mask[i, j]):

From 5acaa4c498162e778cf5a80d1a34bdbdf14b4401 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Sun, 22 Mar 2026 12:08:45 -0300
Subject: [PATCH 23/35] feat: make OpenMP optional and disabled by default

---
 doc/source/development/contributing_environment.rst | 3 ++-
 meson_options.txt                                   | 1 +
 pandas/_libs/meson.build                            | 7 ++++++-
 3 files changed, 9 insertions(+), 2 deletions(-)
 create mode 100644 meson_options.txt

diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst
index ce0430bd9e876..b5034dee446ca 100644
--- a/doc/source/development/contributing_environment.rst
+++ b/doc/source/development/contributing_environment.rst
@@ -169,12 +169,13 @@ To compile and install pandas in editable mode, run:
 
    python -m pip install --verbose --editable . --no-build-isolation
 
-Additional Meson options can be passed to the ``pip install`` command to modify the installation.
+Additional Meson and Pandas options can be passed to the ``pip install`` command to modify the installation.
 Helpful options include:
 
 * ``-Ceditable-verbose=true``: Print verbose logs during rebuild, even during ``import``.
 * ``-Cbuilddir="your builddir here"``: Specify a different build directory for the C extensions.
 * ``-Csetup-args="-Ddebug=true"``: Compile the C extensions with debug symbols.
+* ``-Csetup-args=-Duse_openmp=true``: Compile Pandas with parallelization support through OpenMP.
 
 .. note::
    When pandas is installed in ``--editable`` mode, pandas will automatically rebuild the library upon ``import``,
diff --git a/meson_options.txt b/meson_options.txt
new file mode 100644
index 0000000000000..536c1cadeb718
--- /dev/null
+++ b/meson_options.txt
@@ -0,0 +1 @@
+option('use_openmp', type: 'boolean', value: false, description: 'Whether to enable OpenMP support')
diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build
index 2c2122613d3e3..694f792fdb72d 100644
--- a/pandas/_libs/meson.build
+++ b/pandas/_libs/meson.build
@@ -55,7 +55,12 @@ fast_float = subproject('fast_float')
 fast_float_dep = fast_float.get_variable('fast_float_dep')
 
 subdir('tslibs')
-openmp_dep = dependency('OpenMP', required: false)
+
+if get_option('use_openmp')
+    openmp_dep = dependency('OpenMP', required: true)
+else
+    openmp_dep = declare_dependency()
+endif
 
 libs_sources = {
     # Dict of extension name -> dict of {sources, include_dirs, and deps}

From d62ac23899b2736cc42f73411b8762f25b6433f6 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Sun, 22 Mar 2026 19:12:12 -0300
Subject: [PATCH 24/35] refactor: use feature instead of boolean for openmp

---
 doc/source/development/contributing_environment.rst | 2 +-
 meson_options.txt                                   | 2 +-
 pandas/_libs/meson.build                            | 6 +-----
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst
index b5034dee446ca..012304b3da3fc 100644
--- a/doc/source/development/contributing_environment.rst
+++ b/doc/source/development/contributing_environment.rst
@@ -175,7 +175,7 @@ Helpful options include:
 * ``-Ceditable-verbose=true``: Print verbose logs during rebuild, even during ``import``.
 * ``-Cbuilddir="your builddir here"``: Specify a different build directory for the C extensions.
 * ``-Csetup-args="-Ddebug=true"``: Compile the C extensions with debug symbols.
-* ``-Csetup-args=-Duse_openmp=true``: Compile Pandas with parallelization support through OpenMP.
+* ``-Csetup-args=-Dopenmp=enabled``: Compile Pandas with parallelization support through OpenMP.
 
 .. note::
    When pandas is installed in ``--editable`` mode, pandas will automatically rebuild the library upon ``import``,
diff --git a/meson_options.txt b/meson_options.txt
index 536c1cadeb718..128bb55b2f3f6 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -1 +1 @@
-option('use_openmp', type: 'boolean', value: false, description: 'Whether to enable OpenMP support')
+option('openmp', type: 'feature', value: 'disabled', description: 'Parallelization with OpenMP')
diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build
index 694f792fdb72d..e4db9972dd44d 100644
--- a/pandas/_libs/meson.build
+++ b/pandas/_libs/meson.build
@@ -56,11 +56,7 @@ fast_float_dep = fast_float.get_variable('fast_float_dep')
 
 subdir('tslibs')
 
-if get_option('use_openmp')
-    openmp_dep = dependency('OpenMP', required: true)
-else
-    openmp_dep = declare_dependency()
-endif
+openmp_dep = dependency('OpenMP', required: get_option('openmp'))
 
 libs_sources = {
     # Dict of extension name -> dict of {sources, include_dirs, and deps}

From bc43aaad0d4d75e8425969e049ae1b8abde1c0e0 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Mon, 23 Mar 2026 17:57:34 -0300
Subject: [PATCH 25/35] perf: use OpenMP automatically

---
 doc/source/development/contributing_environment.rst | 2 +-
 meson_options.txt                                   | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst
index 012304b3da3fc..b814e1ca96020 100644
--- a/doc/source/development/contributing_environment.rst
+++ b/doc/source/development/contributing_environment.rst
@@ -175,7 +175,7 @@ Helpful options include:
 * ``-Ceditable-verbose=true``: Print verbose logs during rebuild, even during ``import``.
 * ``-Cbuilddir="your builddir here"``: Specify a different build directory for the C extensions.
 * ``-Csetup-args="-Ddebug=true"``: Compile the C extensions with debug symbols.
-* ``-Csetup-args=-Dopenmp=enabled``: Compile Pandas with parallelization support through OpenMP.
+* ``-Csetup-args=-Dopenmp=disabled``: Compile Pandas without parallelization support through OpenMP.
 
 .. note::
    When pandas is installed in ``--editable`` mode, pandas will automatically rebuild the library upon ``import``,
diff --git a/meson_options.txt b/meson_options.txt
index 128bb55b2f3f6..22fc68252de52 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -1 +1,5 @@
-option('openmp', type: 'feature', value: 'disabled', description: 'Parallelization with OpenMP')
+option(
+    'openmp',
+    type: 'feature',
+    description: 'Parallelization with OpenMP',
+)

From baf66c4834666fb8026f221dd04362c6454eed73 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Mon, 23 Mar 2026 19:04:06 -0300
Subject: [PATCH 26/35] fix: use meson.options to fix the build error

---
 meson_options.txt => meson.options | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename meson_options.txt => meson.options (100%)

diff --git a/meson_options.txt b/meson.options
similarity index 100%
rename from meson_options.txt
rename to meson.options

From 893796fc65e251da3e79afc79b54436ca0e36084 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Mon, 23 Mar 2026 21:24:26 -0300
Subject: [PATCH 27/35] docs(moments): remove outdated comment about MSVC

---
 pandas/_libs/include/pandas/moments.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pandas/_libs/include/pandas/moments.h b/pandas/_libs/include/pandas/moments.h
index 166ace232fb5a..e9516acf4a881 100644
--- a/pandas/_libs/include/pandas/moments.h
+++ b/pandas/_libs/include/pandas/moments.h
@@ -84,8 +84,6 @@ static inline Moments moments_merge(Moments a, Moments b, int max_moment) {
 }
 
 /// Compute central moments until `max_moment` using `n` elements from `values`.
-/// The size is represented as signed integer for MSVC compatibility with
-/// OpenMP.
 Moments accumulate_moments_scalar(const double *values, size_t n, int skipna,
                                   const uint8_t *mask, int max_moment);
 

From 57d3cb1a3b2b8f4b6349b2c6c13164e3132833b5 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Mon, 23 Mar 2026 21:26:53 -0300
Subject: [PATCH 28/35] fix: don't force compiler for vector extension

---
 pandas/_libs/src/moments.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index 2276db43dc600..b80dadf350d04 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -29,19 +29,19 @@ static inline int omp_get_num_threads(void) { return 1; }
 #endif // x86_64 + glibc + target_clones
 
 /* --- SIMD Implementation --- */
-#if defined(__clang__) && __has_attribute(ext_vector_type)
+#if __has_attribute(ext_vector_type)
 typedef double v4d __attribute__((ext_vector_type(4), aligned(1)));
 typedef long long v4si __attribute__((ext_vector_type(4), aligned(1)));
 typedef uint8_t v4u8 __attribute__((ext_vector_type(4), aligned(1)));
 #  define PANDAS_HAS_SIMD 1
-#elif defined(__GNUC__) && __has_attribute(vector_size)
+#elif __has_attribute(vector_size)
 typedef double v4d __attribute__((vector_size(4 * sizeof(double)), aligned(1)));
 typedef long long v4si
     __attribute__((vector_size(4 * sizeof(long long)), aligned(1)));
 typedef uint8_t v4u8
     __attribute__((vector_size(4 * sizeof(uint8_t)), aligned(1)));
 #  define PANDAS_HAS_SIMD 1
-#endif // __clang__ or __GNUC__ with vector extension
+#endif // __has_attribute(ext_vector_type)
 
 #ifdef PANDAS_HAS_SIMD
 /* Vector select: returns (mask ? a : b) where mask is all-ones or all-zeros */

From 10e14e57237cc11f96e915324c7f858c9725b9dc Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Mon, 23 Mar 2026 21:42:29 -0300
Subject: [PATCH 29/35] chore: better variable names in aggregations.pyx

---
 pandas/_libs/window/aggregations.pyx | 120 +++++++++++++--------------
 1 file changed, 60 insertions(+), 60 deletions(-)

diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
index 3ffc14c1b4cbc..45a765c468aaa 100644
--- a/pandas/_libs/window/aggregations.pyx
+++ b/pandas/_libs/window/aggregations.pyx
@@ -493,22 +493,22 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
 # Rolling skewness
 
 
-cdef void add_skew(float64_t val, Moments *m,
+cdef void add_skew(float64_t val, Moments *moments,
                    bint *numerically_unstable,
                    ) noexcept nogil:
     """ add a value from the skew calc """
     cdef:
-        float64_t old_m3 = m.m3
+        float64_t old_m3 = moments.m3
 
     # Not NaN
     if val == val:
-        moments_add_value(m, val, 3)
-        if fabs(old_m3) * InvCondTol > fabs(m.m3):
+        moments_add_value(moments, val, 3)
+        if fabs(old_m3) * InvCondTol > fabs(moments.m3):
             # possible catastrophic cancellation
             numerically_unstable[0] = True
 
 
-cdef void remove_skew(float64_t val, Moments *m,
+cdef void remove_skew(float64_t val, Moments *moments,
                       bint *numerically_unstable) noexcept nogil:
     """ remove a value from the skew calc """
     cdef:
@@ -526,22 +526,22 @@ cdef void remove_skew(float64_t val, Moments *m,
 
     # Not NaN
     if val == val:
-        m.n -= 1
-        n = <float64_t>(m.n)
-        delta = val - m.mean
+        moments.n -= 1
+        n = <float64_t>(moments.n)
+        delta = val - moments.mean
         delta_n = delta / n
         term1 = delta_n * delta * (n + 1.0)
 
-        m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * m.m2)
-        new_m3 = m.m3 - m3_update
+        m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * moments.m2)
+        new_m3 = moments.m3 - m3_update
 
-        if (fabs(m3_update) + fabs(m.m3)) * InvCondTol > fabs(new_m3):
+        if (fabs(m3_update) + fabs(moments.m3)) * InvCondTol > fabs(new_m3):
             # possible catastrophic cancellation
             numerically_unstable[0] = True
 
-        m.m3 = new_m3
-        m.m2 -= term1
-        m.mean -= delta_n
+        moments.m3 = new_m3
+        moments.m2 -= term1
+        moments.mean -= delta_n
 
 
 def roll_skew(const float64_t[:] values, ndarray[int64_t] start,
@@ -549,7 +549,7 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start,
     cdef:
         Py_ssize_t i, j
         float64_t val
-        Moments m
+        Moments moments
         int64_t N = len(start)
         int64_t s, e
         ndarray[float64_t] output
@@ -582,35 +582,35 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start,
                 # calculate deletes
                 for j in range(start[i - 1], s):
                     val = values[j]
-                    remove_skew(val, &m, &numerically_unstable)
+                    remove_skew(val, &moments, &numerically_unstable)
 
                 # calculate adds
                 for j in range(end[i - 1], e):
                     val = values[j]
-                    add_skew(val, &m, &numerically_unstable)
+                    add_skew(val, &moments, &numerically_unstable)
 
             if requires_recompute or numerically_unstable:
 
-                m.n = 0
-                m.mean = 0.0
-                m.m2 = 0.0
-                m.m3 = 0.0
-                m.m4 = 0.0
+                moments.n = 0
+                moments.mean = 0.0
+                moments.m2 = 0.0
+                moments.m3 = 0.0
+                moments.m4 = 0.0
 
                 for j in range(s, e):
                     val = values[j]
-                    add_skew(val, &m, &numerically_unstable)
+                    add_skew(val, &moments, &numerically_unstable)
 
                 numerically_unstable = False
 
-            output[i] = NaN if m.n < minp else calc_skew(m)
+            output[i] = NaN if moments.n < minp else calc_skew(moments)
 
             if not is_monotonic_increasing_bounds:
-                m.n = 0
-                m.mean = 0.0
-                m.m2 = 0.0
-                m.m3 = 0.0
-                m.m4 = 0.0
+                moments.n = 0
+                moments.mean = 0.0
+                moments.m2 = 0.0
+                moments.m3 = 0.0
+                moments.m4 = 0.0
 
     return output
 
@@ -618,22 +618,22 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start,
 # Rolling kurtosis
 
 
-cdef void add_kurt(float64_t val, Moments *m,
+cdef void add_kurt(float64_t val, Moments *moments,
                    bint *numerically_unstable,
                    ) noexcept nogil:
     """ add a value from the kurotic calc """
     cdef:
-        float64_t old_m4 = m.m4
+        float64_t old_m4 = moments.m4
 
     # Not NaN
     if val == val:
-        moments_add_value(m, val, 4)
-        if fabs(old_m4) * InvCondTol > fabs(m.m4):
+        moments_add_value(moments, val, 4)
+        if fabs(old_m4) * InvCondTol > fabs(moments.m4):
             # possible catastrophic cancellation
             numerically_unstable[0] = True
 
 
-cdef void remove_kurt(float64_t val, Moments *m,
+cdef void remove_kurt(float64_t val, Moments *moments,
                       bint *numerically_unstable,
                       ) noexcept nogil:
     """ remove a value from the kurotic calc """
@@ -642,36 +642,36 @@ cdef void remove_kurt(float64_t val, Moments *m,
 
     # Not NaN
     if val == val:
-        m.n -= 1
-        n = <float64_t>(m.n)
-        delta = val - m.mean
+        moments.n -= 1
+        n = <float64_t>(moments.n)
+        delta = val - moments.mean
         delta_n = delta / n
         term1 = delta_n * delta * (n + 1.0)
 
         m4_update = delta_n * (
-                4.0 * m.m3
+                4.0 * moments.m3
                 + delta_n * (
-                    6.0 * m.m2
+                    6.0 * moments.m2
                     - term1 * (n * n + 3.0 * n + 3.0)
                     )
                 )
-        new_m4 = m.m4 + m4_update
+        new_m4 = moments.m4 + m4_update
 
-        if (fabs(m4_update) + fabs(m.m4)) * InvCondTol > fabs(new_m4):
+        if (fabs(m4_update) + fabs(moments.m4)) * InvCondTol > fabs(new_m4):
             # possible catastrophic cancellation
             numerically_unstable[0] = True
 
-        m.m4 = new_m4
-        m.m3 -= delta_n * (term1 * (n + 2.0) - 3.0 * m.m2)
-        m.m2 -= term1
-        m.mean -= delta_n
+        moments.m4 = new_m4
+        moments.m3 -= delta_n * (term1 * (n + 2.0) - 3.0 * moments.m2)
+        moments.m2 -= term1
+        moments.mean -= delta_n
 
 
 def roll_kurt(const float64_t[:] values, ndarray[int64_t] start,
               ndarray[int64_t] end, int64_t minp) -> np.ndarray:
     cdef:
         Py_ssize_t i, j
-        Moments m
+        Moments moments
         int64_t s, e
         int64_t N = len(start)
         ndarray[float64_t] output
@@ -704,32 +704,32 @@ def roll_kurt(const float64_t[:] values, ndarray[int64_t] start,
                 # and removed
                 # calculate deletes
                 for j in range(start[i - 1], s):
-                    remove_kurt(values[j], &m, &numerically_unstable)
+                    remove_kurt(values[j], &moments, &numerically_unstable)
 
                 # calculate adds
                 for j in range(end[i - 1], e):
-                    add_kurt(values[j], &m, &numerically_unstable)
+                    add_kurt(values[j], &moments, &numerically_unstable)
 
             if requires_recompute or numerically_unstable:
 
-                m.n = 0
-                m.mean = 0.0
-                m.m2 = 0.0
-                m.m3 = 0.0
-                m.m4 = 0.0
+                moments.n = 0
+                moments.mean = 0.0
+                moments.m2 = 0.0
+                moments.m3 = 0.0
+                moments.m4 = 0.0
                 for j in range(s, e):
-                    add_kurt(values[j], &m, &numerically_unstable)
+                    add_kurt(values[j], &moments, &numerically_unstable)
 
                 numerically_unstable = False
 
-            output[i] = NaN if m.n < minp else calc_kurt(m)
+            output[i] = NaN if moments.n < minp else calc_kurt(moments)
 
             if not is_monotonic_increasing_bounds:
-                m.n = 0
-                m.mean = 0.0
-                m.m2 = 0.0
-                m.m3 = 0.0
-                m.m4 = 0.0
+                moments.n = 0
+                moments.mean = 0.0
+                moments.m2 = 0.0
+                moments.m3 = 0.0
+                moments.m4 = 0.0
 
     return output
 

From 1f0953c43e0f96e926d859b1fc0de1ca049494d4 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Thu, 2 Apr 2026 22:51:41 -0300
Subject: [PATCH 30/35] refactor: simplify diff

---
 pandas/_libs/algos.pxd                |  75 ++++++++++-----
 pandas/_libs/algos.pyx                |  59 +++++++-----
 pandas/_libs/groupby.pyx              |  43 +++++----
 pandas/_libs/include/pandas/moments.h |  92 ++++--------------
 pandas/_libs/src/moments.c            | 126 ++++++++++++++++++++-----
 pandas/_libs/window/aggregations.pyx  | 131 +++++++++++++-------------
 6 files changed, 304 insertions(+), 222 deletions(-)

diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd
index 8ae078f4363de..b5859ba2af3e8 100644
--- a/pandas/_libs/algos.pxd
+++ b/pandas/_libs/algos.pxd
@@ -19,36 +19,67 @@ cdef numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) noexce
 
 
 cdef extern from "pandas/moments.h":
-    ctypedef struct Moments:
-        int64_t n
-        float64_t mean
-        float64_t m2
-        float64_t m3
-        float64_t m4
-
     void moments_add_value(
-        Moments* moments,
-        float64_t val,
-        int max_moment,
-    ) nogil
-
-    Moments moments_merge(
-        Moments a,
-        Moments b,
-        int max_moment,
-    ) nogil
+       float64_t val,
+       int64_t* nobs,
+       float64_t* mean,
+       float64_t* m2,
+       float64_t* m3,
+       float64_t* m4,
+       int max_moment,
+    ) noexcept nogil
 
-    Moments accumulate_moments_scalar(
-        const float64_t* values,
+    void accumulate_moments_scalar(
         size_t n,
+        const float64_t* values,
         bint skipna,
         const uint8_t* mask,
+        int64_t* nobs,
+        float64_t* mean,
+        float64_t* m2,
+        float64_t* m3,
+        float64_t* m4,
         int max_moment,
-    ) nogil
+    ) noexcept nogil
+
+
+@cython.cdivision(True)
+cdef inline float64_t calc_skew(
+    int64_t nobs, float64_t m2, float64_t m3
+) noexcept nogil:
+    cdef:
+        float64_t moments_ratio, correction, dnobs
+
+    if nobs < 3:
+        return NAN
+
+    dnobs = <float64_t>nobs
+
+    moments_ratio = m3 / (m2 * sqrt(m2))
+    correction = (dnobs * sqrt(dnobs - 1.0)) / (dnobs - 2.0)
+    return moments_ratio * correction
+
+
+@cython.cdivision(True)
+cdef inline float64_t calc_kurt(
+    int64_t nobs, float64_t m2, float64_t m4
+) noexcept nogil:
+    cdef:
+        float64_t result, dnobs, term1, term2, inner, correction
+        float64_t moments_ratio
+
+    if nobs < 4:
+        return NAN
+    dnobs = <float64_t>nobs
+    moments_ratio = m4 / (m2 * m2)
+    term1 = dnobs * (dnobs + 1.0) * moments_ratio
+    term2 = 3.0 * (dnobs - 1.0)
+    inner = term1 - term2
 
-    float64_t calc_skew(Moments) nogil
+    correction = (dnobs - 1.0) / ((dnobs - 2.0) * (dnobs - 3.0))
+    result = correction * inner
 
-    float64_t calc_kurt(Moments) nogil
+    return result
 
 
 cdef enum TiebreakEnumType:
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index af06f89d36849..f05fa3e1819b1 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -5,7 +5,6 @@ from libc.math cimport (
     sqrt,
 )
 from libc.stdlib cimport (
-    calloc,
     free,
     malloc,
 )
@@ -1607,7 +1606,11 @@ cdef void accumulate_moments_axis(
     const float64_t[:, :] values,
     bint skipna,
     const uint8_t[:, :] mask,
-    Moments *moments,
+    int64_t[:] nobs,
+    float64_t[:] mean,
+    float64_t[:] m2,
+    float64_t[:] m3,
+    float64_t[:] m4,
     int axis,
     int max_moment,
 ) noexcept nogil:
@@ -1616,6 +1619,8 @@ cdef void accumulate_moments_axis(
         Py_ssize_t nouter, ninner
         bint uses_mask = mask is not None
         float64_t val
+        float64_t* m3_ptr = NULL
+        float64_t* m4_ptr = NULL
 
     if axis == 0:
         # Assumes F-contiguous
@@ -1628,6 +1633,10 @@ cdef void accumulate_moments_axis(
 
     # PERF: The outer dimension is embarrassingly parallel
     for i in range(nouter):
+        if max_moment >= 3:
+            m3_ptr = &m3[i]
+        if max_moment >= 4:
+            m4_ptr = &m4[i]
         # PERF: If it's possible to enforce contiguity in the last dimension,
         # it's possible to use accumulate_moments_scalar with SIMD instructions.
         # It may create more threads if OMP_NESTED is enabled and the outer loop
@@ -1639,7 +1648,8 @@ cdef void accumulate_moments_axis(
                 val = NaN
             if skipna and isnan(val):
                 continue
-            moments_add_value(&moments[i], val, max_moment)
+            moments_add_value(val, &nobs[i], &mean[i], &m2[i], m3_ptr, m4_ptr,
+                              max_moment)
 
 
 @cython.boundscheck(False)
@@ -1650,15 +1660,17 @@ def scalar_skew(
     const uint8_t[::1] mask,
 ) -> float:
     cdef:
-        Moments moments
+        int64_t nobs = 0
+        float64_t mean = 0.0, m2 = 0.0, m3 = 0.0
         size_t n = <size_t>values.shape[0]
         const float64_t* p_values = &values[0]
         const uint8_t* p_mask = &mask[0] if mask is not None else NULL
 
     with nogil:
-        moments = accumulate_moments_scalar(p_values, n, skipna, p_mask, 3)
+        accumulate_moments_scalar(n, p_values, skipna, p_mask,
+                                  &nobs, &mean, &m2, &m3, NULL, 3)
 
-    return calc_skew(moments)
+    return calc_skew(nobs, m2, m3)
 
 
 @cython.boundscheck(False)
@@ -1669,15 +1681,17 @@ def scalar_kurt(
     const uint8_t[::1] mask,
 ) -> float:
     cdef:
-        Moments moments
+        int64_t nobs = 0
+        float64_t mean = 0.0, m2 = 0.0, m3 = 0.0, m4 = 0.0
         size_t n = <size_t>values.shape[0]
         const float64_t* p_values = &values[0]
         const uint8_t* p_mask = &mask[0] if mask is not None else NULL
 
     with nogil:
-        moments = accumulate_moments_scalar(p_values, n, skipna, p_mask, 4)
+        accumulate_moments_scalar(n, p_values, skipna, p_mask,
+                                  &nobs, &mean, &m2, &m3, &m4, 4)
 
-    return calc_kurt(moments)
+    return calc_kurt(nobs, m2, m4)
 
 
 @cython.boundscheck(False)
@@ -1691,19 +1705,18 @@ def axis_skew(
     cdef:
         Py_ssize_t i
         Py_ssize_t nouter = values.shape[1] if axis == 0 else values.shape[0]
-        Moments *moments = <Moments *>calloc(<size_t>nouter, sizeof(Moments))
+        int64_t[::1] nobs = np.zeros(nouter, dtype=np.int64)
+        float64_t[::1] mean = np.zeros(nouter)
+        float64_t[::1] m2 = np.zeros(nouter)
+        float64_t[::1] m3 = np.zeros(nouter)
         ndarray result_arr = np.empty(nouter, dtype=np.float64)
         float64_t[:] result = result_arr
 
-    if moments == NULL:
-        raise MemoryError()
-
     with nogil:
-        accumulate_moments_axis(values, skipna, mask, moments, axis, 3)
+        accumulate_moments_axis(values, skipna, mask, nobs, mean, m2, m3, None, axis, 3)
         for i in range(nouter):
-            result[i] = calc_skew(moments[i])
+            result[i] = calc_skew(nobs[i], m2[i], m3[i])
 
-    free(moments)
     return result_arr
 
 
@@ -1718,19 +1731,19 @@ def axis_kurt(
     cdef:
         Py_ssize_t i
         Py_ssize_t nouter = values.shape[1] if axis == 0 else values.shape[0]
-        Moments *moments = <Moments *>calloc(<size_t>nouter, sizeof(Moments))
+        int64_t[::1] nobs = np.zeros(nouter, dtype=np.int64)
+        float64_t[::1] mean = np.zeros(nouter)
+        float64_t[::1] m2 = np.zeros(nouter)
+        float64_t[::1] m3 = np.zeros(nouter)
+        float64_t[::1] m4 = np.zeros(nouter)
         ndarray result_arr = np.empty(nouter, dtype=np.float64)
         float64_t[:] result = result_arr
 
-    if moments == NULL:
-        raise MemoryError()
-
     with nogil:
-        accumulate_moments_axis(values, skipna, mask, moments, axis, 4)
+        accumulate_moments_axis(values, skipna, mask, nobs, mean, m2, m3, m4, axis, 4)
         for i in range(nouter):
-            result[i] = calc_kurt(moments[i])
+            result[i] = calc_kurt(nobs[i], m2[i], m4[i])
 
-    free(moments)
     return result_arr
 
 
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index d6529623e41de..15abebef987ed 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -10,7 +10,6 @@ from libc.math cimport (
     sqrt,
 )
 from libc.stdlib cimport (
-    calloc,
     free,
     malloc,
 )
@@ -35,7 +34,6 @@ cnp.import_array()
 
 from pandas._libs cimport util
 from pandas._libs.algos cimport (
-    Moments,
     calc_kurt,
     calc_skew,
     get_rank_nan_fill_val,
@@ -1025,18 +1023,23 @@ def group_skew(
 ) -> None:
     cdef:
         Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
+        int64_t[:, ::1] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
         bint uses_mask = mask is not None
-        Moments* ms
+        float64_t[:, ::1] mean, M2, M3
         float64_t val
 
     if len_values != len_labels:
         raise ValueError("len(index) != len(labels)")
 
+    nobs = np.zeros((<object>out).shape, dtype=np.int64)
+
+    mean = np.zeros((<object>out).shape, dtype=np.float64)
+    # M2, and M3 correspond to 2nd and 3rd central moments
+    M2 = np.zeros((<object>out).shape, dtype=np.float64)
+    M3 = np.zeros((<object>out).shape, dtype=np.float64)
+
     N, K = (<object>values).shape
-    ms = <Moments *>calloc(<size_t>ngroups * <size_t>K, sizeof(Moments))
-    if ms is NULL:
-        raise MemoryError()
 
     out[:, :] = 0.0
 
@@ -1057,16 +1060,15 @@ def group_skew(
                 if skipna and (isnan(val) or _treat_as_na(val, False)):
                     continue
 
-                moments_add_value(&ms[lab * K + j], val, 3)
+                moments_add_value(val, &nobs[lab, j], &mean[lab, j], &M2[lab, j],
+                                  &M3[lab, j], NULL, 3)
 
         for i in range(ngroups):
             for j in range(K):
-                out[i, j] = calc_skew(ms[i * K + j])
+                out[i, j] = calc_skew(nobs[i, j], M2[i, j], M3[i, j])
                 if result_mask is not None and isnan(out[i, j]):
                     result_mask[i, j] = 1
 
-    free(ms)
-
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
@@ -1083,18 +1085,24 @@ def group_kurt(
 ) -> None:
     cdef:
         Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
+        int64_t[:, ::1] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
         bint uses_mask = mask is not None
-        Moments* ms
+        float64_t[:, ::1] mean, M2, M3, M4
         float64_t val
 
     if len_values != len_labels:
         raise ValueError("len(index) != len(labels)")
 
+    nobs = np.zeros((<object>out).shape, dtype=np.int64)
+
+    mean = np.zeros((<object>out).shape, dtype=np.float64)
+    # M2, M3 and M4 correspond to 2nd, 3rd and 4th Central Moments
+    M2 = np.zeros((<object>out).shape, dtype=np.float64)
+    M3 = np.zeros((<object>out).shape, dtype=np.float64)
+    M4 = np.zeros((<object>out).shape, dtype=np.float64)
+
     N, K = (<object>values).shape
-    ms = <Moments *>calloc(<size_t>ngroups * <size_t>K, sizeof(Moments))
-    if ms is NULL:
-        raise MemoryError()
 
     out[:, :] = 0.0
 
@@ -1115,16 +1123,15 @@ def group_kurt(
                 if skipna and (isnan(val) or _treat_as_na(val, False)):
                     continue
 
-                moments_add_value(&ms[lab * K + j], val, 4)
+                moments_add_value(val, &nobs[lab, j], &mean[lab, j], &M2[lab, j],
+                                  &M3[lab, j], &M4[lab, j], 4)
 
         for i in range(ngroups):
             for j in range(K):
-                out[i, j] = calc_kurt(ms[i * K + j])
+                out[i, j] = calc_kurt(nobs[i, j], M2[i, j], M4[i, j])
                 if result_mask is not None and isnan(out[i, j]):
                     result_mask[i, j] = 1
 
-    free(ms)
-
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
diff --git a/pandas/_libs/include/pandas/moments.h b/pandas/_libs/include/pandas/moments.h
index e9516acf4a881..51d62486c191b 100644
--- a/pandas/_libs/include/pandas/moments.h
+++ b/pandas/_libs/include/pandas/moments.h
@@ -9,7 +9,7 @@ The full license is in the LICENSE file, distributed with this software.
 
 #pragma once
 
-#include <math.h>
+#include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 
@@ -18,6 +18,9 @@ The full license is in the LICENSE file, distributed with this software.
 extern "C" {
 #endif
 
+// NOTE: It may be better to use the Moments struct
+//       instead of using multiple double references
+//       and a signed value to nobs.
 typedef struct {
   int64_t n;
   double mean;
@@ -26,90 +29,33 @@ typedef struct {
   double m4;
 } Moments;
 
-static inline void moments_add_value(Moments *moments, double val,
+static inline void moments_add_value(double val, int64_t *nobs, double *mean,
+                                     double *m2, double *m3, double *m4,
                                      int max_moment) {
-  double delta = val - moments->mean;
-  moments->n++;
-  double n = (double)moments->n;
+  double delta = val - *mean;
+  (*nobs)++;
+  double n = (double)(*nobs);
   double delta_n = delta / n;
   double term1 = delta * delta_n * (n - 1.0);
 
   if (max_moment >= 4) {
-    moments->m4 +=
-        delta_n *
-        (-4.0 * moments->m3 +
-         delta_n * (6.0 * moments->m2 + term1 * (n * n - 3.0 * n + 3.0)));
+    *m4 += delta_n * (-4.0 * *m3 +
+                      delta_n * (6.0 * *m2 + term1 * (n * n - 3.0 * n + 3.0)));
   }
   if (max_moment >= 3) {
-    moments->m3 += delta_n * (term1 * (n - 2.0) - 3.0 * moments->m2);
+    *m3 += delta_n * (term1 * (n - 2.0) - 3.0 * *m2);
   }
-  moments->m2 += term1;
-  moments->mean += delta_n;
+  *m2 += term1;
+  *mean += delta_n;
 }
 
-static inline Moments moments_merge(Moments a, Moments b, int max_moment) {
-
-  if (a.n == 0) {
-    return b;
-  }
-  if (b.n == 0) {
-    return a;
-  }
-
-  Moments result;
-
-  result.n = a.n + b.n;
-  double n_a = (double)a.n;
-  double n_b = (double)b.n;
-  double delta = b.mean - a.mean;
-  double delta_n = delta / (double)result.n;
-  double term1 = delta * delta_n * n_a * n_b;
-
-  if (max_moment >= 4) {
-    result.m4 =
-        a.m4 + b.m4 +
-        delta_n * (4.0 * (n_a * b.m3 - n_b * a.m3) +
-                   delta_n * (6.0 * (n_a * n_a * b.m2 + n_b * n_b * a.m2) +
-                              term1 * (n_a * n_a - n_a * n_b + n_b * n_b)));
-  }
-  if (max_moment >= 3) {
-    result.m3 =
-        a.m3 + b.m3 +
-        delta_n * (3.0 * (n_a * b.m2 - n_b * a.m2) + term1 * (n_a - n_b));
-  }
-  result.m2 = a.m2 + b.m2 + term1;
-  result.mean = a.mean + delta_n * n_b;
-
-  return result;
-}
+Moments moments_merge(Moments a, Moments b, int max_moment);
 
 /// Compute central moments until `max_moment` using `n` elements from `values`.
-Moments accumulate_moments_scalar(const double *values, size_t n, int skipna,
-                                  const uint8_t *mask, int max_moment);
-
-static inline double calc_skew(Moments moments) {
-  if (moments.n < 3) {
-    return NAN;
-  }
-  double dnobs = (double)moments.n;
-  double moments_ratio = moments.m3 / (moments.m2 * sqrt(moments.m2));
-  double correction = (dnobs * sqrt(dnobs - 1.0)) / (dnobs - 2.0);
-  return moments_ratio * correction;
-}
-
-static inline double calc_kurt(Moments moments) {
-  if (moments.n < 4) {
-    return NAN;
-  }
-  double dnobs = (double)moments.n;
-  double moments_ratio = moments.m4 / (moments.m2 * moments.m2);
-  double term1 = dnobs * (dnobs + 1.0) * moments_ratio;
-  double term2 = 3.0 * (dnobs - 1.0);
-  double inner = term1 - term2;
-  double correction = (dnobs - 1.0) / ((dnobs - 2.0) * (dnobs - 3.0));
-  return correction * inner;
-}
-
+void accumulate_moments_scalar(size_t n, const double *values, bool skipna,
+                               const uint8_t *mask, int64_t *nobs, double *mean,
+                               double *m2, double *m3, double *m4,
+                               int max_moment);
 #ifdef __cplusplus
 }
 #endif
diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index b80dadf350d04..144aeae5eae9f 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -18,6 +18,63 @@ static inline int omp_get_thread_num(void) { return 0; }
 static inline int omp_get_num_threads(void) { return 1; }
 #endif // _OPENMP
 
+static inline void moments_add_valuem(Moments *moments, double val,
+                                      int max_moment) {
+  double delta = val - moments->mean;
+  moments->n++;
+  double n = (double)moments->n;
+  double delta_n = delta / n;
+  double term1 = delta * delta_n * (n - 1.0);
+
+  if (max_moment >= 4) {
+    moments->m4 +=
+        delta_n *
+        (-4.0 * moments->m3 +
+         delta_n * (6.0 * moments->m2 + term1 * (n * n - 3.0 * n + 3.0)));
+  }
+  if (max_moment >= 3) {
+    moments->m3 += delta_n * (term1 * (n - 2.0) - 3.0 * moments->m2);
+  }
+  moments->m2 += term1;
+  moments->mean += delta_n;
+}
+
+Moments moments_merge(Moments a, Moments b, int max_moment) {
+
+  if (a.n == 0) {
+    return b;
+  }
+  if (b.n == 0) {
+    return a;
+  }
+
+  Moments result;
+
+  result.n = a.n + b.n;
+  double n_a = (double)a.n;
+  double n_b = (double)b.n;
+  double delta = b.mean - a.mean;
+  double delta_n = delta / (double)result.n;
+  double term1 = delta * delta_n * n_a * n_b;
+
+  if (max_moment >= 4) {
+    result.m4 =
+        a.m4 + b.m4 +
+        delta_n * (4.0 * (n_a * b.m3 - n_b * a.m3) +
+                   delta_n * (6.0 * (n_a * n_a * b.m2 + n_b * n_b * a.m2) +
+                              term1 * (n_a * n_a - n_a * n_b + n_b * n_b)));
+  }
+  if (max_moment >= 3) {
+    result.m3 =
+        a.m3 + b.m3 +
+        delta_n * (3.0 * (n_a * b.m2 - n_b * a.m2) + term1 * (n_a - n_b));
+  }
+  result.m2 = a.m2 + b.m2 + term1;
+  result.mean = a.mean + delta_n * n_b;
+
+  return result;
+}
+
 #ifndef __has_attribute
 #  define __has_attribute(x) 0
 #endif // __has_attribute
@@ -53,10 +110,9 @@ typedef uint8_t v4u8
       ((v4d)(((v4si)(mask) & (v4si)(a)) | (~(v4si)(mask) & (v4si)(b))))
 #  endif // defined(__clang__)
 
-PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values,
-                                                    size_t n, int skipna,
-                                                    const uint8_t *mask,
-                                                    int max_moment) {
+PANDAS_SIMD_TARGETS
+Moments accumulate_moments_simd(size_t n, const double *values, int skipna,
+                                const uint8_t *mask, int max_moment) {
   v4d v_mean = {0.0, 0.0, 0.0, 0.0};
   v4d v_m2 = {0.0, 0.0, 0.0, 0.0};
   v4d v_m3 = {0.0, 0.0, 0.0, 0.0};
@@ -114,17 +170,16 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values,
     v_mean += v_delta_n;
   }
 
-  double n_arr[4], mean_arr[4], m2_arr[4], m3_arr[4], m4_arr[4];
-  *(v4d *)n_arr = v_n;
+  double n_arrd[4], mean_arr[4], m2_arr[4], m3_arr[4], m4_arr[4];
+  *(v4d *)n_arrd = v_n;
   *(v4d *)mean_arr = v_mean;
   *(v4d *)m2_arr = v_m2;
   *(v4d *)m3_arr = v_m3;
   *(v4d *)m4_arr = v_m4;
 
-  Moments moments_arr[4];
+  int64_t n_arr[4];
   for (int j = 0; j < 4; j++) {
-    moments_arr[j] = (Moments){(int64_t)n_arr[j], mean_arr[j], m2_arr[j],
-                               m3_arr[j], m4_arr[j]};
+    n_arr[j] = (int64_t)n_arrd[j];
   }
 
   // Distribute remaining values across chunks
@@ -134,7 +189,16 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values,
       val = NAN;
     if (skipna && isnan(val))
       continue;
-    moments_add_value(&moments_arr[i % 4], val, max_moment);
+    moments_add_value(val, &n_arr[i % 4], &mean_arr[i % 4], &m2_arr[i % 4],
+                      &m3_arr[i % 4], &m4_arr[i % 4], max_moment);
+  }
+  Moments moments_arr[4];
+  for (int j = 0; j < 4; j++) {
+    moments_arr[j] = (Moments){.n = n_arr[j],
+                               .mean = mean_arr[j],
+                               .m2 = m2_arr[j],
+                               .m3 = m3_arr[j],
+                               .m4 = m4_arr[j]};
   }
 
   // pairwise merge for numerical stability
@@ -148,7 +212,7 @@ PANDAS_SIMD_TARGETS Moments accumulate_moments_simd(const double *values,
 
 /* --- Scalar Fallback Implementation --- */
 
-Moments accumulate_moments_scalar_block(const double *values, size_t n,
+Moments accumulate_moments_scalar_block(size_t n, const double *values,
                                         int skipna, const uint8_t *mask,
                                         int max_moment) {
   Moments moments = {0};
@@ -158,26 +222,28 @@ Moments accumulate_moments_scalar_block(const double *values, size_t n,
       val = NAN;
     if (skipna && isnan(val))
       continue;
-    moments_add_value(&moments, val, max_moment);
+    moments_add_valuem(&moments, val, max_moment);
   }
   return moments;
 }
 
 /* --- Accumulation Dispatch (Choose SIMD or Scalar) --- */
 
-Moments accumulate_moments_dispatch(const double *values, size_t n, int skipna,
+Moments accumulate_moments_dispatch(size_t n, const double *values, int skipna,
                                     const uint8_t *mask, int max_moment) {
 #if defined(PANDAS_HAS_SIMD)
-  return accumulate_moments_simd(values, n, skipna, mask, max_moment);
+  return accumulate_moments_simd(n, values, skipna, mask, max_moment);
 #endif // defined(PANDAS_HAS_SIMD)
-  return accumulate_moments_scalar_block(values, n, skipna, mask, max_moment);
+  return accumulate_moments_scalar_block(n, values, skipna, mask, max_moment);
 }
 
 /* --- Moments 1D Accumulator Implementation --- */
 
-Moments accumulate_moments_scalar(const double *values, size_t n, int skipna,
-                                  const uint8_t *mask, int max_moment) {
-  Moments result = {0};
+void accumulate_moments_scalar(size_t n, const double *values, bool skipna,
+                               const uint8_t *mask, int64_t *nobs, double *mean,
+                               double *m2, double *m3, double *m4,
+                               int max_moment) {
+  Moments acc = {0};
 
 #ifdef _OPENMP
 #  pragma omp parallel if (n > 10000)
@@ -190,16 +256,34 @@ Moments accumulate_moments_scalar(const double *values, size_t n, int skipna,
     size_t end = tid == num_threads - 1 ? n : ((tid + 1) * n) / num_threads;
 
     Moments moments_local =
-        accumulate_moments_dispatch(values + start, end - start, skipna,
+        accumulate_moments_dispatch(end - start, values + start, skipna,
                                     mask ? mask + start : NULL, max_moment);
 
 #ifdef _OPENMP
 #  pragma omp critical
 #endif
     {
-      result = moments_merge(result, moments_local, max_moment);
+      acc = moments_merge(acc, moments_local, max_moment);
     }
   }
 
-  return result;
+  Moments tmp =
+      (Moments){.n = *nobs, .mean = *mean, .m2 = *m2, .m3 = 0.0, .m4 = 0.0};
+  if (max_moment >= 4) {
+    tmp.m4 = *m4;
+  }
+  if (max_moment >= 3) {
+    tmp.m3 = *m3;
+  }
+  Moments result = moments_merge(tmp, acc, max_moment);
+
+  if (max_moment >= 4) {
+    *m4 = result.m4;
+  }
+  if (max_moment >= 3) {
+    *m3 = result.m3;
+  }
+  *m2 = result.m2;
+  *mean = result.mean;
+  *nobs = result.n;
 }
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
index 45a765c468aaa..619d5590cc207 100644
--- a/pandas/_libs/window/aggregations.pyx
+++ b/pandas/_libs/window/aggregations.pyx
@@ -9,7 +9,6 @@ from libcpp.stack cimport stack
 from libcpp.unordered_map cimport unordered_map
 
 from pandas._libs.algos cimport (
-    Moments,
     TiebreakEnumType,
     calc_kurt,
     calc_skew,
@@ -493,22 +492,26 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
 # Rolling skewness
 
 
-cdef void add_skew(float64_t val, Moments *moments,
+cdef void add_skew(float64_t val, int64_t *nobs,
+                   float64_t *mean, float64_t *m2,
+                   float64_t *m3,
                    bint *numerically_unstable,
                    ) noexcept nogil:
     """ add a value from the skew calc """
     cdef:
-        float64_t old_m3 = moments.m3
+        float64_t old_m3 = m3[0]
 
     # Not NaN
     if val == val:
-        moments_add_value(moments, val, 3)
-        if fabs(old_m3) * InvCondTol > fabs(moments.m3):
+        moments_add_value(val, nobs, mean, m2, m3, NULL, 3)
+        if fabs(old_m3) * InvCondTol > fabs(m3[0]):
             # possible catastrophic cancellation
             numerically_unstable[0] = True
 
 
-cdef void remove_skew(float64_t val, Moments *moments,
+cdef void remove_skew(float64_t val, int64_t *nobs,
+                      float64_t *mean, float64_t *m2,
+                      float64_t *m3,
                       bint *numerically_unstable) noexcept nogil:
     """ remove a value from the skew calc """
     cdef:
@@ -526,22 +529,22 @@ cdef void remove_skew(float64_t val, Moments *moments,
 
     # Not NaN
     if val == val:
-        moments.n -= 1
-        n = <float64_t>(moments.n)
-        delta = val - moments.mean
+        nobs[0] -= 1
+        n = <float64_t>(nobs[0])
+        delta = val - mean[0]
         delta_n = delta / n
         term1 = delta_n * delta * (n + 1.0)
 
-        m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * moments.m2)
-        new_m3 = moments.m3 - m3_update
+        m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * m2[0])
+        new_m3 = m3[0] - m3_update
 
-        if (fabs(m3_update) + fabs(moments.m3)) * InvCondTol > fabs(new_m3):
+        if (fabs(m3_update) + fabs(m3[0])) * InvCondTol > fabs(new_m3):
             # possible catastrophic cancellation
             numerically_unstable[0] = True
 
-        moments.m3 = new_m3
-        moments.m2 -= term1
-        moments.mean -= delta_n
+        m3[0] = new_m3
+        m2[0] -= term1
+        mean[0] -= delta_n
 
 
 def roll_skew(const float64_t[:] values, ndarray[int64_t] start,
@@ -549,8 +552,8 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start,
     cdef:
         Py_ssize_t i, j
         float64_t val
-        Moments moments
-        int64_t N = len(start)
+        float64_t mean, m2, m3
+        int64_t nobs = 0, N = len(start)
         int64_t s, e
         ndarray[float64_t] output
         bint is_monotonic_increasing_bounds
@@ -582,35 +585,31 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start,
                 # calculate deletes
                 for j in range(start[i - 1], s):
                     val = values[j]
-                    remove_skew(val, &moments, &numerically_unstable)
+                    remove_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable)
 
                 # calculate adds
                 for j in range(end[i - 1], e):
                     val = values[j]
-                    add_skew(val, &moments, &numerically_unstable)
+                    add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable)
 
             if requires_recompute or numerically_unstable:
 
-                moments.n = 0
-                moments.mean = 0.0
-                moments.m2 = 0.0
-                moments.m3 = 0.0
-                moments.m4 = 0.0
+                mean = m2 = m3 = 0.0
+                nobs = 0
 
                 for j in range(s, e):
                     val = values[j]
-                    add_skew(val, &moments, &numerically_unstable)
+                    add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable)
 
                 numerically_unstable = False
 
-            output[i] = NaN if moments.n < minp else calc_skew(moments)
+            output[i] = NaN if nobs < minp else calc_skew(nobs, m2, m3)
 
             if not is_monotonic_increasing_bounds:
-                moments.n = 0
-                moments.mean = 0.0
-                moments.m2 = 0.0
-                moments.m3 = 0.0
-                moments.m4 = 0.0
+                nobs = 0
+                mean = 0.0
+                m2 = 0.0
+                m3 = 0.0
 
     return output
 
@@ -618,22 +617,26 @@ def roll_skew(const float64_t[:] values, ndarray[int64_t] start,
 # Rolling kurtosis
 
 
-cdef void add_kurt(float64_t val, Moments *moments,
+cdef void add_kurt(float64_t val, int64_t *nobs,
+                   float64_t *mean, float64_t *m2,
+                   float64_t *m3, float64_t *m4,
                    bint *numerically_unstable,
                    ) noexcept nogil:
     """ add a value from the kurotic calc """
     cdef:
-        float64_t old_m4 = moments.m4
+        float64_t old_m4 = m4[0]
 
     # Not NaN
     if val == val:
-        moments_add_value(moments, val, 4)
-        if fabs(old_m4) * InvCondTol > fabs(moments.m4):
+        moments_add_value(val, nobs, mean, m2, m3, m4, 4)
+        if fabs(old_m4) * InvCondTol > fabs(m4[0]):
             # possible catastrophic cancellation
             numerically_unstable[0] = True
 
 
-cdef void remove_kurt(float64_t val, Moments *moments,
+cdef void remove_kurt(float64_t val, int64_t *nobs,
+                      float64_t *mean, float64_t *m2,
+                      float64_t *m3, float64_t *m4,
                       bint *numerically_unstable,
                       ) noexcept nogil:
     """ remove a value from the kurotic calc """
@@ -642,37 +645,37 @@ cdef void remove_kurt(float64_t val, Moments *moments,
 
     # Not NaN
     if val == val:
-        moments.n -= 1
-        n = <float64_t>(moments.n)
-        delta = val - moments.mean
+        nobs[0] -= 1
+        n = <float64_t>(nobs[0])
+        delta = val - mean[0]
         delta_n = delta / n
         term1 = delta_n * delta * (n + 1.0)
 
         m4_update = delta_n * (
-                4.0 * moments.m3
+                4.0 * m3[0]
                 + delta_n * (
-                    6.0 * moments.m2
+                    6.0 * m2[0]
                     - term1 * (n * n + 3.0 * n + 3.0)
                     )
                 )
-        new_m4 = moments.m4 + m4_update
+        new_m4 = m4[0] + m4_update
 
-        if (fabs(m4_update) + fabs(moments.m4)) * InvCondTol > fabs(new_m4):
+        if (fabs(m4_update) + fabs(m4[0])) * InvCondTol > fabs(new_m4):
             # possible catastrophic cancellation
             numerically_unstable[0] = True
 
-        moments.m4 = new_m4
-        moments.m3 -= delta_n * (term1 * (n + 2.0) - 3.0 * moments.m2)
-        moments.m2 -= term1
-        moments.mean -= delta_n
+        m4[0] = new_m4
+        m3[0] -= delta_n * (term1 * (n + 2.0) - 3.0 * m2[0])
+        m2[0] -= term1
+        mean[0] -= delta_n
 
 
 def roll_kurt(const float64_t[:] values, ndarray[int64_t] start,
               ndarray[int64_t] end, int64_t minp) -> np.ndarray:
     cdef:
         Py_ssize_t i, j
-        Moments moments
-        int64_t s, e
+        float64_t mean, m2, m3, m4
+        int64_t nobs, s, e
         int64_t N = len(start)
         ndarray[float64_t] output
         bint is_monotonic_increasing_bounds
@@ -704,32 +707,30 @@ def roll_kurt(const float64_t[:] values, ndarray[int64_t] start,
                 # and removed
                 # calculate deletes
                 for j in range(start[i - 1], s):
-                    remove_kurt(values[j], &moments, &numerically_unstable)
+                    remove_kurt(values[j], &nobs, &mean, &m2, &m3, &m4,
+                                &numerically_unstable)
 
                 # calculate adds
                 for j in range(end[i - 1], e):
-                    add_kurt(values[j], &moments, &numerically_unstable)
+                    add_kurt(values[j], &nobs, &mean, &m2, &m3, &m4,
+                             &numerically_unstable)
 
             if requires_recompute or numerically_unstable:
 
-                moments.n = 0
-                moments.mean = 0.0
-                moments.m2 = 0.0
-                moments.m3 = 0.0
-                moments.m4 = 0.0
+                mean = m2 = m3 = m4 = 0.0
+                nobs = 0
                 for j in range(s, e):
-                    add_kurt(values[j], &moments, &numerically_unstable)
-
-                numerically_unstable = False
+                    add_kurt(values[j], &nobs, &mean, &m2, &m3, &m4,
+                             &numerically_unstable)
 
-            output[i] = NaN if moments.n < minp else calc_kurt(moments)
+            output[i] = NaN if nobs < minp else calc_kurt(nobs, m2, m4)
 
             if not is_monotonic_increasing_bounds:
-                moments.n = 0
-                moments.mean = 0.0
-                moments.m2 = 0.0
-                moments.m3 = 0.0
-                moments.m4 = 0.0
+                nobs = 0
+                mean = 0.0
+                m2 = 0.0
+                m3 = 0.0
+                m4 = 0.0
 
     return output
 

From f6c2e2d73aa02cf74ab585c89cbd6779c5be1b84 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Thu, 2 Apr 2026 23:12:16 -0300
Subject: [PATCH 31/35] perf: remove OpenMP

---
 .../development/contributing_environment.rst  |  3 +-
 meson.options                                 |  5 ---
 pandas/_libs/meson.build                      |  4 +--
 pandas/_libs/src/moments.c                    | 34 +++----------------
 4 files changed, 6 insertions(+), 40 deletions(-)
 delete mode 100644 meson.options

diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst
index b814e1ca96020..ce0430bd9e876 100644
--- a/doc/source/development/contributing_environment.rst
+++ b/doc/source/development/contributing_environment.rst
@@ -169,13 +169,12 @@ To compile and install pandas in editable mode, run:
 
    python -m pip install --verbose --editable . --no-build-isolation
 
-Additional Meson and Pandas options can be passed to the ``pip install`` command to modify the installation.
+Additional Meson options can be passed to the ``pip install`` command to modify the installation.
 Helpful options include:
 
 * ``-Ceditable-verbose=true``: Print verbose logs during rebuild, even during ``import``.
 * ``-Cbuilddir="your builddir here"``: Specify a different build directory for the C extensions.
 * ``-Csetup-args="-Ddebug=true"``: Compile the C extensions with debug symbols.
-* ``-Csetup-args=-Dopenmp=disabled``: Compile Pandas without parallelization support through OpenMP.
 
 .. note::
    When pandas is installed in ``--editable`` mode, pandas will automatically rebuild the library upon ``import``,
diff --git a/meson.options b/meson.options
deleted file mode 100644
index 22fc68252de52..0000000000000
--- a/meson.options
+++ /dev/null
@@ -1,5 +0,0 @@
-option(
-    'openmp',
-    type: 'feature',
-    description: 'Parallelization with OpenMP',
-)
diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build
index e4db9972dd44d..2f20136b375c8 100644
--- a/pandas/_libs/meson.build
+++ b/pandas/_libs/meson.build
@@ -56,8 +56,6 @@ fast_float_dep = fast_float.get_variable('fast_float_dep')
 
 subdir('tslibs')
 
-openmp_dep = dependency('OpenMP', required: get_option('openmp'))
-
 libs_sources = {
     # Dict of extension name -> dict of {sources, include_dirs, and deps}
     # numpy include dir is implicitly included
@@ -68,7 +66,7 @@ libs_sources = {
             _algos_common_helper,
             _algos_take_helper,
         ],
-        'deps': [_khash_primitive_helper_dep, m_dep, openmp_dep],
+        'deps': [_khash_primitive_helper_dep, m_dep],
     },
     'arrays': {'sources': ['arrays.pyx']},
     'groupby': {'sources': ['groupby.pyx'], 'deps': [m_dep]},
diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index 144aeae5eae9f..1b71083ede5c7 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -11,13 +11,6 @@ The full license is in the LICENSE file, distributed with this software.
 #include <math.h>
 #include <stdint.h>
 
-#ifdef _OPENMP
-#  include <omp.h>
-#else
-static inline int omp_get_thread_num(void) { return 0; }
-static inline int omp_get_num_threads(void) { return 1; }
-#endif // _OPENMP
-
 static inline void moments_add_valuem(Moments *moments, double val,
                                       int max_moment) {
   double delta = val - moments->mean;
@@ -243,29 +236,10 @@ void accumulate_moments_scalar(size_t n, const double *values, bool skipna,
                                const uint8_t *mask, int64_t *nobs, double *mean,
                                double *m2, double *m3, double *m4,
                                int max_moment) {
-  Moments acc = {0};
-
-#ifdef _OPENMP
-#  pragma omp parallel if (n > 10000)
-#endif
-  {
-    size_t tid = (size_t)omp_get_thread_num();
-    size_t num_threads = (size_t)omp_get_num_threads();
-
-    size_t start = (tid * n) / num_threads;
-    size_t end = tid == num_threads - 1 ? n : ((tid + 1) * n) / num_threads;
-
-    Moments moments_local =
-        accumulate_moments_dispatch(end - start, values + start, skipna,
-                                    mask ? mask + start : NULL, max_moment);
-
-#ifdef _OPENMP
-#  pragma omp critical
-#endif
-    {
-      acc = moments_merge(acc, moments_local, max_moment);
-    }
-  }
+  // PERF: It's possible to parallelize moment reductions
+  // and call `moments_merge` to join the results.
+  Moments acc =
+      accumulate_moments_dispatch(n, values, skipna, mask, max_moment);
 
   Moments tmp =
       (Moments){.n = *nobs, .mean = *mean, .m2 = *m2, .m3 = 0.0, .m4 = 0.0};

From 58ba9473967c676255053aeb2ef0db266dbffe51 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Fri, 3 Apr 2026 00:32:28 -0300
Subject: [PATCH 32/35] fix: use long instead of int64_t for apple clang

---
 pandas/_libs/include/pandas/moments.h |  6 +++---
 pandas/_libs/src/moments.c            | 17 ++++++++++-------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/pandas/_libs/include/pandas/moments.h b/pandas/_libs/include/pandas/moments.h
index 51d62486c191b..e608b526e786e 100644
--- a/pandas/_libs/include/pandas/moments.h
+++ b/pandas/_libs/include/pandas/moments.h
@@ -22,14 +22,14 @@ extern "C" {
 //       instead of using multiple double references
 //       and a signed value to nobs.
 typedef struct {
-  int64_t n;
+  uint64_t n;
   double mean;
   double m2;
   double m3;
   double m4;
 } Moments;
 
-static inline void moments_add_value(double val, int64_t *nobs, double *mean,
+static inline void moments_add_value(double val, long *nobs, double *mean,
                                      double *m2, double *m3, double *m4,
                                      int max_moment) {
   double delta = val - *mean;
@@ -53,7 +53,7 @@ Moments moments_merge(Moments a, Moments b, int max_moment);
 
 /// Compute central moments until `max_moment` using `n` elements from `values`.
 void accumulate_moments_scalar(size_t n, const double *values, bool skipna,
-                               const uint8_t *mask, int64_t *nobs, double *mean,
+                               const uint8_t *mask, long *nobs, double *mean,
                                double *m2, double *m3, double *m4,
                                int max_moment);
 #ifdef __cplusplus
diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index 1b71083ede5c7..ccb156059f3ef 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -170,9 +170,9 @@ Moments accumulate_moments_simd(size_t n, const double *values, int skipna,
   *(v4d *)m3_arr = v_m3;
   *(v4d *)m4_arr = v_m4;
 
-  int64_t n_arr[4];
+  long n_arr[4];
   for (int j = 0; j < 4; j++) {
-    n_arr[j] = (int64_t)n_arrd[j];
+    n_arr[j] = (long)n_arrd[j];
   }
 
   // Distribute remaining values across chunks
@@ -187,7 +187,7 @@ Moments accumulate_moments_simd(size_t n, const double *values, int skipna,
   }
   Moments moments_arr[4];
   for (int j = 0; j < 4; j++) {
-    moments_arr[j] = (Moments){.n = n_arr[j],
+    moments_arr[j] = (Moments){.n = (uint64_t)n_arr[j],
                                .mean = mean_arr[j],
                                .m2 = m2_arr[j],
                                .m3 = m3_arr[j],
@@ -233,7 +233,7 @@ Moments accumulate_moments_dispatch(size_t n, const double *values, int skipna,
 /* --- Moments 1D Accumulator Implementation --- */
 
 void accumulate_moments_scalar(size_t n, const double *values, bool skipna,
-                               const uint8_t *mask, int64_t *nobs, double *mean,
+                               const uint8_t *mask, long *nobs, double *mean,
                                double *m2, double *m3, double *m4,
                                int max_moment) {
   // PERF: It's possible to parallelize moment reductions
@@ -241,8 +241,8 @@ void accumulate_moments_scalar(size_t n, const double *values, bool skipna,
   Moments acc =
       accumulate_moments_dispatch(n, values, skipna, mask, max_moment);
 
-  Moments tmp =
-      (Moments){.n = *nobs, .mean = *mean, .m2 = *m2, .m3 = 0.0, .m4 = 0.0};
+  Moments tmp = (Moments){
+      .n = (uint64_t)*nobs, .mean = *mean, .m2 = *m2, .m3 = 0.0, .m4 = 0.0};
   if (max_moment >= 4) {
     tmp.m4 = *m4;
   }
@@ -259,5 +259,8 @@ void accumulate_moments_scalar(size_t n, const double *values, bool skipna,
   }
   *m2 = result.m2;
   *mean = result.mean;
-  *nobs = result.n;
+  // FIXME: This conversion may overflow.
+  //        Using `long` because the type used in Cython is `np.int64_t`,
+  //        which is aliased to `long`.
+  *nobs = (long)result.n;
 }

From 7c2e83e592210ff84e7af936c0ff060c331ea130 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Fri, 3 Apr 2026 08:37:50 -0300
Subject: [PATCH 33/35] fix: fix some -Wconversion warnings

---
 pandas/_libs/src/moments.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index ccb156059f3ef..a737210edf7f5 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -95,13 +95,8 @@ typedef uint8_t v4u8
 
 #ifdef PANDAS_HAS_SIMD
 /* Vector select: returns (mask ? a : b) where mask is all-ones or all-zeros */
-#  if defined(__clang__)
-#    define v_select(mask, a, b) ((mask) ? (a) : (b))
-#  else
-// gcc doesn't have vectorized ternary operators when compiling C files.
-#    define v_select(mask, a, b)                                               \
-      ((v4d)(((v4si)(mask) & (v4si)(a)) | (~(v4si)(mask) & (v4si)(b))))
-#  endif // defined(__clang__)
+#  define v_select(mask, a, b)                                                 \
+    ((v4d)(((v4si)(mask) & (v4si)(a)) | (~(v4si)(mask) & (v4si)(b))))
 
 PANDAS_SIMD_TARGETS
 Moments accumulate_moments_simd(size_t n, const double *values, int skipna,
@@ -116,6 +111,7 @@ Moments accumulate_moments_simd(size_t n, const double *values, int skipna,
 
   v4d v_one = {1.0, 1.0, 1.0, 1.0};
   v4d v_zero = {0.0, 0.0, 0.0, 0.0};
+  v4si v_zerosi = {0, 0, 0, 0};
   v4d v_nan = {NAN, NAN, NAN, NAN};
 
   size_t i = 0;
@@ -126,7 +122,7 @@ Moments accumulate_moments_simd(size_t n, const double *values, int skipna,
       v4u8 mask_vec = *(v4u8 *)(mask + i);
       v4si mask_si = __builtin_convertvector(mask_vec, v4si);
       // replace val with NAN where mask is 1.
-      v_val = v_select(mask_si != 0, v_nan, v_val);
+      v_val = v_select(mask_si != v_zerosi, v_nan, v_val);
     }
 
     // skip_mask is 1.0 if we should skip, 0.0 otherwise

From 6269bdc26d20a64ccd382f243ba4fefff7cad89c Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Fri, 3 Apr 2026 09:18:54 -0300
Subject: [PATCH 34/35] fix: fix types for 32 bit

---
 pandas/_libs/include/pandas/moments.h |  4 ++--
 pandas/_libs/src/moments.c            | 11 ++++-------
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/pandas/_libs/include/pandas/moments.h b/pandas/_libs/include/pandas/moments.h
index e608b526e786e..9723761f64ef3 100644
--- a/pandas/_libs/include/pandas/moments.h
+++ b/pandas/_libs/include/pandas/moments.h
@@ -29,7 +29,7 @@ typedef struct {
   double m4;
 } Moments;
 
-static inline void moments_add_value(double val, long *nobs, double *mean,
+static inline void moments_add_value(double val, int64_t *nobs, double *mean,
                                      double *m2, double *m3, double *m4,
                                      int max_moment) {
   double delta = val - *mean;
@@ -53,7 +53,7 @@ Moments moments_merge(Moments a, Moments b, int max_moment);
 
 /// Compute central moments until `max_moment` using `n` elements from `values`.
 void accumulate_moments_scalar(size_t n, const double *values, bool skipna,
-                               const uint8_t *mask, long *nobs, double *mean,
+                               const uint8_t *mask, int64_t *nobs, double *mean,
                                double *m2, double *m3, double *m4,
                                int max_moment);
 #ifdef __cplusplus
diff --git a/pandas/_libs/src/moments.c b/pandas/_libs/src/moments.c
index a737210edf7f5..755fbec477505 100644
--- a/pandas/_libs/src/moments.c
+++ b/pandas/_libs/src/moments.c
@@ -166,9 +166,9 @@ Moments accumulate_moments_simd(size_t n, const double *values, int skipna,
   *(v4d *)m3_arr = v_m3;
   *(v4d *)m4_arr = v_m4;
 
-  long n_arr[4];
+  int64_t n_arr[4];
   for (int j = 0; j < 4; j++) {
-    n_arr[j] = (long)n_arrd[j];
+    n_arr[j] = (int64_t)n_arrd[j];
   }
 
   // Distribute remaining values across chunks
@@ -229,7 +229,7 @@ Moments accumulate_moments_dispatch(size_t n, const double *values, int skipna,
 /* --- Moments 1D Accumulator Implementation --- */
 
 void accumulate_moments_scalar(size_t n, const double *values, bool skipna,
-                               const uint8_t *mask, long *nobs, double *mean,
+                               const uint8_t *mask, int64_t *nobs, double *mean,
                                double *m2, double *m3, double *m4,
                                int max_moment) {
   // PERF: It's possible to parallelize moment reductions
@@ -255,8 +255,5 @@ void accumulate_moments_scalar(size_t n, const double *values, bool skipna,
   }
   *m2 = result.m2;
   *mean = result.mean;
-  // FIXME: This conversion may overflow.
-  //        Using `long` because the type used in Cython is `np.int64_t`,
-  //        which is aliased to `long`.
-  *nobs = (long)result.n;
+  *nobs = (int64_t)result.n;
 }

From 25216651ed17c2f2926f15c08421cdee5a702d76 Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Wed, 29 Apr 2026 18:44:35 -0300
Subject: [PATCH 35/35] revert test changes

---
 pandas/tests/apply/test_str.py                | 3 +--
 pandas/tests/arrays/floating/test_function.py | 3 +--
 pandas/tests/arrays/integer/test_function.py  | 3 +--
 pandas/tests/test_nanops.py                   | 3 +--
 4 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py
index 9589db07dbcee..e5a9492630b13 100644
--- a/pandas/tests/apply/test_str.py
+++ b/pandas/tests/apply/test_str.py
@@ -41,8 +41,7 @@ def test_apply_with_string_funcs(float_frame, func, kwds, how):
 def test_with_string_args(datetime_series, all_numeric_reductions):
     result = datetime_series.apply(all_numeric_reductions)
     expected = getattr(datetime_series, all_numeric_reductions)()
-    # reductions computed in parallel may yield different results
-    tm.assert_almost_equal(result, expected)
+    assert result == expected
 
 
 @pytest.mark.parametrize("op", ["mean", "median", "std", "var"])
diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py
index 2fcdcd5a9e823..a5938b2aa93e8 100644
--- a/pandas/tests/arrays/floating/test_function.py
+++ b/pandas/tests/arrays/floating/test_function.py
@@ -116,8 +116,7 @@ def test_stat_method(pandasmethname, kwargs):
     s2 = pd.Series(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype="float64")
     pandasmeth = getattr(s2, pandasmethname)
     expected = pandasmeth(**kwargs)
-    # reductions computed in parallel may yield different results
-    tm.assert_almost_equal(result, expected)
+    assert expected == result
 
 
 def test_value_counts_na():
diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py
index a288ab631ae3b..6b3308baf3ee7 100644
--- a/pandas/tests/arrays/integer/test_function.py
+++ b/pandas/tests/arrays/integer/test_function.py
@@ -134,8 +134,7 @@ def test_stat_method(pandasmethname, kwargs):
     s2 = pd.Series(data=[1, 2, 3, 4, 5, 6], dtype="Int64")
     pandasmeth = getattr(s2, pandasmethname)
     expected = pandasmeth(**kwargs)
-    # reductions computed in parallel may yield different results
-    tm.assert_almost_equal(result, expected)
+    assert expected == result
 
 
 def test_value_counts_na():
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index b83e2d6e0e7f2..acd92d916bdcc 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -1243,8 +1243,7 @@ def test_nanops_independent_of_mask_param(operation):
     mask = ser.isna()
     median_expected = operation(ser._values)
     median_result = operation(ser._values, mask=mask._values)
-    # reductions computed in parallel may yield different results
-    tm.assert_almost_equal(median_result, median_expected)
+    assert median_expected == median_result
 
 
 @pytest.mark.parametrize("min_count", [-1, 0])