From e2110baaf557250a1611aa91024ad1d6f7ec7988 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Thu, 19 Mar 2026 23:31:45 -0300 Subject: [PATCH 1/4] refactor(var): use moments accumulator to compute variance --- pandas/_libs/algos.pxd | 9 ++ pandas/_libs/algos.pyi | 13 +++ pandas/_libs/algos.pyx | 50 ++++++++- pandas/_libs/groupby.pyx | 75 ++++--------- pandas/_libs/window/aggregations.pyx | 100 ++++++------------ pandas/core/nanops.py | 58 +++------- pandas/tests/frame/test_reductions.py | 9 +- .../tests/reductions/test_stat_reductions.py | 3 + 8 files changed, 151 insertions(+), 166 deletions(-) diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index e47f7d56ba343..28da52b9cfbb9 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -67,6 +67,15 @@ cdef inline float64_t calc_skew( return moments_ratio * correction +@cython.cdivision(True) +cdef inline float64_t calc_var( + int64_t nobs, float64_t m2, int64_t ddof, +) noexcept nogil: + if nobs < ddof: + return NAN + return m2 / (nobs - ddof) + + @cython.cdivision(True) cdef inline float64_t calc_kurt( int64_t nobs, float64_t m2, float64_t m4 diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index e06071d7bfeb3..d1d7a6030909f 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -138,6 +138,12 @@ def diff_2d( # Moments stats (skew, kurt) # ---------------------------------------------------------------------- +def scalar_var( + values: np.ndarray, # const float64_t[:] + skipna: bool, # bint + ddof: int, + mask: npt.NDArray[np.bool_] | None, # const uint8_t[:] +) -> float: ... def scalar_skew( values: np.ndarray, # const float64_t[:] skipna: bool, # bint @@ -148,6 +154,13 @@ def scalar_kurt( skipna: bool, # bint mask: npt.NDArray[np.bool_] | None, # const uint8_t[:] ) -> float: ... +def axis_var( + values: np.ndarray, # const float64_t[:, :] + axis: int, + skipna: bool, # bint + ddof: int, + mask: npt.NDArray[np.bool_] | None, # const uint8_t[:, :] +) -> np.ndarray: ... def axis_skew( values: np.ndarray, # const float64_t[:, :] axis: int, diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 9e24089dd48bd..739095addd427 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1597,7 +1597,7 @@ def diff_2d( # ---------------------------------------------------------------------- -# Moments stats (skew, kurt) +# Moments stats (var, skew, kurt) # ---------------------------------------------------------------------- @@ -1674,6 +1674,26 @@ cdef void accumulate_moments_axis( max_moment) +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.cdivision(True) +def scalar_var( + const float64_t[:] values, + bint skipna, + int64_t ddof, + const uint8_t[:] mask, +) -> float: + cdef: + int64_t nobs = 0 + float64_t mean = 0.0, m2 = 0.0 + + with nogil: + accumulate_moments_scalar(values, skipna, mask, + &nobs, &mean, &m2, NULL, NULL, 2) + + return calc_var(nobs, m2, ddof) + + @cython.boundscheck(False) @cython.wraparound(False) def scalar_skew( @@ -1708,6 +1728,34 @@ def scalar_kurt( return calc_kurt(nobs, m2, m4) +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.cdivision(True) +def axis_var( + const float64_t[:, :] values, + int axis, + bint skipna, + int64_t ddof, + const uint8_t[:, :] mask, +) -> ndarray: + cdef: + Py_ssize_t i + Py_ssize_t nouter = values.shape[1] if axis == 0 else values.shape[0] + int64_t[::1] nobs = np.zeros(nouter, dtype=np.int64) + float64_t[::1] mean = np.zeros(nouter) + float64_t[::1] m2 = np.zeros(nouter) + ndarray result_arr = np.empty(nouter, dtype=np.float64) + float64_t[:] result = result_arr + + with nogil: + accumulate_moments_axis(values, skipna, mask, + nobs, mean, m2, None, None, axis, 2) + for i in range(nouter): + result[i] = calc_var(nobs[i], m2[i], ddof) + + return result_arr + + @cython.boundscheck(False) @cython.wraparound(False) def axis_skew( diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 15abebef987ed..53b202f8d9539 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -36,6 +36,7 @@ from pandas._libs cimport util from pandas._libs.algos cimport ( calc_kurt, calc_skew, + calc_var, get_rank_nan_fill_val, kth_smallest_c, moments_add_value, @@ -922,11 +923,11 @@ def group_var( ) -> None: cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - floating val, ct, oldmean - floating[:, ::1] mean + float64_t val + float64_t[:, ::1] mean, M2 int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - bint isna_entry, isna_result, uses_mask = mask is not None + bint uses_mask = mask is not None bint is_std = name == "std" bint is_sem = name == "sem" @@ -936,7 +937,8 @@ def group_var( raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) - mean = np.zeros((out).shape, dtype=(out).base.dtype) + mean = np.zeros((out).shape, dtype=np.float64) + M2 = np.zeros((out).shape, dtype=np.float64) N, K = (values).shape @@ -953,61 +955,26 @@ def group_var( for j in range(K): val = values[i, j] - if uses_mask: - isna_entry = mask[i, j] - elif is_datetimelike: - # With group_var, we cannot just use _treat_as_na bc - # datetimelike dtypes get cast to float64 instead of - # to int64. - isna_entry = val == NPY_NAT - else: - isna_entry = _treat_as_na(val, is_datetimelike) - - if not skipna: - if uses_mask: - isna_result = result_mask[lab, j] - elif is_datetimelike: - # With group_var, we cannot just use _treat_as_na bc - # datetimelike dtypes get cast to float64 instead of - # to int64. - isna_result = out[lab, j] == NPY_NAT - else: - isna_result = _treat_as_na(out[lab, j], is_datetimelike) + if (uses_mask and mask[i, j]) or (is_datetimelike and val == NPY_NAT): + val = NaN - if isna_result: - # If aggregate is already NA, don't add to it. This is - # important for datetimelike because adding a value to NPY_NAT - # may not result in a NPY_NAT - continue + if skipna and isnan(val): + continue - if not isna_entry: - nobs[lab, j] += 1 - oldmean = mean[lab, j] - mean[lab, j] += (val - oldmean) / nobs[lab, j] - out[lab, j] += (val - mean[lab, j]) * (val - oldmean) - elif not skipna: - nobs[lab, j] = 0 - if uses_mask: - result_mask[lab, j] = True - else: - out[lab, j] = NAN + moments_add_value(val, &nobs[lab, j], &mean[lab, j], &M2[lab, j], + NULL, NULL, 2) for i in range(ncounts): for j in range(K): - ct = nobs[i, j] - if ct <= ddof: - if uses_mask: - result_mask[i, j] = True - else: - out[i, j] = NAN - else: - if is_std: - out[i, j] = sqrt(out[i, j] / (ct - ddof)) - elif is_sem: - out[i, j] = sqrt(out[i, j] / (ct - ddof) / ct) - else: - # just "var" - out[i, j] /= (ct - ddof) + out[i, j] = calc_var(nobs[i, j], M2[i, j], ddof) + + if result_mask is not None and isnan(out[i, j]): + result_mask[i, j] = 1 + + if is_std: + out[i, j] = sqrt(out[i, j]) + elif is_sem: + out[i, j] = sqrt(out[i, j] / nobs[i, j]) @cython.wraparound(False) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 619d5590cc207..f83f1c7b48721 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -12,6 +12,7 @@ from pandas._libs.algos cimport ( TiebreakEnumType, calc_kurt, calc_skew, + calc_var, moments_add_value, ) @@ -330,94 +331,58 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start, # Rolling variance -cdef float64_t calc_var( - int64_t minp, - int ddof, - float64_t nobs, - float64_t ssqdm_x, -) noexcept nogil: - cdef: - float64_t result - - # Variance is unchanged if no observation is added or removed - if (nobs >= minp) and (nobs > ddof): - result = ssqdm_x / (nobs - ddof) - else: - result = NaN - - return result - - cdef void add_var( float64_t val, - float64_t *nobs, - float64_t *mean_x, - float64_t *ssqdm_x, + int64_t *nobs, + float64_t *mean, + float64_t *m2, float64_t *compensation, bint *numerically_unstable, ) noexcept nogil: """ add a value from the var calc """ cdef: - float64_t delta, prev_mean, y, t - float64_t prev_m2 = ssqdm_x[0] + float64_t old_m2 = m2[0] - # GH#21813, if msvc 2017 bug is resolved, we should be OK with != instead of `isnan` - if val != val: - return - - nobs[0] = nobs[0] + 1 - - # Welford's method for the online variance-calculation - # using Kahan summation - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - prev_mean = mean_x[0] - compensation[0] - y = val - compensation[0] - t = y - mean_x[0] - compensation[0] = t + mean_x[0] - y - delta = t - if nobs[0]: - mean_x[0] = mean_x[0] + delta / nobs[0] - else: - mean_x[0] = 0 - ssqdm_x[0] = ssqdm_x[0] + (val - prev_mean) * (val - mean_x[0]) - - if prev_m2 * InvCondTol > ssqdm_x[0]: - # possible catastrophic cancellation - numerically_unstable[0] = True + # Not NaN + if val == val: + moments_add_value(val, nobs, mean, m2, NULL, NULL, 2) + if fabs(old_m2) * InvCondTol > fabs(m2[0]): + # possible catastrophic cancellation + numerically_unstable[0] = True cdef void remove_var( float64_t val, - float64_t *nobs, - float64_t *mean_x, - float64_t *ssqdm_x, + int64_t *nobs, + float64_t *mean, + float64_t *m2, float64_t *compensation, bint *numerically_unstable, ) noexcept nogil: """ remove a value from the var calc """ cdef: float64_t delta, prev_mean, y, t - float64_t prev_m2 = ssqdm_x[0] + float64_t prev_m2 = m2[0] if val == val: nobs[0] = nobs[0] - 1 if nobs[0]: # Welford's method for the online variance-calculation # using Kahan summation # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - prev_mean = mean_x[0] - compensation[0] + prev_mean = mean[0] - compensation[0] y = val - compensation[0] - t = y - mean_x[0] - compensation[0] = t + mean_x[0] - y + t = y - mean[0] + compensation[0] = t + mean[0] - y delta = t - mean_x[0] = mean_x[0] - delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] - (val - prev_mean) * (val - mean_x[0]) + mean[0] = mean[0] - delta / nobs[0] + m2[0] = m2[0] - (val - prev_mean) * (val - mean[0]) - if prev_m2 * InvCondTol > ssqdm_x[0]: + if prev_m2 * InvCondTol > m2[0]: # possible catastrophic cancellation numerically_unstable[0] = True else: - mean_x[0] = 0 - ssqdm_x[0] = 0 + mean[0] = 0 + m2[0] = 0 numerically_unstable[0] = False @@ -427,7 +392,8 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, Numerically stable implementation using Welford's method. """ cdef: - float64_t mean_x, ssqdm_x, nobs, compensation_add, + int64_t nobs + float64_t mean, m2, compensation_add, float64_t compensation_remove int64_t s, e Py_ssize_t i, j, N = len(start) @@ -462,28 +428,28 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, # calculate deletes for j in range(start[i - 1], s): - remove_var(values[j], &nobs, &mean_x, &ssqdm_x, + remove_var(values[j], &nobs, &mean, &m2, &compensation_remove, &numerically_unstable) # calculate adds for j in range(end[i - 1], e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add, + add_var(values[j], &nobs, &mean, &m2, &compensation_add, &numerically_unstable) if requires_recompute or numerically_unstable: - mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0 + mean = m2 = nobs = compensation_add = compensation_remove = 0 for j in range(s, e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add, + add_var(values[j], &nobs, &mean, &m2, &compensation_add, &numerically_unstable) numerically_unstable = False - output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + output[i] = NaN if nobs < minp else calc_var(nobs, m2, ddof) if not is_monotonic_increasing_bounds: - nobs = 0.0 - mean_x = 0.0 - ssqdm_x = 0.0 + nobs = 0 + mean = 0.0 + m2 = 0.0 compensation_remove = 0.0 return output diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 0b8cccb021cc2..6c79c7def9b9f 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -998,12 +998,7 @@ def nanvar( 1.0 """ dtype = values.dtype - mask = _maybe_get_mask(values, skipna, mask) - if dtype.kind in "iu": - values = values.astype("f8") - if mask is not None: - values[mask] = np.nan - elif dtype.kind == "c": + if dtype.kind == "c": # https://en.wikipedia.org/wiki/Complex_random_variable#Variance_and_pseudo-variance # The variance is equal to the sum of # the variances of the real and imaginary part of the complex random variable. @@ -1011,33 +1006,21 @@ def nanvar( values.real, axis=axis, skipna=skipna, ddof=ddof, mask=mask ) + nanvar(values.imag, axis=axis, skipna=skipna, ddof=ddof, mask=mask) - if values.dtype.kind == "f": - count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype) + values = ensure_float64(values) + result: npt.NDArray[np.floating] | np.floating + if axis is None or (values.ndim == 1 and axis == 0): + result_float = libalgos.scalar_var( + values.ravel("K"), + skipna, + ddof, + mask.ravel("K") if mask is not None else None, + ) + result = np.float64(result_float) + elif axis in {0, 1}: + result = libalgos.axis_var(values, axis, skipna, ddof, mask) else: - count, d = _get_counts_nanvar(values.shape, mask, axis, ddof) - - if skipna and mask is not None: - values = values.copy() - np.putmask(values, mask, 0) - - # xref GH10242 - # Compute variance via two-pass algorithm, which is stable against - # cancellation errors and relatively accurate for small numbers of - # observations. - # - # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count - if axis is not None: - avg = np.expand_dims(avg, axis) - - sqr = _ensure_numeric((avg - values) ** 2) - if mask is not None: - np.putmask(sqr, mask, 0) - result = sqr.sum(axis=axis, dtype=np.float64) / d + raise ValueError("axis must be 0, 1 or None") - # Return variance as np.float64 (the datatype used in the accumulator), - # unless we were dealing with a float array, in which case use the same - # precision as the original values array. if dtype.kind == "f": result = result.astype(dtype, copy=False) return result @@ -1079,22 +1062,15 @@ def nansem( >>> nanops.nansem(s.values) np.float64(0.5773502691896258) """ - # This checks if non-numeric-like data is passed with numeric_only=False - # and raises a TypeError otherwise - nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask) - mask = _maybe_get_mask(values, skipna, mask) # Convert to bottleneck return a float if values.dtype.kind not in "fc": values = values.astype("f8") - if not skipna and mask is not None and mask.any(): - return np.nan - - dtype_count = np.dtype(np.float64) + count_dtype = np.dtype(np.float64) if values.dtype.kind == "f": - dtype_count = values.dtype - count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, dtype_count) + count_dtype = values.dtype + count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, count_dtype) var = nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask) return np.sqrt(var) / np.sqrt(count) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 0a3950805301d..7d41d0df9ad40 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -568,9 +568,12 @@ def test_numeric_only_flag(self, meth): tm.assert_series_equal(expected, result) # df1 has all numbers, df2 has a letter inside - msg = r"unsupported operand type\(s\) for -: 'float' and 'str'" - with pytest.raises(TypeError, match=msg): - getattr(df1, meth)(axis=1, numeric_only=False) + + # The 100 string in df1 will be converted to its float representation + # instead of doing a preemptive search to raise an error. + result = getattr(df1, meth)(axis=1, numeric_only=False) + df1_float = df1.astype("f8") + expected = getattr(df1_float, meth)(axis=1, numeric_only=False) msg = "could not convert string to float: 'a'" with pytest.raises(TypeError, match=msg): getattr(df2, meth)(axis=1, numeric_only=False) diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 4312888733040..4157041041841 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -277,6 +277,9 @@ def test_kurt_corner(self, using_python_scalars): @pytest.mark.parametrize( "opname", [ + "std", + "sem", + "var", "skew", "kurt", ], From e857f810be1b89d54c57f4dc9e9ee3ebcff32adf Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sat, 4 Apr 2026 19:00:25 -0300 Subject: [PATCH 2/4] perf: undo nanops changes --- pandas/_libs/algos.pyi | 13 ------ pandas/_libs/algos.pyx | 50 +---------------------- pandas/core/nanops.py | 58 +++++++++++++++++++-------- pandas/tests/frame/test_reductions.py | 9 ++--- 4 files changed, 45 insertions(+), 85 deletions(-) diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index d1d7a6030909f..e06071d7bfeb3 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -138,12 +138,6 @@ def diff_2d( # Moments stats (skew, kurt) # ---------------------------------------------------------------------- -def scalar_var( - values: np.ndarray, # const float64_t[:] - skipna: bool, # bint - ddof: int, - mask: npt.NDArray[np.bool_] | None, # const uint8_t[:] -) -> float: ... def scalar_skew( values: np.ndarray, # const float64_t[:] skipna: bool, # bint @@ -154,13 +148,6 @@ def scalar_kurt( skipna: bool, # bint mask: npt.NDArray[np.bool_] | None, # const uint8_t[:] ) -> float: ... -def axis_var( - values: np.ndarray, # const float64_t[:, :] - axis: int, - skipna: bool, # bint - ddof: int, - mask: npt.NDArray[np.bool_] | None, # const uint8_t[:, :] -) -> np.ndarray: ... def axis_skew( values: np.ndarray, # const float64_t[:, :] axis: int, diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 739095addd427..9e24089dd48bd 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1597,7 +1597,7 @@ def diff_2d( # ---------------------------------------------------------------------- -# Moments stats (var, skew, kurt) +# Moments stats (skew, kurt) # ---------------------------------------------------------------------- @@ -1674,26 +1674,6 @@ cdef void accumulate_moments_axis( max_moment) -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.cdivision(True) -def scalar_var( - const float64_t[:] values, - bint skipna, - int64_t ddof, - const uint8_t[:] mask, -) -> float: - cdef: - int64_t nobs = 0 - float64_t mean = 0.0, m2 = 0.0 - - with nogil: - accumulate_moments_scalar(values, skipna, mask, - &nobs, &mean, &m2, NULL, NULL, 2) - - return calc_var(nobs, m2, ddof) - - @cython.boundscheck(False) @cython.wraparound(False) def scalar_skew( @@ -1728,34 +1708,6 @@ def scalar_kurt( return calc_kurt(nobs, m2, m4) -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.cdivision(True) -def axis_var( - const float64_t[:, :] values, - int axis, - bint skipna, - int64_t ddof, - const uint8_t[:, :] mask, -) -> ndarray: - cdef: - Py_ssize_t i - Py_ssize_t nouter = values.shape[1] if axis == 0 else values.shape[0] - int64_t[::1] nobs = np.zeros(nouter, dtype=np.int64) - float64_t[::1] mean = np.zeros(nouter) - float64_t[::1] m2 = np.zeros(nouter) - ndarray result_arr = np.empty(nouter, dtype=np.float64) - float64_t[:] result = result_arr - - with nogil: - accumulate_moments_axis(values, skipna, mask, - nobs, mean, m2, None, None, axis, 2) - for i in range(nouter): - result[i] = calc_var(nobs[i], m2[i], ddof) - - return result_arr - - @cython.boundscheck(False) @cython.wraparound(False) def axis_skew( diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 6c79c7def9b9f..0b8cccb021cc2 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -998,7 +998,12 @@ def nanvar( 1.0 """ dtype = values.dtype - if dtype.kind == "c": + mask = _maybe_get_mask(values, skipna, mask) + if dtype.kind in "iu": + values = values.astype("f8") + if mask is not None: + values[mask] = np.nan + elif dtype.kind == "c": # https://en.wikipedia.org/wiki/Complex_random_variable#Variance_and_pseudo-variance # The variance is equal to the sum of # the variances of the real and imaginary part of the complex random variable. @@ -1006,21 +1011,33 @@ def nanvar( values.real, axis=axis, skipna=skipna, ddof=ddof, mask=mask ) + nanvar(values.imag, axis=axis, skipna=skipna, ddof=ddof, mask=mask) - values = ensure_float64(values) - result: npt.NDArray[np.floating] | np.floating - if axis is None or (values.ndim == 1 and axis == 0): - result_float = libalgos.scalar_var( - values.ravel("K"), - skipna, - ddof, - mask.ravel("K") if mask is not None else None, - ) - result = np.float64(result_float) - elif axis in {0, 1}: - result = libalgos.axis_var(values, axis, skipna, ddof, mask) + if values.dtype.kind == "f": + count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype) else: - raise ValueError("axis must be 0, 1 or None") + count, d = _get_counts_nanvar(values.shape, mask, axis, ddof) + + if skipna and mask is not None: + values = values.copy() + np.putmask(values, mask, 0) + + # xref GH10242 + # Compute variance via two-pass algorithm, which is stable against + # cancellation errors and relatively accurate for small numbers of + # observations. + # + # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count + if axis is not None: + avg = np.expand_dims(avg, axis) + + sqr = _ensure_numeric((avg - values) ** 2) + if mask is not None: + np.putmask(sqr, mask, 0) + result = sqr.sum(axis=axis, dtype=np.float64) / d + # Return variance as np.float64 (the datatype used in the accumulator), + # unless we were dealing with a float array, in which case use the same + # precision as the original values array. if dtype.kind == "f": result = result.astype(dtype, copy=False) return result @@ -1062,15 +1079,22 @@ def nansem( >>> nanops.nansem(s.values) np.float64(0.5773502691896258) """ + # This checks if non-numeric-like data is passed with numeric_only=False + # and raises a TypeError otherwise + nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask) + mask = _maybe_get_mask(values, skipna, mask) # Convert to bottleneck return a float if values.dtype.kind not in "fc": values = values.astype("f8") - count_dtype = np.dtype(np.float64) + if not skipna and mask is not None and mask.any(): + return np.nan + + dtype_count = np.dtype(np.float64) if values.dtype.kind == "f": - count_dtype = values.dtype - count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, count_dtype) + dtype_count = values.dtype + count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, dtype_count) var = nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask) return np.sqrt(var) / np.sqrt(count) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 7d41d0df9ad40..0a3950805301d 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -568,12 +568,9 @@ def test_numeric_only_flag(self, meth): tm.assert_series_equal(expected, result) # df1 has all numbers, df2 has a letter inside - - # The 100 string in df1 will be converted to its float representation - # instead of doing a preemptive search to raise an error. - result = getattr(df1, meth)(axis=1, numeric_only=False) - df1_float = df1.astype("f8") - expected = getattr(df1_float, meth)(axis=1, numeric_only=False) + msg = r"unsupported operand type\(s\) for -: 'float' and 'str'" + with pytest.raises(TypeError, match=msg): + getattr(df1, meth)(axis=1, numeric_only=False) msg = "could not convert string to float: 'a'" with pytest.raises(TypeError, match=msg): getattr(df2, meth)(axis=1, numeric_only=False) From 8d1a5d7d39a3946578ba0e0102901b1b70f5d05c Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sat, 25 Apr 2026 01:02:49 -0300 Subject: [PATCH 3/4] fix: use leq for ddof to avoid inf results --- pandas/_libs/algos.pxd | 2 +- pandas/tests/reductions/test_stat_reductions.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index 28da52b9cfbb9..c9f7afe778e37 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -71,7 +71,7 @@ cdef inline float64_t calc_skew( cdef inline float64_t calc_var( int64_t nobs, float64_t m2, int64_t ddof, ) noexcept nogil: - if nobs < ddof: + if nobs <= ddof: return NAN return m2 / (nobs - ddof) diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 4157041041841..e9e0dc070e4f3 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -308,3 +308,18 @@ def test_reduction_consistency(opname, loc, scale): result_frame = getattr(DataFrame(s), opname)().iloc[0] tm.assert_almost_equal(result_series, result_frame) + + +@pytest.mark.parametrize("opname", ["var", "std", "sem"]) +def test_stat_nobs_equal_ddof(opname): + s = Series([1.0, 2.0, 3.0]) + ddof = 3 + + result_series = getattr(s, opname)(ddof=ddof) + assert np.isnan(result_series) + + result_gb = getattr(s.groupby([0, 0, 0]), opname)(ddof=ddof).iloc[0] + assert np.isnan(result_gb) + + result_window = getattr(s.rolling(3, min_periods=3), opname)(ddof=ddof).iloc[2] + assert np.isnan(result_window) From 77a790fc36255d783d5cd7ca5cf2a9d99b37cadb Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sat, 25 Apr 2026 11:25:24 -0300 Subject: [PATCH 4/4] fix: revert na handling --- pandas/_libs/groupby.pyx | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 53b202f8d9539..2a76b2e8b7ed5 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -927,7 +927,7 @@ def group_var( float64_t[:, ::1] mean, M2 int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - bint uses_mask = mask is not None + bint isna_entry, isna_result, uses_mask = mask is not None bint is_std = name == "std" bint is_sem = name == "sem" @@ -955,11 +955,36 @@ def group_var( for j in range(K): val = values[i, j] - if (uses_mask and mask[i, j]) or (is_datetimelike and val == NPY_NAT): - val = NaN + if uses_mask: + isna_entry = mask[i, j] + elif is_datetimelike: + # With group_var, we cannot just use _treat_as_na bc + # datetimelike dtypes get cast to float64 instead of + # to int64. + isna_entry = val == NPY_NAT + else: + isna_entry = _treat_as_na(val, is_datetimelike) - if skipna and isnan(val): - continue + if not skipna: + if uses_mask: + isna_result = result_mask[lab, j] + elif is_datetimelike: + # With group_var, we cannot just use _treat_as_na bc + # datetimelike dtypes get cast to float64 instead of + # to int64. + isna_result = out[lab, j] == NPY_NAT + else: + isna_result = _treat_as_na(out[lab, j], is_datetimelike) + + if isna_result: + # If aggregate is already NA, don't add to it. This is + # important for datetimelike because adding a value to NPY_NAT + # may not result in a NPY_NAT + continue + if isna_entry: + if skipna: + continue + val = NaN moments_add_value(val, &nobs[lab, j], &mean[lab, j], &M2[lab, j], NULL, NULL, 2)