diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index e47f7d56ba343..c9f7afe778e37 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -67,6 +67,15 @@ cdef inline float64_t calc_skew( return moments_ratio * correction +@cython.cdivision(True) +cdef inline float64_t calc_var( + int64_t nobs, float64_t m2, int64_t ddof, +) noexcept nogil: + if nobs <= ddof: + return NAN + return m2 / (nobs - ddof) + + @cython.cdivision(True) cdef inline float64_t calc_kurt( int64_t nobs, float64_t m2, float64_t m4 diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 15abebef987ed..2a76b2e8b7ed5 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -36,6 +36,7 @@ from pandas._libs cimport util from pandas._libs.algos cimport ( calc_kurt, calc_skew, + calc_var, get_rank_nan_fill_val, kth_smallest_c, moments_add_value, @@ -922,8 +923,8 @@ def group_var( ) -> None: cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - floating val, ct, oldmean - floating[:, ::1] mean + float64_t val + float64_t[:, ::1] mean, M2 int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) bint isna_entry, isna_result, uses_mask = mask is not None @@ -936,7 +937,8 @@ def group_var( raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) - mean = np.zeros((out).shape, dtype=(out).base.dtype) + mean = np.zeros((out).shape, dtype=np.float64) + M2 = np.zeros((out).shape, dtype=np.float64) N, K = (values).shape @@ -979,35 +981,25 @@ def group_var( # important for datetimelike because adding a value to NPY_NAT # may not result in a NPY_NAT continue + if isna_entry: + if skipna: + continue + val = NaN - if not isna_entry: - nobs[lab, j] += 1 - oldmean = mean[lab, j] - mean[lab, j] += (val - oldmean) / nobs[lab, j] - out[lab, j] += (val - mean[lab, j]) * (val - oldmean) - elif not skipna: - nobs[lab, j] = 0 - if uses_mask: - result_mask[lab, j] = True - else: - out[lab, j] = NAN + moments_add_value(val, &nobs[lab, j], &mean[lab, j], &M2[lab, j], + NULL, NULL, 2) for i in range(ncounts): for j in range(K): - ct = nobs[i, j] - if ct <= ddof: - if uses_mask: - result_mask[i, j] = True - else: - out[i, j] = NAN - else: - if is_std: - out[i, j] = sqrt(out[i, j] / (ct - ddof)) - elif is_sem: - out[i, j] = sqrt(out[i, j] / (ct - ddof) / ct) - else: - # just "var" - out[i, j] /= (ct - ddof) + out[i, j] = calc_var(nobs[i, j], M2[i, j], ddof) + + if result_mask is not None and isnan(out[i, j]): + result_mask[i, j] = 1 + + if is_std: + out[i, j] = sqrt(out[i, j]) + elif is_sem: + out[i, j] = sqrt(out[i, j] / nobs[i, j]) @cython.wraparound(False) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 619d5590cc207..f83f1c7b48721 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -12,6 +12,7 @@ from pandas._libs.algos cimport ( TiebreakEnumType, calc_kurt, calc_skew, + calc_var, moments_add_value, ) @@ -330,94 +331,58 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start, # Rolling variance -cdef float64_t calc_var( - int64_t minp, - int ddof, - float64_t nobs, - float64_t ssqdm_x, -) noexcept nogil: - cdef: - float64_t result - - # Variance is unchanged if no observation is added or removed - if (nobs >= minp) and (nobs > ddof): - result = ssqdm_x / (nobs - ddof) - else: - result = NaN - - return result - - cdef void add_var( float64_t val, - float64_t *nobs, - float64_t *mean_x, - float64_t *ssqdm_x, + int64_t *nobs, + float64_t *mean, + float64_t *m2, float64_t *compensation, bint *numerically_unstable, ) noexcept nogil: """ add a value from the var calc """ cdef: - float64_t delta, prev_mean, y, t - float64_t prev_m2 = ssqdm_x[0] + float64_t old_m2 = m2[0] - # GH#21813, if msvc 2017 bug is resolved, we should be OK with != instead of `isnan` - if val != val: - return - - nobs[0] = nobs[0] + 1 - - # Welford's method for the online variance-calculation - # using Kahan summation - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - prev_mean = mean_x[0] - compensation[0] - y = val - compensation[0] - t = y - mean_x[0] - compensation[0] = t + mean_x[0] - y - delta = t - if nobs[0]: - mean_x[0] = mean_x[0] + delta / nobs[0] - else: - mean_x[0] = 0 - ssqdm_x[0] = ssqdm_x[0] + (val - prev_mean) * (val - mean_x[0]) - - if prev_m2 * InvCondTol > ssqdm_x[0]: - # possible catastrophic cancellation - numerically_unstable[0] = True + # Not NaN + if val == val: + moments_add_value(val, nobs, mean, m2, NULL, NULL, 2) + if fabs(old_m2) * InvCondTol > fabs(m2[0]): + # possible catastrophic cancellation + numerically_unstable[0] = True cdef void remove_var( float64_t val, - float64_t *nobs, - float64_t *mean_x, - float64_t *ssqdm_x, + int64_t *nobs, + float64_t *mean, + float64_t *m2, float64_t *compensation, bint *numerically_unstable, ) noexcept nogil: """ remove a value from the var calc """ cdef: float64_t delta, prev_mean, y, t - float64_t prev_m2 = ssqdm_x[0] + float64_t prev_m2 = m2[0] if val == val: nobs[0] = nobs[0] - 1 if nobs[0]: # Welford's method for the online variance-calculation # using Kahan summation # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - prev_mean = mean_x[0] - compensation[0] + prev_mean = mean[0] - compensation[0] y = val - compensation[0] - t = y - mean_x[0] - compensation[0] = t + mean_x[0] - y + t = y - mean[0] + compensation[0] = t + mean[0] - y delta = t - mean_x[0] = mean_x[0] - delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] - (val - prev_mean) * (val - mean_x[0]) + mean[0] = mean[0] - delta / nobs[0] + m2[0] = m2[0] - (val - prev_mean) * (val - mean[0]) - if prev_m2 * InvCondTol > ssqdm_x[0]: + if prev_m2 * InvCondTol > m2[0]: # possible catastrophic cancellation numerically_unstable[0] = True else: - mean_x[0] = 0 - ssqdm_x[0] = 0 + mean[0] = 0 + m2[0] = 0 numerically_unstable[0] = False @@ -427,7 +392,8 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, Numerically stable implementation using Welford's method. """ cdef: - float64_t mean_x, ssqdm_x, nobs, compensation_add, + int64_t nobs + float64_t mean, m2, compensation_add, float64_t compensation_remove int64_t s, e Py_ssize_t i, j, N = len(start) @@ -462,28 +428,28 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, # calculate deletes for j in range(start[i - 1], s): - remove_var(values[j], &nobs, &mean_x, &ssqdm_x, + remove_var(values[j], &nobs, &mean, &m2, &compensation_remove, &numerically_unstable) # calculate adds for j in range(end[i - 1], e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add, + add_var(values[j], &nobs, &mean, &m2, &compensation_add, &numerically_unstable) if requires_recompute or numerically_unstable: - mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0 + mean = m2 = nobs = compensation_add = compensation_remove = 0 for j in range(s, e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add, + add_var(values[j], &nobs, &mean, &m2, &compensation_add, &numerically_unstable) numerically_unstable = False - output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + output[i] = NaN if nobs < minp else calc_var(nobs, m2, ddof) if not is_monotonic_increasing_bounds: - nobs = 0.0 - mean_x = 0.0 - ssqdm_x = 0.0 + nobs = 0 + mean = 0.0 + m2 = 0.0 compensation_remove = 0.0 return output diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 4312888733040..e9e0dc070e4f3 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -277,6 +277,9 @@ def test_kurt_corner(self, using_python_scalars): @pytest.mark.parametrize( "opname", [ + "std", + "sem", + "var", "skew", "kurt", ], @@ -305,3 +308,18 @@ def test_reduction_consistency(opname, loc, scale): result_frame = getattr(DataFrame(s), opname)().iloc[0] tm.assert_almost_equal(result_series, result_frame) + + +@pytest.mark.parametrize("opname", ["var", "std", "sem"]) +def test_stat_nobs_equal_ddof(opname): + s = Series([1.0, 2.0, 3.0]) + ddof = 3 + + result_series = getattr(s, opname)(ddof=ddof) + assert np.isnan(result_series) + + result_gb = getattr(s.groupby([0, 0, 0]), opname)(ddof=ddof).iloc[0] + assert np.isnan(result_gb) + + result_window = getattr(s.rolling(3, min_periods=3), opname)(ddof=ddof).iloc[2] + assert np.isnan(result_window)