Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions pandas/_libs/algos.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,15 @@ cdef inline float64_t calc_skew(
return moments_ratio * correction


@cython.cdivision(True)
cdef inline float64_t calc_var(
int64_t nobs, float64_t m2, int64_t ddof,
) noexcept nogil:
if nobs <= ddof:
return NAN
return m2 / <float64_t>(nobs - ddof)


@cython.cdivision(True)
cdef inline float64_t calc_kurt(
int64_t nobs, float64_t m2, float64_t m4
Expand Down
48 changes: 20 additions & 28 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ from pandas._libs cimport util
from pandas._libs.algos cimport (
calc_kurt,
calc_skew,
calc_var,
get_rank_nan_fill_val,
kth_smallest_c,
moments_add_value,
Expand Down Expand Up @@ -922,8 +923,8 @@ def group_var(
) -> None:
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
floating val, ct, oldmean
floating[:, ::1] mean
float64_t val
float64_t[:, ::1] mean, M2
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
bint isna_entry, isna_result, uses_mask = mask is not None
Expand All @@ -936,7 +937,8 @@ def group_var(
raise ValueError("len(index) != len(labels)")

nobs = np.zeros((<object>out).shape, dtype=np.int64)
mean = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
mean = np.zeros((<object>out).shape, dtype=np.float64)
M2 = np.zeros((<object>out).shape, dtype=np.float64)

N, K = (<object>values).shape

Expand Down Expand Up @@ -979,35 +981,25 @@ def group_var(
# important for datetimelike because adding a value to NPY_NAT
# may not result in a NPY_NAT
continue
if isna_entry:
if skipna:
continue
val = NaN

if not isna_entry:
nobs[lab, j] += 1
oldmean = mean[lab, j]
mean[lab, j] += (val - oldmean) / nobs[lab, j]
out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
elif not skipna:
nobs[lab, j] = 0
if uses_mask:
result_mask[lab, j] = True
else:
out[lab, j] = NAN
moments_add_value(val, &nobs[lab, j], &mean[lab, j], &M2[lab, j],
NULL, NULL, 2)

for i in range(ncounts):
for j in range(K):
ct = nobs[i, j]
if ct <= ddof:
if uses_mask:
result_mask[i, j] = True
else:
out[i, j] = NAN
else:
if is_std:
out[i, j] = sqrt(out[i, j] / (ct - ddof))
elif is_sem:
out[i, j] = sqrt(out[i, j] / (ct - ddof) / ct)
else:
# just "var"
out[i, j] /= (ct - ddof)
out[i, j] = calc_var(nobs[i, j], M2[i, j], ddof)

if result_mask is not None and isnan(out[i, j]):
result_mask[i, j] = 1

if is_std:
out[i, j] = sqrt(out[i, j])
elif is_sem:
out[i, j] = sqrt(out[i, j] / <float64_t>nobs[i, j])


@cython.wraparound(False)
Expand Down
100 changes: 33 additions & 67 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ from pandas._libs.algos cimport (
TiebreakEnumType,
calc_kurt,
calc_skew,
calc_var,
moments_add_value,
)

Expand Down Expand Up @@ -330,94 +331,58 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
# Rolling variance


cdef float64_t calc_var(
int64_t minp,
int ddof,
float64_t nobs,
float64_t ssqdm_x,
) noexcept nogil:
cdef:
float64_t result

# Variance is unchanged if no observation is added or removed
if (nobs >= minp) and (nobs > ddof):
result = ssqdm_x / (nobs - <float64_t>ddof)
else:
result = NaN

return result


cdef void add_var(
float64_t val,
float64_t *nobs,
float64_t *mean_x,
float64_t *ssqdm_x,
int64_t *nobs,
float64_t *mean,
float64_t *m2,
float64_t *compensation,
bint *numerically_unstable,
) noexcept nogil:
""" add a value from the var calc """
cdef:
float64_t delta, prev_mean, y, t
float64_t prev_m2 = ssqdm_x[0]
float64_t old_m2 = m2[0]

# GH#21813, if msvc 2017 bug is resolved, we should be OK with != instead of `isnan`
if val != val:
return

nobs[0] = nobs[0] + 1

# Welford's method for the online variance-calculation
# using Kahan summation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
prev_mean = mean_x[0] - compensation[0]
y = val - compensation[0]
t = y - mean_x[0]
compensation[0] = t + mean_x[0] - y
delta = t
if nobs[0]:
mean_x[0] = mean_x[0] + delta / nobs[0]
else:
mean_x[0] = 0
ssqdm_x[0] = ssqdm_x[0] + (val - prev_mean) * (val - mean_x[0])

if prev_m2 * InvCondTol > ssqdm_x[0]:
# possible catastrophic cancellation
numerically_unstable[0] = True
# Not NaN
if val == val:
moments_add_value(val, nobs, mean, m2, NULL, NULL, 2)
if fabs(old_m2) * InvCondTol > fabs(m2[0]):
# possible catastrophic cancellation
numerically_unstable[0] = True


cdef void remove_var(
float64_t val,
float64_t *nobs,
float64_t *mean_x,
float64_t *ssqdm_x,
int64_t *nobs,
float64_t *mean,
float64_t *m2,
float64_t *compensation,
bint *numerically_unstable,
) noexcept nogil:
""" remove a value from the var calc """
cdef:
float64_t delta, prev_mean, y, t
float64_t prev_m2 = ssqdm_x[0]
float64_t prev_m2 = m2[0]
if val == val:
nobs[0] = nobs[0] - 1
if nobs[0]:
# Welford's method for the online variance-calculation
# using Kahan summation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
prev_mean = mean_x[0] - compensation[0]
prev_mean = mean[0] - compensation[0]
y = val - compensation[0]
t = y - mean_x[0]
compensation[0] = t + mean_x[0] - y
t = y - mean[0]
compensation[0] = t + mean[0] - y
delta = t
mean_x[0] = mean_x[0] - delta / nobs[0]
ssqdm_x[0] = ssqdm_x[0] - (val - prev_mean) * (val - mean_x[0])
mean[0] = mean[0] - delta / nobs[0]
m2[0] = m2[0] - (val - prev_mean) * (val - mean[0])

if prev_m2 * InvCondTol > ssqdm_x[0]:
if prev_m2 * InvCondTol > m2[0]:
# possible catastrophic cancellation
numerically_unstable[0] = True
else:
mean_x[0] = 0
ssqdm_x[0] = 0
mean[0] = 0
m2[0] = 0
numerically_unstable[0] = False


Expand All @@ -427,7 +392,8 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
Numerically stable implementation using Welford's method.
"""
cdef:
float64_t mean_x, ssqdm_x, nobs, compensation_add,
int64_t nobs
float64_t mean, m2, compensation_add,
float64_t compensation_remove
int64_t s, e
Py_ssize_t i, j, N = len(start)
Expand Down Expand Up @@ -462,28 +428,28 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,

# calculate deletes
for j in range(start[i - 1], s):
remove_var(values[j], &nobs, &mean_x, &ssqdm_x,
remove_var(values[j], &nobs, &mean, &m2,
&compensation_remove, &numerically_unstable)

# calculate adds
for j in range(end[i - 1], e):
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
add_var(values[j], &nobs, &mean, &m2, &compensation_add,
&numerically_unstable)

if requires_recompute or numerically_unstable:

mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0
mean = m2 = nobs = compensation_add = compensation_remove = 0
for j in range(s, e):
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
add_var(values[j], &nobs, &mean, &m2, &compensation_add,
&numerically_unstable)
numerically_unstable = False

output[i] = calc_var(minp, ddof, nobs, ssqdm_x)
output[i] = NaN if nobs < minp else calc_var(nobs, m2, ddof)

if not is_monotonic_increasing_bounds:
nobs = 0.0
mean_x = 0.0
ssqdm_x = 0.0
nobs = 0
mean = 0.0
m2 = 0.0
compensation_remove = 0.0

return output
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/reductions/test_stat_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,9 @@ def test_kurt_corner(self, using_python_scalars):
@pytest.mark.parametrize(
"opname",
[
"std",
"sem",
"var",
"skew",
"kurt",
],
Expand Down Expand Up @@ -305,3 +308,18 @@ def test_reduction_consistency(opname, loc, scale):

result_frame = getattr(DataFrame(s), opname)().iloc[0]
tm.assert_almost_equal(result_series, result_frame)


@pytest.mark.parametrize("opname", ["var", "std", "sem"])
def test_stat_nobs_equal_ddof(opname):
s = Series([1.0, 2.0, 3.0])
ddof = 3

result_series = getattr(s, opname)(ddof=ddof)
assert np.isnan(result_series)

result_gb = getattr(s.groupby([0, 0, 0]), opname)(ddof=ddof).iloc[0]
assert np.isnan(result_gb)

result_window = getattr(s.rolling(3, min_periods=3), opname)(ddof=ddof).iloc[2]
assert np.isnan(result_window)
Loading