-
- Notifications
You must be signed in to change notification settings - Fork 19.2k
BUG: fix polluted window in skewness computation #62863
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 7 commits
7401a36 23345cf 488e99e ff9108d a2d0937 8020107 edc2393 ae1f3f9 5cc8b60 7f1d2a3 0aa2949 6f21167 cca03e9 6177d6a 9d9451d 74aba02 a1fb800 File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,7 @@ | ||
| # cython: boundscheck=False, wraparound=False, cdivision=True | ||
| | ||
| from libc.math cimport ( | ||
| fabs, | ||
| round, | ||
| signbit, | ||
| sqrt, | ||
| | @@ -482,196 +483,150 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, | |
| | ||
| | ||
| cdef float64_t calc_skew(int64_t minp, int64_t nobs, | ||
| float64_t x, float64_t xx, float64_t xxx, | ||
| int64_t num_consecutive_same_value | ||
| float64_t mean, float64_t m2, float64_t m3, | ||
| ) noexcept nogil: | ||
| cdef: | ||
| float64_t result, dnobs | ||
| float64_t A, B, C, R | ||
| float64_t moments_ratio, correction | ||
| | ||
| if nobs >= minp: | ||
| dnobs = <float64_t>nobs | ||
| A = x / dnobs | ||
| B = xx / dnobs - A * A | ||
| C = xxx / dnobs - A * A * A - 3 * A * B | ||
| | ||
| if nobs < 3: | ||
| result = NaN | ||
| # GH 42064 46431 | ||
| # uniform case, force result to be 0 | ||
| elif num_consecutive_same_value >= nobs: | ||
| result = 0.0 | ||
| # #18044: with uniform distribution, floating issue will | ||
| # cause B != 0. and cause the result is a very | ||
| # #18044: with degenerate distribution, floating issue will | ||
| # cause m2 != 0. and cause the result is a very | ||
| # large number. | ||
| # | ||
| # in core/nanops.py nanskew/nankurt call the function | ||
| # _zero_out_fperr(m2) to fix floating error. | ||
| # if the variance is less than 1e-14, it could be | ||
| # treat as zero, here we follow the original | ||
| # skew/kurt behaviour to check B <= 1e-14 | ||
| elif B <= 1e-14: | ||
| # skew/kurt behaviour to check m2 < n * (eps * eps * mean * mean) | ||
| elif m2 < dnobs * (1e-28 * mean * mean if fabs(mean) > 1e-14 else 1e-14): | ||
| result = NaN | ||
| else: | ||
| R = sqrt(B) | ||
| result = ((sqrt(dnobs * (dnobs - 1.)) * C) / | ||
| ((dnobs - 2) * R * R * R)) | ||
| moments_ratio = m3 / (m2 * sqrt(m2)) | ||
| correction = dnobs * sqrt((dnobs - 1)) / (dnobs - 2) | ||
| result = moments_ratio * correction | ||
| else: | ||
| result = NaN | ||
| | ||
| return result | ||
| | ||
| | ||
| cdef void add_skew(float64_t val, int64_t *nobs, | ||
| float64_t *x, float64_t *xx, | ||
| float64_t *xxx, | ||
| float64_t *compensation_x, | ||
| float64_t *compensation_xx, | ||
| float64_t *compensation_xxx, | ||
| int64_t *num_consecutive_same_value, | ||
| float64_t *prev_value, | ||
| float64_t *mean, float64_t *m2, | ||
| float64_t *m3, | ||
| bint *numerically_unstable | ||
| ) noexcept nogil: | ||
| """ add a value from the skew calc """ | ||
| cdef: | ||
| float64_t y, t | ||
| float64_t n, delta, delta_n, term1, m3_update, new_m3 | ||
| | ||
| # Not NaN | ||
| if val == val: | ||
| nobs[0] = nobs[0] + 1 | ||
| nobs[0] += 1 | ||
| n = <float64_t>(nobs[0]) | ||
| delta = val - mean[0] | ||
| delta_n = delta / n | ||
| term1 = delta * delta_n * (n - 1.0) | ||
| | ||
| y = val - compensation_x[0] | ||
| t = x[0] + y | ||
| compensation_x[0] = t - x[0] - y | ||
| x[0] = t | ||
| y = val * val - compensation_xx[0] | ||
| t = xx[0] + y | ||
| compensation_xx[0] = t - xx[0] - y | ||
| xx[0] = t | ||
| y = val * val * val - compensation_xxx[0] | ||
| t = xxx[0] + y | ||
| compensation_xxx[0] = t - xxx[0] - y | ||
| xxx[0] = t | ||
| m3_update = delta_n * (term1 * (n - 2.0) - 3.0 * m2[0]) | ||
| new_m3 = m3[0] + m3_update | ||
| if fabs(m3_update) + fabs(m3[0]) > 1e10 * fabs(new_m3): | ||
| ||
| # possible catastrophic cancellation | ||
| numerically_unstable[0] = True | ||
| | ||
| # GH#42064, record num of same values to remove floating point artifacts | ||
| if val == prev_value[0]: | ||
| num_consecutive_same_value[0] += 1 | ||
| else: | ||
| # reset to 1 (include current value itself) | ||
| num_consecutive_same_value[0] = 1 | ||
| prev_value[0] = val | ||
| m3[0] = new_m3 | ||
| m2[0] += term1 | ||
| mean[0] += delta_n | ||
| | ||
| | ||
| cdef void remove_skew(float64_t val, int64_t *nobs, | ||
| float64_t *x, float64_t *xx, | ||
| float64_t *xxx, | ||
| float64_t *compensation_x, | ||
| float64_t *compensation_xx, | ||
| float64_t *compensation_xxx) noexcept nogil: | ||
| float64_t *mean, float64_t *m2, | ||
| float64_t *m3, | ||
| bint *numerically_unstable) noexcept nogil: | ||
| """ remove a value from the skew calc """ | ||
| cdef: | ||
| float64_t y, t | ||
| float64_t n, delta, delta_n, term1, m3_update, new_m3 | ||
| | ||
| # Not NaN | ||
| if val == val: | ||
| nobs[0] = nobs[0] - 1 | ||
| nobs[0] -= 1 | ||
| n = <float64_t>(nobs[0]) | ||
| delta = val - mean[0] | ||
| delta_n = delta / n | ||
| term1 = delta_n * delta * (n + 1.0) | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Might be interesting to use fused add multiply There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I couldn't figure out an effective way to reformulate the computation of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the latest commit, I had a performance regression against the main branch. | ||
| | ||
| y = - val - compensation_x[0] | ||
| t = x[0] + y | ||
| compensation_x[0] = t - x[0] - y | ||
| x[0] = t | ||
| y = - val * val - compensation_xx[0] | ||
| t = xx[0] + y | ||
| compensation_xx[0] = t - xx[0] - y | ||
| xx[0] = t | ||
| y = - val * val * val - compensation_xxx[0] | ||
| t = xxx[0] + y | ||
| compensation_xxx[0] = t - xxx[0] - y | ||
| xxx[0] = t | ||
| m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * m2[0]) | ||
| new_m3 = m3[0] - m3_update | ||
| | ||
| if fabs(m3_update) + fabs(m3[0]) > 1e10 * fabs(new_m3): | ||
| # possible catastrophic cancellation | ||
| numerically_unstable[0] = True | ||
| | ||
| m3[0] = new_m3 | ||
| m2[0] -= term1 | ||
| mean[0] -= delta_n | ||
| | ||
| | ||
| def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, | ||
| ndarray[int64_t] end, int64_t minp) -> np.ndarray: | ||
| cdef: | ||
| Py_ssize_t i, j | ||
| float64_t val, min_val, mean_val, sum_val = 0 | ||
| float64_t compensation_xxx_add, compensation_xxx_remove | ||
| float64_t compensation_xx_add, compensation_xx_remove | ||
| float64_t compensation_x_add, compensation_x_remove | ||
| float64_t x, xx, xxx | ||
| float64_t prev_value | ||
| int64_t nobs = 0, N = len(start), V = len(values), nobs_mean = 0 | ||
| int64_t s, e, num_consecutive_same_value | ||
| ndarray[float64_t] output, values_copy | ||
| bint is_monotonic_increasing_bounds | ||
| float64_t val | ||
| float64_t mean, m2, m3 | ||
| int64_t nobs = 0, N = len(start) | ||
| int64_t s, e | ||
| ndarray[float64_t] output | ||
| bint requires_recompute, numerically_unstable = False | ||
| | ||
| minp = max(minp, 3) | ||
| is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( | ||
| start, end | ||
| ) | ||
| output = np.empty(N, dtype=np.float64) | ||
| min_val = np.nanmin(values) | ||
| values_copy = np.copy(values) | ||
| | ||
| with nogil: | ||
| for i in range(0, V): | ||
| val = values_copy[i] | ||
| if val == val: | ||
| nobs_mean += 1 | ||
| sum_val += val | ||
| mean_val = sum_val / nobs_mean | ||
| # Other cases would lead to imprecision for smallest values | ||
| if min_val - mean_val > -1e5: | ||
| mean_val = round(mean_val) | ||
| for i in range(0, V): | ||
| values_copy[i] = values_copy[i] - mean_val | ||
| | ||
| for i in range(0, N): | ||
| | ||
| s = start[i] | ||
| e = end[i] | ||
| | ||
| # Over the first window, observations can only be added | ||
| # never removed | ||
| if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: | ||
| | ||
| prev_value = values[s] | ||
| num_consecutive_same_value = 0 | ||
| | ||
| compensation_xxx_add = compensation_xxx_remove = 0 | ||
| compensation_xx_add = compensation_xx_remove = 0 | ||
| compensation_x_add = compensation_x_remove = 0 | ||
| x = xx = xxx = 0 | ||
| nobs = 0 | ||
| for j in range(s, e): | ||
| val = values_copy[j] | ||
| add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, | ||
| &compensation_xx_add, &compensation_xxx_add, | ||
| &num_consecutive_same_value, &prev_value) | ||
| | ||
| else: | ||
| requires_recompute = ( | ||
| i == 0 | ||
| or not is_monotonic_increasing_bounds | ||
| or s >= end[i - 1] | ||
| or numerically_unstable | ||
| ) | ||
| | ||
| # After the first window, observations can both be added | ||
| # and removed | ||
| # calculate deletes | ||
| if not requires_recompute: | ||
| for j in range(start[i - 1], s): | ||
| val = values_copy[j] | ||
| remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove, | ||
| &compensation_xx_remove, &compensation_xxx_remove) | ||
| val = values[j] | ||
| remove_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) | ||
| | ||
| # calculate adds | ||
| for j in range(end[i - 1], e): | ||
| val = values_copy[j] | ||
| add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, | ||
| &compensation_xx_add, &compensation_xxx_add, | ||
| &num_consecutive_same_value, &prev_value) | ||
| val = values[j] | ||
| add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) | ||
| | ||
| if requires_recompute or numerically_unstable: | ||
| mean = m2 = m3 = 0.0 | ||
| nobs = 0 | ||
| | ||
| for j in range(s, e): | ||
| val = values[j] | ||
| add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) | ||
| | ||
| numerically_unstable = False | ||
| | ||
| output[i] = calc_skew(minp, nobs, x, xx, xxx, num_consecutive_same_value) | ||
| output[i] = calc_skew(minp, nobs, mean, m2, m3) | ||
| | ||
| if not is_monotonic_increasing_bounds: | ||
| nobs = 0 | ||
| x = 0.0 | ||
| xx = 0.0 | ||
| xxx = 0.0 | ||
| mean = 0.0 | ||
| m2 = 0.0 | ||
| m3 = 0.0 | ||
| | ||
| return output | ||
| | ||
| | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks you for using descriptive names for these variables!