BUG: Fix some PeriodIndex resampling issues #16153
Changes from all commits
ca7b6f2 c27f430 390e16e 23566c2 a82879d 4b1c740 73c0990 fa6c1d3 7ea04e9 82a8275 432c623 c8814fb 486ad67 ad8519f 39fc7e2 efcad5b 41401d4 398a684 8358c41 6084e0c
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -14,7 +14,7 @@ | |
| from pandas.core.indexes.datetimes import DatetimeIndex, date_range | ||
| from pandas.core.indexes.timedeltas import TimedeltaIndex | ||
| from pandas.tseries.offsets import DateOffset, Tick, Day, _delta_to_nanoseconds | ||
| from pandas.core.indexes.period import PeriodIndex, period_range | ||
| from pandas.core.indexes.period import PeriodIndex | ||
| import pandas.core.common as com | ||
| import pandas.core.algorithms as algos | ||
| from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries | ||
| | @@ -834,53 +834,32 @@ class PeriodIndexResampler(DatetimeIndexResampler): | |
| def _resampler_for_grouping(self): | ||
| return PeriodIndexResamplerGroupby | ||
| | ||
| def _get_binner_for_time(self): | ||
| if self.kind == 'timestamp': | ||
| return super(PeriodIndexResampler, self)._get_binner_for_time() | ||
| return self.groupby._get_period_bins(self.ax) | ||
| | ||
| def _convert_obj(self, obj): | ||
| obj = super(PeriodIndexResampler, self)._convert_obj(obj) | ||
| | ||
| offset = to_offset(self.freq) | ||
| if offset.n > 1: | ||
| if self.kind == 'period': # pragma: no cover | ||
| print('Warning: multiple of frequency -> timestamps') | ||
| | ||
| # Cannot have multiple of periods, convert to timestamp | ||
| if self._from_selection: | ||
| # see GH 14008, GH 12871 | ||
| msg = ("Resampling from level= or on= selection" | ||
| " with a PeriodIndex is not currently supported," | ||
| " use .set_index(...) to explicitly set index") | ||
| raise NotImplementedError(msg) | ||
| | ||
| if self.loffset is not None: | ||
| # Cannot apply loffset/timedelta to PeriodIndex -> convert to | ||
| # timestamps | ||
| *Review comment:* should we show a warning for this? *Reply:* I think we should, given that we kind of did that before (it was … | ||
| self.kind = 'timestamp' | ||
| | ||
| # convert to timestamp | ||
| if not (self.kind is None or self.kind == 'period'): | ||
| if self._from_selection: | ||
| # see GH 14008, GH 12871 | ||
| msg = ("Resampling from level= or on= selection" | ||
| " with a PeriodIndex is not currently supported," | ||
| " use .set_index(...) to explicitly set index") | ||
| raise NotImplementedError(msg) | ||
| else: | ||
| obj = obj.to_timestamp(how=self.convention) | ||
| if self.kind == 'timestamp': | ||
| obj = obj.to_timestamp(how=self.convention) | ||
| | ||
| return obj | ||
| | ||
| def aggregate(self, arg, *args, **kwargs): | ||
| result, how = self._aggregate(arg, *args, **kwargs) | ||
| if result is None: | ||
| result = self._downsample(arg, *args, **kwargs) | ||
| | ||
| result = self._apply_loffset(result) | ||
| return result | ||
| | ||
| agg = aggregate | ||
| | ||
| def _get_new_index(self): | ||
| """ return our new index """ | ||
| ax = self.ax | ||
| | ||
| if len(ax) == 0: | ||
| values = [] | ||
| else: | ||
| start = ax[0].asfreq(self.freq, how=self.convention) | ||
| end = ax[-1].asfreq(self.freq, how='end') | ||
| values = period_range(start, end, freq=self.freq).asi8 | ||
| | ||
| return ax._shallow_copy(values, freq=self.freq) | ||
| | ||
| def _downsample(self, how, **kwargs): | ||
| """ | ||
| Downsample the cython defined function | ||
| | @@ -898,22 +877,17 @@ def _downsample(self, how, **kwargs): | |
| how = self._is_cython_func(how) or how | ||
| ax = self.ax | ||
| | ||
| new_index = self._get_new_index() | ||
| | ||
| # Start vs. end of period | ||
| memb = ax.asfreq(self.freq, how=self.convention) | ||
| | ||
| if is_subperiod(ax.freq, self.freq): | ||
| # Downsampling | ||
| if len(new_index) == 0: | ||
| bins = [] | ||
| else: | ||
| i8 = memb.asi8 | ||
| rng = np.arange(i8[0], i8[-1] + 1) | ||
| bins = memb.searchsorted(rng, side='right') | ||
| grouper = BinGrouper(bins, new_index) | ||
| return self._groupby_and_aggregate(how, grouper=grouper) | ||
| return self._groupby_and_aggregate(how, grouper=self.grouper) | ||
| elif is_superperiod(ax.freq, self.freq): | ||
| if how == 'ohlc': | ||
| # GH #13083 | ||
| # upsampling to subperiods is handled as an asfreq, which works | ||
| # for pure aggregating/reducing methods | ||
| # OHLC reduces along the time dimension, but creates multiple | ||
| # values for each period -> handle by _groupby_and_aggregate() | ||
| return self._groupby_and_aggregate(how, grouper=self.grouper) | ||
| return self.asfreq() | ||
| elif ax.freq == self.freq: | ||
| return self.asfreq() | ||
| | @@ -936,19 +910,16 @@ def _upsample(self, method, limit=None, fill_value=None): | |
| .fillna | ||
| | ||
| """ | ||
| if self._from_selection: | ||
| raise ValueError("Upsampling from level= or on= selection" | ||
| " is not supported, use .set_index(...)" | ||
| " to explicitly set index to" | ||
| " datetime-like") | ||
| | ||
| # we may need to actually resample as if we are timestamps | ||
| if self.kind == 'timestamp': | ||
| return super(PeriodIndexResampler, self)._upsample( | ||
| method, limit=limit, fill_value=fill_value) | ||
| | ||
| self._set_binner() | ||
| ax = self.ax | ||
| obj = self.obj | ||
| new_index = self._get_new_index() | ||
| new_index = self.binner | ||
| | ||
| # Start vs. end of period | ||
| memb = ax.asfreq(self.freq, how=self.convention) | ||
| | @@ -1293,6 +1264,51 @@ def _get_time_period_bins(self, ax): | |
| | ||
| return binner, bins, labels | ||
| | ||
| def _get_period_bins(self, ax): | ||
| if not isinstance(ax, PeriodIndex): | ||
| raise TypeError('axis must be a PeriodIndex, but got ' | ||
| 'an instance of %r' % type(ax).__name__) | ||
| | ||
| memb = ax.asfreq(self.freq, how=self.convention) | ||
| | ||
| # NaT handling as in pandas._lib.lib.generate_bins_dt64() | ||
| nat_count = 0 | ||
| if memb.hasnans: | ||
| nat_count = np.sum(memb._isnan) | ||
| memb = memb[~memb._isnan] | ||
| | ||
| # if index contains no valid (non-NaT) values, return empty index | ||
| if not len(memb): | ||
| binner = labels = PeriodIndex( | ||
| *Review comment:* You could use … *Reply:* I left this, ok for now. | ||
| data=[], freq=self.freq, name=ax.name) | ||
| return binner, [], labels | ||
| | ||
| start = ax.min().asfreq(self.freq, how=self.convention) | ||
| end = ax.max().asfreq(self.freq, how='end') | ||
| | ||
| labels = binner = PeriodIndex(start=start, end=end, | ||
| freq=self.freq, name=ax.name) | ||
| | ||
| i8 = memb.asi8 | ||
| freq_mult = self.freq.n | ||
| | ||
| # when upsampling to subperiods, we need to generate enough bins | ||
| expected_bins_count = len(binner) * freq_mult | ||
| i8_extend = expected_bins_count - (i8[-1] - i8[0]) | ||
| rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult) | ||
| rng += freq_mult | ||
| bins = memb.searchsorted(rng, side='left') | ||
| | ||
| if nat_count > 0: | ||
| # NaT handling as in pandas._lib.lib.generate_bins_dt64() | ||
| *Review comment:* is this path tested sufficiently, e.g. 0, 1, 2 NaT? *Reply:* added a test case for consecutive NaTs in the index (1cad7fa). Should be sufficiently tested, cases covered: … Any ideas for more exhaustive test cases? (A hedged test sketch for these cases appears after the diff below.) | ||
| # shift bins by the number of NaT | ||
| bins += nat_count | ||
| bins = np.insert(bins, 0, nat_count) | ||
| binner = binner.insert(0, tslib.NaT) | ||
| labels = labels.insert(0, tslib.NaT) | ||
| | ||
| return binner, bins, labels | ||
| | ||
| | ||
| def _take_new_index(obj, indexer, new_index, axis=0): | ||
| from pandas.core.api import Series, DataFrame | ||
| | ||
*Review comment:* the fixed things that are not changed (e.g. [1], [2]), do in a separate ipython block above (to avoid repeating in previous/new).
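
To make that whatsnew suggestion concrete, here is a rough sketch of the kind of ipython-block example it could hold: resampling a PeriodIndex-backed Series down and up while staying in period space. The data, the '2Q' multiple, and the fill choice are assumptions for illustration, not taken from this PR's docs.

```python
import numpy as np
import pandas as pd

# A small monthly Series on a PeriodIndex.
pi = pd.period_range('2017-01', periods=12, freq='M')
s = pd.Series(np.arange(12, dtype='float64'), index=pi)

# Downsampling to a multiple of a frequency; with this patch the bins
# come from _get_period_bins rather than a timestamp round-trip.
print(s.resample('2Q').sum())

# Upsampling to a subperiod is handled as an asfreq plus an optional fill.
print(s.resample('D').ffill().head())
```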