-
- Notifications
You must be signed in to change notification settings - Fork 19.2k
Description
Pandas version checks
-
I have checked that this issue has not already been reported.
-
I have confirmed this bug exists on the latest version of pandas.
-
I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
>>> import pandas as pd >>> import numpy as np >>> ix1 = pd.MultiIndex.from_arrays([[np.nan, 81, 81, 82, 82], [np.nan, np.nan, np.nan, np.nan, np.nan], pd.to_datetime([np.nan, '2018-06-01', '2018-07-01', '2018-07-01', '2018-08-01'])], names=['foo', 'bar', 'date']) >>> ix1 MultiIndex([( nan, nan, 'NaT'), (81.0, nan, '2018-06-01'), (81.0, nan, '2018-07-01'), (82.0, nan, '2018-07-01'), (82.0, nan, '2018-08-01')], names=['foo', 'bar', 'date']) >>> >>> s1 = pd.Series([np.nan, 25.058969, 22.519751, 20.847981, 21.625236], index=ix1) >>> s1 foo bar date NaN NaN NaT NaN 81 NaN 2018-06-01 25.058969 2018-07-01 22.519751 82 NaN 2018-07-01 20.847981 2018-08-01 21.625236 dtype: float64 >>> >>> ix2 = pd.Index([81, 82, 83, 84, 85, 86, 87], name='foo') >>> ix2 Index([81, 82, 83, 84, 85, 86, 87], dtype='int64', name='foo') >>> >>> s2 = pd.Series([28.2800, 25.2500, 22.2200, 16.7660, 14.0087, 14.9480, 29.2900], ix2) >>> >>> s2 foo 81 28.2800 82 25.2500 83 22.2200 84 16.7660 85 14.0087 86 14.9480 87 29.2900 dtype: float64 >>> >>> >>> s1 - s2 Traceback (most recent call last): File "<stdin>", line 1, in <module> File ".venv/lib64/python3.11/site-packages/pandas/core/ops/common.py", line 76, in new_method return method(self, other) ^^^^^^^^^^^^^^^^^^^ File ".venv/lib64/python3.11/site-packages/pandas/core/arraylike.py", line 194, in __sub__ return self._arith_method(other, operator.sub) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ".venv/lib64/python3.11/site-packages/pandas/core/series.py", line 5814, in _arith_method self, other = self._align_for_op(other) ^^^^^^^^^^^^^^^^^^^^^^^^^ File ".venv/lib64/python3.11/site-packages/pandas/core/series.py", line 5844, in _align_for_op left, right = left.align(right, copy=False) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ".venv/lib64/python3.11/site-packages/pandas/core/generic.py", line 10091, in align left, _right, join_index = self._align_series( ^^^^^^^^^^^^^^^^^^^ File ".venv/lib64/python3.11/site-packages/pandas/core/generic.py", line 10213, in 
_align_series left = self._reindex_indexer(join_index, lidx, copy) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ".venv/lib64/python3.11/site-packages/pandas/core/series.py", line 4782, in _reindex_indexer return self._constructor(new_values, index=new_index, copy=False) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ".venv/lib64/python3.11/site-packages/pandas/core/series.py", line 503, in __init__ com.require_length_match(data, index) File ".venv/lib64/python3.11/site-packages/pandas/core/common.py", line 561, in require_length_match raise ValueError( ValueError: Length of values (5) does not match length of index (4) >>> s1.iloc[1:] - s2 # proactively removing the all-nan index row allows the operation to succeed without error foo bar date 81 NaN 2018-06-01 -3.221031 2018-07-01 -5.760249 82 NaN 2018-07-01 -4.402019 2018-08-01 -3.624764 dtype: float64
Issue Description
It is possible to carry out arithmetic operations on two series with "mixed" indices when at least 1 level is the same. However, in my case (`s1 - s2`), `s1` contains an all-NaN index row, which raises `ValueError: Length of values (5) does not match length of index (4)`.
I found that this could be an error in how the two series are aligned.
class Series(...): ... def _align_series(...): ... if not axis: # equal if self.index.equals(other.index): join_index, lidx, ridx = None, None, None else: join_index, lidx, ridx = self.index.join( other.index, how=join, level=level, return_indexers=True ) ## At this point, `join_index` is invalid as it contains different length codes: ## join_index.code == FrozenList([[0, 0, 1, 1], [-1, -1, -1, -1, -1], [-1, 0, 1, 1, 2]]) ## which returns (4 items in level index 0) ## join_index.get_level_values(0) == Index([81.0, 81.0, 82.0, 82.0], dtype='float64', name='foo') ## while (5 items in level index 1 and 2) ## join_index.get_level_values(1) == Index([nan, nan, nan, nan, nan], dtype='float64', name='demand_index') ## join_index.get_level_values(2) == DatetimeIndex(['NaT', '2018-06-01', '2018-07-01', '2018-07-01', '2018-08-01'], dtype='datetime64[ns]', name='blend_date', freq=None) ## if is_series: ## The invalid `join_index` is picked up by `._reindex_indexer(...)` as `len(join_index)` == 4 and `len(new_values) == 5` which causes the `ValueError` left = self._reindex_indexer(join_index, lidx, copy) ... def _reindex_indexer(...): ... new_values = algorithms.take_nd( self._values, indexer, allow_fill=True, fill_value=None ) return self._constructor(new_values, index=new_index, copy=False) ## <- raises the ValueError
I traced the origin of the mismatching codes to `Index._join_level` in `pandas/core/indexes/base.py`, which blatantly ignores missing values when constructing the new index.
class Index(...): def _join_level(...): ... else: left_lev_indexer = ensure_platform_int(left_lev_indexer) rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level)) old_codes = left.codes[level] ## This will ignore missing values (nan) without ever inserting ## those values back into the index, ultimately leading to ## different length codes taker = old_codes[old_codes != -1] new_lev_codes = rev_indexer.take(taker) new_codes = list(left.codes) new_codes[level] = new_lev_codes new_levels = list(left.levels) new_levels[level] = new_level if keep_order: # just drop missing values. o.w. keep order left_indexer = np.arange(len(left), dtype=np.intp) left_indexer = cast(np.ndarray, left_indexer) mask = new_lev_codes != -1 if not mask.all(): new_codes = [lab[mask] for lab in new_codes] left_indexer = left_indexer[mask] ... join_index = MultiIndex( levels=new_levels, codes=new_codes, names=left.names, verify_integrity=False, )
This is all possible because `verify_integrity` is set to `False` (and not passed down). If I set `verify_integrity=True`, the `join_index = MultiIndex(...)` call fails much earlier with `ValueError: Length of levels and codes must match. NOTE: this index is in an inconsistent state.`
class MultiIndex(...): def __new__(...): ... # result._set_codes(codes, copy=copy, validate=False) result._set_codes(codes, copy=copy, validate=False, verify_integrity=True) ...
I tried to fix this by changing `taker = old_codes[old_codes != -1]` to `taker = old_codes`. This alleviates the initial `ValueError` (only tested for my case). If I also comment out the `-1` handling, I get the expected behaviour.
# if not mask.all(): # new_codes = [lab[mask] for lab in new_codes] # left_indexer = left_indexer[mask]
Expected Behavior
>>> s1 foo bar date NaN NaN NaT NaN 81 NaN 2018-06-01 25.058969 2018-07-01 22.519751 82 NaN 2018-07-01 20.847981 2018-08-01 21.625236 dtype: float64 >>> s2 foo 81 28.2800 82 25.2500 83 22.2200 84 16.7660 85 14.0087 86 14.9480 87 29.2900 dtype: float64 >>> s1 - s2 foo bar date NaN NaN NaT NaN 81 NaN 2018-06-01 -3.221031 2018-07-01 -5.760249 82 NaN 2018-07-01 -4.402019 2018-08-01 -3.624764
Installed Versions
INSTALLED VERSIONS ------------------ commit : e86ed377639948c64c429059127bcf5b359ab6be python : 3.11.11.final.0 python-bits : 64 OS : Linux OS-release : 6.12.11-200.fc41.x86_64 Version : #1 SMP PREEMPT_DYNAMIC Fri Jan 24 04:59:58 UTC 2025 machine : x86_64 processor : byteorder : little LC_ALL : None LANG : en_AU.UTF-8 LOCALE : en_AU.UTF-8 pandas : 2.1.1 numpy : 1.24.3 pytz : 2020.4 dateutil : 2.8.2 setuptools : 69.2.0 pip : 24.2 Cython : 0.29.34 pytest : 7.3.1 hypothesis : None sphinx : None blosc : None feather : None xlsxwriter : 0.9.6 lxml.etree : None html5lib : None pymysql : None psycopg2 : 2.9.6 jinja2 : 2.11.2 IPython : None pandas_datareader : None bs4 : None bottleneck : 1.3.5 dataframe-api-compat: None fastparquet : None fsspec : None gcsfs : None matplotlib : None numba : None numexpr : 2.8.4 odfpy : None openpyxl : 3.1.2 pandas_gbq : None pyarrow : 11.0.0 pyreadstat : None pyxlsb : None s3fs : None scipy : 1.10.1 sqlalchemy : 1.3.23 tables : 3.8.0 tabulate : None xarray : None xlrd : 2.0.1 zstandard : None tzdata : 2024.2 qtpy : None pyqt5 : None
Also happens with pandas==2.2.3