-
- Notifications
You must be signed in to change notification settings - Fork 19.2k
API: better error-handling for df.set_index #22486
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
28ec3a9 8dd1453 18d3464 fa68660 6d3dfa1 159ecf5 46f8dc7 3a6469d c3cdde2 dd8d000 47c4e74 06ed33a 9871be2 9da6e6c 361a905 42d5f2a File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -49,7 +49,8 @@ def test_set_index_cast(self): | |
| tm.assert_frame_equal(df, df2) | ||
| | ||
| # A has duplicate values, C does not | ||
| @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) | ||
| @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], | ||
| ('tuple', 'as', 'label')]) | ||
| @pytest.mark.parametrize('inplace', [True, False]) | ||
| @pytest.mark.parametrize('drop', [True, False]) | ||
| def test_set_index_drop_inplace(self, frame_of_index_cols, | ||
| | @@ -72,7 +73,8 @@ def test_set_index_drop_inplace(self, frame_of_index_cols, | |
| tm.assert_frame_equal(result, expected) | ||
| | ||
| # A has duplicate values, C does not | ||
| @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) | ||
| @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], | ||
| ('tuple', 'as', 'label')]) | ||
| @pytest.mark.parametrize('drop', [True, False]) | ||
| def test_set_index_append(self, frame_of_index_cols, drop, keys): | ||
| df = frame_of_index_cols | ||
| | @@ -88,7 +90,8 @@ def test_set_index_append(self, frame_of_index_cols, drop, keys): | |
| tm.assert_frame_equal(result, expected) | ||
| | ||
| # A has duplicate values, C does not | ||
| @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) | ||
| @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], | ||
| ('tuple', 'as', 'label')]) | ||
| @pytest.mark.parametrize('drop', [True, False]) | ||
| def test_set_index_append_to_multiindex(self, frame_of_index_cols, | ||
| drop, keys): | ||
| | @@ -114,8 +117,10 @@ def test_set_index_after_mutation(self): | |
| tm.assert_frame_equal(result, expected) | ||
| | ||
| # MultiIndex constructor does not work directly on Series -> lambda | ||
| # Add list-of-list constructor because list is ambiguous -> lambda | ||
| # also test index name if append=True (name is duplicate here for B) | ||
| @pytest.mark.parametrize('box', [Series, Index, np.array, | ||
| list, tuple, iter, lambda x: [list(x)], | ||
| lambda x: MultiIndex.from_arrays([x])]) | ||
| @pytest.mark.parametrize('append, index_name', [(True, None), | ||
| (True, 'B'), (True, 'test'), (False, None)]) | ||
| | @@ -126,21 +131,29 @@ def test_set_index_pass_single_array(self, frame_of_index_cols, | |
| df.index.name = index_name | ||
| | ||
| key = box(df['B']) | ||
| # np.array and list "forget" the name of B | ||
| name = [None if box in [np.array, list] else 'B'] | ||
| if box == list: | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why exactly is a list interpreted differently here? this makes this test really really odd. I am worried something changed here, as this appears to work just fine in the master. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You will see above (line 123) that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback | ||
| # list of strings gets interpreted as list of keys | ||
| msg = "['one', 'two', 'three', 'one', 'two']" | ||
| with tm.assert_raises_regex(KeyError, msg): | ||
| df.set_index(key, drop=drop, append=append) | ||
| else: | ||
| # np.array/tuple/iter/list-of-list "forget" the name of B | ||
| name_mi = getattr(key, 'names', None) | ||
| name = [getattr(key, 'name', None)] if name_mi is None else name_mi | ||
| | ||
| result = df.set_index(key, drop=drop, append=append) | ||
| result = df.set_index(key, drop=drop, append=append) | ||
| | ||
| # only valid column keys are dropped | ||
| # since B is always passed as array above, nothing is dropped | ||
| expected = df.set_index(['B'], drop=False, append=append) | ||
| expected.index.names = [index_name] + name if append else name | ||
| # only valid column keys are dropped | ||
| # since B is always passed as array above, nothing is dropped | ||
| expected = df.set_index(['B'], drop=False, append=append) | ||
| expected.index.names = [index_name] + name if append else name | ||
| | ||
| tm.assert_frame_equal(result, expected) | ||
| tm.assert_frame_equal(result, expected) | ||
| | ||
| # MultiIndex constructor does not work directly on Series -> lambda | ||
| # also test index name if append=True (name is duplicate here for A & B) | ||
| @pytest.mark.parametrize('box', [Series, Index, np.array, list, | ||
| @pytest.mark.parametrize('box', [Series, Index, np.array, | ||
| list, tuple, iter, | ||
| lambda x: MultiIndex.from_arrays([x])]) | ||
| @pytest.mark.parametrize('append, index_name', | ||
| [(True, None), (True, 'A'), (True, 'B'), | ||
| | @@ -152,8 +165,8 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, | |
| df.index.name = index_name | ||
| | ||
| keys = ['A', box(df['B'])] | ||
| # np.array and list "forget" the name of B | ||
| names = ['A', None if box in [np.array, list] else 'B'] | ||
| # np.array/list/tuple/iter "forget" the name of B | ||
| names = ['A', None if box in [np.array, list, tuple, iter] else 'B'] | ||
| | ||
| result = df.set_index(keys, drop=drop, append=append) | ||
| | ||
| | @@ -168,10 +181,12 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, | |
| # MultiIndex constructor does not work directly on Series -> lambda | ||
| # We also emulate a "constructor" for the label -> lambda | ||
| # also test index name if append=True (name is duplicate here for A) | ||
| @pytest.mark.parametrize('box2', [Series, Index, np.array, list, | ||
| @pytest.mark.parametrize('box2', [Series, Index, np.array, | ||
| list, tuple, iter, | ||
| lambda x: MultiIndex.from_arrays([x]), | ||
| lambda x: x.name]) | ||
| @pytest.mark.parametrize('box1', [Series, Index, np.array, list, | ||
| @pytest.mark.parametrize('box1', [Series, Index, np.array, | ||
| list, tuple, iter, | ||
| lambda x: MultiIndex.from_arrays([x]), | ||
| lambda x: x.name]) | ||
| @pytest.mark.parametrize('append, index_name', [(True, None), | ||
| | @@ -183,21 +198,20 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, | |
| df.index.name = index_name | ||
| | ||
| keys = [box1(df['A']), box2(df['A'])] | ||
| result = df.set_index(keys, drop=drop, append=append) | ||
| | ||
| # == gives ambiguous Boolean for Series | ||
| if drop and keys[0] is 'A' and keys[1] is 'A': | ||
| with tm.assert_raises_regex(KeyError, '.*'): | ||
| df.set_index(keys, drop=drop, append=append) | ||
| else: | ||
| result = df.set_index(keys, drop=drop, append=append) | ||
| | ||
| # to test against already-tested behavior, we add sequentially, | ||
| # hence second append always True; must wrap in list, otherwise | ||
| # list-box will be illegal | ||
| expected = df.set_index([keys[0]], drop=drop, append=append) | ||
| expected = expected.set_index([keys[1]], drop=drop, append=True) | ||
| # if either box was iter, the content has been consumed; re-read it | ||
| keys = [box1(df['A']), box2(df['A'])] | ||
| | ||
| tm.assert_frame_equal(result, expected) | ||
| # to test against already-tested behaviour, we add sequentially, | ||
| # hence second append always True; must wrap keys in list, otherwise | ||
| # box = list would be illegal; need to adapt drop for case that both | ||
| # keys are 'A' -- can't drop the same column twice | ||
| expected = df.set_index([keys[0]], drop=(False if (keys[0] is 'A' | ||
| ||
| and keys[1] is 'A') | ||
| else drop), append=append) | ||
| expected = expected.set_index([keys[1]], drop=drop, append=True) | ||
| tm.assert_frame_equal(result, expected) | ||
| | ||
| @pytest.mark.parametrize('append', [True, False]) | ||
| @pytest.mark.parametrize('drop', [True, False]) | ||
| | @@ -229,13 +243,24 @@ def test_set_index_verify_integrity(self, frame_of_index_cols): | |
| def test_set_index_raise(self, frame_of_index_cols, drop, append): | ||
| df = frame_of_index_cols | ||
| | ||
| with tm.assert_raises_regex(KeyError, '.*'): # column names are A-E | ||
| with tm.assert_raises_regex(KeyError, "['foo', 'bar', 'baz']"): | ||
| # column names are A-E, as well as one tuple | ||
| df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append) | ||
| | ||
| # non-existent key in list with arrays | ||
| with tm.assert_raises_regex(KeyError, '.*'): | ||
| with tm.assert_raises_regex(KeyError, 'X'): | ||
| df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append) | ||
| | ||
| msg = 'The parameter "keys" may only contain a combination of.*' | ||
| # forbidden type, e.g. set | ||
| with tm.assert_raises_regex(TypeError, msg): | ||
| df.set_index(set(df['A']), drop=drop, append=append) | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why are you singling out an iterable (set) is there some reason? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, just any of the types that are not allowed (I've tried to indicate as much by using "e.g.") | ||
| | ||
| # forbidden type in list, e.g. set | ||
| with tm.assert_raises_regex(TypeError, msg): | ||
| df.set_index(['A', df['A'], set(df['A'])], | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. see my comment above, I am not sure a set is specifically excluded. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am proposing to exclude sets (see above). Irrespective of that, I need to test raising the TypeError for forbidden types (which type exactly is less relevant) | ||
| drop=drop, append=append) | ||
| | ||
| def test_construction_with_categorical_index(self): | ||
| ci = tm.makeCategoricalIndex(10) | ||
| ci.name = 'B' | ||
| | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is this isinstance check for set necessary?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I didn't know sets are considered list-like. IMO they shouldn't be, because they have no defined order (which is important for an index, especially when
append=True). Do we really want to support this option of letting users shoot themselves in the foot?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i am not averse to excluding, but i also don't want ad-infinitem special cases here. This code is already too complex. We accept an iterable, so this is a valid input, if the user wants to do it that's there issue; these are not currently excluded.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The larger issue IMO is that
is_list_like(<set>)should be False and not True. It's the only exclusion here, and it makes sense. On many issues, pandas prides itself on enforcing sensible defaults and behaviour. Why would we want to enable something that's broken by design?I may also add that this is not something that's currently allowed, and it shouldn't be IMO