pandas-dev · jreback · Oct 19, 2018 · Aug 23, 2018 · Sep 19, 2018 · Sep 23, 2018
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -586,6 +586,7 @@ Other API Changes
 - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
 - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
 - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
+- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types, has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
 
 .. _whatsnew_0240.deprecations:
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -60,6 +60,7 @@
  is_sequence,
  is_named_tuple)
 from pandas.core.dtypes.concat import _get_sliced_frame_result_type
+from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex
 from pandas.core.dtypes.missing import isna, notna
 
 
@@ -3952,6 +3953,25 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
  if not isinstance(keys, list):
  keys = [keys]
 
+ missing = []
+ for col in keys:
+ if (is_scalar(col) or isinstance(col, tuple)) and col in self:
+ # tuples can be both column keys or list-likes
+ # if they are valid column keys, everything is fine
+ continue
+ elif is_scalar(col) and col not in self:
+ # tuples that are not column keys are considered list-like,
+ # not considered missing
+ missing.append(col)
+ elif (not is_list_like(col) or isinstance(col, set)
+ or getattr(col, 'ndim', 1) > 1):
+ raise TypeError('The parameter "keys" may only contain a '
+ 'combination of valid column keys and '
+ 'one-dimensional list-likes')
+
+ if missing:
+ raise KeyError('{}'.format(missing))
+
  if inplace:
  frame = self
  else:
@@ -3961,37 +3981,37 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
  names = []
  if append:
  names = [x for x in self.index.names]
- if isinstance(self.index, MultiIndex):
+ if isinstance(self.index, ABCMultiIndex):
  for i in range(self.index.nlevels):
  arrays.append(self.index._get_level_values(i))
  else:
  arrays.append(self.index)
 
  to_remove = []
  for col in keys:
- if isinstance(col, MultiIndex):
- # append all but the last column so we don't have to modify
- # the end of this loop
- for n in range(col.nlevels - 1):
+ if isinstance(col, ABCMultiIndex):
+ for n in range(col.nlevels):
  arrays.append(col._get_level_values(n))
-
- level = col._get_level_values(col.nlevels - 1)
  names.extend(col.names)
- elif isinstance(col, Series):
- level = col._values
- names.append(col.name)
- elif isinstance(col, Index):
- level = col
+ elif isinstance(col, (ABCIndexClass, ABCSeries)):
+ # if Index then not MultiIndex (treated above)
+ arrays.append(col)
  names.append(col.name)
- elif isinstance(col, (list, np.ndarray, Index)):
- level = col
+ elif isinstance(col, (list, np.ndarray)):
+ arrays.append(col)
+ names.append(None)
+ elif (is_list_like(col)
+ and not (isinstance(col, tuple) and col in self)):
+ # all other list-likes (but avoid valid column keys)
+ col = list(col) # ensure iterator do not get read twice etc.
+ arrays.append(col)
  names.append(None)
+ # from here, col can only be a column label
  else:
- level = frame[col]._values
+ arrays.append(frame[col]._values)
  names.append(col)
  if drop:
  to_remove.append(col)
- arrays.append(level)
 
  index = ensure_index_from_sequences(arrays, names)
 
@@ -4000,7 +4020,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
  raise ValueError('Index has duplicate keys: {dup}'.format(
  dup=duplicates))
 
- for c in to_remove:
+ # use set to handle duplicate column names gracefully in case of drop
+ for c in set(to_remove):
  del frame[c]
 
  # clear up memory usage

diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py
@@ -182,12 +182,13 @@ def frame_of_index_cols():
  """
  Fixture for DataFrame of columns that can be used for indexing
 
- Columns are ['A', 'B', 'C', 'D', 'E']; 'A' & 'B' contain duplicates (but
- are jointly unique), the rest are unique.
+ Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')];
+ 'A' & 'B' contain duplicates (but are jointly unique), the rest are unique.
  """
  df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
  'B': ['one', 'two', 'three', 'one', 'two'],
  'C': ['a', 'b', 'c', 'd', 'e'],
  'D': np.random.randn(5),
- 'E': np.random.randn(5)})
+ 'E': np.random.randn(5),
+ ('tuple', 'as', 'label'): np.random.randn(5)})
  return df
diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py
@@ -49,7 +49,8 @@ def test_set_index_cast(self):
  tm.assert_frame_equal(df, df2)
 
  # A has duplicate values, C does not
- @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']])
+ @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
+ ('tuple', 'as', 'label')])
  @pytest.mark.parametrize('inplace', [True, False])
  @pytest.mark.parametrize('drop', [True, False])
  def test_set_index_drop_inplace(self, frame_of_index_cols,
@@ -72,7 +73,8 @@ def test_set_index_drop_inplace(self, frame_of_index_cols,
  tm.assert_frame_equal(result, expected)
 
  # A has duplicate values, C does not
- @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']])
+ @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
+ ('tuple', 'as', 'label')])
  @pytest.mark.parametrize('drop', [True, False])
  def test_set_index_append(self, frame_of_index_cols, drop, keys):
  df = frame_of_index_cols
@@ -88,7 +90,8 @@ def test_set_index_append(self, frame_of_index_cols, drop, keys):
  tm.assert_frame_equal(result, expected)
 
  # A has duplicate values, C does not
- @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']])
+ @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
+ ('tuple', 'as', 'label')])
  @pytest.mark.parametrize('drop', [True, False])
  def test_set_index_append_to_multiindex(self, frame_of_index_cols,
  drop, keys):
@@ -114,8 +117,10 @@ def test_set_index_after_mutation(self):
  tm.assert_frame_equal(result, expected)
 
  # MultiIndex constructor does not work directly on Series -> lambda
+ # Add list-of-list constructor because list is ambiguous -> lambda
  # also test index name if append=True (name is duplicate here for B)
  @pytest.mark.parametrize('box', [Series, Index, np.array,
+ list, tuple, iter, lambda x: [list(x)],
  lambda x: MultiIndex.from_arrays([x])])
  @pytest.mark.parametrize('append, index_name', [(True, None),
  (True, 'B'), (True, 'test'), (False, None)])
@@ -126,21 +131,29 @@ def test_set_index_pass_single_array(self, frame_of_index_cols,
  df.index.name = index_name
 
  key = box(df['B'])
- # np.array and list "forget" the name of B
- name = [None if box in [np.array, list] else 'B']
+ if box == list:
+ # list of strings gets interpreted as list of keys
+ msg = "['one', 'two', 'three', 'one', 'two']"
+ with tm.assert_raises_regex(KeyError, msg):
+ df.set_index(key, drop=drop, append=append)
+ else:
+ # np.array/tuple/iter/list-of-list "forget" the name of B
+ name_mi = getattr(key, 'names', None)
+ name = [getattr(key, 'name', None)] if name_mi is None else name_mi
 
- result = df.set_index(key, drop=drop, append=append)
+  result = df.set_index(key, drop=drop, append=append)
 
- # only valid column keys are dropped
- # since B is always passed as array above, nothing is dropped
- expected = df.set_index(['B'], drop=False, append=append)
- expected.index.names = [index_name] + name if append else name
+  # only valid column keys are dropped
+  # since B is always passed as array above, nothing is dropped
+  expected = df.set_index(['B'], drop=False, append=append)
+  expected.index.names = [index_name] + name if append else name
 
- tm.assert_frame_equal(result, expected)
+  tm.assert_frame_equal(result, expected)
 
  # MultiIndex constructor does not work directly on Series -> lambda
  # also test index name if append=True (name is duplicate here for A & B)
- @pytest.mark.parametrize('box', [Series, Index, np.array, list,
+ @pytest.mark.parametrize('box', [Series, Index, np.array,
+ list, tuple, iter,
  lambda x: MultiIndex.from_arrays([x])])
  @pytest.mark.parametrize('append, index_name',
  [(True, None), (True, 'A'), (True, 'B'),
@@ -152,8 +165,8 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
  df.index.name = index_name
 
  keys = ['A', box(df['B'])]
- # np.array and list "forget" the name of B
- names = ['A', None if box in [np.array, list] else 'B']
+ # np.array/list/tuple/iter "forget" the name of B
+ names = ['A', None if box in [np.array, list, tuple, iter] else 'B']
 
  result = df.set_index(keys, drop=drop, append=append)
 
@@ -168,10 +181,12 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
  # MultiIndex constructor does not work directly on Series -> lambda
  # We also emulate a "constructor" for the label -> lambda
  # also test index name if append=True (name is duplicate here for A)
- @pytest.mark.parametrize('box2', [Series, Index, np.array, list,
+ @pytest.mark.parametrize('box2', [Series, Index, np.array,
+ list, tuple, iter,
  lambda x: MultiIndex.from_arrays([x]),
  lambda x: x.name])
- @pytest.mark.parametrize('box1', [Series, Index, np.array, list,
+ @pytest.mark.parametrize('box1', [Series, Index, np.array,
+ list, tuple, iter,
  lambda x: MultiIndex.from_arrays([x]),
  lambda x: x.name])
  @pytest.mark.parametrize('append, index_name', [(True, None),
@@ -183,21 +198,20 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
  df.index.name = index_name
 
  keys = [box1(df['A']), box2(df['A'])]
+ result = df.set_index(keys, drop=drop, append=append)
 
- # == gives ambiguous Boolean for Series
- if drop and keys[0] is 'A' and keys[1] is 'A':
- with tm.assert_raises_regex(KeyError, '.*'):
- df.set_index(keys, drop=drop, append=append)
- else:
- result = df.set_index(keys, drop=drop, append=append)
-
- # to test against already-tested behavior, we add sequentially,
- # hence second append always True; must wrap in list, otherwise
- # list-box will be illegal
- expected = df.set_index([keys[0]], drop=drop, append=append)
- expected = expected.set_index([keys[1]], drop=drop, append=True)
+ # if either box was iter, the content has been consumed; re-read it
+ keys = [box1(df['A']), box2(df['A'])]
 
- tm.assert_frame_equal(result, expected)
+ # to test against already-tested behaviour, we add sequentially,
+ # hence second append always True; must wrap keys in list, otherwise
+ # box = list would be illegal; need to adapt drop for case that both
+ # keys are 'A' -- can't drop the same column twice
+ expected = df.set_index([keys[0]], drop=(False if (keys[0] is 'A'
+ and keys[1] is 'A')
+ else drop), append=append)
+ expected = expected.set_index([keys[1]], drop=drop, append=True)
+ tm.assert_frame_equal(result, expected)
 
  @pytest.mark.parametrize('append', [True, False])
  @pytest.mark.parametrize('drop', [True, False])
@@ -229,13 +243,24 @@ def test_set_index_verify_integrity(self, frame_of_index_cols):
  def test_set_index_raise(self, frame_of_index_cols, drop, append):
  df = frame_of_index_cols
 
- with tm.assert_raises_regex(KeyError, '.*'): # column names are A-E
+ with tm.assert_raises_regex(KeyError, "['foo', 'bar', 'baz']"):
+ # column names are A-E, as well as one tuple
  df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append)
 
  # non-existent key in list with arrays
- with tm.assert_raises_regex(KeyError, '.*'):
+ with tm.assert_raises_regex(KeyError, 'X'):
  df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append)
 
+ msg = 'The parameter "keys" may only contain a combination of.*'
+ # forbidden type, e.g. set
+ with tm.assert_raises_regex(TypeError, msg):
+ df.set_index(set(df['A']), drop=drop, append=append)
+
+ # forbidden type in list, e.g. set
+ with tm.assert_raises_regex(TypeError, msg):
+ df.set_index(['A', df['A'], set(df['A'])],
+ drop=drop, append=append)
+
  def test_construction_with_categorical_index(self):
  ci = tm.makeCategoricalIndex(10)
  ci.name = 'B'