pandas-dev
diff --git a/‎pandas/core/groupby.py‎
Lines changed: 123 additions & 7 deletions b/‎pandas/core/groupby.py‎
Lines changed: 123 additions & 7 deletions
diff --git a/‎pandas/tests/test_groupby.py‎
Lines changed: 46 additions & 6 deletions b/‎pandas/tests/test_groupby.py‎
Lines changed: 46 additions & 6 deletions
@@ -52,7 +52,6 @@
 
 _apply_whitelist = frozenset(['last', 'first',
  'mean', 'sum', 'min', 'max',
- 'head', 'tail',
  'cumsum', 'cumprod', 'cummin', 'cummax',
  'resample',
  'describe',
@@ -482,13 +481,19 @@ def picker(arr):
  return np.nan
  return self.agg(picker)
 
- def cumcount(self):
- """Number each item in each group from 0 to the length of that group.
+ def cumcount(self, **kwargs):
+ """
+ Number each item in each group from 0 to the length of that group - 1.
 
  Essentially this is equivalent to
 
  >>> self.apply(lambda x: Series(np.arange(len(x)), x.index))
 
+ Parameters
+ ----------
+ ascending : bool, default True
+ If False, number in reverse, from length of group - 1 to 0.
+
  Example
  -------
 
@@ -510,14 +515,111 @@ def cumcount(self):
  4 1
  5 3
  dtype: int64
+ >>> df.groupby('A').cumcount(ascending=False)
+ 0 3
+ 1 2
+ 2 1
+ 3 1
+ 4 0
+ 5 0
+ dtype: int64
 
  """
+ ascending = kwargs.pop('ascending', True)
+
  index = self.obj.index
- cumcounts = np.zeros(len(index), dtype='int64')
- for v in self.indices.values():
- cumcounts[v] = np.arange(len(v), dtype='int64')
+ rng = np.arange(self.grouper._max_groupsize, dtype='int64')
+ cumcounts = self._cumcount_array(rng, ascending=ascending)
  return Series(cumcounts, index)
 
+ def head(self, n=5):
+ """
+ Returns first n rows of each group.
+
+ Essentially equivalent to ``.apply(lambda x: x.head(n))``
+
+ Example
+ -------
+
+ >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
+ columns=['A', 'B'])
+ >>> df.groupby('A', as_index=False).head(1) 
+ A B
+ 0 1 2
+ 2 5 6
+ >>> df.groupby('A').head(1)
+ A B
+ A 
+ 1 0 1 2
+ 5 2 5 6
+
+ """
+ rng = np.arange(self.grouper._max_groupsize, dtype='int64')
+ in_head = self._cumcount_array(rng) < n
+ head = self.obj[in_head]
+ if self.as_index:
+ head.index = self._index_with_as_index(in_head)
+ return head
+
+ def tail(self, n=5):
+ """
+ Returns last n rows of each group.
+
+ Essentially equivalent to ``.apply(lambda x: x.tail(n))``
+
+ Example
+ -------
+
+ >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
+ columns=['A', 'B'])
+ >>> df.groupby('A', as_index=False).tail(1) 
+ A B
+ 1 1 4
+ 2 5 6
+ >>> df.groupby('A').tail(1)
+ A B
+ A 
+ 1 1 1 4
+ 5 2 5 6
+ 
+ """
+ rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64')
+ in_tail = self._cumcount_array(rng, ascending=False) > -n
+ tail = self.obj[in_tail]
+ if self.as_index:
+ tail.index = self._index_with_as_index(in_tail)
+ return tail
+
+ def _cumcount_array(self, arr, **kwargs):
+ ascending = kwargs.pop('ascending', True)
+
+ len_index = len(self.obj.index)
+ cumcounts = np.zeros(len_index, dtype='int64')
+ if ascending:
+ for v in self.indices.values():
+ cumcounts[v] = arr[:len(v)]
+ else:
+ for v in self.indices.values():
+ cumcounts[v] = arr[len(v)-1::-1]
+ return cumcounts
+
+ def _index_with_as_index(self, b):
+ """
+ Return MultiIndex of group keys + original index for rows in mask b.
+
+ """
+ # TODO perf, it feels like this should already be somewhere...
+ from itertools import chain
+ original = self.obj.index
+ gp = self.grouper
+ levels = chain((gp.levels[i][gp.labels[i][b]]
+ for i in range(len(gp.groupings))),
+ (original.get_level_values(i)[b]
+ for i in range(original.nlevels)))
+ new = MultiIndex.from_arrays(list(levels))
+ new.names = gp.names + original.names
+ return new
+
  def _try_cast(self, result, obj):
  """
  try to cast the result to our obj original type,
@@ -758,14 +860,28 @@ def names(self):
  def size(self):
  """
  Compute group sizes
+
  """
  # TODO: better impl
  labels, _, ngroups = self.group_info
- bin_counts = Series(labels).value_counts()
+ bin_counts = algos.value_counts(labels, sort=False)
  bin_counts = bin_counts.reindex(np.arange(ngroups))
  bin_counts.index = self.result_index
  return bin_counts
 
+ @cache_readonly
+ def _max_groupsize(self):
+ '''
+ Compute size of largest group
+
+ '''
+ # For many items in each group this is much faster than
+ # self.size().max(), in worst case marginally slower
+ if self.indices:
+ return max(len(v) for v in self.indices.values())
+ else:
+ return 0
+
  @cache_readonly
  def groups(self):
  if len(self.groupings) == 1:
 
@@ -1203,24 +1203,64 @@ def test_groupby_as_index_apply(self):
  g_not_as = df.groupby('user_id', as_index=False)
 
  res_as = g_as.head(2).index
- exp_as = MultiIndex.from_tuples([(1, 0), (1, 2), (2, 1), (3, 4)])
+ exp_as = MultiIndex.from_tuples([(1, 0), (2, 1), (1, 2), (3, 4)])
  assert_index_equal(res_as, exp_as)
 
  res_not_as = g_not_as.head(2).index
- exp_not_as = Index([0, 2, 1, 4])
+ exp_not_as = Index([0, 1, 2, 4])
  assert_index_equal(res_not_as, exp_not_as)
 
- res_as = g_as.apply(lambda x: x.head(2)).index
- assert_index_equal(res_not_as, exp_not_as)
+ res_as_apply = g_as.apply(lambda x: x.head(2)).index
+ res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
 
- res_not_as = g_not_as.apply(lambda x: x.head(2)).index
- assert_index_equal(res_not_as, exp_not_as)
+ # apply doesn't maintain the original ordering
+ exp_not_as_apply = Index([0, 2, 1, 4]) 
+ exp_as_apply = MultiIndex.from_tuples([(1, 0), (1, 2), (2, 1), (3, 4)])
+
+ assert_index_equal(res_as_apply, exp_as_apply)
+ assert_index_equal(res_not_as_apply, exp_not_as_apply)
 
  ind = Index(list('abcde'))
  df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
  res = df.groupby(0, as_index=False).apply(lambda x: x).index
  assert_index_equal(res, ind)
 
+ def test_groupby_head_tail(self):
+ df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
+ g_as = df.groupby('A', as_index=True)
+ g_not_as = df.groupby('A', as_index=False)
+
+ # as_index=False, much easier
+ assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1))
+ assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1))
+
+ empty_not_as = DataFrame(columns=df.columns)
+ assert_frame_equal(empty_not_as, g_not_as.head(0))
+ assert_frame_equal(empty_not_as, g_not_as.tail(0))
+ assert_frame_equal(empty_not_as, g_not_as.head(-1))
+ assert_frame_equal(empty_not_as, g_not_as.tail(-1))
+
+ assert_frame_equal(df, g_not_as.head(7)) # contains all
+ assert_frame_equal(df, g_not_as.tail(7))
+
+ # as_index=True, yuck
+ # prepend the A column as an index, in a roundabout way
+ df_as = df.copy()
+ df_as.index = df.set_index('A', append=True,
+ drop=False).index.swaplevel(0, 1)
+
+ assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1))
+ assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1))
+
+ empty_as = DataFrame(index=df_as.index[:0], columns=df.columns)
+ assert_frame_equal(empty_as, g_as.head(0))
+ assert_frame_equal(empty_as, g_as.tail(0))
+ assert_frame_equal(empty_as, g_as.head(-1))
+ assert_frame_equal(empty_as, g_as.tail(-1))
+
+ assert_frame_equal(df_as, g_as.head(7)) # contains all
+ assert_frame_equal(df_as, g_as.tail(7))
+
  def test_groupby_multiple_key(self):
  df = tm.makeTimeDataFrame()
  grouped = df.groupby([lambda x: x.year,