pandas-dev · jreback · Jan 10, 2021
diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst
@@ -24,6 +24,7 @@ Fixed regressions
 - Fixed regression in :meth:`.GroupBy.sem` where the presence of non-numeric columns would cause an error instead of being dropped (:issue:`38774`)
 - Fixed regression in :func:`read_excel` with non-rawbyte file handles (:issue:`38788`)
 - Bug in :meth:`read_csv` with ``float_precision="high"`` caused segfault or wrong parsing of long exponent strings. This resulted in a regression in some cases as the default for ``float_precision`` was changed in pandas 1.2.0 (:issue:`38753`)
+- Fixed regression in :meth:`.DataFrameGroupBy.diff` raising for ``int8`` and ``int16`` columns (:issue:`39050`)
 - Fixed regression in :meth:`Rolling.skew` and :meth:`Rolling.kurt` modifying the object inplace (:issue:`38908`)
 - Fixed regression in :meth:`read_csv` and other read functions were the encoding error policy (``errors``) did not default to ``"replace"`` when no encoding was specified (:issue:`38989`)
 

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -1981,7 +1981,13 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3):
 
  elif is_integer_dtype(dtype):
  # We have to cast in order to be able to hold np.nan
- dtype = np.float64
+
+ # int8/int16 input cannot be paired with a float64 result here, so use
+ # float32 for those; see https://github.com/cython/cython/issues/2646
+ if arr.dtype.name in ["int8", "int16"]:
+ dtype = np.float32
+ else:
+ dtype = np.float64
 
  orig_ndim = arr.ndim
  if orig_ndim == 1:

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -1697,64 +1697,6 @@ def test_sort(x):
  g.apply(test_sort)
 
 
-def test_group_shift_with_null_key():
- # This test is designed to replicate the segfault in issue #13813.
- n_rows = 1200
-
- # Generate a moderately large dataframe with occasional missing
- # values in column `B`, and then group by [`A`, `B`]. This should
- # force `-1` in `labels` array of `g.grouper.group_info` exactly
- # at those places, where the group-by key is partially missing.
- df = DataFrame(
- [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
- dtype=float,
- columns=["A", "B", "Z"],
- index=None,
- )
- g = df.groupby(["A", "B"])
-
- expected = DataFrame(
- [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
- dtype=float,
- columns=["Z"],
- index=None,
- )
- result = g.shift(-1)
-
- tm.assert_frame_equal(result, expected)
-
-
-def test_group_shift_with_fill_value():
- # GH #24128
- n_rows = 24
- df = DataFrame(
- [(i % 12, i % 3, i) for i in range(n_rows)],
- dtype=float,
- columns=["A", "B", "Z"],
- index=None,
- )
- g = df.groupby(["A", "B"])
-
- expected = DataFrame(
- [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
- dtype=float,
- columns=["Z"],
- index=None,
- )
- result = g.shift(-1, fill_value=0)[["Z"]]
-
- tm.assert_frame_equal(result, expected)
-
-
-def test_group_shift_lose_timezone():
- # GH 30134
- now_dt = Timestamp.utcnow()
- df = DataFrame({"a": [1, 1], "date": now_dt})
- result = df.groupby("a").shift(0).iloc[0]
- expected = Series({"date": now_dt}, name=result.name)
- tm.assert_series_equal(result, expected)
-
-
 def test_pivot_table_values_key_error():
  # This test is designed to replicate the error in issue #14938
  df = DataFrame(

diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py
@@ -0,0 +1,106 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame, NaT, Series, Timedelta, Timestamp
+import pandas._testing as tm
+
+
+def test_group_shift_with_null_key():
+ # This test is designed to replicate the segfault in issue #13813.
+ n_rows = 1200
+
+ # Generate a moderately large dataframe with occasional missing
+ # values in column `B`, and then group by [`A`, `B`]. This should
+ # force `-1` in `labels` array of `g.grouper.group_info` exactly
+ # at those places, where the group-by key is partially missing.
+ df = DataFrame(
+ [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
+ dtype=float,
+ columns=["A", "B", "Z"],
+ index=None,
+ )
+ g = df.groupby(["A", "B"])
+
+ expected = DataFrame(
+ [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
+ dtype=float,
+ columns=["Z"],
+ index=None,
+ )
+ result = g.shift(-1)
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_group_shift_with_fill_value():
+ # GH #24128
+ n_rows = 24
+ df = DataFrame(
+ [(i % 12, i % 3, i) for i in range(n_rows)],
+ dtype=float,
+ columns=["A", "B", "Z"],
+ index=None,
+ )
+ g = df.groupby(["A", "B"])
+
+ expected = DataFrame(
+ [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
+ dtype=float,
+ columns=["Z"],
+ index=None,
+ )
+ result = g.shift(-1, fill_value=0)[["Z"]]
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_group_shift_lose_timezone():
+ # GH 30134
+ now_dt = Timestamp.utcnow()
+ df = DataFrame({"a": [1, 1], "date": now_dt})
+ result = df.groupby("a").shift(0).iloc[0]
+ expected = Series({"date": now_dt}, name=result.name)
+ tm.assert_series_equal(result, expected)
+
+
+def test_group_diff_real(any_real_dtype):
+ df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]}, dtype=any_real_dtype)
+ result = df.groupby("a")["b"].diff()
+ exp_dtype = "float"
+ if any_real_dtype in ["int8", "int16", "float32"]:
+ exp_dtype = "float32"
+ expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+ "data",
+ [
+ [
+ Timestamp("2013-01-01"),
+ Timestamp("2013-01-02"),
+ Timestamp("2013-01-03"),
+ ],
+ [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
+ ],
+)
+def test_group_diff_datetimelike(data):
+ df = DataFrame({"a": [1, 2, 2], "b": data})
+ result = df.groupby("a")["b"].diff()
+ expected = Series([NaT, NaT, Timedelta("1 days")], name="b")
+ tm.assert_series_equal(result, expected)
+
+
+def test_group_diff_bool():
+ df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
+ result = df.groupby("a")["b"].diff()
+ expected = Series([np.nan, np.nan, np.nan, False, False], name="b")
+ tm.assert_series_equal(result, expected)
+
+
+def test_group_diff_object_raises(object_dtype):
+ df = DataFrame(
+ {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype
+ )
+ with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
+ df.groupby("a")["b"].diff()
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -2409,3 +2409,10 @@ def test_diff_ea_axis(self):
  msg = "cannot diff DatetimeArray on axis=1"
  with pytest.raises(ValueError, match=msg):
  algos.diff(dta, 1, axis=1)
+
+ @pytest.mark.parametrize("dtype", ["int8", "int16"])
+ def test_diff_low_precision_int(self, dtype):
+ arr = np.array([0, 1, 1, 0, 0], dtype=dtype)
+ result = algos.diff(arr, 1)
+ expected = np.array([np.nan, 1, 0, -1, 0], dtype="float32")
+ tm.assert_numpy_array_equal(result, expected)