Skip to content
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,7 @@ Timezones
Numeric
^^^^^^^
- Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`)
- Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`)
- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`)
- Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`)

Expand Down
52 changes: 42 additions & 10 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@

def cast_for_truediv(
arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
) -> pa.ChunkedArray:
) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]:
# Ensure int / int -> float mirroring Python/Numpy behavior
# as pc.divide_checked(int, int) -> int
if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
Expand All @@ -120,19 +120,51 @@ def cast_for_truediv(
# Intentionally not using arrow_array.cast because it could be a scalar
# value in reflected case, and safe=False only added to
# scalar cast in pyarrow 13.
return pc.cast(arrow_array, pa.float64(), safe=False)
return arrow_array
# In arrow, the common type between an integral type and float64 is
# float64, but the integral operand is cast safely to float64. To mirror
# python and numpy we want an unsafe cast, so we cast both operands
# to float64 before invoking arrow.
return pc.cast(arrow_array, pa.float64(), safe=False), pc.cast(
pa_object, pa.float64(), safe=False
)

return arrow_array, pa_object

def floordiv_compat(
    left: pa.ChunkedArray | pa.Array | pa.Scalar,
    right: pa.ChunkedArray | pa.Array | pa.Scalar,
) -> pa.ChunkedArray:
    """
    Floor-divide ``left`` by ``right`` mirroring Python/NumPy semantics.

    Parameters
    ----------
    left : pa.ChunkedArray, pa.Array or pa.Scalar
        Dividend.
    right : pa.ChunkedArray, pa.Array or pa.Scalar
        Divisor.

    Returns
    -------
    pa.ChunkedArray
        For two integral operands the result of ``pc.divide_checked``
        adjusted to round towards negative infinity; otherwise
        ``pc.floor(pc.divide(left, right))``.

    Notes
    -----
    The integral branch uses ``pc.divide_checked``; the tests accompanying
    this change expect ``ArrowInvalid`` for ``INT64_MIN // -1`` ("overflow")
    and for an integral ``// 0`` ("divide by zero").
    """
    # Ensure int // int -> int mirroring Python/Numpy behavior
    # as pc.floor(pc.divide_checked(int, int)) -> float
    if pa.types.is_integer(left.type) and pa.types.is_integer(right.type):
        # Use divide_checked to ensure cases like -9223372036854775808 // -1
        # don't silently overflow.
        divided = pc.divide_checked(left, right)
        # GH 56676: avoid storing the intermediate calculation in a floating
        # point type.
        has_remainder = pc.not_equal(pc.multiply(divided, right), left)
        # The operands have opposite signs exactly when the sign bit of
        # their XOR is set. Pass a typed arrow scalar rather than a stdlib
        # int, which is always inferred as int64, to prevent overflow in
        # case of large uint64 values.
        has_one_negative_operand = pc.less(
            pc.bit_wise_xor(left, right), pa.scalar(0, type=divided.type)
        )
        result = pc.if_else(
            pc.and_(has_one_negative_operand, has_remainder),
            # GH 55561: floordiv should round towards negative infinity.
            # pc.divide_checked for integral types rounds towards 0.
            # Avoid using subtract_checked which would incorrectly raise
            # for -9223372036854775808 // 1, because if integer overflow
            # occurs, then has_remainder should be false, and the
            # overflowed value is discarded.
            pc.subtract(divided, pa.scalar(1, type=divided.type)),
            divided,
        )
    else:
        # Use divide instead of divide_checked to match numpy
        # floordiv where divide by 0 returns infinity for floating
        # point types.
        divided = pc.divide(left, right)
        result = pc.floor(divided)
    return result

ARROW_ARITHMETIC_FUNCS = {
Expand All @@ -142,8 +174,8 @@ def floordiv_compat(
"rsub": lambda x, y: pc.subtract_checked(y, x),
"mul": pc.multiply_checked,
"rmul": lambda x, y: pc.multiply_checked(y, x),
"truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y),
"rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)),
"truediv": lambda x, y: pc.divide(*cast_for_truediv(x, y)),
"rtruediv": lambda x, y: pc.divide(*cast_for_truediv(y, x)),
"floordiv": lambda x, y: floordiv_compat(x, y),
"rfloordiv": lambda x, y: floordiv_compat(y, x),
"mod": NotImplemented,
Expand Down
65 changes: 62 additions & 3 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -905,8 +905,9 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
else:
assert pa.types.is_decimal(alt_dtype.pyarrow_dtype)
return expected.astype(alt_dtype)

else:
elif op_name not in ["__floordiv__", "__rfloordiv__"] or isinstance(
other, pd.Series
):
pa_expected = pa_expected.cast(orig_pa_type)

pd_expected = type(expected_data._values)(pa_expected)
Expand Down Expand Up @@ -3239,13 +3240,71 @@ def test_arrow_floordiv():


def test_arrow_floordiv_large_values():
    # GH 55561, GH 56645
    # int64[pyarrow] // python int scalar must give the exact int64 quotient.
    dtype = "int64[pyarrow]"
    ser = pd.Series([1425801600000000000], dtype=dtype)
    result = ser // 1_000_000
    tm.assert_series_equal(result, pd.Series([1425801600000], dtype=dtype))


def test_arrow_floordiv_large_integral_result():
    # GH 56676
    # Dividing by 1 must hand back the very same values, including INT64_MIN.
    values = [18014398509481983, -9223372036854775808]
    ser = pd.Series(values, dtype="int64[pyarrow]")
    tm.assert_series_equal(ser // 1, ser)


def test_arrow_floordiv_larger_divisor():
    # GH 56676
    # -23 // 24 rounds towards negative infinity, i.e. -1 rather than 0.
    dtype = "int64[pyarrow]"
    ser = pd.Series([-23], dtype=dtype)
    expected = pd.Series([-1], dtype=dtype)
    result = ser // 24
    tm.assert_series_equal(result, expected)


def test_arrow_floordiv_integral_invalid():
    # GH 56676
    # Integral floordiv must raise instead of silently overflowing or
    # producing a value for division by zero.
    int64_min = pd.Series([-9223372036854775808], dtype="int64[pyarrow]")
    for divisor, msg in [(-1, "overflow"), (0, "divide by zero")]:
        with pytest.raises(pa.lib.ArrowInvalid, match=msg):
            int64_min // divisor


def test_arrow_floordiv_floating_0_divisor():
    # GH 56676
    # Floating point floordiv by zero yields infinity rather than raising.
    dtype = "double[pyarrow]"
    ser = pd.Series([2], dtype=dtype)
    expected = pd.Series([float("inf")], dtype=dtype)
    tm.assert_series_equal(ser // 0, expected)


def test_arrow_floordiv_no_overflow():
    # GH 56676
    # 2**63 does not fit in int64; uint64 // uint64 must still round-trip it.
    dtype = "uint64[pyarrow]"
    dividend = pd.Series([9223372036854775808], dtype=dtype)
    divisor = pd.Series([1], dtype=dtype)
    tm.assert_series_equal(dividend // divisor, dividend)


def test_arrow_true_division_large_divisor():
    # GH 56706
    # int64 / int64 with a large divisor returns float64 instead of raising.
    dividend = pd.Series([0], dtype="int64[pyarrow]")
    divisor = pd.Series([18014398509481983], dtype="int64[pyarrow]")
    result = dividend / divisor
    tm.assert_series_equal(result, pd.Series([0], dtype="float64[pyarrow]"))


def test_arrow_floor_division_large_divisor():
    # GH 56706
    # int64 // int64 with a large divisor stays int64 instead of raising.
    dtype = "int64[pyarrow]"
    dividend = pd.Series([0], dtype=dtype)
    divisor = pd.Series([18014398509481983], dtype=dtype)
    result = dividend // divisor
    tm.assert_series_equal(result, pd.Series([0], dtype=dtype))


def test_string_to_datetime_parsing_cast():
# GH 56266
string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"]
Expand Down