pandas-dev · mroeschke · Aug 9, 2024 · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024
@@ -14,3 +14,9 @@ runs:
  condarc-file: ci/.condarc
  cache-environment: true
  cache-downloads: true
+
+ - name: Uninstall pyarrow
+ if: ${{ env.REMOVE_PYARROW == '1' }}
+ run: |
+ micromamba remove -y pyarrow
+ shell: bash -el {0}
@@ -29,6 +29,7 @@ jobs:
  env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml]
  # Prevent the include jobs from overriding other jobs
  pattern: [""]
+ pandas_future_infer_string: ["0"]
  include:
  - name: "Downstream Compat"
  env_file: actions-311-downstream_compat.yaml
@@ -58,6 +59,9 @@ jobs:
  # It will be temporarily activated during tests with locale.setlocale
  extra_loc: "zh_CN"
  - name: "Future infer strings"
+ env_file: actions-312.yaml
+ pandas_future_infer_string: "1"
+ - name: "Future infer strings (without pyarrow)"
  env_file: actions-311.yaml
  pandas_future_infer_string: "1"
  - name: "Pypy"
@@ -85,9 +89,10 @@ jobs:
  NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }}
  # Clipboard tests
  QT_QPA_PLATFORM: offscreen
+ REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }}
  concurrency:
  # https://github.community/t/concurrecy-not-work-for-push/183068/7
- group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}}
+ group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }}
  cancel-in-progress: true
 
  services:

diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
@@ -6,6 +6,8 @@
 
 from pandas._config import using_string_dtype
 
+from pandas.compat import HAS_PYARROW
+
 from pandas.core.dtypes.dtypes import CategoricalDtype
 
 import pandas as pd
@@ -1245,6 +1247,9 @@ def test_agg_multiple_mixed():
  tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
 def test_agg_multiple_mixed_raises():
  # GH 20909
  mdf = DataFrame(

diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py
@@ -12,6 +12,9 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
 from pandas.errors import SpecificationError
 
 from pandas import (
@@ -209,6 +212,10 @@ def transform(row):
  data.apply(transform, axis=1)
 
 
+# we should raise a proper TypeError instead of propagating the pyarrow error
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
 @pytest.mark.parametrize(
  "df, func, expected",
  tm.get_cython_table_params(
@@ -229,6 +236,10 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_str
  df.agg(func, axis=axis)
 
 
+# we should raise a proper TypeError instead of propagating the pyarrow error
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
 @pytest.mark.parametrize(
  "series, func, expected",
  chain(

diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py
@@ -104,6 +104,7 @@ def test_numba_nonunique_unsupported(apply_axis):
 
 
 def test_numba_unsupported_dtypes(apply_axis):
+ pytest.importorskip("pyarrow")
  f = lambda x: x
  df = DataFrame({"a": [1, 2], "b": ["a", "b"], "c": [4, 5]})
  df["c"] = df["c"].astype("double[pyarrow]")

diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py
@@ -8,6 +8,9 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -315,6 +318,9 @@ def test_add(self):
  expected = pd.Index(["1a", "1b", "1c"])
  tm.assert_index_equal("1" + index, expected)
 
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+ )
  def test_sub_fail(self, using_infer_string):
  index = pd.Index([str(i) for i in range(10)])
 

diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py
@@ -3,6 +3,10 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
+
 import pandas as pd
 import pandas._testing as tm
 
@@ -90,6 +94,9 @@ def test_op_int8(left_array, right_array, opname):
 # -----------------------------------------------------------------------------
 
 
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
 def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
  # invalid ops
 

diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py
@@ -6,7 +6,10 @@
 
 from pandas._config import using_string_dtype
 
-from pandas.compat import PYPY
+from pandas.compat import (
+ HAS_PYARROW,
+ PYPY,
+)
 
 from pandas import (
  Categorical,
@@ -296,7 +299,9 @@ def test_nbytes(self):
  exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories
  assert cat.nbytes == exp
 
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+ @pytest.mark.xfail(
+ using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)"
+ )
  def test_memory_usage(self):
  cat = Categorical([1, 2, 3])
 

diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
@@ -8,6 +8,8 @@
 
 from pandas._config import using_string_dtype
 
+from pandas.compat import HAS_PYARROW
+
 from pandas.core.dtypes.common import (
  is_float_dtype,
  is_integer_dtype,
@@ -442,7 +444,9 @@ def test_constructor_str_unknown(self):
  with pytest.raises(ValueError, match="Unknown dtype"):
  Categorical([1, 2], dtype="foo")
 
- @pytest.mark.xfail(using_string_dtype(), reason="Can't be NumPy strings")
+ @pytest.mark.xfail(
+ using_string_dtype() and HAS_PYARROW, reason="Can't be NumPy strings"
+ )
  def test_constructor_np_strs(self):
  # GH#31499 Hashtable.map_locations needs to work on np.str_ objects
  cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])

diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas.compat import HAS_PYARROW
+
 import pandas as pd
 from pandas import (
  DataFrame,
@@ -102,9 +104,10 @@ def test_groupby_reductions(op, expected):
  ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")],
  ],
 )
-def test_mixed_reductions(op, expected, using_infer_string):
- if op in ["any", "all"] and using_infer_string:
- expected = expected.astype("bool")
+def test_mixed_reductions(request, op, expected, using_infer_string):
+ if op in ["any", "all"] and using_infer_string and HAS_PYARROW:
+ # TODO(infer_string) inconsistent result type
+ request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
  df = DataFrame(
  {
  "A": ["a", "b", "b"],

diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
@@ -1,6 +1,10 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
+
 from pandas.core.dtypes.dtypes import DatetimeTZDtype
 
 import pandas as pd
@@ -20,6 +24,7 @@
  SparseArray,
  TimedeltaArray,
 )
+from pandas.core.arrays.string_ import StringArrayNumpySemantics
 from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
 
 
@@ -218,7 +223,9 @@ def test_iter_box_period(self):
 )
 def test_values_consistent(arr, expected_type, dtype, using_infer_string):
  if using_infer_string and dtype == "object":
- expected_type = ArrowStringArrayNumpySemantics
+ expected_type = (
+ ArrowStringArrayNumpySemantics if HAS_PYARROW else StringArrayNumpySemantics
+ )
  l_values = Series(arr)._values
  r_values = pd.Index(arr)._values
  assert type(l_values) is expected_type
@@ -355,6 +362,9 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request):
  tm.assert_numpy_array_equal(result, expected)
 
 
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
+)
 @pytest.mark.parametrize("as_series", [True, False])
 @pytest.mark.parametrize(
  "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)]

diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
@@ -5,6 +5,7 @@
 
 from pandas._config import using_string_dtype
 
+from pandas.compat import HAS_PYARROW
 from pandas.compat.pyarrow import pa_version_under12p0
 import pandas.util._test_decorators as td
 
@@ -197,7 +198,7 @@ def test_astype_arrow_timestamp():
  assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 def test_convert_dtypes_infer_objects():
  ser = Series(["a", "b", "c"])
  ser_orig = ser.copy()
@@ -213,7 +214,7 @@ def test_convert_dtypes_infer_objects():
  tm.assert_series_equal(ser, ser_orig)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 def test_convert_dtypes():
  df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]})
  df_orig = df.copy()

diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py
@@ -3,6 +3,8 @@
 
 from pandas._config import using_string_dtype
 
+from pandas.compat import HAS_PYARROW
+
 from pandas import (
  DataFrame,
  Index,
@@ -14,7 +16,7 @@
 from pandas.tests.copy_view.util import get_array
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 def test_concat_frames():
  df = DataFrame({"b": ["a"] * 3})
  df2 = DataFrame({"a": ["a"] * 3})
@@ -33,7 +35,7 @@ def test_concat_frames():
  tm.assert_frame_equal(df, df_orig)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 def test_concat_frames_updating_input():
  df = DataFrame({"b": ["a"] * 3})
  df2 = DataFrame({"a": ["a"] * 3})
@@ -153,7 +155,7 @@ def test_concat_copy_keyword():
  assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 @pytest.mark.parametrize(
  "func",
  [
@@ -249,7 +251,7 @@ def test_merge_copy_keyword():
  assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 def test_join_on_key():
  df_index = Index(["a", "b", "c"], name="key")
 
@@ -277,7 +279,7 @@ def test_join_on_key():
  tm.assert_frame_equal(df2, df2_orig)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 def test_join_multiple_dataframes_on_key():
  df_index = Index(["a", "b", "c"], name="key")
 

diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py
@@ -3,6 +3,8 @@
 
 from pandas._config import using_string_dtype
 
+from pandas.compat import HAS_PYARROW
+
 from pandas import (
  NA,
  DataFrame,
@@ -121,7 +123,7 @@ def test_interpolate_cannot_with_object_dtype():
  df.interpolate()
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 def test_interpolate_object_convert_no_op():
  df = DataFrame({"a": ["a", "b", "c"], "b": 1})
  arr_a = get_array(df, "a")

diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
@@ -3,6 +3,8 @@
 
 from pandas._config import using_string_dtype
 
+from pandas.compat import HAS_PYARROW
+
 import pandas as pd
 from pandas import (
  DataFrame,
@@ -714,7 +716,7 @@ def test_head_tail(method):
  tm.assert_frame_equal(df, df_orig)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 def test_infer_objects():
  df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
  df_orig = df.copy()
@@ -730,6 +732,9 @@ def test_infer_objects():
  tm.assert_frame_equal(df, df_orig)
 
 
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
 def test_infer_objects_no_reference():
  df = DataFrame(
  {
@@ -899,7 +904,7 @@ def test_sort_values_inplace(obj, kwargs):
  tm.assert_equal(view, obj_orig)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 @pytest.mark.parametrize("decimals", [-1, 0, 1])
 def test_round(decimals):
  df = DataFrame({"a": [1, 2], "b": "c"})