Skip to content
17 changes: 11 additions & 6 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,15 +509,20 @@ def ensure_wrapped_if_datetimelike(arr):

def sanitize_masked_array(data: ma.MaskedArray) -> np.ndarray:
"""
Convert numpy MaskedArray to ensure mask is softened.
Convert numpy MaskedArray to ensure mask is softened,

"""
mask = ma.getmaskarray(data)
if mask.any():
dtype, fill_value = maybe_promote(data.dtype, np.nan)
dtype = cast(np.dtype, dtype)
data = ma.asarray(data.astype(dtype, copy=True))
data.soften_mask() # set hardmask False if it was True
data[mask] = fill_value
dtype = cast(np.dtype, data.dtype)
if isinstance(dtype, ExtensionDtype) and dtype.name.startswith("Masked"):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't checked thoroughly, but I think you want `dtype._can_hold_na` here instead of having the logic depend on the name of the dtype.

data = ma.asarray(data.astype(dtype, copy=True))
data.soften_mask() # If the data is a Masked EA, directly soften the mask.
else:
dtype, fill_value = maybe_promote(data.dtype, np.nan)
data = ma.asarray(data.astype(dtype, copy=True))
data.soften_mask() # set hardmask False if it was True
data[mask] = fill_value
else:
data = data.copy()
return data
Expand Down
35 changes: 35 additions & 0 deletions pandas/tests/dtypes/cast/test_construct_ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,38 @@ def test_construct_1d_ndarray_preserving_na_datetimelike(dtype):

result = sanitize_array(arr, index=None, dtype=np.dtype(object))
tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize(
"values, dtype, expected",
[
(
np.ma.masked_array([1, 2, 3], mask=[False, True, False]),
"int64",
np.array([1, 2, 3], dtype=np.int64),
),
(
np.ma.masked_array([1, 2, 3], mask=[False, True, False]),
"float64",
np.array([1, 2, 3], dtype=np.float64),
),
(
np.ma.masked_array([1, 2, 3], mask=[False, True, False]),
"UInt64",
np.array([1, 2, 3], dtype=np.uint64),
),
(
np.ma.masked_array([1.0, 2.0, 3.0], mask=[False, True, False]),
"float64",
np.array([1.0, 2.0, 3.0], dtype=np.float64),
),
(
np.ma.masked_array([1.0, 2.0, 3.0], mask=[False, True, False]),
"Int64",
np.array([1, 2, 3], dtype=np.int64),
),
],
)
def test_sanitize_masked_array_with_masked_ea(values, dtype, expected):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a case that tests a large number, as in the linked issue?

Also start the test with a reference to the GitHub issue:

# GH#60050 
result = sanitize_array(values, index=None, dtype=dtype)
tm.assert_masked_array_equal(result, expected)
Loading