Skip to content
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 18 additions & 4 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from __future__ import annotations

from distutils.version import LooseVersion
from typing import TYPE_CHECKING, Any, Sequence, Type, Union
from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union

import numpy as np

from pandas._libs import lib, missing as libmissing
from pandas.util._decorators import doc
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.base import ExtensionDtype
Expand All @@ -15,10 +16,12 @@
from pandas.api.types import (
is_array_like,
is_bool_dtype,
is_int64_dtype,
is_integer,
is_integer_dtype,
is_scalar,
)
from pandas.core.algorithms import factorize
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays.base import ExtensionArray
from pandas.core.indexers import check_array_indexer, validate_indices
Expand Down Expand Up @@ -252,9 +255,20 @@ def __len__(self) -> int:
"""
return len(self._data)

@classmethod
def _from_factorized(cls, values, original):
    """
    Reconstruct an ArrowStringArray after factorization.

    Parameters
    ----------
    values : ndarray
        The unique values produced by factorization.
    original : ExtensionArray
        The original array that was factorized (unused: the string
        values alone are sufficient to rebuild the array).

    Returns
    -------
    ArrowStringArray
    """
    # Delegate straight to the sequence constructor; no dtype or
    # metadata from `original` is needed for string data.
    reconstructed = cls._from_sequence(values)
    return reconstructed
@doc(ExtensionArray.factorize)
def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
    if self._data.num_chunks == 1:
        # Fast path: a single pyarrow chunk can be dictionary-encoded
        # directly, avoiding a round-trip through a NumPy object array.
        # NOTE(review): pyarrow's dictionary_encode reportedly works on
        # ChunkedArray as well (>= 0.17) — this branch may be removable;
        # confirm against the minimum supported pyarrow version.
        encoded = self._data.chunk(0).dictionary_encode()
        indices = encoded.indices.to_pandas()
        if indices.dtype.kind == "f":
            # Nulls surface as NaN in a float result; replace them with
            # the sentinel before casting to integers.
            indices[np.isnan(indices)] = na_sentinel
        # Normalize to int64 in a single step.  copy=False avoids a copy
        # when pyarrow already produced int64 indices.  A bare
        # astype(int) would be platform-dependent (C long is 32-bit on
        # Windows), so the width is pinned explicitly.
        indices = indices.astype(np.int64, copy=False)
        return indices.values, type(self)(encoded.dictionary)
    else:
        # Multiple chunks: fall back to the generic pandas factorize on
        # a materialized NumPy array.
        np_array = self._data.to_pandas().values
        return factorize(np_array, na_sentinel=na_sentinel)

@classmethod
def _concat_same_type(cls, to_concat) -> ArrowStringArray:
Expand Down