55 Callable ,
66 Union ,
77)
8+ import warnings
89
910import numpy as np
1011
1819 npt ,
1920)
2021from pandas .compat import pa_version_under7p0
22+ from pandas .util ._exceptions import find_stack_level
2123
2224from pandas .core .dtypes .common import (
2325 is_bool_dtype ,
@@ -112,7 +114,7 @@ def __init__(self, values) -> None:
112114 super ().__init__ (values )
113115 self ._dtype = StringDtype (storage = "pyarrow" )
114116
115- if not pa .types .is_string (self ._data .type ):
117+ if not pa .types .is_string (self ._pa_array .type ):
116118 raise ValueError (
117119 "ArrowStringArray requires a PyArrow (chunked) array of string type"
118120 )
@@ -125,7 +127,7 @@ def __len__(self) -> int:
125127 -------
126128 length : int
127129 """
128- return len (self ._data )
130+ return len (self ._pa_array )
129131
130132 @classmethod
131133 def _from_sequence (cls , scalars , dtype : Dtype | None = None , copy : bool = False ):
@@ -193,7 +195,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
193195 if not len (value_set ):
194196 return np .zeros (len (self ), dtype = bool )
195197
196- result = pc .is_in (self ._data , value_set = pa .array (value_set ))
198+ result = pc .is_in (self ._pa_array , value_set = pa .array (value_set ))
197199 # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
198200 # to False
199201 return np .array (result , dtype = np .bool_ )
@@ -206,13 +208,24 @@ def astype(self, dtype, copy: bool = True):
206208 return self .copy ()
207209 return self
208210 elif isinstance (dtype , NumericDtype ):
209- data = self ._data .cast (pa .from_numpy_dtype (dtype .numpy_dtype ))
211+ data = self ._pa_array .cast (pa .from_numpy_dtype (dtype .numpy_dtype ))
210212 return dtype .__from_arrow__ (data )
211213 elif isinstance (dtype , np .dtype ) and np .issubdtype (dtype , np .floating ):
212214 return self .to_numpy (dtype = dtype , na_value = np .nan )
213215
214216 return super ().astype (dtype , copy = copy )
215217
@property
def _data(self):
    """
    Deprecated alias for ``_pa_array``.

    Emits a ``FutureWarning`` on every access and then hands back the
    same object that ``self._pa_array`` refers to.

    Returns
    -------
    The object stored in ``self._pa_array``.
    """
    # Kept as a shim because dask accesses ._data directly.
    warnings.warn(
        f"{type(self).__name__}._data is deprecated and will be removed "
        "in a future version, use ._pa_array instead",
        FutureWarning,
        stacklevel=find_stack_level(),
    )
    return self._pa_array
228+
216229 # ------------------------------------------------------------------------
217230 # String methods interface
218231
@@ -292,12 +305,12 @@ def _str_contains(
292305 fallback_performancewarning ()
293306 return super ()._str_contains (pat , case , flags , na , regex )
294307 else :
295- result = pc .match_substring_regex (self ._data , pat )
308+ result = pc .match_substring_regex (self ._pa_array , pat )
296309 else :
297310 if case :
298- result = pc .match_substring (self ._data , pat )
311+ result = pc .match_substring (self ._pa_array , pat )
299312 else :
300- result = pc .match_substring (pc .utf8_upper (self ._data ), pat .upper ())
313+ result = pc .match_substring (pc .utf8_upper (self ._pa_array ), pat .upper ())
301314 result = BooleanDtype ().__from_arrow__ (result )
302315 if not isna (na ):
303316 result [isna (result )] = bool (na )
@@ -325,7 +338,7 @@ def _str_replace(
325338 return super ()._str_replace (pat , repl , n , case , flags , regex )
326339
327340 func = pc .replace_substring_regex if regex else pc .replace_substring
328- result = func (self ._data , pattern = pat , replacement = repl , max_replacements = n )
341+ result = func (self ._pa_array , pattern = pat , replacement = repl , max_replacements = n )
329342 return type (self )(result )
330343
331344 def _str_match (
@@ -343,68 +356,68 @@ def _str_fullmatch(
343356 return self ._str_match (pat , case , flags , na )
344357
def _str_isalnum(self):
    """
    Run ``pc.utf8_is_alnum`` over the backing ``_pa_array``.

    Returns
    -------
    The kernel output converted via ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_alnum(self._pa_array))
348361
def _str_isalpha(self):
    """
    Run ``pc.utf8_is_alpha`` over the backing ``_pa_array``.

    Returns
    -------
    The kernel output converted via ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_alpha(self._pa_array))
352365
def _str_isdecimal(self):
    """
    Run ``pc.utf8_is_decimal`` over the backing ``_pa_array``.

    Returns
    -------
    The kernel output converted via ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_decimal(self._pa_array))
356369
def _str_isdigit(self):
    """
    Run ``pc.utf8_is_digit`` over the backing ``_pa_array``.

    Returns
    -------
    The kernel output converted via ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_digit(self._pa_array))
360373
def _str_islower(self):
    """
    Run ``pc.utf8_is_lower`` over the backing ``_pa_array``.

    Returns
    -------
    The kernel output converted via ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_lower(self._pa_array))
364377
def _str_isnumeric(self):
    """
    Run ``pc.utf8_is_numeric`` over the backing ``_pa_array``.

    Returns
    -------
    The kernel output converted via ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_numeric(self._pa_array))
368381
def _str_isspace(self):
    """
    Run ``pc.utf8_is_space`` over the backing ``_pa_array``.

    Returns
    -------
    The kernel output converted via ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_space(self._pa_array))
372385
def _str_istitle(self):
    """
    Run ``pc.utf8_is_title`` over the backing ``_pa_array``.

    Returns
    -------
    The kernel output converted via ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_title(self._pa_array))
376389
def _str_isupper(self):
    """
    Run ``pc.utf8_is_upper`` over the backing ``_pa_array``.

    Returns
    -------
    The kernel output converted via ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_upper(self._pa_array))
380393
def _str_len(self):
    """
    Run ``pc.utf8_length`` over the backing ``_pa_array``.

    Returns
    -------
    The kernel output converted via ``Int64Dtype().__from_arrow__``.
    """
    return Int64Dtype().__from_arrow__(pc.utf8_length(self._pa_array))
384397
def _str_lower(self):
    """
    Run ``pc.utf8_lower`` over the backing ``_pa_array``.

    Returns
    -------
    A new instance of ``type(self)`` wrapping the kernel output.
    """
    lowered = pc.utf8_lower(self._pa_array)
    return type(self)(lowered)
387400
def _str_upper(self):
    """
    Run ``pc.utf8_upper`` over the backing ``_pa_array``.

    Returns
    -------
    A new instance of ``type(self)`` wrapping the kernel output.
    """
    uppered = pc.utf8_upper(self._pa_array)
    return type(self)(uppered)
390403
def _str_strip(self, to_strip=None):
    """
    Trim the backing ``_pa_array`` using a pyarrow compute kernel.

    Parameters
    ----------
    to_strip : optional
        When ``None``, ``pc.utf8_trim_whitespace`` is used; otherwise the
        value is passed to ``pc.utf8_trim`` as ``characters``.

    Returns
    -------
    A new instance of ``type(self)`` wrapping the kernel output.
    """
    if to_strip is not None:
        return type(self)(pc.utf8_trim(self._pa_array, characters=to_strip))
    return type(self)(pc.utf8_trim_whitespace(self._pa_array))
397410
def _str_lstrip(self, to_strip=None):
    """
    Left-trim the backing ``_pa_array`` using a pyarrow compute kernel.

    Parameters
    ----------
    to_strip : optional
        When ``None``, ``pc.utf8_ltrim_whitespace`` is used; otherwise the
        value is passed to ``pc.utf8_ltrim`` as ``characters``.

    Returns
    -------
    A new instance of ``type(self)`` wrapping the kernel output.
    """
    if to_strip is not None:
        return type(self)(pc.utf8_ltrim(self._pa_array, characters=to_strip))
    return type(self)(pc.utf8_ltrim_whitespace(self._pa_array))
404417
def _str_rstrip(self, to_strip=None):
    """
    Right-trim the backing ``_pa_array`` using a pyarrow compute kernel.

    Parameters
    ----------
    to_strip : optional
        When ``None``, ``pc.utf8_rtrim_whitespace`` is used; otherwise the
        value is passed to ``pc.utf8_rtrim`` as ``characters``.

    Returns
    -------
    A new instance of ``type(self)`` wrapping the kernel output.
    """
    if to_strip is not None:
        return type(self)(pc.utf8_rtrim(self._pa_array, characters=to_strip))
    return type(self)(pc.utf8_rtrim_whitespace(self._pa_array))
0 commit comments