2222
2323from pandas ._typing import ArrayLike
2424from pandas .core import algorithms
25+ from pandas .core .algorithms import unique
26+
27+ # ---------------------------------------------------------------------
28+ # types used in annotations
29+
30+ ArrayConvertible = Union [list , tuple , ArrayLike , ABCSeries ]
31+
32+ # ---------------------------------------------------------------------
2533
2634# ---------------------------------------------------------------------
2735# types used in annotations
@@ -42,13 +50,67 @@ def _guess_datetime_format_for_array(arr, **kwargs):
4250 return _guess_datetime_format (arr [non_nan_elements [0 ]], ** kwargs )
4351
4452
def should_cache(arg: ArrayConvertible, unique_share: float = 0.7,
                 check_count: Optional[int] = None) -> bool:
    """
    Decide whether caching converted dates is likely to pay off.

    The first `check_count` elements of `arg` are inspected; caching is
    considered worthwhile when the number of unique values among them does
    not exceed `check_count * unique_share`.

    Parameters
    ----------
    arg : listlike, tuple, 1-d array, Series
    unique_share : float, default 0.7
        Must satisfy 0 < unique_share < 1.
    check_count : int, optional
        Number of leading elements to inspect; must satisfy
        0 <= check_count <= len(arg). When omitted it is derived from
        ``len(arg)`` (see Notes).

    Returns
    -------
    do_caching : bool

    Raises
    ------
    AssertionError
        If `check_count` or `unique_share` is outside its allowed bounds.
        `unique_share` is only validated when a uniqueness check is
        actually going to be performed, i.e. not on the early ``False``
        returns.

    Notes
    -----
    By default, for a sequence of at most 50 items we don't do caching;
    for up to 5000 items we check the first ten percent of the elements
    for a uniqueness share; if the sequence is longer than 5000 items we
    check only the first 500 elements.
    All constants were chosen empirically.
    """
    if check_count is None:
        n_items = len(arg)
        # in this case, the gain from caching is negligible
        if n_items <= 50:
            return False
        check_count = int(n_items * 0.1) if n_items <= 5000 else 500
    else:
        # Raised explicitly (rather than via ``assert``) so the validation
        # is not stripped when Python runs with -O; the exception type and
        # message are the same as before.
        if not 0 <= check_count <= len(arg):
            raise AssertionError(
                'check_count must be in next bounds: [0; len(arg)]')
        if check_count == 0:
            return False

    if not 0 < unique_share < 1:
        raise AssertionError('unique_share must be in next bounds: (0; 1)')

    unique_elements = unique(arg[:check_count])
    # Too many distinct values in the sample means a cache would rarely hit.
    return bool(len(unique_elements) <= check_count * unique_share)
105+
106+
45107def _maybe_cache (arg , format , cache , convert_listlike ):
46108 """
47109 Create a cache of unique dates from an array of dates
48110
49111 Parameters
50112 ----------
51- arg : integer, float, string, datetime, list , tuple, 1-d array, Series
113+ arg : listlike , tuple, 1-d array, Series
52114 format : string
53115 Strftime format to parse time
54116 cache : boolean
@@ -65,11 +127,12 @@ def _maybe_cache(arg, format, cache, convert_listlike):
65127 cache_array = Series ()
66128 if cache :
67129 # Perform a quicker unique check
68- from pandas import Index
69- unique_dates = Index (arg ).unique ()
130+ if not should_cache (arg ):
131+ return cache_array
132+
133+ unique_dates = unique (arg )
70134 if len (unique_dates ) < len (arg ):
71- cache_dates = convert_listlike (unique_dates .to_numpy (),
72- True , format )
135+ cache_dates = convert_listlike (unique_dates , True , format )
73136 cache_array = Series (cache_dates , index = unique_dates )
74137 return cache_array
75138
@@ -448,7 +511,7 @@ def _adjust_to_origin(arg, origin, unit):
448511def to_datetime (arg , errors = 'raise' , dayfirst = False , yearfirst = False ,
449512 utc = None , box = True , format = None , exact = True ,
450513 unit = None , infer_datetime_format = False , origin = 'unix' ,
451- cache = False ):
514+ cache = True ):
452515 """
453516 Convert argument to datetime.
454517
@@ -529,13 +592,16 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
529592 origin.
530593
531594 .. versionadded:: 0.20.0
532- cache : boolean, default False
595+ cache : boolean, default True
533596 If True, use a cache of unique, converted dates to apply the datetime
534597 conversion. May produce significant speed-up when parsing duplicate
535598 date strings, especially ones with timezone offsets.
536599
537600 .. versionadded:: 0.23.0
538601
602+ .. versionchanged:: 0.25.0
603+ - changed default value from False to True
604+
539605 Returns
540606 -------
541607 ret : datetime if parsing succeeded.
0 commit comments