55import pandas as pd
66from pandas .util import testing as tm
77
8- for imp in [' pandas.util' , ' pandas.tools.hashing' ]:
8+ for imp in [" pandas.util" , " pandas.tools.hashing" ]:
99 try :
1010 hashing = import_module (imp )
1111 break
1515
1616class Factorize :
1717
18- params = [[True , False ], [' int' , ' uint' , ' float' , ' string' ]]
19- param_names = [' sort' , ' dtype' ]
18+ params = [[True , False ], [" int" , " uint" , " float" , " string" ]]
19+ param_names = [" sort" , " dtype" ]
2020
2121 def setup (self , sort , dtype ):
22- N = 10 ** 5
23- data = {'int' : pd .Int64Index (np .arange (N ).repeat (5 )),
24- 'uint' : pd .UInt64Index (np .arange (N ).repeat (5 )),
25- 'float' : pd .Float64Index (np .random .randn (N ).repeat (5 )),
26- 'string' : tm .makeStringIndex (N ).repeat (5 )}
22+ N = 10 ** 5
23+ data = {
24+ "int" : pd .Int64Index (np .arange (N ).repeat (5 )),
25+ "uint" : pd .UInt64Index (np .arange (N ).repeat (5 )),
26+ "float" : pd .Float64Index (np .random .randn (N ).repeat (5 )),
27+ "string" : tm .makeStringIndex (N ).repeat (5 ),
28+ }
2729 self .idx = data [dtype ]
2830
2931 def time_factorize (self , sort , dtype ):
@@ -32,15 +34,17 @@ def time_factorize(self, sort, dtype):
3234
3335class FactorizeUnique :
3436
35- params = [[True , False ], [' int' , ' uint' , ' float' , ' string' ]]
36- param_names = [' sort' , ' dtype' ]
37+ params = [[True , False ], [" int" , " uint" , " float" , " string" ]]
38+ param_names = [" sort" , " dtype" ]
3739
3840 def setup (self , sort , dtype ):
39- N = 10 ** 5
40- data = {'int' : pd .Int64Index (np .arange (N )),
41- 'uint' : pd .UInt64Index (np .arange (N )),
42- 'float' : pd .Float64Index (np .arange (N )),
43- 'string' : tm .makeStringIndex (N )}
41+ N = 10 ** 5
42+ data = {
43+ "int" : pd .Int64Index (np .arange (N )),
44+ "uint" : pd .UInt64Index (np .arange (N )),
45+ "float" : pd .Float64Index (np .arange (N )),
46+ "string" : tm .makeStringIndex (N ),
47+ }
4448 self .idx = data [dtype ]
4549 assert self .idx .is_unique
4650
@@ -50,15 +54,17 @@ def time_factorize(self, sort, dtype):
5054
5155class Duplicated :
5256
53- params = [[' first' , ' last' , False ], [' int' , ' uint' , ' float' , ' string' ]]
54- param_names = [' keep' , ' dtype' ]
57+ params = [[" first" , " last" , False ], [" int" , " uint" , " float" , " string" ]]
58+ param_names = [" keep" , " dtype" ]
5559
5660 def setup (self , keep , dtype ):
57- N = 10 ** 5
58- data = {'int' : pd .Int64Index (np .arange (N ).repeat (5 )),
59- 'uint' : pd .UInt64Index (np .arange (N ).repeat (5 )),
60- 'float' : pd .Float64Index (np .random .randn (N ).repeat (5 )),
61- 'string' : tm .makeStringIndex (N ).repeat (5 )}
61+ N = 10 ** 5
62+ data = {
63+ "int" : pd .Int64Index (np .arange (N ).repeat (5 )),
64+ "uint" : pd .UInt64Index (np .arange (N ).repeat (5 )),
65+ "float" : pd .Float64Index (np .random .randn (N ).repeat (5 )),
66+ "string" : tm .makeStringIndex (N ).repeat (5 ),
67+ }
6268 self .idx = data [dtype ]
6369 # cache is_unique
6470 self .idx .is_unique
@@ -69,15 +75,17 @@ def time_duplicated(self, keep, dtype):
6975
7076class DuplicatedUniqueIndex :
7177
72- params = [' int' , ' uint' , ' float' , ' string' ]
73- param_names = [' dtype' ]
78+ params = [" int" , " uint" , " float" , " string" ]
79+ param_names = [" dtype" ]
7480
7581 def setup (self , dtype ):
76- N = 10 ** 5
77- data = {'int' : pd .Int64Index (np .arange (N )),
78- 'uint' : pd .UInt64Index (np .arange (N )),
79- 'float' : pd .Float64Index (np .random .randn (N )),
80- 'string' : tm .makeStringIndex (N )}
82+ N = 10 ** 5
83+ data = {
84+ "int" : pd .Int64Index (np .arange (N )),
85+ "uint" : pd .UInt64Index (np .arange (N )),
86+ "float" : pd .Float64Index (np .random .randn (N )),
87+ "string" : tm .makeStringIndex (N ),
88+ }
8189 self .idx = data [dtype ]
8290 # cache is_unique
8391 self .idx .is_unique
@@ -87,67 +95,74 @@ def time_duplicated_unique(self, dtype):
8795
8896
class Hashing:
    """Benchmarks for ``hashing.hash_pandas_object`` on a frame and its columns.

    NOTE(review): relies on the asv convention that the value returned by
    ``setup_cache`` is handed to every ``time_*`` method as its first
    argument (``df`` below) -- confirm against the benchmark runner in use.
    """

    def setup_cache(self):
        """Build and return the mixed-dtype DataFrame that the benchmarks hash.

        The frame has ``N = 10 ** 5`` rows and the columns ``strings``,
        ``floats``, ``ints``, ``dates``, ``timedeltas`` and ``categories``
        (the ``strings`` column cast to ``category``).
        """
        N = 10 ** 5

        df = pd.DataFrame(
            {
                # N draws (with replacement) from a pool of 10000 strings.
                "strings": pd.Series(
                    tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=N))
                ),
                "floats": np.random.randn(N),
                "ints": np.arange(N),
                "dates": pd.date_range("20110101", freq="s", periods=N),
                "timedeltas": pd.timedelta_range("1 day", freq="s", periods=N),
            }
        )
        df["categories"] = df["strings"].astype("category")
        # Overwrite rows 10..19 of every column with NaN so the hashed data
        # also contains missing values.
        df.iloc[10:20] = np.nan
        return df

    def time_frame(self, df):
        """Hash the whole DataFrame."""
        hashing.hash_pandas_object(df)

    def time_series_int(self, df):
        """Hash the ``ints`` column."""
        hashing.hash_pandas_object(df["ints"])

    def time_series_string(self, df):
        """Hash the ``strings`` column."""
        hashing.hash_pandas_object(df["strings"])

    def time_series_float(self, df):
        """Hash the ``floats`` column."""
        hashing.hash_pandas_object(df["floats"])

    def time_series_categorical(self, df):
        """Hash the ``categories`` column."""
        hashing.hash_pandas_object(df["categories"])

    def time_series_timedeltas(self, df):
        """Hash the ``timedeltas`` column."""
        hashing.hash_pandas_object(df["timedeltas"])

    def time_series_dates(self, df):
        """Hash the ``dates`` column."""
        hashing.hash_pandas_object(df["dates"])
125136
126137
class Quantile:
    """Benchmark ``Series.quantile`` over quantile, interpolation and dtype."""

    # One list per parameter, in the same order as ``param_names``.
    params = [
        [0, 0.5, 1],
        ["linear", "nearest", "lower", "higher", "midpoint"],
        ["float", "int", "uint"],
    ]
    param_names = ["quantile", "interpolation", "dtype"]

    def setup(self, quantile, interpolation, dtype):
        """Store in ``self.idx`` the Series that ``time_quantile`` operates on.

        Parameters
        ----------
        quantile, interpolation
            Not used here; they are only consumed by ``time_quantile``.
        dtype : {"int", "uint", "float"}
            Key selecting which of the prepared arrays backs the Series.
        """
        N = 10 ** 5
        data = {
            "int": np.arange(N),
            "uint": np.arange(N).astype(np.uint64),
            "float": np.random.randn(N),
        }
        # Every element is repeated 5 times, so the Series has 5 * N rows.
        self.idx = pd.Series(data[dtype].repeat(5))

    def time_quantile(self, quantile, interpolation, dtype):
        """Compute ``quantile`` of ``self.idx`` using ``interpolation``."""
        self.idx.quantile(quantile, interpolation=interpolation)
142157
143158
class SortIntegerArray:
    """Benchmark ``argsort`` on an ``Int64`` array that holds one missing value."""

    # Array lengths to benchmark; each value is passed to the methods as ``N``.
    params = [10 ** 3, 10 ** 5]

    def setup(self, N):
        """Store in ``self.array`` an ``Int64`` array of length ``N``.

        The values are ``0 .. N-1`` except position 40, which is NaN in the
        float source data and is therefore missing in the resulting array.
        """
        data = np.arange(N, dtype=float)
        # A float buffer is used so that a NaN can be written into it.
        data[40] = np.nan
        self.array = pd.array(data, dtype="Int64")

    def time_argsort(self, N):
        """Run ``argsort`` on the prepared array."""
        self.array.argsort()
0 commit comments