@@ -112,17 +112,6 @@ def fit(self, X, y):
112112 # data.
113113 self ._in_fit = True
114114
115- # bin the data
116- if self .verbose :
117- print ("Binning {:.3f} GB of data: " .format (X .nbytes / 1e9 ), end = "" ,
118- flush = True )
119- tic = time ()
120- self .bin_mapper_ = _BinMapper (max_bins = self .max_bins , random_state = rng )
121- X_binned = self .bin_mapper_ .fit_transform (X )
122- toc = time ()
123- if self .verbose :
124- duration = toc - tic
125- print ("{:.3f} s" .format (duration ))
126115
127116 self .loss_ = self ._get_loss ()
128117
@@ -135,17 +124,20 @@ def fit(self, X, y):
135124 # stratify for classification
136125 stratify = y if hasattr (self .loss_ , 'predict_proba' ) else None
137126
138- X_binned_train , X_binned_val , y_train , y_val = train_test_split (
139- X_binned , y , test_size = self .validation_fraction ,
140- stratify = stratify , random_state = rng )
127+ X_train , X_val , y_train , y_val = train_test_split (
128+ X , y , test_size = self .validation_fraction , stratify = stratify ,
129+ random_state = rng )
130+ else :
131+ X_train , y_train = X , y
132+ X_val , y_val = None , None
141133
142- # Predicting is faster of C-contiguous arrays, training is faster
143- # on Fortran arrays.
144- X_binned_val = np .ascontiguousarray (X_binned_val )
145- X_binned_train = np .asfortranarray (X_binned_train )
134+ # Bin the data
135+ self .bin_mapper_ = _BinMapper (max_bins = self .max_bins , random_state = rng )
136+ X_binned_train = self ._bin_data (X_train , rng , is_training_data = True )
137+ if X_val is not None :
138+ X_binned_val = self ._bin_data (X_val , rng , is_training_data = False )
146139 else :
147- X_binned_train , y_train = X_binned , y
148- X_binned_val , y_val = None , None
140+ X_binned_val = None
149141
150142 if self .verbose :
151143 print ("Fitting gradient boosted rounds:" )
@@ -387,6 +379,32 @@ def _should_stop(self, scores):
387379 for score in recent_scores ]
388380 return not any (recent_improvements )
389381
def _bin_data(self, X, rng, is_training_data):
    """Bin data X with the estimator's bin mapper.

    ``self.bin_mapper_`` must already exist when this method is called:
    the method fits it (training data) or applies it (validation data),
    but it does not create or assign the attribute itself.

    Parameters
    ----------
    X : array exposing ``nbytes``
        Raw data to bin. ``nbytes`` is only read when ``self.verbose``
        is truthy, to report the amount of data being binned.
    rng : object
        Not read by this method; kept in the signature so existing
        callers keep working.
    is_training_data : bool
        If True, the bin mapper is fitted on X via ``fit_transform`` and
        its output is returned unchanged. If False, X is binned with
        ``transform`` and the result is converted to a C-contiguous
        array.

    Returns
    -------
    X_binned
        The binned data, laid out as described above.
    """
    if self.verbose:
        description = 'training' if is_training_data else 'validation'
        print("Binning {:.3f} GB of {} data: ".format(
            X.nbytes / 1e9, description), end="", flush=True)

    tic = time()
    if is_training_data:
        # The mapper output is returned as-is; the original comments state
        # it is F-ordered, which is the layout training prefers.
        X_binned = self.bin_mapper_.fit_transform(X)
    else:
        # Predicting is faster on C-contiguous arrays, so the validation
        # data is converted once here rather than on every prediction.
        X_binned = np.ascontiguousarray(self.bin_mapper_.transform(X))
    toc = time()

    if self.verbose:
        # Completes the progress line started above (printed with end="").
        print("{:.3f} s".format(toc - tic))

    return X_binned
407+
390408 def _print_iteration_stats (self , iteration_start_time ):
391409 """Print info about the current fitting iteration."""
392410 log_msg = ''
0 commit comments