Skip to content

Commit 2a7194d

Browse files
NicolasHug authored and thomasjpfan committed
FIX Bin training and validation data separately in GBDTs (scikit-learn#13933)
1 parent 5772667 commit 2a7194d

File tree

4 files changed

+74
-21
lines changed

4 files changed

+74
-21
lines changed

doc/whats_new/v0.22.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,14 @@ Changelog
3939
:pr:`123456` by :user:`Joe Bloggs <joeongithub>`.
4040
where 123456 is the *pull request* number, not the issue number.
4141
42+
:mod:`sklearn.ensemble`
43+
.......................
44+
45+
- |Fix| :class:`ensemble.HistGradientBoostingClassifier` and
46+
:class:`ensemble.HistGradientBoostingRegressor` now bin the training and
47+
validation data separately to avoid any data leak. :pr:`13933` by
48+
`NicolasHug`_.
49+
4250
:mod:`sklearn.linear_model`
4351
..................
4452

sklearn/ensemble/_hist_gradient_boosting/binning.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def transform(self, X):
140140
Returns
141141
-------
142142
X_binned : array-like, shape (n_samples, n_features)
143-
The binned data.
143+
The binned data (fortran-aligned).
144144
"""
145145
X = check_array(X, dtype=[X_DTYPE])
146146
check_is_fitted(self, ['bin_thresholds_', 'actual_n_bins_'])

sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py

Lines changed: 38 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -112,17 +112,6 @@ def fit(self, X, y):
112112
# data.
113113
self._in_fit = True
114114

115-
# bin the data
116-
if self.verbose:
117-
print("Binning {:.3f} GB of data: ".format(X.nbytes / 1e9), end="",
118-
flush=True)
119-
tic = time()
120-
self.bin_mapper_ = _BinMapper(max_bins=self.max_bins, random_state=rng)
121-
X_binned = self.bin_mapper_.fit_transform(X)
122-
toc = time()
123-
if self.verbose:
124-
duration = toc - tic
125-
print("{:.3f} s".format(duration))
126115

127116
self.loss_ = self._get_loss()
128117

@@ -135,17 +124,20 @@ def fit(self, X, y):
135124
# stratify for classification
136125
stratify = y if hasattr(self.loss_, 'predict_proba') else None
137126

138-
X_binned_train, X_binned_val, y_train, y_val = train_test_split(
139-
X_binned, y, test_size=self.validation_fraction,
140-
stratify=stratify, random_state=rng)
127+
X_train, X_val, y_train, y_val = train_test_split(
128+
X, y, test_size=self.validation_fraction, stratify=stratify,
129+
random_state=rng)
130+
else:
131+
X_train, y_train = X, y
132+
X_val, y_val = None, None
141133

142-
# Predicting is faster of C-contiguous arrays, training is faster
143-
# on Fortran arrays.
144-
X_binned_val = np.ascontiguousarray(X_binned_val)
145-
X_binned_train = np.asfortranarray(X_binned_train)
134+
# Bin the data
135+
self.bin_mapper_ = _BinMapper(max_bins=self.max_bins, random_state=rng)
136+
X_binned_train = self._bin_data(X_train, rng, is_training_data=True)
137+
if X_val is not None:
138+
X_binned_val = self._bin_data(X_val, rng, is_training_data=False)
146139
else:
147-
X_binned_train, y_train = X_binned, y
148-
X_binned_val, y_val = None, None
140+
X_binned_val = None
149141

150142
if self.verbose:
151143
print("Fitting gradient boosted rounds:")
@@ -387,6 +379,32 @@ def _should_stop(self, scores):
387379
for score in recent_scores]
388380
return not any(recent_improvements)
389381

382+
def _bin_data(self, X, rng, is_training_data):
383+
"""Bin data X.
384+
385+
If is_training_data, then set the bin_mapper_ attribute.
386+
Else, the binned data is converted to a C-contiguous array.
387+
"""
388+
389+
description = 'training' if is_training_data else 'validation'
390+
if self.verbose:
391+
print("Binning {:.3f} GB of {} data: ".format(
392+
X.nbytes / 1e9, description), end="", flush=True)
393+
tic = time()
394+
if is_training_data:
395+
X_binned = self.bin_mapper_.fit_transform(X) # F-aligned array
396+
else:
397+
X_binned = self.bin_mapper_.transform(X) # F-aligned array
398+
# We convert the array to C-contiguous since predicting is faster
399+
# with this layout (training is faster on F-arrays though)
400+
X_binned = np.ascontiguousarray(X_binned)
401+
toc = time()
402+
if self.verbose:
403+
duration = toc - tic
404+
print("{:.3f} s".format(duration))
405+
406+
return X_binned
407+
390408
def _print_iteration_stats(self, iteration_start_time):
391409
"""Print info about the current fitting iteration."""
392410
log_msg = ''

sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from sklearn.experimental import enable_hist_gradient_boosting # noqa
77
from sklearn.ensemble import HistGradientBoostingRegressor
88
from sklearn.ensemble import HistGradientBoostingClassifier
9+
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
910

1011

1112
X_classification, y_classification = make_classification(random_state=0)
@@ -145,3 +146,29 @@ def test_should_stop(scores, n_iter_no_change, tol, stopping):
145146
n_iter_no_change=n_iter_no_change, tol=tol
146147
)
147148
assert gbdt._should_stop(scores) == stopping
149+
150+
151+
def test_binning_train_validation_are_separated():
152+
# Make sure training and validation data are binned separately.
153+
# See issue 13926
154+
155+
rng = np.random.RandomState(0)
156+
validation_fraction = .2
157+
gb = HistGradientBoostingClassifier(
158+
n_iter_no_change=5,
159+
validation_fraction=validation_fraction,
160+
random_state=rng
161+
)
162+
gb.fit(X_classification, y_classification)
163+
mapper_training_data = gb.bin_mapper_
164+
165+
# Note that since the data is small there is no subsampling and the
166+
# random_state doesn't matter
167+
mapper_whole_data = _BinMapper(random_state=0)
168+
mapper_whole_data.fit(X_classification)
169+
170+
n_samples = X_classification.shape[0]
171+
assert np.all(mapper_training_data.actual_n_bins_ ==
172+
int((1 - validation_fraction) * n_samples))
173+
assert np.all(mapper_training_data.actual_n_bins_ !=
174+
mapper_whole_data.actual_n_bins_)

0 commit comments

Comments (0)