Skip to content

Commit b580ad5

Browse files
NicolasHug authored and thomasjpfan committed
BUG Fix zero division error in GBDTs (scikit-learn#14024)
1 parent a5743ed commit b580ad5

File tree

3 files changed

+27
-3
lines changed

3 files changed

+27
-3
lines changed

sklearn/ensemble/_hist_gradient_boosting/grower.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@
1616
from .predictor import TreePredictor
1717
from .utils import sum_parallel
1818
from .types import PREDICTOR_RECORD_DTYPE
19+
from .types import Y_DTYPE
20+
21+
22+
EPS = np.finfo(Y_DTYPE).eps # to avoid zero division errors
1923

2024

2125
class TreeNode:
@@ -398,7 +402,7 @@ def _finalize_leaf(self, node):
398402
https://arxiv.org/abs/1603.02754
399403
"""
400404
node.value = -self.shrinkage * node.sum_gradients / (
401-
node.sum_hessians + self.splitter.l2_regularization)
405+
node.sum_hessians + self.splitter.l2_regularization + EPS)
402406
self.finalized_leaves.append(node)
403407

404408
def _finalize_splittable_nodes(self):

sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,3 +172,20 @@ def test_binning_train_validation_are_separated():
172172
int((1 - validation_fraction) * n_samples))
173173
assert np.all(mapper_training_data.actual_n_bins_ !=
174174
mapper_whole_data.actual_n_bins_)
175+
176+
177+
@pytest.mark.parametrize('data', [
178+
make_classification(random_state=0, n_classes=2),
179+
make_classification(random_state=0, n_classes=3, n_informative=3)
180+
], ids=['binary_crossentropy', 'categorical_crossentropy'])
181+
def test_zero_division_hessians(data):
182+
# non regression test for issue #14018
183+
# make sure we avoid zero division errors when computing the leaves values.
184+
185+
# If the learning rate is too high, the raw predictions are bad and will
186+
# saturate the softmax (or sigmoid in binary classif). This leads to
187+
# probabilities being exactly 0 or 1, gradients being constant, and
188+
# hessians being zero.
189+
X, y = data
190+
gb = HistGradientBoostingClassifier(learning_rate=100, max_iter=10)
191+
gb.fit(X, y)

sklearn/utils/estimator_checks.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2401,8 +2401,11 @@ def check_decision_proba_consistency(name, estimator_orig):
24012401
hasattr(estimator, "predict_proba")):
24022402

24032403
estimator.fit(X, y)
2404-
a = estimator.predict_proba(X_test)[:, 1]
2405-
b = estimator.decision_function(X_test)
2404+
# Since the link function from decision_function() to predict_proba()
2405+
# is sometimes not precise enough (typically expit), we round to the
2406+
# 10th decimal to avoid numerical issues.
2407+
a = estimator.predict_proba(X_test)[:, 1].round(decimals=10)
2408+
b = estimator.decision_function(X_test).round(decimals=10)
24062409
assert_array_equal(rankdata(a), rankdata(b))
24072410

24082411

0 commit comments

Comments (0)