Commit 7d72720

[MRG] Fix crash when using SGDClassifier with early stopping in a parallel grid search (scikit-learn#12122)
1 parent: 6463406
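
Before this fix, early stopping stored the held-out validation data on the
estimator itself and scored by temporarily swapping out self.coef_ and
self.intercept_, so concurrent fits touching the same estimator object could
crash. A minimal sketch of the kind of usage that was affected (the dataset
and parameter grid here are illustrative, not taken from the original report):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import SGDClassifier
    from sklearn.model_selection import GridSearchCV

    X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4,
                               random_state=0)
    clf = SGDClassifier(early_stopping=True, validation_fraction=0.2,
                        tol=1e-3, max_iter=1000, random_state=0)
    # Multiclass fits run one-vs-all in parallel threads; before this commit
    # each thread mutated shared estimator state while scoring.
    grid = GridSearchCV(clf, {'alpha': np.logspace(-4, -2, 3)}, cv=3, n_jobs=2)
    grid.fit(X, y)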

File tree

3 files changed: 136 additions & 83 deletions


sklearn/linear_model/sgd_fast.pyx

Lines changed: 12 additions & 10 deletions
@@ -340,7 +340,7 @@ def plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights,
               double l1_ratio,
               SequentialDataset dataset,
               np.ndarray[unsigned char, ndim=1, mode='c'] validation_mask,
-              bint early_stopping, estimator,
+              bint early_stopping, validation_score_cb,
               int n_iter_no_change,
               int max_iter, double tol, int fit_intercept,
               int verbose, bint shuffle, np.uint32_t seed,
@@ -374,8 +374,9 @@ def plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights,
         Equal to True on the validation set.
     early_stopping : boolean
         Whether to use a stopping criterion based on the validation set.
-    estimator : BaseSGD
-        A concrete object inheriting from ``BaseSGD``.
+    validation_score_cb : callable
+        A callable to compute a validation score given the current
+        coefficients and intercept values.
         Used only if early_stopping is True.
     n_iter_no_change : int
         Number of iteration with no improvement to wait before stopping.
@@ -435,7 +436,7 @@ def plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights,
                       dataset,
                       validation_mask,
                       early_stopping,
-                      estimator,
+                      validation_score_cb,
                       n_iter_no_change,
                       max_iter, tol, fit_intercept,
                       verbose, shuffle, seed,
@@ -458,7 +459,7 @@ def average_sgd(np.ndarray[double, ndim=1, mode='c'] weights,
                 double l1_ratio,
                 SequentialDataset dataset,
                 np.ndarray[unsigned char, ndim=1, mode='c'] validation_mask,
-                bint early_stopping, estimator,
+                bint early_stopping, validation_score_cb,
                 int n_iter_no_change,
                 int max_iter, double tol, int fit_intercept,
                 int verbose, bint shuffle, np.uint32_t seed,
@@ -497,8 +498,9 @@ def average_sgd(np.ndarray[double, ndim=1, mode='c'] weights,
         Equal to True on the validation set.
     early_stopping : boolean
         Whether to use a stopping criterion based on the validation set.
-    estimator : BaseSGD
-        A concrete object inheriting from ``BaseSGD``.
+    validation_score_cb : callable
+        A callable to compute a validation score given the current
+        coefficients and intercept values.
         Used only if early_stopping is True.
     n_iter_no_change : int
         Number of iteration with no improvement to wait before stopping.
@@ -562,7 +564,7 @@ def average_sgd(np.ndarray[double, ndim=1, mode='c'] weights,
                         dataset,
                         validation_mask,
                         early_stopping,
-                        estimator,
+                        validation_score_cb,
                         n_iter_no_change,
                         max_iter, tol, fit_intercept,
                         verbose, shuffle, seed,
@@ -584,7 +586,7 @@ def _plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights,
                double l1_ratio,
                SequentialDataset dataset,
                np.ndarray[unsigned char, ndim=1, mode='c'] validation_mask,
-               bint early_stopping, estimator,
+               bint early_stopping, validation_score_cb,
                int n_iter_no_change,
                int max_iter, double tol, int fit_intercept,
                int verbose, bint shuffle, np.uint32_t seed,
@@ -759,7 +761,7 @@ def _plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights,
             # evaluate the score on the validation set
             if early_stopping:
                 with gil:
-                    score = estimator._validation_score(weights, intercept)
+                    score = validation_score_cb(weights, intercept)
                 if tol > -INFINITY and score < best_score + tol:
                     no_improvement_count += 1
                 else:
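
The net effect in the Cython loop: the score now comes from a plain callable
invoked under the GIL, instead of a method on the shared estimator. The
surrounding early-stopping bookkeeping amounts to roughly the following
Python sketch (update_one_epoch is a stand-in for the Cython inner loop, not
a function in the code base; the real loop also lets tol=-inf disable the
check):

    import numpy as np

    def sgd_with_early_stopping(update_one_epoch, validation_score_cb,
                                max_iter=1000, tol=1e-3, n_iter_no_change=5):
        best_score = -np.inf
        no_improvement_count = 0
        for epoch in range(max_iter):
            weights, intercept = update_one_epoch()  # one SGD pass over data
            # The callback is the only thing that touches the validation set.
            score = validation_score_cb(weights, intercept)
            if score < best_score + tol:
                no_improvement_count += 1
            else:
                no_improvement_count = 0
            if score > best_score:
                best_score = score
            if no_improvement_count >= n_iter_no_change:
                break  # no improvement for n_iter_no_change epochs
        return weights, intercept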

sklearn/linear_model/stochastic_gradient.py

Lines changed: 72 additions & 58 deletions
@@ -11,6 +11,7 @@

 from ..utils import Parallel, delayed

+from ..base import clone, is_classifier
 from .base import LinearClassifierMixin, SparseCoefMixin
 from .base import make_dataset
 from ..base import BaseEstimator, RegressorMixin
@@ -20,7 +21,7 @@
 from ..utils.validation import check_is_fitted
 from ..exceptions import ConvergenceWarning
 from ..externals import six
-from ..model_selection import train_test_split
+from ..model_selection import StratifiedShuffleSplit, ShuffleSplit

 from .sgd_fast import plain_sgd, average_sgd
 from ..utils import compute_class_weight
@@ -43,6 +44,26 @@
 # Default value of ``epsilon`` parameter.


+class _ValidationScoreCallback(object):
+    """Callback for early stopping based on validation score"""
+
+    def __init__(self, estimator, X_val, y_val, sample_weight_val,
+                 classes=None):
+        self.estimator = clone(estimator)
+        self.estimator.t_ = 1  # to pass check_is_fitted
+        if classes is not None:
+            self.estimator.classes_ = classes
+        self.X_val = X_val
+        self.y_val = y_val
+        self.sample_weight_val = sample_weight_val
+
+    def __call__(self, coef, intercept):
+        est = self.estimator
+        est.coef_ = coef.reshape(1, -1)
+        est.intercept_ = np.atleast_1d(intercept)
+        return est.score(self.X_val, self.y_val, self.sample_weight_val)
+
+
 class BaseSGD(six.with_metaclass(ABCMeta, BaseEstimator, SparseCoefMixin)):
     """Base class for SGD classification and regression."""

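The design point of _ValidationScoreCallback is that it clones the estimator
once, up front, and only ever mutates that private clone while scoring, so
parallel binary fits no longer race on the parent's coef_ and intercept_. A
standalone sketch of the same pattern using only public API (the class name
and toy data are illustrative, not from the code base):

    import numpy as np
    from sklearn.base import clone
    from sklearn.linear_model import SGDClassifier

    class ValidationScorer(object):
        """Score candidate coefficients on a held-out set, thread-safely."""

        def __init__(self, estimator, X_val, y_val, classes):
            self.est = clone(estimator)  # private copy; parent never touched
            self.est.t_ = 1              # satisfy check_is_fitted
            self.est.classes_ = classes
            self.X_val, self.y_val = X_val, y_val

        def __call__(self, coef, intercept):
            self.est.coef_ = coef.reshape(1, -1)
            self.est.intercept_ = np.atleast_1d(intercept)
            return self.est.score(self.X_val, self.y_val)

    X_val = np.array([[0., 1.], [1., 0.], [1., 1.], [0., 0.]])
    y_val = np.array([1, -1, 1, -1])
    scorer = ValidationScorer(SGDClassifier(), X_val, y_val,
                              classes=np.array([-1, 1]))
    print(scorer(np.array([0.5, 0.5]), 0.0))  # accuracy of candidate weights
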
@@ -248,71 +269,52 @@ def _allocate_parameter_mem(self, n_classes, n_features, coef_init=None,
                                           dtype=np.float64,
                                           order="C")

-    def _make_validation_split(self, X, y, sample_weight):
+    def _make_validation_split(self, y):
         """Split the dataset between training set and validation set.

         Parameters
         ----------
-        X : {array, sparse matrix}, shape (n_samples, n_features)
-            Training data.
-
         y : array, shape (n_samples, )
             Target values.

-        sample_weight : array, shape (n_samples, )
-            Weights applied to individual samples.
-
         Returns
         -------
         validation_mask : array, shape (n_samples, )
             Equal to 1 on the validation set, 0 on the training set.
         """
-        n_samples = X.shape[0]
+        n_samples = y.shape[0]
         validation_mask = np.zeros(n_samples, dtype=np.uint8)
         if not self.early_stopping:
             # use the full set for training, with an empty validation set
             return validation_mask

-        tmp = train_test_split(X, y, np.arange(n_samples), sample_weight,
-                               test_size=self.validation_fraction,
-                               random_state=self.random_state)
-        X_train, X_val, y_train, y_val = tmp[:4]
-        idx_train, idx_val, sample_weight_train, sample_weight_val = tmp[4:8]
-        if X_train.shape[0] == 0 or X_val.shape[0] == 0:
+        if is_classifier(self):
+            splitter_type = StratifiedShuffleSplit
+        else:
+            splitter_type = ShuffleSplit
+        cv = splitter_type(test_size=self.validation_fraction,
+                           random_state=self.random_state)
+        idx_train, idx_val = next(cv.split(np.zeros(shape=(y.shape[0], 1)), y))
+        if idx_train.shape[0] == 0 or idx_val.shape[0] == 0:
             raise ValueError(
                 "Splitting %d samples into a train set and a validation set "
                 "with validation_fraction=%r led to an empty set (%d and %d "
                 "samples). Please either change validation_fraction, increase "
                 "number of samples, or disable early_stopping."
-                % (n_samples, self.validation_fraction, X_train.shape[0],
-                   X_val.shape[0]))
+                % (n_samples, self.validation_fraction, idx_train.shape[0],
+                   idx_val.shape[0]))

-        self._X_val = X_val
-        self._y_val = y_val
-        self._sample_weight_val = sample_weight_val
         validation_mask[idx_val] = 1
         return validation_mask

-    def _delete_validation_split(self):
-        if self.early_stopping:
-            del self._X_val
-            del self._y_val
-            del self._sample_weight_val
-
-    def _validation_score(self, coef, intercept):
-        """Compute the score on the validation set. Used for early stopping."""
-        # store attributes
-        old_coefs, old_intercept = self.coef_, self.intercept_
-
-        # replace them with current coefficients for scoring
-        self.coef_ = coef.reshape(1, -1)
-        self.intercept_ = np.atleast_1d(intercept)
-        score = self.score(self._X_val, self._y_val, self._sample_weight_val)
-
-        # restore old attributes
-        self.coef_, self.intercept_ = old_coefs, old_intercept
+    def _make_validation_score_cb(self, validation_mask, X, y, sample_weight,
+                                  classes=None):
+        if not self.early_stopping:
+            return None

-        return score
+        return _ValidationScoreCallback(
+            self, X[validation_mask], y[validation_mask],
+            sample_weight[validation_mask], classes=classes)


 def _prepare_fit_binary(est, y, i):
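
Note that the split now needs only y: classifiers get a stratified split so
that every class is represented on both sides, regressors a plain shuffle
split, and only the index mask is kept rather than validation arrays stored
on the estimator. A standalone sketch of the mask construction (the function
name and toy labels are illustrative):

    import numpy as np
    from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

    def make_validation_mask(y, is_clf, validation_fraction=0.2,
                             random_state=0):
        splitter_type = StratifiedShuffleSplit if is_clf else ShuffleSplit
        cv = splitter_type(test_size=validation_fraction,
                           random_state=random_state)
        # The splitter only needs the sample count (plus y for
        # stratification), so a dummy one-column X suffices.
        idx_train, idx_val = next(cv.split(np.zeros((y.shape[0], 1)), y))
        mask = np.zeros(y.shape[0], dtype=np.uint8)
        mask[idx_val] = 1
        return mask

    y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
    print(make_validation_mask(y, is_clf=True))  # two 1s, one per class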
@@ -348,7 +350,7 @@ def _prepare_fit_binary(est, y, i):


 def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter,
-               pos_weight, neg_weight, sample_weight):
+               pos_weight, neg_weight, sample_weight, validation_mask=None):
     """Fit a single binary classifier.

     The i'th class is considered the "positive" class.
@@ -388,6 +390,10 @@ def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter,

     sample_weight : numpy array of shape [n_samples, ]
         The weight of each sample
+
+    validation_mask : numpy array of shape [n_samples, ] or None
+        Precomputed validation mask in case _fit_binary is called in the
+        context of a one-vs-rest reduction.
     """
     # if average is not true, average_coef, and average_intercept will be
     # unused
@@ -399,7 +405,11 @@ def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter,
     penalty_type = est._get_penalty_type(est.penalty)
     learning_rate_type = est._get_learning_rate_type(learning_rate)

-    validation_mask = est._make_validation_split(X, y, sample_weight)
+    if validation_mask is None:
+        validation_mask = est._make_validation_split(y_i)
+    classes = np.array([-1, 1], dtype=y_i.dtype)
+    validation_score_cb = est._make_validation_score_cb(
+        validation_mask, X, y_i, sample_weight, classes=classes)

     # XXX should have random_state_!
     random_state = check_random_state(est.random_state)
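
Because each binary subproblem relabels its targets to +/-1, the callback's
clone has to be told classes_ = [-1, 1]; its score is then the binary
accuracy of that one-vs-rest subproblem, not the multiclass accuracy. The
relabeling is roughly what _prepare_fit_binary produces (toy labels for
illustration):

    import numpy as np

    y = np.array([0, 2, 1, 2, 0])
    i = 2                              # fit class 2 against the rest
    y_i = np.ones(y.shape, dtype=np.float64)
    y_i[y != i] = -1.0
    classes = np.array([-1, 1], dtype=y_i.dtype)
    print(y_i)                         # [-1.  1. -1.  1. -1.]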
@@ -412,8 +422,8 @@ def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter,
     if not est.average:
         result = plain_sgd(coef, intercept, est.loss_function_,
                            penalty_type, alpha, C, est.l1_ratio,
-                           dataset, validation_mask, est.early_stopping, est,
-                           int(est.n_iter_no_change),
+                           dataset, validation_mask, est.early_stopping,
+                           validation_score_cb, int(est.n_iter_no_change),
                            max_iter, tol, int(est.fit_intercept),
                            int(est.verbose), int(est.shuffle), seed,
                            pos_weight, neg_weight,
@@ -426,8 +436,8 @@ def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter,
                              average_intercept, est.loss_function_,
                              penalty_type, alpha, C, est.l1_ratio,
                              dataset, validation_mask, est.early_stopping,
-                             est, int(est.n_iter_no_change),
-                             max_iter, tol,
+                             validation_score_cb,
+                             int(est.n_iter_no_change), max_iter, tol,
                              int(est.fit_intercept), int(est.verbose),
                              int(est.shuffle), seed, pos_weight,
                              neg_weight, learning_rate_type, est.eta0,
@@ -441,7 +451,6 @@ def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter,

         result = standard_coef, standard_intercept, n_iter_

-    est._delete_validation_split()
     return result

@@ -610,14 +619,19 @@ def _fit_multiclass(self, X, y, alpha, C, learning_rate,
         """Fit a multi-class classifier by combining binary classifiers

         Each binary classifier predicts one class versus all others. This
-        strategy is called OVA: One Versus All.
+        strategy is called OvA (One versus All) or OvR (One versus Rest).
         """
+        # Precompute the validation split using the multiclass labels
+        # to ensure proper balancing of the classes.
+        validation_mask = self._make_validation_split(y)
+
         # Use joblib to fit OvA in parallel.
         result = Parallel(n_jobs=self.n_jobs, prefer="threads",
                           verbose=self.verbose)(
             delayed(fit_binary)(self, i, X, y, alpha, C, learning_rate,
                                 max_iter, self._expanded_class_weight[i],
-                                1., sample_weight)
+                                1., sample_weight,
+                                validation_mask=validation_mask)
             for i in range(len(self.classes_)))

         # take the maximum of n_iter_ over every binary fit
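
Since the mask is computed once from the full multiclass y and is only read
by the workers, the threading backend stays safe: no worker mutates shared
estimator state anymore. A toy sketch of that sharing pattern with joblib
(fit_one is a stand-in, not the real fit_binary):

    import numpy as np
    from joblib import Parallel, delayed

    def fit_one(i, mask):
        # Workers only read the shared mask; nothing shared is written.
        return i, int(mask.sum())

    validation_mask = np.array([0, 1, 0, 0, 1], dtype=np.uint8)
    results = Parallel(n_jobs=2, prefer="threads")(
        delayed(fit_one)(i, validation_mask) for i in range(3))
    print(results)  # [(0, 2), (1, 2), (2, 2)]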
@@ -1115,18 +1129,16 @@ def _partial_fit(self, X, y, alpha, C, loss, learning_rate,
         sample_weight = self._validate_sample_weight(sample_weight, n_samples)

         if getattr(self, "coef_", None) is None:
-            self._allocate_parameter_mem(1, n_features,
-                                         coef_init, intercept_init)
+            self._allocate_parameter_mem(1, n_features, coef_init,
+                                         intercept_init)
         elif n_features != self.coef_.shape[-1]:
             raise ValueError("Number of features %d does not match previous "
                              "data %d." % (n_features, self.coef_.shape[-1]))
         if self.average > 0 and getattr(self, "average_coef_", None) is None:
             self.average_coef_ = np.zeros(n_features,
                                           dtype=np.float64,
                                           order="C")
-            self.average_intercept_ = np.zeros(1,
-                                               dtype=np.float64,
-                                               order="C")
+            self.average_intercept_ = np.zeros(1, dtype=np.float64, order="C")

         self._fit_regressor(X, y, alpha, C, loss, learning_rate,
                             sample_weight, max_iter)
@@ -1269,7 +1281,9 @@ def _fit_regressor(self, X, y, alpha, C, loss, learning_rate,
         if not hasattr(self, "t_"):
             self.t_ = 1.0

-        validation_mask = self._make_validation_split(X, y, sample_weight)
+        validation_mask = self._make_validation_split(y)
+        validation_score_cb = self._make_validation_score_cb(
+            validation_mask, X, y, sample_weight)

         random_state = check_random_state(self.random_state)
         # numpy mtrand expects a C long which is a signed 32 bit integer under
@@ -1290,7 +1304,8 @@ def _fit_regressor(self, X, y, alpha, C, loss, learning_rate,
                           alpha, C,
                           self.l1_ratio,
                           dataset,
-                          validation_mask, self.early_stopping, self,
+                          validation_mask, self.early_stopping,
+                          validation_score_cb,
                           int(self.n_iter_no_change),
                           max_iter, tol,
                           int(self.fit_intercept),
@@ -1322,7 +1337,8 @@ def _fit_regressor(self, X, y, alpha, C, loss, learning_rate,
                           alpha, C,
                           self.l1_ratio,
                           dataset,
-                          validation_mask, self.early_stopping, self,
+                          validation_mask, self.early_stopping,
+                          validation_score_cb,
                           int(self.n_iter_no_change),
                           max_iter, tol,
                           int(self.fit_intercept),
@@ -1337,8 +1353,6 @@ def _fit_regressor(self, X, y, alpha, C, loss, learning_rate,
         self.t_ += self.n_iter_ * X.shape[0]
         self.intercept_ = np.atleast_1d(self.intercept_)

-        self._delete_validation_split()
-

 class SGDRegressor(BaseSGDRegressor):
     """Linear model fitted by minimizing a regularized empirical loss with SGD

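On the regressor side the callback is likewise built once per fit, and the
holdout comes from ShuffleSplit since stratification does not apply. A quick
usage check on toy data (parameters are illustrative):

    import numpy as np
    from sklearn.linear_model import SGDRegressor

    rng = np.random.RandomState(0)
    X = rng.randn(200, 3)
    y = X @ np.array([1.0, -2.0, 0.5]) + 0.01 * rng.randn(200)
    reg = SGDRegressor(early_stopping=True, validation_fraction=0.1,
                       n_iter_no_change=5, tol=1e-3, max_iter=1000,
                       random_state=0)
    reg.fit(X, y)       # internally: ShuffleSplit mask + score callback
    print(reg.n_iter_)  # stops well before max_iter on this easy problem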