sworddish
diff --git a/‎doc/modules/svm.rst‎
Lines changed: 20 additions & 16 deletions b/‎doc/modules/svm.rst‎
Lines changed: 20 additions & 16 deletions
diff --git a/‎doc/tutorial.rst‎
Lines changed: 5 additions & 4 deletions b/‎doc/tutorial.rst‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎doc/whats_new.rst‎
Lines changed: 5 additions & 0 deletions b/‎doc/whats_new.rst‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎examples/applications/face_recognition.py‎
Lines changed: 1 addition & 2 deletions b/‎examples/applications/face_recognition.py‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎examples/linear_model/plot_sgd_weighted_classes.py‎
Lines changed: 5 additions & 5 deletions b/‎examples/linear_model/plot_sgd_weighted_classes.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎examples/svm/plot_separating_hyperplane_unbalanced.py‎
Lines changed: 5 additions & 5 deletions b/‎examples/svm/plot_separating_hyperplane_unbalanced.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎sklearn/linear_model/logistic.py‎
Lines changed: 4 additions & 4 deletions b/‎sklearn/linear_model/logistic.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎sklearn/linear_model/stochastic_gradient.py‎
Lines changed: 14 additions & 23 deletions b/‎sklearn/linear_model/stochastic_gradient.py‎
Lines changed: 14 additions & 23 deletions
@@ -82,9 +82,10 @@ training samples::
  >>> X = [[0, 0], [1, 1]]
  >>> Y = [0, 1]
  >>> clf = svm.SVC()
- >>> clf.fit(X, Y)
- SVC(C=1.0, cache_size=200, coef0=0.0, degree=3, gamma=0.5, kernel='rbf',
- probability=False, scale_C=True, shrinking=True, tol=0.001)
+ >>> clf.fit(X, Y) # doctest: +NORMALIZE_WHITESPACE
+ SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
+ gamma=0.5, kernel='rbf', probability=False, scale_C=True, shrinking=True,
+ tol=0.001)
 
 After being fitted, the model can then be used to predict new values::
 
@@ -120,9 +121,10 @@ classifiers are constructed and each one trains data from two classes::
  >>> X = [[0], [1], [2], [3]]
  >>> Y = [0, 1, 2, 3]
  >>> clf = svm.SVC()
- >>> clf.fit(X, Y)
- SVC(C=1.0, cache_size=200, coef0=0.0, degree=3, gamma=1.0, kernel='rbf',
- probability=False, scale_C=True, shrinking=True, tol=0.001)
+ >>> clf.fit(X, Y) # doctest: +NORMALIZE_WHITESPACE
+ SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
+ gamma=1.0, kernel='rbf', probability=False, scale_C=True, shrinking=True,
+ tol=0.001)
  >>> dec = clf.decision_function([[1]])
  >>> dec.shape[1] # 4 classes: 4*3/2 = 6
  6
@@ -132,9 +134,10 @@ multi-class strategy, thus training n_class models. If there are only
 two classes, only one model is trained::
 
  >>> lin_clf = svm.LinearSVC()
- >>> lin_clf.fit(X, Y)
- LinearSVC(C=1.0, dual=True, fit_intercept=True, intercept_scaling=1,
- loss='l2', multi_class=False, penalty='l2', scale_C=True, tol=0.0001)
+ >>> lin_clf.fit(X, Y) # doctest: +NORMALIZE_WHITESPACE
+ LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
+ intercept_scaling=1, loss='l2', multi_class=False, penalty='l2',
+ scale_C=True, tol=0.0001)
  >>> dec = lin_clf.decision_function([[1]])
  >>> dec.shape[1]
  4
@@ -258,9 +261,10 @@ floating point values instead of integer values::
  >>> X = [[0, 0], [2, 2]]
  >>> y = [0.5, 2.5]
  >>> clf = svm.SVR()
- >>> clf.fit(X, y)
- SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.5,
- kernel='rbf', probability=False, scale_C=True, shrinking=True, tol=0.001)
+ >>> clf.fit(X, y) # doctest: +NORMALIZE_WHITESPACE
+ SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
+ epsilon=0.1, gamma=0.5, kernel='rbf', probability=False, scale_C=True,
+ shrinking=True, tol=0.001)
  >>> clf.predict([[1, 1]])
  array([ 1.5])
 
@@ -451,10 +455,10 @@ vectors and the test vectors must be provided.
  >>> clf = svm.SVC(kernel='precomputed')
  >>> # linear kernel computation
  >>> gram = np.dot(X, X.T)
- >>> clf.fit(gram, y)
- SVC(C=1.0, cache_size=200, coef0=0.0, degree=3, gamma=0.0,
-  kernel='precomputed', probability=False, scale_C=True, shrinking=True,
-   tol=0.001)
+ >>> clf.fit(gram, y) # doctest: +NORMALIZE_WHITESPACE
+ SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
+ gamma=0.0, kernel='precomputed', probability=False, scale_C=True,
+ shrinking=True, tol=0.001)
  >>> # predict on training examples
  >>> clf.predict(gram)
  array([ 0., 1.])
 
@@ -152,8 +152,9 @@ set, let us use all the images of our dataset apart from the last
 one::
 
  >>> clf.fit(digits.data[:-1], digits.target[:-1])
- SVC(C=100.0, cache_size=200, coef0=0.0, degree=3, gamma=0.001, kernel='rbf',
- probability=False, scale_C=True, shrinking=True, tol=0.001)
+ SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
+ gamma=0.001, kernel='rbf', probability=False, scale_C=True,
+ shrinking=True, tol=0.001)
 
 Now you can predict new values. In particular, we can ask the
 classifier what is the digit of our last image in the `digits` dataset,
@@ -188,8 +189,8 @@ persistence model, namely `pickle <http://docs.python.org/library/pickle.html>`_
  >>> iris = datasets.load_iris()
  >>> X, y = iris.data, iris.target
  >>> clf.fit(X, y)
- SVC(C=1.0, cache_size=200, coef0=0.0, degree=3, gamma=0.25, kernel='rbf',
- probability=False, scale_C=True, shrinking=True, tol=0.001)
+ SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.25,
+ kernel='rbf', probability=False, scale_C=True, shrinking=True, tol=0.001)
 
  >>> import pickle
  >>> s = pickle.dumps(clf)
 
@@ -70,6 +70,11 @@ API changes summary
  objects are now deprecated.
  `scores_` or `pvalues_` should be used instead.
 
+ - In :class:`LogisticRegression`, :class:`LinearSVC`, :class:`SVC` and
+ :class:`NuSVC`, the `class_weight` parameter is now an initialization
+ parameter, not a parameter to `fit`. This makes grid searches
+ over this parameter possible.
+
  - LFW ``data`` is now always shape ``(n_samples, n_features)`` to be
  consistent with the Olivetti faces dataset. Use ``images`` and
  ``pairs`` attribute to access the natural images shapes instead.
 
@@ -110,8 +110,7 @@
  'C': [1, 5, 10, 50, 100],
  'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
 }
-clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
- fit_params={'class_weight': 'auto'})
+clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
 clf = clf.fit(X_train_pca, y_train)
 print "done in %0.3fs" % (time() - t0)
 print "Best estimator found by grid search:"
 
@@ -39,19 +39,19 @@
 
 
 # get the separating hyperplane using weighted classes
-wclf = SGDClassifier(n_iter=100, alpha=0.01)
-wclf.fit(X, y, class_weight={1: 10})
+wclf = SGDClassifier(n_iter=100, alpha=0.01, class_weight={1: 10})
+wclf.fit(X, y)
 
 ww = wclf.coef_.ravel()
 wa = -ww[0] / ww[1]
 wyy = wa * xx - wclf.intercept_ / ww[1]
 
 # plot separating hyperplanes and samples
 pl.set_cmap(pl.cm.Paired)
-h0 = pl.plot(xx, yy, 'k-')
-h1 = pl.plot(xx, wyy, 'k--')
+h0 = pl.plot(xx, yy, 'k-', label='no weights')
+h1 = pl.plot(xx, wyy, 'k--', label='with weights')
 pl.scatter(X[:, 0], X[:, 1], c=y)
-pl.legend((h0, h1), ('no weights', 'with weights'))
+pl.legend()
 
 pl.axis('tight')
 pl.show()
@@ -35,19 +35,19 @@
 
 
 # get the separating hyperplane using weighted classes
-wclf = svm.SVC(kernel='linear')
-wclf.fit(X, y, class_weight={1: 10})
+wclf = svm.SVC(kernel='linear', class_weight={1: 10})
+wclf.fit(X, y)
 
 ww = wclf.coef_[0]
 wa = -ww[0] / ww[1]
 wyy = wa * xx - wclf.intercept_[0] / ww[1]
 
 # plot separating hyperplanes and samples
 pl.set_cmap(pl.cm.Paired)
-h0 = pl.plot(xx, yy, 'k-')
-h1 = pl.plot(xx, wyy, 'k--')
+h0 = pl.plot(xx, yy, 'k-', label='no weights')
+h1 = pl.plot(xx, wyy, 'k--', label='with weights')
 pl.scatter(X[:, 0], X[:, 1], c=y)
-pl.legend((h0, h1), ('no weights', 'with weights'))
+pl.legend()
 
 pl.axis('tight')
 pl.show()
@@ -90,12 +90,12 @@ class LogisticRegression(BaseLibLinear, ClassifierMixin, SelectorMixin):
 
  def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0,
  fit_intercept=True, intercept_scaling=1,
- scale_C=True):
+ scale_C=True, class_weight=None):
 
  super(LogisticRegression, self).__init__(penalty=penalty,
  dual=dual, loss='lr', tol=tol, C=C,
  fit_intercept=fit_intercept, intercept_scaling=intercept_scaling,
- scale_C=scale_C)
+ scale_C=scale_C, class_weight=class_weight)
 
  def predict_proba(self, X):
  """Probability estimates.
@@ -118,8 +118,8 @@ def predict_proba(self, X):
  prob_wrap = (csr_predict_prob_wrap if self._sparse else
  predict_prob_wrap)
  probas = prob_wrap(X, self.raw_coef_, self._get_solver_type(),
- self.tol, self.C, self.class_weight_label,
- self.class_weight, self.label_, self._get_bias())
+ self.tol, self.C, self.class_weight_label_,
+ self.class_weight_, self.label_, self._get_bias())
  return probas[:, np.argsort(self.label_)]
 
  def predict_log_proba(self, X):
 
@@ -6,6 +6,7 @@
 
 import numpy as np
 import scipy.sparse as sp
+import warnings
 
 from ..externals.joblib import Parallel, delayed
 
@@ -222,8 +223,7 @@ def _set_class_weight(self, class_weight, classes, y):
 
  self._expanded_class_weight = weight
 
- def _partial_fit(self, X, y, n_iter, classes=None,
- class_weight=None, sample_weight=None):
+ def _partial_fit(self, X, y, n_iter, classes=None, sample_weight=None):
  X = safe_asarray(X, dtype=np.float64, order="C")
  y = np.asarray(y)
 
@@ -243,7 +243,7 @@ def _partial_fit(self, X, y, n_iter, classes=None,
  n_classes = self.classes_.shape[0]
 
  # Allocate datastructures from input arguments
- self._set_class_weight(class_weight, self.classes_, y)
+ self._set_class_weight(self.class_weight, self.classes_, y)
  sample_weight = self._validate_sample_weight(sample_weight, n_samples)
 
  if self.coef_ is None:
@@ -283,16 +283,6 @@ def partial_fit(self, X, y, classes=None,
  and can be omitted in the subsequent calls.
  Note that y doesn't need to contain all labels in `classes`.
 
- class_weight : dict, {class_label : weight} or "auto"
- Weights associated with classes.
-
- The "auto" mode uses the values of y to automatically adjust
- weights inversely proportional to class frequencies.
-
- If None, values defined in the previous call to partial_fit
- will be used. If partial_fit was never called before,
- uniform weights are assumed.
-
  sample_weight : array-like, shape = [n_samples], optional
  Weights applied to individual samples.
  If not provided, uniform weights are assumed.
@@ -301,8 +291,12 @@ def partial_fit(self, X, y, classes=None,
  -------
  self : returns an instance of self.
  """
+ if class_weight != None:
+ warnings.warn("Using 'class_weight' as a parameter to the 'fit'"
+ "method is deprecated. Set it on initialization instead.",
+ DeprecationWarning)
+ self.class_weight = class_weight
  return self._partial_fit(X, y, n_iter=1, classes=classes,
- class_weight=class_weight,
  sample_weight=sample_weight)
 
  def fit(self, X, y, coef_init=None, intercept_init=None,
@@ -323,13 +317,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None,
  intercept_init : array, shape = [n_classes]
  The initial intercept to warm-start the optimization.
 
- class_weight : dict, {class_label : weight} or "auto"
- Weights associated with classes. If not given, all classes
- are supposed to have weight one.
-
- The "auto" mode uses the values of y to automatically adjust
- weights inversely proportional to class frequencies.
-
  sample_weight : array-like, shape = [n_samples], optional
  Weights applied to individual samples.
  If not provided, uniform weights are assumed.
@@ -338,6 +325,11 @@ def fit(self, X, y, coef_init=None, intercept_init=None,
  -------
  self : returns an instance of self.
  """
+ if class_weight != None:
+ warnings.warn("Using 'class_weight' as a parameter to the 'fit'"
+ "method is deprecated. Set it on initialization instead.",
+ DeprecationWarning)
+ self.class_weight = class_weight
  X = safe_asarray(X, dtype=np.float64, order="C")
  y = np.asarray(y)
 
@@ -363,8 +355,7 @@ def fit(self, X, y, coef_init=None, intercept_init=None,
 
  self._partial_fit(X, y, self.n_iter,
  classes=classes,
- sample_weight=sample_weight,
- class_weight=class_weight)
+ sample_weight=sample_weight)
 
  # fitting is over, we can now transform coef_ to fortran order
  # for faster predictions