xuesj
diff --git a/‎doc/modules/feature_selection.rst‎
Lines changed: 2 additions & 2 deletions b/‎doc/modules/feature_selection.rst‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎examples/ensemble/plot_feature_transformation.py‎
Lines changed: 3 additions & 2 deletions b/‎examples/ensemble/plot_feature_transformation.py‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎examples/ensemble/plot_random_forest_embedding.py‎
Lines changed: 1 addition & 1 deletion b/‎examples/ensemble/plot_random_forest_embedding.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎sklearn/ensemble/tests/test_forest.py‎
Lines changed: 5 additions & 5 deletions b/‎sklearn/ensemble/tests/test_forest.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎sklearn/ensemble/tests/test_gradient_boosting.py‎
Lines changed: 9 additions & 11 deletions b/‎sklearn/ensemble/tests/test_gradient_boosting.py‎
Lines changed: 9 additions & 11 deletions
diff --git a/‎sklearn/feature_selection/from_model.py‎
Lines changed: 31 additions & 8 deletions b/‎sklearn/feature_selection/from_model.py‎
Lines changed: 31 additions & 8 deletions
diff --git a/‎sklearn/feature_selection/tests/test_from_model.py‎
Lines changed: 9 additions & 11 deletions b/‎sklearn/feature_selection/tests/test_from_model.py‎
Lines changed: 9 additions & 11 deletions
diff --git a/‎sklearn/tree/tests/test_tree.py‎
Lines changed: 5 additions & 8 deletions b/‎sklearn/tree/tests/test_tree.py‎
Lines changed: 5 additions & 8 deletions
diff --git a/‎sklearn/utils/estimator_checks.py‎
Lines changed: 15 additions & 3 deletions b/‎sklearn/utils/estimator_checks.py‎
Lines changed: 15 additions & 3 deletions
@@ -173,7 +173,7 @@ for classification::
  >>> X.shape
  (150, 4)
  >>> lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
- >>> model = SelectFromModel(lsvc)
+ >>> model = SelectFromModel(lsvc, prefit=True)
  >>> X_new = model.transform(X)
  >>> X_new.shape
  (150, 3)
@@ -277,7 +277,7 @@ meta-transformer)::
  >>> clf = clf.fit(X, y)
  >>> clf.feature_importances_ # doctest: +SKIP
  array([ 0.04..., 0.05..., 0.4..., 0.4...])
- >>> model = SelectFromModel(clf)
+ >>> model = SelectFromModel(clf, prefit=True)
  >>> X_new = model.transform(X)
  >>> X_new.shape # doctest: +SKIP
  (150, 2)
 
@@ -54,9 +54,10 @@
 rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator)
 rt_lm = LogisticRegression()
 rt.fit(X_train, y_train)
-rt_lm.fit(SelectFromModel(rt).transform(X_train_lr), y_train_lr)
+rt_lm.fit(SelectFromModel(rt, prefit=True).transform(X_train_lr), y_train_lr)
 
-y_pred_rt = rt_lm.predict_proba(SelectFromModel(rt).transform(X_test))[:, 1]
+y_pred_rt = rt_lm.predict_proba(
+SelectFromModel(rt, prefit=True).transform(X_test))[:, 1]
 fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)
 
 # Supervised transformation based on random forests
 
@@ -39,7 +39,7 @@
 # use RandomTreesEmbedding to transform data
 hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
 hasher.fit(X)
-model = SelectFromModel(hasher)
+model = SelectFromModel(hasher, prefit=True)
 X_transformed = model.transform(X)
 
 # Visualize result using PCA
 
@@ -29,7 +29,6 @@
 from sklearn.utils.testing import assert_greater_equal
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_warns
-from sklearn.utils.testing import clean_warning_registry
 from sklearn.utils.testing import ignore_warnings
 
 from sklearn import datasets
@@ -204,10 +203,11 @@ def check_importances(X, y, name, criterion):
  assert_equal(importances.shape[0], 10)
  assert_equal(n_important, 3)
 
- clean_warning_registry()
- with warnings.catch_warnings(record=True) as record:
- X_new = est.transform(X, threshold="mean")
- assert_less(0 < X_new.shape[1], X.shape[1])
+ # XXX: Remove this test in 0.19 after transform support to estimators
+ # is removed.
+ X_new = assert_warns(
+ DeprecationWarning, est.transform, X, threshold="mean")
+ assert_less(0 < X_new.shape[1], X.shape[1])
 
  # Check with parallel
  importances = est.feature_importances_
 
@@ -16,7 +16,7 @@
 from sklearn.ensemble import GradientBoostingRegressor
 from sklearn.ensemble.gradient_boosting import ZeroEstimator
 from sklearn.metrics import mean_squared_error
-from sklearn.utils import check_random_state, tosequence, warnings
+from sklearn.utils import check_random_state, tosequence
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_array_equal
@@ -26,7 +26,6 @@
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_true
 from sklearn.utils.testing import assert_warns
-from sklearn.utils.testing import clean_warning_registry
 from sklearn.utils.testing import ignore_warnings
 from sklearn.utils.validation import DataConversionWarning
 from sklearn.utils.validation import NotFittedError
@@ -297,16 +296,15 @@ def test_feature_importances():
  presort=presort)
  clf.fit(X, y)
  assert_true(hasattr(clf, 'feature_importances_'))
- clean_warning_registry()
- with warnings.catch_warnings(record=True) as record:
- X_new = clf.transform(X, threshold="mean")
- assert_less(X_new.shape[1], X.shape[1])
 
- X_new = clf.transform(X, threshold="mean")
- assert_less(X_new.shape[1], X.shape[1])
-
- feature_mask = clf.feature_importances_ > clf.feature_importances_.mean()
- assert_array_almost_equal(X_new, X[:, feature_mask])
+ # XXX: Remove this test in 0.19 after transform support to estimators
+ # is removed.
+ X_new = assert_warns(
+ DeprecationWarning, clf.transform, X, threshold="mean")
+ assert_less(X_new.shape[1], X.shape[1])
+ feature_mask = (
+ clf.feature_importances_ > clf.feature_importances_.mean())
+ assert_array_almost_equal(X_new, X[:, feature_mask])
 
 
 def test_probability_log():
 
@@ -66,6 +66,10 @@ def _calculate_threshold(estimator, importances, threshold):
  elif threshold == "mean":
  threshold = np.mean(importances)
 
+ else:
+ raise ValueError("Expected threshold='mean' or threshold='median' "
+ "got %s" % threshold)
+
  else:
  threshold = float(threshold)
 
@@ -144,10 +148,8 @@ class SelectFromModel(BaseEstimator, SelectorMixin):
  ----------
  estimator : object
  The base estimator from which the transformer is built.
- This can be both a fitted or a non-fitted estimator.
- If it a fitted estimator, then ``transform`` can be called directly,
- otherwise train the model using ``fit`` and then ``transform`` to do
- feature selection.
+ This can be either a fitted (if ``prefit`` is set to True)
+ or a non-fitted estimator.
 
  threshold : string, float, optional
  The threshold value to use for feature selection. Features whose
@@ -158,26 +160,39 @@ class SelectFromModel(BaseEstimator, SelectorMixin):
  available, the object attribute ``threshold`` is used. Otherwise,
  "mean" is used by default.
 
+ prefit : bool, default False
+ Whether a prefit model is expected to be passed into the constructor
+ directly or not. If True, ``transform`` must be called directly
+ and SelectFromModel cannot be used with ``cross_val_score``,
+ ``GridSearchCV`` and similar utilities that clone the estimator.
+ Otherwise train the model using ``fit`` and then ``transform`` to do
+ feature selection.
+
  Attributes
  ----------
  `estimator_`: an estimator
  The base estimator from which the transformer is built.
  This is stored only when a non-fitted estimator is passed to the
- ``SelectFromModel``.
+ ``SelectFromModel``, i.e. when prefit is False.
 
  `threshold_`: float
  The threshold value used for feature selection.
  """
- def __init__(self, estimator, threshold=None):
+ def __init__(self, estimator, threshold=None, prefit=False):
  self.estimator = estimator
  self.threshold = threshold
+ self.prefit = prefit
 
  def _get_support_mask(self):
  # SelectFromModel can directly call on transform.
- if hasattr(self, "estimator_"):
+ if self.prefit:
+ estimator = self.estimator
+ elif hasattr(self, 'estimator_'):
  estimator = self.estimator_
  else:
- estimator = self.estimator
+ raise ValueError(
+ 'Either fit the model before transform or set "prefit=True"'
+ ' while passing the fitted estimator to the constructor.')
  scores = _get_feature_importances(estimator)
  self.threshold_ = _calculate_threshold(estimator, scores,
  self.threshold)
@@ -202,6 +217,10 @@ def fit(self, X, y=None, **fit_params):
  self : object
  Returns self.
  """
+ if self.prefit:
+ raise ValueError(
+ 'Fitting will overwrite your already fitted model. Call '
+ 'transform directly.')
  if not hasattr(self, "estimator_"):
  self.estimator_ = clone(self.estimator)
  self.estimator_.fit(X, y, **fit_params)
@@ -226,6 +245,10 @@ def partial_fit(self, X, y=None, **fit_params):
  self : object
  Returns self.
  """
+ if self.prefit:
+ raise ValueError(
+ 'Fitting will overwrite your already fitted model. Call '
+ 'transform directly.')
  if not hasattr(self, "estimator_"):
  self.estimator_ = clone(self.estimator)
  self.estimator_.partial_fit(X, y, **fit_params)
 
@@ -3,14 +3,13 @@
 
 from nose.tools import assert_raises, assert_true
 
-from sklearn.utils import warnings
 from sklearn.utils.testing import assert_less
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_almost_equal
-from sklearn.utils.testing import clean_warning_registry
+from sklearn.utils.testing import assert_warns
 
 from sklearn import datasets
 from sklearn.linear_model import LogisticRegression
@@ -33,9 +32,8 @@ def test_transform_linear_model():
  X = func(iris.data)
  clf.set_params(penalty="l1")
  clf.fit(X, iris.target)
- clean_warning_registry()
- with warnings.catch_warnings(record=True) as record:
- X_new = clf.transform(X, thresh)
+ X_new = assert_warns(
+ DeprecationWarning, clf.transform, X, thresh)
  if isinstance(clf, SGDClassifier):
  assert_true(X_new.shape[1] <= X.shape[1])
  else:
@@ -48,10 +46,10 @@ def test_transform_linear_model():
 
 def test_invalid_input():
  clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=None)
-
- clf.fit(iris.data, iris.target)
- assert_raises(ValueError, clf.transform, iris.data, "gobbledigook")
- assert_raises(ValueError, clf.transform, iris.data, ".5 * gobbledigook")
+ for threshold in ["gobbledigook", ".5 * gobbledigook"]:
+  model = SelectFromModel(clf, threshold=threshold)
+  model.fit(iris.data, iris.target)
+  assert_raises(ValueError, model.transform, iris.data)
 
 
 def test_validate_estimator():
@@ -133,7 +131,7 @@ def test_fitted_estimator():
  X_transform = model.transform(iris.data)
 
  clf.fit(iris.data, iris.target)
- model = SelectFromModel(clf)
+ model = SelectFromModel(clf, prefit=True)
  assert_array_equal(model.transform(iris.data), X_transform)
 
 
@@ -146,7 +144,7 @@ def test_threshold_string():
  # Calculate the threshold from the estimator directly.
  est.fit(iris.data, iris.target)
  threshold = 0.5 * np.mean(est.feature_importances_)
- model = SelectFromModel(est, threshold=threshold)
+ model = SelectFromModel(est, threshold=threshold, prefit=True)
  assert_array_equal(X_transform, model.transform(iris.data))
 
 
 
@@ -16,7 +16,6 @@
 from sklearn.metrics import accuracy_score
 from sklearn.metrics import mean_squared_error
 
-from sklearn.utils import warnings
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_almost_equal
@@ -27,7 +26,7 @@
 from sklearn.utils.testing import assert_greater_equal
 from sklearn.utils.testing import assert_less
 from sklearn.utils.testing import assert_true
-from sklearn.utils.testing import clean_warning_registry
+from sklearn.utils.testing import assert_warns
 from sklearn.utils.testing import raises
 
 from sklearn.utils.validation import check_random_state
@@ -380,12 +379,10 @@ def test_importances():
  assert_equal(importances.shape[0], 10, "Failed with {0}".format(name))
  assert_equal(n_important, 3, "Failed with {0}".format(name))
 
-
- clean_warning_registry()
- with warnings.catch_warnings(record=True) as record:
- X_new = clf.transform(X, threshold="mean")
- assert_less(0, X_new.shape[1], "Failed with {0}".format(name))
- assert_less(X_new.shape[1], X.shape[1], "Failed with {0}".format(name))
+ X_new = assert_warns(
+ DeprecationWarning, clf.transform, X, threshold="mean")
+ assert_less(0, X_new.shape[1], "Failed with {0}".format(name))
+ assert_less(X_new.shape[1], X.shape[1], "Failed with {0}".format(name))
 
  # Check on iris that importances are the same for all builders
  clf = DecisionTreeClassifier(random_state=0)
 
@@ -61,6 +61,16 @@
  'RANSACRegressor', 'RadiusNeighborsRegressor',
  'RandomForestRegressor', 'Ridge', 'RidgeCV']
 
+# Estimators with deprecated transform methods. Can be removed in 0.19 when
+# _LearntSelectorMixin is removed.
+DEPRECATED_TRANSFORM = [
+ "RandomForestClassifier", "RandomForestRegressor", "ExtraTreesClassifier",
+ "ExtraTreesRegressor", "RandomTreesEmbedding", "DecisionTreeClassifier",
+ "DecisionTreeRegressor", "ExtraTreeClassifier", "ExtraTreeRegressor",
+ "LinearSVC", "SGDClassifier", "SGDRegressor", "Perceptron",
+ "LogisticRegression", "LogisticRegressionCV",
+ "GradientBoostingClassifier", "GradientBoostingRegressor"]
+
 
 def _yield_non_meta_checks(name, Estimator):
  yield check_estimators_dtypes
@@ -168,8 +178,9 @@ def _yield_all_checks(name, Estimator):
  for check in _yield_regressor_checks(name, Estimator):
  yield check
  if issubclass(Estimator, TransformerMixin):
- for check in _yield_transformer_checks(name, Estimator):
- yield check
+ if name not in DEPRECATED_TRANSFORM:
+ for check in _yield_transformer_checks(name, Estimator):
+ yield check
  if issubclass(Estimator, ClusterMixin):
  for check in _yield_clustering_checks(name, Estimator):
  yield check
@@ -329,7 +340,8 @@ def check_dtype_object(name, Estimator):
  if hasattr(estimator, "predict"):
  estimator.predict(X)
 
- if hasattr(estimator, "transform"):
+ if (hasattr(estimator, "transform") and
+ name not in DEPRECATED_TRANSFORM):
  estimator.transform(X)
 
  try: