xuesj
diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py
Lines changed: 16 additions & 10 deletions
diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py
Lines changed: 30 additions & 6 deletions
@@ -140,6 +140,18 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
 ###############################################################################
 # K-means batch estimation by EM (expectation maximization)
 
+def _validate_center_shape(X, n_centers, centers):
+    """Check that ``centers`` is compatible with ``X`` and ``n_centers``.
+
+    Parameters
+    ----------
+    X : object with a 2-D ``shape``
+        Data; ``X.shape[1]`` is taken as the expected number of features.
+    n_centers : int
+        Expected number of centers, i.e. rows of ``centers``.
+    centers : array-like
+        Candidate initial centers.
+
+    Raises
+    ------
+    ValueError
+        If ``centers`` does not have ``n_centers`` rows, is not
+        two-dimensional, or its number of columns differs from
+        ``X.shape[1]``.
+    """
+    # np.shape also accepts plain sequences (e.g. a list of lists), which
+    # have no ``.shape`` attribute of their own.
+    shape = np.shape(centers)
+    if not shape or shape[0] != n_centers:
+        raise ValueError('The shape of the initial centers (%s) '
+                         'does not match the number of clusters %i'
+                         % (shape, n_centers))
+    if len(shape) != 2:
+        # Without this guard, 1-D centers would fail on ``shape[1]`` below
+        # with an IndexError instead of an informative ValueError.
+        raise ValueError('The initial centers must be a 2D array of shape '
+                         '(n_clusters, n_features), got shape %s'
+                         % (shape,))
+    if shape[1] != X.shape[1]:
+        raise ValueError(
+            "The number of features of the initial centers %s "
+            "does not match the number of features of the data %s."
+            % (shape[1], X.shape[1]))
+
 
 def _tolerance(X, tol):
  """Return a tolerance which is independent of the dataset"""
@@ -285,7 +297,9 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',
  X -= X_mean
 
  if hasattr(init, '__array__'):
- init = np.asarray(init).copy()
+ init = check_array(init, dtype=np.float64, copy=True)
+ _validate_center_shape(X, n_clusters, init)
+
  init -= X_mean
  if n_init != 1:
  warnings.warn(
@@ -638,11 +652,7 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
  if sp.issparse(centers):
  centers = centers.toarray()
 
- if len(centers) != k:
- raise ValueError('The shape of the initial centers (%s) '
- 'does not match the number of clusters %i'
- % (centers.shape, k))
-
+ _validate_center_shape(X, k, centers)
  return centers
 
 
@@ -759,10 +769,6 @@ def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
  tol=1e-4, precompute_distances='auto',
  verbose=0, random_state=None, copy_x=True, n_jobs=1):
 
- if hasattr(init, '__array__'):
- n_clusters = init.shape[0]
- init = np.asarray(init, dtype=np.float64)
-
  self.n_clusters = n_clusters
  self.init = init
  self.max_iter = max_iter
 
@@ -10,7 +10,7 @@
 from sklearn.utils.testing import SkipTest
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_raises
-from sklearn.utils.testing import assert_raises_regexp
+from sklearn.utils.testing import assert_raises_regex
 from sklearn.utils.testing import assert_true
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import assert_less
@@ -257,8 +257,30 @@ def test_k_means_n_init():
 
  # two regression tests on bad n_init argument
  # previous bug: n_init <= 0 threw non-informative TypeError (#3858)
- assert_raises_regexp(ValueError, "n_init", KMeans(n_init=0).fit, X)
- assert_raises_regexp(ValueError, "n_init", KMeans(n_init=-1).fit, X)
+ assert_raises_regex(ValueError, "n_init", KMeans(n_init=0).fit, X)
+ assert_raises_regex(ValueError, "n_init", KMeans(n_init=-1).fit, X)
+
+
+def test_k_means_explicit_init_shape():
+    """Explicit ``init`` with a wrong shape must fail with a clear error.
+
+    Both array and callable ``init`` values are exercised, once with the
+    wrong number of features and once with the wrong number of centers,
+    for ``KMeans`` and ``MiniBatchKMeans``.
+    """
+    rng = np.random.RandomState(0)
+    X = rng.normal(size=(40, 3))
+
+    feature_msg = "does not match the number of features of the data"
+    cluster_msg = "does not match the number of clusters"
+
+    def too_few_features(X_, k, random_state):
+        # Right number of rows, one column short.
+        return X_[:, :2]
+
+    def too_few_centers(X_, k, random_state):
+        # Right number of columns, only two rows.
+        return X_[:2, :]
+
+    # (init, n_clusters, expected message), in the order they are checked.
+    cases = [
+        (X[:, :2], len(X), feature_msg),
+        (too_few_features, len(X), feature_msg),
+        (X[:2, :], 3, cluster_msg),
+        (too_few_centers, 3, cluster_msg),
+    ]
+    for Class in [KMeans, MiniBatchKMeans]:
+        for init, n_clusters, msg in cases:
+            km = Class(n_init=1, init=init, n_clusters=n_clusters)
+            assert_raises_regex(ValueError, msg, km.fit, X)
 
 
 def test_k_means_fortran_aligned_data():
@@ -267,7 +289,7 @@ def test_k_means_fortran_aligned_data():
  centers = np.array([[0, 0], [0, 1]])
  labels = np.array([0, 1, 1])
  km = KMeans(n_init=1, init=centers, precompute_distances=False,
- random_state=42)
+ random_state=42, n_clusters=2)
  km.fit(X)
  assert_array_equal(km.cluster_centers_, centers)
  assert_array_equal(km.labels_, labels)
@@ -437,8 +459,10 @@ def test_init(X, k, random_state):
 
  # Small test to check that giving the wrong number of centers
  # raises a meaningful error
- assert_raises(ValueError,
- MiniBatchKMeans(init=test_init, random_state=42).fit, X_csr)
+ msg = "does not match the number of clusters"
+ assert_raises_regex(ValueError, msg, MiniBatchKMeans(init=test_init,
+ random_state=42).fit,
+ X_csr)
 
  # Now check that the fit actually works
  mb_k_means = MiniBatchKMeans(n_clusters=3, init=test_init,