mkurian
diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py
Lines changed: 39 additions & 3 deletions
diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py
Lines changed: 7 additions & 0 deletions
@@ -147,7 +147,7 @@ def _tolerance(X, tol):
  return np.mean(variances) * tol
 
 
-def k_means(X, n_clusters, init='k-means++', precompute_distances=True,
+def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',
  n_init=10, max_iter=300, verbose=False,
  tol=1e-4, random_state=None, copy_x=True, n_jobs=1,
  return_n_iter=False):
@@ -186,6 +186,17 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances=True,
  If a callable is passed, it should take arguments X, k and
  a random state and return an initialization.
 
+ precompute_distances : {'auto', True, False}
+ Precompute distances (faster but takes more memory).
+
+ 'auto' : do not precompute distances if n_samples * n_clusters >= 12
+ million. This corresponds to about 100MB overhead per job using
+ double precision.
+
+ True : always precompute distances
+
+ False : never precompute distances
+
  tol : float, optional
  The relative increment in the results before declaring convergence.
 
@@ -240,6 +251,20 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances=True,
  X = as_float_array(X, copy=copy_x)
  tol = _tolerance(X, tol)
 
+ # If the distances are precomputed every job will create a matrix of shape
+ # (n_clusters, n_samples). To stop KMeans from eating up memory we only
+ # activate this if the created matrix is guaranteed to be under 100MB. 12
+ # million entries consume a little under 100MB if they are of type double.
+ if precompute_distances == 'auto':
+ n_samples = X.shape[0]
+ precompute_distances = (n_clusters * n_samples) < 12e6
+ elif isinstance(precompute_distances, bool):
+ pass
+ else:
+ raise ValueError("precompute_distances should be 'auto' or True/False"
+ ", but a value of %r was passed" %
+ precompute_distances)
+
  # subtract the mean of X for more accurate distance computations
  if not sp.issparse(X) or hasattr(init, '__array__'):
  X_mean = X.mean(axis=0)
@@ -349,6 +374,9 @@ def _kmeans_single(X, n_clusters, x_squared_norms, max_iter=300,
  x_squared_norms: array
  Precomputed x_squared_norms.
 
+ precompute_distances : boolean, default: True
+ Precompute distances (faster but takes more memory).
+
  random_state: integer or numpy.RandomState, optional
  The generator used to initialize the centers. If an integer is
  given, it fixes the seed. Defaults to the global numpy random
@@ -625,9 +653,17 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
  If an ndarray is passed, it should be of shape (n_clusters, n_features)
  and gives the initial centers.
 
- precompute_distances : boolean, default: True
+ precompute_distances : {'auto', True, False}
  Precompute distances (faster but takes more memory).
 
+ 'auto' : do not precompute distances if n_samples * n_clusters >= 12
+ million. This corresponds to about 100MB overhead per job using
+ double precision.
+
+ True : always precompute distances
+
+ False : never precompute distances
+
  tol : float, default: 1e-4
  Relative tolerance with regards to inertia to declare convergence
 
@@ -684,7 +720,7 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
  """
 
  def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
- tol=1e-4, precompute_distances=True,
+ tol=1e-4, precompute_distances='auto',
  verbose=0, random_state=None, copy_x=True, n_jobs=1):
 
  if hasattr(init, '__array__'):
 
@@ -211,6 +211,13 @@ def test_k_means_plus_plus_init_2_jobs():
  _check_fitted_model(km)
 
 
+def test_k_means_precompute_distances_flag():
+ # check that a ValueError is raised if the precompute_distances value is
+ # not supported
+ km = KMeans(precompute_distances="wrong")
+ assert_raises(ValueError, km.fit, X)
+
+
 def test_k_means_plus_plus_init_sparse():
  km = KMeans(init="k-means++", n_clusters=n_clusters, random_state=42)
  km.fit(X_csr)