@@ -147,7 +147,7 @@ def _tolerance(X, tol):
147147 return np .mean (variances ) * tol
148148
149149
150- def k_means (X , n_clusters , init = 'k-means++' , precompute_distances = True ,
150+ def k_means (X , n_clusters , init = 'k-means++' , precompute_distances = 'auto' ,
151151 n_init = 10 , max_iter = 300 , verbose = False ,
152152 tol = 1e-4 , random_state = None , copy_x = True , n_jobs = 1 ,
153153 return_n_iter = False ):
@@ -186,6 +186,17 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances=True,
186186 If a callable is passed, it should take arguments X, k and
187187 a random state and return an initialization.
188188
189+ precompute_distances : {'auto', True, False}
190+ Precompute distances (faster but takes more memory).
191+
192+ 'auto' : do not precompute distances if n_samples * n_clusters > 12
193+ million. This corresponds to about 100MB overhead per job using
194+ double precision.
195+
196+ True : always precompute distances
197+
198+ False : never precompute distances
199+
189200 tol : float, optional
190201 The relative increment in the results before declaring convergence.
191202
@@ -240,6 +251,20 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances=True,
240251 X = as_float_array (X , copy = copy_x )
241252 tol = _tolerance (X , tol )
242253
254+ # If the distances are precomputed every job will create a matrix of shape
255+ # (n_clusters, n_samples). To stop KMeans from eating up memory we only
256+ # activate this if the created matrix is guaranteed to be under 100MB. 12
257+ # million entries consume a little under 100MB if they are of type double.
258+ if precompute_distances == 'auto' :
259+ n_samples = X .shape [0 ]
260+ precompute_distances = (n_clusters * n_samples ) < 12e6
261+ elif isinstance (precompute_distances , bool ):
262+ pass
263+ else :
264+ raise ValueError ("precompute_distances should be 'auto' or True/False"
265+ ", but a value of %r was passed" %
266+ precompute_distances )
267+
243268 # subtract of mean of x for more accurate distance computations
244269 if not sp .issparse (X ) or hasattr (init , '__array__' ):
245270 X_mean = X .mean (axis = 0 )
@@ -349,6 +374,9 @@ def _kmeans_single(X, n_clusters, x_squared_norms, max_iter=300,
349374 x_squared_norms: array
350375 Precomputed x_squared_norms.
351376
377+ precompute_distances : boolean, default: True
378+ Precompute distances (faster but takes more memory).
379+
352380 random_state: integer or numpy.RandomState, optional
353381 The generator used to initialize the centers. If an integer is
354382 given, it fixes the seed. Defaults to the global numpy random
@@ -625,9 +653,17 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
625653 If an ndarray is passed, it should be of shape (n_clusters, n_features)
626654 and gives the initial centers.
627655
628- precompute_distances : boolean, default: True
656+ precompute_distances : {'auto', True, False}
629657 Precompute distances (faster but takes more memory).
630658
659+ 'auto' : do not precompute distances if n_samples * n_clusters > 12
660+ million. This corresponds to about 100MB overhead per job using
661+ double precision.
662+
663+ True : always precompute distances
664+
665+ False : never precompute distances
666+
631667 tol : float, default: 1e-4
632668 Relative tolerance with regards to inertia to declare convergence
633669
@@ -684,7 +720,7 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
684720 """
685721
686722 def __init__ (self , n_clusters = 8 , init = 'k-means++' , n_init = 10 , max_iter = 300 ,
687- tol = 1e-4 , precompute_distances = True ,
723+ tol = 1e-4 , precompute_distances = 'auto' ,
688724 verbose = 0 , random_state = None , copy_x = True , n_jobs = 1 ):
689725
690726 if hasattr (init , '__array__' ):
0 commit comments