Skip to content

Commit 6409357

Browse files
kain88-de authored and larsmans committed
ENH: Precompute distances only if overhead is below 100MB
Precomputing distances can consume a lot of memory if several jobs are used. With 8 jobs, a dataset of shape (1e6, 2), and n_clusters=100, the precomputed distances will consume 3GB of memory while the dataset only consumes 15MB (assuming double precision is used).
1 parent 090b1c1 commit 6409357

File tree

2 files changed

+46
-3
lines changed

2 files changed

+46
-3
lines changed

sklearn/cluster/k_means_.py

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def _tolerance(X, tol):
147147
return np.mean(variances) * tol
148148

149149

150-
def k_means(X, n_clusters, init='k-means++', precompute_distances=True,
150+
def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',
151151
n_init=10, max_iter=300, verbose=False,
152152
tol=1e-4, random_state=None, copy_x=True, n_jobs=1,
153153
return_n_iter=False):
@@ -186,6 +186,17 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances=True,
186186
If a callable is passed, it should take arguments X, k and
187187
a random state and return an initialization.
188188
189+
precompute_distances : {'auto', True, False}
190+
Precompute distances (faster but takes more memory).
191+
192+
'auto' : do not precompute distances if n_samples * n_clusters > 12
193+
million. This corresponds to about 100MB overhead per job using
194+
double precision.
195+
196+
True : always precompute distances
197+
198+
False : never precompute distances
199+
189200
tol : float, optional
190201
The relative increment in the results before declaring convergence.
191202
@@ -240,6 +251,20 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances=True,
240251
X = as_float_array(X, copy=copy_x)
241252
tol = _tolerance(X, tol)
242253

254+
# If the distances are precomputed every job will create a matrix of shape
255+
# (n_clusters, n_samples). To stop KMeans from eating up memory we only
256+
# activate this if the created matrix is guaranteed to be under 100MB. 12
257+
# million entries consume a little under 100MB if they are of type double.
258+
if precompute_distances == 'auto':
259+
n_samples = X.shape[0]
260+
precompute_distances = (n_clusters * n_samples) < 12e6
261+
elif isinstance(precompute_distances, bool):
262+
pass
263+
else:
264+
raise ValueError("precompute_distances should be 'auto' or True/False"
265+
", but a value of %r was passed" %
266+
precompute_distances)
267+
243268
# subtract the mean of x for more accurate distance computations
244269
if not sp.issparse(X) or hasattr(init, '__array__'):
245270
X_mean = X.mean(axis=0)
@@ -349,6 +374,9 @@ def _kmeans_single(X, n_clusters, x_squared_norms, max_iter=300,
349374
x_squared_norms: array
350375
Precomputed x_squared_norms.
351376
377+
precompute_distances : boolean, default: True
378+
Precompute distances (faster but takes more memory).
379+
352380
random_state: integer or numpy.RandomState, optional
353381
The generator used to initialize the centers. If an integer is
354382
given, it fixes the seed. Defaults to the global numpy random
@@ -625,9 +653,17 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
625653
If an ndarray is passed, it should be of shape (n_clusters, n_features)
626654
and gives the initial centers.
627655
628-
precompute_distances : boolean, default: True
656+
precompute_distances : {'auto', True, False}
629657
Precompute distances (faster but takes more memory).
630658
659+
'auto' : do not precompute distances if n_samples * n_clusters > 12
660+
million. This corresponds to about 100MB overhead per job using
661+
double precision.
662+
663+
True : always precompute distances
664+
665+
False : never precompute distances
666+
631667
tol : float, default: 1e-4
632668
Relative tolerance with regards to inertia to declare convergence
633669
@@ -684,7 +720,7 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
684720
"""
685721

686722
def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
687-
tol=1e-4, precompute_distances=True,
723+
tol=1e-4, precompute_distances='auto',
688724
verbose=0, random_state=None, copy_x=True, n_jobs=1):
689725

690726
if hasattr(init, '__array__'):

sklearn/cluster/tests/test_k_means.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,13 @@ def test_k_means_plus_plus_init_2_jobs():
211211
_check_fitted_model(km)
212212

213213

214+
def test_k_means_precompute_distances_flag():
215+
# check that a ValueError is raised if the precompute_distances flag is not
216+
# supported
217+
km = KMeans(precompute_distances="wrong")
218+
assert_raises(ValueError, km.fit, X)
219+
220+
214221
def test_k_means_plus_plus_init_sparse():
215222
km = KMeans(init="k-means++", n_clusters=n_clusters, random_state=42)
216223
km.fit(X_csr)

0 commit comments

Comments
 (0)