@@ -5,7 +5,7 @@
 # License: BSD 3 clause

 import numpy as np
-from scipy import linalg
+from scipy import linalg, sparse

 from .base import _BasePCA
 from ..utils import check_array, gen_batches
@@ -21,11 +21,13 @@ class IncrementalPCA(_BasePCA):
     but not scaled for each feature before applying the SVD.

     Depending on the size of the input data, this algorithm can be much more
-    memory efficient than a PCA.
+    memory efficient than a PCA, and allows sparse input.

     This algorithm has constant memory complexity, on the order
-    of ``batch_size``, enabling use of np.memmap files without loading the
-    entire file into memory.
+    of ``batch_size * n_features``, enabling use of np.memmap files without
+    loading the entire file into memory. For sparse matrices, the input
+    is converted to dense in batches (in order to be able to subtract the
+    mean), which avoids storing the entire dense matrix at any one time.

     The computational overhead of each SVD is
     ``O(batch_size * n_features ** 2)``, but only 2 * batch_size samples
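
A minimal sketch (not part of the diff) of the constant-memory pattern this docstring describes: fitting from an on-disk np.memmap so that only one ``batch_size * n_features`` block is resident at a time. The file name and array shape are hypothetical, and the file must already exist:

    import numpy as np
    from sklearn.decomposition import IncrementalPCA

    # Hypothetical on-disk float64 array of shape (100000, 64).
    X_mm = np.memmap('data.mmap', dtype=np.float64, mode='r',
                     shape=(100000, 64))
    ipca = IncrementalPCA(n_components=7, batch_size=200)
    ipca.fit(X_mm)  # streams 200-row batches; peak extra memory ~ 200 * 64 floats
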
@@ -104,13 +106,15 @@ class IncrementalPCA(_BasePCA):
     --------
     >>> from sklearn.datasets import load_digits
     >>> from sklearn.decomposition import IncrementalPCA
+    >>> from scipy import sparse
     >>> X, _ = load_digits(return_X_y=True)
     >>> transformer = IncrementalPCA(n_components=7, batch_size=200)
     >>> # either partially fit on smaller batches of data
     >>> transformer.partial_fit(X[:100, :])
     IncrementalPCA(batch_size=200, n_components=7)
     >>> # or let the fit function itself divide the data into batches
-    >>> X_transformed = transformer.fit_transform(X)
+    >>> X_sparse = sparse.csr_matrix(X)
+    >>> X_transformed = transformer.fit_transform(X_sparse)
     >>> X_transformed.shape
     (1797, 7)

@@ -167,7 +171,7 @@ def fit(self, X, y=None):

         Parameters
         ----------
-        X : array-like, shape (n_samples, n_features)
+        X : array-like or sparse matrix, shape (n_samples, n_features)
             Training data, where n_samples is the number of samples and
             n_features is the number of features.

@@ -188,7 +192,8 @@ def fit(self, X, y=None):
         self.singular_values_ = None
         self.noise_variance_ = None

-        X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
+        X = check_array(X, accept_sparse=['csr', 'csc', 'lil'],
+                        copy=self.copy, dtype=[np.float64, np.float32])
         n_samples, n_features = X.shape

         if self.batch_size is None:
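
For reference, a small sketch of what the widened ``check_array`` call above accepts, using scikit-learn's public ``sklearn.utils.check_array``: a sparse matrix in one of the listed formats now passes validation (with dtype coercion) instead of raising:

    import numpy as np
    from scipy import sparse
    from sklearn.utils import check_array

    X_sp = sparse.lil_matrix(np.eye(4))
    X_checked = check_array(X_sp, accept_sparse=['csr', 'csc', 'lil'],
                            dtype=[np.float64, np.float32])
    print(type(X_checked), X_checked.dtype)  # stays sparse (LIL), float64
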
@@ -198,7 +203,10 @@ def fit(self, X, y=None):

         for batch in gen_batches(n_samples, self.batch_size_,
                                  min_batch_size=self.n_components or 0):
-            self.partial_fit(X[batch], check_input=False)
+            X_batch = X[batch]
+            if sparse.issparse(X_batch):
+                X_batch = X_batch.toarray()
+            self.partial_fit(X_batch, check_input=False)

         return self

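
The loop above never materializes the full dense matrix: each sparse slice is densified on its own, consumed, and discarded. A standalone sketch of the same pattern (the random matrix and batch size are illustrative):

    from scipy import sparse
    from sklearn.utils import gen_batches

    X = sparse.random(1000, 50, density=0.01, format='csr')
    for batch in gen_batches(X.shape[0], 100):
        X_batch = X[batch].toarray()  # dense copy of this 100-row slice only
        ...                           # e.g. hand X_batch to partial_fit
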
@@ -221,6 +229,11 @@ def partial_fit(self, X, y=None, check_input=True):
             Returns the instance itself.
         """
         if check_input:
+            if sparse.issparse(X):
+                raise TypeError(
+                    "IncrementalPCA.partial_fit does not support "
+                    "sparse input. Either convert data to dense "
+                    "or use IncrementalPCA.fit to do so in batches.")
             X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
         n_samples, n_features = X.shape
         if not hasattr(self, 'components_'):
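
A hedged illustration of how a caller interacts with this new guard: passing a sparse matrix to ``partial_fit`` raises ``TypeError``, so the caller densifies each chunk explicitly (the data here is made up):

    import numpy as np
    from scipy import sparse
    from sklearn.decomposition import IncrementalPCA

    ipca = IncrementalPCA(n_components=2)
    X_chunk = sparse.csr_matrix(np.random.rand(10, 5))
    try:
        ipca.partial_fit(X_chunk)            # raises TypeError with this change
    except TypeError:
        ipca.partial_fit(X_chunk.toarray())  # densify just this chunk instead
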
@@ -274,7 +287,7 @@ def partial_fit(self, X, y=None, check_input=True):
                 np.sqrt((self.n_samples_seen_ * n_samples) /
                         n_total_samples) * (self.mean_ - col_batch_mean)
             X = np.vstack((self.singular_values_.reshape((-1, 1)) *
-                          self.components_, X, mean_correction))
+                           self.components_, X, mean_correction))

             U, S, V = linalg.svd(X, full_matrices=False)
             U, V = svd_flip(U, V, u_based_decision=False)
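
For context, the matrix handed to ``linalg.svd`` above stacks three blocks. With n = ``self.n_samples_seen_``, m = ``n_samples``, and X already centered on the batch mean earlier in ``partial_fit``, a sketch of the quantity the code builds is:

    \hat{X} = \begin{pmatrix}
        \operatorname{diag}(S)\, V \\
        X \\
        \sqrt{nm / (n + m)}\; (\bar{x}_\text{old} - \bar{x}_\text{batch})
    \end{pmatrix}

where ``self.singular_values_.reshape((-1, 1)) * self.components_`` is the row-scaled diag(S) V block, the last row is ``mean_correction``, and ``n_total_samples`` equals n + m.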
@@ -295,3 +308,42 @@ def partial_fit(self, X, y=None, check_input=True):
         else:
             self.noise_variance_ = 0.
         return self
+
+    def transform(self, X):
+        """Apply dimensionality reduction to X.
+
+        X is projected on the first principal components previously extracted
+        from a training set, using minibatches of size batch_size if X is
+        sparse.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape (n_samples, n_features)
+            New data, where n_samples is the number of samples
+            and n_features is the number of features.
+
+        Returns
+        -------
+        X_new : array-like, shape (n_samples, n_components)
+
+        Examples
+        --------
+
+        >>> import numpy as np
+        >>> from sklearn.decomposition import IncrementalPCA
+        >>> X = np.array([[-1, -1], [-2, -1], [-3, -2],
+        ...               [1, 1], [2, 1], [3, 2]])
+        >>> ipca = IncrementalPCA(n_components=2, batch_size=3)
+        >>> ipca.fit(X)
+        IncrementalPCA(batch_size=3, n_components=2)
+        >>> ipca.transform(X)  # doctest: +SKIP
+        """
+        if sparse.issparse(X):
+            n_samples = X.shape[0]
+            output = []
+            for batch in gen_batches(n_samples, self.batch_size_,
+                                     min_batch_size=self.n_components or 0):
+                output.append(super().transform(X[batch].toarray()))
+            return np.vstack(output)
+        else:
+            return super().transform(X)
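
A short usage sketch for the new sparse-aware ``transform`` (array contents and sizes are illustrative): fit on dense data, then transform a sparse matrix; the batches are densified internally exactly as in the branch above.

    import numpy as np
    from scipy import sparse
    from sklearn.decomposition import IncrementalPCA

    X = np.random.rand(200, 20)
    ipca = IncrementalPCA(n_components=5, batch_size=50).fit(X)
    X_new = ipca.transform(sparse.csr_matrix(X))  # ndarray of shape (200, 5)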