"""
t-distributed stochastic neighbor embedding (t-SNE)

For more details, see:
https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding
"""

import doctest

import numpy as np
from numpy import ndarray
from sklearn.datasets import load_iris


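# Overview of the simplified t-SNE pipeline implemented below:
#   1. Turn pairwise distances in the original space into a joint probability
#      matrix P with a Gaussian kernel (compute_pairwise_affinities).
#   2. Initialize a low-dimensional embedding with small random values.
#   3. Turn pairwise distances in the embedding into a joint probability
#      matrix Q with a Student-t kernel (compute_low_dim_affinities).
#   4. Minimize the Kullback-Leibler divergence KL(P || Q) by gradient descent
#      with momentum (apply_tsne).
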
def collect_dataset() -> tuple[ndarray, ndarray]:
    """
    Load the Iris dataset and return features and labels.

    Returns:
        tuple[ndarray, ndarray]: Feature matrix and target labels.

    >>> features, targets = collect_dataset()
    >>> features.shape
    (150, 4)
    >>> targets.shape
    (150,)
    """
    iris_dataset = load_iris()
    return np.array(iris_dataset.data), np.array(iris_dataset.target)


def compute_pairwise_affinities(data_matrix: ndarray, sigma: float = 1.0) -> ndarray:
    """
    Compute high-dimensional affinities (P matrix) using a Gaussian kernel.

    Args:
        data_matrix: Input data of shape (n_samples, n_features).
        sigma: Gaussian kernel bandwidth.

    Returns:
        ndarray: Symmetrized probability matrix.

    >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
    >>> probabilities = compute_pairwise_affinities(x)
    >>> float(round(probabilities[0, 1], 3))
    0.25
    """
    n_samples = data_matrix.shape[0]
    # Pairwise squared Euclidean distances via
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b
    squared_sum = np.sum(np.square(data_matrix), axis=1)
    squared_distance = np.add(
        np.add(-2 * np.dot(data_matrix, data_matrix.T), squared_sum).T, squared_sum
    )

    # Gaussian kernel with a shared bandwidth; self-affinities are set to zero.
    affinity_matrix = np.exp(-squared_distance / (2 * sigma**2))
    np.fill_diagonal(affinity_matrix, 0)

    # Normalize globally, then symmetrize and average over the number of samples.
    affinity_matrix /= np.sum(affinity_matrix)
    return (affinity_matrix + affinity_matrix.T) / (2 * n_samples)
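
# Reference note: standard t-SNE derives P from conditional probabilities
#     p_{j|i} = exp(-||x_i - x_j||^2 / (2 * sigma_i^2)) / sum_{k != i} exp(...)
# where each sigma_i is tuned by binary search so that the distribution over
# the neighbours of x_i matches a user-chosen perplexity, and then symmetrizes
#     p_ij = (p_{j|i} + p_{i|j}) / (2 * n).
# The function above is a simplified variant: it uses one shared `sigma` and a
# single global normalization instead of the per-point perplexity search.
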


def compute_low_dim_affinities(embedding_matrix: ndarray) -> tuple[ndarray, ndarray]:
    """
    Compute low-dimensional affinities (Q matrix) using a Student-t distribution.

    Args:
        embedding_matrix: Low-dimensional embedding of shape (n_samples, n_components).

    Returns:
        tuple[ndarray, ndarray]: (Q probability matrix, numerator matrix).

    >>> y = np.array([[0.0, 0.0], [1.0, 0.0]])
    >>> q_matrix, numerators = compute_low_dim_affinities(y)
    >>> q_matrix.shape
    (2, 2)
    """
    # Student-t kernel with one degree of freedom: 1 / (1 + squared distance).
    squared_sum = np.sum(np.square(embedding_matrix), axis=1)
    numerator_matrix = 1 / (
        1
        + np.add(
            np.add(-2 * np.dot(embedding_matrix, embedding_matrix.T), squared_sum).T,
            squared_sum,
        )
    )
    np.fill_diagonal(numerator_matrix, 0)

    # Normalize the kernel values into a joint probability matrix Q.
    q_matrix = numerator_matrix / np.sum(numerator_matrix)
    return q_matrix, numerator_matrix
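
# Reference note: in the embedding space t-SNE uses the heavy-tailed kernel
#     q_ij = (1 + ||y_i - y_j||^2)^-1 / sum_{k != l} (1 + ||y_k - y_l||^2)^-1,
# i.e. a Student-t distribution with one degree of freedom. The heavy tails let
# moderately dissimilar points sit far apart in the embedding, which mitigates
# the "crowding problem" of the original SNE.
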


def apply_tsne(
    data_matrix: ndarray,
    n_components: int = 2,
    learning_rate: float = 200.0,
    n_iter: int = 500,
) -> ndarray:
    """
    Apply t-SNE for dimensionality reduction.

    Args:
        data_matrix: Original dataset (features).
        n_components: Target dimension (2D or 3D).
        learning_rate: Step size for gradient descent.
        n_iter: Number of iterations.

    Returns:
        ndarray: Low-dimensional embedding of the data.

    >>> features, _ = collect_dataset()
    >>> embedding = apply_tsne(features, n_components=2, n_iter=50)
    >>> embedding.shape
    (150, 2)
    """
    if n_components < 1 or n_iter < 1:
        raise ValueError("n_components and n_iter must be >= 1")

    n_samples = data_matrix.shape[0]
    rng = np.random.default_rng()
    # Start from a small random embedding.
    embedding = rng.standard_normal((n_samples, n_components)) * 1e-4

    high_dim_affinities = compute_pairwise_affinities(data_matrix)
    high_dim_affinities = np.maximum(high_dim_affinities, 1e-12)

    embedding_increment = np.zeros_like(embedding)
    momentum = 0.5

    for iteration in range(n_iter):
        low_dim_affinities, numerator_matrix = compute_low_dim_affinities(embedding)
        low_dim_affinities = np.maximum(low_dim_affinities, 1e-12)

        affinity_diff = high_dim_affinities - low_dim_affinities

        # Gradient of KL(P || Q):
        # dC/dy_i = 4 * sum_j (p_ij - q_ij) * (1 + ||y_i - y_j||^2)^-1 * (y_i - y_j)
        gradient = 4 * (
            np.multiply(
                np.sum(affinity_diff * numerator_matrix, axis=1)[:, np.newaxis],
                embedding,
            )
            - np.dot(affinity_diff * numerator_matrix, embedding)
        )

        # Gradient descent step with momentum.
        embedding_increment = momentum * embedding_increment - learning_rate * gradient
        embedding += embedding_increment

        # Increase momentum after the initial iterations, as in standard t-SNE.
        if iteration == int(n_iter / 4):
            momentum = 0.8

    return embedding
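
# Note: reference t-SNE implementations usually add "early exaggeration"
# (scaling the P matrix up by a factor of roughly 4-12 for the first ~100
# iterations) and per-parameter gain adaptation; both are omitted above for
# simplicity. A rough sketch of early exaggeration, if one wanted to add it to
# apply_tsne (hypothetical, not part of the current implementation):
#
#     high_dim_affinities *= 4.0  # exaggerate attractive forces early on
#     ...
#     if iteration == 100:
#         high_dim_affinities /= 4.0  # revert to the true affinities
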


def main() -> None:
    """
    Run t-SNE on the Iris dataset and display the first 5 embeddings.

    >>> main() # doctest: +ELLIPSIS
    t-SNE embedding (first 5 points):
    [[...
    """
    features, _labels = collect_dataset()
    embedding = apply_tsne(features, n_components=2, n_iter=300)

    if not isinstance(embedding, np.ndarray):
        raise TypeError("t-SNE embedding must be an ndarray")

    print("t-SNE embedding (first 5 points):")
    print(embedding[:5])

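    # For a quick sanity check, the embedding could be compared against
    # scikit-learn's reference implementation (illustrative sketch, not
    # executed here):

    # from sklearn.manifold import TSNE
    # sk_embedding = TSNE(n_components=2, perplexity=30.0).fit_transform(features)
    # print(sk_embedding[:5])
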
    # Optional visualization (requires matplotlib):

    # import matplotlib.pyplot as plt
    # plt.scatter(embedding[:, 0], embedding[:, 1], c=_labels, cmap="viridis")
    # plt.title("t-SNE Visualization of the Iris Dataset")
    # plt.xlabel("Dimension 1")
    # plt.ylabel("Dimension 2")
    # plt.show()


if __name__ == "__main__":
    doctest.testmod()
    main()