"""
t-distributed stochastic neighbor embedding (t-SNE)

For more details, see:
https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding
"""

import doctest

import numpy as np
from numpy import ndarray
from sklearn.datasets import load_iris


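# Overview of the simplified t-SNE pipeline implemented below:
#   1. Turn pairwise distances in the original space into a joint probability
#      matrix P with a Gaussian kernel (compute_pairwise_affinities).
#   2. Initialize a low-dimensional embedding with small random values.
#   3. Turn pairwise distances in the embedding into a joint probability
#      matrix Q with a Student-t kernel (compute_low_dim_affinities).
#   4. Minimize the Kullback-Leibler divergence KL(P || Q) by gradient descent
#      with momentum (apply_tsne).
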
def collect_dataset() -> tuple[ndarray, ndarray]:
    """
    Load the Iris dataset and return features and labels.

    Returns:
        tuple[ndarray, ndarray]: Feature matrix and target labels.

    >>> features, targets = collect_dataset()
    >>> features.shape
    (150, 4)
    >>> targets.shape
    (150,)
    """
    iris_dataset = load_iris()
    return np.array(iris_dataset.data), np.array(iris_dataset.target)


def compute_pairwise_affinities(data_matrix: ndarray, sigma: float = 1.0) -> ndarray:
    """
    Compute high-dimensional affinities (P matrix) using a Gaussian kernel.

    Args:
        data_matrix: Input data of shape (n_samples, n_features).
        sigma: Gaussian kernel bandwidth.

    Returns:
        ndarray: Symmetrized probability matrix.

    >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
    >>> probabilities = compute_pairwise_affinities(x)
    >>> float(round(probabilities[0, 1], 3))
    0.25
    """
    n_samples = data_matrix.shape[0]
    # Pairwise squared Euclidean distances via
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b
    squared_sum = np.sum(np.square(data_matrix), axis=1)
    squared_distance = np.add(
        np.add(-2 * np.dot(data_matrix, data_matrix.T), squared_sum).T, squared_sum
    )

    # Gaussian kernel with a shared bandwidth; self-affinities are set to zero.
    affinity_matrix = np.exp(-squared_distance / (2 * sigma**2))
    np.fill_diagonal(affinity_matrix, 0)

    # Normalize globally, then symmetrize and average over the number of samples.
    affinity_matrix /= np.sum(affinity_matrix)
    return (affinity_matrix + affinity_matrix.T) / (2 * n_samples)
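
# Reference note: standard t-SNE derives P from conditional probabilities
#     p_{j|i} = exp(-||x_i - x_j||^2 / (2 * sigma_i^2)) / sum_{k != i} exp(...)
# where each sigma_i is tuned by binary search so that the distribution over
# the neighbours of x_i matches a user-chosen perplexity, and then symmetrizes
#     p_ij = (p_{j|i} + p_{i|j}) / (2 * n).
# The function above is a simplified variant: it uses one shared `sigma` and a
# single global normalization instead of the per-point perplexity search.
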


def compute_low_dim_affinities(embedding_matrix: ndarray) -> tuple[ndarray, ndarray]:
    """
    Compute low-dimensional affinities (Q matrix) using a Student-t distribution.

    Args:
        embedding_matrix: Low-dimensional embedding of shape (n_samples, n_components).

    Returns:
        tuple[ndarray, ndarray]: (Q probability matrix, numerator matrix).

    >>> y = np.array([[0.0, 0.0], [1.0, 0.0]])
    >>> q_matrix, numerators = compute_low_dim_affinities(y)
    >>> q_matrix.shape
    (2, 2)
    """
    # Student-t kernel with one degree of freedom: 1 / (1 + squared distance).
    squared_sum = np.sum(np.square(embedding_matrix), axis=1)
    numerator_matrix = 1 / (
        1
        + np.add(
            np.add(-2 * np.dot(embedding_matrix, embedding_matrix.T), squared_sum).T,
            squared_sum,
        )
    )
    np.fill_diagonal(numerator_matrix, 0)

    # Normalize the kernel values into a joint probability matrix Q.
    q_matrix = numerator_matrix / np.sum(numerator_matrix)
    return q_matrix, numerator_matrix
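
# Reference note: in the embedding space t-SNE uses the heavy-tailed kernel
#     q_ij = (1 + ||y_i - y_j||^2)^-1 / sum_{k != l} (1 + ||y_k - y_l||^2)^-1,
# i.e. a Student-t distribution with one degree of freedom. The heavy tails let
# moderately dissimilar points sit far apart in the embedding, which mitigates
# the "crowding problem" of the original SNE.
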


def apply_tsne(
    data_matrix: ndarray,
    n_components: int = 2,
    learning_rate: float = 200.0,
    n_iter: int = 500,
) -> ndarray:
    """
    Apply t-SNE for dimensionality reduction.

    Args:
        data_matrix: Original dataset (features).
        n_components: Target dimension (2D or 3D).
        learning_rate: Step size for gradient descent.
        n_iter: Number of iterations.

    Returns:
        ndarray: Low-dimensional embedding of the data.

    >>> features, _ = collect_dataset()
    >>> embedding = apply_tsne(features, n_components=2, n_iter=50)
    >>> embedding.shape
    (150, 2)
    """
    if n_components < 1 or n_iter < 1:
        raise ValueError("n_components and n_iter must be >= 1")

    n_samples = data_matrix.shape[0]
    rng = np.random.default_rng()
    # Start from a small random embedding.
    embedding = rng.standard_normal((n_samples, n_components)) * 1e-4

    high_dim_affinities = compute_pairwise_affinities(data_matrix)
    high_dim_affinities = np.maximum(high_dim_affinities, 1e-12)

    embedding_increment = np.zeros_like(embedding)
    momentum = 0.5

    for iteration in range(n_iter):
        low_dim_affinities, numerator_matrix = compute_low_dim_affinities(embedding)
        low_dim_affinities = np.maximum(low_dim_affinities, 1e-12)

        affinity_diff = high_dim_affinities - low_dim_affinities

        # Gradient of KL(P || Q):
        # dC/dy_i = 4 * sum_j (p_ij - q_ij) * (1 + ||y_i - y_j||^2)^-1 * (y_i - y_j)
        gradient = 4 * (
            np.multiply(
                np.sum(affinity_diff * numerator_matrix, axis=1)[:, np.newaxis],
                embedding,
            )
            - np.dot(affinity_diff * numerator_matrix, embedding)
        )

        # Gradient descent step with momentum.
        embedding_increment = momentum * embedding_increment - learning_rate * gradient
        embedding += embedding_increment

        # Increase momentum after the initial iterations, as in standard t-SNE.
        if iteration == int(n_iter / 4):
            momentum = 0.8

    return embedding
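
# Note: reference t-SNE implementations usually add "early exaggeration"
# (scaling the P matrix up by a factor of roughly 4-12 for the first ~100
# iterations) and per-parameter gain adaptation; both are omitted above for
# simplicity. A rough sketch of early exaggeration, if one wanted to add it to
# apply_tsne (hypothetical, not part of the current implementation):
#
#     high_dim_affinities *= 4.0  # exaggerate attractive forces early on
#     ...
#     if iteration == 100:
#         high_dim_affinities /= 4.0  # revert to the true affinities
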


def main() -> None:
    """
    Run t-SNE on the Iris dataset and display the first 5 embeddings.

    >>> main() # doctest: +ELLIPSIS
    t-SNE embedding (first 5 points):
    [[...
    """
    features, _labels = collect_dataset()
    embedding = apply_tsne(features, n_components=2, n_iter=300)

    if not isinstance(embedding, np.ndarray):
        raise TypeError("t-SNE embedding must be an ndarray")

    print("t-SNE embedding (first 5 points):")
    print(embedding[:5])

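    # For a quick sanity check, the embedding could be compared against
    # scikit-learn's reference implementation (illustrative sketch, not
    # executed here):

    # from sklearn.manifold import TSNE
    # sk_embedding = TSNE(n_components=2, perplexity=30.0).fit_transform(features)
    # print(sk_embedding[:5])
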
    # Optional visualization (requires matplotlib):

    # import matplotlib.pyplot as plt
    # plt.scatter(embedding[:, 0], embedding[:, 1], c=_labels, cmap="viridis")
    # plt.title("t-SNE Visualization of the Iris Dataset")
    # plt.xlabel("Dimension 1")
    # plt.ylabel("Dimension 2")
    # plt.show()


if __name__ == "__main__":
    doctest.testmod()
    main()