|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +""" |
| 3 | +Created on Tue Aug 29 15:31:25 2017 |
| 4 | +
|
| 5 | +@author: Arnab |
| 6 | +""" |
| 7 | + |
| 8 | +## Initialisation |
| 9 | + |
| 10 | +import pandas as pd |
| 11 | +import numpy as np |
| 12 | +import matplotlib.pyplot as plt |
| 13 | +import scipy.io as sio |
| 14 | +import copy |
| 15 | +from pprint import pprint |
| 16 | + |
| 17 | + |
| 18 | + |
| 19 | +##For plotting the clusters |
| 20 | + |
def plot_clusters(df, means, colmap):
    """Scatter the points colored by their cluster, overlaying the cluster means.

    df     : DataFrame with 'x', 'y' and 'color' columns (color set by assignment step)
    means  : dict {cluster_label: [x, y]} of current cluster centers
    colmap : dict {cluster_label: matplotlib color} used for the mean markers
    """
    plt.scatter(df['x'], df['y'], color=df['color'], alpha=0.5, edgecolor='k')
    for label, center in means.items():
        # large marker so the mean stands out from the data points
        plt.scatter(center[0], center[1], s=200, color=colmap[label], marker='o')
    plt.xlim(-1, 5)
    plt.ylim(-2, 3)
    plt.show()
| 28 | + |
| 29 | + |
| 30 | +##Assign mean randomly |
| 31 | + |
def initial_mean_assignment(df, k):
    """Draw k random initial means as {1: [x, y], ..., k: [x, y]}.

    Note: `df` is unused here; it is kept for interface symmetry with the
    other clustering helpers. The seed is fixed so every run (and every
    value of k) starts from the same reproducible sequence of draws.
    """
    np.random.seed(200)  # reproducible initialisation
    means = {}
    for label in range(1, k + 1):
        means[label] = [np.random.uniform(-2, 4), np.random.uniform(-1, 2)]
    return means
| 39 | + |
| 40 | +## Distance Function |
| 41 | + |
def distance(x1, y1, x2, y2, n):
    """Distance between points (x1, y1) and (x2, y2).

    n == 0 : Euclidean distance.
    n == 1 : cosine distance, 1 - cos(angle between the two position
             vectors); undefined (division by zero) if either point is
             the origin.

    Works element-wise on pandas Series / numpy arrays as well as scalars.
    Raises ValueError for any other metric selector (the original silently
    returned None, which then crashed far away in idxmin).
    """
    if n == 0:
        return np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)
    elif n == 1:
        return 1 - (x1 * x2 + y1 * y2) / (
            np.sqrt(x1 ** 2 + y1 ** 2) * np.sqrt(x2 ** 2 + y2 ** 2)
        )
    raise ValueError("unknown distance metric selector: {}".format(n))
| 47 | + |
| 48 | + |
| 49 | +## Assignment Stage |
| 50 | + |
def One_iteration_clustering(df, centroids, colmap, dist_m):
    """Assignment step: label every point with its nearest centroid.

    Adds/overwrites one 'distance_from_<label>' column per centroid, then
    sets df['closest'] to the nearest centroid's integer label and
    df['color'] to that cluster's plot color.

    df        : DataFrame with 'x' and 'y' columns (mutated in place)
    centroids : dict {int_label: [x, y]}
    colmap    : dict {int_label: color}
    dist_m    : 0 for Euclidean, 1 for cosine (see distance())
    Returns the mutated df.
    """
    prefix = 'distance_from_'
    for label in centroids.keys():
        df[prefix + '{}'.format(label)] = distance(
            df['x'], df['y'], centroids[label][0], centroids[label][1], dist_m)
    dist_cols = [prefix + '{}'.format(label) for label in centroids.keys()]
    df['closest'] = df.loc[:, dist_cols].idxmin(axis=1)
    # BUG FIX: lstrip() strips a *character set*, not a prefix. It only
    # worked by luck because no digit appears in "distance_from_".
    # Slice the fixed-length prefix off instead.
    df['closest'] = df['closest'].map(lambda col: int(col[len(prefix):]))
    df['color'] = df['closest'].map(lambda label: colmap[label])

    return df
| 60 | + |
| 61 | +## Update the mean values |
| 62 | + |
def update_means(df, means):
    """Update step: move each mean to the centroid of its assigned points.

    Mutates the `means` dict in place and returns it. NOTE(review): if a
    cluster ends up with no points, np.mean of the empty selection yields
    NaN — presumably the data/seed avoid that case; verify against caller.
    """
    for label, center in means.items():
        members = df[df['closest'] == label]
        center[0] = np.mean(members['x'])
        center[1] = np.mean(members['y'])
    return means
| 68 | + |
| 69 | + |
| 70 | +## For clustering
| 71 | + |
def kmeans_clustering(df, k, colmap, dist_m):
    """Run Lloyd's k-means on df until the means stop changing.

    df     : DataFrame with 'x' and 'y' columns (mutated: gains distance,
             'closest' and 'color' columns)
    k      : number of clusters
    colmap : dict {int_label: color} for the assignment step
    dist_m : 0 for Euclidean, 1 for cosine distance
    Returns the converged means dict {label: [x, y]}.

    NOTE(review): convergence is tested with ==; if a cluster empties out,
    its mean becomes NaN and NaN != NaN would loop forever — confirm the
    data cannot trigger this.
    """
    means = initial_mean_assignment(df, k)
    iteration = 0
    while True:
        # snapshot before updating so we can detect a fixed point
        old_means = copy.deepcopy(means)
        df = One_iteration_clustering(df, means, colmap, dist_m)
        means = update_means(df, means)
        iteration += 1
        print("Iteration" + str(iteration))
        if old_means == means:
            break
    return means
| 85 | + |
| 86 | +#To compare the two distance metrics |
| 87 | + |
def cluster_comparision(df, colormap):
    """Compare total clustering error of the two distance metrics for k=2..6.

    For each k, runs k-means once with Euclidean distance and once with
    cosine distance, sums each point's distance to its assigned mean, and
    plots both error curves against k.

    df       : DataFrame with 'x' and 'y' columns (mutated by clustering)
    colormap : dict {int_label: color} passed through to k-means
    Returns the (mutated) df.
    """
    eeuc = []
    ecos = []
    k = [2, 3, 4, 5, 6]
    er_euc = pd.DataFrame({'2': [0], '3': [0], '4': [0], '5': [0], '6': [0]})
    er_cos = pd.DataFrame({'2': [0], '3': [0], '4': [0], '5': [0], '6': [0]})
    for i in range(2, 7):
        # BUG FIX: the original used the module-global `colmap` here,
        # silently ignoring the `colormap` parameter.
        kmeans_clustering(df, i, colormap, 0)
        # accumulate each point's distance to its own (closest) mean
        for j in range(len(df.index)):
            er_euc['{}'.format(i)] += df['distance_from_{}'.format(df['closest'][j])][j]
        kmeans_clustering(df, i, colormap, 1)
        for j in range(len(df.index)):
            er_cos['{}'.format(i)] += df['distance_from_{}'.format(df['closest'][j])][j]
        eeuc.append(er_euc['{}'.format(i)])
        print(er_euc['{}'.format(i)])
        ecos.append(er_cos['{}'.format(i)])
        print(er_cos['{}'.format(i)])
    plt.plot(k, eeuc)
    plt.plot(k, ecos)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Error')
    plt.title('Error variation with number of clusters')
    return df
| 112 | + |
| 113 | +## To illustrate kmeans clustering |
| 114 | + |
def kmean_illus_(df, k, colmap, dist_m):
    """Cluster df into k groups and display the resulting scatter plot.

    df     : DataFrame with 'x' and 'y' columns (mutated by clustering)
    k      : number of clusters
    colmap : dict {int_label: color}
    dist_m : 0 for Euclidean, 1 for cosine distance
    """
    means = kmeans_clustering(df, k, colmap, dist_m)
    plot_clusters(df, means, colmap)
    print("Successfully clustered with k=" + str(k))
| 119 | + |
| 120 | + |
if __name__ == '__main__':
    # Load the point set; assumes data.mat stores an (n, 2) array under
    # key 'h' — TODO confirm against the data file.
    a = sio.loadmat('data.mat')
    data=a['h']
    #pprint(data)
    df = pd.DataFrame(data,columns=list('xy'))
    # One plot color per possible cluster label (supports up to k=7)
    colmap = {1: 'r', 2: 'g', 3: 'b', 4: 'c', 5: 'm', 6: 'y', 7: 'k'}

    '''
    Code segment to see the illustration of the kmeans clustering
    dist_m values selects which distance metric we want to use
    dist_m = 0 for Euclidean distance
    dist_m = 1 for Cosine distance
    '''
    dist_m = 1
    k=3
    #kmean_illus_(df,k,colmap,dist_m)

    '''
    Code segment to see the plot of the error with number of clusters
    Knee point observed is k=3
    '''

    df=cluster_comparision(df,colmap)
| 144 | + |
| 145 | + |
| 146 | + |
| 147 | + |
0 commit comments