Skip to content

Commit 9cfb787

Browse files
authored
Add files via upload
1 parent 02fed4e commit 9cfb787

File tree

2 files changed

+147
-0
lines changed

2 files changed

+147
-0
lines changed
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Tue Aug 29 15:31:25 2017
4+
5+
@author: Arnab
6+
"""
7+
8+
## Initialisation
9+
10+
import pandas as pd
11+
import numpy as np
12+
import matplotlib.pyplot as plt
13+
import scipy.io as sio
14+
import copy
15+
from pprint import pprint
16+
17+
18+
19+
##For plotting the clusters
20+
21+
def plot_clusters(df, means, colmap):
    """Scatter-plot the samples and overlay the cluster centres.

    Samples come from the ``x`` and ``y`` columns of *df* and are coloured
    by its ``color`` column.  Every entry of *means* is drawn as a large
    circle in ``colmap[key]``.  The axes are fixed to x in [-1, 5] and
    y in [-2, 3], and the figure is shown before returning.
    """
    sample_style = dict(color=df['color'], alpha=0.5, edgecolor='k')
    plt.scatter(df['x'], df['y'], **sample_style)

    for label, centre in means.items():
        plt.scatter(*centre, s=200, color=colmap[label], marker='o')

    plt.xlim(-1, 5)
    plt.ylim(-2, 3)
    plt.show()
28+
29+
30+
##Assign mean randomly
31+
32+
def initial_mean_assignment(df, k):
    """Return *k* pseudo-random starting centroids keyed ``1..k``.

    The global NumPy random generator is re-seeded with 200 on every call,
    so a given *k* always produces the same centroids.  Each centroid is a
    two-element list ``[x, y]`` with x drawn from ``uniform(-2, 4)`` and
    y from ``uniform(-1, 2)``.  *df* is not read by this function.
    """
    np.random.seed(200)
    centroids = {}
    for label in range(1, k + 1):
        # Draw x before y so the random stream is consumed in a fixed order.
        start_x = np.random.uniform(-2, 4)
        start_y = np.random.uniform(-1, 2)
        centroids[label] = [start_x, start_y]
    return centroids
39+
40+
## Distance Function
41+
42+
def distance(x1, y1, x2, y2, n):
    """Return the distance between the points (x1, y1) and (x2, y2).

    The arguments may be scalars or anything that supports element-wise
    arithmetic (e.g. pandas Series), in which case the result has the
    same shape.

    Parameters:
        x1, y1: coordinates of the first point(s).
        x2, y2: coordinates of the second point(s).
        n: metric selector; 0 for Euclidean distance, 1 for cosine
            distance (one minus the cosine similarity of the two vectors).

    Returns:
        The distance under the selected metric.

    Raises:
        ValueError: if *n* is neither 0 nor 1.  Previously an unknown
            selector fell through and silently returned ``None``.

    Note:
        For the cosine metric a zero-length vector makes the denominator
        zero, so the result is undefined for such input.
    """
    if n == 0:
        return np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)
    if n == 1:
        dot_product = x1 * x2 + y1 * y2
        norm_product = np.sqrt(x1 ** 2 + y1 ** 2) * np.sqrt(x2 ** 2 + y2 ** 2)
        return 1 - dot_product / norm_product
    raise ValueError(
        "unsupported distance metric {!r}; use 0 (Euclidean) or 1 (cosine)".format(n)
    )
47+
48+
49+
## Assignment Stage
50+
51+
def One_iteration_clustering(df, centroids, colmap, dist_m):
    """Assign every row of *df* to its nearest centroid.

    *df* is modified in place and also returned.  For each centroid key
    ``c`` a column ``distance_from_c`` is written, then two summary
    columns are set:

    * ``closest`` - the integer key of the centroid with the smallest
      distance for that row;
    * ``color``   - ``colmap[closest]``.

    Parameters:
        df: DataFrame with ``x`` and ``y`` columns.
        centroids: mapping of integer key -> ``[x, y]`` centroid.
        colmap: mapping of centroid key -> colour.
        dist_m: metric selector forwarded to ``distance``.

    Returns:
        The same *df* object with the columns described above.
    """
    prefix = 'distance_from_'
    distance_cols = []
    for label, centre in centroids.items():
        column = prefix + '{}'.format(label)
        df[column] = distance(df['x'], df['y'], centre[0], centre[1], dist_m)
        distance_cols.append(column)

    nearest_column = df.loc[:, distance_cols].idxmin(axis=1)
    # Slice the prefix off explicitly: str.lstrip() treats its argument as a
    # set of characters, not as a prefix, so it only worked here because
    # digits are not among the characters of 'distance_from_'.
    df['closest'] = nearest_column.map(lambda name: int(name[len(prefix):]))
    df['color'] = df['closest'].map(lambda label: colmap[label])

    return df
60+
61+
## Update the mean values
62+
63+
def update_means(df, means):
    """Move each centroid to the mean of the rows currently assigned to it.

    *means* is updated in place and also returned.

    A centroid with no assigned rows keeps its previous position.  Taking
    the mean of an empty selection would produce NaN, and because NaN never
    compares equal to itself the ``old_means == means`` convergence check
    in ``kmeans_clustering`` could then never succeed.

    Parameters:
        df: DataFrame with ``x``, ``y`` and ``closest`` columns.
        means: mapping of centroid key -> mutable ``[x, y]`` list.

    Returns:
        The same *means* mapping with updated coordinates.
    """
    for label in means:
        members = df.loc[df['closest'] == label, ['x', 'y']]
        if members.empty:
            # Empty cluster: leave the centroid where it is.
            continue
        means[label][0] = np.mean(members['x'])
        means[label][1] = np.mean(members['y'])
    return means
68+
69+
70+
## For clustering
71+
72+
def kmeans_clustering(df, k, colmap, dist_m, max_iter=None):
    """Run k-means on *df* until the centroids stop changing.

    Starting from ``initial_mean_assignment(df, k)``, the assignment and
    update steps are repeated until an update leaves every centroid
    exactly unchanged.  *df* is modified in place by the assignment step,
    and a progress line is printed after each iteration.

    Parameters:
        df: DataFrame with ``x`` and ``y`` columns.
        k: number of clusters.
        colmap: mapping of centroid key -> colour.
        dist_m: metric selector (0 Euclidean, 1 cosine).
        max_iter: optional upper bound on the number of iterations.  The
            default ``None`` keeps the original behaviour of looping until
            convergence.

    Returns:
        Mapping of centroid key -> ``[x, y]`` for the final centroids.
    """
    means = initial_mean_assignment(df, k)
    iteration = 0
    while max_iter is None or iteration < max_iter:
        # Snapshot first: update_means mutates `means` in place.
        previous_means = copy.deepcopy(means)
        df = One_iteration_clustering(df, means, colmap, dist_m)
        means = update_means(df, means)
        iteration += 1
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Iteration" + str(iteration))
        if previous_means == means:
            break
    return means
85+
86+
#To compare the two distance metrics
87+
88+
def _assigned_distance_total(df):
    """Sum, over all rows, the distance to the centroid each row is assigned to."""
    total = 0.0
    for label, rows in df.groupby('closest'):
        total += rows['distance_from_{}'.format(label)].sum()
    return total


def cluster_comparision(df, colormap):
    """Plot the clustering error of both distance metrics for k = 2..6.

    For each k the data is clustered once with the Euclidean metric and
    once with the cosine metric.  The error of a run is the sum of every
    row's distance to its assigned centroid.  Both errors are printed for
    each k and then plotted against k.

    Parameters:
        df: DataFrame with ``x`` and ``y`` columns; it is modified in
            place by the clustering runs.
        colormap: mapping of centroid key -> colour, forwarded to
            ``kmeans_clustering``.

    Returns:
        The same *df*, holding the columns written by the last run.
    """
    cluster_counts = list(range(2, 7))
    euclidean_errors = []
    cosine_errors = []

    for k in cluster_counts:
        # The returned centroids are not needed here; the assignments and
        # distances are read back from the columns written into df.
        # Pass the `colormap` argument rather than relying on a global
        # named `colmap` happening to exist.
        kmeans_clustering(df, k, colormap, 0)
        euclidean_errors.append(_assigned_distance_total(df))

        kmeans_clustering(df, k, colormap, 1)
        cosine_errors.append(_assigned_distance_total(df))

        print(euclidean_errors[-1])
        print(cosine_errors[-1])

    plt.plot(cluster_counts, euclidean_errors, label='Euclidean')
    plt.plot(cluster_counts, cosine_errors, label='Cosine')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Error')
    plt.title('Error variation with number of clusters')
    plt.legend()
    # Show the figure, as plot_clusters does.
    plt.show()
    return df
112+
113+
## To illustrate kmeans clustering
114+
115+
def kmean_illus_(df, k, colmap, dist_m):
    """Cluster *df* into *k* groups and plot the result.

    Parameters:
        df: DataFrame with ``x`` and ``y`` columns; modified in place by
            the clustering run.
        k: number of clusters.
        colmap: mapping of centroid key -> colour.
        dist_m: metric selector (0 Euclidean, 1 cosine).
    """
    means = kmeans_clustering(df, k, colmap, dist_m)
    plot_clusters(df, means, colmap)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Successfully clustered with k=" + str(k))
119+
120+
121+
if __name__ == '__main__':
    # Load the samples stored under the key 'h' of data.mat.
    # NOTE(review): assumes 'h' is an (n, 2) array of x/y pairs - confirm.
    a = sio.loadmat('data.mat')
    data = a['h']
    #pprint(data)
    df = pd.DataFrame(data, columns=list('xy'))
    colmap = {1: 'r', 2: 'g', 3: 'b', 4: 'c', 5: 'm', 6: 'y', 7: 'k'}

    # Illustration of the k-means clustering.
    # dist_m selects which distance metric to use:
    #   dist_m = 0 for Euclidean distance
    #   dist_m = 1 for cosine distance
    dist_m = 1
    k = 3
    # Uncomment to cluster once with the settings above and plot the result.
    #kmean_illus_(df,k,colmap,dist_m)

    # Plot of the error against the number of clusters.
    # Knee point observed is k=3.
    df = cluster_comparision(df, colmap)
144+
145+
146+
147+

Kmeans_Clustering/data.mat

15.8 KB
Binary file not shown.

0 commit comments

Comments
 (0)