1+ import pandas as pd
2+ from datetime import datetime
3+ import os
4+ import numpy as np
5+
6+
# Path fragments placed before and after the per-user folder name when
# locating a user's recordings.
datapath_begin = 'New Data Collection/'
datapath_end = '/Keyboard Database/sentence/'


# Class index -> [user name, roll number] for every user under consideration.
user_details = {
    0: ['Lalit', '14EC10025'],
    1: ['Koruprolu Asish', '14EC10024'],
    2: ['Arnab', '14EC35031'],
    3: ['Sandeep', '14EC35033'],
    4: ['Charu', '14EC35003'],
}
14+
15+
'''To read the log files and extract the keystroke data from them '''
17+ ##############################################
18+
# Upper bound on the number of lines read from a single log file.
_MAX_LINES_PER_FILE = 5000


def _normalise_timestamp(stamp):
    """Rewrite a raw sentence-log timestamp into the form strptime expects."""
    # assumes raw stamps look like dd:mm:yy:HH:MM:SS:fff -- TODO confirm
    # The separator in front of the last three characters becomes '.' so that
    # '%S.%f' can parse the fractional seconds ...
    stamp = stamp[:-4] + '.' + stamp[-3:]
    # ... and '20' is inserted at offset 6 to turn a 2-digit year into '%Y'.
    return stamp[:6] + '20' + stamp[6:]


def _parse_line(line, cont):
    """Split one log line into (event, key, timestamp); None if malformed."""
    tokens = line.split()
    if cont:
        # Continuous logs: event, key and timestamp sit in fields 0, 1 and 7.
        if len(tokens) < 8:
            return None
        return tokens[0], tokens[1], tokens[7]
    if len(tokens) == 3:
        event, key, stamp = tokens
        # split() never yields a token containing a space, so the backspace
        # character has to be compared on its own.
        if key == '\b':
            key = "backspace"
    elif len(tokens) == 2:
        # The key field vanished during splitting, i.e. it was the space bar.
        event, stamp = tokens
        key = "space"
    else:
        return None
    return event, key, _normalise_timestamp(stamp)


def _elapsed_microseconds(start, end, fmt):
    """Return the full interval ``end - start`` in microseconds."""
    delta = datetime.strptime(end, fmt) - datetime.strptime(start, fmt)
    # timedelta.microseconds alone is only the sub-second remainder; the days
    # and seconds fields must be folded in to get the whole interval.
    return (delta.days * 86400 + delta.seconds) * 1000000 + delta.microseconds


def _read_keystrokes(lines, source_name, cont, fmt):
    """Pair KeyDown/KeyUp events into (key, press_timestamp, held_us) tuples.

    Tuples are produced in the order in which the keys were released.
    """
    pending = {}  # key -> timestamp of its not-yet-released KeyDown
    strokes = []
    for count, line in enumerate(lines, 1):
        parsed = _parse_line(line, cont)
        if parsed is None:
            # Report and skip the line instead of re-using the values parsed
            # from an earlier line.
            print("Error in Data" + str(source_name))
        else:
            event, key, stamp = parsed
            if event == 'KeyDown':
                # A KeyDown for a key that is already held is ignored.
                if key not in pending:
                    pending[key] = stamp
            elif event == 'KeyUp' and key in pending:
                pressed_at = pending.pop(key)
                held = _elapsed_microseconds(pressed_at, stamp, fmt)
                strokes.append((key, pressed_at, held))
        if count > _MAX_LINES_PER_FILE:
            break
    return strokes


def extract_data(username, roll_number, mood, cont=False):
    """Collect per-keystroke timing features for one user and one folder.

    Every ``.txt`` file found below
    ``datapath_begin + username + '_' + roll_number + datapath_end + mood + '/'``
    is parsed and the rows of all files are concatenated.

    Parameters:
        username, roll_number: identify the user's folder.
        mood: sub-folder to read (may contain '/').
        cont: True for the continuous log layout, False for the sentence
            log layout.

    Returns:
        A DataFrame with the columns ``['key', 'latency', 'press_time']``
        (in that order), one row per completed key press:
          * ``key``        - key name ('space' / 'backspace' for those keys),
          * ``latency``    - microseconds between KeyDown and KeyUp,
          * ``press_time`` - microseconds between this key's KeyDown and the
            KeyDown of the previous row; 0 for the first row of each file.
            Rows follow release order, so the value can be negative when key
            presses overlap.

    Raises:
        ValueError: if a timestamp does not match the expected format.
    """
    path = datapath_begin + username + '_' + roll_number + datapath_end + mood + '/'
    directory = os.path.join(path)
    print('Collecting data of ' + username + '...')
    if cont:
        fmt = '%d:%H:%M:%S.%f'
    else:
        fmt = '%d:%m:%Y:%H:%M:%S.%f'

    keys = []
    latencies = []
    press_times = []
    for root, _dirs, files in os.walk(directory):
        # Sorted so that the row order does not depend on the file system.
        for name in sorted(files):
            if not name.endswith(".txt"):
                continue
            # os.walk descends into sub-folders, so join with ``root`` rather
            # than with the top-level directory.
            with open(os.path.join(root, name), "r") as handle:
                strokes = _read_keystrokes(handle, name, cont, fmt)

            previous_press = None
            for key, pressed_at, held in strokes:
                if previous_press is None:
                    gap = 0
                else:
                    gap = _elapsed_microseconds(previous_press, pressed_at, fmt)
                keys.append(key)
                latencies.append(held)
                press_times.append(gap)
                previous_press = pressed_at

    # Explicit column order and float dtype: downstream code must not depend
    # on how a particular pandas version orders dict keys, and an empty result
    # has to stay numeric.
    return pd.DataFrame(
        {'key': pd.Series(keys, dtype=object),
         'latency': pd.Series(latencies, dtype=float),
         'press_time': pd.Series(press_times, dtype=float)},
        columns=['key', 'latency', 'press_time'])
117+
118+ #####################################################
119+
120+
121+
122+
123+ '''To remove outliers from the dataset '''
124+ ##############################################
125+
def remove_outlier(df_in, col_name):
    """Return the rows of ``df_in`` whose ``col_name`` lies within Tukey's fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` of the column.
    Values lying exactly on a fence are kept, so a column whose IQR is zero
    (e.g. all values equal) is not emptied.

    Parameters:
        df_in: DataFrame to filter; it is not modified.
        col_name: name of the numeric column used for the test.

    Returns:
        The selected rows of ``df_in`` with their original index labels.
    """
    values = df_in[col_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1  # Interquartile range
    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr
    inside = (values >= fence_low) & (values <= fence_high)
    return df_in.loc[inside]
134+
135+ ###############################################
136+
137+
138+
139+
140+
141+
'''To build the feature matrix and the label vector for the selected users '''
143+ ##############################################
144+
def get_data(n_classes, emotion='Happy', neutral=False, cont=False):
    """Build the feature matrix and label vector for the first ``n_classes`` users.

    For every class index ``i`` in ``range(n_classes)`` the user stored in
    ``user_details[i]`` is loaded with ``extract_data``; rows whose
    ``press_time`` is 0 are discarded and outliers are removed with
    ``remove_outlier``, first on ``press_time`` and then on ``latency``.

    Parameters:
        n_classes: number of users to load (class labels 0 .. n_classes - 1).
        emotion: sub-folder of 'Emotional/' to read; only used when both
            ``neutral`` and ``cont`` are False.
        neutral: read the 'Neutral' folder (takes precedence over ``cont``).
        cont: read the 'Continuous' folder; also forwarded to ``extract_data``.

    Returns:
        ``(X_data, Y_data)``:
          * ``X_data`` - float array of shape (n, 2); column 0 is ``latency``
            and column 1 is ``press_time``, both divided by 1000,
          * ``Y_data`` - float array of shape (n, 1) holding the class index.
        ``n`` is one less than the number of collected samples (see the note
        in the body) and 0 when nothing was collected.
    """
    # The folder does not depend on the user, so it is selected once.
    if neutral:
        fldr_name = 'Neutral'
    elif cont:
        fldr_name = 'Continuous'
    else:
        fldr_name = 'Emotional/' + emotion

    latency_ms = []
    press_ms = []
    labels = []
    for i in range(n_classes):
        # Extracting raw data from the files
        raw = extract_data(user_details[i][0], user_details[i][1], fldr_name, cont)
        timed = raw.loc[raw['press_time'] != 0]
        # Removes the outliers from the data with respect to both attributes
        clean = remove_outlier(remove_outlier(timed, 'press_time'), 'latency')
        # Columns are addressed by name so the result does not depend on the
        # column order of the DataFrame; 1000.0 keeps the division a true
        # division on Python 2 as well. Converts the data to ms.
        latency_ms.extend((clean['latency'].astype(float) / 1000.0).tolist())
        press_ms.extend((clean['press_time'].astype(float) / 1000.0).tolist())
        labels.extend([i] * len(clean))

    # NOTE(review): the last collected sample has always been left out of the
    # returned arrays; kept for backward compatibility -- confirm whether it
    # is intentional. max() avoids a negative dimension when nothing was read.
    n_samples = max(len(labels) - 1, 0)
    X_data = np.zeros((n_samples, 2))
    X_data[:, 0] = latency_ms[:n_samples]
    X_data[:, 1] = press_ms[:n_samples]
    Y_data = np.array(labels[:n_samples], dtype=float).reshape(n_samples, 1)
    return X_data, Y_data
181+
182+ ###############################################
183+
184+
185+
186+
187+
188+
189+ '''Main function to verify the Feature extraction '''
190+ #####################################################
191+
192+
if __name__ == '__main__':
    # Smoke test of the feature extraction: load the continuous-typing data of
    # five users and show the shapes of the resulting arrays.
    n_classes = 5
    X_cont, Y_cont = get_data(n_classes, cont=True)
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(X_cont.shape)
    print(Y_cont.shape)
0 commit comments