Installing the open source Yandex CatBoost package
pip install catboost
Importing the required packages: Numpy, Pandas, Matplotlib, Seaborn, Scikit-learn and CatBoost
import numpy as np import pandas as pd import matplotlib.pyplot as plt # plt.style.use('ggplot') import seaborn as sns from catboost import Pool, CatBoostClassifier, cv, CatboostIpythonWidget from sklearn.preprocessing import MinMaxScaler from sklearn.feature_selection import VarianceThresholdLoading of IBM HR Dataset into pandas dataframe
ibm_hr_df = pd.read_csv("IBM-HR-Employee-Attrition.csv")Getting the summary statistics of the IBM HR dataset
ibm_hr_df.describe()| Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | ... | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 |
| mean | 36.923810 | 802.485714 | 9.192517 | 2.912925 | 1.0 | 1024.865306 | 2.721769 | 65.891156 | 2.729932 | 2.063946 | ... | 2.712245 | 80.0 | 0.793878 | 11.279592 | 2.799320 | 2.761224 | 7.008163 | 4.229252 | 2.187755 | 4.123129 |
| std | 9.135373 | 403.509100 | 8.106864 | 1.024165 | 0.0 | 602.024335 | 1.093082 | 20.329428 | 0.711561 | 1.106940 | ... | 1.081209 | 0.0 | 0.852077 | 7.780782 | 1.289271 | 0.706476 | 6.126525 | 3.623137 | 3.222430 | 3.568136 |
| min | 18.000000 | 102.000000 | 1.000000 | 1.000000 | 1.0 | 1.000000 | 1.000000 | 30.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 80.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 30.000000 | 465.000000 | 2.000000 | 2.000000 | 1.0 | 491.250000 | 2.000000 | 48.000000 | 2.000000 | 1.000000 | ... | 2.000000 | 80.0 | 0.000000 | 6.000000 | 2.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 |
| 50% | 36.000000 | 802.000000 | 7.000000 | 3.000000 | 1.0 | 1020.500000 | 3.000000 | 66.000000 | 3.000000 | 2.000000 | ... | 3.000000 | 80.0 | 1.000000 | 10.000000 | 3.000000 | 3.000000 | 5.000000 | 3.000000 | 1.000000 | 3.000000 |
| 75% | 43.000000 | 1157.000000 | 14.000000 | 4.000000 | 1.0 | 1555.750000 | 4.000000 | 83.750000 | 3.000000 | 3.000000 | ... | 4.000000 | 80.0 | 1.000000 | 15.000000 | 3.000000 | 3.000000 | 9.000000 | 7.000000 | 3.000000 | 7.000000 |
| max | 60.000000 | 1499.000000 | 29.000000 | 5.000000 | 1.0 | 2068.000000 | 4.000000 | 100.000000 | 4.000000 | 5.000000 | ... | 4.000000 | 80.0 | 3.000000 | 40.000000 | 6.000000 | 4.000000 | 40.000000 | 18.000000 | 15.000000 | 17.000000 |
8 rows × 26 columns
Zooming in on the summary statistics of irrelevant attributes EmployeeCount and StandardHours
irrList = ['EmployeeCount', 'StandardHours'] ibm_hr_df[irrList].describe()| EmployeeCount | StandardHours | |
|---|---|---|
| count | 1470.0 | 1470.0 |
| mean | 1.0 | 80.0 |
| std | 0.0 | 0.0 |
| min | 1.0 | 80.0 |
| 25% | 1.0 | 80.0 |
| 50% | 1.0 | 80.0 |
| 75% | 1.0 | 80.0 |
| max | 1.0 | 80.0 |
Zooming in on the summary statistics of irrelevant attribute Over18
ibm_hr_df["Over18"].value_counts()
Y 1470 Name: Over18, dtype: int64
From the summary statistics, one can see that the attributes EmployeeCount, StandardHours and Over18 each hold only a single value across all of the 1470 records:
EmployeeCount only holds a single value - 1.0
StandardHours only holds a single value - 80.0
Over18 only holds a single value - 'Y'
These irrelevant attributes are duly dropped from the dataset
Checking for 'NA' and missing values in the dataset.
ibm_hr_df.isnull().sum(axis=0)
Age 0 Attrition 0 BusinessTravel 0 DailyRate 0 Department 0 DistanceFromHome 0 Education 0 EducationField 0 EmployeeCount 0 EmployeeNumber 0 EnvironmentSatisfaction 0 Gender 0 HourlyRate 0 JobInvolvement 0 JobLevel 0 JobRole 0 JobSatisfaction 0 MaritalStatus 0 MonthlyIncome 0 MonthlyRate 0 NumCompaniesWorked 0 Over18 0 OverTime 0 PercentSalaryHike 0 PerformanceRating 0 RelationshipSatisfaction 0 StandardHours 0 StockOptionLevel 0 TotalWorkingYears 0 TrainingTimesLastYear 0 WorkLifeBalance 0 YearsAtCompany 0 YearsInCurrentRole 0 YearsSinceLastPromotion 0 YearsWithCurrManager 0 dtype: int64
Well, we got lucky here: there aren't any missing values in this dataset
Next, let's check for the existence of duplicate records in the dataset
ibm_hr_df.duplicated().sum()
0
There are also no duplicate records in the dataset
Converting OverTime binary categorical attribute to {1, 0}
# Encode the binary OverTime attribute as {1, 0}.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the selection: with pandas copy-on-write the
# in-place call only updates a temporary and the frame keeps 'Yes'/'No'.
# astype(int) pins an integer dtype regardless of how the installed pandas
# version handles downcasting after replace.
ibm_hr_df['OverTime'] = (
    ibm_hr_df['OverTime'].replace({'Yes': 1, 'No': 0}).astype(int)
)

# Drop the attributes that hold one single value for every record.
ibm_hr_df = ibm_hr_df.drop(['EmployeeCount', 'StandardHours', 'Over18'], axis=1)
# Performing variance analysis
Performing Pearson correlation analysis between attributes to aid in dimension reduction
# Pearson correlation heatmap of the numeric attributes.
# The frame still contains string-valued columns at this point; selecting
# the numeric columns explicitly keeps corr() working on pandas versions
# that no longer drop non-numeric columns silently.
numeric_attrs = ibm_hr_df.select_dtypes(include=[np.number])

plt.figure(figsize=(16, 16))
sns.heatmap(numeric_attrs.corr(), annot=True, fmt=".2f")
plt.show()
# Performing variance analysis to aid in feature selection
# One-hot encode every attribute except the Attrition column.
variance_x = ibm_hr_df.drop('Attrition', axis=1)
variance_one_hot = pd.get_dummies(variance_x)

# Normalise the dataset to [0, 1]. This is required so that the variances
# of differently scaled attributes can be compared against one threshold.
scaler = MinMaxScaler()
scaler.fit(variance_one_hot)
# Output: MinMaxScaler(copy=True, feature_range=(0, 1))
scaled_variance_one_hot = scaler.transform(variance_one_hot)

# Set the threshold value and run VarianceThreshold.
# p * (1 - p) is the variance of a 0/1 attribute that takes one value in a
# fraction p of the records; here p = 0.85.
thres = .85 * (1 - .85)
sel = VarianceThreshold(threshold=thres)
sel.fit(scaled_variance_one_hot)
variance = sel.variances_

# Sort the variances in descending order for plotting.
indices = np.argsort(variance)[::-1]
feature_list = list(variance_one_hot)
sorted_feature_list = [feature_list[i] for i in indices]
thres_list = [thres] * len(feature_list)

n_features = len(variance_one_hot.columns)
plt.figure(figsize=(14, 6))
plt.title("Feature Variance: %f" % (thres), fontsize=14)
plt.bar(range(n_features), variance[indices], color="c")
plt.xticks(range(n_features), sorted_feature_list, rotation=90)
plt.xlim([-0.5, n_features])
# Red horizontal line marking the threshold. A single format string is
# used; "k-" together with color="r" specified the colour twice.
plt.plot(range(n_features), thres_list, "r-")
plt.tight_layout()
plt.show()
# Performing Pearson correlation analysis between attributes to aid in feature selection
rAttrList = ['Department', 'OverTime', 'HourlyRate', 'StockOptionLevel',
             'DistanceFromHome', 'YearsInCurrentRole', 'Age']

# Keep only the attributes on rAttrList. An explicit copy is taken because
# columns are added and rewritten below; writing into a plain selection of
# ibm_hr_df triggers SettingWithCopyWarning or is silently lost.
label_hr_df = ibm_hr_df[rAttrList].copy()

# Convert continuous attribute DistanceFromHome to categorical:
# 1: near, 2: mid distance, 3: far
maxValues = label_hr_df['DistanceFromHome'].max()
minValues = label_hr_df['DistanceFromHome'].min()
intervals = (maxValues - minValues) / 3
bins = [0, (minValues + intervals), (maxValues - intervals), maxValues]
groupName = [1, 2, 3]
label_hr_df['CatDistanceFromHome'] = pd.cut(
    label_hr_df['DistanceFromHome'], bins, labels=groupName)
# Convert the column type from category to a numeric dtype.
label_hr_df['CatDistanceFromHome'] = pd.to_numeric(label_hr_df['CatDistanceFromHome'])
label_hr_df = label_hr_df.drop(['DistanceFromHome'], axis=1)

# Replace department with 0 & 1, 0: R&D, and 1: Non-R&D.
# Assigned back (not inplace on the selection) so the frame really changes.
label_hr_df['Department'] = label_hr_df['Department'].replace(
    ['Research & Development', 'Human Resources', 'Sales'], [0, 1, 1]
).astype(int)

# Min-max normalise every column to [0, 1].
# NOTE(review): this needs OverTime to be numeric already — confirm the
# earlier encoding step has run.
label_hr_df_norm = (label_hr_df - label_hr_df.min()) / (label_hr_df.max() - label_hr_df.min())

# Compute the class value for all rows at once. R&D rows (Department == 0)
# and non-R&D rows use different weights; the terms are kept in the same
# order as the per-row formulas they replace.
rnd_value = (
    0.3 * label_hr_df_norm['HourlyRate']
    - 0.2 * label_hr_df_norm['OverTime']
    - 0.2 * label_hr_df_norm['CatDistanceFromHome']
    + 0.15 * label_hr_df_norm['StockOptionLevel']
    + 0.1 * label_hr_df_norm['Age']
    - 0.05 * label_hr_df_norm['YearsInCurrentRole']
)
non_rnd_value = (
    0.2 * label_hr_df_norm['HourlyRate']
    - 0.3 * label_hr_df_norm['OverTime']
    - 0.15 * label_hr_df_norm['CatDistanceFromHome']
    + 0.2 * label_hr_df_norm['StockOptionLevel']
    + 0.05 * label_hr_df_norm['Age']
    - 0.1 * label_hr_df_norm['YearsInCurrentRole']
)
value_df = pd.DataFrame(
    {'ClassValue': np.where(label_hr_df_norm['Department'] == 0,
                            rnd_value, non_rnd_value)},
    index=label_hr_df_norm.index,
)

# Rank the class values once, highest first, and read both cut-offs from it.
ranked_values = value_df['ClassValue'].sort_values(ascending=False).reset_index(drop=True)
# top 500 highest class value is satisfied with their job
v1 = ranked_values[499]
# next top 500 is neutral
v2 = ranked_values[999]
# rest is unsatisfied

# Compute the class label; the first matching condition wins, so a value
# that is >= v1 is "Satisfied" even though it is also >= v2.
label_df = pd.DataFrame(
    {'ClassLabel': np.select(
        [value_df['ClassValue'] >= v1, value_df['ClassValue'] >= v2],
        ["Satisfied", "Neutral"],
        default="Unsatisfied",
    )},
    index=value_df.index,
)

df = pd.concat([ibm_hr_df, label_df], axis=1)

df = df[['Age', 'Department', 'DistanceFromHome', 'HourlyRate', 'OverTime',
         'StockOptionLevel', 'MaritalStatus', 'YearsInCurrentRole',
         'EmployeeNumber', 'ClassLabel']]
# Split dataset into attributes/features X and label/class y
# Features are every column except the class label.
X = df.drop('ClassLabel', axis=1)

# Replacing label/class value from 'Satisfied', 'Neutral' and 'Unsatisfied' to 2, 1 and 0.
# map() builds a new integer Series, so df itself is never modified and the
# label dtype does not depend on how replace() downcasts in the installed
# pandas version. Any value outside the three known labels becomes NaN.
y = df['ClassLabel'].map({'Satisfied': 2, 'Neutral': 1, 'Unsatisfied': 0})
# Performing 'one hot encoding' method
# One-hot encode the string-valued attributes of X.
one_hot = pd.get_dummies(X)

# Positions of every column whose dtype is not floating point; these are
# passed to CatBoost as categorical features.
# np.float was only an alias of the builtin float and has been removed
# from NumPy, so the builtin is used directly.
categorical_features_indices = np.where(one_hot.dtypes != float)[0]
# Now let's split our data into train (70%) and test (30%) sets:
from sklearn.model_selection import train_test_split

# 70/30 train/test split; the fixed random_state makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    one_hot, y, train_size=0.7, random_state=1234
)

# Baseline multi-class model: only the loss, the tracked metric and the
# seed are set explicitly.
model = CatBoostClassifier(
    custom_loss=['Accuracy'],
    random_seed=100,
    loss_function='MultiClass',
)

model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    verbose=True,  # you can uncomment this for text output
    # plot=True
)

# Put true and predicted classes side by side for the confusion matrix.
cm = pd.DataFrame()
cm['Satisfaction'] = y_test
cm['Predict'] = model.predict(X_test)

# Integer keys decode the true labels, float keys decode the predictions.
mappingSatisfaction = {0: 'Unsatisfied', 1: 'Neutral', 2: 'Satisfied'}
mappingPredict = {0.0: 'Unsatisfied', 1.0: 'Neutral', 2.0: 'Satisfied'}
cm = cm.replace({'Satisfaction': mappingSatisfaction,
                 'Predict': mappingPredict})

pd.crosstab(cm['Satisfaction'], cm['Predict'], margins=True)
# Output (table rows continue below):
# | Predict | Neutral | Satisfied | Unsatisfied | All |
|---|---|---|---|---|
| Satisfaction | ||||
| Neutral | 143 | 8 | 8 | 159 |
| Satisfied | 20 | 123 | 1 | 144 |
| Unsatisfied | 18 | 0 | 120 | 138 |
| All | 181 | 131 | 129 | 441 |
# Mean accuracy of the previous model on the held-out test set.
model.score(X_test, y_test)
# Output: 0.87528344671201819

# Second model: same loss, metric and seed, with l2_leaf_reg, iterations
# and fold_len_multiplier now set explicitly.
model = CatBoostClassifier(
    l2_leaf_reg=5,
    iterations=1000,
    fold_len_multiplier=1.1,
    custom_loss=['Accuracy'],
    random_seed=100,
    loss_function='MultiClass',
)

model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    verbose=True,  # you can uncomment this for text output
    # plot=True
)

# Put true and predicted classes side by side for the confusion matrix.
cm = pd.DataFrame()
cm['Satisfaction'] = y_test
cm['Predict'] = model.predict(X_test)

# Integer keys decode the true labels, float keys decode the predictions.
mappingSatisfaction = {0: 'Unsatisfied', 1: 'Neutral', 2: 'Satisfied'}
mappingPredict = {0.0: 'Unsatisfied', 1.0: 'Neutral', 2.0: 'Satisfied'}
cm = cm.replace({'Satisfaction': mappingSatisfaction,
                 'Predict': mappingPredict})

pd.crosstab(cm['Satisfaction'], cm['Predict'], margins=True)
# Output (table rows continue below):
# | Predict | Neutral | Satisfied | Unsatisfied | All |
|---|---|---|---|---|
| Satisfaction | ||||
| Neutral | 142 | 9 | 8 | 159 |
| Satisfied | 17 | 126 | 1 | 144 |
| Unsatisfied | 12 | 0 | 126 | 138 |
| All | 171 | 135 | 135 | 441 |
model.score(X_test, y_test)
0.89342403628117917
Normalization of features, after realizing that tuning no longer improves the model's accuracy
# Mean-centre every feature and divide it by its range (max - min).
one_hot = (one_hot - one_hot.mean()) / (one_hot.max() - one_hot.min())

# Positions of every column whose dtype is not floating point.
# np.float was only an alias of the builtin float and has been removed
# from NumPy, so the builtin is used directly.
# NOTE(review): the division above presumably leaves every column as a
# float, which would make this index array empty — confirm that training
# without categorical features is intended here.
categorical_features_indices = np.where(one_hot.dtypes != float)[0]

from sklearn.model_selection import train_test_split

# Same 70/30 split parameters as before, now on the normalised features.
X_train, X_test, y_train, y_test = train_test_split(
    one_hot, y, train_size=0.7, random_state=1234
)

model = CatBoostClassifier(
    l2_leaf_reg=5,
    iterations=1000,
    fold_len_multiplier=1.1,
    custom_loss=['Accuracy'],
    random_seed=100,
    loss_function='MultiClass',
)

model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    verbose=True,  # you can uncomment this for text output
    # plot=True
)

# Pair each feature name with its importance score, computed on the full
# normalised dataset.
importance_pool = Pool(one_hot, label=y, cat_features=categorical_features_indices)
feature_score = pd.DataFrame(
    list(zip(one_hot.dtypes.index, model.get_feature_importance(importance_pool))),
    columns=['Feature', 'Score'],
)

# Highest score first; the remaining sort_values arguments were the
# library defaults and are omitted.
feature_score = feature_score.sort_values(by='Score', ascending=False)

plt.rcParams["figure.figsize"] = (12, 7)
ax = feature_score.plot('Feature', 'Score', kind='bar', color='c')
ax.set_title("Catboost Feature Importance Ranking", fontsize=14)
ax.set_xlabel('')

rects = ax.patches

# get feature score as labels round to 2 decimal
labels = feature_score['Score'].round(2)

# Write each score just above its bar.
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height + 0.35, label,
            ha='center', va='bottom')

plt.show()

# Put true and predicted classes side by side for the confusion matrix.
cm = pd.DataFrame()
cm['Satisfaction'] = y_test
cm['Predict'] = model.predict(X_test)

# Integer keys decode the true labels, float keys decode the predictions.
mappingSatisfaction = {0: 'Unsatisfied', 1: 'Neutral', 2: 'Satisfied'}
mappingPredict = {0.0: 'Unsatisfied', 1.0: 'Neutral', 2.0: 'Satisfied'}
cm = cm.replace({'Satisfaction': mappingSatisfaction, 'Predict': mappingPredict})

pd.crosstab(cm['Satisfaction'], cm['Predict'], margins=True)
# Output (table rows continue below):
# | Predict | Neutral | Satisfied | Unsatisfied | All |
|---|---|---|---|---|
| Satisfaction | ||||
| Neutral | 146 | 11 | 2 | 159 |
| Satisfied | 7 | 137 | 0 | 144 |
| Unsatisfied | 8 | 0 | 130 | 138 |
| All | 161 | 148 | 132 | 441 |
model.score(X_test, y_test)0.93650793650793651 

