Skip to content

Commit a224981

Browse files
authored
Merge pull request #7 from Anshuadhikari/patch-2
DATACAMP2021 Cleaning Data with Python Chapter-2
2 parents f6208c4 + cf9b0ae commit a224981

File tree

1 file changed

+90
-0
lines changed

1 file changed

+90
-0
lines changed
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
##Finding consistency
2+
3+
# Print categories DataFrame
4+
print(categories)
5+
6+
# Print unique values of survey columns in airlines
7+
print('Cleanliness: ', airlines['cleanliness'].unique(), "\n")
8+
print('Safety: ', airlines['safety'].unique(), "\n")
9+
print('Satisfaction: ', airlines['satisfaction'].unique(), "\n")
10+
11+
# Find the cleanliness category in airlines not in categories
12+
cat_clean = set(airlines['cleanliness']).difference(categories['cleanliness'])
13+
14+
# Find rows with that category
15+
cat_clean_rows = airlines['cleanliness'].isin(cat_clean)
16+
17+
# Print rows with inconsistent category
18+
print(airlines[cat_clean_rows])
19+
20+
#Print rows with consistent category
21+
print(airlines[~cat_clean_rows])
22+
23+
##Categorical variables-Inconsistent categories
24+
25+
# Print unique values of both columns
26+
print(airlines['dest_region'].unique())
27+
print(airlines['dest_size'].unique())
28+
29+
# Print unique values of both columns
30+
print(airlines['dest_region'].unique())
31+
print(airlines['dest_size'].unique())
32+
33+
# Lowercase the dest_region column, then replace "eur" with "europe"
34+
airlines['dest_region'] = airlines['dest_region'].str.lower()
35+
airlines['dest_region'] = airlines['dest_region'].replace({'eur':'europe'})
36+
37+
# Strip leading and trailing whitespace from `dest_size`
38+
airlines['dest_size'] = airlines['dest_size'].str.strip()
39+
40+
# Verify that the changes have taken effect
41+
print(airlines['dest_region'].unique())
42+
print(airlines['dest_size'].unique())
43+
44+
##Remapping categories
45+
46+
# Create ranges for categories
47+
label_ranges = [0, 60, 180, np.inf]
48+
label_names = ['short', 'medium', 'long']
49+
50+
# Create wait_type column
51+
airlines['wait_type'] = pd.cut(airlines['wait_min'], bins = label_ranges,
52+
labels = label_names)
53+
54+
# Create mappings and replace
55+
mappings = {'Monday':'weekday', 'Tuesday':'weekday', 'Wednesday': 'weekday',
56+
'Thursday': 'weekday', 'Friday': 'weekday',
57+
'Saturday': 'weekend', 'Sunday': 'weekend'}
58+
59+
airlines['day_week'] = airlines['day'].replace(mappings)
60+
61+
##Removing titles and taking names
62+
63+
# Replace "Dr." with empty string ""
64+
airlines['full_name'] = airlines['full_name'].str.replace("Dr.", "", regex=False)  # literal match: as a regex the '.' would match any character
65+
66+
# Replace "Mr." with empty string ""
67+
airlines['full_name'] = airlines['full_name'].str.replace("Mr.", "", regex=False)  # literal match: as a regex the '.' would match any character
68+
69+
# Replace "Miss" with empty string ""
70+
airlines['full_name'] = airlines['full_name'].str.replace("Miss", "", regex=False)  # explicit literal match, independent of the pandas default
71+
72+
# Replace "Ms." with empty string ""
73+
airlines['full_name'] = airlines['full_name'].str.replace("Ms.", "", regex=False)  # literal match: as a regex the '.' would match any character
74+
75+
# Assert that full_name has no honorifics
76+
# str.contains treats the pattern as a regex, so the dots are escaped to match only literal "Ms.", "Mr.", "Dr." (or "Miss"); missing names count as clean
assert not airlines['full_name'].str.contains(r'Ms\.|Mr\.|Miss|Dr\.', na=False).any()
77+
78+
##Keeping it descriptive
79+
# Store the character length of each entry in the survey_response column
80+
resp_length = airlines['survey_response'].str.len()
81+
82+
# Find rows in airlines where resp_length > 40
83+
airlines_survey = airlines[resp_length > 40]
84+
85+
# Assert minimum survey_response length is > 40
86+
# Check every remaining response directly: .min() is NaN when no rows survive the filter, which would fail the assert spuriously
assert (airlines_survey['survey_response'].str.len() > 40).all()
87+
88+
# Print new survey_response column
89+
print(airlines_survey['survey_response'])
90+

0 commit comments

Comments
 (0)