|
## Finding consistency

# Show the reference table of category values.
print(categories)

# Show the distinct values found in each survey column of airlines.
for heading, column in (
    ('Cleanliness: ', 'cleanliness'),
    ('Safety: ', 'safety'),
    ('Satisfaction: ', 'satisfaction'),
):
    print(heading, airlines[column].unique(), "\n")

# Cleanliness values present in airlines but missing from categories.
cat_clean = set(airlines['cleanliness']) - set(categories['cleanliness'])

# Boolean mask: True for rows whose cleanliness is one of those values.
cat_clean_rows = airlines['cleanliness'].isin(cat_clean)

# Rows carrying an inconsistent cleanliness category.
print(airlines[cat_clean_rows])

# Rows carrying a consistent cleanliness category (mask inverted).
print(airlines[~cat_clean_rows])
| 22 | + |
## Categorical variables - inconsistent categories

# Show the distinct raw values of both columns once, before any cleaning.
print(airlines['dest_region'].unique())
print(airlines['dest_size'].unique())

# Normalise dest_region: lower-case every value, then expand "eur".
# Series.replace with a dict swaps whole cell values only, so a cell that
# already reads "europe" is left untouched.
airlines['dest_region'] = (
    airlines['dest_region'].str.lower().replace({'eur': 'europe'})
)

# Remove leading and trailing whitespace from dest_size.
airlines['dest_size'] = airlines['dest_size'].str.strip()

# Show the distinct values again to verify the changes took effect.
print(airlines['dest_region'].unique())
print(airlines['dest_size'].unique())
| 43 | + |
## Remapping categories

# Bin edges and the label assigned to each bin of wait_min:
#   [0, 60] -> short, (60, 180] -> medium, (180, inf] -> long
label_ranges = [0, 60, 180, np.inf]
label_names = ['short', 'medium', 'long']

# Create wait_type column.
# pd.cut builds right-closed bins by default, which would leave a wait of
# exactly 0 outside the first bin (NaN). include_lowest=True closes the
# first bin on the left so 0 is labelled 'short'. Negative values still
# fall outside every bin and stay NaN.
airlines['wait_type'] = pd.cut(
    airlines['wait_min'],
    bins=label_ranges,
    labels=label_names,
    include_lowest=True,
)

# Map each day name to 'weekday' or 'weekend'. Series.replace leaves any
# value that is not a key of the mapping unchanged.
mappings = {
    'Monday': 'weekday',
    'Tuesday': 'weekday',
    'Wednesday': 'weekday',
    'Thursday': 'weekday',
    'Friday': 'weekday',
    'Saturday': 'weekend',
    'Sunday': 'weekend',
}

airlines['day_week'] = airlines['day'].replace(mappings)
| 60 | + |
## Removing titles and taking names

# Remove each honorific from full_name as a literal substring.
# regex=False is passed explicitly because the default of Series.str.replace
# differs between pandas versions; under regex matching the "." in "Dr."
# matches any character and would also delete e.g. "Dre" from "Drew".
for honorific in ("Dr.", "Mr.", "Miss", "Ms."):
    airlines['full_name'] = airlines['full_name'].str.replace(
        honorific, "", regex=False
    )

# Assert that full_name has no honorifics left.
# str.contains treats its pattern as a regex, so the dots are escaped to
# match a literal "."; na=False counts missing names as "no honorific".
assert not airlines['full_name'].str.contains(
    r'Ms\.|Mr\.|Miss|Dr\.', na=False
).any()
| 77 | + |
## Keeping it descriptive

# Character count of every survey_response entry.
resp_length = airlines['survey_response'].str.len()

# Keep only the rows whose response is longer than 40 characters.
airlines_survey = airlines[resp_length > 40]

# Assert every remaining response is longer than 40 characters.
# The check is element-wise with .all() rather than min() > 40: when the
# filter keeps no rows, min() is NaN and the comparison would fail even
# though nothing violates the condition.
assert (airlines_survey['survey_response'].str.len() > 40).all()

# Print the filtered survey_response column.
print(airlines_survey['survey_response'])
| 90 | + |
0 commit comments