1
1
import pandas as pd
2
2
3
-
4
3
def calculate_demographic_data (print_data = True ):
5
4
# Read data from file
6
- df = None
5
+ df = pd . read_csv ( 'adult.data.csv' )
7
6
8
7
# How many of each race are represented in this dataset? This should be a Pandas series with race names as the index labels.
9
- race_count = None
8
+ race_count = {}
9
+ for race in df ['race' ]:
10
+ if race in race_count :
11
+ race_count [race ] += 1
12
+ else :
13
+ race_count [race ] = 1
14
+ race_count = pd .Series (data = race_count )
10
15
11
16
# What is the average age of men?
12
- average_age_men = None
17
+ sum_age_men = 0
18
+ no_of_men = 0
19
+ for age , sex in zip (df ["age" ], df ["sex" ]):
20
+ if sex == "Male" :
21
+ no_of_men += 1
22
+ sum_age_men += age
23
+ average_age_men = round (sum_age_men / no_of_men , 1 )
13
24
14
25
# What is the percentage of people who have a Bachelor's degree?
15
- percentage_bachelors = None
26
+ no_of_people = 0
27
+ no_of_bachelors = 0
28
+ for degree in df ['education' ]:
29
+ no_of_people += 1
30
+ if degree == "Bachelors" :
31
+ no_of_bachelors += 1
32
+ percentage_bachelors = round (no_of_bachelors / no_of_people * 100 , 1 )
16
33
17
34
# What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?
18
35
# What percentage of people without advanced education make more than 50K?
19
36
20
37
# with and without `Bachelors`, `Masters`, or `Doctorate`
21
- higher_education = None
22
- lower_education = None
23
-
38
+ higher_education = 0
39
+ lower_education = 0
24
40
# percentage with salary >50K
25
- higher_education_rich = None
26
- lower_education_rich = None
41
+ higher_education_rich = 0
42
+ lower_education_rich = 0
43
+ for education , salary in zip (df ['education' ], df ['salary' ]):
44
+ if education in ('Bachelors' , 'Masters' , 'Doctorate' ):
45
+ higher_education += 1
46
+ if salary == '>50K' :
47
+ higher_education_rich += 1
48
+ else :
49
+ lower_education += 1
50
+ if salary == '>50K' :
51
+ lower_education_rich += 1
52
+ lower_education_rich = round (lower_education_rich / lower_education * 100 , 1 )
53
+ higher_education_rich = round (higher_education_rich / higher_education * 100 , 1 )
27
54
28
55
# What is the minimum number of hours a person works per week (hours-per-week feature)?
29
56
min_work_hours = None
57
+ for hours in df ['hours-per-week' ]:
58
+ if min_work_hours is None or hours < min_work_hours :
59
+ min_work_hours = hours
30
60
31
61
# What percentage of the people who work the minimum number of hours per week have a salary of >50K?
32
- num_min_workers = None
33
-
34
- rich_percentage = None
62
+ num_min_workers = 0
63
+ min_work_hours_rich = 0
64
+ for hours , salary in zip (df ['hours-per-week' ], df ['salary' ]):
65
+ if hours == min_work_hours :
66
+ if salary == '>50K' :
67
+ min_work_hours_rich += 1
68
+ num_min_workers += 1
69
+ rich_percentage = round (min_work_hours_rich / num_min_workers * 100 )
35
70
36
71
# What country has the highest percentage of people that earn >50K?
37
- highest_earning_country = None
38
- highest_earning_country_percentage = None
72
+ country_earnings = {}
73
+ for country , salary in zip (df ['native-country' ], df ['salary' ]):
74
+ if country in country_earnings :
75
+ country_earnings [country ][1 ] += 1
76
+ else :
77
+ country_earnings [country ] = [0 , 1 ]
78
+ if salary == ">50K" :
79
+ country_earnings [country ][0 ] += 1
80
+
81
+ highest_earning_country = max (country_earnings , key = lambda x : country_earnings [x ][0 ] / country_earnings [x ][1 ])
82
+ highest_earning_country_percentage = round (country_earnings [highest_earning_country ][0 ] / country_earnings [highest_earning_country ][1 ] * 100 , 1 )
39
83
40
84
# Identify the most popular occupation for those who earn >50K in India.
41
- top_IN_occupation = None
85
+ top_IN_occupation_dict = {}
86
+ for occupation , country , salary in zip (df ["occupation" ], df ["native-country" ], df ["salary" ]):
87
+ if country == "India" and salary == ">50K" :
88
+ if occupation in top_IN_occupation_dict :
89
+ top_IN_occupation_dict [occupation ] += 1
90
+ else :
91
+ top_IN_occupation_dict [occupation ] = 1
92
+ top_IN_occupation = max (top_IN_occupation_dict , key = lambda x : top_IN_occupation_dict [x ])
42
93
43
94
# DO NOT MODIFY BELOW THIS LINE
44
-
45
95
if print_data :
46
96
print ("Number of each race:\n " , race_count )
47
97
print ("Average age of men:" , average_age_men )
@@ -63,7 +113,6 @@ def calculate_demographic_data(print_data=True):
63
113
'min_work_hours' : min_work_hours ,
64
114
'rich_percentage' : rich_percentage ,
65
115
'highest_earning_country' : highest_earning_country ,
66
- 'highest_earning_country_percentage' :
67
- highest_earning_country_percentage ,
116
+ 'highest_earning_country_percentage' : highest_earning_country_percentage ,
68
117
'top_IN_occupation' : top_IN_occupation
69
- }
118
+ }
0 commit comments