Skip to content

Commit f138f89

Browse files
Create script.py
1 parent 6276a57 commit f138f89

File tree

35 files changed

+440
-0
lines changed

35 files changed

+440
-0
lines changed
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Load the Vermont 2016 tax CSV into a data frame and preview it.
import pandas as pd

# Parse the comma-separated file with read_csv's default settings
csv_path = 'vt_tax_data_2016.csv'
data = pd.read_csv(csv_path)

# Display the leading rows of the frame
preview = data.head()
print(preview)
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Chart the total number of tax returns per income group from the TSV file.
import pandas as pd

# The file is tab-delimited, so override read_csv's default separator
data = pd.read_csv('vt_tax_data_2016.tsv', sep='\t')

# Sum the N1 column within each agi_stub group
rows_by_income_group = data.groupby("agi_stub")
counts = rows_by_income_group["N1"].sum()

# Draw the per-group totals as a vertical bar chart
counts.plot.bar()
# NOTE(review): `plt` is not imported in this snippet; presumably
# matplotlib.pyplot is supplied by the surrounding environment. Confirm.
plt.show()
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Load only a subset of columns, then total them per income level.
# `pd` is expected to already be in scope (it is not imported here).

# Names of the columns to keep while parsing
cols = ['zipcode', 'agi_stub', 'mars1', 'MARS2', 'NUMDEP']

# Parse just the listed columns from the CSV
data = pd.read_csv("vt_tax_data_2016.csv", usecols=cols)

# Sum the remaining columns within each agi_stub group and display the result
totals_by_income_level = data.groupby("agi_stub").sum()
print(totals_by_income_level)
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Read a second 500-row slice of the CSV, reusing the column labels of
# vt_data_first500 (which must already exist; it is defined outside this
# snippet, as is `pd`).
column_labels = vt_data_first500.columns
vt_data_next500 = pd.read_csv(
    "vt_tax_data_2016.csv",
    skiprows=500,    # skip the first 500 lines of the file
    nrows=500,       # then read at most 500 rows
    header=None,     # no header row is parsed here ...
    names=column_labels,  # ... labels are supplied explicitly instead
)
# NOTE(review): skiprows counts every line from the top of the file, so if
# line 0 is the header and vt_data_first500 was read with nrows=500, this
# slice may begin on that frame's last row. Confirm whether 501 was intended.

# Print the head of both frames to confirm they hold different rows
for frame in (vt_data_first500, vt_data_next500):
    print(frame.head())
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Force explicit dtypes for two columns while loading the CSV.
# `pd` is expected to already be in scope (it is not imported here).

# Column name -> dtype to apply during parsing
data_types = dict(agi_stub='category', zipcode='str')

# Parse the file with the dtype overrides applied
data = pd.read_csv("vt_tax_data_2016.csv", dtype=data_types)

# Show the dtypes of the first few columns of the result
column_dtypes = data.dtypes
print(column_dtypes.head())
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Treat 0 in the zipcode column as a missing value while loading the CSV.
# `pd` is expected to already be in scope (it is not imported here).

# Column name -> value that should be read as NA
null_values = dict(zipcode=0)

# Parse the file with the per-column NA marker applied
data = pd.read_csv("vt_tax_data_2016.csv", na_values=null_values)

# Display only the rows whose zipcode came out as NA
zip_is_missing = data["zipcode"].isna()
print(data[zip_is_missing])
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Load a CSV that contains malformed records, skipping the bad lines with a
# warning instead of aborting the whole read.
# `pd` is expected to already be in scope (it is not imported here).
try:
    # on_bad_lines="warn" replaces error_bad_lines=False / warn_bad_lines=True,
    # which were deprecated in pandas 1.3 and removed in pandas 2.0.
    data = pd.read_csv("vt_tax_data_2016_corrupt.csv",
                       on_bad_lines="warn")
except pd.errors.ParserError:
    # pd.errors.ParserError is the public home of the exception formerly
    # reachable as pd.io.common.CParserError, which no longer exists.
    print("Your data contained rows that could not be parsed.")
else:
    # View first 5 records (only reached when the read succeeded)
    print(data.head())
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Load the survey spreadsheet into a data frame and preview it.
import pandas as pd

# Read the workbook with read_excel's default settings
workbook_path = 'fcc_survey.xlsx'
survey_responses = pd.read_excel(workbook_path)

# Display the leading rows of the frame
preview = survey_responses.head()
print(preview)
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Load selected spreadsheet columns, skipping the leading rows of the sheet.
# `pd` is expected to already be in scope (it is not imported here).

# Excel-style column letters to load: one single column plus one range
col_string = "AD, AW:BA"

# Skip the first two rows of the sheet and keep only the chosen columns
survey_responses = pd.read_excel(
    "fcc_survey_headers.xlsx",
    usecols=col_string,
    skiprows=2,
)

# Show which column names were actually loaded
selected_columns = survey_responses.columns
print(selected_columns)
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Chart how many respondents gave each JobPref answer, using the workbook's
# second worksheet. `pd` is expected to already be in scope.

# sheet_name=1 selects the worksheet by zero-based position (the second one).
# NOTE(review): the original author also left sheet_name='2017' as a by-name
# alternative; presumably it refers to the same sheet. Confirm.
responses_2017 = pd.read_excel("fcc_survey.xlsx", sheet_name=1)

# Count the non-null JobPref entries within each JobPref group
answers_by_pref = responses_2017.groupby("JobPref")
job_prefs = answers_by_pref["JobPref"].count()

# Draw the counts as a horizontal bar chart
job_prefs.plot.barh()
# NOTE(review): `plt` is not imported in this snippet; presumably
# matplotlib.pyplot is supplied by the surrounding environment. Confirm.
plt.show()

0 commit comments

Comments
 (0)