elmoallistair
diff --git a/‎Streamlined Data Ingestion with pandas/01 Importing Data from Flat Files/01 Get data from CSVs/script.py‎
Lines changed: 8 additions & 0 deletions b/‎Streamlined Data Ingestion with pandas/01 Importing Data from Flat Files/01 Get data from CSVs/script.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎Streamlined Data Ingestion with pandas/01 Importing Data from Flat Files/02 Get data from other flat files/script.py‎
Lines changed: 10 additions & 0 deletions b/‎Streamlined Data Ingestion with pandas/01 Importing Data from Flat Files/02 Get data from other flat files/script.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎Streamlined Data Ingestion with pandas/01 Importing Data from Flat Files/03 Import a subset of columns/script.py‎
Lines changed: 8 additions & 0 deletions b/‎Streamlined Data Ingestion with pandas/01 Importing Data from Flat Files/03 Import a subset of columns/script.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎Streamlined Data Ingestion with pandas/01 Importing Data from Flat Files/04 Import a file in chunks/script.py‎
Lines changed: 10 additions & 0 deletions b/‎Streamlined Data Ingestion with pandas/01 Importing Data from Flat Files/04 Import a file in chunks/script.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎Streamlined Data Ingestion with pandas/01 Importing Data from Flat Files/05 Specify data types/script.py‎
Lines changed: 9 additions & 0 deletions b/‎Streamlined Data Ingestion with pandas/01 Importing Data from Flat Files/05 Specify data types/script.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎Streamlined Data Ingestion with pandas/01 Importing Data from Flat Files/06 Set custom NA values/script.py‎
Lines changed: 9 additions & 0 deletions b/‎Streamlined Data Ingestion with pandas/01 Importing Data from Flat Files/06 Set custom NA values/script.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎Streamlined Data Ingestion with pandas/01 Importing Data from Flat Files/07 Skip bad data/script.py‎
Lines changed: 11 additions & 0 deletions b/‎Streamlined Data Ingestion with pandas/01 Importing Data from Flat Files/07 Skip bad data/script.py‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎Streamlined Data Ingestion with pandas/02 Importing Data From Excel Files/01 Get data from a spreadsheet/script.py‎
Lines changed: 8 additions & 0 deletions b/‎Streamlined Data Ingestion with pandas/02 Importing Data From Excel Files/01 Get data from a spreadsheet/script.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎Streamlined Data Ingestion with pandas/02 Importing Data From Excel Files/02 Load a portion of a spreadsheet/script.py‎
Lines changed: 10 additions & 0 deletions b/‎Streamlined Data Ingestion with pandas/02 Importing Data From Excel Files/02 Load a portion of a spreadsheet/script.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎Streamlined Data Ingestion with pandas/02 Importing Data From Excel Files/03 Select a single sheet/script.py‎
Lines changed: 9 additions & 0 deletions b/‎Streamlined Data Ingestion with pandas/02 Importing Data From Excel Files/03 Select a single sheet/script.py‎
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,8 @@
+# Import pandas as pd
+import pandas as pd
+
+# Read the CSV and assign it to the variable data
+data = pd.read_csv('vt_tax_data_2016.csv')
+
+# View the first few lines of data
+print(data.head())
@@ -0,0 +1,10 @@
+# Import pandas with the alias pd
+import pandas as pd
+
+# Load TSV using the sep keyword argument to set delimiter
+data = pd.read_csv('vt_tax_data_2016.tsv', sep='\t')
+
+# Plot the total number of tax returns by income group
+counts = data.groupby("agi_stub").N1.sum()
+counts.plot.bar()
+plt.show()
@@ -0,0 +1,8 @@
+# Create list of columns to use
+cols = ['zipcode','agi_stub','mars1','MARS2','NUMDEP']
+
+# Create data frame from csv using only selected columns
+data = pd.read_csv("vt_tax_data_2016.csv", usecols=cols)
+
+# View counts of dependents and tax returns by income level
+print(data.groupby("agi_stub").sum())
@@ -0,0 +1,10 @@
+# Create data frame of next 500 rows with labeled columns
+vt_data_next500 = pd.read_csv("vt_tax_data_2016.csv", 
+ nrows=500,
+ skiprows=500,
+ header=None,
+ names=vt_data_first500.columns)
+
+# View the Vermont data frames to confirm they're different
+print(vt_data_first500.head())
+print(vt_data_next500.head())
@@ -0,0 +1,9 @@
+# Create dict specifying data types for agi_stub and zipcode
+data_types = {'agi_stub':'category',
+ 'zipcode':'str'}
+
+# Load csv using dtype to set correct data types
+data = pd.read_csv("vt_tax_data_2016.csv", dtype=data_types)
+
+# Print data types of resulting frame
+print(data.dtypes.head())
@@ -0,0 +1,9 @@
+# Create dict specifying that 0s in zipcode are NA values
+null_values = {'zipcode':0}
+
+# Load csv using na_values keyword argument
+data = pd.read_csv("vt_tax_data_2016.csv", 
+ na_values=null_values)
+
+# View rows with NA ZIP codes
+print(data[data.zipcode.isna()])
@@ -0,0 +1,11 @@
+try:
+ # Set warn_bad_lines to issue warnings about bad records
+ data = pd.read_csv("vt_tax_data_2016_corrupt.csv", 
+ error_bad_lines=False, 
+ warn_bad_lines=True)
+ 
+ # View first 5 records
+ print(data.head())
+ 
+except pd.io.common.CParserError:
+ print("Your data contained rows that could not be parsed.")
@@ -0,0 +1,8 @@
+# Load pandas as pd
+import pandas as pd
+
+# Read spreadsheet and assign it to survey_responses
+survey_responses = pd.read_excel('fcc_survey.xlsx')
+
+# View the head of the data frame
+print(survey_responses.head())
@@ -0,0 +1,10 @@
+# Create string of lettered columns to load
+col_string = "AD, AW:BA"
+
+# Load data with skiprows and usecols set
+survey_responses = pd.read_excel("fcc_survey_headers.xlsx", 
+ skiprows=2, 
+ usecols=col_string)
+
+# View the names of the columns selected
+print(survey_responses.columns)
@@ -0,0 +1,9 @@
+# Create df from second worksheet by referencing its position
+responses_2017 = pd.read_excel("fcc_survey.xlsx",
+ sheet_name=1)
+ # sheet_name='2017')
+
+# Graph where people would like to get a developer job
+job_prefs = responses_2017.groupby("JobPref").JobPref.count()
+job_prefs.plot.barh()
+plt.show()