Skip to the content.

Applied Data Science Capstone

Data Collection

Data Collection API

The SpaceX REST API endpoints:

import requests
import pandas as pd

# Endpoint queried for past SpaceX launches (v4 of the REST API).
spacex_url = "https://api.spacexdata.com/v4/launches/past"

# A timeout keeps the call from hanging indefinitely if the API is unreachable.
response = requests.get(spacex_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error payload.
response.raise_for_status()

# Flatten the (possibly nested) JSON records into a flat table.
data = pd.json_normalize(response.json())

Jupyter Notebook: Data Collection API

Web Scraping

Data Source: Wikipedia: List of Falcon 9 and Falcon Heavy launches

Jupyter Notebook: Web Scraping

Data Wrangling

Jupyter Notebook: Data Wrangling


↥ back to top


Exploratory Data Analysis

Exploratory Analysis Using SQL

Jupyter Notebook: Using SQL

Exploratory Analysis Using Pandas and Matplotlib

Jupyter Notebook: Data Visualization

Interactive Visual Analytics and Dashboards

Jupyter Notebook: Folium

def assign_marker_color(launch_outcome):
    """Return the marker color for a launch outcome.

    An outcome equal to 1 maps to 'green'; every other value maps to 'red'.
    """
    return 'green' if launch_outcome == 1 else 'red'


# Store the color for each row, derived from that row's `class` value.
spacex_df['marker_color'] = spacex_df['class'].apply(assign_marker_color)
# Collect all launch markers in one MarkerCluster attached to the site map.
marker_cluster = MarkerCluster()
site_map.add_child(marker_cluster)

# Add one marker per row of spacex_df, placed at the row's coordinates.
# The icon itself is white; its glyph takes the row's `marker_color`, so the
# outcome of each launch is visible on the map.
launch_columns = zip(
    spacex_df['Lat'],
    spacex_df['Long'],
    spacex_df['marker_color'],
)
for latitude, longitude, outcome_color in launch_columns:
    outcome_icon = folium.Icon(color='white', icon_color=outcome_color)
    marker = folium.map.Marker([latitude, longitude], icon=outcome_icon)
    marker_cluster.add_child(marker)


↥ back to top


SpaceX Dash App

SpaceX Dash App Assignment Guide

# Import required libraries
import pandas as pd
import dash
import dash_html_components as html
import dash_core_components as dcc
from dash.dependencies import Input, Output
import plotly.express as px

# Read the SpaceX launch data into a pandas dataframe
spacex_df = pd.read_csv("spacex_launch_dash.csv")
max_payload = spacex_df['Payload Mass (kg)'].max()
min_payload = spacex_df['Payload Mass (kg)'].min()

# Create a dash application
app = dash.Dash(__name__)

# Create an app layout
app.layout = html.Div(children=[
    html.H1('SpaceX Launch Records Dashboard',
            style={'textAlign': 'center', 'color': '#503D36',
                   'font-size': 40}),
    # TASK 1: Dropdown list to enable Launch Site selection.
    dcc.Dropdown(
        id='site-dropdown',
        options=[
            {'label': 'All Sites', 'value': 'ALL'},
            {'label': 'CCAFS LC-40', 'value': 'CCAFS LC-40'},
            {'label': 'CCAFS SLC-40', 'value': 'CCAFS SLC-40'},
            {'label': 'KSC LC-39A', 'value': 'KSC LC-39A'},
            {'label': 'VAFB SLC-4E', 'value': 'VAFB SLC-4E'}
        ],
        # The default must equal the 'ALL' option value exactly: the
        # callbacks compare against 'ALL', so a differently-cased default
        # would be treated as an unknown site and render empty charts.
        value='ALL',
        placeholder="Select a Launch Site here",
        searchable=True
    ),
    html.Br(),
    # TASK 2: Pie chart showing the total successful launches for all sites,
    # or the Success vs. Failed counts when a specific site is selected.
    html.Div(dcc.Graph(id='success-pie-chart')),
    html.Br(),
    html.P("Payload range (Kg):"),
    # TASK 3: Slider to select the payload range.
    dcc.RangeSlider(id='payload-slider',
                    min=0, max=10000, step=1000,
                    value=[min_payload, max_payload]),
    # TASK 4: Scatter chart showing the correlation between payload and
    # launch success.
    html.Div(dcc.Graph(id='success-payload-scatter-chart')),
])


# TASK 2:
# Callback with `site-dropdown` as input and `success-pie-chart` as output.
@app.callback(Output(component_id='success-pie-chart', component_property='figure'),
              Input(component_id='site-dropdown', component_property='value'))
def get_pie_chart(entered_site):
    """Build the pie chart figure for the selected launch site.

    Args:
        entered_site: Current value of the site dropdown. 'ALL' (or None,
            when the dropdown has been cleared) selects every site.

    Returns:
        A plotly figure. For all sites, the `class` values are summed per
        launch site; for a single site, rows are counted per `class` value.
    """
    if entered_site is None or entered_site == 'ALL':
        # Summing `class` per site works because a success is recorded as 1.
        return px.pie(spacex_df,
                      values='class',
                      names='Launch Site',
                      title='Total Success Launches by Site')

    # No `values` here: px.pie counts the rows for each `class` value.
    site_df = spacex_df[spacex_df['Launch Site'] == entered_site]
    return px.pie(site_df,
                  names='class',
                  title=f'Total Success Launches for {entered_site}')


# TASK 4:
# Callback with `site-dropdown` and `payload-slider` as inputs and
# `success-payload-scatter-chart` as output.
@app.callback(Output(component_id='success-payload-scatter-chart', component_property='figure'),
              [Input(component_id='site-dropdown', component_property='value'),
               Input(component_id="payload-slider", component_property="value")])
def get_scatter_plot(entered_site, payload):
    """Build the payload-vs-outcome scatter figure.

    Args:
        entered_site: Current value of the site dropdown. 'ALL' (or None,
            when the dropdown has been cleared) selects every site.
        payload: Two-element sequence `[low, high]` from the range slider;
            both bounds are inclusive.

    Returns:
        A plotly scatter figure of `Payload Mass (kg)` against `class`,
        colored by `Booster Version Category`.
    """
    low, high = payload
    # The payload range applies to every view, including "All Sites".
    payload_mass = spacex_df['Payload Mass (kg)']
    filtered_df = spacex_df[(payload_mass >= low) & (payload_mass <= high)]

    if entered_site is None or entered_site == 'ALL':
        title = 'Correlation between Payload and Success for all Sites'
    else:
        filtered_df = filtered_df[filtered_df['Launch Site'] == entered_site]
        title = f'Correlation between Payload and Success for {entered_site}'

    return px.scatter(filtered_df,
                      x='Payload Mass (kg)',
                      y="class",
                      color="Booster Version Category",
                      title=title)


# Run the app
if __name__ == '__main__':
    app.run_server()

Plotly Dash Reference links:


↥ back to top


Predictive Analysis (Classification)

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


def plot_confusion_matrix(y, y_predict):
    """Plot the confusion matrix of true labels `y` against `y_predict`.

    The matrix is drawn as an annotated seaborn heatmap on the current
    matplotlib subplot; nothing is returned.
    """
    cm = confusion_matrix(y, y_predict)
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax=ax)  # annot=True to annotate cells
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(['did not land', 'land'])
    ax.yaxis.set_ticklabels(['did not land', 'landed'])


# load data
data = pd.read_csv("dataset_part_2.csv")
X = pd.read_csv('dataset_part_3.csv')
Y = data['Class'].to_numpy()

# Standardize the features.
# NOTE(review): the scaler is fitted on all rows before the split, so test-set
# statistics influence the scaling - confirm whether that is acceptable here.
X = preprocessing.StandardScaler().fit(X).transform(X)

# split data
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=2)
print ('Train set:', X_train.shape, Y_train.shape)
print ('Test set:', X_test.shape, Y_test.shape)

# build model to search best parameter
parameters = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
              'p': [1, 2]}
KNN = KNeighborsClassifier()
knn_cv = GridSearchCV(KNN, parameters, cv=10)
# Fit on the training split only: fitting on the full X, Y would let the
# model see the held-out rows and make the test score below meaningless.
knn_cv.fit(X_train, Y_train)
print("tuned hpyerparameters :(best parameters) ",knn_cv.best_params_)
print("accuracy :",knn_cv.best_score_)

# accuracy on test data (printed so the value is not lost outside a notebook)
print("test accuracy :", knn_cv.score(X_test, Y_test))

# plot the confusion matrix
yhat = knn_cv.predict(X_test)
plot_confusion_matrix(Y_test, yhat)

Jupyter Notebook: Machine Learning Prediction

How to Present Your Findings

Structure of a Report


↥ back to top