|
import os

import numpy as np
import pandas as pd
from mlaide import client
from mlaide.model import ArtifactRef
from sklearn import metrics
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Create the connection to the ML Aide server.
#
# The server URL and the API key can be overridden with the MLAIDE_SERVER_URL
# and MLAIDE_API_KEY environment variables, so credentials do not have to be
# edited into (or committed with) this script. When the variables are unset,
# the original values are used, so the script keeps working unchanged against
# the localhost server.
# NOTE(review): the fallback API key is still stored in source; consider
# rotating it and dropping the default once the environment variable is set
# everywhere this script runs.
options = client.MvcOptions(
    mvc_server_url=os.environ.get(
        'MLAIDE_SERVER_URL',
        'http://localhost:8881/api/v1',
    ),
    api_key=os.environ.get(
        'MLAIDE_API_KEY',
        'NTIxYmUxOWMtNTJkMi00NDQ0LTljYjUtMTU1ZWZhMDFjYWFmOuKCqDdBPMKw4oKjwrjCsOKCsHTigrFj',
    ),
)
mlaide_client = client.MvcClient(project_key='usa-housing', options=options)

# Run "data preparation": open a tracked run, load the housing CSV and attach
# that same file to the run as a dataset artifact.
run_data_preparation = mlaide_client.start_new_run(
    experiment_key='linear-regression',
    run_name='data preparation',
)

# The CSV is both read into a DataFrame and uploaded as the artifact file, so
# keep its path in one place.
housing_csv_path = 'data/housing.csv'
housing_data = pd.read_csv(housing_csv_path)

artifact = run_data_preparation.create_artifact(
    name="USA housing dataset",
    artifact_type="dataset",
    metadata={},
)
run_data_preparation.add_artifact_file(artifact, housing_csv_path)
run_data_preparation.set_completed_status()


# Run "pipeline setup": create a run with a reference to version 1 of the
# dataset artifact.
artifact_ref = ArtifactRef(name="USA housing dataset", version=1)
run_pipeline_setup = mlaide_client.start_new_run(
    experiment_key='linear-regression',
    run_name='pipeline setup',
    used_artifacts=[artifact_ref],
)

# Feature matrix and target column.
feature_columns = [
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population',
]
X = housing_data[feature_columns]
y = housing_data['Price']

# Train/test split; the split settings are logged as parameters of the run.
test_size = 0.3
random_state = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

for _param_name, _param_value in (
    ('test_size', test_size),
    ('random_state', random_state),
):
    run_pipeline_setup.log_parameter(_param_name, _param_value)

# Pipeline with a single standard-scaler step. It is fitted on the training
# split only and then applied to the test split; the fitted pipeline is stored
# in ML Aide as a model of this run.
pipeline = Pipeline([('std_scalar', StandardScaler())])

X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

run_pipeline_setup.log_model(pipeline, model_name="pipeline")
run_pipeline_setup.set_completed_status()

# Run "linear regression": fit an ordinary least squares model on the scaled
# training split and log the model plus its evaluation metrics.
# The run references version 1 of both the dataset and the pipeline artifact.
dataset_artifact_ref = ArtifactRef(name="USA housing dataset", version=1)
pipeline_artifact_ref = ArtifactRef(name="pipeline", version=1)
run_linear_regression = mlaide_client.start_new_run(
    experiment_key='linear-regression',
    run_name='linear regression',
    used_artifacts=[dataset_artifact_ref, pipeline_artifact_ref],
)

# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises a TypeError. X_train has already been
# standardized by the StandardScaler pipeline, so it is not needed here.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

run_linear_regression.log_model(lin_reg, 'linear regression')

# Evaluate on the held-out test split.
test_pred = lin_reg.predict(X_test)

mae = metrics.mean_absolute_error(y_test, test_pred)
mse = metrics.mean_squared_error(y_test, test_pred)
rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time
r2 = metrics.r2_score(y_test, test_pred)
# Mean score of a 10-fold cross-validation of a fresh estimator on the full
# (unscaled) feature matrix.
cross_validation = cross_val_score(LinearRegression(), X, y, cv=10).mean()

run_linear_regression.log_metric('mae', mae)
run_linear_regression.log_metric('mse', mse)
run_linear_regression.log_metric('rmse', rmse)
run_linear_regression.log_metric('r2', r2)
run_linear_regression.log_metric('cross validation', cross_validation)

run_linear_regression.set_completed_status()

# Run "lasso regression": fit a Lasso model on the scaled training split and
# log its hyperparameters, the model and its evaluation metrics.
# The run references version 1 of both the dataset and the pipeline artifact.
dataset_artifact_ref = ArtifactRef(name="USA housing dataset", version=1)
pipeline_artifact_ref = ArtifactRef(name="pipeline", version=1)
run_lasso = mlaide_client.start_new_run(
    experiment_key='lasso-regression',
    run_name='lasso regression',
    used_artifacts=[dataset_artifact_ref, pipeline_artifact_ref],
)

# Hyperparameters of the tracked model.
alpha = 0.1
precompute = True
positive = True
selection = 'random'
random_state = 42

run_lasso.log_parameter('alpha', alpha)
run_lasso.log_parameter('precompute', precompute)
run_lasso.log_parameter('positive', positive)
run_lasso.log_parameter('selection', selection)
run_lasso.log_parameter('random state', random_state)

# Collected once so the fitted model and the cross-validated estimator below
# are guaranteed to share the same configuration.
lasso_params = dict(
    alpha=alpha,
    precompute=precompute,
    positive=positive,
    selection=selection,
    random_state=random_state,
)

model = Lasso(**lasso_params)
model.fit(X_train, y_train)

run_lasso.log_model(model, 'lasso')

# Evaluate on the held-out test split.
test_pred = model.predict(X_test)

mae = metrics.mean_absolute_error(y_test, test_pred)
mse = metrics.mean_squared_error(y_test, test_pred)
rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time
r2 = metrics.r2_score(y_test, test_pred)

# Cross-validate the configuration that is actually tracked in this run: the
# logged hyperparameters combined with the same standard scaling that was
# applied to the training data. Previously a default `Lasso()` was scored on
# the unscaled features, so the logged metric did not describe the logged
# model. Scaling lives inside the pipeline so it is re-fitted on every fold.
cv_estimator = Pipeline([
    ('std_scalar', StandardScaler()),
    ('lasso', Lasso(**lasso_params)),
])
cross_validation = cross_val_score(cv_estimator, X, y, cv=10).mean()

run_lasso.log_metric('mae', mae)
run_lasso.log_metric('mse', mse)
run_lasso.log_metric('rmse', rmse)
run_lasso.log_metric('r2', r2)
run_lasso.log_metric('cross validation', cross_validation)

run_lasso.set_completed_status()

0 commit comments