leetoo
diff --git a/‎delta_lake_scikit_learn_local_training_and_serving/code/requirements.txt‎
Lines changed: 1 addition & 0 deletions b/‎delta_lake_scikit_learn_local_training_and_serving/code/requirements.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎delta_lake_scikit_learn_local_training_and_serving/code/scikit_boston_housing.py‎
Lines changed: 105 additions & 0 deletions b/‎delta_lake_scikit_learn_local_training_and_serving/code/scikit_boston_housing.py‎
Lines changed: 105 additions & 0 deletions
diff --git a/‎delta_lake_scikit_learn_local_training_and_serving/delta_lake_scikit_learn_local_training_and_serving.py‎
Lines changed: 51 additions & 0 deletions b/‎delta_lake_scikit_learn_local_training_and_serving/delta_lake_scikit_learn_local_training_and_serving.py‎
Lines changed: 51 additions & 0 deletions
diff --git a/‎delta_lake_scikit_learn_local_training_and_serving/profile/open-datasets.share‎
Lines changed: 5 additions & 0 deletions b/‎delta_lake_scikit_learn_local_training_and_serving/profile/open-datasets.share‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎delta_lake_scikit_learn_local_training_and_serving/requirements.txt‎
Lines changed: 4 additions & 0 deletions b/‎delta_lake_scikit_learn_local_training_and_serving/requirements.txt‎
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1 @@
+delta-sharing
@@ -0,0 +1,105 @@
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
+from __future__ import print_function
+
+import argparse
+import os
+import numpy as np
+
+import joblib
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_squared_error, r2_score
+import delta_sharing
+
+
+if __name__ == "__main__":
+ print("Training Started")
+ parser = argparse.ArgumentParser()
+
+ # Hyperparameters are described here. In this simple example we are just including one hyperparameter.
+ parser.add_argument("--max_leaf_nodes", type=int, default=-1)
+
+ # Sagemaker specific arguments. Defaults are set in the environment variables.
+ parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
+ parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
+ parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+
+ args = parser.parse_args()
+ print("Got Args: {}".format(args))
+
+ # Take the profile file, create a SharingClient, and read data from the delta lake table
+ profile_files = [os.path.join(args.train, file) for file in os.listdir(args.train)]
+ if len(profile_files) == 0:
+ raise ValueError(
+ (
+ "There are no files in {}.\n"
+ + "This usually indicates that the channel ({}) was incorrectly specified,\n"
+ + "the data specification in S3 was incorrectly specified or the role specified\n"
+ + "does not have permission to access the data."
+ ).format(args.train, "train")
+ )
+
+ profile_file = profile_files[0]
+ print(f'Found profile file: {profile_file}')
+
+ # Create a SharingClient
+ client = delta_sharing.SharingClient(profile_file)
+ table_url = profile_file + "#delta_sharing.default.boston-housing"
+
+ # Load the table as a Pandas DataFrame
+ print('Loading boston-housing table from Delta Lake')
+ train_data = delta_sharing.load_as_pandas(table_url)
+ print(f'Train data shape: {train_data.shape}')
+
+ # Drop null values - THIS SHOULD BE DONE IN PRE-PROCESSING STAGE AS BEST PRACTISE
+ train_data.dropna(inplace=True)
+
+ # Split the data into training and testing sets
+ X = train_data.iloc[:, 1:14]
+ Y = train_data.iloc[:, 14]
+
+ X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5)
+ print(f'X_train.shape: {X_train.shape}')
+ print(f'X_test.shape: {X_test.shape}')
+ print(f'Y_train.shape: {Y_train.shape}')
+ print(f'Y_test.shape: {Y_test.shape}')
+
+ linear_model = LinearRegression()
+ linear_model.fit(X_train, Y_train)
+
+ # model evaluation for training set
+ y_train_predict = linear_model.predict(X_train)
+ rmse = (np.sqrt(mean_squared_error(Y_train, y_train_predict)))
+ r2 = r2_score(Y_train, y_train_predict)
+
+ print("The model performance for training set")
+ print("--------------------------------------")
+ print(f'RMSE is {rmse}')
+ print(f'R2 score is {r2}')
+
+ # Save model
+ joblib.dump(linear_model, os.path.join(args.model_dir, "model.joblib"))
+
+ print("Training Completed")
+
+
+def model_fn(model_dir):
+ """Deserialized and return fitted model
+
+ Note that this should have the same name as the serialized model in the main method
+ """
+ clf = joblib.load(os.path.join(model_dir, "model.joblib"))
+ return clf
@@ -0,0 +1,51 @@
+# This is a sample Python program that trains a simple scikit-learn model
+# on the boston-housing dataset fetched from Delta Lake.
+# This implementation will work on your *local computer* or in the *AWS Cloud*.
+#
+# Delta Sharing: An Open Protocol for Secure Data Sharing
+# https://github.com/delta-io/delta-sharing
+#
+# Prerequisites:
+# 1. Install required Python packages:
+# `pip install -r requirements.txt`
+# 2. Docker Desktop installed and running on your computer:
+# `docker ps`
+# 3. You should have AWS credentials configured on your local machine
+# in order to be able to pull the docker image from ECR.
+###############################################################################################
+
+
+from sagemaker.sklearn import SKLearn
+
+
+DUMMY_IAM_ROLE = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'
+
+
+def main():
+
+ print('Starting model training.')
+ print('Note: if launching for the first time in local mode, container image download might take a few minutes to complete.')
+
+ sklearn = SKLearn(
+ entry_point="scikit_boston_housing.py",
+ source_dir='code',
+ framework_version="0.23-1",
+ instance_type="local",
+ role=DUMMY_IAM_ROLE
+ )
+
+ delta_lake_profile_file = "file://./profile/open-datasets.share"
+
+ sklearn.fit({"train": delta_lake_profile_file})
+ print('Completed model training')
+
+ # print('Deploying endpoint in local mode')
+ # predictor = sklearn.deploy(initial_instance_count=1, instance_type='local')
+ #
+ #
+ # print('About to delete the endpoint to stop paying (if in cloud mode).')
+ # predictor.delete_endpoint(predictor.endpoint_name)
+
+
+if __name__ == "__main__":
+ main()
@@ -0,0 +1,5 @@
+{
+ "shareCredentialsVersion": 1,
+ "endpoint": "https://sharing.delta.io/delta-sharing/",
+ "bearerToken": "faaie590d541265bcab1f2de9813274bf233"
+}
@@ -0,0 +1,4 @@
+numpy
+pandas
+sagemaker>=2.0.0,<3.0.0
+sagemaker[local]
-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +numpy
 +pandas
 +sagemaker>=2.0.0,<3.0.0
 +sagemaker[local]