Posted on Dec 1, 2024

sagemaker pie

# Import Required Libraries

import os
import tempfile

import boto3
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# S3 locations used throughout the script: the bucket, the folder the input
# CSV is read from, the folder the trained model is written to, and the name
# of the input file.
BUCKET_NAME = "employee-data"
S3_INPUT_FOLDER = "inputfiles"
S3_OUTPUT_FOLDER = "ml-output"
FILE_NAME = "employee_cleaned_data.csv"

# Single S3 client shared by every download/upload step below.
s3_client = boto3.client("s3")

# Task 1: Load Data from S3

# Download the input CSV from S3 and load it into the DataFrame `df`.
# Key of the input object inside the bucket.
s3_file_key = f"{S3_INPUT_FOLDER}/{FILE_NAME}"

try:
    # A temporary directory is deleted automatically when the `with` block
    # exits, so no downloaded copy is left behind on disk, and the target
    # path is not held open by another file handle while boto3 writes to it.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file_path = os.path.join(temp_dir, FILE_NAME)
        s3_client.download_file(BUCKET_NAME, s3_file_key, temp_file_path)
        print(f"File downloaded successfully from s3://{BUCKET_NAME}/{s3_file_key}")

        # Load the dataset into a Pandas DataFrame while the file still exists.
        df = pd.read_csv(temp_file_path)
    print("Data loaded successfully!")
except Exception as e:
    print("Error loading data from S3:", e)
    # Every later step needs `df`; re-raise so the script stops here with the
    # real cause instead of failing afterwards with an unrelated NameError.
    raise

# Task 2: Preprocess Data
# Remove unique identifier column

# The operations id is dropped before modelling (when the column is present).
if "employee_operations_id" in df.columns:
    df = df.drop(columns=["employee_operations_id"])

# Keep only the first run of digits found in each 'region' value, as a float.
# Values without any digits become NaN.
if "region" in df.columns:
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    # expand=False makes str.extract return a Series rather than a
    # one-column DataFrame, which is what a column assignment expects.
    df["region"] = df["region"].str.extract(r"(\d+)", expand=False).astype(float)

# Task 3: Analyze and Visualize Data
# Remove duplicates

# Report how many fully duplicated rows exist, then drop them.
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate records: {duplicate_count}")
df = df.drop_duplicates()

# Pie chart of the share of each value in the 'gender' column (if present).
if 'gender' in df.columns:
    gender_counts = df['gender'].value_counts()
    slice_colors = ['#ff9999', '#66b3ff', '#99ff99']

    plt.figure(figsize=(8, 6))
    plt.pie(
        gender_counts,
        labels=gender_counts.index,
        autopct='%1.1f%%',
        startangle=140,
        colors=slice_colors,
    )
    plt.title('Gender Distribution')
    plt.show()

# Count plot for Education by Gender

# Grouped bar chart: number of rows per education level, split by gender.
# Drawn only when both columns exist in the DataFrame.
if {'education', 'gender'}.issubset(df.columns):
    plt.figure(figsize=(10, 6))
    sns.countplot(
        data=df,
        x='education',
        hue='gender',
        palette='Set2',
    )
    plt.title('Education Level Distribution by Gender')
    plt.xlabel('Education Level')
    plt.ylabel('Count')
    plt.xticks(rotation=45)  # tilt the tick labels on the x axis
    plt.show()

# Task 4: Feature Engineering
# Define dependent and independent variables

# Build the feature matrix: separate the target, one-hot encode the
# categorical columns, and keep the highest-scoring features.
dependent_variable = "turnover"  # Replace with actual column name

# Fail with a clear message if the target is missing; otherwise the later
# steps would stop on an undefined X_selected / Y instead.
if dependent_variable not in df.columns:
    raise KeyError(
        f"Target column {dependent_variable!r} not found in the dataset"
    )

X = df.drop(columns=[dependent_variable])
Y = df[dependent_variable]

# Preprocess with ColumnTransformer: object columns are one-hot encoded,
# int64/float64 columns are passed through unchanged.
categorical_columns = X.select_dtypes(include=["object"]).columns.tolist()
numerical_columns = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

column_transformer = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
        ("num", "passthrough", numerical_columns),
    ],
    # Always return a dense array: the StandardScaler step later in the
    # script centres the data, which is not supported on sparse matrices.
    sparse_threshold=0,
)
X_transformed = column_transformer.fit_transform(X)

# Feature selection: keep at most 5 features, never more than are available.
# f_classif is the ANOVA F-test intended for classification targets.
# NOTE(review): the selector is fitted on the full dataset before the
# train/test split, so test rows influence which features are kept; consider
# fitting it on the training split only.
selector = SelectKBest(score_func=f_classif, k=min(5, X_transformed.shape[1]))
X_selected = selector.fit_transform(X_transformed, Y)

# Task 5: Model Training and Evaluation

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    Y,
    test_size=0.2,
    random_state=0,
)

# Feature scaling

# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression model

# Fit a logistic-regression classifier on the scaled training data.
# fit() returns the estimator itself, so construction and fitting chain.
model = LogisticRegression(random_state=0).fit(X_train_scaled, y_train)

# Predictions and evaluation

# Score the model on the held-out split.
y_pred = model.predict(X_test_scaled)

# NOTE(review): precision/recall/F1 are called with their default settings,
# which presumably expect a binary target; confirm against the data.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

# Confusion matrix heatmap: rows are actual classes, columns are predictions.
cm = confusion_matrix(y_test, y_pred)
class_labels = ['Negative', 'Positive']  # assumes exactly two classes

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.title('Confusion Matrix')
plt.show()

# Task 6: Deploy Model

# Serialise the trained model and upload it to the S3 output folder.
# Key of the model object inside the bucket (reused by the prediction step).
s3_file_key = f"{S3_OUTPUT_FOLDER}/logistic_regression_model.pkl"

# The temporary directory (and the pickle inside it) is removed automatically
# when the block exits, so no model file is left behind locally, and the path
# is not held open by another handle while joblib writes to it.
with tempfile.TemporaryDirectory() as temp_dir:
    model_file_path = os.path.join(temp_dir, "logistic_regression_model.pkl")
    joblib.dump(model, model_file_path)

    try:
        s3_client.upload_file(model_file_path, BUCKET_NAME, s3_file_key)
        print(f"Model uploaded successfully to s3://{BUCKET_NAME}/{s3_file_key}")
    except Exception as e:
        # Best effort: report the upload failure and let the script continue.
        print("Error uploading the model:", e)

# Task 7: Prediction Using Deployed Model

# Download the stored model from S3, reload it, and re-score the test split.
try:
    # The temporary directory is cleaned up automatically on exit, so the
    # downloaded pickle does not accumulate on disk between runs.
    with tempfile.TemporaryDirectory() as temp_dir:
        model_file_path = os.path.join(temp_dir, "logistic_regression_model.pkl")
        s3_client.download_file(BUCKET_NAME, s3_file_key, model_file_path)

        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code. Only load models from a bucket you control.
        model = joblib.load(model_file_path)

    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Prediction Accuracy: {accuracy:.2f}")
except Exception as e:
    print("Error during prediction:", e)

DEV Community

sagemaker pie

Top comments (0)