Skip to content
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

## [Unreleased]

### Added

* A method `add_baseline` that automatically trains and uploads a tabular classification model to the platform.

### Changed

* Migrated package name from [unbox](https://pypi.org/project/unbox/) to [openlayer](https://pypi.org/project/openlayer/) due to a company name change.
182 changes: 182 additions & 0 deletions openlayer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@

import marshmallow as ma
import pandas as pd
from Automunge import AutoMunge
from bentoml.saved_bundle import bundler
from bentoml.utils import tempdir

from . import api, exceptions, schemas, utils
from .baseline import QuickBaseline
from .datasets import Dataset
from .models import Model, ModelType, create_template_model
from .projects import Project
Expand Down Expand Up @@ -1206,6 +1208,186 @@ def add_dataframe(
project_id=project_id,
)

def add_baseline(
self,
task_type: TaskType,
class_names: List[str],
label_column_name: str,
commit_message: str,
train_df: pd.DataFrame = None,
val_df: pd.DataFrame = None,
ensemble_size: int = 10,
random_seed: int = 0,
timeout: int = 60,
per_run_limit: int = None,
project_id: str = None,
) -> Model:
"""Add a baseline model to the Openlayer platform. You only need to specify a training set
and we will automatically find and train a baseline model using AutoML.

Parameters
----------
task_type : :obj:`TaskType`
Type of ML task. E.g. :obj:`TaskType.TabularClassification`
.. important::
For now, the `add_baseline` method only supports tabular classification
tasks, so `task_type` must be equal to `TaskType.TabularClassification`
class_names : List[str]
List of class names corresponding to the outputs of your predict function.
E.g. `['positive', 'negative']`.
label_column_name : str
Column containing dataset labels
commit_message : str
Commit message for the model version.
train_df : pd.DataFrame, default None
Training set dataframe.
val_df : pd.DataFrame, default None
Validation set dataframe. If specified, will be added to the project.
ensemble_size : int, default 10
Number of models ensembled.
random_seed : int, default 0
Random seed to be used for model training.
timeout : int, default 60
Maximum time to train all the models.
per_run_limit : int, default None
Maximum time to train each model.

Returns
-------
:obj:`Model`
An object containing information about your uploaded model.

Examples
--------
.. seealso::
Our `sample notebooks
<https://github.com/unboxai/openlayer-python/tree/main/examples>`_ and
`tutorials <https://docs.openlayer.com/docs/overview-of-tutorial-tracks>`_.

First, instantiate the client:

>>> import openlayer
>>> client = openlayer.OpenlayerClient('YOUR_API_KEY_HERE')

Create a project if you don't have one:

>>> from openlayer.tasks import TaskType
>>> project = client.create_project(
... name="Churn Prediction",
... task_type=TaskType.TabularClassification,
... description="My first project!",
... )

If you already have a project created on the platform:

>>> project = client.load_project(name="Your project name")

Let's say your training set looks like the following:

>>> train_df
CreditScore Geography Balance Churned
0 618 France 321.92 1
1 714 Germany 102001.22 0
2 604 Spain 12333.15 0
.. ... ... ... ...

Now you can create a baseline model:

>>> class_names = ['Retained', 'Churned']
>>> label_column_name = 'Exited'
>>> project.add_baseline(
... class_names=class_names,
... label_column_name=label_column,
... train_df=train_df,
... val_df=val_df,
... ensemble_size=3,
... timeout=60*10,
... per_run_limit=None,
... commit_message="first commit!"
... )
"""
# ---------------------------- Schema validations ---------------------------- #
if task_type is not TaskType.TabularClassification:
raise exceptions.OpenlayerValidationError(
"The `add_baseline` method is only valid for TaskType.TabularClassification tasks. \n "
) from None
# --------------------------- Resource validations --------------------------- #
if len(train_df) < 3000:
raise exceptions.OpenlayerResourceError(
f"The training set specified as `train_df` is too small, with only {len(train_df)} rows. \n",
mitigation="Please provide a training set with at least 3000 rows.",
) from None

# Instantiate object
qb = QuickBaseline(
train_df=train_df,
label_column_name=label_column_name,
ensemble_size=ensemble_size,
random_seed=random_seed,
)

# Preprocess the training set
print("Preprocessing the training set")
preprocessing_dict, train_features_df = qb.preprocess_dataset()

# Get the column names and categorical feature names
col_names = qb.column_names
categorical_feature_names = qb.get_categorical_feature_names(train_features_df)

# Upload the validation set -- if there are issues, it's better to fail prior to model training
if val_df is not None:
self.add_dataframe(
df=val_df,
task_type=task_type,
project_id=project_id,
class_names=class_names,
label_column_name=label_column_name,
commit_message=commit_message,
feature_names=col_names,
categorical_feature_names=categorical_feature_names,
)

# Train model
print(
f"Training model for approximately {round(0.0166 * timeout, 2)} minute(s)."
)
model = qb.train_auto_classifiers(
timeout=timeout,
per_run_limit=per_run_limit,
train_features_df=train_features_df,
)

# Get model predict function for the upload process
predict_proba = qb.get_predict_function()

# Create requirements file
filename = "auto-requirements.txt"
with open(filename, "w") as f:
f.write("Automunge==8.30\n")
f.write("scikit-learn== 0.24.1")

# Upload model
model_info = self.add_model(
function=predict_proba,
task_type=task_type,
project_id=project_id,
model=model,
model_type=ModelType.sklearn,
class_names=class_names,
name=f"Baseline model",
commit_message=commit_message,
feature_names=col_names,
train_sample_df=train_df.sample(n=3000, random_state=random_seed),
train_sample_label_column_name=label_column_name,
categorical_feature_names=categorical_feature_names,
requirements_txt_file="auto-requirements.txt",
col_names=col_names,
preprocessor=AutoMunge(),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

anything that isn't exposed via the api should just be the default value for the arg.

Then you can remove from Automunge import AutoMunge from this file

preprocessing_dict=preprocessing_dict,
)

return model_info

@staticmethod
def _format_error_message(err) -> str:
"""Formats the error messaeges from Marshmallow"""
Expand Down
Loading