Skip to content
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

## [Unreleased]

### Added

* A method `add_baseline` that automatically trains and uploads a tabular classification model to the platform.

### Changed

* Migrated package name from [unbox](https://pypi.org/project/unbox/) to [openlayer](https://pypi.org/project/openlayer/) due to a company name change.
182 changes: 182 additions & 0 deletions openlayer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@

import marshmallow as ma
import pandas as pd
from Automunge import AutoMunge
from bentoml.saved_bundle import bundler
from bentoml.utils import tempdir

from . import api, exceptions, schemas, utils
from .baseline import QuickBaseline
from .datasets import Dataset
from .models import Model, ModelType, create_template_model
from .projects import Project
Expand Down Expand Up @@ -1206,6 +1208,186 @@ def add_dataframe(
project_id=project_id,
)

def add_baseline(
self,
task_type: TaskType,
class_names: List[str],
label_column_name: str,
commit_message: str,
train_df: pd.DataFrame = None,
val_df: pd.DataFrame = None,
ensemble_size: int = 10,
random_seed: int = 0,
timeout: int = 60,
per_run_limit: int = None,
project_id: str = None,
) -> Model:
"""Add a baseline model to the Openlayer platform. You only need to specify a training set
and we will automatically find and train a baseline model using AutoML.

Parameters
----------
task_type : :obj:`TaskType`
Type of ML task. E.g. :obj:`TaskType.TabularClassification`
.. important::
For now, the `add_baseline` method only supports tabular classification
tasks, so `task_type` must be equal to `TaskType.TabularClassification`
class_names : List[str]
List of class names corresponding to the outputs of your predict function.
E.g. `['positive', 'negative']`.
label_column_name : str
Column containing dataset labels
commit_message : str
Commit message for the model version.
train_df : pd.DataFrame, default None
Training set dataframe.
val_df : pd.DataFrame, default None
Validation set dataframe. If specified, will be added to the project.
ensemble_size : int, default 10
Number of models ensembled.
random_seed : int, default 0
Random seed to be used for model training.
timeout : int, default 60
Maximum time to train all the models.
per_run_limit : int, default None
Maximum time to train each model.

Returns
-------
:obj:`Model`
An object containing information about your uploaded model.

Examples
--------
.. seealso::
Our `sample notebooks
<https://github.com/unboxai/openlayer-python/tree/main/examples>`_ and
`tutorials <https://docs.openlayer.com/docs/overview-of-tutorial-tracks>`_.

First, instantiate the client:

>>> import openlayer
>>> client = openlayer.OpenlayerClient('YOUR_API_KEY_HERE')

Create a project if you don't have one:

>>> from openlayer.tasks import TaskType
>>> project = client.create_project(
... name="Churn Prediction",
... task_type=TaskType.TabularClassification,
... description="My first project!",
... )

If you already have a project created on the platform:

>>> project = client.load_project(name="Your project name")

Let's say your training set looks like the following:

>>> train_df
CreditScore Geography Balance Churned
0 618 France 321.92 1
1 714 Germany 102001.22 0
2 604 Spain 12333.15 0
.. ... ... ... ...

Now you can create a baseline model:

>>> class_names = ['Retained', 'Churned']
>>> label_column_name = 'Exited'
>>> project.add_baseline(
... class_names=class_names,
... label_column_name=label_column,
... train_df=train_df,
... val_df=val_df,
... ensemble_size=3,
... timeout=60*10,
... per_run_limit=None,
... commit_message="first commit!"
... )
"""
# ---------------------------- Schema validations ---------------------------- #
if task_type is not TaskType.TabularClassification:
raise exceptions.OpenlayerValidationError(
"The `add_baseline` method is only valid for TaskType.TabularClassification tasks. \n "
) from None
# --------------------------- Resource validations --------------------------- #
if len(train_df) < 3000:
raise exceptions.OpenlayerResourceError(
f"The training set specified as `train_df` is too small, with only {len(train_df)} rows. \n",
mitigation="Please provide a training set with at least 3000 rows.",
) from None

# Instantiate object
qb = QuickBaseline(
train_df=train_df,
label_column_name=label_column_name,
ensemble_size=ensemble_size,
random_seed=random_seed,
)

# Preprocess the training set
print("Preprocessing the training set")
preprocessing_dict, train_features_df = qb.preprocess_dataset()

# Get the column names and categorical feature names
col_names = qb.column_names
categorical_feature_names = qb.get_categorical_feature_names(train_features_df)

# Upload the validation set -- if there are issues, it's better to fail prior to model training
if val_df is not None:
self.add_dataframe(
df=val_df,
task_type=task_type,
project_id=project_id,
class_names=class_names,
label_column_name=label_column_name,
commit_message=commit_message,
feature_names=col_names,
categorical_feature_names=categorical_feature_names,
)

# Train model
print(
f"Training model for approximately {round(0.0166 * timeout, 2)} minute(s)."
)
model = qb.train_auto_classifiers(
timeout=timeout,
per_run_limit=per_run_limit,
train_features_df=train_features_df,
)

# Get model predict function for the upload process
predict_proba = qb.get_predict_function()

# Create requirements file
filename = "auto-requirements.txt"
with open(filename, "w") as f:
f.write("Automunge==8.30\n")
f.write("scikit-learn== 0.24.1")

# Upload model
model_info = self.add_model(
function=predict_proba,
task_type=task_type,
project_id=project_id,
model=model,
model_type=ModelType.sklearn,
class_names=class_names,
name=f"Baseline model",
commit_message=commit_message,
feature_names=col_names,
train_sample_df=train_df.sample(n=3000, random_state=random_seed),
train_sample_label_column_name=label_column_name,
categorical_feature_names=categorical_feature_names,
requirements_txt_file="auto-requirements.txt",
col_names=col_names,
preprocessor=AutoMunge(),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

anything that isn't exposed via the api should just be the default value for the arg.

Then you can remove from Automunge import AutoMunge from this file

preprocessing_dict=preprocessing_dict,
)

return model_info

@staticmethod
def _format_error_message(err) -> str:
"""Formats the error messaeges from Marshmallow"""
Expand Down
Loading