189 changes: 42 additions & 147 deletions openlayer/__init__.py
@@ -10,7 +10,7 @@
import yaml

from . import api, exceptions, schemas, utils, validators
from .datasets import Dataset
from .datasets import Dataset, DatasetType
from .models import Model
from .projects import Project
from .tasks import TaskType
@@ -274,8 +274,8 @@ def add_model(

if failed_validations:
raise exceptions.OpenlayerValidationError(
context="There are issues with the model package, as specified above. \n",
mitigation="Make sure to fix all of them before uploading the model.",
"There are issues with the model package. \n"
"Make sure to fix all of the issues listed above before the upload.",
) from None

# ------ Start of temporary workaround for the arguments in the payload ------ #
@@ -307,6 +307,7 @@ def add_model(
utils.remove_python_version(model_package_dir)

# Make sure the resulting model package is less than 2 GB
# TODO: this should depend on the subscription plan
if float(os.path.getsize("model")) / 1e9 > 2:
raise exceptions.OpenlayerResourceError(
context="There's an issue with the specified `model_package_dir`. \n",
@@ -342,13 +343,15 @@ def add_dataset(
file_path: str,
class_names: List[str],
label_column_name: str,
dataset_type: DatasetType,
feature_names: List[str] = [],
text_column_name: Optional[str] = None,
categorical_feature_names: List[str] = [],
Contributor: why'd we decide to not use a .yaml for this one but use it for model? bc it could be in memory?

Contributor (Author): this is not final. I feel like these args are still a bit awkward. I added the support for a yaml for consistency with the model upload. Since there the user is constructing a model package, it made sense to ask for files.

In the dataset case, the user is not preparing a package and, as you said, a lot of this info could be in memory (especially if they're using the add_dataframe method). I'm going to be re-thinking how to pass these args in the next few days.

tag_column_name: Optional[str] = None,
language: str = "en",
sep: str = ",",
commit_message: Optional[str] = None,
dataset_config_file_path: Optional[str] = None,
Contributor: need the docstring for this prop

Contributor (Author): Actually, since this is probably not final, I'll update the docstrings later. I'll have to re-write the docstrings to follow the new commit style anyway, so I'll make sure to update it all together. (Wrote it down on my to-dos so I don't forget.)

project_id: str = None,
) -> Dataset:
r"""Uploads a dataset to the Openlayer platform (from a csv).
@@ -365,6 +368,9 @@ def add_dataset(

.. important::
The labels in this column must be zero-indexed integer values.
dataset_type : :obj:`DatasetType`
Type of dataset. E.g. :obj:`DatasetType.Validation` or
:obj:`DatasetType.Training`.
feature_names : List[str], default []
List of input feature names. Only applicable if your ``task_type`` is
:obj:`TaskType.TabularClassification` or :obj:`TaskType.TabularRegression`.
@@ -488,154 +494,36 @@ def add_dataset(
... )
>>> dataset.to_dict()
"""
# ---------------------------- Schema validations ---------------------------- #
if task_type not in [
TaskType.TabularClassification,
TaskType.TextClassification,
]:
raise exceptions.OpenlayerValidationError(
"`task_type` must be either TaskType.TabularClassification or "
"TaskType.TextClassification. \n"
) from None
dataset_schema = schemas.DatasetSchema()
try:
dataset_schema.load(
{
"file_path": file_path,
"commit_message": commit_message,
"class_names": class_names,
"label_column_name": label_column_name,
"tag_column_name": tag_column_name,
"language": language,
"sep": sep,
"feature_names": feature_names,
"text_column_name": text_column_name,
"categorical_feature_names": categorical_feature_names,
}
)
except ma.ValidationError as err:
# ---------------------------- Dataset validations --------------------------- #
# TODO: re-think the way the arguments are passed for the dataset upload
Contributor: ah

dataset_config = None
if dataset_config_file_path is None:
dataset_config = {
"file_path": file_path,
"class_names": class_names,
"label_column_name": label_column_name,
"dataset_type": dataset_type.value,
"feature_names": feature_names,
"text_column_name": text_column_name,
"categorical_feature_names": categorical_feature_names,
"language": language,
"sep": sep,
}

dataset_validator = validators.DatasetValidator(
dataset_config_file_path=dataset_config_file_path,
dataset_config=dataset_config,
dataset_file_path=file_path,
)
failed_validations = dataset_validator.validate()

if failed_validations:
raise exceptions.OpenlayerValidationError(
self._format_error_message(err)
"There are issues with the dataset and its config. \n"
"Make sure to fix all of the issues listed above before the upload.",
) from None
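The inline checks deleted below raised on the first problem they hit; `validators.DatasetValidator` instead appears to accumulate every failure and report them all before the single raise above. A minimal sketch of that pattern, with internals assumed rather than taken from this diff:

import os


class DatasetValidator:
    """Sketch: collects all validation failures instead of raising on the first."""

    def __init__(self, dataset_config=None, dataset_config_file_path=None,
                 dataset_file_path=None):
        self.dataset_config = dataset_config
        self.dataset_config_file_path = dataset_config_file_path
        self.dataset_file_path = dataset_file_path

    def validate(self):
        failed_validations = []
        if self.dataset_config is None and self.dataset_config_file_path is None:
            failed_validations.append(
                "either `dataset_config` or `dataset_config_file_path` is required"
            )
        if self.dataset_file_path is None or not os.path.isfile(
            os.path.expanduser(self.dataset_file_path)
        ):
            failed_validations.append(
                f"`{self.dataset_file_path}` does not point to a file"
            )
        # ...null-value, label-indexing, and dtype checks would be appended here...
        for failure in failed_validations:
            print(failure)  # these are "the issues listed above" in the exception text
        return failed_validations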

# --------------------------- Resource validations --------------------------- #
exp_file_path = os.path.expanduser(file_path)
object_name = "original.csv"
if not os.path.isfile(exp_file_path):
raise exceptions.OpenlayerResourceError(
f"File at path `{file_path}` does not contain the dataset. \n"
) from None

with open(exp_file_path, "rt") as f:
reader = csv.reader(f, delimiter=sep)
headers = next(reader)
row_count = sum(1 for _ in reader)

df = pd.read_csv(file_path, sep=sep)

# Checking for null values
if df.isnull().values.any():
raise exceptions.OpenlayerResourceError(
context="There's an issue with the specified dataset. \n",
message="The dataset contains null values, which is currently "
"not supported. \n",
mitigation="Make sure to upload a dataset without null values.",
) from None

# Validating if the labels are zero indexed ints
unique_labels = set(df[label_column_name].unique())
zero_indexed_set = set(range(len(class_names)))
if unique_labels != zero_indexed_set:
raise exceptions.OpenlayerResourceError(
context=f"There's an issue with values in the column "
f"`{label_column_name}` of the dataset. \n",
message=f"The labels in `{label_column_name}` must be "
"zero-indexed integer values. \n",
mitigation="Make sure to upload a dataset with zero-indexed "
"integer labels that match the list in `class_names`. "
f"For example, the class `{class_names[0]}` should be "
"represented as a 0 in the dataset, the class "
f"`{class_names[1]}` should be a 1, and so on.",
) from None

# Validating the column dtypes
supported_dtypes = {"float32", "float64", "int32", "int64", "object"}
error_msg = ""
for col in df:
dtype = df[col].dtype.name
if dtype not in supported_dtypes:
error_msg += f"- Column `{col}` is of dtype {dtype}. \n"
if error_msg:
raise exceptions.OpenlayerResourceError(
context="There is an issue with some of the columns dtypes.\n",
message=error_msg,
mitigation=f"The supported dtypes are {supported_dtypes}. "
"Make sure to cast the above columns to a supported dtype.",
) from None
# ------------------ Resource-schema consistency validations ----------------- #
# Label column validations
try:
headers.index(label_column_name)
except ValueError:
raise exceptions.OpenlayerDatasetInconsistencyError(
f"`{label_column_name}` specified as `label_column_name` is not "
"in the dataset. \n"
) from None

if len(unique_labels) > len(class_names):
raise exceptions.OpenlayerDatasetInconsistencyError(
f"There are {len(unique_labels)} classes represented in the dataset, "
f"but only {len(class_names)} items in your `class_names`. \n",
mitigation=f"Make sure that there are at most {len(class_names)} "
"classes in your dataset.",
) from None

# Feature validations
try:
if text_column_name:
feature_names = [text_column_name]
for feature_name in feature_names:
headers.index(feature_name)
except ValueError:
if text_column_name:
raise exceptions.OpenlayerDatasetInconsistencyError(
f"`{text_column_name}` specified as `text_column_name` is not in "
"the dataset. \n"
) from None
else:
features_not_in_dataset = [
feature for feature in feature_names if feature not in headers
]
raise exceptions.OpenlayerDatasetInconsistencyError(
f"Features {features_not_in_dataset} specified in `feature_names` "
"are not in the dataset. \n"
) from None
# Tag column validation
try:
if tag_column_name:
headers.index(tag_column_name)
except ValueError:
raise exceptions.OpenlayerDatasetInconsistencyError(
f"`{tag_column_name}` specified as `tag_column_name` is not in "
"the dataset. \n"
) from None

# ----------------------- Subscription plan validations ---------------------- #
if row_count > self.subscription_plan["datasetRowCount"]:
raise exceptions.OpenlayerSubscriptionPlanException(
f"The dataset your are trying to upload contains {row_count} rows, "
"which exceeds your plan's limit of "
f"{self.subscription_plan['datasetRowCount']}. \n"
) from None
if task_type == TaskType.TextClassification:
max_text_size = df[text_column_name].str.len().max()
if max_text_size > 1000:
raise exceptions.OpenlayerSubscriptionPlanException(
"The dataset you are trying to upload contains rows with "
f"{max_text_size} characters, which exceeds the 1000 character "
"limit."
) from None

endpoint = f"projects/{project_id}/datasets"
payload = dict(
commitMessage=commit_message,
@@ -666,13 +554,15 @@ def add_dataframe(
df: pd.DataFrame,
class_names: List[str],
label_column_name: str,
dataset_type: DatasetType,
feature_names: List[str] = [],
text_column_name: Optional[str] = None,
categorical_feature_names: List[str] = [],
commit_message: Optional[str] = None,
tag_column_name: Optional[str] = None,
language: str = "en",
project_id: str = None,
dataset_config_file_path: Optional[str] = None,
) -> Dataset:
r"""Uploads a dataset to the Openlayer platform (from a pandas DataFrame).

@@ -688,6 +578,9 @@ def add_dataframe(

.. important::
The labels in this column must be zero-indexed integer values.
dataset_type : :obj:`DatasetType`
Type of dataset. E.g. :obj:`DatasetType.Validation` or
:obj:`DatasetType.Training`.
feature_names : List[str], default []
List of input feature names. Only applicable if your ``task_type`` is
:obj:`TaskType.TabularClassification` or :obj:`TaskType.TabularRegression`.
@@ -820,13 +713,15 @@ def add_dataframe(
task_type=task_type,
class_names=class_names,
label_column_name=label_column_name,
dataset_type=dataset_type,
text_column_name=text_column_name,
commit_message=commit_message,
tag_column_name=tag_column_name,
language=language,
feature_names=feature_names,
categorical_feature_names=categorical_feature_names,
project_id=project_id,
dataset_config_file_path=dataset_config_file_path,
)

@staticmethod
12 changes: 12 additions & 0 deletions openlayer/datasets.py
@@ -1,3 +1,15 @@
from enum import Enum


class DatasetType(Enum):
"""The different dataset types that are supported by Openlayer."""

#: For validation sets.
Validation = "validation"
#: For training sets.
Training = "training"


class Dataset:
"""An object containing information about a dataset on the Openlayer platform."""

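A quick sketch of how the enum round-trips to the string values used in `dataset_config` above:

from openlayer.datasets import DatasetType

dataset_type = DatasetType.Validation
assert dataset_type.value == "validation"                # serialized into dataset_config
assert DatasetType("training") is DatasetType.Training   # reconstructed from the string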
36 changes: 4 additions & 32 deletions openlayer/exceptions.py
@@ -20,43 +20,15 @@ def __init__(self, message, errcode=None):
super().__init__(f"<Response> {message}")


class OpenlayerResourceError(OpenlayerException):
def __init__(self, message, context=None, mitigation=None):
if not context:
context = "There is a problem with the specified file path. \n"
if not mitigation:
mitigation = (
"Make sure that the specified filepath contains the expected resource."
)
super().__init__(context + message + mitigation)


class OpenlayerValidationError(OpenlayerException):
def __init__(self, message, context=None, mitigation=None):
if not context:
context = "There are issues with some of the arguments: \n"
if not mitigation:
mitigation = (
"Make sure to respect the datatypes and constraints specified above."
)
super().__init__(context + message + mitigation)


class OpenlayerDatasetInconsistencyError(OpenlayerException):
def __init__(self, message, context=None, mitigation=None):
if not context:
context = "There are inconsistencies between the dataset and some of the arguments: \n"
if not mitigation:
mitigation = "Make sure that the value specified in the argument is a column header in the dataframe or csv being uploaded."
super().__init__(context + message + mitigation)
def __init__(self, message):
super().__init__(message)


class OpenlayerSubscriptionPlanException(OpenlayerException):
def __init__(self, message, context=None, mitigation=None):
if not context:
context = "You have reached your subscription plan's limits. \n"
if not mitigation:
mitigation = "To upgrade your plan, visit https://openlayer.com"
context = context or "You have reached your subscription plan's limits. \n"
mitigation = mitigation or "To upgrade your plan, visit https://openlayer.com"
super().__init__(context + message + mitigation)


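With these exception classes pared down to a plain message (the `context`/`mitigation` defaults survive only on the subscription-plan exception), callers now compose the full text themselves, as the new `add_model` and `add_dataset` raises above do:

raise exceptions.OpenlayerValidationError(
    "There are issues with the dataset and its config. \n"
    "Make sure to fix all of the issues listed above before the upload.",
) from None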
2 changes: 1 addition & 1 deletion openlayer/models.py
@@ -10,7 +10,7 @@ class ModelType(Enum):
"""

#: For custom built models.
custom = "Custom"
custom = "custom"
#: For models built with `fastText <https://fasttext.cc/>`_.
fasttext = "fasttext"
#: For models built with `Keras <https://keras.io/>`_.
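This makes the custom value lowercase, consistent with the other framework identifiers; e.g.:

assert ModelType.custom.value == "custom"  # previously "Custom"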
10 changes: 6 additions & 4 deletions openlayer/schemas.py
@@ -1,5 +1,6 @@
import marshmallow as ma

from .datasets import DatasetType
from .models import ModelType


@@ -55,9 +56,11 @@ class DatasetSchema(ma.Schema):
max=140,
),
)
tag_column_name = ma.fields.List(
ma.fields.Str(),
allow_none=True,
dataset_type = ma.fields.Str(
validate=ma.validate.OneOf(
[dataset_type.value for dataset_type in DatasetType],
error=f"`dataset_type` must be one of the supported frameworks. Check out our API reference for a full list https://reference.openlayer.com/reference/api/openlayer.DatasetType.html.\n ",
),
)
class_names = ma.fields.List(
ma.fields.Str(),
Expand All @@ -73,7 +76,6 @@ class DatasetSchema(ma.Schema):
sep = ma.fields.Str()
feature_names = ma.fields.List(
ma.fields.Str(),
allow_none=True,
)
text_column_name = ma.fields.Str(
allow_none=True,
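A hedged sketch of the new field's behavior under marshmallow (using `partial=True` so the schema's other required fields don't get in the way; which fields are required is not shown in this diff):

import marshmallow as ma

from openlayer.schemas import DatasetSchema

schema = DatasetSchema()
schema.load({"dataset_type": "validation"}, partial=True)  # passes the OneOf check
try:
    schema.load({"dataset_type": "test"}, partial=True)    # not a supported type
except ma.ValidationError as err:
    print(err.messages["dataset_type"])  # the OneOf error message defined above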
15 changes: 0 additions & 15 deletions openlayer/utils.py
@@ -99,18 +99,3 @@ def remove_python_version(dir: str):
dir (str): the directory to remove the file from.
"""
os.remove(f"{dir}/python_version")


def copy_to_tmp_dir(dir: str) -> str:
"""Copies the contents of the specified directory (`dir`) to a temporary directory.

Args:
dir (str): the directory to copy the contents from.

Returns:
str: the path to the temporary directory.
"""
tmp_dir = tempfile.mkdtemp()
distutils.dir_util.copy_tree(dir, tmp_dir)

return tmp_dir