Completes OPEN-3366 Refactor dataset validations #82
```diff
@@ -10,7 +10,7 @@
 import yaml
 
 from . import api, exceptions, schemas, utils, validators
-from .datasets import Dataset
+from .datasets import Dataset, DatasetType
 from .models import Model
 from .projects import Project
 from .tasks import TaskType
```
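The `DatasetType` enum itself lives in the `.datasets` module and is not part of this diff. Judging from the docstring references further down and the `dataset_type.value` access in the new upload path, a plausible shape (an assumption, not the actual definition) is:

```python
# Hypothetical sketch only -- the real DatasetType is defined in the
# datasets module and is not shown in this PR; the string values are guesses.
from enum import Enum


class DatasetType(Enum):
    """Kind of dataset being uploaded to the platform."""

    Training = "training"
    Validation = "validation"
```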
```diff
@@ -274,8 +274,8 @@ def add_model(
 
         if failed_validations:
             raise exceptions.OpenlayerValidationError(
-                context="There are issues with the model package, as specified above. \n",
-                mitigation="Make sure to fix all of them before uploading the model.",
+                "There are issues with the model package. \n"
+                "Make sure to fix all of the issues listed above before the upload.",
             ) from None
 
         # ------ Start of temporary workaround for the arguments in the payload ------ #
```
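Note that the new call drops the `context=`/`mitigation=` keyword arguments in favor of a single positional message built from adjacent string literals. A minimal illustration of that Python mechanism:

```python
# Adjacent string literals are concatenated at compile time, so the two
# fragments below reach the exception constructor as one message string.
message = (
    "There are issues with the model package. \n"
    "Make sure to fix all of the issues listed above before the upload."
)
assert "\nMake sure" in message
```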
```diff
@@ -307,6 +307,7 @@ def add_model(
         utils.remove_python_version(model_package_dir)
 
         # Make sure the resulting model package is less than 2 GB
+        # TODO: this should depend on the subscription plan
         if float(os.path.getsize("model")) / 1e9 > 2:
             raise exceptions.OpenlayerResourceError(
                 context="There's an issue with the specified `model_package_dir`. \n",
```
```diff
@@ -342,13 +343,15 @@ def add_dataset(
         file_path: str,
         class_names: List[str],
         label_column_name: str,
+        dataset_type: DatasetType,
         feature_names: List[str] = [],
         text_column_name: Optional[str] = None,
         categorical_feature_names: List[str] = [],
         tag_column_name: Optional[str] = None,
         language: str = "en",
         sep: str = ",",
         commit_message: Optional[str] = None,
+        dataset_config_file_path: Optional[str] = None,
         project_id: str = None,
     ) -> Dataset:
         r"""Uploads a dataset to the Openlayer platform (from a csv).
```

Review thread on the new `dataset_config_file_path` parameter:

> **Reviewer:** need the docstring for this prop
>
> **Author:** Actually, since this is prob. not final, I'll update the docstrings later. I'll have to re-write the docstrings to follow the new commit-style anyway, so I'll make sure to update it all together. (Wrote down on my to-dos so I don't forget)
```diff
@@ -365,6 +368,9 @@ def add_dataset(
 
             .. important::
                 The labels in this column must be zero-indexed integer values.
+        dataset_type : :obj:`DatasetType`
+            Type of dataset. E.g. :obj:`DatasetType.Validation` or
+            :obj:`DatasetType.Training`.
         feature_names : List[str], default []
             List of input feature names. Only applicable if your ``task_type`` is
             :obj:`TaskType.TabularClassification` or :obj:`TaskType.TabularRegression`.
```
```diff
@@ -488,154 +494,36 @@ def add_dataset(
         ... )
         >>> dataset.to_dict()
         """
-        # ---------------------------- Schema validations ---------------------------- #
-        if task_type not in [
-            TaskType.TabularClassification,
-            TaskType.TextClassification,
-        ]:
-            raise exceptions.OpenlayerValidationError(
-                "`task_type` must be either TaskType.TabularClassification or "
-                "TaskType.TextClassification. \n"
-            ) from None
-        dataset_schema = schemas.DatasetSchema()
-        try:
-            dataset_schema.load(
-                {
-                    "file_path": file_path,
-                    "commit_message": commit_message,
-                    "class_names": class_names,
-                    "label_column_name": label_column_name,
-                    "tag_column_name": tag_column_name,
-                    "language": language,
-                    "sep": sep,
-                    "feature_names": feature_names,
-                    "text_column_name": text_column_name,
-                    "categorical_feature_names": categorical_feature_names,
-                }
-            )
-        except ma.ValidationError as err:
+        # ---------------------------- Dataset validations --------------------------- #
+        # TODO: re-think the way the arguments are passed for the dataset upload
+        dataset_config = None
+        if dataset_config_file_path is None:
+            dataset_config = {
+                "file_path": file_path,
+                "class_names": class_names,
+                "label_column_name": label_column_name,
+                "dataset_type": dataset_type.value,
+                "feature_names": feature_names,
+                "text_column_name": text_column_name,
+                "categorical_feature_names": categorical_feature_names,
+                "language": language,
+                "sep": sep,
+            }
+
+        dataset_validator = validators.DatasetValidator(
+            dataset_config_file_path=dataset_config_file_path,
+            dataset_config=dataset_config,
+            dataset_file_path=file_path,
+        )
+        failed_validations = dataset_validator.validate()
+
+        if failed_validations:
             raise exceptions.OpenlayerValidationError(
-                self._format_error_message(err)
+                "There are issues with the dataset and its config. \n"
+                "Make sure to fix all of the issues listed above before the upload.",
             ) from None
 
-        # --------------------------- Resource validations --------------------------- #
-        exp_file_path = os.path.expanduser(file_path)
-        object_name = "original.csv"
-        if not os.path.isfile(exp_file_path):
-            raise exceptions.OpenlayerResourceError(
-                f"File at path `{file_path}` does not contain the dataset. \n"
-            ) from None
-
-        with open(exp_file_path, "rt") as f:
-            reader = csv.reader(f, delimiter=sep)
-            headers = next(reader)
-            row_count = sum(1 for _ in reader)
-
-        df = pd.read_csv(file_path, sep=sep)
-
-        # Checking for null values
-        if df.isnull().values.any():
-            raise exceptions.OpenlayerResourceError(
-                context="There's an issue with the specified dataset. \n",
-                message="The dataset contains null values, which is currently "
-                "not supported. \n",
-                mitigation="Make sure to upload a dataset without null values.",
-            ) from None
-
-        # Validating if the labels are zero indexed ints
-        unique_labels = set(df[label_column_name].unique())
-        zero_indexed_set = set(range(len(class_names)))
-        if unique_labels != zero_indexed_set:
-            raise exceptions.OpenlayerResourceError(
-                context=f"There's an issue with values in the column "
-                f"`{label_column_name}` of the dataset. \n",
-                message=f"The labels in `{label_column_name}` must be "
-                "zero-indexed integer values. \n",
-                mitigation="Make sure to upload a dataset with zero-indexed "
-                "integer labels that match the list in `class_names`. "
-                f"For example, the class `{class_names[0]}` should be "
-                "represented as a 0 in the dataset, the class "
-                f"`{class_names[1]}` should be a 1, and so on.",
-            ) from None
-
-        # Validating the column dtypes
-        supported_dtypes = {"float32", "float64", "int32", "int64", "object"}
-        error_msg = ""
-        for col in df:
-            dtype = df[col].dtype.name
-            if dtype not in supported_dtypes:
-                error_msg += f"- Column `{col}` is of dtype {dtype}. \n"
-        if error_msg:
-            raise exceptions.OpenlayerResourceError(
-                context="There is an issue with some of the columns dtypes.\n",
-                message=error_msg,
-                mitigation=f"The supported dtypes are {supported_dtypes}. "
-                "Make sure to cast the above columns to a supported dtype.",
-            ) from None
-        # ------------------ Resource-schema consistency validations ----------------- #
-        # Label column validations
-        try:
-            headers.index(label_column_name)
-        except ValueError:
-            raise exceptions.OpenlayerDatasetInconsistencyError(
-                f"`{label_column_name}` specified as `label_column_name` is not "
-                "in the dataset. \n"
-            ) from None
-
-        if len(unique_labels) > len(class_names):
-            raise exceptions.OpenlayerDatasetInconsistencyError(
-                f"There are {len(unique_labels)} classes represented in the dataset, "
-                f"but only {len(class_names)} items in your `class_names`. \n",
-                mitigation=f"Make sure that there are at most {len(class_names)} "
-                "classes in your dataset.",
-            ) from None
-
-        # Feature validations
-        try:
-            if text_column_name:
-                feature_names = [text_column_name]
-            for feature_name in feature_names:
-                headers.index(feature_name)
-        except ValueError:
-            if text_column_name:
-                raise exceptions.OpenlayerDatasetInconsistencyError(
-                    f"`{text_column_name}` specified as `text_column_name` is not in "
-                    "the dataset. \n"
-                ) from None
-            else:
-                features_not_in_dataset = [
-                    feature for feature in feature_names if feature not in headers
-                ]
-                raise exceptions.OpenlayerDatasetInconsistencyError(
-                    f"Features {features_not_in_dataset} specified in `feature_names` "
-                    "are not in the dataset. \n"
-                ) from None
-        # Tag column validation
-        try:
-            if tag_column_name:
-                headers.index(tag_column_name)
-        except ValueError:
-            raise exceptions.OpenlayerDatasetInconsistencyError(
-                f"`{tag_column_name}` specified as `tag_column_name` is not in "
-                "the dataset. \n"
-            ) from None
-
-        # ----------------------- Subscription plan validations ---------------------- #
-        if row_count > self.subscription_plan["datasetRowCount"]:
-            raise exceptions.OpenlayerSubscriptionPlanException(
-                f"The dataset your are trying to upload contains {row_count} rows, "
-                "which exceeds your plan's limit of "
-                f"{self.subscription_plan['datasetRowCount']}. \n"
-            ) from None
-        if task_type == TaskType.TextClassification:
-            max_text_size = df[text_column_name].str.len().max()
-            if max_text_size > 1000:
-                raise exceptions.OpenlayerSubscriptionPlanException(
-                    "The dataset you are trying to upload contains rows with "
-                    f"{max_text_size} characters, which exceeds the 1000 character "
-                    "limit."
-                ) from None
 
         endpoint = f"projects/{project_id}/datasets"
         payload = dict(
             commitMessage=commit_message,
```

Review comment on the new `# TODO: re-think the way the arguments are passed for the dataset upload` line:

> **Reviewer:** ah
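For reference, the new flow can be exercised outside `add_dataset`; a sketch assuming `validators.DatasetValidator` keeps the constructor shown in the hunk above (file name and column names are placeholders):

```python
# Build the in-memory config exactly as add_dataset does when no YAML
# file is given, then run the validator directly.
dataset_config = {
    "file_path": "training_set.csv",
    "class_names": ["churned", "retained"],
    "label_column_name": "label",
    "dataset_type": DatasetType.Training.value,
    "feature_names": ["age", "plan", "usage_gb"],
    "text_column_name": None,
    "categorical_feature_names": ["plan"],
    "language": "en",
    "sep": ",",
}

dataset_validator = validators.DatasetValidator(
    dataset_config_file_path=None,
    dataset_config=dataset_config,
    dataset_file_path="training_set.csv",
)
failed_validations = dataset_validator.validate()
if failed_validations:
    # The caller raises a single summary exception; the individual failures
    # are expected to have been reported by the validator before this point.
    print(failed_validations)
```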
```diff
@@ -666,13 +554,15 @@ def add_dataframe(
         df: pd.DataFrame,
         class_names: List[str],
         label_column_name: str,
+        dataset_type: DatasetType,
         feature_names: List[str] = [],
         text_column_name: Optional[str] = None,
         categorical_feature_names: List[str] = [],
         commit_message: Optional[str] = None,
         tag_column_name: Optional[str] = None,
         language: str = "en",
         project_id: str = None,
+        dataset_config_file_path: Optional[str] = None,
     ) -> Dataset:
         r"""Uploads a dataset to the Openlayer platform (from a pandas DataFrame).
 
@@ -688,6 +578,9 @@ def add_dataframe(
 
             .. important::
                 The labels in this column must be zero-indexed integer values.
+        dataset_type : :obj:`DatasetType`
+            Type of dataset. E.g. :obj:`DatasetType.Validation` or
+            :obj:`DatasetType.Training`.
         feature_names : List[str], default []
             List of input feature names. Only applicable if your ``task_type`` is
             :obj:`TaskType.TabularClassification` or :obj:`TaskType.TabularRegression`.
 
@@ -820,13 +713,15 @@ def add_dataframe(
             task_type=task_type,
             class_names=class_names,
             label_column_name=label_column_name,
+            dataset_type=dataset_type,
             text_column_name=text_column_name,
             commit_message=commit_message,
             tag_column_name=tag_column_name,
             language=language,
             feature_names=feature_names,
             categorical_feature_names=categorical_feature_names,
             project_id=project_id,
+            dataset_config_file_path=dataset_config_file_path,
         )
 
     @staticmethod
```
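A corresponding hedged sketch for the in-memory path; the DataFrame contents and the `client` handle are illustrative:

```python
import pandas as pd

# Made-up two-row DataFrame matching the tabular-classification example above.
df = pd.DataFrame(
    {
        "age": [34, 51],
        "plan": ["basic", "pro"],
        "usage_gb": [1.2, 8.4],
        "label": [0, 1],
    }
)
dataset = client.add_dataframe(
    task_type=TaskType.TabularClassification,
    df=df,
    class_names=["churned", "retained"],
    label_column_name="label",
    dataset_type=DatasetType.Validation,  # new required argument
    feature_names=["age", "plan", "usage_gb"],
    project_id=project_id,
)
```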
Review thread on the new dataset config arguments:

> **Reviewer:** why'd we decide to not use a `.yaml` for this one but use it for model? bc it could be in memory?
>
> **Author:** this is not final. I feel like these args are still a bit awkward. I added the support for a yaml for consistency with the model upload. Since there, the user is constructing a model package, it made sense to ask for files. In the dataset case, the user is not preparing a package and, as you said, a lot of this info could be in memory (especially if they're using the `add_dataframe` method). I'm going to be re-thinking how to pass these args in the next few days.
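To make the trade-off discussed above concrete, here is what the file-based path could look like. The YAML keys are assumed to mirror the in-memory `dataset_config` dict from the diff; as the author notes, this interface is not final:

```python
# Assumed YAML mirror of the in-memory dataset_config dict; the key names
# come from the dict built in add_dataset, not from a documented schema.
import yaml

config_yaml = """\
file_path: training_set.csv
class_names:
  - churned
  - retained
label_column_name: label
dataset_type: training
feature_names:
  - age
  - plan
  - usage_gb
language: en
sep: ","
"""

with open("dataset_config.yaml", "w", encoding="utf-8") as f:
    f.write(config_yaml)

# The file then replaces the individual keyword arguments:
# client.add_dataset(..., dataset_config_file_path="dataset_config.yaml")
print(yaml.safe_load(config_yaml)["dataset_type"])  # -> training
```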