9 changes: 8 additions & 1 deletion autoPyTorch/api/base_task.py
@@ -307,6 +307,7 @@ def _get_dataset_input_validator(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
**kwargs: Any
) -> Tuple[BaseDataset, BaseInputValidator]:
"""
Returns an object of a child class of `BaseDataset` and
@@ -353,6 +354,7 @@ def get_dataset(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
**kwargs: Any
) -> BaseDataset:
"""
Returns an object of a child class of `BaseDataset` according to the current task.
@@ -407,6 +409,10 @@ def get_dataset(
Subsampling takes into account classification labels and stratifies
accordingly. We guarantee that at least one occurrence of each
label is included in the sampled set.
kwargs (Any):
Can be used to pass task-specific dataset arguments. Currently supports
passing `feat_types` for tabular tasks, which specifies whether each
feature is 'numerical' or 'categorical'.

Returns:
BaseDataset:
@@ -420,7 +426,8 @@
resampling_strategy=resampling_strategy,
resampling_strategy_args=resampling_strategy_args,
dataset_name=dataset_name,
dataset_compression=dataset_compression)
dataset_compression=dataset_compression,
**kwargs)

return dataset

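For context, a minimal usage sketch of the new `**kwargs` plumbing (not part of this diff; the toy data and column layout are hypothetical):

```python
import pandas as pd

from autoPyTorch.api.tabular_classification import TabularClassificationTask

# 'device' is integer-coded but semantically categorical; feat_types lets
# the user say so instead of relying on dtype inference.
X_train = pd.DataFrame({'price': [1.0, 2.5, 3.2, 0.7],
                        'device': [0, 1, 0, 1]})
y_train = pd.Series([0, 1, 1, 0])

api = TabularClassificationTask()
dataset = api.get_dataset(
    X_train=X_train,
    y_train=y_train,
    # forwarded via **kwargs down to the TabularInputValidator
    feat_types=['numerical', 'categorical'],
)
```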
16 changes: 14 additions & 2 deletions autoPyTorch/api/tabular_classification.py
@@ -168,6 +168,7 @@ def _get_dataset_input_validator(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
**kwargs: Any,
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
Returns an object of `TabularDataset` and an object of
@@ -194,6 +195,9 @@ def _get_dataset_input_validator(
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more info check
documentation for `BaseTask.get_dataset`.
kwargs (Any):
Currently, for tabular tasks, expects `feat_types (Optional[List[str]])`, which
specifies whether each feature is 'numerical' or 'categorical'.

Returns:
TabularDataset:
@@ -206,12 +210,14 @@ def _get_dataset_input_validator(
resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \
self.resampling_strategy_args

feat_types = kwargs.pop('feat_types', None)
# Create a validator object to make sure that the data provided by
# the user matches the autopytorch requirements
input_validator = TabularInputValidator(
is_classification=True,
logger_port=self._logger_port,
dataset_compression=dataset_compression
dataset_compression=dataset_compression,
feat_types=feat_types
)

# Fit an input validator to check the provided data
@@ -238,6 +244,7 @@ def search(
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
dataset_name: Optional[str] = None,
feat_types: Optional[List[str]] = None,
budget_type: str = 'epochs',
min_budget: int = 5,
max_budget: int = 50,
@@ -266,6 +273,10 @@ def search(
A pair of features (X_train) and targets (y_train) used to fit a
pipeline. Additionally, a holdout of these pairs (X_test, y_test) can
be provided to track the generalization performance of each stage.
feat_types (Optional[List[str]]):
Description of the feature type of each column.
Accepts `numerical` for integer and float data, and `categorical`
for categorical, string and bool data. Defaults to None.
optimize_metric (str):
name of the metric that is used to evaluate a pipeline.
budget_type (str):
@@ -433,7 +444,8 @@ def search(
resampling_strategy=self.resampling_strategy,
resampling_strategy_args=self.resampling_strategy_args,
dataset_name=dataset_name,
dataset_compression=self._dataset_compression)
dataset_compression=self._dataset_compression,
feat_types=feat_types)
Collaborator:
Can we check near here that `feat_types` includes only the possible options, i.e. either numerical or categorical?

Contributor Author:
I have added this check to the tabular feature validator (see `_validate_feat_types` below).


return self._search(
dataset=self.dataset,
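A usage sketch for the new `feat_types` argument of `search` (the toy data, metric and time limit are illustrative; `total_walltime_limit` is the existing budget parameter of `search`):

```python
import pandas as pd

from autoPyTorch.api.tabular_classification import TabularClassificationTask

X = pd.DataFrame({'age': [23, 45, 31, 50, 62, 38, 27, 55],
                  'city': ['a', 'b', 'a', 'c', 'b', 'a', 'c', 'b']})
X['city'] = X['city'].astype('category')  # cast string columns to 'category' dtype
y = pd.Series([0, 1, 0, 1, 1, 0, 0, 1])

api = TabularClassificationTask()
api.search(
    X_train=X,
    y_train=y,
    optimize_metric='accuracy',
    total_walltime_limit=60,
    feat_types=['numerical', 'categorical'],  # one entry per column, case-insensitive
)
```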
16 changes: 14 additions & 2 deletions autoPyTorch/api/tabular_regression.py
@@ -169,6 +169,7 @@ def _get_dataset_input_validator(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
**kwargs: Any
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
Returns an object of `TabularDataset` and an object of
@@ -195,6 +196,9 @@ def _get_dataset_input_validator(
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more info check
documentation for `BaseTask.get_dataset`.
kwargs (Any):
Currently, for tabular tasks, expects `feat_types (Optional[List[str]])`, which
specifies whether each feature is 'numerical' or 'categorical'.

Returns:
TabularDataset:
the dataset object.
@@ -206,12 +210,14 @@ def _get_dataset_input_validator(
resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \
self.resampling_strategy_args

feat_types = kwargs.pop('feat_types', None)
# Create a validator object to make sure that the data provided by
# the user matches the autopytorch requirements
input_validator = TabularInputValidator(
is_classification=False,
logger_port=self._logger_port,
dataset_compression=dataset_compression
dataset_compression=dataset_compression,
feat_types=feat_types
)

# Fit an input validator to check the provided data
@@ -238,6 +244,7 @@ def search(
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
dataset_name: Optional[str] = None,
feat_types: Optional[List[str]] = None,
budget_type: str = 'epochs',
min_budget: int = 5,
max_budget: int = 50,
@@ -266,6 +273,10 @@ def search(
A pair of features (X_train) and targets (y_train) used to fit a
pipeline. Additionally, a holdout of these pairs (X_test, y_test) can
be provided to track the generalization performance of each stage.
feat_types (Optional[List[str]]):
Description of the feature type of each column.
Accepts `numerical` for integer and float data, and `categorical`
for categorical, string and bool data. Defaults to None.
optimize_metric (str):
Name of the metric that is used to evaluate a pipeline.
budget_type (str):
@@ -434,7 +445,8 @@ def search(
resampling_strategy=self.resampling_strategy,
resampling_strategy_args=self.resampling_strategy_args,
dataset_name=dataset_name,
dataset_compression=self._dataset_compression)
dataset_compression=self._dataset_compression,
feat_types=feat_types)

return self._search(
dataset=self.dataset,
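The regression API mirrors this; a brief sketch under the same assumptions ('r2' taken as the regression metric name):

```python
import pandas as pd

from autoPyTorch.api.tabular_regression import TabularRegressionTask

X = pd.DataFrame({'size': [30.0, 55.5, 72.0, 41.2, 65.0, 48.3],
                  'zone': ['a', 'b', 'a', 'c', 'b', 'a']})
X['zone'] = X['zone'].astype('category')
y = pd.Series([100.0, 180.0, 240.0, 130.0, 210.0, 155.0])

api = TabularRegressionTask()
api.search(
    X_train=X,
    y_train=y,
    optimize_metric='r2',
    total_walltime_limit=60,
    feat_types=['numerical', 'categorical'],
)
```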
2 changes: 1 addition & 1 deletion autoPyTorch/data/base_feature_validator.py
@@ -35,7 +35,7 @@ def __init__(
logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None,
):
# Register types to detect unsupported data format changes
self.feat_type: Optional[List[str]] = None
self.feat_types: Optional[List[str]] = None
self.data_type: Optional[type] = None
self.dtypes: List[str] = []
self.column_order: List[str] = []
Expand Down
95 changes: 82 additions & 13 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -94,12 +94,18 @@ class TabularFeatureValidator(BaseFeatureValidator):
List of indices of numerical columns
categorical_columns (List[int]):
List of indices of categorical columns
feat_types (Optional[List[str]]):
Description of the feature type of each column.
Accepts `numerical` for integer and float data, and `categorical`
for categorical, string and bool data.
"""
def __init__(
self,
logger: Optional[Union[PicklableClientLogger, Logger]] = None,
feat_types: Optional[List[str]] = None,
):
super().__init__(logger)
self.feat_types = feat_types

@staticmethod
def _comparator(cmp1: str, cmp2: str) -> int:
@@ -167,9 +173,9 @@ def _fit(
if not X.select_dtypes(include='object').empty:
X = self.infer_objects(X)

self.transformed_columns, self.feat_type = self._get_columns_to_encode(X)
self.transformed_columns, self.feat_types = self.get_columns_to_encode(X)

assert self.feat_type is not None
assert self.feat_types is not None

if len(self.transformed_columns) > 0:

@@ -186,8 +192,8 @@ def _fit(
# The column transformer reorders the feature types
# therefore, we need to change the order of columns as well
# This means categorical columns are shifted to the left
self.feat_type = sorted(
self.feat_type,
self.feat_types = sorted(
self.feat_types,
key=functools.cmp_to_key(self._comparator)
)

@@ -201,7 +207,7 @@ def _fit(
for cat in encoded_categories
]

for i, type_ in enumerate(self.feat_type):
for i, type_ in enumerate(self.feat_types):
if 'numerical' in type_:
self.numerical_columns.append(i)
else:
@@ -336,7 +342,7 @@ def _check_data(

# Define the column to be encoded here as the feature validator is fitted once
# per estimator
self.transformed_columns, self.feat_type = self._get_columns_to_encode(X)
self.transformed_columns, self.feat_types = self.get_columns_to_encode(X)

column_order = [column for column in X.columns]
if len(self.column_order) > 0:
@@ -361,12 +367,72 @@ def _check_data(
else:
self.dtypes = dtypes

def get_columns_to_encode(
self,
X: pd.DataFrame
) -> Tuple[List[str], List[str]]:
"""
Return the columns to be transformed as well as
the type of feature for each column.

The returned values depend on the `feat_types` passed to `__init__`.

Args:
X (pd.DataFrame)
A set of features that are going to be validated (type and dimensionality
checks) and used to fit an encoder in case the data needs encoding.

Returns:
transformed_columns (List[str]):
Columns to encode, if any
feat_types (List[str]):
Type of each column, either 'numerical' or 'categorical'.
"""
transformed_columns, feat_types = self._get_columns_to_encode(X)
if self.feat_types is not None:
self._validate_feat_types(X)
transformed_columns = [X.columns[i] for i, col in enumerate(self.feat_types)
if col.lower() == 'categorical']
return transformed_columns, self.feat_types
else:
return transformed_columns, feat_types

def _validate_feat_types(self, X: pd.DataFrame) -> None:
"""
Checks that the passed `feat_types` is compatible with what
AutoPyTorch expects, i.e., it should contain only `numerical`
or `categorical` entries, and the number of feature types must
equal the number of features. The check is case-insensitive.

Args:
X (pd.DataFrame):
input features set

Raises:
ValueError:
if the number of feat_types is not equal to the number of features,
or if a feature type is not one of 'numerical' or 'categorical'
"""
assert self.feat_types is not None # mypy check

if len(self.feat_types) != len(X.columns):
raise ValueError(f"Expected number of `feat_types`: {len(self.feat_types)}"
f" to be the same as the number of features {len(X.columns)}")
for feat_type in set(self.feat_types):
if feat_type.lower() not in ['numerical', 'categorical']:
raise ValueError(f"Expected type of features to be in `['numerical', "
f"'categorical']`, but got {feat_type}")

def _get_columns_to_encode(
self,
X: pd.DataFrame,
) -> Tuple[List[str], List[str]]:
"""
Return the columns to be encoded from a pandas dataframe
Return the columns to be transformed as well as
the type of feature for each column from a pandas dataframe.

If `self.feat_types` is not None, it also validates that the
dataframe dtypes don't disagree with the ones passed in `__init__`.

Args:
X (pd.DataFrame)
@@ -380,21 +446,24 @@ def _get_columns_to_encode(
Type of each column, either 'numerical' or 'categorical'
"""

if len(self.transformed_columns) > 0 and self.feat_type is not None:
return self.transformed_columns, self.feat_type
if len(self.transformed_columns) > 0 and self.feat_types is not None:
return self.transformed_columns, self.feat_types

# Register if a column needs encoding
transformed_columns = []

# Also, register the feature types for the estimator
feat_type = []
feat_types = []

# Make sure each column is a valid type
for i, column in enumerate(X.columns):
if X[column].dtype.name in ['category', 'bool']:

transformed_columns.append(column)
feat_type.append('categorical')
if self.feat_types is not None and self.feat_types[i].lower() == 'numerical':
raise ValueError(f"Passed numerical as the feature type for column: {column} "
f"but the column is categorical")
feat_types.append('categorical')
# Move away from np.issubdtype as it causes
# TypeError: data type not understood in certain pandas types
elif not is_numeric_dtype(X[column]):
@@ -434,8 +503,8 @@ def _get_columns_to_encode(
)
)
else:
feat_type.append('numerical')
return transformed_columns, feat_type
feat_types.append('numerical')
return transformed_columns, feat_types

def list_to_dataframe(
self,
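To make the new validator behaviour concrete, a minimal sketch against the code above (toy data; logger wiring omitted). A user-supplied `feat_types` overrides dtype inference, and inconsistent specifications raise `ValueError`:

```python
import pandas as pd

from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator

X = pd.DataFrame({'f0': [0.1, 0.5, 0.9], 'f1': ['a', 'b', 'a']})
X['f1'] = X['f1'].astype('category')

validator = TabularFeatureValidator(feat_types=['numerical', 'categorical'])
validator.fit(X)
# The _comparator-based sort shifts categorical columns to the left,
# so the single categorical feature maps to index 0:
print(validator.categorical_columns, validator.numerical_columns)  # [0] [1]

# Wrong number of entries -> ValueError from _validate_feat_types
X_num = pd.DataFrame({'f0': [0.1, 0.5], 'f1': [1.0, 2.0]})
try:
    TabularFeatureValidator(feat_types=['numerical']).fit(X_num)
except ValueError as err:
    print(err)

# Declaring a pandas 'category' column as numerical -> ValueError
# from _get_columns_to_encode
try:
    TabularFeatureValidator(feat_types=['numerical', 'numerical']).fit(X)
except ValueError as err:
    print(err)
```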
11 changes: 9 additions & 2 deletions autoPyTorch/data/tabular_validator.py
@@ -1,6 +1,6 @@
# -*- encoding: utf-8 -*-
import logging
from typing import Optional, Tuple, Union
from typing import List, Optional, Tuple, Union

import numpy as np

@@ -41,18 +41,24 @@ class TabularInputValidator(BaseInputValidator):
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more info check
documentation for `BaseTask.get_dataset`.
feat_types (Optional[List[str]]):
Description of the feature type of each column.
Accepts `numerical` for integer and float data, and `categorical`
for categorical, string and bool data.
"""
def __init__(
self,
is_classification: bool = False,
logger_port: Optional[int] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
feat_types: Optional[List[str]] = None,
seed: int = 42,
):
self.dataset_compression = dataset_compression
self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
self.is_classification = is_classification
self.logger_port = logger_port
self.feat_types = feat_types
self.seed = seed
if self.logger_port is not None:
self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger(
@@ -63,7 +69,8 @@ def __init__(
self.logger = logging.getLogger('Validation')

self.feature_validator = TabularFeatureValidator(
logger=self.logger)
logger=self.logger,
feat_types=self.feat_types)
self.target_validator = TabularTargetValidator(
is_classification=self.is_classification,
logger=self.logger
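Finally, a sketch of the composite validator, which just forwards `feat_types` to its `TabularFeatureValidator` (`fit`/`transform` signatures assumed unchanged from `BaseInputValidator`):

```python
import pandas as pd

from autoPyTorch.data.tabular_validator import TabularInputValidator

X = pd.DataFrame({'age': [23, 45, 31, 50],
                  'city': ['a', 'b', 'a', 'c']})
X['city'] = X['city'].astype('category')
y = pd.Series([0, 1, 0, 1])

validator = TabularInputValidator(
    is_classification=True,
    feat_types=['numerical', 'categorical'],
)
validator.fit(X_train=X, y_train=y)
X_t, y_t = validator.transform(X, y)
```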