9 changes: 8 additions & 1 deletion autoPyTorch/api/base_task.py
@@ -307,6 +307,7 @@ def _get_dataset_input_validator(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
**kwargs: Any
) -> Tuple[BaseDataset, BaseInputValidator]:
"""
Returns an object of a child class of `BaseDataset` and
@@ -353,6 +354,7 @@ def get_dataset(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
**kwargs: Any
) -> BaseDataset:
"""
Returns an object of a child class of `BaseDataset` according to the current task.
@@ -407,6 +409,10 @@ def get_dataset(
Subsampling takes into account classification labels and stratifies
accordingly. We guarantee that at least one occurrence of each
label is included in the sampled set.
kwargs (Any):
Can be used to pass task-specific dataset arguments. Currently supports
passing `feat_types` for tabular tasks, which specifies whether each
feature is 'numerical' or 'categorical'.

Returns:
BaseDataset:
@@ -420,7 +426,8 @@
resampling_strategy=resampling_strategy,
resampling_strategy_args=resampling_strategy_args,
dataset_name=dataset_name,
dataset_compression=dataset_compression)
dataset_compression=dataset_compression,
**kwargs)

return dataset

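For context, a minimal usage sketch of the new `**kwargs` plumbing (not part of this diff; the toy data and column layout are hypothetical):

```python
import pandas as pd

from autoPyTorch.api.tabular_classification import TabularClassificationTask

# 'device' is integer-coded but semantically categorical; feat_types lets
# the user say so instead of relying on dtype inference.
X_train = pd.DataFrame({'price': [1.0, 2.5, 3.2, 0.7],
                        'device': [0, 1, 0, 1]})
y_train = pd.Series([0, 1, 1, 0])

api = TabularClassificationTask()
dataset = api.get_dataset(
    X_train=X_train,
    y_train=y_train,
    # forwarded via **kwargs down to the TabularInputValidator
    feat_types=['numerical', 'categorical'],
)
```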
16 changes: 14 additions & 2 deletions autoPyTorch/api/tabular_classification.py
@@ -168,6 +168,7 @@ def _get_dataset_input_validator(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
**kwargs: Any,
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
Returns an object of `TabularDataset` and an object of
@@ -194,6 +195,9 @@ def _get_dataset_input_validator(
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more info check
documentation for `BaseTask.get_dataset`.
kwargs (Any):
Currently, for tabular tasks, expects `feat_types (Optional[List[str]])`, which
specifies whether each feature is 'numerical' or 'categorical'.

Returns:
TabularDataset:
@@ -206,12 +210,14 @@ def _get_dataset_input_validator(
resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \
self.resampling_strategy_args

feat_types = kwargs.pop('feat_types', None)
# Create a validator object to make sure that the data provided by
# the user matches the autopytorch requirements
input_validator = TabularInputValidator(
is_classification=True,
logger_port=self._logger_port,
dataset_compression=dataset_compression
dataset_compression=dataset_compression,
feat_types=feat_types
)

# Fit an input validator to check the provided data
@@ -238,6 +244,7 @@ def search(
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
dataset_name: Optional[str] = None,
feat_types: Optional[List[str]] = None,
budget_type: str = 'epochs',
min_budget: int = 5,
max_budget: int = 50,
@@ -266,6 +273,10 @@ def search(
A pair of features (X_train) and targets (y_train) used to fit a
pipeline. Additionally, a holdout of these pairs (X_test, y_test) can
be provided to track the generalization performance of each stage.
feat_types (Optional[List[str]]):
Description of the feature type of each column.
Accepts `numerical` for integer and float data, and `categorical`
for categorical, string and bool data. Defaults to None.
optimize_metric (str):
name of the metric that is used to evaluate a pipeline.
budget_type (str):
@@ -433,7 +444,8 @@ def search(
resampling_strategy=self.resampling_strategy,
resampling_strategy_args=self.resampling_strategy_args,
dataset_name=dataset_name,
dataset_compression=self._dataset_compression)
dataset_compression=self._dataset_compression,
feat_types=feat_types)
Collaborator:
Can we check near here that `feat_types` includes only the possible options, i.e. either numerical or categorical?

Contributor Author:
I have added this check to the tabular feature validator (see `_validate_feat_types` below).


return self._search(
dataset=self.dataset,
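A usage sketch for the new `feat_types` argument of `search` (the toy data, metric and time limit are illustrative; `total_walltime_limit` is the existing budget parameter of `search`):

```python
import pandas as pd

from autoPyTorch.api.tabular_classification import TabularClassificationTask

X = pd.DataFrame({'age': [23, 45, 31, 50, 62, 38, 27, 55],
                  'city': ['a', 'b', 'a', 'c', 'b', 'a', 'c', 'b']})
X['city'] = X['city'].astype('category')  # cast string columns to 'category' dtype
y = pd.Series([0, 1, 0, 1, 1, 0, 0, 1])

api = TabularClassificationTask()
api.search(
    X_train=X,
    y_train=y,
    optimize_metric='accuracy',
    total_walltime_limit=60,
    feat_types=['numerical', 'categorical'],  # one entry per column, case-insensitive
)
```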
16 changes: 14 additions & 2 deletions autoPyTorch/api/tabular_regression.py
@@ -169,6 +169,7 @@ def _get_dataset_input_validator(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
**kwargs: Any
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
Returns an object of `TabularDataset` and an object of
@@ -195,6 +196,9 @@ def _get_dataset_input_validator(
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more info check
documentation for `BaseTask.get_dataset`.
kwargs (Any):
Currently, for tabular tasks, expects `feat_types (Optional[List[str]])`, which
specifies whether each feature is 'numerical' or 'categorical'.

Returns:
TabularDataset:
the dataset object.
@@ -206,12 +210,14 @@ def _get_dataset_input_validator(
resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \
self.resampling_strategy_args

feat_types = kwargs.pop('feat_types', None)
# Create a validator object to make sure that the data provided by
# the user matches the autopytorch requirements
input_validator = TabularInputValidator(
is_classification=False,
logger_port=self._logger_port,
dataset_compression=dataset_compression
dataset_compression=dataset_compression,
feat_types=feat_types
)

# Fit an input validator to check the provided data
@@ -238,6 +244,7 @@ def search(
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
dataset_name: Optional[str] = None,
feat_types: Optional[List[str]] = None,
budget_type: str = 'epochs',
min_budget: int = 5,
max_budget: int = 50,
@@ -266,6 +273,10 @@ def search(
A pair of features (X_train) and targets (y_train) used to fit a
pipeline. Additionally, a holdout of these pairs (X_test, y_test) can
be provided to track the generalization performance of each stage.
feat_types (Optional[List[str]]):
Description of the feature type of each column.
Accepts `numerical` for integer and float data, and `categorical`
for categorical, string and bool data. Defaults to None.
optimize_metric (str):
Name of the metric that is used to evaluate a pipeline.
budget_type (str):
@@ -434,7 +445,8 @@ def search(
resampling_strategy=self.resampling_strategy,
resampling_strategy_args=self.resampling_strategy_args,
dataset_name=dataset_name,
dataset_compression=self._dataset_compression)
dataset_compression=self._dataset_compression,
feat_types=feat_types)

return self._search(
dataset=self.dataset,
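The regression API mirrors this; a brief sketch under the same assumptions ('r2' taken as the regression metric name):

```python
import pandas as pd

from autoPyTorch.api.tabular_regression import TabularRegressionTask

X = pd.DataFrame({'size': [30.0, 55.5, 72.0, 41.2, 65.0, 48.3],
                  'zone': ['a', 'b', 'a', 'c', 'b', 'a']})
X['zone'] = X['zone'].astype('category')
y = pd.Series([100.0, 180.0, 240.0, 130.0, 210.0, 155.0])

api = TabularRegressionTask()
api.search(
    X_train=X,
    y_train=y,
    optimize_metric='r2',
    total_walltime_limit=60,
    feat_types=['numerical', 'categorical'],
)
```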
2 changes: 1 addition & 1 deletion autoPyTorch/data/base_feature_validator.py
@@ -35,7 +35,7 @@ def __init__(
logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None,
):
# Register types to detect unsupported data format changes
self.feat_type: Optional[List[str]] = None
self.feat_types: Optional[List[str]] = None
self.data_type: Optional[type] = None
self.dtypes: List[str] = []
self.column_order: List[str] = []
Expand Down
95 changes: 82 additions & 13 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -94,12 +94,18 @@ class TabularFeatureValidator(BaseFeatureValidator):
List of indices of numerical columns
categorical_columns (List[int]):
List of indices of categorical columns
feat_types (Optional[List[str]]):
Description of the feature type of each column.
Accepts `numerical` for integer and float data, and `categorical`
for categorical, string and bool data.
"""
def __init__(
self,
logger: Optional[Union[PicklableClientLogger, Logger]] = None,
feat_types: Optional[List[str]] = None,
):
super().__init__(logger)
self.feat_types = feat_types

@staticmethod
def _comparator(cmp1: str, cmp2: str) -> int:
@@ -167,9 +173,9 @@ def _fit(
if not X.select_dtypes(include='object').empty:
X = self.infer_objects(X)

self.transformed_columns, self.feat_type = self._get_columns_to_encode(X)
self.transformed_columns, self.feat_types = self.get_columns_to_encode(X)

assert self.feat_type is not None
assert self.feat_types is not None

if len(self.transformed_columns) > 0:

@@ -186,8 +192,8 @@ def _fit(
# The column transformer reorders the feature types
# therefore, we need to change the order of columns as well
# This means categorical columns are shifted to the left
self.feat_type = sorted(
self.feat_type,
self.feat_types = sorted(
self.feat_types,
key=functools.cmp_to_key(self._comparator)
)

@@ -201,7 +207,7 @@ def _fit(
for cat in encoded_categories
]

for i, type_ in enumerate(self.feat_type):
for i, type_ in enumerate(self.feat_types):
if 'numerical' in type_:
self.numerical_columns.append(i)
else:
@@ -336,7 +342,7 @@ def _check_data(

# Define the column to be encoded here as the feature validator is fitted once
# per estimator
self.transformed_columns, self.feat_type = self._get_columns_to_encode(X)
self.transformed_columns, self.feat_types = self.get_columns_to_encode(X)

column_order = [column for column in X.columns]
if len(self.column_order) > 0:
@@ -361,12 +367,72 @@ def _check_data(
else:
self.dtypes = dtypes

def get_columns_to_encode(
self,
X: pd.DataFrame
) -> Tuple[List[str], List[str]]:
"""
Return the columns to be transformed as well as
the type of feature for each column.

The returned values depend on the `feat_types` passed to `__init__`.

Args:
X (pd.DataFrame)
A set of features that are going to be validated (type and dimensionality
checks) and used to fit an encoder in case the data needs encoding.

Returns:
transformed_columns (List[str]):
Columns to encode, if any
feat_types (List[str]):
Type of each column, either 'numerical' or 'categorical'.
"""
transformed_columns, feat_types = self._get_columns_to_encode(X)
if self.feat_types is not None:
self._validate_feat_types(X)
transformed_columns = [X.columns[i] for i, col in enumerate(self.feat_types)
if col.lower() == 'categorical']
return transformed_columns, self.feat_types
else:
return transformed_columns, feat_types

def _validate_feat_types(self, X: pd.DataFrame) -> None:
"""
Checks that the passed `feat_types` is compatible with what
AutoPyTorch expects, i.e., it should contain only `numerical`
or `categorical` entries, and the number of feature types must
equal the number of features. The check is case-insensitive.

Args:
X (pd.DataFrame):
input features set

Raises:
ValueError:
if the number of feat_types is not equal to the number of features,
or if a feature type is not one of 'numerical' or 'categorical'
"""
assert self.feat_types is not None # mypy check

if len(self.feat_types) != len(X.columns):
raise ValueError(f"Expected number of `feat_types`: {len(self.feat_types)}"
f" to be the same as the number of features {len(X.columns)}")
for feat_type in set(self.feat_types):
if feat_type.lower() not in ['numerical', 'categorical']:
raise ValueError(f"Expected type of features to be in `['numerical', "
f"'categorical']`, but got {feat_type}")

def _get_columns_to_encode(
self,
X: pd.DataFrame,
) -> Tuple[List[str], List[str]]:
"""
Return the columns to be encoded from a pandas dataframe
Return the columns to be transformed as well as
the type of feature for each column from a pandas dataframe.

If `self.feat_types` is not None, it also validates that the
dataframe dtypes don't disagree with the ones passed in `__init__`.

Args:
X (pd.DataFrame)
@@ -380,21 +446,24 @@ def _get_columns_to_encode(
Type of each column, either 'numerical' or 'categorical'
"""

if len(self.transformed_columns) > 0 and self.feat_type is not None:
return self.transformed_columns, self.feat_type
if len(self.transformed_columns) > 0 and self.feat_types is not None:
return self.transformed_columns, self.feat_types

# Register if a column needs encoding
transformed_columns = []

# Also, register the feature types for the estimator
feat_type = []
feat_types = []

# Make sure each column is a valid type
for i, column in enumerate(X.columns):
if X[column].dtype.name in ['category', 'bool']:

transformed_columns.append(column)
feat_type.append('categorical')
if self.feat_types is not None and self.feat_types[i].lower() == 'numerical':
raise ValueError(f"Passed numerical as the feature type for column: {column} "
f"but the column is categorical")
feat_types.append('categorical')
# Move away from np.issubdtype as it causes
# TypeError: data type not understood in certain pandas types
elif not is_numeric_dtype(X[column]):
@@ -434,8 +503,8 @@ def _get_columns_to_encode(
)
)
else:
feat_type.append('numerical')
return transformed_columns, feat_type
feat_types.append('numerical')
return transformed_columns, feat_types

def list_to_dataframe(
self,
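To make the new validator behaviour concrete, a minimal sketch against the code above (toy data; logger wiring omitted). A user-supplied `feat_types` overrides dtype inference, and inconsistent specifications raise `ValueError`:

```python
import pandas as pd

from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator

X = pd.DataFrame({'f0': [0.1, 0.5, 0.9], 'f1': ['a', 'b', 'a']})
X['f1'] = X['f1'].astype('category')

validator = TabularFeatureValidator(feat_types=['numerical', 'categorical'])
validator.fit(X)
# The _comparator-based sort shifts categorical columns to the left,
# so the single categorical feature maps to index 0:
print(validator.categorical_columns, validator.numerical_columns)  # [0] [1]

# Wrong number of entries -> ValueError from _validate_feat_types
X_num = pd.DataFrame({'f0': [0.1, 0.5], 'f1': [1.0, 2.0]})
try:
    TabularFeatureValidator(feat_types=['numerical']).fit(X_num)
except ValueError as err:
    print(err)

# Declaring a pandas 'category' column as numerical -> ValueError
# from _get_columns_to_encode
try:
    TabularFeatureValidator(feat_types=['numerical', 'numerical']).fit(X)
except ValueError as err:
    print(err)
```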
11 changes: 9 additions & 2 deletions autoPyTorch/data/tabular_validator.py
@@ -1,6 +1,6 @@
# -*- encoding: utf-8 -*-
import logging
from typing import Optional, Tuple, Union
from typing import List, Optional, Tuple, Union

import numpy as np

@@ -41,18 +41,24 @@ class TabularInputValidator(BaseInputValidator):
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more info check
documentation for `BaseTask.get_dataset`.
feat_types (Optional[List[str]]):
Description of the feature type of each column.
Accepts `numerical` for integer and float data, and `categorical`
for categorical, string and bool data.
"""
def __init__(
self,
is_classification: bool = False,
logger_port: Optional[int] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
feat_types: Optional[List[str]] = None,
seed: int = 42,
):
self.dataset_compression = dataset_compression
self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
self.is_classification = is_classification
self.logger_port = logger_port
self.feat_types = feat_types
self.seed = seed
if self.logger_port is not None:
self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger(
@@ -63,7 +69,8 @@ def __init__(
self.logger = logging.getLogger('Validation')

self.feature_validator = TabularFeatureValidator(
logger=self.logger)
logger=self.logger,
feat_types=self.feat_types)
self.target_validator = TabularTargetValidator(
is_classification=self.is_classification,
logger=self.logger
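Finally, a sketch of the composite validator, which just forwards `feat_types` to its `TabularFeatureValidator` (`fit`/`transform` signatures assumed unchanged from `BaseInputValidator`):

```python
import pandas as pd

from autoPyTorch.data.tabular_validator import TabularInputValidator

X = pd.DataFrame({'age': [23, 45, 31, 50],
                  'city': ['a', 'b', 'a', 'c']})
X['city'] = X['city'].astype('category')
y = pd.Series([0, 1, 0, 1])

validator = TabularInputValidator(
    is_classification=True,
    feat_types=['numerical', 'categorical'],
)
validator.fit(X_train=X, y_train=y)
X_t, y_t = validator.transform(X, y)
```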