Skip to content

Commit d82f9c0

Browse files
committed
[fix] Add hyperparameter space for the minority coalescer
1 parent 3efadc3 commit d82f9c0

File tree

5 files changed

+35
-15
lines changed

5 files changed

+35
-15
lines changed

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,42 @@
11
from typing import Any, Dict, Optional, Union
22

3+
from ConfigSpace.configuration_space import ConfigurationSpace
4+
from ConfigSpace.hyperparameters import UniformFloatHyperparameter
5+
36
import numpy as np
47

58
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer
9+
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
610
from autoPyTorch.utils.implementations import MinorityCoalesceTransformer
711

812

913
class MinorityCoalescer(BaseCoalescer):
10-
"""Group together categories whose occurence is less than a specified min_fraction """
11-
def __init__(self, min_fraction: float, random_state: np.random.RandomState):
14+
"""Group together categories whose occurrence is less than a specified min_frac """
15+
def __init__(self, min_frac: float, random_state: np.random.RandomState):
1216
super().__init__()
13-
self.min_fraction = min_fraction
17+
self.min_frac = min_frac
1418
self.random_state = random_state
1519

1620
def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:
1721

1822
self.check_requirements(X, y)
1923

20-
self.preprocessor['categorical'] = MinorityCoalesceTransformer(min_fraction=self.min_fraction)
24+
self.preprocessor['categorical'] = MinorityCoalesceTransformer(min_frac=self.min_frac)
2125
return self
2226

27+
@staticmethod
28+
def get_hyperparameter_search_space(
29+
dataset_properties: Optional[Dict[str, Any]] = None,
30+
min_frac: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_frac',
31+
value_range=(1e-4, 0.5),
32+
default_value=1e-2,
33+
),
34+
) -> ConfigurationSpace:
35+
36+
cs = ConfigurationSpace()
37+
add_hyperparameter(cs, min_frac, UniformFloatHyperparameter)
38+
return cs
39+
2340
@staticmethod
2441
def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
2542
return {

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]:
4343
Dict[str, autoPyTorchComponent]: all BaseCoalescer components available
4444
as choices for coalescing the categorical columns
4545
"""
46+
# TODO: Create `@property def components(): ...`.
4647
components = OrderedDict()
4748
components.update(_coalescer)
4849
components.update(_addons.components)

autoPyTorch/utils/implementations.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,13 @@ def get_properties() -> Dict[str, Any]:
6666

6767

6868
class MinorityCoalesceTransformer(BaseEstimator, TransformerMixin):
69-
""" Group together categories whose occurrence is less than a specified min_fraction."""
70-
def __init__(self, min_fraction: Optional[float] = None):
71-
self.min_fraction = min_fraction
69+
""" Group together categories whose occurrence is less than a specified min_frac."""
70+
def __init__(self, min_frac: Optional[float] = None):
71+
self.min_frac = min_frac
7272
self._categories_to_coalesce: Optional[List[np.ndarray]] = None
7373

74-
if self.min_fraction is not None and (self.min_fraction < 0 or self.min_fraction > 1):
75-
raise ValueError(f"min_fraction for {self.__class__.__name__} must be in [0, 1], but got {min_fraction}")
74+
if self.min_frac is not None and (self.min_frac < 0 or self.min_frac > 1):
75+
raise ValueError(f"min_frac for {self.__class__.__name__} must be in [0, 1], but got {min_frac}")
7676

7777
def _check_dataset(self, X: Union[np.ndarray, sparse.csr_matrix]) -> None:
7878
"""
@@ -135,7 +135,7 @@ def fit(self, X: Union[np.ndarray, sparse.csr_matrix],
135135
self._check_dataset(X)
136136
n_instances, n_features = X.shape
137137

138-
if self.min_fraction is None:
138+
if self.min_frac is None:
139139
self._categories_to_coalesce = [np.array([]) for _ in range(n_features)]
140140
return self
141141

@@ -145,7 +145,7 @@ def fit(self, X: Union[np.ndarray, sparse.csr_matrix],
145145
col_data = self._get_column_data(X=X, col_idx=col, is_sparse=is_sparse)
146146
unique_vals, counts = np.unique(col_data, return_counts=True)
147147
frac = counts / n_instances
148-
categories_to_coalesce.append(unique_vals[frac < self.min_fraction])
148+
categories_to_coalesce.append(unique_vals[frac < self.min_frac])
149149

150150
self._categories_to_coalesce = categories_to_coalesce
151151
return self
@@ -166,7 +166,7 @@ def transform(
166166
if self._categories_to_coalesce is None:
167167
raise RuntimeError("fit() must be called before transform()")
168168

169-
if self.min_fraction is None:
169+
if self.min_frac is None:
170170
return X
171171

172172
n_features = X.shape[1]

test/test_pipeline/components/preprocessing/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
44
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import \
55
TabularColumnTransformer
6+
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import CoalescerChoice
67
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import EncoderChoice
78
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
89
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
@@ -29,6 +30,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]],
2930
default_dataset_properties.update(dataset_properties)
3031

3132
steps.extend([
33+
("coalescer", CoalescerChoice())
3234
("imputer", SimpleImputer()),
3335
("variance_threshold", VarianceThreshold()),
3436
("encoder", EncoderChoice(default_dataset_properties)),

test/test_utils/test_coalescer_transformer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def test_default(X1):
5050

5151
def test_coalesce_10_percent(X1):
5252
X = X1
53-
Y = MinorityCoalesceTransformer(min_fraction=.1).fit_transform(X)
53+
Y = MinorityCoalesceTransformer(min_frac=.1).fit_transform(X)
5454
for col in range(Y.shape[1]):
5555
hist = np.histogram(Y[:, col], bins=np.arange(-2, 7))
5656
np.testing.assert_array_almost_equal(hist[0], [10, 0, 0, 0, 0, 30, 30, 30])
@@ -60,7 +60,7 @@ def test_coalesce_10_percent(X1):
6060

6161
def test_coalesce_10_percent_sparse(X1):
6262
X = scipy.sparse.csc_matrix(X1)
63-
Y = MinorityCoalesceTransformer(min_fraction=.1).fit_transform(X)
63+
Y = MinorityCoalesceTransformer(min_frac=.1).fit_transform(X)
6464
# Assert no copies were made
6565
assert id(X) == id(Y)
6666
Y = Y.todense()
@@ -80,7 +80,7 @@ def test_transform_after_fit(X1, X2):
8080
X_fit = X1 # Here categories 3, 4, 5 have occurrence above 10%
8181
X_transf = X2 # Here it is the opposite, just categs 6 and 7 are above 10%
8282

83-
mc = MinorityCoalesceTransformer(min_fraction=.1).fit(X_fit)
83+
mc = MinorityCoalesceTransformer(min_frac=.1).fit(X_fit)
8484

8585
# transform() should coalesce categories as learned during fit.
8686
# Category distribution in X_transf should be irrelevant.

0 commit comments

Comments
 (0)