automl
diff --git a/‎autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py‎
Lines changed: 21 additions & 4 deletions b/‎autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py‎
Lines changed: 21 additions & 4 deletions
diff --git a/‎autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎autoPyTorch/utils/implementations.py‎
Lines changed: 8 additions & 8 deletions b/‎autoPyTorch/utils/implementations.py‎
Lines changed: 8 additions & 8 deletions
diff --git a/‎test/test_pipeline/components/preprocessing/base.py‎
Lines changed: 2 additions & 0 deletions b/‎test/test_pipeline/components/preprocessing/base.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎test/test_utils/test_coalescer_transformer.py‎
Lines changed: 3 additions & 3 deletions b/‎test/test_utils/test_coalescer_transformer.py‎
Lines changed: 3 additions & 3 deletions
@@ -1,25 +1,42 @@
 from typing import Any, Dict, Optional, Union
 
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import UniformFloatHyperparameter
+
 import numpy as np
 
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer
+from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
 from autoPyTorch.utils.implementations import MinorityCoalesceTransformer
 
 
 class MinorityCoalescer(BaseCoalescer):
- """Group together categories whose occurence is less than a specified min_fraction """
- def __init__(self, min_fraction: float, random_state: np.random.RandomState):
+ """Group together categories whose occurrence is less than a specified min_frac """
+ def __init__(self, min_frac: float, random_state: np.random.RandomState):
  super().__init__()
- self.min_fraction = min_fraction
+ self.min_frac = min_frac
  self.random_state = random_state
 
  def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:
 
  self.check_requirements(X, y)
 
- self.preprocessor['categorical'] = MinorityCoalesceTransformer(min_fraction=self.min_fraction)
+ self.preprocessor['categorical'] = MinorityCoalesceTransformer(min_frac=self.min_frac)
  return self
 
+ @staticmethod
+ def get_hyperparameter_search_space(
+ dataset_properties: Optional[Dict[str, Any]] = None,
+ min_frac: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_frac',
+ value_range=(1e-4, 0.5),
+ default_value=1e-2,
+ ),
+ ) -> ConfigurationSpace:
+
+ cs = ConfigurationSpace()
+ add_hyperparameter(cs, min_frac, UniformFloatHyperparameter)
+ return cs
+
  @staticmethod
  def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
  return {
 
@@ -43,6 +43,7 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]:
  Dict[str, autoPyTorchComponent]: all BaseCoalescer components available
  as choices for coalescer the categorical columns
  """
+ # TODO: Create `@property def components(): ...`.
  components = OrderedDict()
  components.update(_coalescer)
  components.update(_addons.components)
 
@@ -66,13 +66,13 @@ def get_properties() -> Dict[str, Any]:
 
 
 class MinorityCoalesceTransformer(BaseEstimator, TransformerMixin):
- """ Group together categories whose occurrence is less than a specified min_fraction."""
- def __init__(self, min_fraction: Optional[float] = None):
- self.min_fraction = min_fraction
+ """ Group together categories whose occurrence is less than a specified min_frac."""
+ def __init__(self, min_frac: Optional[float] = None):
+ self.min_frac = min_frac
  self._categories_to_coalesce: Optional[List[np.ndarray]] = None
 
- if self.min_fraction is not None and (self.min_fraction < 0 or self.min_fraction > 1):
- raise ValueError(f"min_fraction for {self.__class__.__name__} must be in [0, 1], but got {min_fraction}")
+ if self.min_frac is not None and (self.min_frac < 0 or self.min_frac > 1):
+ raise ValueError(f"min_frac for {self.__class__.__name__} must be in [0, 1], but got {min_frac}")
 
  def _check_dataset(self, X: Union[np.ndarray, sparse.csr_matrix]) -> None:
  """
@@ -135,7 +135,7 @@ def fit(self, X: Union[np.ndarray, sparse.csr_matrix],
  self._check_dataset(X)
  n_instances, n_features = X.shape
 
- if self.min_fraction is None:
+ if self.min_frac is None:
  self._categories_to_coalesce = [np.array([]) for _ in range(n_features)]
  return self
 
@@ -145,7 +145,7 @@ def fit(self, X: Union[np.ndarray, sparse.csr_matrix],
  col_data = self._get_column_data(X=X, col_idx=col, is_sparse=is_sparse)
  unique_vals, counts = np.unique(col_data, return_counts=True)
  frac = counts / n_instances
- categories_to_coalesce.append(unique_vals[frac < self.min_fraction])
+ categories_to_coalesce.append(unique_vals[frac < self.min_frac])
 
  self._categories_to_coalesce = categories_to_coalesce
  return self
@@ -166,7 +166,7 @@ def transform(
  if self._categories_to_coalesce is None:
  raise RuntimeError("fit() must be called before transform()")
 
- if self.min_fraction is None:
+ if self.min_frac is None:
  return X
 
  n_features = X.shape[1]
 
@@ -3,6 +3,7 @@
 from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import \
  TabularColumnTransformer
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import CoalescerChoice
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import EncoderChoice
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
@@ -29,6 +30,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]],
  default_dataset_properties.update(dataset_properties)
 
  steps.extend([
+ ("coalescer", CoalescerChoice(default_dataset_properties)),
  ("imputer", SimpleImputer()),
  ("variance_threshold", VarianceThreshold()),
  ("encoder", EncoderChoice(default_dataset_properties)),
 
@@ -50,7 +50,7 @@ def test_default(X1):
 
 def test_coalesce_10_percent(X1):
  X = X1
- Y = MinorityCoalesceTransformer(min_fraction=.1).fit_transform(X)
+ Y = MinorityCoalesceTransformer(min_frac=.1).fit_transform(X)
  for col in range(Y.shape[1]):
  hist = np.histogram(Y[:, col], bins=np.arange(-2, 7))
  np.testing.assert_array_almost_equal(hist[0], [10, 0, 0, 0, 0, 30, 30, 30])
@@ -60,7 +60,7 @@ def test_coalesce_10_percent(X1):
 
 def test_coalesce_10_percent_sparse(X1):
  X = scipy.sparse.csc_matrix(X1)
- Y = MinorityCoalesceTransformer(min_fraction=.1).fit_transform(X)
+ Y = MinorityCoalesceTransformer(min_frac=.1).fit_transform(X)
  # Assert no copies were made
  assert id(X) == id(Y)
  Y = Y.todense()
@@ -80,7 +80,7 @@ def test_transform_after_fit(X1, X2):
  X_fit = X1 # Here categories 3, 4, 5 have ocurrence above 10%
  X_transf = X2 # Here it is the opposite, just categs 6 and 7 are above 10%
 
- mc = MinorityCoalesceTransformer(min_fraction=.1).fit(X_fit)
+ mc = MinorityCoalesceTransformer(min_frac=.1).fit(X_fit)
 
  # transform() should coalesce categories as learned during fit.
  # Category distribution in X_transf should be irrelevant.