Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 23 additions & 6 deletions imblearn/over_sampling/_smote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def _validate_estimator(self):
)

def _make_samples(
self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0
self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0, y=None
):
"""A support function that returns artificial samples constructed along
the line connecting nearest neighbours.
Expand Down Expand Up @@ -98,6 +98,10 @@ def _make_samples(
step_size : float, default=1.0
The step size to create samples.

y : ndarray of shape (n_samples_all,), default=None
The true target associated with `nn_data`. Used by Borderline SMOTE-2 to
weight the distances in the sample generation process.

Returns
-------
X_new : {ndarray, sparse matrix} of shape (n_samples_new, n_features)
Expand All @@ -114,11 +118,13 @@ def _make_samples(
rows = np.floor_divide(samples_indices, nn_num.shape[1])
cols = np.mod(samples_indices, nn_num.shape[1])

X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type)
X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type, y)
y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype)
return X_new, y_new

def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type=None):
def _generate_samples(
self, X, nn_data, nn_num, rows, cols, steps, y_type=None, y=None
):
r"""Generate a synthetic sample.

The rule for the generation is:
Expand Down Expand Up @@ -153,15 +159,26 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type=None):
steps : ndarray of shape (n_samples,), dtype=float
Step sizes for new samples.

y_type : None
Unused parameter. Only for compatibility reason with SMOTE-NC.
y_type : str, int or None, default=None
Class label of the current target class for which we want to generate
samples.

y : ndarray of shape (n_samples_all,), default=None
The true target associated with `nn_data`. Used by Borderline SMOTE-2 to
weight the distances in the sample generation process.

Returns
-------
X_new : {ndarray, sparse matrix} of shape (n_samples, n_features)
Synthetically generated samples.
"""
diffs = nn_data[nn_num[rows, cols]] - X[rows]
if y is not None: # only entering for BorderlineSMOTE-2
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM, clever implementation.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On second thought, would it not be enough to just halve the diffs, given that we already multiply them by steps in lines 186/188?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The paper states that a random number should be used. If we simply halve the diffs, the weight is always exactly 0.5 instead of a random value drawn between 0 and 0.5.

random_state = check_random_state(self.random_state)
mask_pair_samples = y[nn_num[rows, cols]] != y_type
diffs[mask_pair_samples] *= random_state.uniform(
low=0.0, high=0.5, size=(mask_pair_samples.sum(), 1)
)

if sparse.issparse(X):
sparse_func = type(X).__name__
Expand Down Expand Up @@ -736,7 +753,7 @@ def _fit_resample(self, X, y):

return X_resampled, y_resampled

def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type):
def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type, y=None):
"""Generate a synthetic sample with an additional steps for the
categorical features.

Expand Down
3 changes: 3 additions & 0 deletions imblearn/over_sampling/_smote/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,10 @@ def _fit_resample(self, X, y):

if self.kind == "borderline-1":
X_to_sample_from = X_class # consider the positive class only
y_to_check_neighbors = None
else: # self.kind == "borderline-2"
X_to_sample_from = X # consider the whole dataset
y_to_check_neighbors = y

self.nn_k_.fit(X_to_sample_from)
nns = self.nn_k_.kneighbors(X_danger, return_distance=False)[:, 1:]
Expand All @@ -236,6 +238,7 @@ def _fit_resample(self, X, y):
X_to_sample_from,
nns,
n_samples,
y=y_to_check_neighbors,
)
if sparse.issparse(X_new):
X_resampled = sparse.vstack([X_resampled, X_new])
Expand Down