From 8c07c3e2a366751860840ffeb536433b900dd70f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Jul 2023 19:55:14 +0200 Subject: [PATCH 1/2] ENH add categorical_encoder to SMOTEN --- imblearn/over_sampling/_smote/base.py | 44 +++++++++++++++++-- .../over_sampling/_smote/tests/test_smoten.py | 21 +++++++++ 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index 967f59d6f..3e834f707 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -13,6 +13,7 @@ import numpy as np from scipy import sparse +from sklearn.base import clone from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder from sklearn.utils import _safe_indexing, check_array, check_random_state from sklearn.utils.sparsefuncs_fast import ( @@ -732,6 +733,10 @@ class SMOTEN(SMOTE): Parameters ---------- + categorical_encoder : estimator, default=None + Ordinal encoder used to encode the categorical features. If `None`, a + :class:`~sklearn.preprocessing.OrdinalEncoder` is used with default parameters. + {sampling_strategy} {random_state} @@ -759,6 +764,9 @@ class SMOTEN(SMOTE): Attributes ---------- + categorical_encoder_ : estimator + The encoder used to encode the categorical features. + sampling_strategy_ : dict Dictionary containing the information to sample the dataset. 
The keys corresponds to the class labels from which to sample and the values @@ -821,6 +829,31 @@ class SMOTEN(SMOTE): Class counts after resampling Counter({{0: 40, 1: 40}}) """ + _parameter_constraints: dict = { + **SMOTE._parameter_constraints, + "categorical_encoder": [ + HasMethods(["fit_transform", "inverse_transform"]), + None, + ], + } + + def __init__( + self, + categorical_encoder=None, + *, + sampling_strategy="auto", + random_state=None, + k_neighbors=5, + n_jobs=None, + ): + super().__init__( + sampling_strategy=sampling_strategy, + random_state=random_state, + k_neighbors=k_neighbors, + n_jobs=n_jobs, + ) + self.categorical_encoder = categorical_encoder + def _check_X_y(self, X, y): """Check should accept strings and not sparse matrices.""" y, binarize_y = check_target_type(y, indicate_one_vs_all=True) @@ -868,11 +901,14 @@ def _fit_resample(self, X, y): X_resampled = [X.copy()] y_resampled = [y.copy()] - encoder = OrdinalEncoder(dtype=np.int32) - X_encoded = encoder.fit_transform(X) + if self.categorical_encoder is None: + self.categorical_encoder_ = OrdinalEncoder(dtype=np.int32) + else: + self.categorical_encoder_ = clone(self.categorical_encoder) + X_encoded = self.categorical_encoder_.fit_transform(X) vdm = ValueDifferenceMetric( - n_categories=[len(cat) for cat in encoder.categories_] + n_categories=[len(cat) for cat in self.categorical_encoder_.categories_] ).fit(X_encoded, y) for class_sample, n_samples in self.sampling_strategy_.items(): @@ -890,7 +926,7 @@ def _fit_resample(self, X, y): X_class, class_sample, y.dtype, nn_indices, n_samples ) - X_new = encoder.inverse_transform(X_new) + X_new = self.categorical_encoder_.inverse_transform(X_new) X_resampled.append(X_new) y_resampled.append(y_new) diff --git a/imblearn/over_sampling/_smote/tests/test_smoten.py b/imblearn/over_sampling/_smote/tests/test_smoten.py index 774ad9963..6bd9d8356 100644 --- a/imblearn/over_sampling/_smote/tests/test_smoten.py +++ 
b/imblearn/over_sampling/_smote/tests/test_smoten.py @@ -1,5 +1,6 @@ import numpy as np import pytest +from sklearn.preprocessing import OrdinalEncoder from imblearn.over_sampling import SMOTEN @@ -27,6 +28,7 @@ def test_smoten(data): assert X_res.shape == (80, 3) assert y_res.shape == (80,) + assert isinstance(sampler.categorical_encoder_, OrdinalEncoder) def test_smoten_resampling(): @@ -52,3 +54,22 @@ def test_smoten_resampling(): X_generated, y_generated = X_res[X.shape[0] :], y_res[X.shape[0] :] np.testing.assert_array_equal(X_generated, "blue") np.testing.assert_array_equal(y_generated, "not apple") + + +def test_smoten_categorical_encoder(data): + """Check that `categorical_encoder` is used when provided.""" + + X, y = data + sampler = SMOTEN(random_state=0) + sampler.fit_resample(X, y) + + assert isinstance(sampler.categorical_encoder_, OrdinalEncoder) + assert sampler.categorical_encoder_.dtype == np.int32 + + encoder = OrdinalEncoder(dtype=np.int64) + sampler.set_params(categorical_encoder=encoder).fit_resample(X, y) + + assert isinstance(sampler.categorical_encoder_, OrdinalEncoder) + assert sampler.categorical_encoder is encoder + assert sampler.categorical_encoder_ is not encoder + assert sampler.categorical_encoder_.dtype == np.int64 From 58f84444588ae93ee24cd06492684c0d215b19d6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Jul 2023 20:33:50 +0200 Subject: [PATCH 2/2] iter --- doc/whats_new/v0.11.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst index a12bde941..1c3f377f6 100644 --- a/doc/whats_new/v0.11.rst +++ b/doc/whats_new/v0.11.rst @@ -20,6 +20,12 @@ Enhancements parameters. :pr:`1000` by :user:`Guillaume Lemaitre `. +- :class:`~imblearn.over_sampling.SMOTEN` now accepts a parameter `categorical_encoder` + allowing to specify a :class:`~sklearn.preprocessing.OrdinalEncoder` with custom + parameters. 
A new fitted attribute `categorical_encoder_` is exposed to access the + fitted encoder. + :pr:`1001` by :user:`Guillaume Lemaitre <glemaitre>`. + Deprecation ...........