From 90573da08bb7ba4e4ae21401088dd315953fb7d3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Jul 2023 21:05:47 +0200 Subject: [PATCH 1/2] FIX handle heterogeneous data type in categorical feature in SMOTENC --- doc/whats_new/v0.11.rst | 21 ++++++++---- imblearn/over_sampling/_smote/base.py | 13 ++++---- .../_smote/tests/test_smote_nc.py | 33 +++++++++++++++++++ 3 files changed, 54 insertions(+), 13 deletions(-) diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst index a12bde941..2ae7268e9 100644 --- a/doc/whats_new/v0.11.rst +++ b/doc/whats_new/v0.11.rst @@ -6,12 +6,26 @@ Version 0.11.0 (Under development) Changelog --------- +Bug fixes +......... + +- :class:`~imblearn.over_sampling.SMOTENC` now handles mixed data types such as + `bool` and `pd.category` by delegating the conversion to the scikit-learn encoder. + :pr:`1002` by :user:`Guillaume Lemaitre <glemaitre>`. + Compatibility ............. - Maintenance release for being compatible with scikit-learn >= 1.3.0. :pr:`999` by :user:`Guillaume Lemaitre <glemaitre>`. +Deprecation +........... + +- The fitted attribute `ohe_` in :class:`~imblearn.over_sampling.SMOTENC` is deprecated + and will be removed in version 0.13. Use `categorical_encoder_` instead. + :pr:`1000` by :user:`Guillaume Lemaitre <glemaitre>`. + Enhancements ............ @@ -19,10 +33,3 @@ Enhancements allowing to specify a :class:`~sklearn.preprocessing.OneHotEncoder` with custom parameters. :pr:`1000` by :user:`Guillaume Lemaitre <glemaitre>`. - -Deprecation -........... - -- The fitted attribute `ohe_` in :class:`~imblearn.over_sampling.SMOTENC` is deprecated - and will be removed in version 0.13. Use `categorical_encoder_` instead. - :pr:`1000` by :user:`Guillaume Lemaitre <glemaitre>`. 
diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index bd0823ed0..08bd96407 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -20,6 +20,7 @@ csc_mean_variance_axis0, csr_mean_variance_axis0, ) +from sklearn.utils.validation import _num_features from ...metrics.pairwise import ValueDifferenceMetric from ...utils import Substitution, check_neighbors_object, check_target_type @@ -557,9 +558,9 @@ def _check_X_y(self, X, y): features. """ y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = self._validate_data( - X, y, reset=True, dtype=None, accept_sparse=["csr", "csc"] - ) + if not (hasattr(X, "__array__") or sparse.issparse(X)): + X = check_array(X, dtype=object) + self._check_n_features(X, reset=True) return X, y, binarize_y def _validate_estimator(self): @@ -596,14 +597,14 @@ def _fit_resample(self, X, y): FutureWarning, ) - self.n_features_ = X.shape[1] + self.n_features_ = _num_features(X) self._validate_estimator() # compute the median of the standard deviation of the minority class target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) - X_continuous = X[:, self.continuous_features_] + X_continuous = _safe_indexing(X, self.continuous_features_, axis=1) X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"]) X_minority = _safe_indexing(X_continuous, np.flatnonzero(y == class_minority)) @@ -616,7 +617,7 @@ def _fit_resample(self, X, y): var = X_minority.var(axis=0) self.median_std_ = np.median(np.sqrt(var)) - X_categorical = X[:, self.categorical_features_] + X_categorical = _safe_indexing(X, self.categorical_features_, axis=1) if X_continuous.dtype.name != "object": dtype_ohe = X_continuous.dtype else: diff --git a/imblearn/over_sampling/_smote/tests/test_smote_nc.py b/imblearn/over_sampling/_smote/tests/test_smote_nc.py index f2c6f4aed..68df73466 100644 --- a/imblearn/over_sampling/_smote/tests/test_smote_nc.py 
+++ b/imblearn/over_sampling/_smote/tests/test_smote_nc.py @@ -290,3 +290,36 @@ def test_smotenc_param_validation(): name = smote.__class__.__name__ _set_checking_parameters(smote) check_param_validation(name, smote) + + +def test_smotenc_bool_categorical(): + """Check that we don't try to early convert the full input data to numeric when + handling a pandas dataframe. + + Non-regression test for: + https://github.com/scikit-learn-contrib/imbalanced-learn/issues/974 + """ + pd = pytest.importorskip("pandas") + + X = pd.DataFrame( + { + "c": pd.Categorical([x for x in "abbacaba" * 3]), + "f": [0.3, 0.5, 0.1, 0.2] * 6, + "b": [False, False, True] * 8, + } + ) + y = pd.DataFrame({"out": [1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0] * 2}) + smote = SMOTENC(categorical_features=[0]) + + X_res, y_res = smote.fit_resample(X, y) + pd.testing.assert_series_equal(X_res.dtypes, X.dtypes) + assert len(X_res) == len(y_res) + + smote.set_params(categorical_features=[0, 2]) + X_res, y_res = smote.fit_resample(X, y) + pd.testing.assert_series_equal(X_res.dtypes, X.dtypes) + assert len(X_res) == len(y_res) + + X_res, y_res = smote.fit_resample(X.astype({"b": "category"}), y) + pd.testing.assert_series_equal(X_res.dtypes, X.dtypes) + assert len(X_res) == len(y_res) From e9accf179b904f213c52466e28b69981f6339e0d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Jul 2023 21:09:25 +0200 Subject: [PATCH 2/2] dix --- imblearn/over_sampling/_smote/tests/test_smote_nc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_smote/tests/test_smote_nc.py b/imblearn/over_sampling/_smote/tests/test_smote_nc.py index 68df73466..fa82abeef 100644 --- a/imblearn/over_sampling/_smote/tests/test_smote_nc.py +++ b/imblearn/over_sampling/_smote/tests/test_smote_nc.py @@ -320,6 +320,7 @@ def test_smotenc_bool_categorical(): pd.testing.assert_series_equal(X_res.dtypes, X.dtypes) assert len(X_res) == len(y_res) - X_res, y_res = smote.fit_resample(X.astype({"b": 
"category"}), y) + X = X.astype({"b": "category"}) + X_res, y_res = smote.fit_resample(X, y) pd.testing.assert_series_equal(X_res.dtypes, X.dtypes) assert len(X_res) == len(y_res)