ENH add categorical_encoder param to SMOTENC (#1000)

glemaitre · web-flow · commit 020f2784bd2c · 2023-07-07T20:30:27.000+02:00
diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst
@@ -9,5 +9,20 @@ Changelog
 Compatibility
 .............
 
-- Maintenance release for be compatible with scikit-learn >= 1.3.0.
+- Maintenance release to be compatible with scikit-learn >= 1.3.0.
   :pr:`999` by :user:`Guillaume Lemaitre <glemaitre>`.
+
+Enhancements
+............
+
+- :class:`~imblearn.over_sampling.SMOTENC` now accepts a parameter `categorical_encoder`
+  that allows specifying a :class:`~sklearn.preprocessing.OneHotEncoder` with custom
+  parameters.
+  :pr:`1000` by :user:`Guillaume Lemaitre <glemaitre>`.
+
+Deprecation
+...........
+
+- The fitted attribute `ohe_` in :class:`~imblearn.over_sampling.SMOTENC` is deprecated
+  and will be removed in version 0.13. Use `categorical_encoder_` instead.
+  :pr:`1000` by :user:`Guillaume Lemaitre <glemaitre>`.
diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py
@@ -13,6 +13,7 @@
 
 import numpy as np
 from scipy import sparse
+from sklearn.base import clone
 from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
 from sklearn.utils import _safe_indexing, check_array, check_random_state
 from sklearn.utils.sparsefuncs_fast import (
@@ -393,6 +394,11 @@ class SMOTENC(SMOTE):
         - mask array of shape (n_features, ) and ``bool`` dtype for which
           ``True`` indicates the categorical features.
 
+    categorical_encoder : estimator, default=None
+        One-hot encoder used to encode the categorical features. If `None`, a
+        :class:`~sklearn.preprocessing.OneHotEncoder` is used with default parameters
+        apart from `handle_unknown`, set to 'ignore', and `dtype`, set internally.
+
     {sampling_strategy}
 
     {random_state}
@@ -431,6 +437,13 @@ class SMOTENC(SMOTE):
     ohe_ : :class:`~sklearn.preprocessing.OneHotEncoder`
         The one-hot encoder used to encode the categorical features.
 
+        .. deprecated:: 0.11
+           `ohe_` is deprecated in 0.11 and will be removed in 0.13. Use
+           `categorical_encoder_` instead.
+
+    categorical_encoder_ : estimator
+        The encoder used to encode the categorical features.
+
     categorical_features_ : ndarray of shape (n_cat_features,), dtype=np.int64
         Indices of the categorical features.
 
@@ -514,12 +527,17 @@ class SMOTENC(SMOTE):
     _parameter_constraints: dict = {
         **SMOTE._parameter_constraints,
         "categorical_features": ["array-like"],
+        "categorical_encoder": [
+            HasMethods(["fit_transform", "inverse_transform"]),
+            None,
+        ],
     }
 
     def __init__(
         self,
         categorical_features,
         *,
+        categorical_encoder=None,
         sampling_strategy="auto",
         random_state=None,
         k_neighbors=5,
@@ -532,6 +550,7 @@ def __init__(
             n_jobs=n_jobs,
         )
         self.categorical_features = categorical_features
+        self.categorical_encoder = categorical_encoder
 
     def _check_X_y(self, X, y):
         """Overwrite the checking to let pass some string for categorical
@@ -603,17 +622,19 @@ def _fit_resample(self, X, y):
         else:
             dtype_ohe = np.float64
 
-        self.ohe_ = OneHotEncoder(handle_unknown="ignore", dtype=dtype_ohe)
-        if hasattr(self.ohe_, "sparse_output"):
-            # scikit-learn >= 1.2
-            self.ohe_.set_params(sparse_output=True)
+        if self.categorical_encoder is None:
+            self.categorical_encoder_ = OneHotEncoder(
+                handle_unknown="ignore", dtype=dtype_ohe
+            )
         else:
-            self.ohe_.set_params(sparse=True)
+            self.categorical_encoder_ = clone(self.categorical_encoder)
 
         # the input of the OneHotEncoder needs to be dense
-        X_ohe = self.ohe_.fit_transform(
+        X_ohe = self.categorical_encoder_.fit_transform(
             X_categorical.toarray() if sparse.issparse(X_categorical) else X_categorical
         )
+        if not sparse.issparse(X_ohe):
+            X_ohe = sparse.csr_matrix(X_ohe, dtype=dtype_ohe)
 
         # we can replace the 1 entries of the categorical features with the
         # median of the standard deviation. It will ensure that whenever
@@ -636,7 +657,7 @@ def _fit_resample(self, X, y):
         # reverse the encoding of the categorical features
         X_res_cat = X_resampled[:, self.continuous_features_.size :]
         X_res_cat.data = np.ones_like(X_res_cat.data)
-        X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat)
+        X_res_cat_dec = self.categorical_encoder_.inverse_transform(X_res_cat)
 
         if sparse.issparse(X):
             X_resampled = sparse.hstack(
@@ -695,7 +716,7 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
         all_neighbors = nn_data[nn_num[rows]]
 
         categories_size = [self.continuous_features_.size] + [
-            cat.size for cat in self.ohe_.categories_
+            cat.size for cat in self.categorical_encoder_.categories_
         ]
 
         for start_idx, end_idx in zip(
@@ -714,6 +735,16 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
 
         return X_new
 
+    @property
+    def ohe_(self):
+        """One-hot encoder used to encode the categorical features."""
+        warnings.warn(
+            "'ohe_' attribute has been deprecated in 0.11 and will be removed "
+            "in 0.13. Use 'categorical_encoder_' instead.",
+            FutureWarning,
+        )
+        return self.categorical_encoder_
+
 
 @Substitution(
     sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
diff --git a/imblearn/over_sampling/_smote/tests/test_smote_nc.py b/imblearn/over_sampling/_smote/tests/test_smote_nc.py
@@ -8,11 +8,20 @@
 
 import numpy as np
 import pytest
+import sklearn
 from scipy import sparse
 from sklearn.datasets import make_classification
+from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils._testing import assert_allclose, assert_array_equal
+from sklearn.utils.fixes import parse_version
 
 from imblearn.over_sampling import SMOTENC
+from imblearn.utils.estimator_checks import (
+    _set_checking_parameters,
+    check_param_validation,
+)
+
+sklearn_version = parse_version(sklearn.__version__)
 
 
 def data_heterogneous_ordered():
@@ -182,8 +191,7 @@ def test_smotenc_pandas():
     smote = SMOTENC(categorical_features=categorical_features, random_state=0)
     X_res_pd, y_res_pd = smote.fit_resample(X_pd, y)
     X_res, y_res = smote.fit_resample(X, y)
-    # FIXME: we should use to_numpy with pandas >= 0.25
-    assert_array_equal(X_res_pd.values, X_res)
+    assert_array_equal(X_res_pd.to_numpy(), X_res)
     assert_allclose(y_res_pd, y_res)
 
 
@@ -240,3 +248,45 @@ def test_smote_nc_with_null_median_std():
     # check that the categorical feature is not random but correspond to the
     # categories seen in the minority class samples
     assert X_res[-1, -1] == "C"
+
+
+def test_smotenc_categorical_encoder():
+    """Check that we can pass our own categorical encoder."""
+
+    # TODO: drop the `sparse` fallback once the minimum sklearn version is >= 1.2
+    param = "sparse" if sklearn_version < parse_version("1.2") else "sparse_output"
+
+    X, y, categorical_features = data_heterogneous_unordered()
+    smote = SMOTENC(categorical_features=categorical_features, random_state=0)
+    smote.fit_resample(X, y)
+
+    assert getattr(smote.categorical_encoder_, param) is True
+
+    encoder = OneHotEncoder()
+    encoder.set_params(**{param: False})
+    smote.set_params(categorical_encoder=encoder).fit_resample(X, y)
+    assert smote.categorical_encoder is encoder
+    assert smote.categorical_encoder_ is not encoder
+    assert getattr(smote.categorical_encoder_, param) is False
+
+
+# TODO(0.13): remove this test
+def test_smotenc_deprecation_ohe_():
+    """Check that we raise a deprecation warning when using `ohe_`."""
+    X, y, categorical_features = data_heterogneous_unordered()
+    smote = SMOTENC(categorical_features=categorical_features, random_state=0)
+    smote.fit_resample(X, y)
+
+    with pytest.warns(FutureWarning, match="'ohe_' attribute has been deprecated"):
+        smote.ohe_
+
+
+def test_smotenc_param_validation():
+    """Check that we validate the parameters correctly since this estimator requires
+    a specific parameter.
+    """
+    categorical_features = [0]
+    smote = SMOTENC(categorical_features=categorical_features, random_state=0)
+    name = smote.__class__.__name__
+    _set_checking_parameters(smote)
+    check_param_validation(name, smote)