FIX handle heterogeneous data type in categorical feature in SMOTENC (#1002)

glemaitre · web-flow · commit a1d9f3c0f948 · 2023-07-07T21:29:42.000+02:00
diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst
@@ -6,23 +6,30 @@ Version 0.11.0 (Under development)
 Changelog
 ---------
 
+Bug fixes
+.........
+
+- :class:`~imblearn.over_sampling.SMOTENC` now handles mixed data types such as `bool`
+  and `pd.category` by delegating the conversion to the scikit-learn encoder.
+  :pr:`1002` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Compatibility
 .............
 
 - Maintenance release for being compatible with scikit-learn >= 1.3.0.
   :pr:`999` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+Deprecation
+...........
+
+- The fitted attribute `ohe_` in :class:`~imblearn.over_sampling.SMOTENC` is deprecated
+  and will be removed in version 0.13. Use `categorical_encoder_` instead.
+  :pr:`1000` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Enhancements
 ............
 
 - :class:`~imblearn.over_sampling.SMOTENC` now accepts a parameter `categorical_encoder`
   allowing to specify a :class:`~sklearn.preprocessing.OneHotEncoder` with custom
   parameters.
   :pr:`1000` by :user:`Guillaume Lemaitre <glemaitre>`.
-
-Deprecation
-...........
-
-- The fitted attribute `ohe_` in :class:`~imblearn.over_sampling.SMOTENC` is deprecated
-  and will be removed in version 0.13. Use `categorical_encoder_` instead.
-  :pr:`1000` by :user:`Guillaume Lemaitre <glemaitre>`.
diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py
@@ -20,6 +20,7 @@
     csc_mean_variance_axis0,
     csr_mean_variance_axis0,
 )
+from sklearn.utils.validation import _num_features
 
 from ...metrics.pairwise import ValueDifferenceMetric
 from ...utils import Substitution, check_neighbors_object, check_target_type
@@ -557,9 +558,9 @@ def _check_X_y(self, X, y):
         features.
         """
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = self._validate_data(
-            X, y, reset=True, dtype=None, accept_sparse=["csr", "csc"]
-        )
+        if not (hasattr(X, "__array__") or sparse.issparse(X)):
+            X = check_array(X, dtype=object)
+        self._check_n_features(X, reset=True)
         return X, y, binarize_y
 
     def _validate_estimator(self):
@@ -596,14 +597,14 @@ def _fit_resample(self, X, y):
                 FutureWarning,
             )
 
-        self.n_features_ = X.shape[1]
+        self.n_features_ = _num_features(X)
         self._validate_estimator()
 
         # compute the median of the standard deviation of the minority class
         target_stats = Counter(y)
         class_minority = min(target_stats, key=target_stats.get)
 
-        X_continuous = X[:, self.continuous_features_]
+        X_continuous = _safe_indexing(X, self.continuous_features_, axis=1)
         X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"])
         X_minority = _safe_indexing(X_continuous, np.flatnonzero(y == class_minority))
 
@@ -616,7 +617,7 @@ def _fit_resample(self, X, y):
             var = X_minority.var(axis=0)
         self.median_std_ = np.median(np.sqrt(var))
 
-        X_categorical = X[:, self.categorical_features_]
+        X_categorical = _safe_indexing(X, self.categorical_features_, axis=1)
         if X_continuous.dtype.name != "object":
             dtype_ohe = X_continuous.dtype
         else:
diff --git a/imblearn/over_sampling/_smote/tests/test_smote_nc.py b/imblearn/over_sampling/_smote/tests/test_smote_nc.py
@@ -290,3 +290,37 @@ def test_smotenc_param_validation():
     name = smote.__class__.__name__
     _set_checking_parameters(smote)
     check_param_validation(name, smote)
+
+
+def test_smotenc_bool_categorical():
+    """Check that we don't try to early convert the full input data to numeric when
+    handling a pandas dataframe.
+
+    Non-regression test for:
+    https://github.com/scikit-learn-contrib/imbalanced-learn/issues/974
+    """
+    pd = pytest.importorskip("pandas")
+
+    X = pd.DataFrame(
+        {
+            "c": pd.Categorical([x for x in "abbacaba" * 3]),
+            "f": [0.3, 0.5, 0.1, 0.2] * 6,
+            "b": [False, False, True] * 8,
+        }
+    )
+    y = pd.DataFrame({"out": [1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0] * 2})
+    smote = SMOTENC(categorical_features=[0])
+
+    X_res, y_res = smote.fit_resample(X, y)
+    pd.testing.assert_series_equal(X_res.dtypes, X.dtypes)
+    assert len(X_res) == len(y_res)
+
+    smote.set_params(categorical_features=[0, 2])
+    X_res, y_res = smote.fit_resample(X, y)
+    pd.testing.assert_series_equal(X_res.dtypes, X.dtypes)
+    assert len(X_res) == len(y_res)
+
+    X = X.astype({"b": "category"})
+    X_res, y_res = smote.fit_resample(X, y)
+    pd.testing.assert_series_equal(X_res.dtypes, X.dtypes)
+    assert len(X_res) == len(y_res)