From 8c07c3e2a366751860840ffeb536433b900dd70f Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 7 Jul 2023 19:55:14 +0200
Subject: [PATCH 1/2] ENH add categorical_encoder to SMOTEN

---
 imblearn/over_sampling/_smote/base.py         | 44 +++++++++++++++++--
 .../over_sampling/_smote/tests/test_smoten.py | 21 +++++++++
 2 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py
index 967f59d6f..3e834f707 100644
--- a/imblearn/over_sampling/_smote/base.py
+++ b/imblearn/over_sampling/_smote/base.py
@@ -13,6 +13,7 @@
 
 import numpy as np
 from scipy import sparse
+from sklearn.base import clone
 from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
 from sklearn.utils import _safe_indexing, check_array, check_random_state
 from sklearn.utils.sparsefuncs_fast import (
@@ -732,6 +733,10 @@ class SMOTEN(SMOTE):
 
     Parameters
     ----------
+    categorical_encoder : estimator, default=None
+        Ordinal encoder used to encode the categorical features. If `None`, an
+        :class:`~sklearn.preprocessing.OrdinalEncoder` with `dtype=np.int32` is used.
+
     {sampling_strategy}
 
     {random_state}
@@ -759,6 +764,9 @@ class SMOTEN(SMOTE):
 
     Attributes
     ----------
+    categorical_encoder_ : estimator
+        The encoder used to encode the categorical features.
+
     sampling_strategy_ : dict
         Dictionary containing the information to sample the dataset. The keys
         corresponds to the class labels from which to sample and the values
@@ -821,6 +829,31 @@ class SMOTEN(SMOTE):
     Class counts after resampling Counter({{0: 40, 1: 40}})
     """
 
+    # Only `OrdinalEncoder` instances are accepted: `_fit_resample` reads the
+    # fitted `categories_` attribute, which a check on the mere presence of
+    # `fit_transform`/`inverse_transform` cannot guarantee.
+    _parameter_constraints: dict = {
+        **SMOTE._parameter_constraints,
+        "categorical_encoder": [OrdinalEncoder, None],
+    }
+
+    def __init__(
+        self,
+        categorical_encoder=None,  # None: an int32 OrdinalEncoder is built at fit
+        *,
+        sampling_strategy="auto",
+        random_state=None,
+        k_neighbors=5,
+        n_jobs=None,
+    ):
+        super().__init__(
+            sampling_strategy=sampling_strategy,
+            random_state=random_state,
+            k_neighbors=k_neighbors,
+            n_jobs=n_jobs,
+        )
+        self.categorical_encoder = categorical_encoder  # cloned in `_fit_resample`
+
     def _check_X_y(self, X, y):
         """Check should accept strings and not sparse matrices."""
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
@@ -868,11 +901,14 @@ def _fit_resample(self, X, y):
         X_resampled = [X.copy()]
         y_resampled = [y.copy()]
 
-        encoder = OrdinalEncoder(dtype=np.int32)
-        X_encoded = encoder.fit_transform(X)
+        if self.categorical_encoder is None:
+            self.categorical_encoder_ = OrdinalEncoder(dtype=np.int32)
+        else:
+            self.categorical_encoder_ = clone(self.categorical_encoder)
+        X_encoded = self.categorical_encoder_.fit_transform(X)
 
         vdm = ValueDifferenceMetric(
-            n_categories=[len(cat) for cat in encoder.categories_]
+            n_categories=[len(cat) for cat in self.categorical_encoder_.categories_]
         ).fit(X_encoded, y)
 
         for class_sample, n_samples in self.sampling_strategy_.items():
@@ -890,7 +926,7 @@ def _fit_resample(self, X, y):
                 X_class, class_sample, y.dtype, nn_indices, n_samples
             )
 
-            X_new = encoder.inverse_transform(X_new)
+            X_new = self.categorical_encoder_.inverse_transform(X_new)
             X_resampled.append(X_new)
             y_resampled.append(y_new)
 
diff --git a/imblearn/over_sampling/_smote/tests/test_smoten.py b/imblearn/over_sampling/_smote/tests/test_smoten.py
index 774ad9963..6bd9d8356 100644
--- a/imblearn/over_sampling/_smote/tests/test_smoten.py
+++ b/imblearn/over_sampling/_smote/tests/test_smoten.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pytest
+from sklearn.preprocessing import OrdinalEncoder
 
 from imblearn.over_sampling import SMOTEN
 
@@ -27,6 +28,7 @@ def test_smoten(data):
 
     assert X_res.shape == (80, 3)
     assert y_res.shape == (80,)
+    assert isinstance(sampler.categorical_encoder_, OrdinalEncoder)
 
 
 def test_smoten_resampling():
@@ -52,3 +54,22 @@ def test_smoten_resampling():
     X_generated, y_generated = X_res[X.shape[0] :], y_res[X.shape[0] :]
     np.testing.assert_array_equal(X_generated, "blue")
     np.testing.assert_array_equal(y_generated, "not apple")
+
+
+def test_smoten_categorical_encoder(data):
+    """Check that `categorical_encoder` is used when provided."""
+
+    X, y = data
+    sampler = SMOTEN(random_state=0)
+    sampler.fit_resample(X, y)
+    # Default: no encoder passed, so an int32 `OrdinalEncoder` is created at fit.
+    assert isinstance(sampler.categorical_encoder_, OrdinalEncoder)
+    assert sampler.categorical_encoder_.dtype == np.int32
+    # Custom: the passed encoder is cloned before fitting, never fitted in place.
+    encoder = OrdinalEncoder(dtype=np.int64)
+    sampler.set_params(categorical_encoder=encoder).fit_resample(X, y)
+    # The parameter keeps the user's object; the fitted clone keeps its dtype.
+    assert isinstance(sampler.categorical_encoder_, OrdinalEncoder)
+    assert sampler.categorical_encoder is encoder
+    assert sampler.categorical_encoder_ is not encoder
+    assert sampler.categorical_encoder_.dtype == np.int64

From 58f84444588ae93ee24cd06492684c0d215b19d6 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 7 Jul 2023 20:33:50 +0200
Subject: [PATCH 2/2] DOC add whats_new entry for SMOTEN categorical_encoder

---
 doc/whats_new/v0.11.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst
index a12bde941..1c3f377f6 100644
--- a/doc/whats_new/v0.11.rst
+++ b/doc/whats_new/v0.11.rst
@@ -20,6 +20,12 @@ Enhancements
   parameters.
   :pr:`1000` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+- :class:`~imblearn.over_sampling.SMOTEN` now accepts a parameter `categorical_encoder`
+  that lets users specify a :class:`~sklearn.preprocessing.OrdinalEncoder` with custom
+  parameters. A new fitted attribute `categorical_encoder_` is exposed to access the
+  fitted encoder.
+  :pr:`1001` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Deprecation
 ...........