FIX divide by sqrt(2) the median entry in SMOTENC (#1014)

glemaitre · web-flow · commit 2f6b1f68c5d2 · 2023-07-10T15:53:22.000+02:00
diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst
@@ -203,11 +203,11 @@ or relying on `dtype` inference if the columns are using the
   >>> print(sorted(Counter(y_resampled).items()))
   [(0, 30), (1, 30)]
   >>> print(X_resampled[-5:])
-  [['A' 0.5246469549655818 2]
-   ['B' -0.3657680728116921 2]
-   ['B' 0.9344237230779993 2]
-   ['B' 0.3710891618824609 2]
-   ['B' 0.3327240726719727 2]]
+  [['A' 0.52... 2]
+   ['B' -0.36... 2]
+   ['B' 0.93... 2]
+   ['B' 0.37... 2]
+   ['B' 0.33... 2]]
 
 Therefore, it can be seen that the samples generated in the first and last
 columns are belonging to the same categories originally presented without any
diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst
@@ -6,6 +6,14 @@ Version 0.11.1
 Changelog
 ---------
 
+Bug fixes
+.........
+
+- Fix a bug in :class:`~imblearn.over_sampling.SMOTENC` where the entries of the
+  one-hot encoding should be divided by `sqrt(2)` and not `2`, taking into account that
+  they are plugged into a Euclidean distance computation.
+  :pr:`1014` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 
 Version 0.11.0
 ==============
diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py
@@ -671,13 +671,18 @@ def _fit_resample(self, X, y):
 
         # In the edge case where the median of the std is equal to 0, the 1s
         # entries will be also nullified. In this case, we store the original
-        # categorical encoding which will be later used for inversing the OHE
+        # categorical encoding which will be later used for inverting the OHE
         if math.isclose(self.median_std_, 0):
             self._X_categorical_minority_encoded = _safe_indexing(
                 X_ohe.toarray(), np.flatnonzero(y == class_minority)
             )
 
-        X_ohe.data = np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / 2
+        # Two samples with different categories differ in two one-hot entries. We
+        # divide the median by sqrt(2) such that this mismatch only contributes a
+        # single squared median value to the squared Euclidean distance
+        X_ohe.data = (
+            np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / np.sqrt(2)
+        )
         X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr")
 
         X_resampled, y_resampled = super()._fit_resample(X_encoded, y)