From 581987a5c0bcac67dd6e2a119e314fab7e7b54c5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Feb 2021 22:41:34 +0100 Subject: [PATCH 01/20] MNT update test framework for sklearn 0.24 --- imblearn/utils/estimator_checks.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 729ceebea..b2f520373 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -25,8 +25,8 @@ from sklearn.cluster import KMeans from sklearn.exceptions import SkipTestWarning from sklearn.preprocessing import label_binarize -from sklearn.utils.estimator_checks import _mark_xfail_checks -from sklearn.utils.estimator_checks import _set_check_estimator_ids +from sklearn.utils.estimator_checks import _maybe_mark_xfail +from sklearn.utils.estimator_checks import _get_check_estimator_ids from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_raises_regex from sklearn.utils.multiclass import type_of_target @@ -117,18 +117,15 @@ def parametrize_with_checks(estimators): ... def test_sklearn_compatible_estimator(estimator, check): ... check(estimator) """ - names = (type(estimator).__name__ for estimator in estimators) - - checks_generator = ((clone(estimator), partial(check, name)) - for name, estimator in zip(names, estimators) - for check in _yield_all_checks(estimator)) - - checks_with_marks = ( - _mark_xfail_checks(estimator, check, pytest) - for estimator, check in checks_generator) - - return pytest.mark.parametrize("estimator, check", checks_with_marks, - ids=_set_check_estimator_ids) + def checks_generator(): + for estimator in estimators: + name = type(estimator).__name__ + for check in _yield_all_checks(estimator): + check = partial(check, name) + yield _maybe_mark_xfail(estimator, check, pytest) + + return pytest.mark.parametrize("estimator, check", checks_generator(), + ids=_get_check_estimator_ids) def check_target_type(name, estimator): From d96c5ea343214c722dfe8c5817014366e00ef6f9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Feb 2021 22:44:43 +0100 Subject: [PATCH 02/20] PEP8 --- imblearn/utils/estimator_checks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index b2f520373..e4ddb4ac2 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -16,7 +16,6 @@ import numpy as np from scipy import sparse -from sklearn.base import clone from sklearn.datasets import ( fetch_openml, make_classification, From d6c0ab9e4b9949e17d52ab8efc2d202c5cad7183 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Feb 2021 22:52:20 +0100 Subject: [PATCH 03/20] iter --- doc/conf.py | 4 ++-- doc/over_sampling.rst | 2 +- imblearn/over_sampling/tests/test_random_over_sampler.py | 2 +- .../_prototype_selection/tests/test_random_under_sampler.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index e404258e3..3628318f6 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -345,8 +345,8 @@ def patch_signature(subject, bound_method=False, follow_wrapped=True): # https://github.com/readthedocs/sphinx_rtd_theme/pull/747/files def setup(app): app.registry.documenters["class"] = PatchedClassDocumenter - app.add_javascript("js/copybutton.js") - app.add_stylesheet("basic.css") + app.app.add_js_file()("js/copybutton.js") + app.add_css_file("basic.css") # app.connect('autodoc-process-docstring', generate_example_rst) diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index a154a62dc..e5948cbd6 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -60,7 +60,7 @@ In addition, :class:`RandomOverSampler` allows to sample heterogeneous data >>> import numpy as np >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], - ... dtype=np.object) + ... dtype=object) >>> y_hetero = np.array([0, 0, 1]) >>> X_resampled, y_resampled = ros.fit_resample(X_hetero, y_hetero) >>> print(X_resampled) diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index ca58a8012..2acda02e5 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -115,7 +115,7 @@ def test_multiclass_fit_resample(): def test_random_over_sampling_heterogeneous_data(): X_hetero = np.array( - [["xxx", 1, 1.0], ["yyy", 2, 2.0], ["zzz", 3, 3.0]], dtype=np.object + [["xxx", 1, 1.0], ["yyy", 2, 2.0], ["zzz", 3, 3.0]], dtype=object ) y = np.array([0, 0, 1]) ros = RandomOverSampler(random_state=RND_SEED) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py index 945d31fec..2ca0b3354 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py @@ -101,7 +101,7 @@ def test_multiclass_fit_resample(): def test_random_under_sampling_heterogeneous_data(): X_hetero = np.array( - [["xxx", 1, 1.0], ["yyy", 2, 2.0], ["zzz", 3, 3.0]], dtype=np.object + [["xxx", 1, 1.0], ["yyy", 2, 2.0], ["zzz", 3, 3.0]], dtype=object ) y = np.array([0, 0, 1]) rus = RandomUnderSampler(random_state=RND_SEED) From 41a3422050f589c408e1144c03d4aa9bfb9aa45d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Feb 2021 22:58:53 +0100 Subject: [PATCH 04/20] iter --- doc/under_sampling.rst | 2 +- imblearn/utils/estimator_checks.py | 45 ++++++++++++++++++++---------- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst index d00aab7ce..13798ad78 100644 --- a/doc/under_sampling.rst +++ b/doc/under_sampling.rst @@ -107,7 +107,7 @@ In addition, :class:`RandomUnderSampler` allows to sample heterogeneous data (e.g. containing some strings):: >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], - ... dtype=np.object) + ... dtype=object) >>> y_hetero = np.array([0, 0, 1]) >>> X_resampled, y_resampled = rus.fit_resample(X_hetero, y_hetero) >>> print(X_resampled) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index e4ddb4ac2..d803be59c 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -16,6 +16,7 @@ import numpy as np from scipy import sparse +from sklearn.base import clone from sklearn.datasets import ( fetch_openml, make_classification, @@ -127,7 +128,8 @@ def checks_generator(): ids=_get_check_estimator_ids) -def check_target_type(name, estimator): +def check_target_type(name, estimator_orig): + estimator = clone(estimator_orig) # should raise warning if the target is continuous (we cannot raise error) X = np.random.random((20, 2)) y = np.linspace(0, 1, 20) @@ -144,7 +146,8 @@ def check_target_type(name, estimator): ) -def check_samplers_one_label(name, sampler): +def check_samplers_one_label(name, sampler_orig): + sampler = clone(sampler_orig) error_string_fit = "Sampler can't balance when only one class is present." X = np.random.random((20, 2)) y = np.zeros(20) @@ -164,7 +167,8 @@ def check_samplers_one_label(name, sampler): raise AssertionError(error_string_fit) -def check_samplers_fit(name, sampler): +def check_samplers_fit(name, sampler_orig): + sampler = clone(sampler_orig) np.random.seed(42) # Make this test reproducible X = np.random.random((30, 2)) y = np.array([1] * 20 + [0] * 10) @@ -174,7 +178,8 @@ def check_samplers_fit(name, sampler): ), "No fitted attribute sampling_strategy_" -def check_samplers_fit_resample(name, sampler): +def check_samplers_fit_resample(name, sampler_orig): + sampler = clone(sampler_orig) X, y = make_classification( n_samples=1000, n_classes=3, @@ -209,7 +214,8 @@ def check_samplers_fit_resample(name, sampler): ) -def check_samplers_sampling_strategy_fit_resample(name, sampler): +def check_samplers_sampling_strategy_fit_resample(name, sampler_orig): + sampler = clone(sampler_orig) # in this test we will force all samplers to not change the class 1 X, y = make_classification( n_samples=1000, @@ -236,7 +242,8 @@ def check_samplers_sampling_strategy_fit_resample(name, sampler): assert Counter(y_res)[1] == expected_stat -def check_samplers_sparse(name, sampler): +def check_samplers_sparse(name, sampler_orig): + sampler = clone(sampler_orig) # check that sparse matrices can be passed through the sampler leading to # the same results than dense X, y = make_classification( @@ -254,8 +261,9 @@ def check_samplers_sparse(name, sampler): assert_allclose(y_res_sparse, y_res) -def check_samplers_pandas(name, sampler): +def check_samplers_pandas(name, sampler_orig): pd = pytest.importorskip("pandas") + sampler = clone(sampler_orig) # Check that the samplers handle pandas dataframe and pandas series X, y = make_classification( n_samples=1000, @@ -286,7 +294,8 @@ def check_samplers_pandas(name, sampler): assert_allclose(y_res_s.to_numpy(), y_res) -def check_samplers_list(name, sampler): +def check_samplers_list(name, sampler_orig): + sampler = clone(sampler_orig) # Check that the can samplers handle simple lists X, y = make_classification( n_samples=1000, @@ -308,7 +317,8 @@ def check_samplers_list(name, sampler): assert_allclose(y_res, y_res_list) -def check_samplers_multiclass_ova(name, sampler): +def check_samplers_multiclass_ova(name, sampler_orig): + sampler = clone(sampler_orig) # Check that multiclass target lead to the same results than OVA encoding X, y = make_classification( n_samples=1000, @@ -325,7 +335,8 @@ def check_samplers_multiclass_ova(name, sampler): assert_allclose(y_res, y_res_ova.argmax(axis=1)) -def check_samplers_2d_target(name, sampler): +def check_samplers_2d_target(name, sampler_orig): + sampler = clone(sampler_orig) X, y = make_classification( n_samples=100, n_classes=3, @@ -338,7 +349,8 @@ def check_samplers_2d_target(name, sampler): sampler.fit_resample(X, y) -def check_samplers_preserve_dtype(name, sampler): +def check_samplers_preserve_dtype(name, sampler_orig): + sampler = clone(sampler_orig) X, y = make_classification( n_samples=1000, n_classes=3, @@ -354,7 +366,8 @@ def check_samplers_preserve_dtype(name, sampler): assert y.dtype == y_res.dtype, "y dtype is not preserved" -def check_samplers_sample_indices(name, sampler): +def check_samplers_sample_indices(name, sampler_orig): + sampler = clone(sampler_orig) X, y = make_classification( n_samples=1000, n_classes=3, @@ -370,17 +383,21 @@ def check_samplers_sample_indices(name, sampler): assert not hasattr(sampler, "sample_indices_") -def check_classifier_on_multilabel_or_multioutput_targets(name, estimator): +def check_classifier_on_multilabel_or_multioutput_targets( + name, estimator_orig +): + estimator = clone(estimator_orig) X, y = make_multilabel_classification(n_samples=30) msg = "Multilabel and multioutput targets are not supported." with pytest.raises(ValueError, match=msg): estimator.fit(X, y) -def check_classifiers_with_encoded_labels(name, classifier): +def check_classifiers_with_encoded_labels(name, classifier_orig): # Non-regression test for #709 # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/709 pytest.importorskip("pandas") + classifier = clone(classifier_orig) df, y = fetch_openml("iris", version=1, as_frame=True, return_X_y=True) df, y = make_imbalance( df, y, sampling_strategy={ From 4c1f726bc7f351cae1b686dce901e8398ef79bed Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Feb 2021 23:01:21 +0100 Subject: [PATCH 05/20] iter --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index 3628318f6..b3b9ecf21 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -345,7 +345,7 @@ def patch_signature(subject, bound_method=False, follow_wrapped=True): # https://github.com/readthedocs/sphinx_rtd_theme/pull/747/files def setup(app): app.registry.documenters["class"] = PatchedClassDocumenter - app.app.add_js_file()("js/copybutton.js") + app.add_js_file()("js/copybutton.js") app.add_css_file("basic.css") # app.connect('autodoc-process-docstring', generate_example_rst) From 3bcf336ef912e18d96e53d273dbb95c94785e6c5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Feb 2021 23:06:17 +0100 Subject: [PATCH 06/20] iter --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index b3b9ecf21..71fc33f01 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -345,7 +345,7 @@ def patch_signature(subject, bound_method=False, follow_wrapped=True): # https://github.com/readthedocs/sphinx_rtd_theme/pull/747/files def setup(app): app.registry.documenters["class"] = PatchedClassDocumenter - app.add_js_file()("js/copybutton.js") + app.add_js_file("js/copybutton.js") app.add_css_file("basic.css") # app.connect('autodoc-process-docstring', generate_example_rst) From 1fbc4a334dad22fbbe232340faa7b95f145999a7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Feb 2021 23:12:58 +0100 Subject: [PATCH 07/20] iter --- doc/conf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/conf.py b/doc/conf.py index 71fc33f01..2531683fa 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -43,6 +43,9 @@ 'sphinx_gallery.gen_gallery', ] +# bibtex file +bibtex_bibfiles = ['bibtex/refs.bib'] + # this is needed for some reason... # see https://github.com/numpy/numpydoc/issues/69 numpydoc_show_class_members = False From 6a37a07ce01495db28769affa34b9a4b907c7869 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Feb 2021 23:21:01 +0100 Subject: [PATCH 08/20] iter --- imblearn/utils/estimator_checks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index d803be59c..368c8ef35 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -255,6 +255,7 @@ def check_samplers_sparse(name, sampler_orig): ) X_sparse = sparse.csr_matrix(X) X_res_sparse, y_res_sparse = sampler.fit_resample(X_sparse, y) + sampler = clone(sampler) X_res, y_res = sampler.fit_resample(X, y) assert sparse.issparse(X_res_sparse) assert_allclose(X_res_sparse.A, X_res) From 0451b8bf69a3563a6b79f41e810411c0377f8ddd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Feb 2021 23:36:13 +0100 Subject: [PATCH 09/20] iter --- azure-pipelines.yml | 2 +- imblearn/utils/estimator_checks.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1699a0d88..d0ce77834 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,7 +32,7 @@ jobs: # Linux environment to test the latest available dependencies and MKL. pylatest_pip_openblas_pandas: DISTRIB: 'conda-pip-latest' - PYTHON_VERSION: '3.8' + PYTHON_VERSION: '3.9' COVERAGE: 'true' PANDAS_VERSION: '*' TEST_DOCSTRINGS: 'true' diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 368c8ef35..f1bd02bc2 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -258,7 +258,7 @@ def check_samplers_sparse(name, sampler_orig): sampler = clone(sampler) X_res, y_res = sampler.fit_resample(X, y) assert sparse.issparse(X_res_sparse) - assert_allclose(X_res_sparse.A, X_res) + assert_allclose(X_res_sparse.A, X_res, rtol=1e-5) assert_allclose(y_res_sparse, y_res) From 3f9f10dee8e15813cf95303b0fdb9d7b87fe2a54 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Feb 2021 23:50:57 +0100 Subject: [PATCH 10/20] debug --- imblearn/utils/estimator_checks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index f1bd02bc2..dfa82fc42 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -257,6 +257,8 @@ def check_samplers_sparse(name, sampler_orig): X_res_sparse, y_res_sparse = sampler.fit_resample(X_sparse, y) sampler = clone(sampler) X_res, y_res = sampler.fit_resample(X, y) + if not np.all(np.isclose(X_res_sparse.A, X_res)): + print(np.flatnonzero(~np.isclose(X_res_sparse.A, X_res))) assert sparse.issparse(X_res_sparse) assert_allclose(X_res_sparse.A, X_res, rtol=1e-5) assert_allclose(y_res_sparse, y_res) From d5614002c977aa6837b0a140fbfe7c2a6fdfe311 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Feb 2021 23:59:54 +0100 Subject: [PATCH 11/20] iter --- build_tools/azure/test_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 37793c529..6f3cf03cc 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -21,7 +21,7 @@ except ImportError: python -c "import multiprocessing as mp; print('%d CPUs' % mp.cpu_count())" pip list -TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML" +TEST_CMD="python -m pytest -vsl --durations=20 --junitxml=$JUNITXML" if [[ "$COVERAGE" == "true" ]]; then export COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc" From 86aef35d4110aa15376d8c6db6b70227a1a3cf36 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 4 Feb 2021 09:50:38 +0100 Subject: [PATCH 12/20] iter --- imblearn/utils/estimator_checks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index dfa82fc42..436c568fd 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -258,7 +258,10 @@ def check_samplers_sparse(name, sampler_orig): sampler = clone(sampler) X_res, y_res = sampler.fit_resample(X, y) if not np.all(np.isclose(X_res_sparse.A, X_res)): - print(np.flatnonzero(~np.isclose(X_res_sparse.A, X_res))) + xx = np.nonzero(~np.isclose(X_res_sparse.A, X_res)) + print(xx) + print(X_res_sparse.A[xx[0]]) + print(X_res[xx[0]]) assert sparse.issparse(X_res_sparse) assert_allclose(X_res_sparse.A, X_res, rtol=1e-5) assert_allclose(y_res_sparse, y_res) From f5a7ac02c59b4bde9dea505525e2c966380c1d10 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 8 Feb 2021 16:50:30 +0100 Subject: [PATCH 13/20] iter --- .../under_sampling/_prototype_generation/_cluster_centroids.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index 24afdf044..723a721ef 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -170,8 +170,10 @@ def _fit_resample(self, X, y): target_class_indices = np.flatnonzero(y == target_class) if target_class in self.sampling_strategy_.keys(): n_samples = self.sampling_strategy_[target_class] + print(n_samples) self.estimator_.set_params(**{"n_clusters": n_samples}) self.estimator_.fit(_safe_indexing(X, target_class_indices)) + print(self.estimator_.cluster_centers_) X_new, y_new = self._generate_sample( _safe_indexing(X, target_class_indices), _safe_indexing(y, target_class_indices), From 32795b74a8ff82cc22c4c96e6dba78fd7389b04d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 8 Feb 2021 17:10:01 +0100 Subject: [PATCH 14/20] iter --- .../_prototype_generation/_cluster_centroids.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index 723a721ef..756e5ab83 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -167,17 +167,16 @@ def _fit_resample(self, X, y): X_resampled, y_resampled = [], [] for target_class in np.unique(y): + estimator = clone(self.estimator_) target_class_indices = np.flatnonzero(y == target_class) if target_class in self.sampling_strategy_.keys(): n_samples = self.sampling_strategy_[target_class] - print(n_samples) - self.estimator_.set_params(**{"n_clusters": n_samples}) - self.estimator_.fit(_safe_indexing(X, target_class_indices)) - print(self.estimator_.cluster_centers_) + estimator.set_params(**{"n_clusters": n_samples}) + estimator.fit(_safe_indexing(X, target_class_indices)) X_new, y_new = self._generate_sample( _safe_indexing(X, target_class_indices), _safe_indexing(y, target_class_indices), - self.estimator_.cluster_centers_, + estimator.cluster_centers_, target_class, ) X_resampled.append(X_new) From c1ccb46eab2f9b35caa6949a54fb48301d147783 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 8 Feb 2021 17:22:17 +0100 Subject: [PATCH 15/20] iter --- .../_prototype_generation/_cluster_centroids.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index 756e5ab83..d49ded789 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -167,16 +167,17 @@ def _fit_resample(self, X, y): X_resampled, y_resampled = [], [] for target_class in np.unique(y): - estimator = clone(self.estimator_) target_class_indices = np.flatnonzero(y == target_class) if target_class in self.sampling_strategy_.keys(): n_samples = self.sampling_strategy_[target_class] - estimator.set_params(**{"n_clusters": n_samples}) - estimator.fit(_safe_indexing(X, target_class_indices)) + self.estimator_.set_params(**{"n_clusters": n_samples}) + self.estimator_.fit(_safe_indexing(X, target_class_indices)) + print(target_class_indices) + print(_safe_indexing(X, target_class_indices)) X_new, y_new = self._generate_sample( _safe_indexing(X, target_class_indices), _safe_indexing(y, target_class_indices), - estimator.cluster_centers_, + self.estimator_.cluster_centers_, target_class, ) X_resampled.append(X_new) From b52d97c31619762899922f5e068dc3fdd9324ae2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 8 Feb 2021 17:55:58 +0100 Subject: [PATCH 16/20] debug --- .../_cluster_centroids.py | 2 - .../tests/test_cluster_centroids.py | 39 +++++++++++++++++++ imblearn/utils/estimator_checks.py | 5 --- 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index d49ded789..24afdf044 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -172,8 +172,6 @@ def _fit_resample(self, X, y): n_samples = self.sampling_strategy_[target_class] self.estimator_.set_params(**{"n_clusters": n_samples}) self.estimator_.fit(_safe_indexing(X, target_class_indices)) - print(target_class_indices) - print(_safe_indexing(X, target_class_indices)) X_new, y_new = self._generate_sample( _safe_indexing(X, target_class_indices), _safe_indexing(y, target_class_indices), diff --git a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py index aaffea261..91bb70dcc 100644 --- a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py @@ -156,3 +156,42 @@ def test_cluster_centroids_hard_target_class(): for minority_sample in X_minority_class ] assert sum(sample_from_minority_in_majority) == 0 + + +def test_xxx(): + # %% + from sklearn.datasets import make_classification + + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0, + ) + + # %% + import numpy as np + from scipy import sparse + from sklearn.utils import _safe_indexing + + target_class_indices = np.flatnonzero(y == 2) + X_class = _safe_indexing(X, target_class_indices) + X_class_sparse = sparse.csr_matrix(X_class) + + # %% + from sklearn.cluster import KMeans + + kmeans = KMeans(algorithm="full", random_state=0) + + # %% + from sklearn.base import clone + + kmeans_dense = clone(kmeans).fit(X_class) + kmeans_sparse = clone(kmeans).fit(X_class_sparse) + + # %% + np.testing.assert_allclose( + kmeans_dense.cluster_centers_, + kmeans_sparse.cluster_centers_ + ) \ No newline at end of file diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 436c568fd..f1bd02bc2 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -257,11 +257,6 @@ def check_samplers_sparse(name, sampler_orig): X_res_sparse, y_res_sparse = sampler.fit_resample(X_sparse, y) sampler = clone(sampler) X_res, y_res = sampler.fit_resample(X, y) - if not np.all(np.isclose(X_res_sparse.A, X_res)): - xx = np.nonzero(~np.isclose(X_res_sparse.A, X_res)) - print(xx) - print(X_res_sparse.A[xx[0]]) - print(X_res[xx[0]]) assert sparse.issparse(X_res_sparse) assert_allclose(X_res_sparse.A, X_res, rtol=1e-5) assert_allclose(y_res_sparse, y_res) From 301f7a840a0675923a0e6a4f8fd451b2afc14769 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 8 Feb 2021 17:58:10 +0100 Subject: [PATCH 17/20] iter --- .../_prototype_generation/tests/test_cluster_centroids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py index 91bb70dcc..4e071487d 100644 --- a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py @@ -194,4 +194,4 @@ def test_xxx(): np.testing.assert_allclose( kmeans_dense.cluster_centers_, kmeans_sparse.cluster_centers_ - ) \ No newline at end of file + ) From 4c7a9a35b1361d48e91eb263fd405ae90142745e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 8 Feb 2021 18:04:49 +0100 Subject: [PATCH 18/20] iter --- .../under_sampling/_prototype_generation/_cluster_centroids.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index 24afdf044..716a9e449 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -172,6 +172,8 @@ def _fit_resample(self, X, y): n_samples = self.sampling_strategy_[target_class] self.estimator_.set_params(**{"n_clusters": n_samples}) self.estimator_.fit(_safe_indexing(X, target_class_indices)) + print(target_class) + print(self.estimator_.cluster_centers_) X_new, y_new = self._generate_sample( _safe_indexing(X, target_class_indices), _safe_indexing(y, target_class_indices), From 6e57141603d2c4ef7ebbbe2bc9a28f17bafb0117 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 8 Feb 2021 18:17:59 +0100 Subject: [PATCH 19/20] iter --- .../_prototype_generation/tests/test_cluster_centroids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py index 4e071487d..58e895d48 100644 --- a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py @@ -182,7 +182,7 @@ def test_xxx(): # %% from sklearn.cluster import KMeans - kmeans = KMeans(algorithm="full", random_state=0) + kmeans = KMeans(n_clusters=201, algorithm="full", random_state=0) # %% from sklearn.base import clone From bff2e51f46dc9d64655e4560ca46945cb0ad4f60 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 8 Feb 2021 20:34:51 +0100 Subject: [PATCH 20/20] iter --- .../_cluster_centroids.py | 2 - .../tests/test_cluster_centroids.py | 39 ------------------- imblearn/utils/estimator_checks.py | 2 +- 3 files changed, 1 insertion(+), 42 deletions(-) diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index 716a9e449..24afdf044 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -172,8 +172,6 @@ def _fit_resample(self, X, y): n_samples = self.sampling_strategy_[target_class] self.estimator_.set_params(**{"n_clusters": n_samples}) self.estimator_.fit(_safe_indexing(X, target_class_indices)) - print(target_class) - print(self.estimator_.cluster_centers_) X_new, y_new = self._generate_sample( _safe_indexing(X, target_class_indices), _safe_indexing(y, target_class_indices), diff --git a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py index 58e895d48..aaffea261 100644 --- a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py @@ -156,42 +156,3 @@ def test_cluster_centroids_hard_target_class(): for minority_sample in X_minority_class ] assert sum(sample_from_minority_in_majority) == 0 - - -def test_xxx(): - # %% - from sklearn.datasets import make_classification - - X, y = make_classification( - n_samples=1000, - n_classes=3, - n_informative=4, - weights=[0.2, 0.3, 0.5], - random_state=0, - ) - - # %% - import numpy as np - from scipy import sparse - from sklearn.utils import _safe_indexing - - target_class_indices = np.flatnonzero(y == 2) - X_class = _safe_indexing(X, target_class_indices) - X_class_sparse = sparse.csr_matrix(X_class) - - # %% - from sklearn.cluster import KMeans - - kmeans = KMeans(n_clusters=201, algorithm="full", random_state=0) - - # %% - from sklearn.base import clone - - kmeans_dense = clone(kmeans).fit(X_class) - kmeans_sparse = clone(kmeans).fit(X_class_sparse) - - # %% - np.testing.assert_allclose( - kmeans_dense.cluster_centers_, - kmeans_sparse.cluster_centers_ - ) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index f1bd02bc2..6a3032ebf 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -44,7 +44,7 @@ def _set_checking_parameters(estimator): if name == "ClusterCentroids": estimator.set_params( voting="soft", - estimator=KMeans(random_state=0, algorithm="full"), + estimator=KMeans(random_state=0, algorithm="full", n_init=1), ) if name == "KMeansSMOTE": estimator.set_params(kmeans_estimator=12)