From 30700e3208e5adb929908f29e31d2925a43aa000 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Mon, 16 Sep 2024 12:29:04 +0200
Subject: [PATCH 1/3] remove mentions from single file

---
 docs/source/en/api/loaders/single_file.md | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/docs/source/en/api/loaders/single_file.md b/docs/source/en/api/loaders/single_file.md
index 380c8902153f..64ca02fd8387 100644
--- a/docs/source/en/api/loaders/single_file.md
+++ b/docs/source/en/api/loaders/single_file.md
@@ -22,9 +22,6 @@ The [`~loaders.FromSingleFileMixin.from_single_file`] method allows you to load:
 
 ## Supported pipelines
 
-- [`CogVideoXPipeline`]
-- [`CogVideoXImageToVideoPipeline`]
-- [`CogVideoXVideoToVideoPipeline`]
 - [`StableDiffusionPipeline`]
 - [`StableDiffusionImg2ImgPipeline`]
 - [`StableDiffusionInpaintPipeline`]
@@ -52,7 +49,6 @@ The [`~loaders.FromSingleFileMixin.from_single_file`] method allows you to load:
 - [`UNet2DConditionModel`]
 - [`StableCascadeUNet`]
 - [`AutoencoderKL`]
-- [`AutoencoderKLCogVideoX`]
 - [`ControlNetModel`]
 - [`SD3Transformer2DModel`]
 - [`FluxTransformer2DModel`]

From edd224627090643d15330dcba180ebf1e5e4df7c Mon Sep 17 00:00:00 2001
From: Aryan
Date: Mon, 16 Sep 2024 12:29:16 +0200
Subject: [PATCH 2/3] update tests

---
 tests/pipelines/cogvideo/test_cogvideox.py    | 17 +++++++++++------
 .../cogvideo/test_cogvideox_image2video.py    |  5 +++--
 .../cogvideo/test_cogvideox_video2video.py    | 17 +++++++++++------
 3 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/tests/pipelines/cogvideo/test_cogvideox.py b/tests/pipelines/cogvideo/test_cogvideox.py
index c69dcfda93c5..32adee4d4b77 100644
--- a/tests/pipelines/cogvideo/test_cogvideox.py
+++ b/tests/pipelines/cogvideo/test_cogvideox.py
@@ -57,6 +57,7 @@ class CogVideoXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
             "callback_on_step_end_tensor_inputs",
         ]
     )
+    test_xformers_attention = False
 
     def get_dummy_components(self):
         torch.manual_seed(0)
@@ -71,8 +72,8 @@ def get_dummy_components(self):
             time_embed_dim=2,
             text_embed_dim=32,  # Must match with tiny-random-t5
             num_layers=1,
-            sample_width=16,  # latent width: 2 -> final width: 16
-            sample_height=16,  # latent height: 2 -> final height: 16
+            sample_width=2,  # latent width: 2 -> final width: 16
+            sample_height=2,  # latent height: 2 -> final height: 16
             sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
             patch_size=2,
             temporal_compression_ratio=4,
@@ -254,6 +255,14 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
         generator_device = "cpu"
         components = self.get_dummy_components()
 
+        # Unlike the ImageToVideo test, this is not needed here because positional embeds can be generated on-the-fly
+        # due to them being static
+        # components["transformer"] = CogVideoXTransformer3DModel.from_config(
+        #     components["transformer"].config,
+        #     sample_height=16,
+        #     sample_width=16,
+        # )
+
         pipe = self.pipeline_class(**components)
         pipe.to("cpu")
         pipe.set_progress_bar_config(disable=None)
@@ -280,10 +289,6 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
             "VAE tiling should not affect the inference results",
         )
 
-    @unittest.skip("xformers attention processor does not exist for CogVideoX")
-    def test_xformers_attention_forwardGenerator_pass(self):
-        pass
-
     def test_fused_qkv_projections(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
         components = self.get_dummy_components()
diff --git a/tests/pipelines/cogvideo/test_cogvideox_image2video.py b/tests/pipelines/cogvideo/test_cogvideox_image2video.py
index 5948fc3deb1c..ec9a5fdd153e 100644
--- a/tests/pipelines/cogvideo/test_cogvideox_image2video.py
+++ b/tests/pipelines/cogvideo/test_cogvideox_image2video.py
@@ -269,8 +269,9 @@ def test_vae_tiling(self, expected_diff_max: float = 0.3):
         generator_device = "cpu"
         components = self.get_dummy_components()
 
-        # The reason to modify it this way is because I2V Transformer limits the generation to resolutions.
-        # See the if-statement on "self.use_learned_positional_embeddings"
+        # The transformer is recreated here because the I2V Transformer limits generation to the resolutions used during initialization.
+        # This limitation comes from using learned positional embeddings, which cannot be generated on-the-fly like sincos or RoPE embeddings.
+        # See the if-statement on "self.use_learned_positional_embeddings" in diffusers/models/embeddings.py
         components["transformer"] = CogVideoXTransformer3DModel.from_config(
             components["transformer"].config,
             sample_height=16,
diff --git a/tests/pipelines/cogvideo/test_cogvideox_video2video.py b/tests/pipelines/cogvideo/test_cogvideox_video2video.py
index 27f0c8441c55..16414f4745e3 100644
--- a/tests/pipelines/cogvideo/test_cogvideox_video2video.py
+++ b/tests/pipelines/cogvideo/test_cogvideox_video2video.py
@@ -51,6 +51,7 @@ class CogVideoXVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestC
             "callback_on_step_end_tensor_inputs",
         ]
     )
+    test_xformers_attention = False
 
     def get_dummy_components(self):
         torch.manual_seed(0)
@@ -65,8 +66,8 @@ def get_dummy_components(self):
             time_embed_dim=2,
             text_embed_dim=32,  # Must match with tiny-random-t5
             num_layers=1,
-            sample_width=16,  # latent width: 2 -> final width: 16
-            sample_height=16,  # latent height: 2 -> final height: 16
+            sample_width=2,  # latent width: 2 -> final width: 16
+            sample_height=2,  # latent height: 2 -> final height: 16
             sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
             patch_size=2,
             temporal_compression_ratio=4,
@@ -259,6 +260,14 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
         generator_device = "cpu"
         components = self.get_dummy_components()
 
+        # Unlike the ImageToVideo test, this is not needed here because positional embeds can be generated on-the-fly
+        # due to them being static
+        # components["transformer"] = CogVideoXTransformer3DModel.from_config(
+        #     components["transformer"].config,
+        #     sample_height=16,
+        #     sample_width=16,
+        # )
+
         pipe = self.pipeline_class(**components)
         pipe.to("cpu")
         pipe.set_progress_bar_config(disable=None)
@@ -285,10 +294,6 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
             "VAE tiling should not affect the inference results",
         )
 
-    @unittest.skip("xformers attention processor does not exist for CogVideoX")
-    def test_xformers_attention_forwardGenerator_pass(self):
-        pass
-
     def test_fused_qkv_projections(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
         components = self.get_dummy_components()

From d6a145dbdcc2632592788da5ff520d480a98f6d4 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Tue, 17 Sep 2024 00:42:12 +0200
Subject: [PATCH 3/3] update

---
 tests/pipelines/cogvideo/test_cogvideox.py             | 8 --------
 tests/pipelines/cogvideo/test_cogvideox_video2video.py | 8 --------
 2 files changed, 16 deletions(-)

diff --git a/tests/pipelines/cogvideo/test_cogvideox.py b/tests/pipelines/cogvideo/test_cogvideox.py
index 32adee4d4b77..884ddfb2a95a 100644
--- a/tests/pipelines/cogvideo/test_cogvideox.py
+++ b/tests/pipelines/cogvideo/test_cogvideox.py
@@ -255,14 +255,6 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
         generator_device = "cpu"
         components = self.get_dummy_components()
 
-        # Unlike the ImageToVideo test, this is not needed here because positional embeds can be generated on-the-fly
-        # due to them being static
-        # components["transformer"] = CogVideoXTransformer3DModel.from_config(
-        #     components["transformer"].config,
-        #     sample_height=16,
-        #     sample_width=16,
-        # )
-
         pipe = self.pipeline_class(**components)
         pipe.to("cpu")
         pipe.set_progress_bar_config(disable=None)
diff --git a/tests/pipelines/cogvideo/test_cogvideox_video2video.py b/tests/pipelines/cogvideo/test_cogvideox_video2video.py
index 16414f4745e3..4d836cb5e2a4 100644
--- a/tests/pipelines/cogvideo/test_cogvideox_video2video.py
+++ b/tests/pipelines/cogvideo/test_cogvideox_video2video.py
@@ -260,14 +260,6 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
         generator_device = "cpu"
         components = self.get_dummy_components()
 
-        # Unlike the ImageToVideo test, this is not needed here because positional embeds can be generated on-the-fly
-        # due to them being static
-        # components["transformer"] = CogVideoXTransformer3DModel.from_config(
-        #     components["transformer"].config,
-        #     sample_height=16,
-        #     sample_width=16,
-        # )
-
         pipe = self.pipeline_class(**components)
         pipe.to("cpu")
         pipe.set_progress_bar_config(disable=None)
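Note for reviewers: the test changes above hinge on the difference between learned positional embeddings, which are allocated when the transformer is initialized and are therefore tied to the sample_height/sample_width it was created with, and sincos/RoPE embeddings, which can be computed on-the-fly for any requested resolution. The Python sketch below is illustrative only and is not taken from the diffusers codebase; the function and class names are invented, and the only detail borrowed from the diff is the CogVideoXTransformer3DModel.from_config(..., sample_height=16, sample_width=16) call referenced in the final comment.

    import math

    import torch


    def sincos_positional_embedding(num_positions: int, dim: int) -> torch.Tensor:
        # Sinusoidal embeddings are a fixed function of position, so they can be
        # generated on-the-fly for any number of patches.
        position = torch.arange(num_positions, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2, dtype=torch.float32) * (-math.log(10000.0) / dim))
        embedding = torch.zeros(num_positions, dim)
        embedding[:, 0::2] = torch.sin(position * div_term)
        embedding[:, 1::2] = torch.cos(position * div_term)
        return embedding


    class LearnedPositionalEmbedding(torch.nn.Module):
        # A learned table is allocated at construction time, so it only supports
        # the number of positions it was created with.
        def __init__(self, num_positions: int, dim: int) -> None:
            super().__init__()
            self.table = torch.nn.Parameter(torch.zeros(num_positions, dim))

        def forward(self, num_positions: int) -> torch.Tensor:
            if num_positions != self.table.shape[0]:
                raise ValueError("requested resolution differs from the one used during initialization")
            return self.table


    # On-the-fly embeddings adapt to any patch count; the learned table does not.
    print(sincos_positional_embedding(64, 32).shape)   # torch.Size([64, 32])
    print(sincos_positional_embedding(256, 32).shape)  # torch.Size([256, 32])

    learned = LearnedPositionalEmbedding(num_positions=64, dim=32)
    print(learned(64).shape)  # torch.Size([64, 32])
    # learned(256) would raise, which is why the image-to-video tiling test rebuilds its transformer
    # with CogVideoXTransformer3DModel.from_config(..., sample_height=16, sample_width=16) before tiling.

This is also why only the image-to-video VAE tiling test needs the from_config rebuild, while the text-to-video and video-to-video tests can skip it; the commented-out reminder of that fact added in PATCH 2/3 is removed again in PATCH 3/3.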