From 30700e3208e5adb929908f29e31d2925a43aa000 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Mon, 16 Sep 2024 12:29:04 +0200
Subject: [PATCH 1/3] remove mentions from single file

---
 docs/source/en/api/loaders/single_file.md | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/docs/source/en/api/loaders/single_file.md b/docs/source/en/api/loaders/single_file.md
index 380c8902153f..64ca02fd8387 100644
--- a/docs/source/en/api/loaders/single_file.md
+++ b/docs/source/en/api/loaders/single_file.md
@@ -22,9 +22,6 @@ The [`~loaders.FromSingleFileMixin.from_single_file`] method allows you to load:
 
 ## Supported pipelines
 
-- [`CogVideoXPipeline`]
-- [`CogVideoXImageToVideoPipeline`]
-- [`CogVideoXVideoToVideoPipeline`]
 - [`StableDiffusionPipeline`]
 - [`StableDiffusionImg2ImgPipeline`]
 - [`StableDiffusionInpaintPipeline`]
@@ -52,7 +49,6 @@ The [`~loaders.FromSingleFileMixin.from_single_file`] method allows you to load:
 - [`UNet2DConditionModel`]
 - [`StableCascadeUNet`]
 - [`AutoencoderKL`]
-- [`AutoencoderKLCogVideoX`]
 - [`ControlNetModel`]
 - [`SD3Transformer2DModel`]
 - [`FluxTransformer2DModel`]

From edd224627090643d15330dcba180ebf1e5e4df7c Mon Sep 17 00:00:00 2001
From: Aryan
Date: Mon, 16 Sep 2024 12:29:16 +0200
Subject: [PATCH 2/3] update tests

---
 tests/pipelines/cogvideo/test_cogvideox.py    | 17 +++++++++++------
 .../cogvideo/test_cogvideox_image2video.py    |  5 +++--
 .../cogvideo/test_cogvideox_video2video.py    | 17 +++++++++++------
 3 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/tests/pipelines/cogvideo/test_cogvideox.py b/tests/pipelines/cogvideo/test_cogvideox.py
index c69dcfda93c5..32adee4d4b77 100644
--- a/tests/pipelines/cogvideo/test_cogvideox.py
+++ b/tests/pipelines/cogvideo/test_cogvideox.py
@@ -57,6 +57,7 @@ class CogVideoXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
             "callback_on_step_end_tensor_inputs",
         ]
     )
+    test_xformers_attention = False
 
     def get_dummy_components(self):
         torch.manual_seed(0)
@@ -71,8 +72,8 @@ def get_dummy_components(self):
             time_embed_dim=2,
             text_embed_dim=32,  # Must match with tiny-random-t5
             num_layers=1,
-            sample_width=16,  # latent width: 2 -> final width: 16
-            sample_height=16,  # latent height: 2 -> final height: 16
+            sample_width=2,  # latent width: 2 -> final width: 16
+            sample_height=2,  # latent height: 2 -> final height: 16
             sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
             patch_size=2,
             temporal_compression_ratio=4,
@@ -254,6 +255,14 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
         generator_device = "cpu"
         components = self.get_dummy_components()
 
+        # Unlike the ImageToVideo test, this is not needed here because positional embeds can be generated on-the-fly
+        # due to them being static
+        # components["transformer"] = CogVideoXTransformer3DModel.from_config(
+        #     components["transformer"].config,
+        #     sample_height=16,
+        #     sample_width=16,
+        # )
+
         pipe = self.pipeline_class(**components)
         pipe.to("cpu")
         pipe.set_progress_bar_config(disable=None)
@@ -280,10 +289,6 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
             "VAE tiling should not affect the inference results",
         )
 
-    @unittest.skip("xformers attention processor does not exist for CogVideoX")
-    def test_xformers_attention_forwardGenerator_pass(self):
-        pass
-
     def test_fused_qkv_projections(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
         components = self.get_dummy_components()
diff --git a/tests/pipelines/cogvideo/test_cogvideox_image2video.py b/tests/pipelines/cogvideo/test_cogvideox_image2video.py
index 5948fc3deb1c..ec9a5fdd153e 100644
--- a/tests/pipelines/cogvideo/test_cogvideox_image2video.py
+++ b/tests/pipelines/cogvideo/test_cogvideox_image2video.py
@@ -269,8 +269,9 @@ def test_vae_tiling(self, expected_diff_max: float = 0.3):
         generator_device = "cpu"
         components = self.get_dummy_components()
 
-        # The reason to modify it this way is because I2V Transformer limits the generation to resolutions.
-        # See the if-statement on "self.use_learned_positional_embeddings"
+        # The transformer is recreated here because the I2V Transformer limits generation to the resolutions used during initialization.
+        # This limitation comes from using learned positional embeddings, which cannot be generated on-the-fly like sincos or RoPE embeddings.
+        # See the if-statement on "self.use_learned_positional_embeddings" in diffusers/models/embeddings.py
         components["transformer"] = CogVideoXTransformer3DModel.from_config(
             components["transformer"].config,
             sample_height=16,
diff --git a/tests/pipelines/cogvideo/test_cogvideox_video2video.py b/tests/pipelines/cogvideo/test_cogvideox_video2video.py
index 27f0c8441c55..16414f4745e3 100644
--- a/tests/pipelines/cogvideo/test_cogvideox_video2video.py
+++ b/tests/pipelines/cogvideo/test_cogvideox_video2video.py
@@ -51,6 +51,7 @@ class CogVideoXVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestC
             "callback_on_step_end_tensor_inputs",
         ]
     )
+    test_xformers_attention = False
 
     def get_dummy_components(self):
         torch.manual_seed(0)
@@ -65,8 +66,8 @@ def get_dummy_components(self):
             time_embed_dim=2,
             text_embed_dim=32,  # Must match with tiny-random-t5
             num_layers=1,
-            sample_width=16,  # latent width: 2 -> final width: 16
-            sample_height=16,  # latent height: 2 -> final height: 16
+            sample_width=2,  # latent width: 2 -> final width: 16
+            sample_height=2,  # latent height: 2 -> final height: 16
             sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
             patch_size=2,
             temporal_compression_ratio=4,
@@ -259,6 +260,14 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
         generator_device = "cpu"
         components = self.get_dummy_components()
 
+        # Unlike the ImageToVideo test, this is not needed here because positional embeds can be generated on-the-fly
+        # due to them being static
+        # components["transformer"] = CogVideoXTransformer3DModel.from_config(
+        #     components["transformer"].config,
+        #     sample_height=16,
+        #     sample_width=16,
+        # )
+
         pipe = self.pipeline_class(**components)
         pipe.to("cpu")
         pipe.set_progress_bar_config(disable=None)
@@ -285,10 +294,6 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
             "VAE tiling should not affect the inference results",
         )
 
-    @unittest.skip("xformers attention processor does not exist for CogVideoX")
-    def test_xformers_attention_forwardGenerator_pass(self):
-        pass
-
     def test_fused_qkv_projections(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
         components = self.get_dummy_components()

From d6a145dbdcc2632592788da5ff520d480a98f6d4 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Tue, 17 Sep 2024 00:42:12 +0200
Subject: [PATCH 3/3] update

---
 tests/pipelines/cogvideo/test_cogvideox.py             | 8 --------
 tests/pipelines/cogvideo/test_cogvideox_video2video.py | 8 --------
 2 files changed, 16 deletions(-)

diff --git a/tests/pipelines/cogvideo/test_cogvideox.py b/tests/pipelines/cogvideo/test_cogvideox.py
index 32adee4d4b77..884ddfb2a95a 100644
--- a/tests/pipelines/cogvideo/test_cogvideox.py
+++ b/tests/pipelines/cogvideo/test_cogvideox.py
@@ -255,14 +255,6 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
         generator_device = "cpu"
         components = self.get_dummy_components()
 
-        # Unlike the ImageToVideo test, this is not needed here because positional embeds can be generated on-the-fly
-        # due to them being static
-        # components["transformer"] = CogVideoXTransformer3DModel.from_config(
-        #     components["transformer"].config,
-        #     sample_height=16,
-        #     sample_width=16,
-        # )
-
         pipe = self.pipeline_class(**components)
         pipe.to("cpu")
         pipe.set_progress_bar_config(disable=None)
diff --git a/tests/pipelines/cogvideo/test_cogvideox_video2video.py b/tests/pipelines/cogvideo/test_cogvideox_video2video.py
index 16414f4745e3..4d836cb5e2a4 100644
--- a/tests/pipelines/cogvideo/test_cogvideox_video2video.py
+++ b/tests/pipelines/cogvideo/test_cogvideox_video2video.py
@@ -260,14 +260,6 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
         generator_device = "cpu"
         components = self.get_dummy_components()
 
-        # Unlike the ImageToVideo test, this is not needed here because positional embeds can be generated on-the-fly
-        # due to them being static
-        # components["transformer"] = CogVideoXTransformer3DModel.from_config(
-        #     components["transformer"].config,
-        #     sample_height=16,
-        #     sample_width=16,
-        # )
-
         pipe = self.pipeline_class(**components)
         pipe.to("cpu")
         pipe.set_progress_bar_config(disable=None)
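Note for reviewers: the test changes above hinge on the difference between learned positional embeddings, which are allocated when the transformer is initialized and are therefore tied to the sample_height/sample_width it was created with, and sincos/RoPE embeddings, which can be computed on-the-fly for any requested resolution. The Python sketch below is illustrative only and is not taken from the diffusers codebase; the function and class names are invented, and the only detail borrowed from the diff is the CogVideoXTransformer3DModel.from_config(..., sample_height=16, sample_width=16) call referenced in the final comment.

    import math

    import torch


    def sincos_positional_embedding(num_positions: int, dim: int) -> torch.Tensor:
        # Sinusoidal embeddings are a fixed function of position, so they can be
        # generated on-the-fly for any number of patches.
        position = torch.arange(num_positions, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2, dtype=torch.float32) * (-math.log(10000.0) / dim))
        embedding = torch.zeros(num_positions, dim)
        embedding[:, 0::2] = torch.sin(position * div_term)
        embedding[:, 1::2] = torch.cos(position * div_term)
        return embedding


    class LearnedPositionalEmbedding(torch.nn.Module):
        # A learned table is allocated at construction time, so it only supports
        # the number of positions it was created with.
        def __init__(self, num_positions: int, dim: int) -> None:
            super().__init__()
            self.table = torch.nn.Parameter(torch.zeros(num_positions, dim))

        def forward(self, num_positions: int) -> torch.Tensor:
            if num_positions != self.table.shape[0]:
                raise ValueError("requested resolution differs from the one used during initialization")
            return self.table


    # On-the-fly embeddings adapt to any patch count; the learned table does not.
    print(sincos_positional_embedding(64, 32).shape)   # torch.Size([64, 32])
    print(sincos_positional_embedding(256, 32).shape)  # torch.Size([256, 32])

    learned = LearnedPositionalEmbedding(num_positions=64, dim=32)
    print(learned(64).shape)  # torch.Size([64, 32])
    # learned(256) would raise, which is why the image-to-video tiling test rebuilds its transformer
    # with CogVideoXTransformer3DModel.from_config(..., sample_height=16, sample_width=16) before tiling.

This is also why only the image-to-video VAE tiling test needs the from_config rebuild, while the text-to-video and video-to-video tests can skip it; the commented-out reminder of that fact added in PATCH 2/3 is removed again in PATCH 3/3.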