Add single file loading support for AnimateDiff (#8819)

DN6 · web-flow · commit 11d18f3217a9 · 2024-07-12T09:51:57.000+05:30
* update

* update

* update

* update
diff --git a/docs/source/en/api/pipelines/animatediff.md b/docs/source/en/api/pipelines/animatediff.md
@@ -560,6 +560,20 @@ export_to_gif(frames, "animatelcm-motion-lora.gif")
 </table>
 
 
+## Using `from_single_file` with the MotionAdapter
+
+`diffusers>=0.30.0` supports loading the AnimateDiff checkpoints into the `MotionAdapter` in their original format via `from_single_file`
+
+```python
+from diffusers import MotionAdapter
+
+ckpt_path = "https://huggingface.co/Lightricks/LongAnimateDiff/blob/main/lt_long_mm_32_frames.ckpt"
+
+adapter = MotionAdapter.from_single_file(ckpt_path, torch_dtype=torch.float16)
+pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter)
+
+```
+
 ## AnimateDiffPipeline
 
 [[autodoc]] AnimateDiffPipeline
diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py
@@ -22,6 +22,7 @@
 from ..utils import deprecate, is_accelerate_available, logging
 from .single_file_utils import (
     SingleFileComponentError,
+    convert_animatediff_checkpoint_to_diffusers,
     convert_controlnet_checkpoint,
     convert_ldm_unet_checkpoint,
     convert_ldm_vae_checkpoint,
@@ -70,6 +71,9 @@
         "checkpoint_mapping_fn": convert_sd3_transformer_checkpoint_to_diffusers,
         "default_subfolder": "transformer",
     },
+    "MotionAdapter": {
+        "checkpoint_mapping_fn": convert_animatediff_checkpoint_to_diffusers,
+    },
 }
 
 
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
@@ -74,6 +74,9 @@
     "stable_cascade_stage_b": "down_blocks.1.0.channelwise.0.weight",
     "stable_cascade_stage_c": "clip_txt_mapper.weight",
     "sd3": "model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.bias",
+    "animatediff": "down_blocks.0.motion_modules.0.temporal_transformer.transformer_blocks.0.attention_blocks.1.pos_encoder.pe",
+    "animatediff_v2": "mid_block.motion_modules.0.temporal_transformer.norm.bias",
+    "animatediff_sdxl_beta": "up_blocks.2.motion_modules.0.temporal_transformer.norm.weight",
 }
 
 DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
@@ -103,6 +106,10 @@
     "sd3": {
         "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3-medium-diffusers",
     },
+    "animatediff_v1": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5"},
+    "animatediff_v2": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-2"},
+    "animatediff_v3": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-3"},
+    "animatediff_sdxl_beta": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-sdxl-beta"},
 }
 
 # Use to configure model sample size when original config is provided
@@ -485,6 +492,19 @@ def infer_diffusers_model_type(checkpoint):
     elif CHECKPOINT_KEY_NAMES["sd3"] in checkpoint:
         model_type = "sd3"
 
+    elif CHECKPOINT_KEY_NAMES["animatediff"] in checkpoint:
+        if CHECKPOINT_KEY_NAMES["animatediff_v2"] in checkpoint:
+            model_type = "animatediff_v2"
+
+        elif checkpoint[CHECKPOINT_KEY_NAMES["animatediff_sdxl_beta"]].shape[-1] == 320:
+            model_type = "animatediff_sdxl_beta"
+
+        elif checkpoint[CHECKPOINT_KEY_NAMES["animatediff"]].shape[1] == 24:
+            model_type = "animatediff_v1"
+
+        else:
+            model_type = "animatediff_v3"
+
     else:
         model_type = "v1"
 
@@ -1822,3 +1842,22 @@ def create_diffusers_t5_model_from_checkpoint(
                 param.data = param.data.to(torch.float32)
 
     return model
+
+
+def convert_animatediff_checkpoint_to_diffusers(checkpoint, **kwargs):
+    converted_state_dict = {}
+    for k, v in checkpoint.items():
+        if "pos_encoder" in k:
+            continue
+
+        else:
+            converted_state_dict[
+                k.replace(".norms.0", ".norm1")
+                .replace(".norms.1", ".norm2")
+                .replace(".ff_norm", ".norm3")
+                .replace(".attention_blocks.0", ".attn1")
+                .replace(".attention_blocks.1", ".attn2")
+                .replace(".temporal_transformer", "")
+            ] = v
+
+    return converted_state_dict
diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py
@@ -19,7 +19,7 @@
 import torch.utils.checkpoint
 
 from ...configuration_utils import ConfigMixin, FrozenDict, register_to_config
-from ...loaders import UNet2DConditionLoadersMixin
+from ...loaders import FromOriginalModelMixin, UNet2DConditionLoadersMixin
 from ...utils import logging
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
@@ -93,7 +93,7 @@ def __init__(
             )
 
 
-class MotionAdapter(ModelMixin, ConfigMixin):
+class MotionAdapter(ModelMixin, ConfigMixin, FromOriginalModelMixin):
     @register_to_config
     def __init__(
         self,
diff --git a/tests/single_file/test_model_motion_adapter_single_file.py b/tests/single_file/test_model_motion_adapter_single_file.py
@@ -0,0 +1,90 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from diffusers import (
+    MotionAdapter,
+)
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+)
+
+
+enable_full_determinism()
+
+
+class MotionAdapterSingleFileTests(unittest.TestCase):
+    model_class = MotionAdapter
+
+    def test_single_file_components_version_v1_5(self):
+        ckpt_path = "https://huggingface.co/guoyww/animatediff/blob/main/mm_sd_v15.ckpt"
+        repo_id = "guoyww/animatediff-motion-adapter-v1-5"
+
+        model = self.model_class.from_pretrained(repo_id)
+        model_single_file = self.model_class.from_single_file(ckpt_path)
+
+        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
+        for param_name, param_value in model_single_file.config.items():
+            if param_name in PARAMS_TO_IGNORE:
+                continue
+            assert (
+                model.config[param_name] == param_value
+            ), f"{param_name} differs between pretrained loading and single file loading"
+
+    def test_single_file_components_version_v1_5_2(self):
+        ckpt_path = "https://huggingface.co/guoyww/animatediff/blob/main/mm_sd_v15_v2.ckpt"
+        repo_id = "guoyww/animatediff-motion-adapter-v1-5-2"
+
+        model = self.model_class.from_pretrained(repo_id)
+        model_single_file = self.model_class.from_single_file(ckpt_path)
+
+        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
+        for param_name, param_value in model_single_file.config.items():
+            if param_name in PARAMS_TO_IGNORE:
+                continue
+            assert (
+                model.config[param_name] == param_value
+            ), f"{param_name} differs between pretrained loading and single file loading"
+
+    def test_single_file_components_version_v1_5_3(self):
+        ckpt_path = "https://huggingface.co/guoyww/animatediff/blob/main/v3_sd15_mm.ckpt"
+        repo_id = "guoyww/animatediff-motion-adapter-v1-5-3"
+
+        model = self.model_class.from_pretrained(repo_id)
+        model_single_file = self.model_class.from_single_file(ckpt_path)
+
+        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
+        for param_name, param_value in model_single_file.config.items():
+            if param_name in PARAMS_TO_IGNORE:
+                continue
+            assert (
+                model.config[param_name] == param_value
+            ), f"{param_name} differs between pretrained loading and single file loading"
+
+    def test_single_file_components_version_sdxl_beta(self):
+        ckpt_path = "https://huggingface.co/guoyww/animatediff/blob/main/mm_sdxl_v10_beta.ckpt"
+        repo_id = "guoyww/animatediff-motion-adapter-sdxl-beta"
+
+        model = self.model_class.from_pretrained(repo_id)
+        model_single_file = self.model_class.from_single_file(ckpt_path)
+
+        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
+        for param_name, param_value in model_single_file.config.items():
+            if param_name in PARAMS_TO_IGNORE:
+                continue
+            assert (
+                model.config[param_name] == param_value
+            ), f"{param_name} differs between pretrained loading and single file loading"