From 7e3987cbd67f03cea7554742f41b3bf78e54b1b4 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Fri, 5 Aug 2022 16:32:23 +0530 Subject: [PATCH 01/20] begin text2img conversion script --- ...xt2img_original_checkpoint_to_diffusers.py | 423 ++++++++++++++++++ 1 file changed, 423 insertions(+) create mode 100644 scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py new file mode 100644 index 000000000000..d83f7a69602c --- /dev/null +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -0,0 +1,423 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Conversion script for the LDM checkpoints. """ + +import argparse +import json +import torch + +try: + import OmegaConf +except ImportError: + raise ImportError("OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`.") + +from diffusers import VQModel, DDPMScheduler, UNet2DModel, LDMPipeline + + +def shave_segments(path, n_shave_prefix_segments=1): + """ + Removes segments. Positive values shave the first segments, negative shave the last segments. 
+ """ + if n_shave_prefix_segments >= 0: + return '.'.join(path.split('.')[n_shave_prefix_segments:]) + else: + return '.'.join(path.split('.')[:n_shave_prefix_segments]) + + +def renew_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item.replace('in_layers.0', 'norm1') + new_item = new_item.replace('in_layers.2', 'conv1') + + new_item = new_item.replace('out_layers.0', 'norm2') + new_item = new_item.replace('out_layers.3', 'conv2') + + new_item = new_item.replace('emb_layers.1', 'time_emb_proj') + new_item = new_item.replace('skip_connection', 'conv_shortcut') + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({'old': old_item, 'new': new_item}) + + return mapping + + +def renew_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace('norm.weight', 'group_norm.weight') + new_item = new_item.replace('norm.bias', 'group_norm.bias') + + new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') + new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({'old': old_item, 'new': new_item}) + + return mapping + + +def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None): + """ + This does the final conversion step: take locally converted weights and apply a global renaming + to them. It splits attention layers, and takes into account additional replacements + that may arise. + + Assigns the weights to the new checkpoint. + """ + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." + + # Splits the attention layers into three variables. 
+ if attention_paths_to_split is not None: + for path, path_map in attention_paths_to_split.items(): + old_tensor = old_checkpoint[path] + channels = old_tensor.shape[0] // 3 + + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) + + num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 + + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) + query, key, value = old_tensor.split(channels // num_heads, dim=1) + + checkpoint[path_map['query']] = query.reshape(target_shape) + checkpoint[path_map['key']] = key.reshape(target_shape) + checkpoint[path_map['value']] = value.reshape(target_shape) + + for path in paths: + new_path = path['new'] + + # These have already been assigned + if attention_paths_to_split is not None and new_path in attention_paths_to_split: + continue + + # Global renaming happens here + new_path = new_path.replace('middle_block.0', 'mid.resnets.0') + new_path = new_path.replace('middle_block.1', 'mid.attentions.0') + new_path = new_path.replace('middle_block.2', 'mid.resnets.1') + + if additional_replacements is not None: + for replacement in additional_replacements: + new_path = new_path.replace(replacement['old'], replacement['new']) + + # proj_attn.weight has to be converted from conv 1D to linear + if "proj_attn.weight" in new_path: + checkpoint[new_path] = old_checkpoint[path['old']][:, :, 0] + else: + checkpoint[new_path] = old_checkpoint[path['old']] + + +def convert_ldm_unet_checkpoint(checkpoint, config): + """ + Takes a state dict and a config, and returns a converted checkpoint. + """ + + # extract state_dict for UNet + unet_state_dict = {} + unet_key = "model.diffusion_model." + keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(unet_key): + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) + + new_checkpoint = {} + + new_checkpoint['time_embedding.linear_1.weight'] = unet_state_dict['time_embed.0.weight'] + new_checkpoint['time_embedding.linear_1.bias'] = unet_state_dict['time_embed.0.bias'] + new_checkpoint['time_embedding.linear_2.weight'] = unet_state_dict['time_embed.2.weight'] + new_checkpoint['time_embedding.linear_2.bias'] = unet_state_dict['time_embed.2.bias'] + + new_checkpoint['conv_in.weight'] = unet_state_dict['input_blocks.0.0.weight'] + new_checkpoint['conv_in.bias'] = unet_state_dict['input_blocks.0.0.bias'] + + new_checkpoint['conv_norm_out.weight'] = unet_state_dict['out.0.weight'] + new_checkpoint['conv_norm_out.bias'] = unet_state_dict['out.0.bias'] + new_checkpoint['conv_out.weight'] = unet_state_dict['out.2.weight'] + new_checkpoint['conv_out.bias'] = unet_state_dict['out.2.bias'] + + # Retrieves the keys for the input blocks only + num_input_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'input_blocks' in layer}) + input_blocks = {layer_id: [key for key in unet_state_dict if f'input_blocks.{layer_id}' in key] for layer_id in range(num_input_blocks)} + + # Retrieves the keys for the middle blocks only + num_middle_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'middle_block' in layer}) + middle_blocks = {layer_id: [key for key in unet_state_dict if f'middle_block.{layer_id}' in key] for layer_id in range(num_middle_blocks)} + + # Retrieves the keys for the output blocks only + num_output_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'output_blocks' in layer}) + output_blocks = {layer_id: [key for key in unet_state_dict if 
f'output_blocks.{layer_id}' in key] for layer_id in range(num_output_blocks)} + + for i in range(1, num_input_blocks): + block_id = (i - 1) // (config['num_res_blocks'] + 1) + layer_in_block_id = (i - 1) % (config['num_res_blocks'] + 1) + + resnets = [key for key in input_blocks[i] if f'input_blocks.{i}.0' in key] + attentions = [key for key in input_blocks[i] if f'input_blocks.{i}.1' in key] + + if f'input_blocks.{i}.0.op.weight' in unet_state_dict: + new_checkpoint[f'downsample_blocks.{block_id}.downsamplers.0.conv.weight'] = unet_state_dict[f'input_blocks.{i}.0.op.weight'] + new_checkpoint[f'downsample_blocks.{block_id}.downsamplers.0.conv.bias'] = unet_state_dict[f'input_blocks.{i}.0.op.bias'] + + paths = renew_resnet_paths(resnets) + meta_path = {'old': f'input_blocks.{i}.0', 'new': f'downsample_blocks.{block_id}.resnets.{layer_in_block_id}'} + resnet_op = {'old': 'resnets.2.op', 'new': 'downsamplers.0.op'} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path, resnet_op], config=config) + + if len(attentions): + paths = renew_attention_paths(attentions) + meta_path = {'old': f'input_blocks.{i}.1', 'new': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}'} + to_split = { + f'input_blocks.{i}.1.qkv.bias': { + 'key': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias', + 'query': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias', + 'value': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias', + }, + f'input_blocks.{i}.1.qkv.weight': { + 'key': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight', + 'query': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight', + 'value': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight', + }, + } + assign_to_checkpoint( + paths, + new_checkpoint, + unet_state_dict, + additional_replacements=[meta_path], + attention_paths_to_split=to_split, + config=config + ) + + resnet_0 = middle_blocks[0] + attentions = middle_blocks[1] + resnet_1 = middle_blocks[2] + + resnet_0_paths = renew_resnet_paths(resnet_0) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + + resnet_1_paths = renew_resnet_paths(resnet_1) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + + attentions_paths = renew_attention_paths(attentions) + to_split = { + 'middle_block.1.qkv.bias': { + 'key': 'mid_block.attentions.0.key.bias', + 'query': 'mid_block.attentions.0.query.bias', + 'value': 'mid_block.attentions.0.value.bias', + }, + 'middle_block.1.qkv.weight': { + 'key': 'mid_block.attentions.0.key.weight', + 'query': 'mid_block.attentions.0.query.weight', + 'value': 'mid_block.attentions.0.value.weight', + }, + } + assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, attention_paths_to_split=to_split, config=config) + + for i in range(num_output_blocks): + block_id = i // (config['num_res_blocks'] + 1) + layer_in_block_id = i % (config['num_res_blocks'] + 1) + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] + output_block_list = {} + + for layer in output_block_layers: + layer_id, layer_name = layer.split('.')[0], shave_segments(layer, 1) + if layer_id in output_block_list: + output_block_list[layer_id].append(layer_name) + else: + output_block_list[layer_id] = [layer_name] + + if len(output_block_list) > 1: + resnets = [key for key in output_blocks[i] if f'output_blocks.{i}.0' in 
key] + attentions = [key for key in output_blocks[i] if f'output_blocks.{i}.1' in key] + + resnet_0_paths = renew_resnet_paths(resnets) + paths = renew_resnet_paths(resnets) + + meta_path = {'old': f'output_blocks.{i}.0', 'new': f'up_blocks.{block_id}.resnets.{layer_in_block_id}'} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + + if ['conv.weight', 'conv.bias'] in output_block_list.values(): + index = list(output_block_list.values()).index(['conv.weight', 'conv.bias']) + new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.weight'] = unet_state_dict[f'output_blocks.{i}.{index}.conv.weight'] + new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.bias'] = unet_state_dict[f'output_blocks.{i}.{index}.conv.bias'] + + # Clear attentions as they have been attributed above. + if len(attentions) == 2: + attentions = [] + + if len(attentions): + paths = renew_attention_paths(attentions) + meta_path = { + 'old': f'output_blocks.{i}.1', + 'new': f'up_blocks.{block_id}.attentions.{layer_in_block_id}' + } + to_split = { + f'output_blocks.{i}.1.qkv.bias': { + 'key': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias', + 'query': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias', + 'value': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias', + }, + f'output_blocks.{i}.1.qkv.weight': { + 'key': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight', + 'query': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight', + 'value': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight', + }, + } + assign_to_checkpoint( + paths, + new_checkpoint, + unet_state_dict, + additional_replacements=[meta_path], + attention_paths_to_split=to_split if any('qkv' in key for key in attentions) else None, + config=config, + ) + else: + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) + for path in resnet_0_paths: + old_path = '.'.join(['output_blocks', str(i), path['old']]) + new_path = '.'.join(['up_blocks', str(block_id), 'resnets', str(layer_in_block_id), path['new']]) + + new_checkpoint[new_path] = unet_state_dict[old_path] + + return new_checkpoint + + +def convert_ldm_bert_checkpoint(checkpoint, config): + def _copy_attn_layer(hf_attn_layer, pt_attn_layer): + + hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight + hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight + hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight + + hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight + hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias + + + def _copy_linear(hf_linear, pt_linear): + hf_linear.weight = pt_linear.weight + hf_linear.bias = pt_linear.bias + + def _copy_mlp(hf_mlp, pt_mlp): + _copy_linear(hf_mlp.fc1, pt_mlp.net[0][0]) + _copy_linear(hf_mlp.fc2, pt_mlp.net[2]) + + + def _copy_layer(hf_layer, pt_layer): + # copy layer norms + _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0]) + _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0]) + + # copy attn + _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1]) + + # copy MLP + pt_mlp = pt_layer[1][1] + _copy_linear(hf_layer.fc1, pt_mlp.net[0][0]) + _copy_linear(hf_layer.fc2, pt_mlp.net[2]) + + + def _copy_layers(hf_layers, pt_layers): + for i, hf_layer in enumerate(hf_layers): + if i != 0: i += i + pt_layer = pt_layers[i:i+2] + _copy_layer(hf_layer, pt_layer) + + hf_model = LDMBertModel(config).eval() + + # copy embeds + 
hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight + hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight + + # copy layer norm + _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm) + + # copy hidden layers + _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers) + + _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits) + + return hf_model + + +def convert_vae_checkpoint(checkpoint, config): + pass + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." + ) + + parser.add_argument( + "--original_config_file", + default=None, + type=str, + required=True, + help="The YAML config file corresponding to the original architecture.", + ) + + parser.add_argument( + "--config_file", + default=None, + type=str, + required=True, + help="The config json file corresponding to the architecture.", + ) + + parser.add_argument( + "--ldm_bert_config_file", + default=None, + type=str, + required=False, + help="The config json file corresponding to the LDMBert architecture.", + ) + + parser.add_argument( + "--dump_path", default=None, type=str, required=True, help="Path to the output model." + ) + + args = parser.parse_args() + + checkpoint = torch.load(args.checkpoint_path) + + with open(args.config_file) as f: + config = json.loads(f.read()) + + converted_checkpoint = convert_ldm_checkpoint(checkpoint, config) + + if "ldm" in config: + del config["ldm"] + + model = UNet2DModel(**config) + model.load_state_dict(converted_checkpoint) + + try: + scheduler = DDPMScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1])) + vqvae = VQModel.from_pretrained("/".join(args.checkpoint_path.split("/")[:-1])) + + pipe = LDMPipeline(unet=model, scheduler=scheduler, vae=vqvae) + pipe.save_pretrained(args.dump_path) + except: + model.save_pretrained(args.dump_path) From 21f4d22e1aef80e5f8203343a5f063b7b0a44dde Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Fri, 5 Aug 2022 17:13:05 +0530 Subject: [PATCH 02/20] add fn to convert config --- ...convert_ldm_txt2img_original_checkpoint_to_diffusers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index d83f7a69602c..7723e4382d15 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -129,6 +129,13 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s checkpoint[new_path] = old_checkpoint[path['old']] +def create_unet_diffusers_config(config): + """ + Creates a config for the diffusers based on the config of the LDM model. + """ + unet_config = {} + + def convert_ldm_unet_checkpoint(checkpoint, config): """ Takes a state dict and a config, and returns a converted checkpoint. 
From ee2e6791d1677b82a81ca465cad750abb7e456d8 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Fri, 5 Aug 2022 22:40:04 +0530 Subject: [PATCH 03/20] create config if not provided --- ...xt2img_original_checkpoint_to_diffusers.py | 41 ++++++++++++++++--- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 7723e4382d15..602d5bf4be70 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -129,11 +129,37 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s checkpoint[new_path] = old_checkpoint[path['old']] -def create_unet_diffusers_config(config): +def create_unet_diffusers_config(original_config): """ Creates a config for the diffusers based on the config of the LDM model. """ - unet_config = {} + unet_params = config.model.params.unet_config.params + + block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + + down_block_types = [] + for i in range(len(block_out_channels)): + block_type = "CrossAttnDownBlock2D" if i < len(block_out_channels) - 1 else "DownBlock2D" + down_block_types.append(block_type) + + up_block_types = [] + for i in range(len(block_out_channels)): + block_type = "UpBlock2D" if i == 0 else "CrossAttnUpBlock2D" + up_block_types.append(block_type) + + config = dict( + sample_size=unet_params.image_size, + in_channels=unet_params.in_channels, + out_channels=unet_params.out_channels, + down_block_types=tuple(down_block_types), + up_block_types=tuple(up_block_types), + block_out_channels=tuple(block_out_channels), + layers_per_block=unet_params.num_res_blocks, + cross_attention_dim=unet_params.context_dim, + attention_head_dim=unet_params.num_heads, + ) + + return config def convert_ldm_unet_checkpoint(checkpoint, config): @@ -407,12 +433,17 @@ def convert_vae_checkpoint(checkpoint, config): args = parser.parse_args() + original_config = OmegaConf.load(args.original_config_file) + checkpoint = torch.load(args.checkpoint_path) - with open(args.config_file) as f: - config = json.loads(f.read()) + if args.config_file is not None: + with open(args.config_file) as f: + config = json.loads(f.read()) + else: + config = create_unet_diffusers_config(original_config) - converted_checkpoint = convert_ldm_checkpoint(checkpoint, config) + converted_checkpoint = convert_ldm_unet_checkpoint(checkpoint, config) if "ldm" in config: del config["ldm"] From a717a82d4c28d170d2bee9bbfd613dd1bb69ba5b Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Fri, 5 Aug 2022 22:41:58 +0530 Subject: [PATCH 04/20] update imports and use UNet2DConditionModel --- .../convert_ldm_txt2img_original_checkpoint_to_diffusers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 602d5bf4be70..7da9b639e144 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -23,7 +23,7 @@ except ImportError: raise ImportError("OmegaConf is required to convert the LDM checkpoints. 
Please install it with `pip install OmegaConf`.") -from diffusers import VQModel, DDPMScheduler, UNet2DModel, LDMPipeline +from diffusers import VQModel, DDPMScheduler, LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel def shave_segments(path, n_shave_prefix_segments=1): @@ -448,7 +448,7 @@ def convert_vae_checkpoint(checkpoint, config): if "ldm" in config: del config["ldm"] - model = UNet2DModel(**config) + model = UNet2DConditionModel(**config) model.load_state_dict(converted_checkpoint) try: From b23326b15019ca4cde232642cbad76985bb3da11 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Sat, 6 Aug 2022 13:14:10 +0530 Subject: [PATCH 05/20] fix imports, layer names --- ...t_ldm_txt2img_original_checkpoint_to_diffusers.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 7da9b639e144..96f0e65764f4 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -19,7 +19,7 @@ import torch try: - import OmegaConf + from omegaconf import OmegaConf except ImportError: raise ImportError("OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`.") @@ -133,7 +133,7 @@ def create_unet_diffusers_config(original_config): """ Creates a config for the diffusers based on the config of the LDM model. """ - unet_params = config.model.params.unet_config.params + unet_params = original_config.model.params.unet_config.params block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] @@ -203,8 +203,8 @@ def convert_ldm_unet_checkpoint(checkpoint, config): output_blocks = {layer_id: [key for key in unet_state_dict if f'output_blocks.{layer_id}' in key] for layer_id in range(num_output_blocks)} for i in range(1, num_input_blocks): - block_id = (i - 1) // (config['num_res_blocks'] + 1) - layer_in_block_id = (i - 1) % (config['num_res_blocks'] + 1) + block_id = (i - 1) // (config['layers_per_block'] + 1) + layer_in_block_id = (i - 1) % (config['layers_per_block'] + 1) resnets = [key for key in input_blocks[i] if f'input_blocks.{i}.0' in key] attentions = [key for key in input_blocks[i] if f'input_blocks.{i}.1' in key] @@ -268,8 +268,8 @@ def convert_ldm_unet_checkpoint(checkpoint, config): assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, attention_paths_to_split=to_split, config=config) for i in range(num_output_blocks): - block_id = i // (config['num_res_blocks'] + 1) - layer_in_block_id = i % (config['num_res_blocks'] + 1) + block_id = i // (config['layers_per_block'] + 1) + layer_in_block_id = i % (config['layers_per_block'] + 1) output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] output_block_list = {} From cbd751f5546a0b859386b71ec46e7e22099df312 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Sun, 7 Aug 2022 22:59:49 +0530 Subject: [PATCH 06/20] fix unet coversion --- ...xt2img_original_checkpoint_to_diffusers.py | 87 ++++--------------- 1 file changed, 19 insertions(+), 68 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 96f0e65764f4..f499dfce0aea 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -66,13 +66,13 @@ 
def renew_attention_paths(old_list, n_shave_prefix_segments=0): for old_item in old_list: new_item = old_item - new_item = new_item.replace('norm.weight', 'group_norm.weight') - new_item = new_item.replace('norm.bias', 'group_norm.bias') +# new_item = new_item.replace('norm.weight', 'group_norm.weight') +# new_item = new_item.replace('norm.bias', 'group_norm.bias') - new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') - new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') +# new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') +# new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) +# new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({'old': old_item, 'new': new_item}) @@ -114,9 +114,9 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s continue # Global renaming happens here - new_path = new_path.replace('middle_block.0', 'mid.resnets.0') - new_path = new_path.replace('middle_block.1', 'mid.attentions.0') - new_path = new_path.replace('middle_block.2', 'mid.resnets.1') + new_path = new_path.replace('middle_block.0', 'mid_block.resnets.0') + new_path = new_path.replace('middle_block.1', 'mid_block.attentions.0') + new_path = new_path.replace('middle_block.2', 'mid_block.resnets.1') if additional_replacements is not None: for replacement in additional_replacements: @@ -206,41 +206,22 @@ def convert_ldm_unet_checkpoint(checkpoint, config): block_id = (i - 1) // (config['layers_per_block'] + 1) layer_in_block_id = (i - 1) % (config['layers_per_block'] + 1) - resnets = [key for key in input_blocks[i] if f'input_blocks.{i}.0' in key] + resnets = [key for key in input_blocks[i] if f'input_blocks.{i}.0' in key and f'input_blocks.{i}.0.op' not in key] attentions = [key for key in input_blocks[i] if f'input_blocks.{i}.1' in key] if f'input_blocks.{i}.0.op.weight' in unet_state_dict: - new_checkpoint[f'downsample_blocks.{block_id}.downsamplers.0.conv.weight'] = unet_state_dict[f'input_blocks.{i}.0.op.weight'] - new_checkpoint[f'downsample_blocks.{block_id}.downsamplers.0.conv.bias'] = unet_state_dict[f'input_blocks.{i}.0.op.bias'] + new_checkpoint[f'down_blocks.{block_id}.downsamplers.0.conv.weight'] = unet_state_dict.pop(f'input_blocks.{i}.0.op.weight') + new_checkpoint[f'down_blocks.{block_id}.downsamplers.0.conv.bias'] = unet_state_dict.pop(f'input_blocks.{i}.0.op.bias') paths = renew_resnet_paths(resnets) - meta_path = {'old': f'input_blocks.{i}.0', 'new': f'downsample_blocks.{block_id}.resnets.{layer_in_block_id}'} - resnet_op = {'old': 'resnets.2.op', 'new': 'downsamplers.0.op'} - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path, resnet_op], config=config) + meta_path = {'old': f'input_blocks.{i}.0', 'new': f'down_blocks.{block_id}.resnets.{layer_in_block_id}'} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) if len(attentions): paths = renew_attention_paths(attentions) - meta_path = {'old': f'input_blocks.{i}.1', 'new': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}'} - to_split = { - f'input_blocks.{i}.1.qkv.bias': { - 'key': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias', - 'query': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias', - 'value': 
f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias', - }, - f'input_blocks.{i}.1.qkv.weight': { - 'key': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight', - 'query': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight', - 'value': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight', - }, - } - assign_to_checkpoint( - paths, - new_checkpoint, - unet_state_dict, - additional_replacements=[meta_path], - attention_paths_to_split=to_split, - config=config - ) + meta_path = {'old': f'input_blocks.{i}.1', 'new': f'down_blocks.{block_id}.attentions.{layer_in_block_id}'} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + resnet_0 = middle_blocks[0] attentions = middle_blocks[1] @@ -253,19 +234,8 @@ def convert_ldm_unet_checkpoint(checkpoint, config): assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) attentions_paths = renew_attention_paths(attentions) - to_split = { - 'middle_block.1.qkv.bias': { - 'key': 'mid_block.attentions.0.key.bias', - 'query': 'mid_block.attentions.0.query.bias', - 'value': 'mid_block.attentions.0.value.bias', - }, - 'middle_block.1.qkv.weight': { - 'key': 'mid_block.attentions.0.key.weight', - 'query': 'mid_block.attentions.0.query.weight', - 'value': 'mid_block.attentions.0.value.weight', - }, - } - assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, attention_paths_to_split=to_split, config=config) + meta_path = {'old': 'middle_block.1', 'new': 'mid_block.attentions.0'} + assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) for i in range(num_output_blocks): block_id = i // (config['layers_per_block'] + 1) @@ -305,26 +275,7 @@ def convert_ldm_unet_checkpoint(checkpoint, config): 'old': f'output_blocks.{i}.1', 'new': f'up_blocks.{block_id}.attentions.{layer_in_block_id}' } - to_split = { - f'output_blocks.{i}.1.qkv.bias': { - 'key': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias', - 'query': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias', - 'value': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias', - }, - f'output_blocks.{i}.1.qkv.weight': { - 'key': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight', - 'query': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight', - 'value': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight', - }, - } - assign_to_checkpoint( - paths, - new_checkpoint, - unet_state_dict, - additional_replacements=[meta_path], - attention_paths_to_split=to_split if any('qkv' in key for key in attentions) else None, - config=config, - ) + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) else: resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) for path in resnet_0_paths: From 956e78c4b4318619646059953cde8ac229074b71 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Mon, 8 Aug 2022 11:12:17 +0530 Subject: [PATCH 07/20] add function to convert VAE --- ...xt2img_original_checkpoint_to_diffusers.py | 167 ++++++++++++++++++ 1 file changed, 167 insertions(+) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index f499dfce0aea..853278a7083a 100644 --- 
a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -58,6 +58,21 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): return mapping +def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = new_item.replace('nin_shortcut', 'conv_shortcut') + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({'old': old_item, 'new': new_item}) + + return mapping + + def renew_attention_paths(old_list, n_shave_prefix_segments=0): """ Updates paths inside attentions to the new naming scheme (local renaming) @@ -79,6 +94,36 @@ def renew_attention_paths(old_list, n_shave_prefix_segments=0): return mapping +def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace('norm.weight', 'group_norm.weight') + new_item = new_item.replace('norm.bias', 'group_norm.bias') + + new_item = new_item.replace('q.weight', 'query.weight') + new_item = new_item.replace('q.bias', 'query.bias') + + new_item = new_item.replace('k.weight', 'key.weight') + new_item = new_item.replace('k.bias', 'key.bias') + + new_item = new_item.replace('v.weight', 'value.weight') + new_item = new_item.replace('v.bias', 'value.bias') + + new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') + new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({'old': old_item, 'new': new_item}) + + return mapping + + def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None): """ This does the final conversion step: take locally converted weights and apply a global renaming @@ -129,6 +174,14 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s checkpoint[new_path] = old_checkpoint[path['old']] +def conv_attn_to_linear(checkpoint): + keys = list(checkpoint.keys()) + for key in keys: + if "weight" in checkpoint.keys(): + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0, 0] + + def create_unet_diffusers_config(original_config): """ Creates a config for the diffusers based on the config of the LDM model. @@ -160,6 +213,30 @@ def create_unet_diffusers_config(original_config): ) return config + + +def create_vae_diffusers_config(original_config): + """ + Creates a config for the diffusers based on the config of the LDM model. 
+ """ + vae_params = original_config.model.params.first_stage_config.params.ddconfig + latent_channles = original_config.model.params.first_stage_config.params.embed_dim + + block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) + up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) + + config = dict( + sample_size=vae_params.resolution, + in_channels=vae_params.in_channels, + out_channels=vae_params.out_ch, + down_block_types=tuple(down_block_types), + up_block_types=tuple(up_block_types), + block_out_channels=tuple(block_out_channels), + latent_channels=vae_params.z_channels, + layers_per_block=vae_params.num_res_blocks, + ) + return config def convert_ldm_unet_checkpoint(checkpoint, config): @@ -287,6 +364,96 @@ def convert_ldm_unet_checkpoint(checkpoint, config): return new_checkpoint +def convert_ldm_vae_checkpoint(checkpoint, config): + # extract state dict for VAE + vae_state_dict = {} + vae_key = "first_stage_model." + keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(vae_key): + vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) + + new_checkpoint = {} + + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] + + new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] + new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] + + + # Retrieves the keys for the encoder down blocks only + num_down_blocks = len({'.'.join(layer.split('.')[:2]) for layer in vae_state_dict if 'down' in layer}) + down_blocks = {layer_id: [key for key in vae_state_dict if f'down.{layer_id}' in key] for layer_id in range(num_down_blocks)} + + # Retrieves the keys for the decoder up blocks only + num_up_blocks = len({'.'.join(layer.split('.')[:2]) for layer in vae_state_dict if 'up' in layer}) + up_blocks = {layer_id: [key for key in vae_state_dict if f'up.{layer_id}' in key] for layer_id in range(num_up_blocks)} + + + for i in range(num_down_blocks): + resnets = [key for key in down_blocks[i] if f'down.{i}' in key] + + if f"encoder.down.{i}.downsample" in vae_state_dict: + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.weight"] = vae_state_dict[f"encoder.down.{i}.downsample.weight"] + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.bias"] = vae_state_dict[f"encoder.down.{i}.downsample.bias"] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {'old': f'down.{i}.block', 'new': f'down_blocks.{i}.resnets'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] + num_mid_res_blocks = 2 + for 
i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(mid_resnets) + meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i}'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + + for i in range(num_up_blocks): + resnets = [key for key in up_blocks[i] if f'up.{i}' in key] + + if f"decoder.up.{i}.upsample" in vae_state_dict: + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.weight"] = vae_state_dict[f"decoder.up.{i}.upsample.weight"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.bias"] = vae_state_dict[f"decoder.up.{i}.upsample.bias"] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {'old': f'up.{i}.block', 'new': f'up_blocks.{i}.resnets'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(mid_resnets) + meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i}'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + + + def convert_ldm_bert_checkpoint(checkpoint, config): def _copy_attn_layer(hf_attn_layer, pt_attn_layer): From 379bdc61c4b91f2e6fa1f9428e3b4da370004bb1 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Mon, 8 Aug 2022 12:39:04 +0530 Subject: [PATCH 08/20] fix vae conversion --- ...xt2img_original_checkpoint_to_diffusers.py | 51 +++++++++++-------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 853278a7083a..6f2b1818e9d3 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -64,8 +64,10 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): """ mapping = [] for old_item in old_list: - new_item = new_item.replace('nin_shortcut', 'conv_shortcut') + new_item = old_item + new_item = new_item.replace('nin_shortcut', 'conv_shortcut') + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({'old': old_item, 'new': new_item}) @@ -162,7 +164,7 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s new_path = new_path.replace('middle_block.0', 'mid_block.resnets.0') new_path = new_path.replace('middle_block.1', 'mid_block.attentions.0') new_path = new_path.replace('middle_block.2', 
'mid_block.resnets.1') - + if additional_replacements is not None: for replacement in additional_replacements: new_path = new_path.replace(replacement['old'], replacement['new']) @@ -176,10 +178,14 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s def conv_attn_to_linear(checkpoint): keys = list(checkpoint.keys()) + attn_keys = ["query.weight", "key.weight", "value.weight"] for key in keys: - if "weight" in checkpoint.keys(): + if ".".join(key.split(".")[-2:]) in attn_keys: if checkpoint[key].ndim > 2: checkpoint[key] = checkpoint[key][:, :, 0, 0] + elif "proj_attn.weight" in key: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0] def create_unet_diffusers_config(original_config): @@ -377,11 +383,15 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] @@ -392,20 +402,20 @@ def convert_ldm_vae_checkpoint(checkpoint, config): # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({'.'.join(layer.split('.')[:2]) for layer in vae_state_dict if 'down' in layer}) + num_down_blocks = len({'.'.join(layer.split('.')[:3]) for layer in vae_state_dict if 'encoder.down' in layer}) down_blocks = {layer_id: [key for key in vae_state_dict if f'down.{layer_id}' in key] for layer_id in range(num_down_blocks)} # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({'.'.join(layer.split('.')[:2]) for layer in vae_state_dict if 'up' in layer}) + num_up_blocks = len({'.'.join(layer.split('.')[:3]) for layer in vae_state_dict if 'decoder.up' in layer}) up_blocks = {layer_id: [key for key in vae_state_dict if f'up.{layer_id}' in key] for layer_id in range(num_up_blocks)} for i in range(num_down_blocks): - resnets = [key for key in down_blocks[i] if f'down.{i}' in key] - - if f"encoder.down.{i}.downsample" in vae_state_dict: - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.weight"] = vae_state_dict[f"encoder.down.{i}.downsample.weight"] - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.bias"] = vae_state_dict[f"encoder.down.{i}.downsample.bias"] + resnets = [key for key in down_blocks[i] if f'down.{i}' in key and f"down.{i}.downsample" not in key] + + if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.weight") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.bias") paths = 
renew_vae_resnet_paths(resnets) meta_path = {'old': f'down.{i}.block', 'new': f'down_blocks.{i}.resnets'} @@ -416,8 +426,8 @@ def convert_ldm_vae_checkpoint(checkpoint, config): for i in range(1, num_mid_res_blocks + 1): resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] - paths = renew_vae_resnet_paths(mid_resnets) - meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i}'} + paths = renew_vae_resnet_paths(resnets) + meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i - 1}'} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] @@ -427,14 +437,15 @@ def convert_ldm_vae_checkpoint(checkpoint, config): conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): - resnets = [key for key in up_blocks[i] if f'up.{i}' in key] + block_id = num_up_blocks - 1 - i + resnets = [key for key in up_blocks[block_id] if f'up.{block_id}' in key and f"up.{block_id}.upsample" not in key] - if f"decoder.up.{i}.upsample" in vae_state_dict: - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.weight"] = vae_state_dict[f"decoder.up.{i}.upsample.weight"] - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.bias"] = vae_state_dict[f"decoder.up.{i}.upsample.bias"] + if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.weight"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.bias"] paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'up.{i}.block', 'new': f'up_blocks.{i}.resnets'} + meta_path = {'old': f'up.{block_id}.block', 'new': f'up_blocks.{i}.resnets'} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] @@ -442,8 +453,8 @@ def convert_ldm_vae_checkpoint(checkpoint, config): for i in range(1, num_mid_res_blocks + 1): resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] - paths = renew_vae_resnet_paths(mid_resnets) - meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i}'} + paths = renew_vae_resnet_paths(resnets) + meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i - 1}'} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] @@ -451,7 +462,7 @@ def convert_ldm_vae_checkpoint(checkpoint, config): meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) conv_attn_to_linear(new_checkpoint) - + return new_checkpoint def convert_ldm_bert_checkpoint(checkpoint, config): From bd3623357c2417ae4e3d1b5dcb523f0cc966b60b Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Mon, 8 Aug 2022 12:42:45 +0530 Subject: [PATCH 09/20] update main --- ...ldm_txt2img_original_checkpoint_to_diffusers.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 6f2b1818e9d3..9df23095a2af 100644 --- 
a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -522,8 +522,6 @@ def _copy_layers(hf_layers, pt_layers): return hf_model -def convert_vae_checkpoint(checkpoint, config): - pass if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -572,13 +570,21 @@ def convert_vae_checkpoint(checkpoint, config): else: config = create_unet_diffusers_config(original_config) - converted_checkpoint = convert_ldm_unet_checkpoint(checkpoint, config) + converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, config) + + vae_config = create_vae_diffusers_config(original_config) + converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) + + # TODO: convert bert or CLIP model if "ldm" in config: del config["ldm"] model = UNet2DConditionModel(**config) - model.load_state_dict(converted_checkpoint) + model.load_state_dict(converted_unet_checkpoint) + + vae = AutoencoderKL(**vae_config) + vae.load_state_dict(converted_vae_checkpoint) try: scheduler = DDPMScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1])) From f3c4f994b339bfeefbf98360e5a43da175558d95 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Mon, 8 Aug 2022 13:06:52 +0530 Subject: [PATCH 10/20] create text model --- ...xt2img_original_checkpoint_to_diffusers.py | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 9df23095a2af..91fb225c3659 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -23,7 +23,9 @@ except ImportError: raise ImportError("OmegaConf is required to convert the LDM checkpoints. 
Please install it with `pip install OmegaConf`.") +from transformers import CLIPTokenizer, CLIPTextModel from diffusers import VQModel, DDPMScheduler, LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel +from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig def shave_segments(path, n_shave_prefix_segments=1): @@ -243,6 +245,16 @@ def create_vae_diffusers_config(original_config): layers_per_block=vae_params.num_res_blocks, ) return config + + +def create_ldm_bert_config(original_config): + bert_params = original_config.model.parms.cond_stage_config.params + config = LDMBertConfig( + d_model=bert_params.n_embed, + encoder_layers=bert_params.n_layer, + encoder_ffn_dim=bert_params.n_embed * 4, + ) + return config def convert_ldm_unet_checkpoint(checkpoint, config): @@ -480,10 +492,6 @@ def _copy_linear(hf_linear, pt_linear): hf_linear.weight = pt_linear.weight hf_linear.bias = pt_linear.bias - def _copy_mlp(hf_mlp, pt_mlp): - _copy_linear(hf_mlp.fc1, pt_mlp.net[0][0]) - _copy_linear(hf_mlp.fc2, pt_mlp.net[2]) - def _copy_layer(hf_layer, pt_layer): # copy layer norms @@ -575,10 +583,14 @@ def _copy_layers(hf_layers, pt_layers): vae_config = create_vae_diffusers_config(original_config) converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) - # TODO: convert bert or CLIP model - - if "ldm" in config: - del config["ldm"] + text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] + if text_model_type == "FrozenCLIPEmbedder": + text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") + tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + else: + # TODO: update the convert function to use the state_dict without the model instance. 
+ text_config = create_ldm_bert_config(original_config) + text_checkpoint = convert_ldm_bert_checkpoint(checkpoint, text_config) model = UNet2DConditionModel(**config) model.load_state_dict(converted_unet_checkpoint) @@ -590,7 +602,7 @@ def _copy_layers(hf_layers, pt_layers): scheduler = DDPMScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1])) vqvae = VQModel.from_pretrained("/".join(args.checkpoint_path.split("/")[:-1])) - pipe = LDMPipeline(unet=model, scheduler=scheduler, vae=vqvae) + pipe = LDMTextToImagePipeline(unet=model, scheduler=scheduler, vae=vqvae) pipe.save_pretrained(args.dump_path) except: model.save_pretrained(args.dump_path) From ac0479770f2d005dd449099111e4584664be1425 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Mon, 8 Aug 2022 17:32:17 +0530 Subject: [PATCH 11/20] update config creating logic for unet --- ...convert_ldm_txt2img_original_checkpoint_to_diffusers.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 91fb225c3659..9515982b781b 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -199,14 +199,17 @@ def create_unet_diffusers_config(original_config): block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] down_block_types = [] + resolution = 1 for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if i < len(block_out_channels) - 1 else "DownBlock2D" + block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" down_block_types.append(block_type) + resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = "UpBlock2D" if i == 0 else "CrossAttnUpBlock2D" + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" up_block_types.append(block_type) + resolution //= 2 config = dict( sample_size=unet_params.image_size, From 70e9ac49ddf6be8699c90d4d2ac7be736e58ccd3 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Mon, 8 Aug 2022 22:47:12 +0530 Subject: [PATCH 12/20] fix config creation --- .../convert_ldm_txt2img_original_checkpoint_to_diffusers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 9515982b781b..08023f6c8492 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -203,7 +203,8 @@ def create_unet_diffusers_config(original_config): for i in range(len(block_out_channels)): block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" down_block_types.append(block_type) - resolution *= 2 + if i != len(block_out_channels) - 1: + resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): From a51e2f5a3be863c15cb4a3f69f066a2283179844 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Tue, 9 Aug 2022 16:21:22 +0530 Subject: [PATCH 13/20] update script to create and save pipeline --- ...xt2img_original_checkpoint_to_diffusers.py | 67 ++++++++----------- 1 file changed, 28 insertions(+), 39 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py 
b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 08023f6c8492..8e75f1cabe98 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -23,8 +23,8 @@ except ImportError: raise ImportError("OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`.") -from transformers import CLIPTokenizer, CLIPTextModel -from diffusers import VQModel, DDPMScheduler, LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel +from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel +from diffusers import VQModel, DDPMScheduler, LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig @@ -251,6 +251,16 @@ def create_vae_diffusers_config(original_config): return config +def create_diffusers_schedular(original_config): + schedular = DDIMScheduler( + num_train_timesteps=original_config.model.params.timesteps, + beta_start=original_config.model.params.linear_start, + beta_end=original_config.model.params.linear_end, + beta_schedule="scaled_linear", + ) + return schedular + + def create_ldm_bert_config(original_config): bert_params = original_config.model.parms.cond_stage_config.params config = LDMBertConfig( @@ -550,22 +560,6 @@ def _copy_layers(hf_layers, pt_layers): help="The YAML config file corresponding to the original architecture.", ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help="The config json file corresponding to the architecture.", - ) - - parser.add_argument( - "--ldm_bert_config_file", - default=None, - type=str, - required=False, - help="The config json file corresponding to the LDMBert architecture.", - ) - parser.add_argument( "--dump_path", default=None, type=str, required=True, help="Path to the output model." ) @@ -576,17 +570,21 @@ def _copy_layers(hf_layers, pt_layers): checkpoint = torch.load(args.checkpoint_path) - if args.config_file is not None: - with open(args.config_file) as f: - config = json.loads(f.read()) - else: - config = create_unet_diffusers_config(original_config) - - converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, config) + # Convert the UNet2DConditionModel model. + unet_config = create_unet_diffusers_config(original_config) + converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config) + + unet = UNet2DConditionModel(**unet_config) + unet.load_state_dict(converted_unet_checkpoint) + # Convert the VAE model. vae_config = create_vae_diffusers_config(original_config) converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) + vae = AutoencoderKL(**vae_config) + vae.load_state_dict(converted_vae_checkpoint) + + # Convert the text model. text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] if text_model_type == "FrozenCLIPEmbedder": text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") @@ -594,19 +592,10 @@ def _copy_layers(hf_layers, pt_layers): else: # TODO: update the convert function to use the state_dict without the model instance. 
text_config = create_ldm_bert_config(original_config) - text_checkpoint = convert_ldm_bert_checkpoint(checkpoint, text_config) - - model = UNet2DConditionModel(**config) - model.load_state_dict(converted_unet_checkpoint) - - vae = AutoencoderKL(**vae_config) - vae.load_state_dict(converted_vae_checkpoint) + text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) + tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") - try: - scheduler = DDPMScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1])) - vqvae = VQModel.from_pretrained("/".join(args.checkpoint_path.split("/")[:-1])) + scheduler = create_diffusers_schedular(original_config) + pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) + pipe.save_pretrained(args.dump_path) - pipe = LDMTextToImagePipeline(unet=model, scheduler=scheduler, vae=vqvae) - pipe.save_pretrained(args.dump_path) - except: - model.save_pretrained(args.dump_path) From 954dca72cfece6de626f386f6bb4234935ff1fc9 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Tue, 9 Aug 2022 16:21:50 +0530 Subject: [PATCH 14/20] remove unused imports --- .../convert_ldm_txt2img_original_checkpoint_to_diffusers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 8e75f1cabe98..d73eb4ef1a71 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -15,7 +15,6 @@ """ Conversion script for the LDM checkpoints. """ import argparse -import json import torch try: @@ -24,7 +23,7 @@ raise ImportError("OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`.") from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel -from diffusers import VQModel, DDPMScheduler, LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler +from diffusers import LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig From 2b804b94df8e2d9f6bf88b188150f5eae9c9576f Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Wed, 10 Aug 2022 20:22:40 +0530 Subject: [PATCH 15/20] fix checkpoint loading --- scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index d73eb4ef1a71..d360bd968e95 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -567,7 +567,7 @@ def _copy_layers(hf_layers, pt_layers): original_config = OmegaConf.load(args.original_config_file) - checkpoint = torch.load(args.checkpoint_path) + checkpoint = torch.load(args.checkpoint_path)["state_dict"] # Convert the UNet2DConditionModel model. 
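    # (create_unet_diffusers_config maps the YAML's unet_config params, e.g. channel_mult,
    # attention_resolutions, num_res_blocks and context_dim, onto the diffusers down/up block
    # layout; convert_ldm_unet_checkpoint then renames the original weights to the new module names.)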
unet_config = create_unet_diffusers_config(original_config) From 6df55be10a9bd33e045902cfa0b85f0e0657eb69 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 15 Sep 2022 16:00:10 +0000 Subject: [PATCH 16/20] better name --- ...ers.py => convert_original_stable_diffusion_to_diffusers.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename scripts/{convert_ldm_txt2img_original_checkpoint_to_diffusers.py => convert_original_stable_diffusion_to_diffusers.py} (99%) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py similarity index 99% rename from scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py rename to scripts/convert_original_stable_diffusion_to_diffusers.py index d360bd968e95..04d2343a3d3a 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_original_stable_diffusion_to_diffusers.py @@ -23,7 +23,7 @@ raise ImportError("OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`.") from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel -from diffusers import LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler +from diffusers import LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler, KLM from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig From 2c4ce96d12a65f08b176a6303a4f1dbb305f4ebd Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 15 Sep 2022 16:27:41 +0000 Subject: [PATCH 17/20] save progress --- _ | 608 ++++++++++++++++++ ..._original_stable_diffusion_to_diffusers.py | 16 +- 2 files changed, 620 insertions(+), 4 deletions(-) create mode 100644 _ diff --git a/_ b/_ new file mode 100644 index 000000000000..0965f6522f6a --- /dev/null +++ b/_ @@ -0,0 +1,608 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Conversion script for the LDM checkpoints. """ + +import argparse +import torch + +try: + from omegaconf import OmegaConf +except ImportError: + raise ImportError("OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`.") + +from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel +from diffusers import LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline +from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig + + +def shave_segments(path, n_shave_prefix_segments=1): + """ + Removes segments. Positive values shave the first segments, negative shave the last segments. 
+ """ + if n_shave_prefix_segments >= 0: + return '.'.join(path.split('.')[n_shave_prefix_segments:]) + else: + return '.'.join(path.split('.')[:n_shave_prefix_segments]) + + +def renew_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item.replace('in_layers.0', 'norm1') + new_item = new_item.replace('in_layers.2', 'conv1') + + new_item = new_item.replace('out_layers.0', 'norm2') + new_item = new_item.replace('out_layers.3', 'conv2') + + new_item = new_item.replace('emb_layers.1', 'time_emb_proj') + new_item = new_item.replace('skip_connection', 'conv_shortcut') + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({'old': old_item, 'new': new_item}) + + return mapping + + +def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace('nin_shortcut', 'conv_shortcut') + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({'old': old_item, 'new': new_item}) + + return mapping + + +def renew_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + +# new_item = new_item.replace('norm.weight', 'group_norm.weight') +# new_item = new_item.replace('norm.bias', 'group_norm.bias') + +# new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') +# new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + +# new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({'old': old_item, 'new': new_item}) + + return mapping + + +def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace('norm.weight', 'group_norm.weight') + new_item = new_item.replace('norm.bias', 'group_norm.bias') + + new_item = new_item.replace('q.weight', 'query.weight') + new_item = new_item.replace('q.bias', 'query.bias') + + new_item = new_item.replace('k.weight', 'key.weight') + new_item = new_item.replace('k.bias', 'key.bias') + + new_item = new_item.replace('v.weight', 'value.weight') + new_item = new_item.replace('v.bias', 'value.bias') + + new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') + new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({'old': old_item, 'new': new_item}) + + return mapping + + +def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None): + """ + This does the final conversion step: take locally converted weights and apply a global renaming + to them. It splits attention layers, and takes into account additional replacements + that may arise. + + Assigns the weights to the new checkpoint. + """ + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." + + # Splits the attention layers into three variables. 
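    # For example, with a hypothetical fused qkv conv weight of shape (960, 320, 1), i.e. C=320,
    # and num_head_channels=40: channels = 320 and num_heads = 8; the tensor is reshaped to
    # (8, 120, 320, 1), split into query/key/value of shape (8, 40, 320, 1) each, and finally
    # reshaped to the (320, 320) linear projection weights that diffusers expects.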
+ if attention_paths_to_split is not None: + for path, path_map in attention_paths_to_split.items(): + old_tensor = old_checkpoint[path] + channels = old_tensor.shape[0] // 3 + + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) + + num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 + + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) + query, key, value = old_tensor.split(channels // num_heads, dim=1) + + checkpoint[path_map['query']] = query.reshape(target_shape) + checkpoint[path_map['key']] = key.reshape(target_shape) + checkpoint[path_map['value']] = value.reshape(target_shape) + + for path in paths: + new_path = path['new'] + + # These have already been assigned + if attention_paths_to_split is not None and new_path in attention_paths_to_split: + continue + + # Global renaming happens here + new_path = new_path.replace('middle_block.0', 'mid_block.resnets.0') + new_path = new_path.replace('middle_block.1', 'mid_block.attentions.0') + new_path = new_path.replace('middle_block.2', 'mid_block.resnets.1') + + if additional_replacements is not None: + for replacement in additional_replacements: + new_path = new_path.replace(replacement['old'], replacement['new']) + + # proj_attn.weight has to be converted from conv 1D to linear + if "proj_attn.weight" in new_path: + checkpoint[new_path] = old_checkpoint[path['old']][:, :, 0] + else: + checkpoint[new_path] = old_checkpoint[path['old']] + + +def conv_attn_to_linear(checkpoint): + keys = list(checkpoint.keys()) + attn_keys = ["query.weight", "key.weight", "value.weight"] + for key in keys: + if ".".join(key.split(".")[-2:]) in attn_keys: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0, 0] + elif "proj_attn.weight" in key: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0] + + +def create_unet_diffusers_config(original_config): + """ + Creates a config for the diffusers based on the config of the LDM model. + """ + unet_params = original_config.model.params.unet_config.params + + block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + + down_block_types = [] + resolution = 1 + for i in range(len(block_out_channels)): + block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" + down_block_types.append(block_type) + if i != len(block_out_channels) - 1: + resolution *= 2 + + up_block_types = [] + for i in range(len(block_out_channels)): + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + up_block_types.append(block_type) + resolution //= 2 + + config = dict( + sample_size=unet_params.image_size, + in_channels=unet_params.in_channels, + out_channels=unet_params.out_channels, + down_block_types=tuple(down_block_types), + up_block_types=tuple(up_block_types), + block_out_channels=tuple(block_out_channels), + layers_per_block=unet_params.num_res_blocks, + cross_attention_dim=unet_params.context_dim, + attention_head_dim=unet_params.num_heads, + ) + + return config + + +def create_vae_diffusers_config(original_config): + """ + Creates a config for the diffusers based on the config of the LDM model. 
+ """ + vae_params = original_config.model.params.first_stage_config.params.ddconfig + latent_channles = original_config.model.params.first_stage_config.params.embed_dim + + block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) + up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) + + config = dict( + sample_size=vae_params.resolution, + in_channels=vae_params.in_channels, + out_channels=vae_params.out_ch, + down_block_types=tuple(down_block_types), + up_block_types=tuple(up_block_types), + block_out_channels=tuple(block_out_channels), + latent_channels=vae_params.z_channels, + layers_per_block=vae_params.num_res_blocks, + ) + return config + + +def create_diffusers_schedular(original_config): + schedular = DDIMScheduler( + num_train_timesteps=original_config.model.params.timesteps, + beta_start=original_config.model.params.linear_start, + beta_end=original_config.model.params.linear_end, + beta_schedule="scaled_linear", + ) + return schedular + + +def create_ldm_bert_config(original_config): + bert_params = original_config.model.parms.cond_stage_config.params + config = LDMBertConfig( + d_model=bert_params.n_embed, + encoder_layers=bert_params.n_layer, + encoder_ffn_dim=bert_params.n_embed * 4, + ) + return config + + +def convert_ldm_unet_checkpoint(checkpoint, config): + """ + Takes a state dict and a config, and returns a converted checkpoint. + """ + + # extract state_dict for UNet + unet_state_dict = {} + unet_key = "model.diffusion_model." + keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(unet_key): + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) + + new_checkpoint = {} + + new_checkpoint['time_embedding.linear_1.weight'] = unet_state_dict['time_embed.0.weight'] + new_checkpoint['time_embedding.linear_1.bias'] = unet_state_dict['time_embed.0.bias'] + new_checkpoint['time_embedding.linear_2.weight'] = unet_state_dict['time_embed.2.weight'] + new_checkpoint['time_embedding.linear_2.bias'] = unet_state_dict['time_embed.2.bias'] + + new_checkpoint['conv_in.weight'] = unet_state_dict['input_blocks.0.0.weight'] + new_checkpoint['conv_in.bias'] = unet_state_dict['input_blocks.0.0.bias'] + + new_checkpoint['conv_norm_out.weight'] = unet_state_dict['out.0.weight'] + new_checkpoint['conv_norm_out.bias'] = unet_state_dict['out.0.bias'] + new_checkpoint['conv_out.weight'] = unet_state_dict['out.2.weight'] + new_checkpoint['conv_out.bias'] = unet_state_dict['out.2.bias'] + + # Retrieves the keys for the input blocks only + num_input_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'input_blocks' in layer}) + input_blocks = {layer_id: [key for key in unet_state_dict if f'input_blocks.{layer_id}' in key] for layer_id in range(num_input_blocks)} + + # Retrieves the keys for the middle blocks only + num_middle_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'middle_block' in layer}) + middle_blocks = {layer_id: [key for key in unet_state_dict if f'middle_block.{layer_id}' in key] for layer_id in range(num_middle_blocks)} + + # Retrieves the keys for the output blocks only + num_output_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'output_blocks' in layer}) + output_blocks = {layer_id: [key for key in unet_state_dict if f'output_blocks.{layer_id}' in key] for layer_id in range(num_output_blocks)} + + for i in range(1, num_input_blocks): + block_id = (i - 1) // 
(config['layers_per_block'] + 1) + layer_in_block_id = (i - 1) % (config['layers_per_block'] + 1) + + resnets = [key for key in input_blocks[i] if f'input_blocks.{i}.0' in key and f'input_blocks.{i}.0.op' not in key] + attentions = [key for key in input_blocks[i] if f'input_blocks.{i}.1' in key] + + if f'input_blocks.{i}.0.op.weight' in unet_state_dict: + new_checkpoint[f'down_blocks.{block_id}.downsamplers.0.conv.weight'] = unet_state_dict.pop(f'input_blocks.{i}.0.op.weight') + new_checkpoint[f'down_blocks.{block_id}.downsamplers.0.conv.bias'] = unet_state_dict.pop(f'input_blocks.{i}.0.op.bias') + + paths = renew_resnet_paths(resnets) + meta_path = {'old': f'input_blocks.{i}.0', 'new': f'down_blocks.{block_id}.resnets.{layer_in_block_id}'} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + + if len(attentions): + paths = renew_attention_paths(attentions) + meta_path = {'old': f'input_blocks.{i}.1', 'new': f'down_blocks.{block_id}.attentions.{layer_in_block_id}'} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + + + resnet_0 = middle_blocks[0] + attentions = middle_blocks[1] + resnet_1 = middle_blocks[2] + + resnet_0_paths = renew_resnet_paths(resnet_0) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + + resnet_1_paths = renew_resnet_paths(resnet_1) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + + attentions_paths = renew_attention_paths(attentions) + meta_path = {'old': 'middle_block.1', 'new': 'mid_block.attentions.0'} + assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + + for i in range(num_output_blocks): + block_id = i // (config['layers_per_block'] + 1) + layer_in_block_id = i % (config['layers_per_block'] + 1) + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] + output_block_list = {} + + for layer in output_block_layers: + layer_id, layer_name = layer.split('.')[0], shave_segments(layer, 1) + if layer_id in output_block_list: + output_block_list[layer_id].append(layer_name) + else: + output_block_list[layer_id] = [layer_name] + + if len(output_block_list) > 1: + resnets = [key for key in output_blocks[i] if f'output_blocks.{i}.0' in key] + attentions = [key for key in output_blocks[i] if f'output_blocks.{i}.1' in key] + + resnet_0_paths = renew_resnet_paths(resnets) + paths = renew_resnet_paths(resnets) + + meta_path = {'old': f'output_blocks.{i}.0', 'new': f'up_blocks.{block_id}.resnets.{layer_in_block_id}'} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + + if ['conv.weight', 'conv.bias'] in output_block_list.values(): + index = list(output_block_list.values()).index(['conv.weight', 'conv.bias']) + new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.weight'] = unet_state_dict[f'output_blocks.{i}.{index}.conv.weight'] + new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.bias'] = unet_state_dict[f'output_blocks.{i}.{index}.conv.bias'] + + # Clear attentions as they have been attributed above. 
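            # (If sub-module 1 of this output block is the upsampler conv, the filter above only
            # matched its two conv.weight / conv.bias keys, which were already copied as the
            # upsampler, so there is no attention module left to convert here.)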
+ if len(attentions) == 2: + attentions = [] + + if len(attentions): + paths = renew_attention_paths(attentions) + meta_path = { + 'old': f'output_blocks.{i}.1', + 'new': f'up_blocks.{block_id}.attentions.{layer_in_block_id}' + } + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + else: + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) + for path in resnet_0_paths: + old_path = '.'.join(['output_blocks', str(i), path['old']]) + new_path = '.'.join(['up_blocks', str(block_id), 'resnets', str(layer_in_block_id), path['new']]) + + new_checkpoint[new_path] = unet_state_dict[old_path] + + return new_checkpoint + + +def convert_ldm_vae_checkpoint(checkpoint, config): + # extract state dict for VAE + vae_state_dict = {} + vae_key = "first_stage_model." + keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(vae_key): + vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) + + new_checkpoint = {} + + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] + + new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] + new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] + + + # Retrieves the keys for the encoder down blocks only + num_down_blocks = len({'.'.join(layer.split('.')[:3]) for layer in vae_state_dict if 'encoder.down' in layer}) + down_blocks = {layer_id: [key for key in vae_state_dict if f'down.{layer_id}' in key] for layer_id in range(num_down_blocks)} + + # Retrieves the keys for the decoder up blocks only + num_up_blocks = len({'.'.join(layer.split('.')[:3]) for layer in vae_state_dict if 'decoder.up' in layer}) + up_blocks = {layer_id: [key for key in vae_state_dict if f'up.{layer_id}' in key] for layer_id in range(num_up_blocks)} + + + for i in range(num_down_blocks): + resnets = [key for key in down_blocks[i] if f'down.{i}' in key and f"down.{i}.downsample" not in key] + + if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.weight") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.bias") + + paths = renew_vae_resnet_paths(resnets) + meta_path = {'old': f'down.{i}.block', 
'new': f'down_blocks.{i}.resnets'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i - 1}'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + + for i in range(num_up_blocks): + block_id = num_up_blocks - 1 - i + resnets = [key for key in up_blocks[block_id] if f'up.{block_id}' in key and f"up.{block_id}.upsample" not in key] + + if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.weight"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.bias"] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {'old': f'up.{block_id}.block', 'new': f'up_blocks.{i}.resnets'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i - 1}'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + return new_checkpoint + + +def convert_ldm_bert_checkpoint(checkpoint, config): + def _copy_attn_layer(hf_attn_layer, pt_attn_layer): + + hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight + hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight + hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight + + hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight + hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias + + + def _copy_linear(hf_linear, pt_linear): + hf_linear.weight = pt_linear.weight + hf_linear.bias = pt_linear.bias + + + def _copy_layer(hf_layer, pt_layer): + # copy layer norms + _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0]) + _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0]) + + # copy attn + _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1]) + + # copy MLP + pt_mlp = pt_layer[1][1] + _copy_linear(hf_layer.fc1, pt_mlp.net[0][0]) + _copy_linear(hf_layer.fc2, pt_mlp.net[2]) + + + def _copy_layers(hf_layers, pt_layers): + for i, hf_layer 
in enumerate(hf_layers): + if i != 0: i += i + pt_layer = pt_layers[i:i+2] + _copy_layer(hf_layer, pt_layer) + + hf_model = LDMBertModel(config).eval() + + # copy embeds + hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight + hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight + + # copy layer norm + _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm) + + # copy hidden layers + _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers) + + _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits) + + return hf_model + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." + ) + # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml + parser.add_argument( + "--original_config_file", + default=None, + type=str, + required=True, + help="The YAML config file corresponding to the original architecture.", + ) + parser.add_argument( + "--scheduler_type", default="pndm", type=str, required=True, help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']" + ) + parser.add_argument( + "--dump_path", default=None, type=str, required=True, help="Path to the output model." + ) + + args = parser.parse_args() + + original_config = OmegaConf.load(args.original_config_file) + + checkpoint = torch.load(args.checkpoint_path)["state_dict"] + + if args.scheduler_type == "pndm": + elif args.scheduler_type == "pndm": + elif args.scheduler_type == "pndm": + else: + raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") + + # Convert the UNet2DConditionModel model. + unet_config = create_unet_diffusers_config(original_config) + converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config) + + unet = UNet2DConditionModel(**unet_config) + unet.load_state_dict(converted_unet_checkpoint) + + # Convert the VAE model. + vae_config = create_vae_diffusers_config(original_config) + converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) + + vae = AutoencoderKL(**vae_config) + vae.load_state_dict(converted_vae_checkpoint) + + # Convert the text model. + text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] + if text_model_type == "FrozenCLIPEmbedder": + text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") + tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + else: + # TODO: update the convert function to use the state_dict without the model instance. + text_config = create_ldm_bert_config(original_config) + text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) + tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") + + scheduler = create_diffusers_schedular(original_config) + pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) + pipe.save_pretrained(args.dump_path) + diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py index 04d2343a3d3a..0965f6522f6a 100644 --- a/scripts/convert_original_stable_diffusion_to_diffusers.py +++ b/scripts/convert_original_stable_diffusion_to_diffusers.py @@ -22,8 +22,8 @@ except ImportError: raise ImportError("OmegaConf is required to convert the LDM checkpoints. 
Please install it with `pip install OmegaConf`.") -from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel -from diffusers import LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler, KLM +from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel +from diffusers import LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig @@ -550,7 +550,7 @@ def _copy_layers(hf_layers, pt_layers): parser.add_argument( "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." ) - + # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml parser.add_argument( "--original_config_file", default=None, @@ -558,7 +558,9 @@ def _copy_layers(hf_layers, pt_layers): required=True, help="The YAML config file corresponding to the original architecture.", ) - + parser.add_argument( + "--scheduler_type", default="pndm", type=str, required=True, help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']" + ) parser.add_argument( "--dump_path", default=None, type=str, required=True, help="Path to the output model." ) @@ -569,6 +571,12 @@ def _copy_layers(hf_layers, pt_layers): checkpoint = torch.load(args.checkpoint_path)["state_dict"] + if args.scheduler_type == "pndm": + elif args.scheduler_type == "pndm": + elif args.scheduler_type == "pndm": + else: + raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") + # Convert the UNet2DConditionModel model. unet_config = create_unet_diffusers_config(original_config) converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config) From 915aa24d2149959aa05a0b568b25c5ff685e4a52 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 15 Sep 2022 21:54:56 +0000 Subject: [PATCH 18/20] finish --- ..._original_stable_diffusion_to_diffusers.py | 363 +++++++++++------- 1 file changed, 222 insertions(+), 141 deletions(-) diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py index 0965f6522f6a..cc417188f88b 100644 --- a/scripts/convert_original_stable_diffusion_to_diffusers.py +++ b/scripts/convert_original_stable_diffusion_to_diffusers.py @@ -16,14 +16,26 @@ import argparse import torch +import os try: from omegaconf import OmegaConf except ImportError: - raise ImportError("OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`.") + raise ImportError( + "OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`." 
+ ) -from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel -from diffusers import LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline +from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel, AutoFeatureExtractor +from diffusers import ( + LDMTextToImagePipeline, + AutoencoderKL, + UNet2DConditionModel, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPipeline, +) +from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig @@ -32,9 +44,9 @@ def shave_segments(path, n_shave_prefix_segments=1): Removes segments. Positive values shave the first segments, negative shave the last segments. """ if n_shave_prefix_segments >= 0: - return '.'.join(path.split('.')[n_shave_prefix_segments:]) + return ".".join(path.split(".")[n_shave_prefix_segments:]) else: - return '.'.join(path.split('.')[:n_shave_prefix_segments]) + return ".".join(path.split(".")[:n_shave_prefix_segments]) def renew_resnet_paths(old_list, n_shave_prefix_segments=0): @@ -43,18 +55,18 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): """ mapping = [] for old_item in old_list: - new_item = old_item.replace('in_layers.0', 'norm1') - new_item = new_item.replace('in_layers.2', 'conv1') + new_item = old_item.replace("in_layers.0", "norm1") + new_item = new_item.replace("in_layers.2", "conv1") - new_item = new_item.replace('out_layers.0', 'norm2') - new_item = new_item.replace('out_layers.3', 'conv2') + new_item = new_item.replace("out_layers.0", "norm2") + new_item = new_item.replace("out_layers.3", "conv2") - new_item = new_item.replace('emb_layers.1', 'time_emb_proj') - new_item = new_item.replace('skip_connection', 'conv_shortcut') + new_item = new_item.replace("emb_layers.1", "time_emb_proj") + new_item = new_item.replace("skip_connection", "conv_shortcut") new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - mapping.append({'old': old_item, 'new': new_item}) + mapping.append({"old": old_item, "new": new_item}) return mapping @@ -67,11 +79,10 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): for old_item in old_list: new_item = old_item - new_item = new_item.replace('nin_shortcut', 'conv_shortcut') - + new_item = new_item.replace("nin_shortcut", "conv_shortcut") new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - mapping.append({'old': old_item, 'new': new_item}) + mapping.append({"old": old_item, "new": new_item}) return mapping @@ -84,15 +95,15 @@ def renew_attention_paths(old_list, n_shave_prefix_segments=0): for old_item in old_list: new_item = old_item -# new_item = new_item.replace('norm.weight', 'group_norm.weight') -# new_item = new_item.replace('norm.bias', 'group_norm.bias') + # new_item = new_item.replace('norm.weight', 'group_norm.weight') + # new_item = new_item.replace('norm.bias', 'group_norm.bias') -# new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') -# new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') + # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') -# new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - 
mapping.append({'old': old_item, 'new': new_item}) + mapping.append({"old": old_item, "new": new_item}) return mapping @@ -105,29 +116,31 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): for old_item in old_list: new_item = old_item - new_item = new_item.replace('norm.weight', 'group_norm.weight') - new_item = new_item.replace('norm.bias', 'group_norm.bias') + new_item = new_item.replace("norm.weight", "group_norm.weight") + new_item = new_item.replace("norm.bias", "group_norm.bias") - new_item = new_item.replace('q.weight', 'query.weight') - new_item = new_item.replace('q.bias', 'query.bias') + new_item = new_item.replace("q.weight", "query.weight") + new_item = new_item.replace("q.bias", "query.bias") - new_item = new_item.replace('k.weight', 'key.weight') - new_item = new_item.replace('k.bias', 'key.bias') + new_item = new_item.replace("k.weight", "key.weight") + new_item = new_item.replace("k.bias", "key.bias") - new_item = new_item.replace('v.weight', 'value.weight') - new_item = new_item.replace('v.bias', 'value.bias') + new_item = new_item.replace("v.weight", "value.weight") + new_item = new_item.replace("v.bias", "value.bias") - new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') - new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + new_item = new_item.replace("proj_out.weight", "proj_attn.weight") + new_item = new_item.replace("proj_out.bias", "proj_attn.bias") new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - mapping.append({'old': old_item, 'new': new_item}) + mapping.append({"old": old_item, "new": new_item}) return mapping -def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None): +def assign_to_checkpoint( + paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None +): """ This does the final conversion step: take locally converted weights and apply a global renaming to them. 
It splits attention layers, and takes into account additional replacements @@ -150,31 +163,31 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) query, key, value = old_tensor.split(channels // num_heads, dim=1) - checkpoint[path_map['query']] = query.reshape(target_shape) - checkpoint[path_map['key']] = key.reshape(target_shape) - checkpoint[path_map['value']] = value.reshape(target_shape) + checkpoint[path_map["query"]] = query.reshape(target_shape) + checkpoint[path_map["key"]] = key.reshape(target_shape) + checkpoint[path_map["value"]] = value.reshape(target_shape) for path in paths: - new_path = path['new'] + new_path = path["new"] # These have already been assigned if attention_paths_to_split is not None and new_path in attention_paths_to_split: continue # Global renaming happens here - new_path = new_path.replace('middle_block.0', 'mid_block.resnets.0') - new_path = new_path.replace('middle_block.1', 'mid_block.attentions.0') - new_path = new_path.replace('middle_block.2', 'mid_block.resnets.1') - + new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") + new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") + new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") + if additional_replacements is not None: for replacement in additional_replacements: - new_path = new_path.replace(replacement['old'], replacement['new']) + new_path = new_path.replace(replacement["old"], replacement["new"]) # proj_attn.weight has to be converted from conv 1D to linear if "proj_attn.weight" in new_path: - checkpoint[new_path] = old_checkpoint[path['old']][:, :, 0] + checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] else: - checkpoint[new_path] = old_checkpoint[path['old']] + checkpoint[new_path] = old_checkpoint[path["old"]] def conv_attn_to_linear(checkpoint): @@ -207,7 +220,7 @@ def create_unet_diffusers_config(original_config): up_block_types = [] for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 @@ -231,7 +244,7 @@ def create_vae_diffusers_config(original_config): Creates a config for the diffusers based on the config of the LDM model. 
""" vae_params = original_config.model.params.first_stage_config.params.ddconfig - latent_channles = original_config.model.params.first_stage_config.params.embed_dim + _ = original_config.model.params.first_stage_config.params.embed_dim block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) @@ -258,7 +271,7 @@ def create_diffusers_schedular(original_config): beta_schedule="scaled_linear", ) return schedular - + def create_ldm_bert_config(original_config): bert_params = original_config.model.parms.cond_stage_config.params @@ -268,7 +281,7 @@ def create_ldm_bert_config(original_config): encoder_ffn_dim=bert_params.n_embed * 4, ) return config - + def convert_ldm_unet_checkpoint(checkpoint, config): """ @@ -285,51 +298,69 @@ def convert_ldm_unet_checkpoint(checkpoint, config): new_checkpoint = {} - new_checkpoint['time_embedding.linear_1.weight'] = unet_state_dict['time_embed.0.weight'] - new_checkpoint['time_embedding.linear_1.bias'] = unet_state_dict['time_embed.0.bias'] - new_checkpoint['time_embedding.linear_2.weight'] = unet_state_dict['time_embed.2.weight'] - new_checkpoint['time_embedding.linear_2.bias'] = unet_state_dict['time_embed.2.bias'] + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] - new_checkpoint['conv_in.weight'] = unet_state_dict['input_blocks.0.0.weight'] - new_checkpoint['conv_in.bias'] = unet_state_dict['input_blocks.0.0.bias'] + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] + new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] - new_checkpoint['conv_norm_out.weight'] = unet_state_dict['out.0.weight'] - new_checkpoint['conv_norm_out.bias'] = unet_state_dict['out.0.bias'] - new_checkpoint['conv_out.weight'] = unet_state_dict['out.2.weight'] - new_checkpoint['conv_out.bias'] = unet_state_dict['out.2.bias'] + new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] + new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] + new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] + new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] # Retrieves the keys for the input blocks only - num_input_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'input_blocks' in layer}) - input_blocks = {layer_id: [key for key in unet_state_dict if f'input_blocks.{layer_id}' in key] for layer_id in range(num_input_blocks)} + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) + input_blocks = { + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + for layer_id in range(num_input_blocks) + } # Retrieves the keys for the middle blocks only - num_middle_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'middle_block' in layer}) - middle_blocks = {layer_id: [key for key in unet_state_dict if f'middle_block.{layer_id}' in key] for layer_id in range(num_middle_blocks)} + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) + middle_blocks = { + layer_id: [key for key in unet_state_dict if 
f"middle_block.{layer_id}" in key] + for layer_id in range(num_middle_blocks) + } # Retrieves the keys for the output blocks only - num_output_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'output_blocks' in layer}) - output_blocks = {layer_id: [key for key in unet_state_dict if f'output_blocks.{layer_id}' in key] for layer_id in range(num_output_blocks)} + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) + output_blocks = { + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + for layer_id in range(num_output_blocks) + } for i in range(1, num_input_blocks): - block_id = (i - 1) // (config['layers_per_block'] + 1) - layer_in_block_id = (i - 1) % (config['layers_per_block'] + 1) - - resnets = [key for key in input_blocks[i] if f'input_blocks.{i}.0' in key and f'input_blocks.{i}.0.op' not in key] - attentions = [key for key in input_blocks[i] if f'input_blocks.{i}.1' in key] - - if f'input_blocks.{i}.0.op.weight' in unet_state_dict: - new_checkpoint[f'down_blocks.{block_id}.downsamplers.0.conv.weight'] = unet_state_dict.pop(f'input_blocks.{i}.0.op.weight') - new_checkpoint[f'down_blocks.{block_id}.downsamplers.0.conv.bias'] = unet_state_dict.pop(f'input_blocks.{i}.0.op.bias') + block_id = (i - 1) // (config["layers_per_block"] + 1) + layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) + + resnets = [ + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key + ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] + + if f"input_blocks.{i}.0.op.weight" in unet_state_dict: + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) paths = renew_resnet_paths(resnets) - meta_path = {'old': f'input_blocks.{i}.0', 'new': f'down_blocks.{block_id}.resnets.{layer_in_block_id}'} - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) if len(attentions): paths = renew_attention_paths(attentions) - meta_path = {'old': f'input_blocks.{i}.1', 'new': f'down_blocks.{block_id}.attentions.{layer_in_block_id}'} - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) - + meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) resnet_0 = middle_blocks[0] attentions = middle_blocks[1] @@ -342,36 +373,44 @@ def convert_ldm_unet_checkpoint(checkpoint, config): assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) attentions_paths = renew_attention_paths(attentions) - meta_path = {'old': 'middle_block.1', 'new': 'mid_block.attentions.0'} - assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} + assign_to_checkpoint( + 
attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) for i in range(num_output_blocks): - block_id = i // (config['layers_per_block'] + 1) - layer_in_block_id = i % (config['layers_per_block'] + 1) + block_id = i // (config["layers_per_block"] + 1) + layer_in_block_id = i % (config["layers_per_block"] + 1) output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] output_block_list = {} for layer in output_block_layers: - layer_id, layer_name = layer.split('.')[0], shave_segments(layer, 1) + layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) if layer_id in output_block_list: output_block_list[layer_id].append(layer_name) else: output_block_list[layer_id] = [layer_name] if len(output_block_list) > 1: - resnets = [key for key in output_blocks[i] if f'output_blocks.{i}.0' in key] - attentions = [key for key in output_blocks[i] if f'output_blocks.{i}.1' in key] + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] resnet_0_paths = renew_resnet_paths(resnets) paths = renew_resnet_paths(resnets) - meta_path = {'old': f'output_blocks.{i}.0', 'new': f'up_blocks.{block_id}.resnets.{layer_in_block_id}'} - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) - if ['conv.weight', 'conv.bias'] in output_block_list.values(): - index = list(output_block_list.values()).index(['conv.weight', 'conv.bias']) - new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.weight'] = unet_state_dict[f'output_blocks.{i}.{index}.conv.weight'] - new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.bias'] = unet_state_dict[f'output_blocks.{i}.{index}.conv.bias'] + if ["conv.weight", "conv.bias"] in output_block_list.values(): + index = list(output_block_list.values()).index(["conv.weight", "conv.bias"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] # Clear attentions as they have been attributed above. 
if len(attentions) == 2: @@ -380,15 +419,17 @@ def convert_ldm_unet_checkpoint(checkpoint, config): if len(attentions): paths = renew_attention_paths(attentions) meta_path = { - 'old': f'output_blocks.{i}.1', - 'new': f'up_blocks.{block_id}.attentions.{layer_in_block_id}' + "old": f"output_blocks.{i}.1", + "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", } - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) else: resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) for path in resnet_0_paths: - old_path = '.'.join(['output_blocks', str(i), path['old']]) - new_path = '.'.join(['up_blocks', str(block_id), 'resnets', str(layer_in_block_id), path['new']]) + old_path = ".".join(["output_blocks", str(i), path["old"]]) + new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) new_checkpoint[new_path] = unet_state_dict[old_path] @@ -403,7 +444,7 @@ def convert_ldm_vae_checkpoint(checkpoint, config): for key in keys: if key.startswith(vae_key): vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) - + new_checkpoint = {} new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] @@ -425,66 +466,78 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] - # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({'.'.join(layer.split('.')[:3]) for layer in vae_state_dict if 'encoder.down' in layer}) - down_blocks = {layer_id: [key for key in vae_state_dict if f'down.{layer_id}' in key] for layer_id in range(num_down_blocks)} + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) + down_blocks = { + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) + } # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({'.'.join(layer.split('.')[:3]) for layer in vae_state_dict if 'decoder.up' in layer}) - up_blocks = {layer_id: [key for key in vae_state_dict if f'up.{layer_id}' in key] for layer_id in range(num_up_blocks)} + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) + up_blocks = { + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) + } - for i in range(num_down_blocks): - resnets = [key for key in down_blocks[i] if f'down.{i}' in key and f"down.{i}.downsample" not in key] - + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] + if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.bias") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" 
+ ) paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'down.{i}.block', 'new': f'down_blocks.{i}.resnets'} + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - + mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i - 1}'} + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) - meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} + meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): block_id = num_up_blocks - 1 - i - resnets = [key for key in up_blocks[block_id] if f'up.{block_id}' in key and f"up.{block_id}.upsample" not in key] + resnets = [ + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + ] if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.bias"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'up.{block_id}.block', 'new': f'up_blocks.{i}.resnets'} + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - + mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i - 1}'} + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) - meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} + meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) conv_attn_to_linear(new_checkpoint) return new_checkpoint @@ -500,32 +553,30 @@ def _copy_attn_layer(hf_attn_layer, pt_attn_layer): hf_attn_layer.out_proj.weight = 
pt_attn_layer.to_out.weight hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias - def _copy_linear(hf_linear, pt_linear): hf_linear.weight = pt_linear.weight hf_linear.bias = pt_linear.bias - def _copy_layer(hf_layer, pt_layer): # copy layer norms _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0]) _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0]) - + # copy attn _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1]) - + # copy MLP pt_mlp = pt_layer[1][1] _copy_linear(hf_layer.fc1, pt_mlp.net[0][0]) _copy_linear(hf_layer.fc2, pt_mlp.net[2]) - def _copy_layers(hf_layers, pt_layers): for i, hf_layer in enumerate(hf_layers): - if i != 0: i += i - pt_layer = pt_layers[i:i+2] + if i != 0: + i += i + pt_layer = pt_layers[i : i + 2] _copy_layer(hf_layer, pt_layer) - + hf_model = LDMBertModel(config).eval() # copy embeds @@ -537,13 +588,12 @@ def _copy_layers(hf_layers, pt_layers): # copy hidden layers _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers) - + _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits) return hf_model - if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -555,32 +605,55 @@ def _copy_layers(hf_layers, pt_layers): "--original_config_file", default=None, type=str, - required=True, help="The YAML config file corresponding to the original architecture.", ) parser.add_argument( - "--scheduler_type", default="pndm", type=str, required=True, help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']" - ) - parser.add_argument( - "--dump_path", default=None, type=str, required=True, help="Path to the output model." + "--scheduler_type", + default="pndm", + type=str, + help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']", ) + parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") args = parser.parse_args() - original_config = OmegaConf.load(args.original_config_file) + if args.original_config_file is None: + os.system( + "wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml" + ) + args.original_config_file = "./v1-inference.yaml" + original_config = OmegaConf.load(args.original_config_file) checkpoint = torch.load(args.checkpoint_path)["state_dict"] + num_train_timesteps = original_config.model.params.timesteps + beta_start = original_config.model.params.linear_start + beta_end = original_config.model.params.linear_end if args.scheduler_type == "pndm": - elif args.scheduler_type == "pndm": - elif args.scheduler_type == "pndm": + scheduler = PNDMScheduler( + beta_end=beta_end, + beta_schedule="scaled_linear", + beta_start=beta_start, + num_train_timesteps=num_train_timesteps, + skip_prk_steps=True, + ) + elif args.scheduler_type == "lms": + scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") + elif args.scheduler_type == "ddim": + scheduler = DDIMScheduler( + beta_start=beta_start, + beta_end=beta_end, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) else: raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") # Convert the UNet2DConditionModel model. 
unet_config = create_unet_diffusers_config(original_config) converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config) - + unet = UNet2DConditionModel(**unet_config) unet.load_state_dict(converted_unet_checkpoint) @@ -596,13 +669,21 @@ def _copy_layers(hf_layers, pt_layers): if text_model_type == "FrozenCLIPEmbedder": text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker") + pipe = StableDiffusionPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) else: - # TODO: update the convert function to use the state_dict without the model instance. text_config = create_ldm_bert_config(original_config) text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") + pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) - scheduler = create_diffusers_schedular(original_config) - pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) pipe.save_pretrained(args.dump_path) - From 6c80f98ce18f37aa423ea229915b7a11839a0407 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 15 Sep 2022 21:56:52 +0000 Subject: [PATCH 19/20] up --- ...onvert_original_stable_diffusion_to_diffusers.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py index cc417188f88b..ee7fc335438f 100644 --- a/scripts/convert_original_stable_diffusion_to_diffusers.py +++ b/scripts/convert_original_stable_diffusion_to_diffusers.py @@ -15,9 +15,11 @@ """ Conversion script for the LDM checkpoints. """ import argparse -import torch import os +import torch + + try: from omegaconf import OmegaConf except ImportError: @@ -25,18 +27,18 @@ "OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`." 
) -from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel, AutoFeatureExtractor from diffusers import ( - LDMTextToImagePipeline, AutoencoderKL, - UNet2DConditionModel, DDIMScheduler, + LDMTextToImagePipeline, LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline, + UNet2DConditionModel, ) +from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker -from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig +from transformers import AutoFeatureExtractor, BertTokenizerFast, CLIPTextModel, CLIPTokenizer def shave_segments(path, n_shave_prefix_segments=1): @@ -545,7 +547,6 @@ def convert_ldm_vae_checkpoint(checkpoint, config): def convert_ldm_bert_checkpoint(checkpoint, config): def _copy_attn_layer(hf_attn_layer, pt_attn_layer): - hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight From 81e0393212e14877c069cc4f961211d5ab14dae6 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 15 Sep 2022 22:06:56 +0000 Subject: [PATCH 20/20] up --- _ | 608 -------------------------------------------------------------- 1 file changed, 608 deletions(-) delete mode 100644 _ diff --git a/_ b/_ deleted file mode 100644 index 0965f6522f6a..000000000000 --- a/_ +++ /dev/null @@ -1,608 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Conversion script for the LDM checkpoints. """ - -import argparse -import torch - -try: - from omegaconf import OmegaConf -except ImportError: - raise ImportError("OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`.") - -from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel -from diffusers import LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline -from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig - - -def shave_segments(path, n_shave_prefix_segments=1): - """ - Removes segments. Positive values shave the first segments, negative shave the last segments. 
- """ - if n_shave_prefix_segments >= 0: - return '.'.join(path.split('.')[n_shave_prefix_segments:]) - else: - return '.'.join(path.split('.')[:n_shave_prefix_segments]) - - -def renew_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item.replace('in_layers.0', 'norm1') - new_item = new_item.replace('in_layers.2', 'conv1') - - new_item = new_item.replace('out_layers.0', 'norm2') - new_item = new_item.replace('out_layers.3', 'conv2') - - new_item = new_item.replace('emb_layers.1', 'time_emb_proj') - new_item = new_item.replace('skip_connection', 'conv_shortcut') - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({'old': old_item, 'new': new_item}) - - return mapping - - -def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace('nin_shortcut', 'conv_shortcut') - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({'old': old_item, 'new': new_item}) - - return mapping - - -def renew_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - -# new_item = new_item.replace('norm.weight', 'group_norm.weight') -# new_item = new_item.replace('norm.bias', 'group_norm.bias') - -# new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') -# new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') - -# new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({'old': old_item, 'new': new_item}) - - return mapping - - -def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace('norm.weight', 'group_norm.weight') - new_item = new_item.replace('norm.bias', 'group_norm.bias') - - new_item = new_item.replace('q.weight', 'query.weight') - new_item = new_item.replace('q.bias', 'query.bias') - - new_item = new_item.replace('k.weight', 'key.weight') - new_item = new_item.replace('k.bias', 'key.bias') - - new_item = new_item.replace('v.weight', 'value.weight') - new_item = new_item.replace('v.bias', 'value.bias') - - new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') - new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({'old': old_item, 'new': new_item}) - - return mapping - - -def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None): - """ - This does the final conversion step: take locally converted weights and apply a global renaming - to them. It splits attention layers, and takes into account additional replacements - that may arise. - - Assigns the weights to the new checkpoint. - """ - assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." - - # Splits the attention layers into three variables. 
- if attention_paths_to_split is not None: - for path, path_map in attention_paths_to_split.items(): - old_tensor = old_checkpoint[path] - channels = old_tensor.shape[0] // 3 - - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) - - num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - - old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) - query, key, value = old_tensor.split(channels // num_heads, dim=1) - - checkpoint[path_map['query']] = query.reshape(target_shape) - checkpoint[path_map['key']] = key.reshape(target_shape) - checkpoint[path_map['value']] = value.reshape(target_shape) - - for path in paths: - new_path = path['new'] - - # These have already been assigned - if attention_paths_to_split is not None and new_path in attention_paths_to_split: - continue - - # Global renaming happens here - new_path = new_path.replace('middle_block.0', 'mid_block.resnets.0') - new_path = new_path.replace('middle_block.1', 'mid_block.attentions.0') - new_path = new_path.replace('middle_block.2', 'mid_block.resnets.1') - - if additional_replacements is not None: - for replacement in additional_replacements: - new_path = new_path.replace(replacement['old'], replacement['new']) - - # proj_attn.weight has to be converted from conv 1D to linear - if "proj_attn.weight" in new_path: - checkpoint[new_path] = old_checkpoint[path['old']][:, :, 0] - else: - checkpoint[new_path] = old_checkpoint[path['old']] - - -def conv_attn_to_linear(checkpoint): - keys = list(checkpoint.keys()) - attn_keys = ["query.weight", "key.weight", "value.weight"] - for key in keys: - if ".".join(key.split(".")[-2:]) in attn_keys: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0, 0] - elif "proj_attn.weight" in key: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0] - - -def create_unet_diffusers_config(original_config): - """ - Creates a config for the diffusers based on the config of the LDM model. - """ - unet_params = original_config.model.params.unet_config.params - - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] - - down_block_types = [] - resolution = 1 - for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" - down_block_types.append(block_type) - if i != len(block_out_channels) - 1: - resolution *= 2 - - up_block_types = [] - for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" - up_block_types.append(block_type) - resolution //= 2 - - config = dict( - sample_size=unet_params.image_size, - in_channels=unet_params.in_channels, - out_channels=unet_params.out_channels, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - layers_per_block=unet_params.num_res_blocks, - cross_attention_dim=unet_params.context_dim, - attention_head_dim=unet_params.num_heads, - ) - - return config - - -def create_vae_diffusers_config(original_config): - """ - Creates a config for the diffusers based on the config of the LDM model. 
- """ - vae_params = original_config.model.params.first_stage_config.params.ddconfig - latent_channles = original_config.model.params.first_stage_config.params.embed_dim - - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] - down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) - up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) - - config = dict( - sample_size=vae_params.resolution, - in_channels=vae_params.in_channels, - out_channels=vae_params.out_ch, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, - ) - return config - - -def create_diffusers_schedular(original_config): - schedular = DDIMScheduler( - num_train_timesteps=original_config.model.params.timesteps, - beta_start=original_config.model.params.linear_start, - beta_end=original_config.model.params.linear_end, - beta_schedule="scaled_linear", - ) - return schedular - - -def create_ldm_bert_config(original_config): - bert_params = original_config.model.parms.cond_stage_config.params - config = LDMBertConfig( - d_model=bert_params.n_embed, - encoder_layers=bert_params.n_layer, - encoder_ffn_dim=bert_params.n_embed * 4, - ) - return config - - -def convert_ldm_unet_checkpoint(checkpoint, config): - """ - Takes a state dict and a config, and returns a converted checkpoint. - """ - - # extract state_dict for UNet - unet_state_dict = {} - unet_key = "model.diffusion_model." - keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(unet_key): - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) - - new_checkpoint = {} - - new_checkpoint['time_embedding.linear_1.weight'] = unet_state_dict['time_embed.0.weight'] - new_checkpoint['time_embedding.linear_1.bias'] = unet_state_dict['time_embed.0.bias'] - new_checkpoint['time_embedding.linear_2.weight'] = unet_state_dict['time_embed.2.weight'] - new_checkpoint['time_embedding.linear_2.bias'] = unet_state_dict['time_embed.2.bias'] - - new_checkpoint['conv_in.weight'] = unet_state_dict['input_blocks.0.0.weight'] - new_checkpoint['conv_in.bias'] = unet_state_dict['input_blocks.0.0.bias'] - - new_checkpoint['conv_norm_out.weight'] = unet_state_dict['out.0.weight'] - new_checkpoint['conv_norm_out.bias'] = unet_state_dict['out.0.bias'] - new_checkpoint['conv_out.weight'] = unet_state_dict['out.2.weight'] - new_checkpoint['conv_out.bias'] = unet_state_dict['out.2.bias'] - - # Retrieves the keys for the input blocks only - num_input_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'input_blocks' in layer}) - input_blocks = {layer_id: [key for key in unet_state_dict if f'input_blocks.{layer_id}' in key] for layer_id in range(num_input_blocks)} - - # Retrieves the keys for the middle blocks only - num_middle_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'middle_block' in layer}) - middle_blocks = {layer_id: [key for key in unet_state_dict if f'middle_block.{layer_id}' in key] for layer_id in range(num_middle_blocks)} - - # Retrieves the keys for the output blocks only - num_output_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'output_blocks' in layer}) - output_blocks = {layer_id: [key for key in unet_state_dict if f'output_blocks.{layer_id}' in key] for layer_id in range(num_output_blocks)} - - for i in range(1, num_input_blocks): - block_id = (i - 1) // 
(config['layers_per_block'] + 1) - layer_in_block_id = (i - 1) % (config['layers_per_block'] + 1) - - resnets = [key for key in input_blocks[i] if f'input_blocks.{i}.0' in key and f'input_blocks.{i}.0.op' not in key] - attentions = [key for key in input_blocks[i] if f'input_blocks.{i}.1' in key] - - if f'input_blocks.{i}.0.op.weight' in unet_state_dict: - new_checkpoint[f'down_blocks.{block_id}.downsamplers.0.conv.weight'] = unet_state_dict.pop(f'input_blocks.{i}.0.op.weight') - new_checkpoint[f'down_blocks.{block_id}.downsamplers.0.conv.bias'] = unet_state_dict.pop(f'input_blocks.{i}.0.op.bias') - - paths = renew_resnet_paths(resnets) - meta_path = {'old': f'input_blocks.{i}.0', 'new': f'down_blocks.{block_id}.resnets.{layer_in_block_id}'} - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = {'old': f'input_blocks.{i}.1', 'new': f'down_blocks.{block_id}.attentions.{layer_in_block_id}'} - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) - - - resnet_0 = middle_blocks[0] - attentions = middle_blocks[1] - resnet_1 = middle_blocks[2] - - resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) - - resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) - - attentions_paths = renew_attention_paths(attentions) - meta_path = {'old': 'middle_block.1', 'new': 'mid_block.attentions.0'} - assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) - - for i in range(num_output_blocks): - block_id = i // (config['layers_per_block'] + 1) - layer_in_block_id = i % (config['layers_per_block'] + 1) - output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] - output_block_list = {} - - for layer in output_block_layers: - layer_id, layer_name = layer.split('.')[0], shave_segments(layer, 1) - if layer_id in output_block_list: - output_block_list[layer_id].append(layer_name) - else: - output_block_list[layer_id] = [layer_name] - - if len(output_block_list) > 1: - resnets = [key for key in output_blocks[i] if f'output_blocks.{i}.0' in key] - attentions = [key for key in output_blocks[i] if f'output_blocks.{i}.1' in key] - - resnet_0_paths = renew_resnet_paths(resnets) - paths = renew_resnet_paths(resnets) - - meta_path = {'old': f'output_blocks.{i}.0', 'new': f'up_blocks.{block_id}.resnets.{layer_in_block_id}'} - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) - - if ['conv.weight', 'conv.bias'] in output_block_list.values(): - index = list(output_block_list.values()).index(['conv.weight', 'conv.bias']) - new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.weight'] = unet_state_dict[f'output_blocks.{i}.{index}.conv.weight'] - new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.bias'] = unet_state_dict[f'output_blocks.{i}.{index}.conv.bias'] - - # Clear attentions as they have been attributed above. 
- if len(attentions) == 2: - attentions = [] - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = { - 'old': f'output_blocks.{i}.1', - 'new': f'up_blocks.{block_id}.attentions.{layer_in_block_id}' - } - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) - else: - resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) - for path in resnet_0_paths: - old_path = '.'.join(['output_blocks', str(i), path['old']]) - new_path = '.'.join(['up_blocks', str(block_id), 'resnets', str(layer_in_block_id), path['new']]) - - new_checkpoint[new_path] = unet_state_dict[old_path] - - return new_checkpoint - - -def convert_ldm_vae_checkpoint(checkpoint, config): - # extract state dict for VAE - vae_state_dict = {} - vae_key = "first_stage_model." - keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(vae_key): - vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) - - new_checkpoint = {} - - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] - - new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] - new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] - - - # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({'.'.join(layer.split('.')[:3]) for layer in vae_state_dict if 'encoder.down' in layer}) - down_blocks = {layer_id: [key for key in vae_state_dict if f'down.{layer_id}' in key] for layer_id in range(num_down_blocks)} - - # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({'.'.join(layer.split('.')[:3]) for layer in vae_state_dict if 'decoder.up' in layer}) - up_blocks = {layer_id: [key for key in vae_state_dict if f'up.{layer_id}' in key] for layer_id in range(num_up_blocks)} - - - for i in range(num_down_blocks): - resnets = [key for key in down_blocks[i] if f'down.{i}' in key and f"down.{i}.downsample" not in key] - - if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.bias") - - paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'down.{i}.block', 
'new': f'down_blocks.{i}.resnets'} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i - 1}'} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - - for i in range(num_up_blocks): - block_id = num_up_blocks - 1 - i - resnets = [key for key in up_blocks[block_id] if f'up.{block_id}' in key and f"up.{block_id}.upsample" not in key] - - if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.bias"] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'up.{block_id}.block', 'new': f'up_blocks.{i}.resnets'} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i - 1}'} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - return new_checkpoint - - -def convert_ldm_bert_checkpoint(checkpoint, config): - def _copy_attn_layer(hf_attn_layer, pt_attn_layer): - - hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight - hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight - hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight - - hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight - hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias - - - def _copy_linear(hf_linear, pt_linear): - hf_linear.weight = pt_linear.weight - hf_linear.bias = pt_linear.bias - - - def _copy_layer(hf_layer, pt_layer): - # copy layer norms - _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0]) - _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0]) - - # copy attn - _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1]) - - # copy MLP - pt_mlp = pt_layer[1][1] - _copy_linear(hf_layer.fc1, pt_mlp.net[0][0]) - _copy_linear(hf_layer.fc2, pt_mlp.net[2]) - - - def _copy_layers(hf_layers, pt_layers): - for i, hf_layer 
in enumerate(hf_layers): - if i != 0: i += i - pt_layer = pt_layers[i:i+2] - _copy_layer(hf_layer, pt_layer) - - hf_model = LDMBertModel(config).eval() - - # copy embeds - hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight - hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight - - # copy layer norm - _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm) - - # copy hidden layers - _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers) - - _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits) - - return hf_model - - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." - ) - # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml - parser.add_argument( - "--original_config_file", - default=None, - type=str, - required=True, - help="The YAML config file corresponding to the original architecture.", - ) - parser.add_argument( - "--scheduler_type", default="pndm", type=str, required=True, help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']" - ) - parser.add_argument( - "--dump_path", default=None, type=str, required=True, help="Path to the output model." - ) - - args = parser.parse_args() - - original_config = OmegaConf.load(args.original_config_file) - - checkpoint = torch.load(args.checkpoint_path)["state_dict"] - - if args.scheduler_type == "pndm": - elif args.scheduler_type == "pndm": - elif args.scheduler_type == "pndm": - else: - raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") - - # Convert the UNet2DConditionModel model. - unet_config = create_unet_diffusers_config(original_config) - converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config) - - unet = UNet2DConditionModel(**unet_config) - unet.load_state_dict(converted_unet_checkpoint) - - # Convert the VAE model. - vae_config = create_vae_diffusers_config(original_config) - converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) - - vae = AutoencoderKL(**vae_config) - vae.load_state_dict(converted_vae_checkpoint) - - # Convert the text model. - text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] - if text_model_type == "FrozenCLIPEmbedder": - text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - else: - # TODO: update the convert function to use the state_dict without the model instance. - text_config = create_ldm_bert_config(original_config) - text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) - tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") - - scheduler = create_diffusers_schedular(original_config) - pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) - pipe.save_pretrained(args.dump_path) -
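
After this patch series, the conversion script reads the original checkpoint's `state_dict`, builds the UNet, VAE, and text-encoder configs from the YAML config, instantiates the scheduler requested via --scheduler_type, and saves the assembled pipeline with pipe.save_pretrained(args.dump_path). A minimal usage sketch follows; the checkpoint filename and output directory are placeholders, and the exact output attribute of the generation call may differ across diffusers versions.

    # Hypothetical paths; substitute your own checkpoint and output directory.
    # Run the converter (from the repository root); --original_config_file may be
    # omitted, in which case the script downloads v1-inference.yaml itself:
    #
    #   python scripts/convert_original_stable_diffusion_to_diffusers.py \
    #       --checkpoint_path ./sd-v1-4.ckpt \
    #       --scheduler_type ddim \
    #       --dump_path ./stable-diffusion-v1-4-diffusers

    import torch
    from diffusers import StableDiffusionPipeline

    # Load the converted weights that the script wrote via save_pretrained().
    pipe = StableDiffusionPipeline.from_pretrained("./stable-diffusion-v1-4-diffusers")
    pipe = pipe.to("cuda")

    # Generate an image with the converted pipeline.
    image = pipe("a photograph of an astronaut riding a horse").images[0]
    image.save("astronaut.png")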