From 7e3987cbd67f03cea7554742f41b3bf78e54b1b4 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Fri, 5 Aug 2022 16:32:23 +0530 Subject: [PATCH 01/20] begin text2img conversion script --- ...xt2img_original_checkpoint_to_diffusers.py | 423 ++++++++++++++++++ 1 file changed, 423 insertions(+) create mode 100644 scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py new file mode 100644 index 000000000000..d83f7a69602c --- /dev/null +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -0,0 +1,423 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Conversion script for the LDM checkpoints. """ + +import argparse +import json +import torch + +try: + import OmegaConf +except ImportError: + raise ImportError("OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`.") + +from diffusers import VQModel, DDPMScheduler, UNet2DModel, LDMPipeline + + +def shave_segments(path, n_shave_prefix_segments=1): + """ + Removes segments. Positive values shave the first segments, negative shave the last segments. 
+ """ + if n_shave_prefix_segments >= 0: + return '.'.join(path.split('.')[n_shave_prefix_segments:]) + else: + return '.'.join(path.split('.')[:n_shave_prefix_segments]) + + +def renew_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item.replace('in_layers.0', 'norm1') + new_item = new_item.replace('in_layers.2', 'conv1') + + new_item = new_item.replace('out_layers.0', 'norm2') + new_item = new_item.replace('out_layers.3', 'conv2') + + new_item = new_item.replace('emb_layers.1', 'time_emb_proj') + new_item = new_item.replace('skip_connection', 'conv_shortcut') + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({'old': old_item, 'new': new_item}) + + return mapping + + +def renew_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace('norm.weight', 'group_norm.weight') + new_item = new_item.replace('norm.bias', 'group_norm.bias') + + new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') + new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({'old': old_item, 'new': new_item}) + + return mapping + + +def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None): + """ + This does the final conversion step: take locally converted weights and apply a global renaming + to them. It splits attention layers, and takes into account additional replacements + that may arise. + + Assigns the weights to the new checkpoint. + """ + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." + + # Splits the attention layers into three variables. 
+ if attention_paths_to_split is not None: + for path, path_map in attention_paths_to_split.items(): + old_tensor = old_checkpoint[path] + channels = old_tensor.shape[0] // 3 + + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) + + num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 + + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) + query, key, value = old_tensor.split(channels // num_heads, dim=1) + + checkpoint[path_map['query']] = query.reshape(target_shape) + checkpoint[path_map['key']] = key.reshape(target_shape) + checkpoint[path_map['value']] = value.reshape(target_shape) + + for path in paths: + new_path = path['new'] + + # These have already been assigned + if attention_paths_to_split is not None and new_path in attention_paths_to_split: + continue + + # Global renaming happens here + new_path = new_path.replace('middle_block.0', 'mid.resnets.0') + new_path = new_path.replace('middle_block.1', 'mid.attentions.0') + new_path = new_path.replace('middle_block.2', 'mid.resnets.1') + + if additional_replacements is not None: + for replacement in additional_replacements: + new_path = new_path.replace(replacement['old'], replacement['new']) + + # proj_attn.weight has to be converted from conv 1D to linear + if "proj_attn.weight" in new_path: + checkpoint[new_path] = old_checkpoint[path['old']][:, :, 0] + else: + checkpoint[new_path] = old_checkpoint[path['old']] + + +def convert_ldm_unet_checkpoint(checkpoint, config): + """ + Takes a state dict and a config, and returns a converted checkpoint. + """ + + # extract state_dict for UNet + unet_state_dict = {} + unet_key = "model.diffusion_model." + keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(unet_key): + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) + + new_checkpoint = {} + + new_checkpoint['time_embedding.linear_1.weight'] = unet_state_dict['time_embed.0.weight'] + new_checkpoint['time_embedding.linear_1.bias'] = unet_state_dict['time_embed.0.bias'] + new_checkpoint['time_embedding.linear_2.weight'] = unet_state_dict['time_embed.2.weight'] + new_checkpoint['time_embedding.linear_2.bias'] = unet_state_dict['time_embed.2.bias'] + + new_checkpoint['conv_in.weight'] = unet_state_dict['input_blocks.0.0.weight'] + new_checkpoint['conv_in.bias'] = unet_state_dict['input_blocks.0.0.bias'] + + new_checkpoint['conv_norm_out.weight'] = unet_state_dict['out.0.weight'] + new_checkpoint['conv_norm_out.bias'] = unet_state_dict['out.0.bias'] + new_checkpoint['conv_out.weight'] = unet_state_dict['out.2.weight'] + new_checkpoint['conv_out.bias'] = unet_state_dict['out.2.bias'] + + # Retrieves the keys for the input blocks only + num_input_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'input_blocks' in layer}) + input_blocks = {layer_id: [key for key in unet_state_dict if f'input_blocks.{layer_id}' in key] for layer_id in range(num_input_blocks)} + + # Retrieves the keys for the middle blocks only + num_middle_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'middle_block' in layer}) + middle_blocks = {layer_id: [key for key in unet_state_dict if f'middle_block.{layer_id}' in key] for layer_id in range(num_middle_blocks)} + + # Retrieves the keys for the output blocks only + num_output_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'output_blocks' in layer}) + output_blocks = {layer_id: [key for key in unet_state_dict if 
f'output_blocks.{layer_id}' in key] for layer_id in range(num_output_blocks)} + + for i in range(1, num_input_blocks): + block_id = (i - 1) // (config['num_res_blocks'] + 1) + layer_in_block_id = (i - 1) % (config['num_res_blocks'] + 1) + + resnets = [key for key in input_blocks[i] if f'input_blocks.{i}.0' in key] + attentions = [key for key in input_blocks[i] if f'input_blocks.{i}.1' in key] + + if f'input_blocks.{i}.0.op.weight' in unet_state_dict: + new_checkpoint[f'downsample_blocks.{block_id}.downsamplers.0.conv.weight'] = unet_state_dict[f'input_blocks.{i}.0.op.weight'] + new_checkpoint[f'downsample_blocks.{block_id}.downsamplers.0.conv.bias'] = unet_state_dict[f'input_blocks.{i}.0.op.bias'] + + paths = renew_resnet_paths(resnets) + meta_path = {'old': f'input_blocks.{i}.0', 'new': f'downsample_blocks.{block_id}.resnets.{layer_in_block_id}'} + resnet_op = {'old': 'resnets.2.op', 'new': 'downsamplers.0.op'} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path, resnet_op], config=config) + + if len(attentions): + paths = renew_attention_paths(attentions) + meta_path = {'old': f'input_blocks.{i}.1', 'new': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}'} + to_split = { + f'input_blocks.{i}.1.qkv.bias': { + 'key': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias', + 'query': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias', + 'value': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias', + }, + f'input_blocks.{i}.1.qkv.weight': { + 'key': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight', + 'query': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight', + 'value': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight', + }, + } + assign_to_checkpoint( + paths, + new_checkpoint, + unet_state_dict, + additional_replacements=[meta_path], + attention_paths_to_split=to_split, + config=config + ) + + resnet_0 = middle_blocks[0] + attentions = middle_blocks[1] + resnet_1 = middle_blocks[2] + + resnet_0_paths = renew_resnet_paths(resnet_0) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + + resnet_1_paths = renew_resnet_paths(resnet_1) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + + attentions_paths = renew_attention_paths(attentions) + to_split = { + 'middle_block.1.qkv.bias': { + 'key': 'mid_block.attentions.0.key.bias', + 'query': 'mid_block.attentions.0.query.bias', + 'value': 'mid_block.attentions.0.value.bias', + }, + 'middle_block.1.qkv.weight': { + 'key': 'mid_block.attentions.0.key.weight', + 'query': 'mid_block.attentions.0.query.weight', + 'value': 'mid_block.attentions.0.value.weight', + }, + } + assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, attention_paths_to_split=to_split, config=config) + + for i in range(num_output_blocks): + block_id = i // (config['num_res_blocks'] + 1) + layer_in_block_id = i % (config['num_res_blocks'] + 1) + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] + output_block_list = {} + + for layer in output_block_layers: + layer_id, layer_name = layer.split('.')[0], shave_segments(layer, 1) + if layer_id in output_block_list: + output_block_list[layer_id].append(layer_name) + else: + output_block_list[layer_id] = [layer_name] + + if len(output_block_list) > 1: + resnets = [key for key in output_blocks[i] if f'output_blocks.{i}.0' in 
key] + attentions = [key for key in output_blocks[i] if f'output_blocks.{i}.1' in key] + + resnet_0_paths = renew_resnet_paths(resnets) + paths = renew_resnet_paths(resnets) + + meta_path = {'old': f'output_blocks.{i}.0', 'new': f'up_blocks.{block_id}.resnets.{layer_in_block_id}'} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + + if ['conv.weight', 'conv.bias'] in output_block_list.values(): + index = list(output_block_list.values()).index(['conv.weight', 'conv.bias']) + new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.weight'] = unet_state_dict[f'output_blocks.{i}.{index}.conv.weight'] + new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.bias'] = unet_state_dict[f'output_blocks.{i}.{index}.conv.bias'] + + # Clear attentions as they have been attributed above. + if len(attentions) == 2: + attentions = [] + + if len(attentions): + paths = renew_attention_paths(attentions) + meta_path = { + 'old': f'output_blocks.{i}.1', + 'new': f'up_blocks.{block_id}.attentions.{layer_in_block_id}' + } + to_split = { + f'output_blocks.{i}.1.qkv.bias': { + 'key': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias', + 'query': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias', + 'value': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias', + }, + f'output_blocks.{i}.1.qkv.weight': { + 'key': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight', + 'query': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight', + 'value': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight', + }, + } + assign_to_checkpoint( + paths, + new_checkpoint, + unet_state_dict, + additional_replacements=[meta_path], + attention_paths_to_split=to_split if any('qkv' in key for key in attentions) else None, + config=config, + ) + else: + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) + for path in resnet_0_paths: + old_path = '.'.join(['output_blocks', str(i), path['old']]) + new_path = '.'.join(['up_blocks', str(block_id), 'resnets', str(layer_in_block_id), path['new']]) + + new_checkpoint[new_path] = unet_state_dict[old_path] + + return new_checkpoint + + +def convert_ldm_bert_checkpoint(checkpoint, config): + def _copy_attn_layer(hf_attn_layer, pt_attn_layer): + + hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight + hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight + hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight + + hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight + hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias + + + def _copy_linear(hf_linear, pt_linear): + hf_linear.weight = pt_linear.weight + hf_linear.bias = pt_linear.bias + + def _copy_mlp(hf_mlp, pt_mlp): + _copy_linear(hf_mlp.fc1, pt_mlp.net[0][0]) + _copy_linear(hf_mlp.fc2, pt_mlp.net[2]) + + + def _copy_layer(hf_layer, pt_layer): + # copy layer norms + _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0]) + _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0]) + + # copy attn + _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1]) + + # copy MLP + pt_mlp = pt_layer[1][1] + _copy_linear(hf_layer.fc1, pt_mlp.net[0][0]) + _copy_linear(hf_layer.fc2, pt_mlp.net[2]) + + + def _copy_layers(hf_layers, pt_layers): + for i, hf_layer in enumerate(hf_layers): + if i != 0: i += i + pt_layer = pt_layers[i:i+2] + _copy_layer(hf_layer, pt_layer) + + hf_model = LDMBertModel(config).eval() + + # copy embeds + 
hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight + hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight + + # copy layer norm + _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm) + + # copy hidden layers + _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers) + + _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits) + + return hf_model + + +def convert_vae_checkpoint(checkpoint, config): + pass + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." + ) + + parser.add_argument( + "--original_config_file", + default=None, + type=str, + required=True, + help="The YAML config file corresponding to the original architecture.", + ) + + parser.add_argument( + "--config_file", + default=None, + type=str, + required=True, + help="The config json file corresponding to the architecture.", + ) + + parser.add_argument( + "--ldm_bert_config_file", + default=None, + type=str, + required=False, + help="The config json file corresponding to the LDMBert architecture.", + ) + + parser.add_argument( + "--dump_path", default=None, type=str, required=True, help="Path to the output model." + ) + + args = parser.parse_args() + + checkpoint = torch.load(args.checkpoint_path) + + with open(args.config_file) as f: + config = json.loads(f.read()) + + converted_checkpoint = convert_ldm_checkpoint(checkpoint, config) + + if "ldm" in config: + del config["ldm"] + + model = UNet2DModel(**config) + model.load_state_dict(converted_checkpoint) + + try: + scheduler = DDPMScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1])) + vqvae = VQModel.from_pretrained("/".join(args.checkpoint_path.split("/")[:-1])) + + pipe = LDMPipeline(unet=model, scheduler=scheduler, vae=vqvae) + pipe.save_pretrained(args.dump_path) + except: + model.save_pretrained(args.dump_path) From 21f4d22e1aef80e5f8203343a5f063b7b0a44dde Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Fri, 5 Aug 2022 17:13:05 +0530 Subject: [PATCH 02/20] add fn to convert config --- ...convert_ldm_txt2img_original_checkpoint_to_diffusers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index d83f7a69602c..7723e4382d15 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -129,6 +129,13 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s checkpoint[new_path] = old_checkpoint[path['old']] +def create_unet_diffusers_config(config): + """ + Creates a config for the diffusers based on the config of the LDM model. + """ + unet_config = {} + + def convert_ldm_unet_checkpoint(checkpoint, config): """ Takes a state dict and a config, and returns a converted checkpoint. 
From ee2e6791d1677b82a81ca465cad750abb7e456d8 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Fri, 5 Aug 2022 22:40:04 +0530 Subject: [PATCH 03/20] create config if not provided --- ...xt2img_original_checkpoint_to_diffusers.py | 41 ++++++++++++++++--- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 7723e4382d15..602d5bf4be70 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -129,11 +129,37 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s checkpoint[new_path] = old_checkpoint[path['old']] -def create_unet_diffusers_config(config): +def create_unet_diffusers_config(original_config): """ Creates a config for the diffusers based on the config of the LDM model. """ - unet_config = {} + unet_params = config.model.params.unet_config.params + + block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + + down_block_types = [] + for i in range(len(block_out_channels)): + block_type = "CrossAttnDownBlock2D" if i < len(block_out_channels) - 1 else "DownBlock2D" + down_block_types.append(block_type) + + up_block_types = [] + for i in range(len(block_out_channels)): + block_type = "UpBlock2D" if i == 0 else "CrossAttnUpBlock2D" + up_block_types.append(block_type) + + config = dict( + sample_size=unet_params.image_size, + in_channels=unet_params.in_channels, + out_channels=unet_params.out_channels, + down_block_types=tuple(down_block_types), + up_block_types=tuple(up_block_types), + block_out_channels=tuple(block_out_channels), + layers_per_block=unet_params.num_res_blocks, + cross_attention_dim=unet_params.context_dim, + attention_head_dim=unet_params.num_heads, + ) + + return config def convert_ldm_unet_checkpoint(checkpoint, config): @@ -407,12 +433,17 @@ def convert_vae_checkpoint(checkpoint, config): args = parser.parse_args() + original_config = OmegaConf.load(args.original_config_file) + checkpoint = torch.load(args.checkpoint_path) - with open(args.config_file) as f: - config = json.loads(f.read()) + if args.config_file is not None: + with open(args.config_file) as f: + config = json.loads(f.read()) + else: + config = create_unet_diffusers_config(original_config) - converted_checkpoint = convert_ldm_checkpoint(checkpoint, config) + converted_checkpoint = convert_ldm_unet_checkpoint(checkpoint, config) if "ldm" in config: del config["ldm"] From a717a82d4c28d170d2bee9bbfd613dd1bb69ba5b Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Fri, 5 Aug 2022 22:41:58 +0530 Subject: [PATCH 04/20] update imports and use UNet2DConditionModel --- .../convert_ldm_txt2img_original_checkpoint_to_diffusers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 602d5bf4be70..7da9b639e144 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -23,7 +23,7 @@ except ImportError: raise ImportError("OmegaConf is required to convert the LDM checkpoints. 
Please install it with `pip install OmegaConf`.") -from diffusers import VQModel, DDPMScheduler, UNet2DModel, LDMPipeline +from diffusers import VQModel, DDPMScheduler, LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel def shave_segments(path, n_shave_prefix_segments=1): @@ -448,7 +448,7 @@ def convert_vae_checkpoint(checkpoint, config): if "ldm" in config: del config["ldm"] - model = UNet2DModel(**config) + model = UNet2DConditionModel(**config) model.load_state_dict(converted_checkpoint) try: From b23326b15019ca4cde232642cbad76985bb3da11 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Sat, 6 Aug 2022 13:14:10 +0530 Subject: [PATCH 05/20] fix imports, layer names --- ...t_ldm_txt2img_original_checkpoint_to_diffusers.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 7da9b639e144..96f0e65764f4 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -19,7 +19,7 @@ import torch try: - import OmegaConf + from omegaconf import OmegaConf except ImportError: raise ImportError("OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`.") @@ -133,7 +133,7 @@ def create_unet_diffusers_config(original_config): """ Creates a config for the diffusers based on the config of the LDM model. """ - unet_params = config.model.params.unet_config.params + unet_params = original_config.model.params.unet_config.params block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] @@ -203,8 +203,8 @@ def convert_ldm_unet_checkpoint(checkpoint, config): output_blocks = {layer_id: [key for key in unet_state_dict if f'output_blocks.{layer_id}' in key] for layer_id in range(num_output_blocks)} for i in range(1, num_input_blocks): - block_id = (i - 1) // (config['num_res_blocks'] + 1) - layer_in_block_id = (i - 1) % (config['num_res_blocks'] + 1) + block_id = (i - 1) // (config['layers_per_block'] + 1) + layer_in_block_id = (i - 1) % (config['layers_per_block'] + 1) resnets = [key for key in input_blocks[i] if f'input_blocks.{i}.0' in key] attentions = [key for key in input_blocks[i] if f'input_blocks.{i}.1' in key] @@ -268,8 +268,8 @@ def convert_ldm_unet_checkpoint(checkpoint, config): assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, attention_paths_to_split=to_split, config=config) for i in range(num_output_blocks): - block_id = i // (config['num_res_blocks'] + 1) - layer_in_block_id = i % (config['num_res_blocks'] + 1) + block_id = i // (config['layers_per_block'] + 1) + layer_in_block_id = i % (config['layers_per_block'] + 1) output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] output_block_list = {} From cbd751f5546a0b859386b71ec46e7e22099df312 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Sun, 7 Aug 2022 22:59:49 +0530 Subject: [PATCH 06/20] fix unet coversion --- ...xt2img_original_checkpoint_to_diffusers.py | 87 ++++--------------- 1 file changed, 19 insertions(+), 68 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 96f0e65764f4..f499dfce0aea 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -66,13 +66,13 @@ 
def renew_attention_paths(old_list, n_shave_prefix_segments=0): for old_item in old_list: new_item = old_item - new_item = new_item.replace('norm.weight', 'group_norm.weight') - new_item = new_item.replace('norm.bias', 'group_norm.bias') +# new_item = new_item.replace('norm.weight', 'group_norm.weight') +# new_item = new_item.replace('norm.bias', 'group_norm.bias') - new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') - new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') +# new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') +# new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) +# new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({'old': old_item, 'new': new_item}) @@ -114,9 +114,9 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s continue # Global renaming happens here - new_path = new_path.replace('middle_block.0', 'mid.resnets.0') - new_path = new_path.replace('middle_block.1', 'mid.attentions.0') - new_path = new_path.replace('middle_block.2', 'mid.resnets.1') + new_path = new_path.replace('middle_block.0', 'mid_block.resnets.0') + new_path = new_path.replace('middle_block.1', 'mid_block.attentions.0') + new_path = new_path.replace('middle_block.2', 'mid_block.resnets.1') if additional_replacements is not None: for replacement in additional_replacements: @@ -206,41 +206,22 @@ def convert_ldm_unet_checkpoint(checkpoint, config): block_id = (i - 1) // (config['layers_per_block'] + 1) layer_in_block_id = (i - 1) % (config['layers_per_block'] + 1) - resnets = [key for key in input_blocks[i] if f'input_blocks.{i}.0' in key] + resnets = [key for key in input_blocks[i] if f'input_blocks.{i}.0' in key and f'input_blocks.{i}.0.op' not in key] attentions = [key for key in input_blocks[i] if f'input_blocks.{i}.1' in key] if f'input_blocks.{i}.0.op.weight' in unet_state_dict: - new_checkpoint[f'downsample_blocks.{block_id}.downsamplers.0.conv.weight'] = unet_state_dict[f'input_blocks.{i}.0.op.weight'] - new_checkpoint[f'downsample_blocks.{block_id}.downsamplers.0.conv.bias'] = unet_state_dict[f'input_blocks.{i}.0.op.bias'] + new_checkpoint[f'down_blocks.{block_id}.downsamplers.0.conv.weight'] = unet_state_dict.pop(f'input_blocks.{i}.0.op.weight') + new_checkpoint[f'down_blocks.{block_id}.downsamplers.0.conv.bias'] = unet_state_dict.pop(f'input_blocks.{i}.0.op.bias') paths = renew_resnet_paths(resnets) - meta_path = {'old': f'input_blocks.{i}.0', 'new': f'downsample_blocks.{block_id}.resnets.{layer_in_block_id}'} - resnet_op = {'old': 'resnets.2.op', 'new': 'downsamplers.0.op'} - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path, resnet_op], config=config) + meta_path = {'old': f'input_blocks.{i}.0', 'new': f'down_blocks.{block_id}.resnets.{layer_in_block_id}'} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) if len(attentions): paths = renew_attention_paths(attentions) - meta_path = {'old': f'input_blocks.{i}.1', 'new': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}'} - to_split = { - f'input_blocks.{i}.1.qkv.bias': { - 'key': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias', - 'query': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias', - 'value': 
f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias', - }, - f'input_blocks.{i}.1.qkv.weight': { - 'key': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight', - 'query': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight', - 'value': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight', - }, - } - assign_to_checkpoint( - paths, - new_checkpoint, - unet_state_dict, - additional_replacements=[meta_path], - attention_paths_to_split=to_split, - config=config - ) + meta_path = {'old': f'input_blocks.{i}.1', 'new': f'down_blocks.{block_id}.attentions.{layer_in_block_id}'} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + resnet_0 = middle_blocks[0] attentions = middle_blocks[1] @@ -253,19 +234,8 @@ def convert_ldm_unet_checkpoint(checkpoint, config): assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) attentions_paths = renew_attention_paths(attentions) - to_split = { - 'middle_block.1.qkv.bias': { - 'key': 'mid_block.attentions.0.key.bias', - 'query': 'mid_block.attentions.0.query.bias', - 'value': 'mid_block.attentions.0.value.bias', - }, - 'middle_block.1.qkv.weight': { - 'key': 'mid_block.attentions.0.key.weight', - 'query': 'mid_block.attentions.0.query.weight', - 'value': 'mid_block.attentions.0.value.weight', - }, - } - assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, attention_paths_to_split=to_split, config=config) + meta_path = {'old': 'middle_block.1', 'new': 'mid_block.attentions.0'} + assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) for i in range(num_output_blocks): block_id = i // (config['layers_per_block'] + 1) @@ -305,26 +275,7 @@ def convert_ldm_unet_checkpoint(checkpoint, config): 'old': f'output_blocks.{i}.1', 'new': f'up_blocks.{block_id}.attentions.{layer_in_block_id}' } - to_split = { - f'output_blocks.{i}.1.qkv.bias': { - 'key': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias', - 'query': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias', - 'value': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias', - }, - f'output_blocks.{i}.1.qkv.weight': { - 'key': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight', - 'query': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight', - 'value': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight', - }, - } - assign_to_checkpoint( - paths, - new_checkpoint, - unet_state_dict, - additional_replacements=[meta_path], - attention_paths_to_split=to_split if any('qkv' in key for key in attentions) else None, - config=config, - ) + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) else: resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) for path in resnet_0_paths: From 956e78c4b4318619646059953cde8ac229074b71 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Mon, 8 Aug 2022 11:12:17 +0530 Subject: [PATCH 07/20] add function to convert VAE --- ...xt2img_original_checkpoint_to_diffusers.py | 167 ++++++++++++++++++ 1 file changed, 167 insertions(+) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index f499dfce0aea..853278a7083a 100644 --- 
a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -58,6 +58,21 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): return mapping +def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = new_item.replace('nin_shortcut', 'conv_shortcut') + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({'old': old_item, 'new': new_item}) + + return mapping + + def renew_attention_paths(old_list, n_shave_prefix_segments=0): """ Updates paths inside attentions to the new naming scheme (local renaming) @@ -79,6 +94,36 @@ def renew_attention_paths(old_list, n_shave_prefix_segments=0): return mapping +def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace('norm.weight', 'group_norm.weight') + new_item = new_item.replace('norm.bias', 'group_norm.bias') + + new_item = new_item.replace('q.weight', 'query.weight') + new_item = new_item.replace('q.bias', 'query.bias') + + new_item = new_item.replace('k.weight', 'key.weight') + new_item = new_item.replace('k.bias', 'key.bias') + + new_item = new_item.replace('v.weight', 'value.weight') + new_item = new_item.replace('v.bias', 'value.bias') + + new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') + new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({'old': old_item, 'new': new_item}) + + return mapping + + def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None): """ This does the final conversion step: take locally converted weights and apply a global renaming @@ -129,6 +174,14 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s checkpoint[new_path] = old_checkpoint[path['old']] +def conv_attn_to_linear(checkpoint): + keys = list(checkpoint.keys()) + for key in keys: + if "weight" in checkpoint.keys(): + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0, 0] + + def create_unet_diffusers_config(original_config): """ Creates a config for the diffusers based on the config of the LDM model. @@ -160,6 +213,30 @@ def create_unet_diffusers_config(original_config): ) return config + + +def create_vae_diffusers_config(original_config): + """ + Creates a config for the diffusers based on the config of the LDM model. 
+ """ + vae_params = original_config.model.params.first_stage_config.params.ddconfig + latent_channles = original_config.model.params.first_stage_config.params.embed_dim + + block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) + up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) + + config = dict( + sample_size=vae_params.resolution, + in_channels=vae_params.in_channels, + out_channels=vae_params.out_ch, + down_block_types=tuple(down_block_types), + up_block_types=tuple(up_block_types), + block_out_channels=tuple(block_out_channels), + latent_channels=vae_params.z_channels, + layers_per_block=vae_params.num_res_blocks, + ) + return config def convert_ldm_unet_checkpoint(checkpoint, config): @@ -287,6 +364,96 @@ def convert_ldm_unet_checkpoint(checkpoint, config): return new_checkpoint +def convert_ldm_vae_checkpoint(checkpoint, config): + # extract state dict for VAE + vae_state_dict = {} + vae_key = "first_stage_model." + keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(vae_key): + vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) + + new_checkpoint = {} + + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] + + new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] + new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] + + + # Retrieves the keys for the encoder down blocks only + num_down_blocks = len({'.'.join(layer.split('.')[:2]) for layer in vae_state_dict if 'down' in layer}) + down_blocks = {layer_id: [key for key in vae_state_dict if f'down.{layer_id}' in key] for layer_id in range(num_down_blocks)} + + # Retrieves the keys for the decoder up blocks only + num_up_blocks = len({'.'.join(layer.split('.')[:2]) for layer in vae_state_dict if 'up' in layer}) + up_blocks = {layer_id: [key for key in vae_state_dict if f'up.{layer_id}' in key] for layer_id in range(num_up_blocks)} + + + for i in range(num_down_blocks): + resnets = [key for key in down_blocks[i] if f'down.{i}' in key] + + if f"encoder.down.{i}.downsample" in vae_state_dict: + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.weight"] = vae_state_dict[f"encoder.down.{i}.downsample.weight"] + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.bias"] = vae_state_dict[f"encoder.down.{i}.downsample.bias"] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {'old': f'down.{i}.block', 'new': f'down_blocks.{i}.resnets'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] + num_mid_res_blocks = 2 + for 
i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(mid_resnets) + meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i}'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + + for i in range(num_up_blocks): + resnets = [key for key in up_blocks[i] if f'up.{i}' in key] + + if f"decoder.up.{i}.upsample" in vae_state_dict: + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.weight"] = vae_state_dict[f"decoder.up.{i}.upsample.weight"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.bias"] = vae_state_dict[f"decoder.up.{i}.upsample.bias"] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {'old': f'up.{i}.block', 'new': f'up_blocks.{i}.resnets'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(mid_resnets) + meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i}'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + + + def convert_ldm_bert_checkpoint(checkpoint, config): def _copy_attn_layer(hf_attn_layer, pt_attn_layer): From 379bdc61c4b91f2e6fa1f9428e3b4da370004bb1 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Mon, 8 Aug 2022 12:39:04 +0530 Subject: [PATCH 08/20] fix vae conversion --- ...xt2img_original_checkpoint_to_diffusers.py | 51 +++++++++++-------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 853278a7083a..6f2b1818e9d3 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -64,8 +64,10 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): """ mapping = [] for old_item in old_list: - new_item = new_item.replace('nin_shortcut', 'conv_shortcut') + new_item = old_item + new_item = new_item.replace('nin_shortcut', 'conv_shortcut') + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({'old': old_item, 'new': new_item}) @@ -162,7 +164,7 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s new_path = new_path.replace('middle_block.0', 'mid_block.resnets.0') new_path = new_path.replace('middle_block.1', 'mid_block.attentions.0') new_path = new_path.replace('middle_block.2', 
'mid_block.resnets.1') - + if additional_replacements is not None: for replacement in additional_replacements: new_path = new_path.replace(replacement['old'], replacement['new']) @@ -176,10 +178,14 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s def conv_attn_to_linear(checkpoint): keys = list(checkpoint.keys()) + attn_keys = ["query.weight", "key.weight", "value.weight"] for key in keys: - if "weight" in checkpoint.keys(): + if ".".join(key.split(".")[-2:]) in attn_keys: if checkpoint[key].ndim > 2: checkpoint[key] = checkpoint[key][:, :, 0, 0] + elif "proj_attn.weight" in key: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0] def create_unet_diffusers_config(original_config): @@ -377,11 +383,15 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] @@ -392,20 +402,20 @@ def convert_ldm_vae_checkpoint(checkpoint, config): # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({'.'.join(layer.split('.')[:2]) for layer in vae_state_dict if 'down' in layer}) + num_down_blocks = len({'.'.join(layer.split('.')[:3]) for layer in vae_state_dict if 'encoder.down' in layer}) down_blocks = {layer_id: [key for key in vae_state_dict if f'down.{layer_id}' in key] for layer_id in range(num_down_blocks)} # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({'.'.join(layer.split('.')[:2]) for layer in vae_state_dict if 'up' in layer}) + num_up_blocks = len({'.'.join(layer.split('.')[:3]) for layer in vae_state_dict if 'decoder.up' in layer}) up_blocks = {layer_id: [key for key in vae_state_dict if f'up.{layer_id}' in key] for layer_id in range(num_up_blocks)} for i in range(num_down_blocks): - resnets = [key for key in down_blocks[i] if f'down.{i}' in key] - - if f"encoder.down.{i}.downsample" in vae_state_dict: - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.weight"] = vae_state_dict[f"encoder.down.{i}.downsample.weight"] - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.bias"] = vae_state_dict[f"encoder.down.{i}.downsample.bias"] + resnets = [key for key in down_blocks[i] if f'down.{i}' in key and f"down.{i}.downsample" not in key] + + if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.weight") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.bias") paths = 
renew_vae_resnet_paths(resnets) meta_path = {'old': f'down.{i}.block', 'new': f'down_blocks.{i}.resnets'} @@ -416,8 +426,8 @@ def convert_ldm_vae_checkpoint(checkpoint, config): for i in range(1, num_mid_res_blocks + 1): resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] - paths = renew_vae_resnet_paths(mid_resnets) - meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i}'} + paths = renew_vae_resnet_paths(resnets) + meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i - 1}'} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] @@ -427,14 +437,15 @@ def convert_ldm_vae_checkpoint(checkpoint, config): conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): - resnets = [key for key in up_blocks[i] if f'up.{i}' in key] + block_id = num_up_blocks - 1 - i + resnets = [key for key in up_blocks[block_id] if f'up.{block_id}' in key and f"up.{block_id}.upsample" not in key] - if f"decoder.up.{i}.upsample" in vae_state_dict: - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.weight"] = vae_state_dict[f"decoder.up.{i}.upsample.weight"] - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.bias"] = vae_state_dict[f"decoder.up.{i}.upsample.bias"] + if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.weight"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.bias"] paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'up.{i}.block', 'new': f'up_blocks.{i}.resnets'} + meta_path = {'old': f'up.{block_id}.block', 'new': f'up_blocks.{i}.resnets'} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] @@ -442,8 +453,8 @@ def convert_ldm_vae_checkpoint(checkpoint, config): for i in range(1, num_mid_res_blocks + 1): resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] - paths = renew_vae_resnet_paths(mid_resnets) - meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i}'} + paths = renew_vae_resnet_paths(resnets) + meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i - 1}'} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] @@ -451,7 +462,7 @@ def convert_ldm_vae_checkpoint(checkpoint, config): meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) conv_attn_to_linear(new_checkpoint) - + return new_checkpoint def convert_ldm_bert_checkpoint(checkpoint, config): From bd3623357c2417ae4e3d1b5dcb523f0cc966b60b Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Mon, 8 Aug 2022 12:42:45 +0530 Subject: [PATCH 09/20] update main --- ...ldm_txt2img_original_checkpoint_to_diffusers.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 6f2b1818e9d3..9df23095a2af 100644 --- 
a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -522,8 +522,6 @@ def _copy_layers(hf_layers, pt_layers): return hf_model -def convert_vae_checkpoint(checkpoint, config): - pass if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -572,13 +570,21 @@ def convert_vae_checkpoint(checkpoint, config): else: config = create_unet_diffusers_config(original_config) - converted_checkpoint = convert_ldm_unet_checkpoint(checkpoint, config) + converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, config) + + vae_config = create_vae_diffusers_config(original_config) + converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) + + # TODO: convert bert or CLIP model if "ldm" in config: del config["ldm"] model = UNet2DConditionModel(**config) - model.load_state_dict(converted_checkpoint) + model.load_state_dict(converted_unet_checkpoint) + + vae = AutoencoderKL(**vae_config) + vae.load_state_dict(converted_vae_checkpoint) try: scheduler = DDPMScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1])) From f3c4f994b339bfeefbf98360e5a43da175558d95 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Mon, 8 Aug 2022 13:06:52 +0530 Subject: [PATCH 10/20] create text model --- ...xt2img_original_checkpoint_to_diffusers.py | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 9df23095a2af..91fb225c3659 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -23,7 +23,9 @@ except ImportError: raise ImportError("OmegaConf is required to convert the LDM checkpoints. 
Please install it with `pip install OmegaConf`.") +from transformers import CLIPTokenizer, CLIPTextModel from diffusers import VQModel, DDPMScheduler, LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel +from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig def shave_segments(path, n_shave_prefix_segments=1): @@ -243,6 +245,16 @@ def create_vae_diffusers_config(original_config): layers_per_block=vae_params.num_res_blocks, ) return config + + +def create_ldm_bert_config(original_config): + bert_params = original_config.model.parms.cond_stage_config.params + config = LDMBertConfig( + d_model=bert_params.n_embed, + encoder_layers=bert_params.n_layer, + encoder_ffn_dim=bert_params.n_embed * 4, + ) + return config def convert_ldm_unet_checkpoint(checkpoint, config): @@ -480,10 +492,6 @@ def _copy_linear(hf_linear, pt_linear): hf_linear.weight = pt_linear.weight hf_linear.bias = pt_linear.bias - def _copy_mlp(hf_mlp, pt_mlp): - _copy_linear(hf_mlp.fc1, pt_mlp.net[0][0]) - _copy_linear(hf_mlp.fc2, pt_mlp.net[2]) - def _copy_layer(hf_layer, pt_layer): # copy layer norms @@ -575,10 +583,14 @@ def _copy_layers(hf_layers, pt_layers): vae_config = create_vae_diffusers_config(original_config) converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) - # TODO: convert bert or CLIP model - - if "ldm" in config: - del config["ldm"] + text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] + if text_model_type == "FrozenCLIPEmbedder": + text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") + tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + else: + # TODO: update the convert function to use the state_dict without the model instance. 
+ text_config = create_ldm_bert_config(original_config) + text_checkpoint = convert_ldm_bert_checkpoint(checkpoint, text_config) model = UNet2DConditionModel(**config) model.load_state_dict(converted_unet_checkpoint) @@ -590,7 +602,7 @@ def _copy_layers(hf_layers, pt_layers): scheduler = DDPMScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1])) vqvae = VQModel.from_pretrained("/".join(args.checkpoint_path.split("/")[:-1])) - pipe = LDMPipeline(unet=model, scheduler=scheduler, vae=vqvae) + pipe = LDMTextToImagePipeline(unet=model, scheduler=scheduler, vae=vqvae) pipe.save_pretrained(args.dump_path) except: model.save_pretrained(args.dump_path) From ac0479770f2d005dd449099111e4584664be1425 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Mon, 8 Aug 2022 17:32:17 +0530 Subject: [PATCH 11/20] update config creating logic for unet --- ...convert_ldm_txt2img_original_checkpoint_to_diffusers.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 91fb225c3659..9515982b781b 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -199,14 +199,17 @@ def create_unet_diffusers_config(original_config): block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] down_block_types = [] + resolution = 1 for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if i < len(block_out_channels) - 1 else "DownBlock2D" + block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" down_block_types.append(block_type) + resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = "UpBlock2D" if i == 0 else "CrossAttnUpBlock2D" + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" up_block_types.append(block_type) + resolution //= 2 config = dict( sample_size=unet_params.image_size, From 70e9ac49ddf6be8699c90d4d2ac7be736e58ccd3 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Mon, 8 Aug 2022 22:47:12 +0530 Subject: [PATCH 12/20] fix config creation --- .../convert_ldm_txt2img_original_checkpoint_to_diffusers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 9515982b781b..08023f6c8492 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -203,7 +203,8 @@ def create_unet_diffusers_config(original_config): for i in range(len(block_out_channels)): block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" down_block_types.append(block_type) - resolution *= 2 + if i != len(block_out_channels) - 1: + resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): From a51e2f5a3be863c15cb4a3f69f066a2283179844 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Tue, 9 Aug 2022 16:21:22 +0530 Subject: [PATCH 13/20] update script to create and save pipeline --- ...xt2img_original_checkpoint_to_diffusers.py | 67 ++++++++----------- 1 file changed, 28 insertions(+), 39 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py 
b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 08023f6c8492..8e75f1cabe98 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -23,8 +23,8 @@ except ImportError: raise ImportError("OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`.") -from transformers import CLIPTokenizer, CLIPTextModel -from diffusers import VQModel, DDPMScheduler, LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel +from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel +from diffusers import VQModel, DDPMScheduler, LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig @@ -251,6 +251,16 @@ def create_vae_diffusers_config(original_config): return config +def create_diffusers_schedular(original_config): + schedular = DDIMScheduler( + num_train_timesteps=original_config.model.params.timesteps, + beta_start=original_config.model.params.linear_start, + beta_end=original_config.model.params.linear_end, + beta_schedule="scaled_linear", + ) + return schedular + + def create_ldm_bert_config(original_config): bert_params = original_config.model.parms.cond_stage_config.params config = LDMBertConfig( @@ -550,22 +560,6 @@ def _copy_layers(hf_layers, pt_layers): help="The YAML config file corresponding to the original architecture.", ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help="The config json file corresponding to the architecture.", - ) - - parser.add_argument( - "--ldm_bert_config_file", - default=None, - type=str, - required=False, - help="The config json file corresponding to the LDMBert architecture.", - ) - parser.add_argument( "--dump_path", default=None, type=str, required=True, help="Path to the output model." ) @@ -576,17 +570,21 @@ def _copy_layers(hf_layers, pt_layers): checkpoint = torch.load(args.checkpoint_path) - if args.config_file is not None: - with open(args.config_file) as f: - config = json.loads(f.read()) - else: - config = create_unet_diffusers_config(original_config) - - converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, config) + # Convert the UNet2DConditionModel model. + unet_config = create_unet_diffusers_config(original_config) + converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config) + + unet = UNet2DConditionModel(**unet_config) + unet.load_state_dict(converted_unet_checkpoint) + # Convert the VAE model. vae_config = create_vae_diffusers_config(original_config) converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) + vae = AutoencoderKL(**vae_config) + vae.load_state_dict(converted_vae_checkpoint) + + # Convert the text model. text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] if text_model_type == "FrozenCLIPEmbedder": text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") @@ -594,19 +592,10 @@ def _copy_layers(hf_layers, pt_layers): else: # TODO: update the convert function to use the state_dict without the model instance. 
text_config = create_ldm_bert_config(original_config) - text_checkpoint = convert_ldm_bert_checkpoint(checkpoint, text_config) - - model = UNet2DConditionModel(**config) - model.load_state_dict(converted_unet_checkpoint) - - vae = AutoencoderKL(**vae_config) - vae.load_state_dict(converted_vae_checkpoint) + text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) + tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") - try: - scheduler = DDPMScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1])) - vqvae = VQModel.from_pretrained("/".join(args.checkpoint_path.split("/")[:-1])) + scheduler = create_diffusers_schedular(original_config) + pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) + pipe.save_pretrained(args.dump_path) - pipe = LDMTextToImagePipeline(unet=model, scheduler=scheduler, vae=vqvae) - pipe.save_pretrained(args.dump_path) - except: - model.save_pretrained(args.dump_path) From 954dca72cfece6de626f386f6bb4234935ff1fc9 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Tue, 9 Aug 2022 16:21:50 +0530 Subject: [PATCH 14/20] remove unused imports --- .../convert_ldm_txt2img_original_checkpoint_to_diffusers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index 8e75f1cabe98..d73eb4ef1a71 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -15,7 +15,6 @@ """ Conversion script for the LDM checkpoints. """ import argparse -import json import torch try: @@ -24,7 +23,7 @@ raise ImportError("OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`.") from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel -from diffusers import VQModel, DDPMScheduler, LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler +from diffusers import LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig From 2b804b94df8e2d9f6bf88b188150f5eae9c9576f Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Wed, 10 Aug 2022 20:22:40 +0530 Subject: [PATCH 15/20] fix checkpoint loading --- scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py index d73eb4ef1a71..d360bd968e95 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py @@ -567,7 +567,7 @@ def _copy_layers(hf_layers, pt_layers): original_config = OmegaConf.load(args.original_config_file) - checkpoint = torch.load(args.checkpoint_path) + checkpoint = torch.load(args.checkpoint_path)["state_dict"] # Convert the UNet2DConditionModel model. 
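    # (create_unet_diffusers_config maps the YAML's unet_config params, e.g. channel_mult,
    # attention_resolutions, num_res_blocks and context_dim, onto the diffusers down/up block
    # layout; convert_ldm_unet_checkpoint then renames the original weights to the new module names.)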
unet_config = create_unet_diffusers_config(original_config) From 6df55be10a9bd33e045902cfa0b85f0e0657eb69 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 15 Sep 2022 16:00:10 +0000 Subject: [PATCH 16/20] better name --- ...ers.py => convert_original_stable_diffusion_to_diffusers.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename scripts/{convert_ldm_txt2img_original_checkpoint_to_diffusers.py => convert_original_stable_diffusion_to_diffusers.py} (99%) diff --git a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py similarity index 99% rename from scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py rename to scripts/convert_original_stable_diffusion_to_diffusers.py index d360bd968e95..04d2343a3d3a 100644 --- a/scripts/convert_ldm_txt2img_original_checkpoint_to_diffusers.py +++ b/scripts/convert_original_stable_diffusion_to_diffusers.py @@ -23,7 +23,7 @@ raise ImportError("OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`.") from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel -from diffusers import LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler +from diffusers import LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler, KLM from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig From 2c4ce96d12a65f08b176a6303a4f1dbb305f4ebd Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 15 Sep 2022 16:27:41 +0000 Subject: [PATCH 17/20] save progress --- _ | 608 ++++++++++++++++++ ..._original_stable_diffusion_to_diffusers.py | 16 +- 2 files changed, 620 insertions(+), 4 deletions(-) create mode 100644 _ diff --git a/_ b/_ new file mode 100644 index 000000000000..0965f6522f6a --- /dev/null +++ b/_ @@ -0,0 +1,608 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Conversion script for the LDM checkpoints. """ + +import argparse +import torch + +try: + from omegaconf import OmegaConf +except ImportError: + raise ImportError("OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`.") + +from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel +from diffusers import LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline +from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig + + +def shave_segments(path, n_shave_prefix_segments=1): + """ + Removes segments. Positive values shave the first segments, negative shave the last segments. 
+ """ + if n_shave_prefix_segments >= 0: + return '.'.join(path.split('.')[n_shave_prefix_segments:]) + else: + return '.'.join(path.split('.')[:n_shave_prefix_segments]) + + +def renew_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item.replace('in_layers.0', 'norm1') + new_item = new_item.replace('in_layers.2', 'conv1') + + new_item = new_item.replace('out_layers.0', 'norm2') + new_item = new_item.replace('out_layers.3', 'conv2') + + new_item = new_item.replace('emb_layers.1', 'time_emb_proj') + new_item = new_item.replace('skip_connection', 'conv_shortcut') + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({'old': old_item, 'new': new_item}) + + return mapping + + +def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace('nin_shortcut', 'conv_shortcut') + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({'old': old_item, 'new': new_item}) + + return mapping + + +def renew_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + +# new_item = new_item.replace('norm.weight', 'group_norm.weight') +# new_item = new_item.replace('norm.bias', 'group_norm.bias') + +# new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') +# new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + +# new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({'old': old_item, 'new': new_item}) + + return mapping + + +def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace('norm.weight', 'group_norm.weight') + new_item = new_item.replace('norm.bias', 'group_norm.bias') + + new_item = new_item.replace('q.weight', 'query.weight') + new_item = new_item.replace('q.bias', 'query.bias') + + new_item = new_item.replace('k.weight', 'key.weight') + new_item = new_item.replace('k.bias', 'key.bias') + + new_item = new_item.replace('v.weight', 'value.weight') + new_item = new_item.replace('v.bias', 'value.bias') + + new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') + new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({'old': old_item, 'new': new_item}) + + return mapping + + +def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None): + """ + This does the final conversion step: take locally converted weights and apply a global renaming + to them. It splits attention layers, and takes into account additional replacements + that may arise. + + Assigns the weights to the new checkpoint. + """ + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." + + # Splits the attention layers into three variables. 
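    # For example, with a hypothetical fused qkv conv weight of shape (960, 320, 1), i.e. C=320,
    # and num_head_channels=40: channels = 320 and num_heads = 8; the tensor is reshaped to
    # (8, 120, 320, 1), split into query/key/value of shape (8, 40, 320, 1) each, and finally
    # reshaped to the (320, 320) linear projection weights that diffusers expects.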
+ if attention_paths_to_split is not None: + for path, path_map in attention_paths_to_split.items(): + old_tensor = old_checkpoint[path] + channels = old_tensor.shape[0] // 3 + + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) + + num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 + + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) + query, key, value = old_tensor.split(channels // num_heads, dim=1) + + checkpoint[path_map['query']] = query.reshape(target_shape) + checkpoint[path_map['key']] = key.reshape(target_shape) + checkpoint[path_map['value']] = value.reshape(target_shape) + + for path in paths: + new_path = path['new'] + + # These have already been assigned + if attention_paths_to_split is not None and new_path in attention_paths_to_split: + continue + + # Global renaming happens here + new_path = new_path.replace('middle_block.0', 'mid_block.resnets.0') + new_path = new_path.replace('middle_block.1', 'mid_block.attentions.0') + new_path = new_path.replace('middle_block.2', 'mid_block.resnets.1') + + if additional_replacements is not None: + for replacement in additional_replacements: + new_path = new_path.replace(replacement['old'], replacement['new']) + + # proj_attn.weight has to be converted from conv 1D to linear + if "proj_attn.weight" in new_path: + checkpoint[new_path] = old_checkpoint[path['old']][:, :, 0] + else: + checkpoint[new_path] = old_checkpoint[path['old']] + + +def conv_attn_to_linear(checkpoint): + keys = list(checkpoint.keys()) + attn_keys = ["query.weight", "key.weight", "value.weight"] + for key in keys: + if ".".join(key.split(".")[-2:]) in attn_keys: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0, 0] + elif "proj_attn.weight" in key: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0] + + +def create_unet_diffusers_config(original_config): + """ + Creates a config for the diffusers based on the config of the LDM model. + """ + unet_params = original_config.model.params.unet_config.params + + block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + + down_block_types = [] + resolution = 1 + for i in range(len(block_out_channels)): + block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" + down_block_types.append(block_type) + if i != len(block_out_channels) - 1: + resolution *= 2 + + up_block_types = [] + for i in range(len(block_out_channels)): + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + up_block_types.append(block_type) + resolution //= 2 + + config = dict( + sample_size=unet_params.image_size, + in_channels=unet_params.in_channels, + out_channels=unet_params.out_channels, + down_block_types=tuple(down_block_types), + up_block_types=tuple(up_block_types), + block_out_channels=tuple(block_out_channels), + layers_per_block=unet_params.num_res_blocks, + cross_attention_dim=unet_params.context_dim, + attention_head_dim=unet_params.num_heads, + ) + + return config + + +def create_vae_diffusers_config(original_config): + """ + Creates a config for the diffusers based on the config of the LDM model. 
+ """ + vae_params = original_config.model.params.first_stage_config.params.ddconfig + latent_channles = original_config.model.params.first_stage_config.params.embed_dim + + block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) + up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) + + config = dict( + sample_size=vae_params.resolution, + in_channels=vae_params.in_channels, + out_channels=vae_params.out_ch, + down_block_types=tuple(down_block_types), + up_block_types=tuple(up_block_types), + block_out_channels=tuple(block_out_channels), + latent_channels=vae_params.z_channels, + layers_per_block=vae_params.num_res_blocks, + ) + return config + + +def create_diffusers_schedular(original_config): + schedular = DDIMScheduler( + num_train_timesteps=original_config.model.params.timesteps, + beta_start=original_config.model.params.linear_start, + beta_end=original_config.model.params.linear_end, + beta_schedule="scaled_linear", + ) + return schedular + + +def create_ldm_bert_config(original_config): + bert_params = original_config.model.parms.cond_stage_config.params + config = LDMBertConfig( + d_model=bert_params.n_embed, + encoder_layers=bert_params.n_layer, + encoder_ffn_dim=bert_params.n_embed * 4, + ) + return config + + +def convert_ldm_unet_checkpoint(checkpoint, config): + """ + Takes a state dict and a config, and returns a converted checkpoint. + """ + + # extract state_dict for UNet + unet_state_dict = {} + unet_key = "model.diffusion_model." + keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(unet_key): + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) + + new_checkpoint = {} + + new_checkpoint['time_embedding.linear_1.weight'] = unet_state_dict['time_embed.0.weight'] + new_checkpoint['time_embedding.linear_1.bias'] = unet_state_dict['time_embed.0.bias'] + new_checkpoint['time_embedding.linear_2.weight'] = unet_state_dict['time_embed.2.weight'] + new_checkpoint['time_embedding.linear_2.bias'] = unet_state_dict['time_embed.2.bias'] + + new_checkpoint['conv_in.weight'] = unet_state_dict['input_blocks.0.0.weight'] + new_checkpoint['conv_in.bias'] = unet_state_dict['input_blocks.0.0.bias'] + + new_checkpoint['conv_norm_out.weight'] = unet_state_dict['out.0.weight'] + new_checkpoint['conv_norm_out.bias'] = unet_state_dict['out.0.bias'] + new_checkpoint['conv_out.weight'] = unet_state_dict['out.2.weight'] + new_checkpoint['conv_out.bias'] = unet_state_dict['out.2.bias'] + + # Retrieves the keys for the input blocks only + num_input_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'input_blocks' in layer}) + input_blocks = {layer_id: [key for key in unet_state_dict if f'input_blocks.{layer_id}' in key] for layer_id in range(num_input_blocks)} + + # Retrieves the keys for the middle blocks only + num_middle_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'middle_block' in layer}) + middle_blocks = {layer_id: [key for key in unet_state_dict if f'middle_block.{layer_id}' in key] for layer_id in range(num_middle_blocks)} + + # Retrieves the keys for the output blocks only + num_output_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'output_blocks' in layer}) + output_blocks = {layer_id: [key for key in unet_state_dict if f'output_blocks.{layer_id}' in key] for layer_id in range(num_output_blocks)} + + for i in range(1, num_input_blocks): + block_id = (i - 1) // 
(config['layers_per_block'] + 1) + layer_in_block_id = (i - 1) % (config['layers_per_block'] + 1) + + resnets = [key for key in input_blocks[i] if f'input_blocks.{i}.0' in key and f'input_blocks.{i}.0.op' not in key] + attentions = [key for key in input_blocks[i] if f'input_blocks.{i}.1' in key] + + if f'input_blocks.{i}.0.op.weight' in unet_state_dict: + new_checkpoint[f'down_blocks.{block_id}.downsamplers.0.conv.weight'] = unet_state_dict.pop(f'input_blocks.{i}.0.op.weight') + new_checkpoint[f'down_blocks.{block_id}.downsamplers.0.conv.bias'] = unet_state_dict.pop(f'input_blocks.{i}.0.op.bias') + + paths = renew_resnet_paths(resnets) + meta_path = {'old': f'input_blocks.{i}.0', 'new': f'down_blocks.{block_id}.resnets.{layer_in_block_id}'} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + + if len(attentions): + paths = renew_attention_paths(attentions) + meta_path = {'old': f'input_blocks.{i}.1', 'new': f'down_blocks.{block_id}.attentions.{layer_in_block_id}'} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + + + resnet_0 = middle_blocks[0] + attentions = middle_blocks[1] + resnet_1 = middle_blocks[2] + + resnet_0_paths = renew_resnet_paths(resnet_0) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + + resnet_1_paths = renew_resnet_paths(resnet_1) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + + attentions_paths = renew_attention_paths(attentions) + meta_path = {'old': 'middle_block.1', 'new': 'mid_block.attentions.0'} + assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + + for i in range(num_output_blocks): + block_id = i // (config['layers_per_block'] + 1) + layer_in_block_id = i % (config['layers_per_block'] + 1) + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] + output_block_list = {} + + for layer in output_block_layers: + layer_id, layer_name = layer.split('.')[0], shave_segments(layer, 1) + if layer_id in output_block_list: + output_block_list[layer_id].append(layer_name) + else: + output_block_list[layer_id] = [layer_name] + + if len(output_block_list) > 1: + resnets = [key for key in output_blocks[i] if f'output_blocks.{i}.0' in key] + attentions = [key for key in output_blocks[i] if f'output_blocks.{i}.1' in key] + + resnet_0_paths = renew_resnet_paths(resnets) + paths = renew_resnet_paths(resnets) + + meta_path = {'old': f'output_blocks.{i}.0', 'new': f'up_blocks.{block_id}.resnets.{layer_in_block_id}'} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + + if ['conv.weight', 'conv.bias'] in output_block_list.values(): + index = list(output_block_list.values()).index(['conv.weight', 'conv.bias']) + new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.weight'] = unet_state_dict[f'output_blocks.{i}.{index}.conv.weight'] + new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.bias'] = unet_state_dict[f'output_blocks.{i}.{index}.conv.bias'] + + # Clear attentions as they have been attributed above. 
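            # (If sub-module 1 of this output block is the upsampler conv, the filter above only
            # matched its two conv.weight / conv.bias keys, which were already copied as the
            # upsampler, so there is no attention module left to convert here.)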
+ if len(attentions) == 2: + attentions = [] + + if len(attentions): + paths = renew_attention_paths(attentions) + meta_path = { + 'old': f'output_blocks.{i}.1', + 'new': f'up_blocks.{block_id}.attentions.{layer_in_block_id}' + } + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + else: + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) + for path in resnet_0_paths: + old_path = '.'.join(['output_blocks', str(i), path['old']]) + new_path = '.'.join(['up_blocks', str(block_id), 'resnets', str(layer_in_block_id), path['new']]) + + new_checkpoint[new_path] = unet_state_dict[old_path] + + return new_checkpoint + + +def convert_ldm_vae_checkpoint(checkpoint, config): + # extract state dict for VAE + vae_state_dict = {} + vae_key = "first_stage_model." + keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(vae_key): + vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) + + new_checkpoint = {} + + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] + + new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] + new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] + + + # Retrieves the keys for the encoder down blocks only + num_down_blocks = len({'.'.join(layer.split('.')[:3]) for layer in vae_state_dict if 'encoder.down' in layer}) + down_blocks = {layer_id: [key for key in vae_state_dict if f'down.{layer_id}' in key] for layer_id in range(num_down_blocks)} + + # Retrieves the keys for the decoder up blocks only + num_up_blocks = len({'.'.join(layer.split('.')[:3]) for layer in vae_state_dict if 'decoder.up' in layer}) + up_blocks = {layer_id: [key for key in vae_state_dict if f'up.{layer_id}' in key] for layer_id in range(num_up_blocks)} + + + for i in range(num_down_blocks): + resnets = [key for key in down_blocks[i] if f'down.{i}' in key and f"down.{i}.downsample" not in key] + + if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.weight") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.bias") + + paths = renew_vae_resnet_paths(resnets) + meta_path = {'old': f'down.{i}.block', 
'new': f'down_blocks.{i}.resnets'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i - 1}'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + + for i in range(num_up_blocks): + block_id = num_up_blocks - 1 - i + resnets = [key for key in up_blocks[block_id] if f'up.{block_id}' in key and f"up.{block_id}.upsample" not in key] + + if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.weight"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.bias"] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {'old': f'up.{block_id}.block', 'new': f'up_blocks.{i}.resnets'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i - 1}'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + return new_checkpoint + + +def convert_ldm_bert_checkpoint(checkpoint, config): + def _copy_attn_layer(hf_attn_layer, pt_attn_layer): + + hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight + hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight + hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight + + hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight + hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias + + + def _copy_linear(hf_linear, pt_linear): + hf_linear.weight = pt_linear.weight + hf_linear.bias = pt_linear.bias + + + def _copy_layer(hf_layer, pt_layer): + # copy layer norms + _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0]) + _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0]) + + # copy attn + _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1]) + + # copy MLP + pt_mlp = pt_layer[1][1] + _copy_linear(hf_layer.fc1, pt_mlp.net[0][0]) + _copy_linear(hf_layer.fc2, pt_mlp.net[2]) + + + def _copy_layers(hf_layers, pt_layers): + for i, hf_layer 
in enumerate(hf_layers): + if i != 0: i += i + pt_layer = pt_layers[i:i+2] + _copy_layer(hf_layer, pt_layer) + + hf_model = LDMBertModel(config).eval() + + # copy embeds + hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight + hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight + + # copy layer norm + _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm) + + # copy hidden layers + _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers) + + _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits) + + return hf_model + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." + ) + # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml + parser.add_argument( + "--original_config_file", + default=None, + type=str, + required=True, + help="The YAML config file corresponding to the original architecture.", + ) + parser.add_argument( + "--scheduler_type", default="pndm", type=str, required=True, help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']" + ) + parser.add_argument( + "--dump_path", default=None, type=str, required=True, help="Path to the output model." + ) + + args = parser.parse_args() + + original_config = OmegaConf.load(args.original_config_file) + + checkpoint = torch.load(args.checkpoint_path)["state_dict"] + + if args.scheduler_type == "pndm": + elif args.scheduler_type == "pndm": + elif args.scheduler_type == "pndm": + else: + raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") + + # Convert the UNet2DConditionModel model. + unet_config = create_unet_diffusers_config(original_config) + converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config) + + unet = UNet2DConditionModel(**unet_config) + unet.load_state_dict(converted_unet_checkpoint) + + # Convert the VAE model. + vae_config = create_vae_diffusers_config(original_config) + converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) + + vae = AutoencoderKL(**vae_config) + vae.load_state_dict(converted_vae_checkpoint) + + # Convert the text model. + text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] + if text_model_type == "FrozenCLIPEmbedder": + text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") + tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + else: + # TODO: update the convert function to use the state_dict without the model instance. + text_config = create_ldm_bert_config(original_config) + text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) + tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") + + scheduler = create_diffusers_schedular(original_config) + pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) + pipe.save_pretrained(args.dump_path) + diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py index 04d2343a3d3a..0965f6522f6a 100644 --- a/scripts/convert_original_stable_diffusion_to_diffusers.py +++ b/scripts/convert_original_stable_diffusion_to_diffusers.py @@ -22,8 +22,8 @@ except ImportError: raise ImportError("OmegaConf is required to convert the LDM checkpoints. 
Please install it with `pip install OmegaConf`.") -from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel -from diffusers import LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler, KLM +from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel +from diffusers import LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig @@ -550,7 +550,7 @@ def _copy_layers(hf_layers, pt_layers): parser.add_argument( "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." ) - + # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml parser.add_argument( "--original_config_file", default=None, @@ -558,7 +558,9 @@ def _copy_layers(hf_layers, pt_layers): required=True, help="The YAML config file corresponding to the original architecture.", ) - + parser.add_argument( + "--scheduler_type", default="pndm", type=str, required=True, help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']" + ) parser.add_argument( "--dump_path", default=None, type=str, required=True, help="Path to the output model." ) @@ -569,6 +571,12 @@ def _copy_layers(hf_layers, pt_layers): checkpoint = torch.load(args.checkpoint_path)["state_dict"] + if args.scheduler_type == "pndm": + elif args.scheduler_type == "pndm": + elif args.scheduler_type == "pndm": + else: + raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") + # Convert the UNet2DConditionModel model. unet_config = create_unet_diffusers_config(original_config) converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config) From 915aa24d2149959aa05a0b568b25c5ff685e4a52 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 15 Sep 2022 21:54:56 +0000 Subject: [PATCH 18/20] finish --- ..._original_stable_diffusion_to_diffusers.py | 363 +++++++++++------- 1 file changed, 222 insertions(+), 141 deletions(-) diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py index 0965f6522f6a..cc417188f88b 100644 --- a/scripts/convert_original_stable_diffusion_to_diffusers.py +++ b/scripts/convert_original_stable_diffusion_to_diffusers.py @@ -16,14 +16,26 @@ import argparse import torch +import os try: from omegaconf import OmegaConf except ImportError: - raise ImportError("OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`.") + raise ImportError( + "OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`." 
+ ) -from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel -from diffusers import LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline +from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel, AutoFeatureExtractor +from diffusers import ( + LDMTextToImagePipeline, + AutoencoderKL, + UNet2DConditionModel, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPipeline, +) +from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig @@ -32,9 +44,9 @@ def shave_segments(path, n_shave_prefix_segments=1): Removes segments. Positive values shave the first segments, negative shave the last segments. """ if n_shave_prefix_segments >= 0: - return '.'.join(path.split('.')[n_shave_prefix_segments:]) + return ".".join(path.split(".")[n_shave_prefix_segments:]) else: - return '.'.join(path.split('.')[:n_shave_prefix_segments]) + return ".".join(path.split(".")[:n_shave_prefix_segments]) def renew_resnet_paths(old_list, n_shave_prefix_segments=0): @@ -43,18 +55,18 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): """ mapping = [] for old_item in old_list: - new_item = old_item.replace('in_layers.0', 'norm1') - new_item = new_item.replace('in_layers.2', 'conv1') + new_item = old_item.replace("in_layers.0", "norm1") + new_item = new_item.replace("in_layers.2", "conv1") - new_item = new_item.replace('out_layers.0', 'norm2') - new_item = new_item.replace('out_layers.3', 'conv2') + new_item = new_item.replace("out_layers.0", "norm2") + new_item = new_item.replace("out_layers.3", "conv2") - new_item = new_item.replace('emb_layers.1', 'time_emb_proj') - new_item = new_item.replace('skip_connection', 'conv_shortcut') + new_item = new_item.replace("emb_layers.1", "time_emb_proj") + new_item = new_item.replace("skip_connection", "conv_shortcut") new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - mapping.append({'old': old_item, 'new': new_item}) + mapping.append({"old": old_item, "new": new_item}) return mapping @@ -67,11 +79,10 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): for old_item in old_list: new_item = old_item - new_item = new_item.replace('nin_shortcut', 'conv_shortcut') - + new_item = new_item.replace("nin_shortcut", "conv_shortcut") new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - mapping.append({'old': old_item, 'new': new_item}) + mapping.append({"old": old_item, "new": new_item}) return mapping @@ -84,15 +95,15 @@ def renew_attention_paths(old_list, n_shave_prefix_segments=0): for old_item in old_list: new_item = old_item -# new_item = new_item.replace('norm.weight', 'group_norm.weight') -# new_item = new_item.replace('norm.bias', 'group_norm.bias') + # new_item = new_item.replace('norm.weight', 'group_norm.weight') + # new_item = new_item.replace('norm.bias', 'group_norm.bias') -# new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') -# new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') + # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') -# new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - 
mapping.append({'old': old_item, 'new': new_item}) + mapping.append({"old": old_item, "new": new_item}) return mapping @@ -105,29 +116,31 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): for old_item in old_list: new_item = old_item - new_item = new_item.replace('norm.weight', 'group_norm.weight') - new_item = new_item.replace('norm.bias', 'group_norm.bias') + new_item = new_item.replace("norm.weight", "group_norm.weight") + new_item = new_item.replace("norm.bias", "group_norm.bias") - new_item = new_item.replace('q.weight', 'query.weight') - new_item = new_item.replace('q.bias', 'query.bias') + new_item = new_item.replace("q.weight", "query.weight") + new_item = new_item.replace("q.bias", "query.bias") - new_item = new_item.replace('k.weight', 'key.weight') - new_item = new_item.replace('k.bias', 'key.bias') + new_item = new_item.replace("k.weight", "key.weight") + new_item = new_item.replace("k.bias", "key.bias") - new_item = new_item.replace('v.weight', 'value.weight') - new_item = new_item.replace('v.bias', 'value.bias') + new_item = new_item.replace("v.weight", "value.weight") + new_item = new_item.replace("v.bias", "value.bias") - new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') - new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + new_item = new_item.replace("proj_out.weight", "proj_attn.weight") + new_item = new_item.replace("proj_out.bias", "proj_attn.bias") new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - mapping.append({'old': old_item, 'new': new_item}) + mapping.append({"old": old_item, "new": new_item}) return mapping -def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None): +def assign_to_checkpoint( + paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None +): """ This does the final conversion step: take locally converted weights and apply a global renaming to them. 
It splits attention layers, and takes into account additional replacements @@ -150,31 +163,31 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) query, key, value = old_tensor.split(channels // num_heads, dim=1) - checkpoint[path_map['query']] = query.reshape(target_shape) - checkpoint[path_map['key']] = key.reshape(target_shape) - checkpoint[path_map['value']] = value.reshape(target_shape) + checkpoint[path_map["query"]] = query.reshape(target_shape) + checkpoint[path_map["key"]] = key.reshape(target_shape) + checkpoint[path_map["value"]] = value.reshape(target_shape) for path in paths: - new_path = path['new'] + new_path = path["new"] # These have already been assigned if attention_paths_to_split is not None and new_path in attention_paths_to_split: continue # Global renaming happens here - new_path = new_path.replace('middle_block.0', 'mid_block.resnets.0') - new_path = new_path.replace('middle_block.1', 'mid_block.attentions.0') - new_path = new_path.replace('middle_block.2', 'mid_block.resnets.1') - + new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") + new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") + new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") + if additional_replacements is not None: for replacement in additional_replacements: - new_path = new_path.replace(replacement['old'], replacement['new']) + new_path = new_path.replace(replacement["old"], replacement["new"]) # proj_attn.weight has to be converted from conv 1D to linear if "proj_attn.weight" in new_path: - checkpoint[new_path] = old_checkpoint[path['old']][:, :, 0] + checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] else: - checkpoint[new_path] = old_checkpoint[path['old']] + checkpoint[new_path] = old_checkpoint[path["old"]] def conv_attn_to_linear(checkpoint): @@ -207,7 +220,7 @@ def create_unet_diffusers_config(original_config): up_block_types = [] for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 @@ -231,7 +244,7 @@ def create_vae_diffusers_config(original_config): Creates a config for the diffusers based on the config of the LDM model. 
""" vae_params = original_config.model.params.first_stage_config.params.ddconfig - latent_channles = original_config.model.params.first_stage_config.params.embed_dim + _ = original_config.model.params.first_stage_config.params.embed_dim block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) @@ -258,7 +271,7 @@ def create_diffusers_schedular(original_config): beta_schedule="scaled_linear", ) return schedular - + def create_ldm_bert_config(original_config): bert_params = original_config.model.parms.cond_stage_config.params @@ -268,7 +281,7 @@ def create_ldm_bert_config(original_config): encoder_ffn_dim=bert_params.n_embed * 4, ) return config - + def convert_ldm_unet_checkpoint(checkpoint, config): """ @@ -285,51 +298,69 @@ def convert_ldm_unet_checkpoint(checkpoint, config): new_checkpoint = {} - new_checkpoint['time_embedding.linear_1.weight'] = unet_state_dict['time_embed.0.weight'] - new_checkpoint['time_embedding.linear_1.bias'] = unet_state_dict['time_embed.0.bias'] - new_checkpoint['time_embedding.linear_2.weight'] = unet_state_dict['time_embed.2.weight'] - new_checkpoint['time_embedding.linear_2.bias'] = unet_state_dict['time_embed.2.bias'] + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] - new_checkpoint['conv_in.weight'] = unet_state_dict['input_blocks.0.0.weight'] - new_checkpoint['conv_in.bias'] = unet_state_dict['input_blocks.0.0.bias'] + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] + new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] - new_checkpoint['conv_norm_out.weight'] = unet_state_dict['out.0.weight'] - new_checkpoint['conv_norm_out.bias'] = unet_state_dict['out.0.bias'] - new_checkpoint['conv_out.weight'] = unet_state_dict['out.2.weight'] - new_checkpoint['conv_out.bias'] = unet_state_dict['out.2.bias'] + new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] + new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] + new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] + new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] # Retrieves the keys for the input blocks only - num_input_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'input_blocks' in layer}) - input_blocks = {layer_id: [key for key in unet_state_dict if f'input_blocks.{layer_id}' in key] for layer_id in range(num_input_blocks)} + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) + input_blocks = { + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + for layer_id in range(num_input_blocks) + } # Retrieves the keys for the middle blocks only - num_middle_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'middle_block' in layer}) - middle_blocks = {layer_id: [key for key in unet_state_dict if f'middle_block.{layer_id}' in key] for layer_id in range(num_middle_blocks)} + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) + middle_blocks = { + layer_id: [key for key in unet_state_dict if 
f"middle_block.{layer_id}" in key] + for layer_id in range(num_middle_blocks) + } # Retrieves the keys for the output blocks only - num_output_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'output_blocks' in layer}) - output_blocks = {layer_id: [key for key in unet_state_dict if f'output_blocks.{layer_id}' in key] for layer_id in range(num_output_blocks)} + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) + output_blocks = { + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + for layer_id in range(num_output_blocks) + } for i in range(1, num_input_blocks): - block_id = (i - 1) // (config['layers_per_block'] + 1) - layer_in_block_id = (i - 1) % (config['layers_per_block'] + 1) - - resnets = [key for key in input_blocks[i] if f'input_blocks.{i}.0' in key and f'input_blocks.{i}.0.op' not in key] - attentions = [key for key in input_blocks[i] if f'input_blocks.{i}.1' in key] - - if f'input_blocks.{i}.0.op.weight' in unet_state_dict: - new_checkpoint[f'down_blocks.{block_id}.downsamplers.0.conv.weight'] = unet_state_dict.pop(f'input_blocks.{i}.0.op.weight') - new_checkpoint[f'down_blocks.{block_id}.downsamplers.0.conv.bias'] = unet_state_dict.pop(f'input_blocks.{i}.0.op.bias') + block_id = (i - 1) // (config["layers_per_block"] + 1) + layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) + + resnets = [ + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key + ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] + + if f"input_blocks.{i}.0.op.weight" in unet_state_dict: + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) paths = renew_resnet_paths(resnets) - meta_path = {'old': f'input_blocks.{i}.0', 'new': f'down_blocks.{block_id}.resnets.{layer_in_block_id}'} - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) if len(attentions): paths = renew_attention_paths(attentions) - meta_path = {'old': f'input_blocks.{i}.1', 'new': f'down_blocks.{block_id}.attentions.{layer_in_block_id}'} - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) - + meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) resnet_0 = middle_blocks[0] attentions = middle_blocks[1] @@ -342,36 +373,44 @@ def convert_ldm_unet_checkpoint(checkpoint, config): assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) attentions_paths = renew_attention_paths(attentions) - meta_path = {'old': 'middle_block.1', 'new': 'mid_block.attentions.0'} - assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} + assign_to_checkpoint( + 
attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) for i in range(num_output_blocks): - block_id = i // (config['layers_per_block'] + 1) - layer_in_block_id = i % (config['layers_per_block'] + 1) + block_id = i // (config["layers_per_block"] + 1) + layer_in_block_id = i % (config["layers_per_block"] + 1) output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] output_block_list = {} for layer in output_block_layers: - layer_id, layer_name = layer.split('.')[0], shave_segments(layer, 1) + layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) if layer_id in output_block_list: output_block_list[layer_id].append(layer_name) else: output_block_list[layer_id] = [layer_name] if len(output_block_list) > 1: - resnets = [key for key in output_blocks[i] if f'output_blocks.{i}.0' in key] - attentions = [key for key in output_blocks[i] if f'output_blocks.{i}.1' in key] + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] resnet_0_paths = renew_resnet_paths(resnets) paths = renew_resnet_paths(resnets) - meta_path = {'old': f'output_blocks.{i}.0', 'new': f'up_blocks.{block_id}.resnets.{layer_in_block_id}'} - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) - if ['conv.weight', 'conv.bias'] in output_block_list.values(): - index = list(output_block_list.values()).index(['conv.weight', 'conv.bias']) - new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.weight'] = unet_state_dict[f'output_blocks.{i}.{index}.conv.weight'] - new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.bias'] = unet_state_dict[f'output_blocks.{i}.{index}.conv.bias'] + if ["conv.weight", "conv.bias"] in output_block_list.values(): + index = list(output_block_list.values()).index(["conv.weight", "conv.bias"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] # Clear attentions as they have been attributed above. 
if len(attentions) == 2: @@ -380,15 +419,17 @@ def convert_ldm_unet_checkpoint(checkpoint, config): if len(attentions): paths = renew_attention_paths(attentions) meta_path = { - 'old': f'output_blocks.{i}.1', - 'new': f'up_blocks.{block_id}.attentions.{layer_in_block_id}' + "old": f"output_blocks.{i}.1", + "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", } - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) else: resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) for path in resnet_0_paths: - old_path = '.'.join(['output_blocks', str(i), path['old']]) - new_path = '.'.join(['up_blocks', str(block_id), 'resnets', str(layer_in_block_id), path['new']]) + old_path = ".".join(["output_blocks", str(i), path["old"]]) + new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) new_checkpoint[new_path] = unet_state_dict[old_path] @@ -403,7 +444,7 @@ def convert_ldm_vae_checkpoint(checkpoint, config): for key in keys: if key.startswith(vae_key): vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) - + new_checkpoint = {} new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] @@ -425,66 +466,78 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] - # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({'.'.join(layer.split('.')[:3]) for layer in vae_state_dict if 'encoder.down' in layer}) - down_blocks = {layer_id: [key for key in vae_state_dict if f'down.{layer_id}' in key] for layer_id in range(num_down_blocks)} + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) + down_blocks = { + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) + } # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({'.'.join(layer.split('.')[:3]) for layer in vae_state_dict if 'decoder.up' in layer}) - up_blocks = {layer_id: [key for key in vae_state_dict if f'up.{layer_id}' in key] for layer_id in range(num_up_blocks)} + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) + up_blocks = { + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) + } - for i in range(num_down_blocks): - resnets = [key for key in down_blocks[i] if f'down.{i}' in key and f"down.{i}.downsample" not in key] - + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] + if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.bias") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" 
+ ) paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'down.{i}.block', 'new': f'down_blocks.{i}.resnets'} + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - + mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i - 1}'} + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) - meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} + meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): block_id = num_up_blocks - 1 - i - resnets = [key for key in up_blocks[block_id] if f'up.{block_id}' in key and f"up.{block_id}.upsample" not in key] + resnets = [ + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + ] if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.bias"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'up.{block_id}.block', 'new': f'up_blocks.{i}.resnets'} + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - + mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i - 1}'} + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) - meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} + meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) conv_attn_to_linear(new_checkpoint) return new_checkpoint @@ -500,32 +553,30 @@ def _copy_attn_layer(hf_attn_layer, pt_attn_layer): hf_attn_layer.out_proj.weight = 
pt_attn_layer.to_out.weight hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias - def _copy_linear(hf_linear, pt_linear): hf_linear.weight = pt_linear.weight hf_linear.bias = pt_linear.bias - def _copy_layer(hf_layer, pt_layer): # copy layer norms _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0]) _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0]) - + # copy attn _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1]) - + # copy MLP pt_mlp = pt_layer[1][1] _copy_linear(hf_layer.fc1, pt_mlp.net[0][0]) _copy_linear(hf_layer.fc2, pt_mlp.net[2]) - def _copy_layers(hf_layers, pt_layers): for i, hf_layer in enumerate(hf_layers): - if i != 0: i += i - pt_layer = pt_layers[i:i+2] + if i != 0: + i += i + pt_layer = pt_layers[i : i + 2] _copy_layer(hf_layer, pt_layer) - + hf_model = LDMBertModel(config).eval() # copy embeds @@ -537,13 +588,12 @@ def _copy_layers(hf_layers, pt_layers): # copy hidden layers _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers) - + _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits) return hf_model - if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -555,32 +605,55 @@ def _copy_layers(hf_layers, pt_layers): "--original_config_file", default=None, type=str, - required=True, help="The YAML config file corresponding to the original architecture.", ) parser.add_argument( - "--scheduler_type", default="pndm", type=str, required=True, help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']" - ) - parser.add_argument( - "--dump_path", default=None, type=str, required=True, help="Path to the output model." + "--scheduler_type", + default="pndm", + type=str, + help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']", ) + parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") args = parser.parse_args() - original_config = OmegaConf.load(args.original_config_file) + if args.original_config_file is None: + os.system( + "wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml" + ) + args.original_config_file = "./v1-inference.yaml" + original_config = OmegaConf.load(args.original_config_file) checkpoint = torch.load(args.checkpoint_path)["state_dict"] + num_train_timesteps = original_config.model.params.timesteps + beta_start = original_config.model.params.linear_start + beta_end = original_config.model.params.linear_end if args.scheduler_type == "pndm": - elif args.scheduler_type == "pndm": - elif args.scheduler_type == "pndm": + scheduler = PNDMScheduler( + beta_end=beta_end, + beta_schedule="scaled_linear", + beta_start=beta_start, + num_train_timesteps=num_train_timesteps, + skip_prk_steps=True, + ) + elif args.scheduler_type == "lms": + scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") + elif args.scheduler_type == "ddim": + scheduler = DDIMScheduler( + beta_start=beta_start, + beta_end=beta_end, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) else: raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") # Convert the UNet2DConditionModel model. 
unet_config = create_unet_diffusers_config(original_config) converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config) - + unet = UNet2DConditionModel(**unet_config) unet.load_state_dict(converted_unet_checkpoint) @@ -596,13 +669,21 @@ def _copy_layers(hf_layers, pt_layers): if text_model_type == "FrozenCLIPEmbedder": text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker") + pipe = StableDiffusionPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) else: - # TODO: update the convert function to use the state_dict without the model instance. text_config = create_ldm_bert_config(original_config) text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") + pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) - scheduler = create_diffusers_schedular(original_config) - pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) pipe.save_pretrained(args.dump_path) - From 6c80f98ce18f37aa423ea229915b7a11839a0407 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 15 Sep 2022 21:56:52 +0000 Subject: [PATCH 19/20] up --- ...onvert_original_stable_diffusion_to_diffusers.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py index cc417188f88b..ee7fc335438f 100644 --- a/scripts/convert_original_stable_diffusion_to_diffusers.py +++ b/scripts/convert_original_stable_diffusion_to_diffusers.py @@ -15,9 +15,11 @@ """ Conversion script for the LDM checkpoints. """ import argparse -import torch import os +import torch + + try: from omegaconf import OmegaConf except ImportError: @@ -25,18 +27,18 @@ "OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`." 
) -from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel, AutoFeatureExtractor from diffusers import ( - LDMTextToImagePipeline, AutoencoderKL, - UNet2DConditionModel, DDIMScheduler, + LDMTextToImagePipeline, LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline, + UNet2DConditionModel, ) +from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker -from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig +from transformers import AutoFeatureExtractor, BertTokenizerFast, CLIPTextModel, CLIPTokenizer def shave_segments(path, n_shave_prefix_segments=1): @@ -545,7 +547,6 @@ def convert_ldm_vae_checkpoint(checkpoint, config): def convert_ldm_bert_checkpoint(checkpoint, config): def _copy_attn_layer(hf_attn_layer, pt_attn_layer): - hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight From 81e0393212e14877c069cc4f961211d5ab14dae6 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 15 Sep 2022 22:06:56 +0000 Subject: [PATCH 20/20] up --- _ | 608 -------------------------------------------------------------- 1 file changed, 608 deletions(-) delete mode 100644 _ diff --git a/_ b/_ deleted file mode 100644 index 0965f6522f6a..000000000000 --- a/_ +++ /dev/null @@ -1,608 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Conversion script for the LDM checkpoints. """ - -import argparse -import torch - -try: - from omegaconf import OmegaConf -except ImportError: - raise ImportError("OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`.") - -from transformers import BertTokenizerFast, CLIPTokenizer, CLIPTextModel -from diffusers import LDMTextToImagePipeline, AutoencoderKL, UNet2DConditionModel, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline -from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel, LDMBertConfig - - -def shave_segments(path, n_shave_prefix_segments=1): - """ - Removes segments. Positive values shave the first segments, negative shave the last segments. 
- """ - if n_shave_prefix_segments >= 0: - return '.'.join(path.split('.')[n_shave_prefix_segments:]) - else: - return '.'.join(path.split('.')[:n_shave_prefix_segments]) - - -def renew_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item.replace('in_layers.0', 'norm1') - new_item = new_item.replace('in_layers.2', 'conv1') - - new_item = new_item.replace('out_layers.0', 'norm2') - new_item = new_item.replace('out_layers.3', 'conv2') - - new_item = new_item.replace('emb_layers.1', 'time_emb_proj') - new_item = new_item.replace('skip_connection', 'conv_shortcut') - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({'old': old_item, 'new': new_item}) - - return mapping - - -def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace('nin_shortcut', 'conv_shortcut') - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({'old': old_item, 'new': new_item}) - - return mapping - - -def renew_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - -# new_item = new_item.replace('norm.weight', 'group_norm.weight') -# new_item = new_item.replace('norm.bias', 'group_norm.bias') - -# new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') -# new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') - -# new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({'old': old_item, 'new': new_item}) - - return mapping - - -def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace('norm.weight', 'group_norm.weight') - new_item = new_item.replace('norm.bias', 'group_norm.bias') - - new_item = new_item.replace('q.weight', 'query.weight') - new_item = new_item.replace('q.bias', 'query.bias') - - new_item = new_item.replace('k.weight', 'key.weight') - new_item = new_item.replace('k.bias', 'key.bias') - - new_item = new_item.replace('v.weight', 'value.weight') - new_item = new_item.replace('v.bias', 'value.bias') - - new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') - new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({'old': old_item, 'new': new_item}) - - return mapping - - -def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None): - """ - This does the final conversion step: take locally converted weights and apply a global renaming - to them. It splits attention layers, and takes into account additional replacements - that may arise. - - Assigns the weights to the new checkpoint. - """ - assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." - - # Splits the attention layers into three variables. 
- if attention_paths_to_split is not None: - for path, path_map in attention_paths_to_split.items(): - old_tensor = old_checkpoint[path] - channels = old_tensor.shape[0] // 3 - - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) - - num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - - old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) - query, key, value = old_tensor.split(channels // num_heads, dim=1) - - checkpoint[path_map['query']] = query.reshape(target_shape) - checkpoint[path_map['key']] = key.reshape(target_shape) - checkpoint[path_map['value']] = value.reshape(target_shape) - - for path in paths: - new_path = path['new'] - - # These have already been assigned - if attention_paths_to_split is not None and new_path in attention_paths_to_split: - continue - - # Global renaming happens here - new_path = new_path.replace('middle_block.0', 'mid_block.resnets.0') - new_path = new_path.replace('middle_block.1', 'mid_block.attentions.0') - new_path = new_path.replace('middle_block.2', 'mid_block.resnets.1') - - if additional_replacements is not None: - for replacement in additional_replacements: - new_path = new_path.replace(replacement['old'], replacement['new']) - - # proj_attn.weight has to be converted from conv 1D to linear - if "proj_attn.weight" in new_path: - checkpoint[new_path] = old_checkpoint[path['old']][:, :, 0] - else: - checkpoint[new_path] = old_checkpoint[path['old']] - - -def conv_attn_to_linear(checkpoint): - keys = list(checkpoint.keys()) - attn_keys = ["query.weight", "key.weight", "value.weight"] - for key in keys: - if ".".join(key.split(".")[-2:]) in attn_keys: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0, 0] - elif "proj_attn.weight" in key: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0] - - -def create_unet_diffusers_config(original_config): - """ - Creates a config for the diffusers based on the config of the LDM model. - """ - unet_params = original_config.model.params.unet_config.params - - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] - - down_block_types = [] - resolution = 1 - for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" - down_block_types.append(block_type) - if i != len(block_out_channels) - 1: - resolution *= 2 - - up_block_types = [] - for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" - up_block_types.append(block_type) - resolution //= 2 - - config = dict( - sample_size=unet_params.image_size, - in_channels=unet_params.in_channels, - out_channels=unet_params.out_channels, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - layers_per_block=unet_params.num_res_blocks, - cross_attention_dim=unet_params.context_dim, - attention_head_dim=unet_params.num_heads, - ) - - return config - - -def create_vae_diffusers_config(original_config): - """ - Creates a config for the diffusers based on the config of the LDM model. 
- """ - vae_params = original_config.model.params.first_stage_config.params.ddconfig - latent_channles = original_config.model.params.first_stage_config.params.embed_dim - - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] - down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) - up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) - - config = dict( - sample_size=vae_params.resolution, - in_channels=vae_params.in_channels, - out_channels=vae_params.out_ch, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, - ) - return config - - -def create_diffusers_schedular(original_config): - schedular = DDIMScheduler( - num_train_timesteps=original_config.model.params.timesteps, - beta_start=original_config.model.params.linear_start, - beta_end=original_config.model.params.linear_end, - beta_schedule="scaled_linear", - ) - return schedular - - -def create_ldm_bert_config(original_config): - bert_params = original_config.model.parms.cond_stage_config.params - config = LDMBertConfig( - d_model=bert_params.n_embed, - encoder_layers=bert_params.n_layer, - encoder_ffn_dim=bert_params.n_embed * 4, - ) - return config - - -def convert_ldm_unet_checkpoint(checkpoint, config): - """ - Takes a state dict and a config, and returns a converted checkpoint. - """ - - # extract state_dict for UNet - unet_state_dict = {} - unet_key = "model.diffusion_model." - keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(unet_key): - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) - - new_checkpoint = {} - - new_checkpoint['time_embedding.linear_1.weight'] = unet_state_dict['time_embed.0.weight'] - new_checkpoint['time_embedding.linear_1.bias'] = unet_state_dict['time_embed.0.bias'] - new_checkpoint['time_embedding.linear_2.weight'] = unet_state_dict['time_embed.2.weight'] - new_checkpoint['time_embedding.linear_2.bias'] = unet_state_dict['time_embed.2.bias'] - - new_checkpoint['conv_in.weight'] = unet_state_dict['input_blocks.0.0.weight'] - new_checkpoint['conv_in.bias'] = unet_state_dict['input_blocks.0.0.bias'] - - new_checkpoint['conv_norm_out.weight'] = unet_state_dict['out.0.weight'] - new_checkpoint['conv_norm_out.bias'] = unet_state_dict['out.0.bias'] - new_checkpoint['conv_out.weight'] = unet_state_dict['out.2.weight'] - new_checkpoint['conv_out.bias'] = unet_state_dict['out.2.bias'] - - # Retrieves the keys for the input blocks only - num_input_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'input_blocks' in layer}) - input_blocks = {layer_id: [key for key in unet_state_dict if f'input_blocks.{layer_id}' in key] for layer_id in range(num_input_blocks)} - - # Retrieves the keys for the middle blocks only - num_middle_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'middle_block' in layer}) - middle_blocks = {layer_id: [key for key in unet_state_dict if f'middle_block.{layer_id}' in key] for layer_id in range(num_middle_blocks)} - - # Retrieves the keys for the output blocks only - num_output_blocks = len({'.'.join(layer.split('.')[:2]) for layer in unet_state_dict if 'output_blocks' in layer}) - output_blocks = {layer_id: [key for key in unet_state_dict if f'output_blocks.{layer_id}' in key] for layer_id in range(num_output_blocks)} - - for i in range(1, num_input_blocks): - block_id = (i - 1) // 
(config['layers_per_block'] + 1) - layer_in_block_id = (i - 1) % (config['layers_per_block'] + 1) - - resnets = [key for key in input_blocks[i] if f'input_blocks.{i}.0' in key and f'input_blocks.{i}.0.op' not in key] - attentions = [key for key in input_blocks[i] if f'input_blocks.{i}.1' in key] - - if f'input_blocks.{i}.0.op.weight' in unet_state_dict: - new_checkpoint[f'down_blocks.{block_id}.downsamplers.0.conv.weight'] = unet_state_dict.pop(f'input_blocks.{i}.0.op.weight') - new_checkpoint[f'down_blocks.{block_id}.downsamplers.0.conv.bias'] = unet_state_dict.pop(f'input_blocks.{i}.0.op.bias') - - paths = renew_resnet_paths(resnets) - meta_path = {'old': f'input_blocks.{i}.0', 'new': f'down_blocks.{block_id}.resnets.{layer_in_block_id}'} - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = {'old': f'input_blocks.{i}.1', 'new': f'down_blocks.{block_id}.attentions.{layer_in_block_id}'} - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) - - - resnet_0 = middle_blocks[0] - attentions = middle_blocks[1] - resnet_1 = middle_blocks[2] - - resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) - - resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) - - attentions_paths = renew_attention_paths(attentions) - meta_path = {'old': 'middle_block.1', 'new': 'mid_block.attentions.0'} - assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) - - for i in range(num_output_blocks): - block_id = i // (config['layers_per_block'] + 1) - layer_in_block_id = i % (config['layers_per_block'] + 1) - output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] - output_block_list = {} - - for layer in output_block_layers: - layer_id, layer_name = layer.split('.')[0], shave_segments(layer, 1) - if layer_id in output_block_list: - output_block_list[layer_id].append(layer_name) - else: - output_block_list[layer_id] = [layer_name] - - if len(output_block_list) > 1: - resnets = [key for key in output_blocks[i] if f'output_blocks.{i}.0' in key] - attentions = [key for key in output_blocks[i] if f'output_blocks.{i}.1' in key] - - resnet_0_paths = renew_resnet_paths(resnets) - paths = renew_resnet_paths(resnets) - - meta_path = {'old': f'output_blocks.{i}.0', 'new': f'up_blocks.{block_id}.resnets.{layer_in_block_id}'} - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) - - if ['conv.weight', 'conv.bias'] in output_block_list.values(): - index = list(output_block_list.values()).index(['conv.weight', 'conv.bias']) - new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.weight'] = unet_state_dict[f'output_blocks.{i}.{index}.conv.weight'] - new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.bias'] = unet_state_dict[f'output_blocks.{i}.{index}.conv.bias'] - - # Clear attentions as they have been attributed above. 
- if len(attentions) == 2: - attentions = [] - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = { - 'old': f'output_blocks.{i}.1', - 'new': f'up_blocks.{block_id}.attentions.{layer_in_block_id}' - } - assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) - else: - resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) - for path in resnet_0_paths: - old_path = '.'.join(['output_blocks', str(i), path['old']]) - new_path = '.'.join(['up_blocks', str(block_id), 'resnets', str(layer_in_block_id), path['new']]) - - new_checkpoint[new_path] = unet_state_dict[old_path] - - return new_checkpoint - - -def convert_ldm_vae_checkpoint(checkpoint, config): - # extract state dict for VAE - vae_state_dict = {} - vae_key = "first_stage_model." - keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(vae_key): - vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) - - new_checkpoint = {} - - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] - - new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] - new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] - - - # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({'.'.join(layer.split('.')[:3]) for layer in vae_state_dict if 'encoder.down' in layer}) - down_blocks = {layer_id: [key for key in vae_state_dict if f'down.{layer_id}' in key] for layer_id in range(num_down_blocks)} - - # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({'.'.join(layer.split('.')[:3]) for layer in vae_state_dict if 'decoder.up' in layer}) - up_blocks = {layer_id: [key for key in vae_state_dict if f'up.{layer_id}' in key] for layer_id in range(num_up_blocks)} - - - for i in range(num_down_blocks): - resnets = [key for key in down_blocks[i] if f'down.{i}' in key and f"down.{i}.downsample" not in key] - - if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.bias") - - paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'down.{i}.block', 
'new': f'down_blocks.{i}.resnets'} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i - 1}'} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - - for i in range(num_up_blocks): - block_id = num_up_blocks - 1 - i - resnets = [key for key in up_blocks[block_id] if f'up.{block_id}' in key and f"up.{block_id}.upsample" not in key] - - if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.bias"] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'up.{block_id}.block', 'new': f'up_blocks.{i}.resnets'} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {'old': f'mid.block_{i}', 'new': f'mid_block.resnets.{i - 1}'} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - return new_checkpoint - - -def convert_ldm_bert_checkpoint(checkpoint, config): - def _copy_attn_layer(hf_attn_layer, pt_attn_layer): - - hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight - hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight - hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight - - hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight - hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias - - - def _copy_linear(hf_linear, pt_linear): - hf_linear.weight = pt_linear.weight - hf_linear.bias = pt_linear.bias - - - def _copy_layer(hf_layer, pt_layer): - # copy layer norms - _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0]) - _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0]) - - # copy attn - _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1]) - - # copy MLP - pt_mlp = pt_layer[1][1] - _copy_linear(hf_layer.fc1, pt_mlp.net[0][0]) - _copy_linear(hf_layer.fc2, pt_mlp.net[2]) - - - def _copy_layers(hf_layers, pt_layers): - for i, hf_layer 
in enumerate(hf_layers): - if i != 0: i += i - pt_layer = pt_layers[i:i+2] - _copy_layer(hf_layer, pt_layer) - - hf_model = LDMBertModel(config).eval() - - # copy embeds - hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight - hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight - - # copy layer norm - _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm) - - # copy hidden layers - _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers) - - _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits) - - return hf_model - - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." - ) - # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml - parser.add_argument( - "--original_config_file", - default=None, - type=str, - required=True, - help="The YAML config file corresponding to the original architecture.", - ) - parser.add_argument( - "--scheduler_type", default="pndm", type=str, required=True, help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']" - ) - parser.add_argument( - "--dump_path", default=None, type=str, required=True, help="Path to the output model." - ) - - args = parser.parse_args() - - original_config = OmegaConf.load(args.original_config_file) - - checkpoint = torch.load(args.checkpoint_path)["state_dict"] - - if args.scheduler_type == "pndm": - elif args.scheduler_type == "pndm": - elif args.scheduler_type == "pndm": - else: - raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") - - # Convert the UNet2DConditionModel model. - unet_config = create_unet_diffusers_config(original_config) - converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config) - - unet = UNet2DConditionModel(**unet_config) - unet.load_state_dict(converted_unet_checkpoint) - - # Convert the VAE model. - vae_config = create_vae_diffusers_config(original_config) - converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) - - vae = AutoencoderKL(**vae_config) - vae.load_state_dict(converted_vae_checkpoint) - - # Convert the text model. - text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] - if text_model_type == "FrozenCLIPEmbedder": - text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - else: - # TODO: update the convert function to use the state_dict without the model instance. - text_config = create_ldm_bert_config(original_config) - text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) - tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") - - scheduler = create_diffusers_schedular(original_config) - pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) - pipe.save_pretrained(args.dump_path) -
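
After this patch series, the conversion script reads the original checkpoint's `state_dict`, builds the UNet, VAE, and text-encoder configs from the YAML config, instantiates the scheduler requested via --scheduler_type, and saves the assembled pipeline with pipe.save_pretrained(args.dump_path). A minimal usage sketch follows; the checkpoint filename and output directory are placeholders, and the exact output attribute of the generation call may differ across diffusers versions.

    # Hypothetical paths; substitute your own checkpoint and output directory.
    # Run the converter (from the repository root); --original_config_file may be
    # omitted, in which case the script downloads v1-inference.yaml itself:
    #
    #   python scripts/convert_original_stable_diffusion_to_diffusers.py \
    #       --checkpoint_path ./sd-v1-4.ckpt \
    #       --scheduler_type ddim \
    #       --dump_path ./stable-diffusion-v1-4-diffusers

    import torch
    from diffusers import StableDiffusionPipeline

    # Load the converted weights that the script wrote via save_pretrained().
    pipe = StableDiffusionPipeline.from_pretrained("./stable-diffusion-v1-4-diffusers")
    pipe = pipe.to("cuda")

    # Generate an image with the converted pipeline.
    image = pipe("a photograph of an astronaut riding a horse").images[0]
    image.save("astronaut.png")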