def convert_ldm_clip_checkpoint(checkpoint):
    """Extract the CLIP text-encoder weights from an LDM checkpoint.

    Loads the stock ``openai/clip-vit-large-patch14`` text model, then
    overwrites its parameters with the ``cond_stage_model.transformer.*``
    tensors found in the original Stable Diffusion checkpoint, so the
    converted pipeline carries the checkpoint's (possibly fine-tuned)
    text encoder instead of the vanilla pretrained one.

    Args:
        checkpoint: State dict of the original LDM/Stable Diffusion
            checkpoint (flat mapping of dotted parameter names to tensors).

    Returns:
        A ``CLIPTextModel`` with its state dict replaced by the
        checkpoint's text-encoder weights.

    Raises:
        RuntimeError: from ``load_state_dict`` if the extracted keys do
            not exactly match the CLIPTextModel architecture.
    """
    text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

    # Include the trailing dot in the prefix so that (a) the startswith
    # test and the slice agree, and (b) keys like
    # "cond_stage_model.transformerfoo" or the bare prefix itself are not
    # mis-sliced into corrupted parameter names.
    prefix = "cond_stage_model.transformer."

    text_model_dict = {
        key[len(prefix):]: value for key, value in checkpoint.items() if key.startswith(prefix)
    }

    # Strict by default: fails loudly if the checkpoint's text encoder
    # does not match the CLIP-ViT-L/14 architecture, which is what we want
    # for a conversion script.
    text_model.load_state_dict(text_model_dict)

    return text_model