diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index d73b089439a1..df9055b725e8 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -28,7 +28,7 @@
   title: "Using Diffusers"
 - sections:
   - local: optimization/fp16
-    title: "Torch Float16"
+    title: "Memory and Speed"
   - local: optimization/onnx
     title: "ONNX"
   - local: optimization/open_vino
diff --git a/docs/source/optimization/fp16.mdx b/docs/source/optimization/fp16.mdx
index 044f3937b9bb..064bc58f8c2b 100644
--- a/docs/source/optimization/fp16.mdx
+++ b/docs/source/optimization/fp16.mdx
@@ -10,23 +10,67 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->
 
+# Memory and speed
+We present some techniques and ideas to optimize 🤗 Diffusers _inference_ for memory or speed.
 
-# Quicktour
+## CUDA `autocast`
 
-Start using Diffusers🧨 quickly!
-To start, use the [`DiffusionPipeline`] for quick inference and sample generations!
+If you use a CUDA GPU, you can take advantage of `torch.autocast` to perform inference roughly twice as fast at the cost of slightly lower precision. All you need to do is put your inference call inside an `autocast` context manager. The following example shows how to do it for Stable Diffusion text-to-image generation:
 
+```Python
+from torch import autocast
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=True)
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+with autocast("cuda"):
+    image = pipe(prompt).images[0]
 ```
-pip install diffusers
+
+Despite the precision loss, in our experience the final image results look the same as the `float32` versions. Feel free to experiment and report back!
+
+## Half precision weights
+
+To save more GPU memory, you can load the model weights directly in half precision. This involves loading the float16 version of the weights, which was saved to a branch named `fp16`, and telling PyTorch to use the `float16` type when loading them:
 
+```Python
+import torch
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    revision="fp16",
+    torch_dtype=torch.float16,
+    use_auth_token=True
+)
 ```
 
-## Main classes
+## Sliced attention for additional memory savings
+
+For additional memory savings, you can use a sliced version of attention that performs the computation in steps instead of all at once.
 
-### Models
+
+Attention slicing is useful even with a batch size of just 1, as long as the model uses more than one attention head. If there is more than one attention head, the *QK^T* attention matrix can be computed sequentially for each head, which can save a significant amount of memory.
+
 
-### Schedulers
+To perform the attention computation sequentially over each head, you only need to invoke [`~StableDiffusionPipeline.enable_attention_slicing`] in your pipeline before inference, like here:
 
-### Pipeliens
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    revision="fp16",
+    torch_dtype=torch.float16,
+    use_auth_token=True
+)
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_attention_slicing()
+with torch.autocast("cuda"):
+    image = pipe(prompt).images[0]
+```
+
+There's a small performance penalty of about 10% slower inference times, but this method allows you to use Stable Diffusion in as little as 3.2 GB of VRAM!
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 47e98d99f525..289785a1d312 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -67,13 +67,13 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
         r"""
         Enable sliced attention computation.
 
-        When this option is enabled, the attention module will split the input batch in slices, to compute attention in
-        several steps. This is useful to save some memory in exchange for a small speed decrease.
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
 
         Args:
             slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
-                When `"auto"`, halves the input batch to the attention heads, so attention will be computed in two
-                steps. If a number is provided, use as many slices as `attention_head_dim // slice_size`. In this case,
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
                 `attention_head_dim` must be a multiple of `slice_size`.
         """
         if slice_size == "auto":
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 18a2477810bb..5c639920904b 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -78,13 +78,13 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
         r"""
         Enable sliced attention computation.
 
-        When this option is enabled, the attention module will split the input batch in slices, to compute attention in
-        several steps. This is useful to save some memory in exchange for a small speed decrease.
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
 
         Args:
             slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
-                When `"auto"`, halves the input batch to the attention heads, so attention will be computed in two
-                steps. If a number is provided, use as many slices as `attention_head_dim // slice_size`. In this case,
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
                 `attention_head_dim` must be a multiple of `slice_size`.
         """
         if slice_size == "auto":
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index f8677b187035..9e6b5c9a9b6d 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -92,13 +92,13 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
         r"""
         Enable sliced attention computation.
 
-        When this option is enabled, the attention module will split the input batch in slices, to compute attention in
-        several steps. This is useful to save some memory in exchange for a small speed decrease.
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
 
         Args:
             slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
-                When `"auto"`, halves the input batch to the attention heads, so attention will be computed in two
-                steps. If a number is provided, use as many slices as `attention_head_dim // slice_size`. In this case,
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
                 `attention_head_dim` must be a multiple of `slice_size`.
         """
         if slice_size == "auto":
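
As a usage note on the `slice_size` argument documented in the hunks above, here is a minimal sketch of how the two modes could be combined with the fp16 loading shown in the new docs. The explicit `slice_size=1` value is an illustrative assumption (the smallest allowed slice, hence the most slices), not something this patch prescribes:

```Python
import torch
from diffusers import StableDiffusionPipeline

# Load the fp16 weights, as in the documentation examples above.
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    revision="fp16",
    torch_dtype=torch.float16,
    use_auth_token=True,
)
pipe = pipe.to("cuda")

# Default: slice_size="auto" halves the input to the attention heads,
# so attention is computed in two steps.
pipe.enable_attention_slicing()

# Passing an integer instead computes attention in
# `attention_head_dim // slice_size` slices; `attention_head_dim` must be
# a multiple of `slice_size`. slice_size=1 (an illustrative choice) gives
# the most slices and therefore the largest memory savings.
pipe.enable_attention_slicing(slice_size=1)

prompt = "a photo of an astronaut riding a horse on mars"
with torch.autocast("cuda"):
    image = pipe(prompt).images[0]
```

The second `enable_attention_slicing` call simply overrides the first, so in practice you would keep whichever mode fits your memory budget.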