sd3推理优化——避免同步 (#695)

chang-wenbin · web-flow · commit ad3160057b04 · 2024-09-04T19:05:19.000+08:00
When s_churn == 0.0, gamma does not need to be computed, which avoids a CUDA
synchronization; 可以加速SD3端到端性能。
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_flow_match_euler_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_flow_match_euler_discrete.py
@@ -24,7 +24,6 @@
 from ..utils.paddle_utils import randn_tensor
 from .scheduling_utils import SchedulerMixin
 
-
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
@@ -245,12 +244,13 @@ def step(
         sample = sample.cast(paddle.float32)
 
         sigma = self.sigmas[self.step_index]
+        # NOTE(changwenbin & zhoukangkang): when s_churn == 0.0 there is no need to compute gamma, which avoids a CUDA synchronization.
+        if s_churn == 0.0:
+            gamma = 0.0
+        else:
+            gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0
 
-        gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0
-
-        noise = randn_tensor(
-            model_output.shape, dtype=model_output.dtype, generator=generator
-        )
+        noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator)
 
         eps = noise * s_noise
         sigma_hat = sigma * (gamma + 1)
@@ -283,4 +283,4 @@ def step(
         return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample)
 
     def __len__(self):
-        return self.config.num_train_timesteps
+        return self.config.num_train_timesteps