diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py
index ba280a1832d5..963f76b4adbd 100644
--- a/paddlenlp/trainer/trainer.py
+++ b/paddlenlp/trainer/trainer.py
@@ -817,7 +817,13 @@ def train(
         logger.info(f" Total num train samples = {num_train_samples:,}")
         # per_device_trainable_numel = sum(p.numel().item() for p in model.parameters() if not p.stop_gradient)
         # TODO: Temporary fix since Tensor.numel() not supported in distributed mode
-        per_device_trainable_numel = sum(np.prod(p.shape) for p in model.parameters() if not p.stop_gradient)
+        if self.args.enable_auto_parallel:
+            per_device_trainable_numel = 0
+            for p in model.parameters():
+                if not p.stop_gradient:
+                    per_device_trainable_numel += np.prod(p._local_shape) if p.is_dist() else np.prod(p.shape)
+        else:
+            per_device_trainable_numel = sum(np.prod(p.shape) for p in model.parameters() if not p.stop_gradient)
         logger.debug(f" Number of trainable parameters = {per_device_trainable_numel:,} (per device)")
         if self.args.use_hybrid_parallel:
             # todo fix for pipeline_parallel_degree
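
The change above counts per-device trainable parameters differently under auto parallel: for a distributed tensor, `p.shape` describes the full (global) parameter, so the local shard size `p._local_shape` is used instead. Below is a minimal standalone sketch of that counting logic; the helper name `count_trainable_params_per_device` and its arguments are hypothetical, while the `stop_gradient`, `is_dist()`, and `_local_shape` accesses mirror the ones in the diff.

```python
import numpy as np


def count_trainable_params_per_device(model, enable_auto_parallel):
    """Sketch: count trainable parameters held on the current device.

    Under auto parallel, a sharded parameter's global `shape` would
    over-count, so the locally held slice (`_local_shape`) is used when
    the tensor is distributed.
    """
    if not enable_auto_parallel:
        # Non-auto-parallel path: every parameter is fully local.
        return sum(np.prod(p.shape) for p in model.parameters() if not p.stop_gradient)

    total = 0
    for p in model.parameters():
        if p.stop_gradient:
            continue  # frozen parameter, not trainable
        shape = p._local_shape if p.is_dist() else p.shape
        total += np.prod(shape)
    return total
```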