From dce969f1bddb2087636e5ba7bfad0d7aa67fe040 Mon Sep 17 00:00:00 2001
From: zhiqiu
Date: Mon, 28 Oct 2024 15:49:50 +0800
Subject: [PATCH] fix parameter calculation in auto_parallel mode

---
 paddlenlp/trainer/trainer.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py
index ba280a1832d5..963f76b4adbd 100644
--- a/paddlenlp/trainer/trainer.py
+++ b/paddlenlp/trainer/trainer.py
@@ -817,7 +817,13 @@ def train(
         logger.info(f"  Total num train samples = {num_train_samples:,}")
         # per_device_trainable_numel = sum(p.numel().item() for p in model.parameters() if not p.stop_gradient)
         # TODO: Temporary fix since Tensor.numel() not supported in distributed mode
-        per_device_trainable_numel = sum(np.prod(p.shape) for p in model.parameters() if not p.stop_gradient)
+        if self.args.enable_auto_parallel:
+            per_device_trainable_numel = 0
+            for p in model.parameters():
+                if not p.stop_gradient:
+                    per_device_trainable_numel += np.prod(p._local_shape) if p.is_dist() else np.prod(p.shape)
+        else:
+            per_device_trainable_numel = sum(np.prod(p.shape) for p in model.parameters() if not p.stop_gradient)
         logger.debug(f"  Number of trainable parameters = {per_device_trainable_numel:,} (per device)")
         if self.args.use_hybrid_parallel:
             # todo fix for pipeline_parallel_degree
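
Note (not part of the patch): the change counts only the local shard of each distributed parameter, so the "per device" number no longer reports the global size under auto_parallel. Below is a minimal standalone sketch of the same counting logic; `count_trainable_params` is a hypothetical helper name, and it relies only on the attributes the patch itself uses (`p.stop_gradient`, `p.is_dist()`, `p._local_shape`, `p.shape`).

    import numpy as np

    def count_trainable_params(model, enable_auto_parallel):
        """Sketch mirroring the patched logic: per-device trainable parameter count."""
        total = 0
        for p in model.parameters():
            if p.stop_gradient:
                continue  # frozen parameter, not trainable
            if enable_auto_parallel and p.is_dist():
                # Distributed tensor: count only the elements stored on this rank.
                total += int(np.prod(p._local_shape))
            else:
                # Plain (or replicated) tensor: count its full global shape.
                total += int(np.prod(p.shape))
        return total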