Skip to content

Commit 4452d29

Browse files
ForFishes authored and Mangodadada committed
Fix checker of nan/inf (PaddlePaddle#9029)
* fix checker position for nan/inf
* fix checker position for nan/inf
1 parent cb1e378 commit 4452d29

File tree

1 file changed

+5
-5
lines changed

1 file changed

+5
-5
lines changed

paddlenlp/trainer/trainer.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -992,10 +992,6 @@ def _inner_training_loop(
992992
else:
993993
tr_loss_step = self.training_step(model, inputs)
994994

995-
if not args.fp16:
996-
if not paddle.isfinite(tr_loss_step).all().item():
997-
raise ValueError(f"Loss contains inf or nan values at rank {paddle.distributed.get_rank()}")
998-
999995
tr_loss += tr_loss_step
1000996

1001997
def fused_allreduce_gradients_no_sync(paramlist, hcg):
@@ -1294,7 +1290,11 @@ def _print_timer(self):
12941290

12951291
def _get_item_from_loss(self, loss):
12961292
assert isinstance(loss, paddle.Tensor) and loss._is_initialized()
1297-
return loss.item()
1293+
loss_value = loss.item()
1294+
if not self.args.fp16:
1295+
if not np.isfinite(loss_value).all():
1296+
raise ValueError(f"Loss contains inf or nan values, its value is {loss_value}")
1297+
return loss_value
12981298

12991299
def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval, **kwargs):
13001300
if self.control.should_log:

0 commit comments

Comments
 (0)