Skip to content

Commit 3ad4984

Browse files
authored
Revert "[LLM] add memory stats to logger of trainer (#8269)"
This reverts commit beb433a.
1 parent beb433a commit 3ad4984

File tree

1 file changed

+2
-16
lines changed

1 file changed

+2
-16
lines changed

paddlenlp/trainer/trainer.py

Lines changed: 2 additions & 16 deletions
Original file line number · Diff line number · Diff line change
@@ -39,8 +39,6 @@
3939
import paddle.distributed as dist
4040
import paddle.nn as nn
4141
from packaging import version
42-
from paddle import framework
43-
from paddle.base import core
4442
from paddle.distributed import fleet
4543
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.hybrid_parallel_optimizer import (
4644
HybridParallelOptimizer,
@@ -1258,20 +1256,6 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval,
12581256
logs["learning_rate"] = float("{0:.3e}".format(self._get_learning_rate()))
12591257
logs["global_step"] = int(self.state.global_step)
12601258

1261-
divisor = 2**30
1262-
# TODO(@gexiao): replace these codes with unified APIs in Paddle
1263-
current_device = framework._current_expected_place_()
1264-
if str(current_device) != "Place(cpu)":
1265-
device_id = current_device.get_device_id()
1266-
current_memory_allocated = core.device_memory_stat_current_value("Allocated", device_id)
1267-
current_memory_reserved = core.device_memory_stat_current_value("Reserved", device_id)
1268-
max_memory_allocated = core.device_memory_stat_peak_value("Allocated", device_id)
1269-
max_memory_reserved = core.device_memory_stat_peak_value("Reserved", device_id)
1270-
logs["current_memory_allocated"] = current_memory_allocated / divisor
1271-
logs["current_memory_reserved"] = current_memory_reserved / divisor
1272-
logs["max_memory_allocated"] = max_memory_allocated / divisor
1273-
logs["max_memory_reserved"] = max_memory_reserved / divisor
1274-
12751259
total_train_batch_size = (
12761260
self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.dataset_world_size
12771261
)
@@ -1602,6 +1586,8 @@ def _load_rng_state(self, checkpoint):
16021586
random.setstate(checkpoint_rng_state["python"])
16031587
np.random.set_state(checkpoint_rng_state["numpy"])
16041588

1589+
core = paddle.framework.core
1590+
16051591
core.default_cpu_generator().set_state(checkpoint_rng_state["cpu"])
16061592
if core.is_compiled_with_cuda():
16071593
if not len(checkpoint_rng_state["cuda"]) == core.get_cuda_device_count():

0 commit comments

Comments (0)