|
39 | 39 | import paddle.distributed as dist
|
40 | 40 | import paddle.nn as nn
|
41 | 41 | from packaging import version
|
| 42 | +from paddle import framework |
| 43 | +from paddle.base import core |
42 | 44 | from paddle.distributed import fleet
|
43 | 45 | from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.hybrid_parallel_optimizer import (
|
44 | 46 | HybridParallelOptimizer,
|
@@ -1256,6 +1258,20 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval,
|
1256 | 1258 | logs["learning_rate"] = float("{0:.3e}".format(self._get_learning_rate()))
|
1257 | 1259 | logs["global_step"] = int(self.state.global_step)
|
1258 | 1260 |
|
| 1261 | + divisor = 2**30 |
| 1262 | + # TODO(@gexiao): replace this code with unified APIs in Paddle |
| 1263 | + current_device = framework._current_expected_place_() |
| 1264 | + if str(current_device) != "Place(cpu)": |
| 1265 | + device_id = current_device.get_device_id() |
| 1266 | + current_memory_allocated = core.device_memory_stat_current_value("Allocated", device_id) |
| 1267 | + current_memory_reserved = core.device_memory_stat_current_value("Reserved", device_id) |
| 1268 | + max_memory_allocated = core.device_memory_stat_peak_value("Allocated", device_id) |
| 1269 | + max_memory_reserved = core.device_memory_stat_peak_value("Reserved", device_id) |
| 1270 | + logs["current_memory_allocated"] = current_memory_allocated / divisor |
| 1271 | + logs["current_memory_reserved"] = current_memory_reserved / divisor |
| 1272 | + logs["max_memory_allocated"] = max_memory_allocated / divisor |
| 1273 | + logs["max_memory_reserved"] = max_memory_reserved / divisor |
| 1274 | + |
1259 | 1275 | total_train_batch_size = (
|
1260 | 1276 | self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.dataset_world_size
|
1261 | 1277 | )
|
@@ -1586,8 +1602,6 @@ def _load_rng_state(self, checkpoint):
|
1586 | 1602 | random.setstate(checkpoint_rng_state["python"])
|
1587 | 1603 | np.random.set_state(checkpoint_rng_state["numpy"])
|
1588 | 1604 |
|
1589 |
| - core = paddle.framework.core |
1590 |
| - |
1591 | 1605 | core.default_cpu_generator().set_state(checkpoint_rng_state["cpu"])
|
1592 | 1606 | if core.is_compiled_with_cuda():
|
1593 | 1607 | if not len(checkpoint_rng_state["cuda"]) == core.get_cuda_device_count():
|
|
0 commit comments