|
39 | 39 | import paddle.distributed as dist
|
40 | 40 | import paddle.nn as nn
|
41 | 41 | from packaging import version
|
42 |
| -from paddle import framework |
43 |
| -from paddle.base import core |
44 | 42 | from paddle.distributed import fleet
|
45 | 43 | from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.hybrid_parallel_optimizer import (
|
46 | 44 | HybridParallelOptimizer,
|
@@ -1258,20 +1256,6 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval,
|
1258 | 1256 | logs["learning_rate"] = float("{0:.3e}".format(self._get_learning_rate()))
|
1259 | 1257 | logs["global_step"] = int(self.state.global_step)
|
1260 | 1258 |
|
1261 |
| - divisor = 2**30 |
1262 |
| - # TODO(@gexiao): replace these codes with unified APIs in Paddle |
1263 |
| - current_device = framework._current_expected_place_() |
1264 |
| - if str(current_device) != "Place(cpu)": |
1265 |
| - device_id = current_device.get_device_id() |
1266 |
| - current_memory_allocated = core.device_memory_stat_current_value("Allocated", device_id) |
1267 |
| - current_memory_reserved = core.device_memory_stat_current_value("Reserved", device_id) |
1268 |
| - max_memory_allocated = core.device_memory_stat_peak_value("Allocated", device_id) |
1269 |
| - max_memory_reserved = core.device_memory_stat_peak_value("Reserved", device_id) |
1270 |
| - logs["current_memory_allocated"] = current_memory_allocated / divisor |
1271 |
| - logs["current_memory_reserved"] = current_memory_reserved / divisor |
1272 |
| - logs["max_memory_allocated"] = max_memory_allocated / divisor |
1273 |
| - logs["max_memory_reserved"] = max_memory_reserved / divisor |
1274 |
| - |
1275 | 1259 | total_train_batch_size = (
|
1276 | 1260 | self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.dataset_world_size
|
1277 | 1261 | )
|
@@ -1602,6 +1586,8 @@ def _load_rng_state(self, checkpoint):
|
1602 | 1586 | random.setstate(checkpoint_rng_state["python"])
|
1603 | 1587 | np.random.set_state(checkpoint_rng_state["numpy"])
|
1604 | 1588 |
|
| 1589 | + core = paddle.framework.core |
| 1590 | + |
1605 | 1591 | core.default_cpu_generator().set_state(checkpoint_rng_state["cpu"])
|
1606 | 1592 | if core.is_compiled_with_cuda():
|
1607 | 1593 | if not len(checkpoint_rng_state["cuda"]) == core.get_cuda_device_count():
|
|
0 commit comments