From 2b78b055c2c9788933a90c4865e1e4be790c9479 Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Wed, 21 Aug 2024 13:30:54 +0800
Subject: [PATCH 1/3] update optimizer async save signal

---
 paddlenlp/trainer/trainer.py       | 7 ++++++-
 paddlenlp/trainer/trainer_utils.py | 3 ++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py
index cf0e95973675..2d78ef5d7d6f 100644
--- a/paddlenlp/trainer/trainer.py
+++ b/paddlenlp/trainer/trainer.py
@@ -2249,7 +2249,12 @@ def _save_checkpoint(self, model, metrics=None):
                             self.optimizer.state_dict(),
                             os.path.join(output_dir, optimizer_name),
                         )
-
+            else:
+                if self.args.unified_checkpoint and "async_save" in self.args.unified_checkpoint_config:
+                    global_rank = paddle.distributed.get_rank() if paddle.distributed.get_world_size() > 1 else -1
+                    paddle.save(global_rank, os.path.join(output_dir, f".optimizer_weight.done.{global_rank}"))
+                    if "skip_save_model_weight" not in self.args.unified_checkpoint_config:
+                        paddle.save(global_rank, os.path.join(output_dir, f".master_weight.done.{global_rank}"))
         if self.args.should_save or self.args.use_expert_parallel:
             if not self.args.use_hybrid_parallel:
                 logger.info("Saving optimizer files.")
diff --git a/paddlenlp/trainer/trainer_utils.py b/paddlenlp/trainer/trainer_utils.py
index c689e5b80857..18836b345961 100644
--- a/paddlenlp/trainer/trainer_utils.py
+++ b/paddlenlp/trainer/trainer_utils.py
@@ -46,6 +46,7 @@
 from ..transformers.tokenizer_utils_base import BatchEncoding
 from ..utils.import_utils import is_paddle_cuda_available, is_psutil_available
 from ..utils.log import logger
+from .utils.helper import distributed_file

 __all__ = [
     "TrainOutput",
@@ -265,7 +266,7 @@ def get_last_checkpoint(folder, uc_async_save=False):
         if os.path.exists(os.path.join(current_path, ".checkpoint_done")):
             return current_path
         else:
-            saving_info = paddle.load(os.path.join(current_path, ".saving_info"))
+            saving_info = paddle.load(distributed_file(os.path.join(current_path, ".saving_info")))
             pre_world_size = saving_info.get("world_size", 1)
             ignore_save_lr_and_optim = saving_info.get("ignore_save_lr_and_optim", False)
             skip_save_model_weight = saving_info.get("skip_save_model_weight", False)

From 7258e6e2e05f9b16a22c5e32dcd890ca6c371048 Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Wed, 4 Sep 2024 17:32:03 +0800
Subject: [PATCH 2/3] fix uc lora config

---
 paddlenlp/trainer/plugins/unified_checkpoint.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/paddlenlp/trainer/plugins/unified_checkpoint.py b/paddlenlp/trainer/plugins/unified_checkpoint.py
index 78e94bba4b74..51a5d1f75d07 100644
--- a/paddlenlp/trainer/plugins/unified_checkpoint.py
+++ b/paddlenlp/trainer/plugins/unified_checkpoint.py
@@ -350,7 +350,10 @@ def save_unified_checkpoint(self, model, optimizer, output_dir):
         # save the config
         config_to_save = save_config(model_to_save)
         # Attach architecture to the config
-        config_to_save.architectures = [model_to_save.__class__.__name__]
+        if isinstance(model_to_save, LoRAModel) or isinstance(model_to_save, PrefixModelForCausalLM):
+            config_to_save.architectures = [model_to_save.model.__class__.__name__]
+        else:
+            config_to_save.architectures = [model_to_save.__class__.__name__]
         if self.args.should_save:
             config_to_save.save_pretrained(save_directory)
         paddle.device.cuda.empty_cache()
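Note on [PATCH 1/3]: when "async_save" is present in unified_checkpoint_config, ranks that do not write optimizer shards now drop ".optimizer_weight.done.{global_rank}" (and, unless "skip_save_model_weight" is configured, ".master_weight.done.{global_rank}") marker files, so the async checkpoint flow can tell that these ranks have also finished. The sketch below shows one way such markers could be polled before a checkpoint directory is treated as complete; it is a minimal illustration, and the helper name wait_for_done_markers plus the polling loop are assumptions, not PaddleNLP API.

    import os
    import time

    def wait_for_done_markers(output_dir, world_size, prefix=".optimizer_weight.done", timeout_s=600):
        # Hypothetical helper: wait until every rank 0..world_size-1 has written
        # its "<prefix>.<rank>" marker into output_dir, or give up after timeout_s.
        # (A single-card run writes rank -1 instead; that case is ignored here.)
        expected = {f"{prefix}.{rank}" for rank in range(world_size)}
        deadline = time.time() + timeout_s
        while time.time() < deadline:
            if expected.issubset(os.listdir(output_dir)):
                return True
            time.sleep(1)
        return False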
From 4f4c03a1964d2587b67b5c32441eeec2bc9e23fb Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Wed, 4 Sep 2024 20:10:06 +0800
Subject: [PATCH 3/3] fix pp release_grads

---
 paddlenlp/trainer/training_args.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py
index ee95a63aebb8..eed8090258e8 100644
--- a/paddlenlp/trainer/training_args.py
+++ b/paddlenlp/trainer/training_args.py
@@ -1041,7 +1041,7 @@ def __post_init__(self):
                     "dp_comm_overlap": enable_dp_comm_overlap,
                     "sharding_comm_overlap": enable_sharding_comm_overlap,
                     "enable_timer": "enable_timer" in pipeline_parallel_config,
-                    "release_gradients": "enable_release_grads" in pipeline_parallel_config,
+                    "release_gradients": "enable_release_grads" in pipeline_parallel_config or self.release_grads,
                     "overlap_p2p_comm": "enable_overlap_p2p_comm" in pipeline_parallel_config,
                     "clear_every_step_cache": "enable_clear_every_step_cache" in pipeline_parallel_config,
                     "use_batch_p2p_comm": "disable_batch_p2p_comm" not in pipeline_parallel_config,
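Note on [PATCH 3/3]: "release_gradients" in the pipeline-parallel strategy configuration is now enabled either by the "enable_release_grads" token inside pipeline_parallel_config or by the standalone release_grads training argument. A minimal illustration of the combined condition, using made-up flag values rather than a real TrainingArguments instance:

    # Illustrative only: pipeline_parallel_config is modeled as a set of string
    # flags and release_grads as the standalone boolean training argument.
    pipeline_parallel_config = {"enable_timer", "enable_dp_comm_overlap"}
    release_grads = True

    release_gradients = "enable_release_grads" in pipeline_parallel_config or release_grads
    assert release_gradients is True  # the standalone flag alone now turns it on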