diff --git a/paddlenlp/data/causal_dataset.py b/paddlenlp/data/causal_dataset.py index 75a2ba193dca..9d70506205ac 100644 --- a/paddlenlp/data/causal_dataset.py +++ b/paddlenlp/data/causal_dataset.py @@ -94,10 +94,11 @@ def get_datasets_weights_and_num_samples(data_prefix, train_val_test_num_samples # Add 0.5% (the 1.005 factor) so in case the bleding dataset does # not uniformly distribute the number of samples, we still have # samples left to feed to the network. + # (NOTE, yujun06): This is a workaround to avoid issues with indexing in the blending dataset. Therefore, we need to add 20 samples to each dataset. datasets_train_valid_test_num_samples = [] for weight in weights: datasets_train_valid_test_num_samples.append( - [int(math.ceil(val * weight * 1.005)) for val in train_val_test_num_samples] + [int(math.ceil(val * weight * 1.005)) + 20 for val in train_val_test_num_samples] ) return prefixes, weights, datasets_train_valid_test_num_samples @@ -146,7 +147,9 @@ def build_train_valid_test_datasets( # Parse the values. output = get_datasets_weights_and_num_samples(data_prefix, train_val_test_num_samples) prefixes, weights, datasets_train_valid_test_num_samples = output - train_num_samples, valid_num_samples, test_num_samples = map(sum, zip(*datasets_train_valid_test_num_samples)) + # NOTE: megatron/gpt_dataset.py has been updated. When creating BlendableDataset, we will use the raw train_val_test_num_samples instead of the expanded ones. + # Please refer to https://github.com/NVIDIA/NeMo/blob/72f630d087d45655b1a069dc72debf01dfdbdb2d/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py#L74-L80 for more information + train_num_samples, valid_num_samples, test_num_samples = train_val_test_num_samples # Build individual datasets. train_datasets = [] diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index 746b7e252516..d7615680fc69 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -1051,11 +1051,12 @@ def _inner_training_loop( if optimizer_was_run: self.lr_scheduler.step() - if enable_release_grads and args.pipeline_parallel_degree > 1: + if args.release_grads or enable_release_grads: self.optimizer.clear_grad(set_to_zero=False) - for _, buffers in model._chunk_2_comm_buffers.items(): - for buffer in buffers: - buffer._clear_grad_storage() + if args.pipeline_parallel_degree > 1: + for _, buffers in model._chunk_2_comm_buffers.items(): + for buffer in buffers: + buffer._clear_grad_storage() else: self.optimizer.clear_grad() @@ -1434,6 +1435,7 @@ def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoa eval_dataset, batch_size=self.args.per_device_eval_batch_size, collate_fn=self.data_collator, + drop_last=self.args.dataloader_drop_last, num_workers=0, eval=True, ) @@ -1442,6 +1444,7 @@ def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoa eval_dataset, batch_size=self.args.per_device_eval_batch_size, collate_fn=self.data_collator, + drop_last=self.args.dataloader_drop_last, num_workers=0, ) @@ -1454,7 +1457,7 @@ def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoa eval_dataset, batch_sampler=eval_sampler, collate_fn=self.data_collator, - num_workers=self.args.dataloader_num_workers, + num_workers=0, eval=True, ) else: @@ -1462,7 +1465,7 @@ def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoa eval_dataset, batch_sampler=eval_sampler, collate_fn=self.data_collator, - num_workers=self.args.dataloader_num_workers, + num_workers=0, ) def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: @@ -1500,6 +1503,7 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: batch_size=self.args.per_device_eval_batch_size * self.world_size, collate_fn=self.data_collator, # _get_collator_with_removed_columns num_workers=0, + drop_last=self.args.dataloader_drop_last, eval=True, ) else: @@ -1507,6 +1511,7 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: test_dataset, batch_size=self.args.per_device_eval_batch_size * self.world_size, collate_fn=self.data_collator, # _get_collator_with_removed_columns + drop_last=self.args.dataloader_drop_last, num_workers=0, ) @@ -1520,6 +1525,7 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: test_dataset, batch_sampler=test_sampler, collate_fn=self.data_collator, + num_workers=0, drop_last=self.args.dataloader_drop_last, eval=True, ) @@ -1529,6 +1535,7 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: batch_sampler=test_sampler, collate_fn=self.data_collator, drop_last=self.args.dataloader_drop_last, + num_workers=0, ) def create_optimizer_and_scheduler(self, num_training_steps: int): @@ -1748,16 +1755,8 @@ def _wrap_model(self, model, training=True): in_sep_parallel_mode = self.args.sep_parallel_degree > 1 # Multi-gpu training - if ( - self.args.world_size > 1 - and not self.args.use_hybrid_parallel - or not ( - in_pipeline_parallel_mode - or in_sharding_parallel_mode - or in_tensor_parallel_mode - or in_sep_parallel_mode - ) - ): + if self.args.world_size > 1 and (not self.args.use_hybrid_parallel): + # MOE use DDP to broadcaset parameters. model = paddle.DataParallel(model) # Distributed training (should be after fp16 initialization) diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py index 2ed9d343ceaa..9f4a4d765721 100644 --- a/paddlenlp/trainer/training_args.py +++ b/paddlenlp/trainer/training_args.py @@ -344,6 +344,8 @@ class TrainingArguments: Whether skip profile timer, timer will record time usage of forward/ backward/ step, etc. distributed_dataloader (`bool`, *optional*): Whether to use distributed dataloader. Default is `False`. + release_grads (`bool`, *optional*): + Whether to release gradients during training. Default is `False`. """ output_dir: str = field( @@ -787,6 +789,9 @@ class TrainingArguments: default=False, metadata={"help": "whether to run distributed training in auto parallel mode"}, ) + release_grads: Optional[bool] = field( + default=False, metadata={"help": "Whether to release gradients during training. Default is `False`."} + ) def __post_init__(self): env_local_rank = int(os.environ.get("PADDLE_RANK_IN_NODE", -1)) @@ -1030,7 +1035,7 @@ def __post_init__(self): "dp_comm_overlap": enable_dp_comm_overlap, "sharding_comm_overlap": enable_sharding_comm_overlap, "enable_timer": "enable_timer" in pipeline_parallel_config, - "release_gradients": "enable_release_grads" in pipeline_parallel_config, + "release_gradients": "enable_release_grads" in pipeline_parallel_config or self.release_grads, "overlap_p2p_comm": "enable_overlap_p2p_comm" in pipeline_parallel_config, "clear_every_step_cache": "enable_clear_every_step_cache" in pipeline_parallel_config, "use_batch_p2p_comm": "disable_batch_p2p_comm" not in pipeline_parallel_config, @@ -1400,7 +1405,7 @@ def is_segment_parallel_supported(): if world_size > 1: if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(): if self.unified_checkpoint: - self.use_hybrid_parallel = True + # DP use hybrid group strategy = fleet.DistributedStrategy() fleet.init(is_collective=True, strategy=strategy) else: diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py index dc1c753206c4..21d0d191059c 100644 --- a/paddlenlp/transformers/model_utils.py +++ b/paddlenlp/transformers/model_utils.py @@ -798,7 +798,7 @@ def _load_state_dict_into_meta_model( dtype = convert_np_dtype_to_dtype_(dtype) error_msgs = [] - + model_state_dict = model.state_dict() for param_name, param in state_dict.items(): # First part of the test is always true as loaded_state_dict_keys always contains state_dict keys. if param_name not in loaded_state_dict_keys or param_name not in expected_keys: @@ -833,7 +833,7 @@ def _load_state_dict_into_meta_model( if old_param is not None: param = param.astype(dtype=old_param.dtype) with paddle.no_grad(): - model.state_dict()[param_name].get_tensor()._share_data_with(param.value().get_tensor()) + model_state_dict[param_name].get_tensor()._share_data_with(param.value().get_tensor()) param.value().get_tensor()._clear() return error_msgs @@ -1890,7 +1890,7 @@ def _find_mismatched_keys( if ( shard_file.endswith(".safetensors") and config.tensor_parallel_degree > 1 - and "tp" not in shard_file + and "tp" not in os.path.split(shard_file)[-1] ): pre_tensor_parallel_split = True assert loaded_keys is not None, "loaded_keys is not None." diff --git a/paddlenlp/utils/batch_sampler.py b/paddlenlp/utils/batch_sampler.py index 1cee8d1cb4c6..619904a6d33f 100644 --- a/paddlenlp/utils/batch_sampler.py +++ b/paddlenlp/utils/batch_sampler.py @@ -14,8 +14,6 @@ from __future__ import division, print_function -import math - import paddle __all__ = ["DistributedBatchSampler"] @@ -110,7 +108,7 @@ def __init__( # In pre-training mode when using distributed dataloader, the input dataset can be None. We should handle this situation. self.num_samples = 0 else: - self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks)) + self.num_samples = int(len(self.dataset) * 1.0 / self.nranks) self.total_size = self.num_samples * self.nranks def get_start_end_idx(self): @@ -125,7 +123,7 @@ def __iter__(self): self.consumed_samples, self.nranks, ) - self.remain_num_samples = int(math.ceil((len(self.dataset) - self.consumed_samples) * 1.0 / self.nranks)) + self.remain_num_samples = int((len(self.dataset) - self.consumed_samples) * 1.0 / self.nranks) self.remain_total_size = self.remain_num_samples * self.nranks self.batch_size_times_rank_size = self.batch_size * self.nranks diff --git a/paddlenlp/utils/safetensors.py b/paddlenlp/utils/safetensors.py index 422a7d09961c..c273d0d973c2 100644 --- a/paddlenlp/utils/safetensors.py +++ b/paddlenlp/utils/safetensors.py @@ -157,16 +157,16 @@ def __getitem__(self, index): out_start, out_stop, out_step = copy.deepcopy((self.start, self.stop, self.step)) for i, (start, stop, step, slice_) in enumerate(zip(self.start, self.stop, self.step, index)): - out_start[i] = slice_.start or 0 - out_step[i] = slice_.step or 1 - out_stop[i] = slice_.stop or stop - start + out_start[i] = slice_.start if slice_.start is not None else 0 + out_step[i] = slice_.step if slice_.step is not None else 1 + out_stop[i] = slice_.stop if slice_.stop is not None else stop - start out_stop[i] = min(stop, out_stop[i]) target_shape = [] - for x, y, z in zip(out_start, out_stop, out_step): + for x, y, z, sli in zip(out_start, out_stop, out_step, index): assert z == 1, "only support step = 1" - if y - x > 1: - target_shape.append(int(y - x)) + if y - x > 1 or sli.step is None: + target_shape.append(max(int(y - x), 0)) if len(target_shape) == 0: if self.shape == [1]: diff --git a/pyproject.toml b/pyproject.toml index 715323d09e37..858508037fce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ exclude = ['.flake8'] [tool.pytest.ini_options] minversion = "6.0" -addopts = "-ra -q --ignore model_zoo/gpt-3/" +addopts = "-ra -q --dist loadgroup" pythonpath = ["."] testpaths = [ "tests/data", @@ -28,7 +28,7 @@ testpaths = [ "tests/prompt", # "tests/taskflow", TODO (paddle 2.5.1 breaks this test suite, debug later) "tests/utils", - "model_zoo", + # "model_zoo", ] python_files = [ "test.py", diff --git a/tests/trainer/test_lora_unified_checkpoint.py b/tests/trainer/test_lora_unified_checkpoint.py index 98d5516d2388..0abfc257d4f7 100644 --- a/tests/trainer/test_lora_unified_checkpoint.py +++ b/tests/trainer/test_lora_unified_checkpoint.py @@ -149,7 +149,7 @@ def __test__(cls): def setUp(self): """ - 1. update runfrist and rerun to run defined different config + 1. update runfirst and rerun to run defined different config 2. update need_allclose to True if you want to check the result 3. update rtol to the relative value you want to check """ @@ -169,7 +169,7 @@ def setUp(self): self.run_lora_file = "llm/finetune_generation.py" - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_lora_file, **train_args) def rerun(self, train_args): @@ -181,7 +181,7 @@ def testTP4PP2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP4PP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -196,7 +196,7 @@ def testTP2Sharding4(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP2Sharding4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -213,7 +213,7 @@ def testTP8(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -227,7 +227,7 @@ def testTP4DP2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP4DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -242,7 +242,7 @@ def testTP4Sharding2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP4Sharding2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -257,7 +257,7 @@ def testTP2PP4(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP2PP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -272,7 +272,7 @@ def testPP8(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["PP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -287,7 +287,7 @@ def testPP4DP2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["PP4DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -302,7 +302,7 @@ def testPP4Sharding2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["PP4Sharding2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -317,7 +317,7 @@ def testSharding8S1(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding8S1"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -332,7 +332,7 @@ def testSharding8S2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding8S2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -347,7 +347,7 @@ def testSharding4S1DP2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding4S1DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -362,7 +362,7 @@ def testSharding4S2DP2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding4S2DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -377,7 +377,7 @@ def testSharding2S1DP4(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding2S1DP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -392,7 +392,7 @@ def testSharding2S2DP4(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding2S2DP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -407,7 +407,7 @@ def testDP8(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["DP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -416,19 +416,21 @@ def testDP8(self): np.testing.assert_allclose(res[0], res[1], self.rtol) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN2C4(TestUnifiedCheckpointBase): def setUp(self): super().setUp() self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_lora_file, **train_args) def rerun(self, train_args): self.run_n2c4(self.run_lora_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN1C8CheckpointCompatible(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -436,7 +438,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n1c8(self.run_lora_file, **train_args) @@ -445,6 +447,7 @@ def rerun(self, train_args): self.run_n1c8(self.run_lora_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestPaddleCheckpointOnN1C8Reset(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -452,7 +455,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n1c8(self.run_lora_file, **train_args) @@ -469,7 +472,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n2c4(self.run_lora_file, **train_args) diff --git a/tests/trainer/test_unified_checkpoint.py b/tests/trainer/test_unified_checkpoint.py index f8cc0ed7bfac..5ce99b36ff19 100644 --- a/tests/trainer/test_unified_checkpoint.py +++ b/tests/trainer/test_unified_checkpoint.py @@ -175,7 +175,7 @@ def __test__(cls): def setUp(self): """ - 1. update runfrist and rerun to run defined diffrent config + 1. update runfirst and rerun to run defined diffrent config 2. update need_allclose to True if you want to check the result 3. update rtol to the relative value you want to check """ @@ -194,7 +194,7 @@ def setUp(self): self.run_pretrain_file = "llm/llama/run_pretrain.py" - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -206,7 +206,7 @@ def testTP4PP2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP4PP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -221,7 +221,7 @@ def testTP2Sharding4(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP2Sharding4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -238,7 +238,7 @@ def testTP8(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -252,7 +252,7 @@ def testTP4DP2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP4DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -267,7 +267,7 @@ def testTP4Sharding2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP4Sharding2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -282,7 +282,7 @@ def testTP2PP4(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP2PP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -297,7 +297,7 @@ def testPP8(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["PP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -312,7 +312,7 @@ def testPP4DP2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["PP4DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -327,7 +327,7 @@ def testPP4Sharding2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["PP4Sharding2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -342,7 +342,7 @@ def testSharding8S1(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding8S1"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -357,7 +357,7 @@ def testSharding8S2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding8S2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -372,7 +372,7 @@ def testSharding4S1DP2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding4S1DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -387,7 +387,7 @@ def testSharding4S2DP2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding4S2DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -402,7 +402,7 @@ def testSharding2S1DP4(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding2S1DP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -417,7 +417,7 @@ def testSharding2S2DP4(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding2S2DP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -432,7 +432,7 @@ def testDP8(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["DP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -441,13 +441,14 @@ def testDP8(self): np.testing.assert_allclose(res[0], res[1], self.rtol) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN2C4(TestUnifiedCheckpointBase): def setUp(self): super().setUp() self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -463,7 +464,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -485,7 +486,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -507,7 +508,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) move_checkpoint_N1C8_to_N2C4() @@ -529,7 +530,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) move_checkpoint_N2C4_to_N1C8() @@ -557,7 +558,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -576,7 +577,7 @@ def setUp(self): self.need_allclose = False - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O1" self.run_n1c8(self.run_pretrain_file, **train_args) @@ -585,6 +586,7 @@ def rerun(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN1C8MasterWeightCompatibleO2ToO1(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -596,7 +598,7 @@ def setUp(self): self.need_allclose = False - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O2" self.run_n1c8(self.run_pretrain_file, **train_args) @@ -605,6 +607,7 @@ def rerun(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN1C8CheckpointCompatible(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -612,7 +615,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n1c8(self.run_pretrain_file, **train_args) @@ -621,6 +624,7 @@ def rerun(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestPaddleCheckpointOnN1C8Reset(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -628,7 +632,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n1c8(self.run_pretrain_file, **train_args) @@ -637,6 +641,7 @@ def rerun(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestPaddleCheckpointOnN1C2Reset(TestMultipleGpus): def setUp(self): self.configs = get_pretrain_arguments(pretrain_arguments) @@ -653,7 +658,7 @@ def setUp(self): self.run_pretrain_file = "llm/llama/run_pretrain.py" - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n1c2(self.run_pretrain_file, **train_args) @@ -669,7 +674,7 @@ def testTP2(self): train_args = self.configs["TP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -678,6 +683,7 @@ def testTP2(self): np.testing.assert_allclose(res[0], res[1], self.rtol) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN1C2Reset(TestMultipleGpus): def setUp(self): self.configs = get_pretrain_arguments(pretrain_arguments) @@ -714,7 +720,7 @@ def setUp(self): "training_args.bin", ] - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 1 self.run_n1c2(self.run_pretrain_file, **train_args) @@ -730,7 +736,7 @@ def testTP2(self): train_args = self.configs["TP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -748,7 +754,7 @@ def testFileLists(self): base_ckpt_path = os.path.join(pretrain_arguments["output_dir"], "checkpoint-%d" % save_steps) train_args = self.configs["TP2"] - self.runfrist(train_args) + self.runfirst(train_args) assert sorted(self.filelists) == sorted(os.listdir(base_ckpt_path)) self.rerun(train_args) @@ -761,7 +767,7 @@ def testFileLists(self): remove_logs() remove_ckpt(pretrain_arguments["output_dir"]) train_args["unified_checkpoint_config"] = "skip_save_model_weight" - self.runfrist(train_args) + self.runfirst(train_args) unsave_filelists = [ "master_weights-00001-of-00002.safetensors", "master_weights-00002-of-00002.safetensors", @@ -788,7 +794,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -809,7 +815,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -828,7 +834,7 @@ def setUp(self): self.need_allclose = False - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O1" self.run_n2c4(self.run_pretrain_file, **train_args) @@ -849,7 +855,7 @@ def setUp(self): self.need_allclose = False - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O2" self.run_n2c4(self.run_pretrain_file, **train_args) @@ -866,7 +872,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n2c4(self.run_pretrain_file, **train_args) @@ -886,7 +892,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -909,7 +915,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) move_checkpoint_N1C8_to_N2C4() @@ -937,7 +943,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O1" self.run_n1c8(self.run_pretrain_file, **train_args) move_checkpoint_N1C8_to_N2C4() @@ -967,7 +973,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O2" self.run_n1c8(self.run_pretrain_file, **train_args) move_checkpoint_N1C8_to_N2C4() @@ -995,7 +1001,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) move_checkpoint_N1C8_to_N2C4() @@ -1023,7 +1029,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) move_checkpoint_N2C4_to_N1C8() @@ -1051,7 +1057,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O1" self.run_n2c4(self.run_pretrain_file, **train_args) move_checkpoint_N2C4_to_N1C8() @@ -1081,7 +1087,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O2" self.run_n2c4(self.run_pretrain_file, **train_args) move_checkpoint_N2C4_to_N1C8() @@ -1109,7 +1115,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) move_checkpoint_N2C4_to_N1C8() @@ -1123,6 +1129,7 @@ def rerun(self, train_args): np.testing.assert_allclose(res[0], res[-1], rtol=self.rtol) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN1C8EnableAll(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -1133,7 +1140,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -1153,7 +1160,7 @@ def setUp(self): self.need_allclose = False self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, log_dir="log_uc", **train_args) def rerun(self, train_args): @@ -1172,7 +1179,7 @@ def setUp(self): self.need_allclose = False self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, log_dir="log_pd", **train_args) def rerun(self, train_args): diff --git a/tests/transformers/test_safetensors.py b/tests/transformers/test_safetensors.py index 3c143e26a0b5..85b291e42349 100644 --- a/tests/transformers/test_safetensors.py +++ b/tests/transformers/test_safetensors.py @@ -28,7 +28,14 @@ class FastSafetensors(unittest.TestCase): def setUp(self): super().setUp() self.weigth_map = {} - tensors = [([10, 10], "float32"), ([8], "float16"), ([5, 5, 5], "int32")] + tensors = [ + ([10, 1, 10], "float32"), + ([1, 1, 10], "float32"), + ([1, 1, 1, 10], "float32"), + ([10, 10], "float32"), + ([8], "float16"), + ([5, 5, 5], "int32"), + ] count = 0 for shape, dtype in tensors: self.weigth_map[f"weight_{count}"] = (np.random.random(shape) * 100).astype(dtype) @@ -53,5 +60,10 @@ def test_safe_open(self): with fast_safe_open(path, framework="np") as f: for key in f.keys(): safe_slice = f.get_slice(key) + # np.testing.assert_equal(self.weigth_map[key][2:1, ...], safe_slice[2:1, ...]) + np.testing.assert_equal(self.weigth_map[key][0, ...], safe_slice[0, ...]) + np.testing.assert_equal(self.weigth_map[key][0:1, ...], safe_slice[0:1, ...]) + np.testing.assert_equal(self.weigth_map[key][..., 2:], safe_slice[..., 2:]) + np.testing.assert_equal(self.weigth_map[key][..., 1], safe_slice[..., 1]) np.testing.assert_equal(self.weigth_map[key][:2, ...], safe_slice[:2, ...]) np.testing.assert_equal(self.weigth_map[key][..., :4], safe_slice[..., :4])