From a5ed9ed35056b273a1d18095acc96778d6bca962 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Thu, 9 May 2024 12:03:32 +0800 Subject: [PATCH 01/12] update --- paddlenlp/transformers/llama/modeling.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index ac3074aae7ce..be88d6af74dc 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -98,9 +98,6 @@ def swiglu(x, y=None): ] -npu_is_casual = False - - def _get_interleave(n): def _get_interleave_power_of_2(n): start = 2 ** (-(2 ** -(math.log2(n) - 3))) @@ -212,6 +209,7 @@ def scaled_dot_product_attention( alibi=None, sequence_parallel=False, reshard_layer=None, + npu_is_casual=False, ): bsz, q_len, num_heads, head_dim = query_states.shape _, kv_seq_len, _, _ = value_states.shape @@ -851,6 +849,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, alibi: Optional[paddle.Tensor] = None, + npu_is_casual: bool = False, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) @@ -1078,6 +1077,7 @@ def forward( alibi, self.sequence_parallel, reshard_layer=self.reshard_layer, + npu_is_casual=npu_is_casual, ) if output_attentions: attn_output, attn_weights = outputs @@ -1130,6 +1130,7 @@ def forward( past_key_value: Optional[Tuple[paddle.Tensor]] = None, use_cache: Optional[bool] = False, alibi: Optional[paddle.Tensor] = None, + npu_is_casual: bool = False, ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: """ Args: @@ -1177,6 +1178,7 @@ def forward( output_attentions, use_cache, alibi, + npu_is_casual=npu_is_casual, ) if type(outputs) is tuple: @@ -1614,6 +1616,7 @@ def forward( attention_mask = self._prepare_decoder_attention_mask( 
attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype ) # [bs, 1, seq_len, seq_len] + is_casual = False if self.config.use_flash_attention: is_casual = is_casual_mask(attention_mask) if get_env_device() != "npu": @@ -1658,6 +1661,7 @@ def forward( past_key_value, use_cache, alibi=alibi, + npu_is_casual=is_casual, ) # NOTE: clear outdate cache after it has been used for memory saving From bd0aa8773f6df5b4b783b6561b6ec23efdbaf006 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Thu, 9 May 2024 13:33:51 +0800 Subject: [PATCH 02/12] add llama-npu-opt-script --- llm/llama/npu/dev_opt_lora.sh | 86 +++++++++++++++++++++++++++++++++ llm/llama/npu/dev_opt_ppt.sh | 91 +++++++++++++++++++++++++++++++++++ llm/llama/npu/dev_opt_sft.sh | 81 +++++++++++++++++++++++++++++++ 3 files changed, 258 insertions(+) create mode 100644 llm/llama/npu/dev_opt_lora.sh create mode 100644 llm/llama/npu/dev_opt_ppt.sh create mode 100644 llm/llama/npu/dev_opt_sft.sh diff --git a/llm/llama/npu/dev_opt_lora.sh b/llm/llama/npu/dev_opt_lora.sh new file mode 100644 index 000000000000..09719c572eee --- /dev/null +++ b/llm/llama/npu/dev_opt_lora.sh @@ -0,0 +1,86 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +max_steps=${1:-1000} +lock_seed_flag=${2:-close} +if [[ ${lock_seed_flag} =~ "open_lock_seed" ]];then + export npu_deterministic=true + export ACL_OP_DETERMINISTIC=true + export ACL_OPT_DETERMINISTIC=true + export HCCL_DETERMINISTIC=true +fi +echo lock_seed_flag +echo $lock_seed_flag +echo npu_deterministic ACL_OP_DETERMINISTIC ACL_OPT_DETERMINISTIC HCCL_DETERMINISTIC +echo $npu_deterministic $ACL_OP_DETERMINISTIC $ACL_OPT_DETERMINISTIC $HCCL_DETERMINISTIC + +export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +export FLAGS_use_stride_kernel=0 +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +export FLAGS_NPU_MC2=1 +export MC2_Recompute=1 +export MC2=1 +export FLAGS_allocator_strategy=naive_best_fit +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +#240411新增 +# export MC2=1 + +rm -rf lora_bf16_llama_N1C8 +rm -rf output/lora_bf16_llama_N1C8 +ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +export PYTHONPATH=../../:$PYTHONPATH +python -u -m paddle.distributed.launch \ + --devices "0,1,2,3,4,5,6,7" \ + --log_dir "./lora_bf16_llama_N1C8" \ + ../finetune_generation.py \ + --device "npu" \ + --model_name_or_path "meta-llama/Llama-2-13b" \ + --dataset_name_or_path "data/" \ + --output_dir "./output/lora_bf16_llama_N1C8" \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 16 \ + --per_device_eval_batch_size 1 \ + --eval_accumulation_steps 1 \ + --max_steps ${max_steps} \ + --decay_steps 2000 \ + --learning_rate 3e-06 \ + --warmup_steps 2 \ + --save_steps 1000 \ + --logging_steps 1 \ + --evaluation_strategy "epoch" \ + --src_length 1024 \ + --max_length 4096 \ + --bf16 true \ + --fp16_opt_level "O2" \ + --do_train true \ + --disable_tqdm true \ + --eval_with_do_generation false \ + --metric_for_best_model "accuracy" \ + --recompute false \ + --tensor_parallel_degree 8 \ + --pipeline_parallel_degree 1 \ + --zero_padding 0 
\ + --sequence_parallel 1 \ + --amp_master_grad true \ + --fuse_attention_qkv true \ + --fuse_attention_ffn true \ + --use_flash_attention 1 \ + --use_fused_rope 1 \ + --use_fused_rms_norm 1 \ + --lora true \ + --lora_rank 32 \ + --pad_to_multiple_of 4096 \ No newline at end of file diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/dev_opt_ppt.sh new file mode 100644 index 000000000000..b0cfb12f3223 --- /dev/null +++ b/llm/llama/npu/dev_opt_ppt.sh @@ -0,0 +1,91 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +max_steps=${1:-800} + +set -x +ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9 +rm -rf ./log_8.0 +rm -rf output +export PYTHONPATH=../:$PYTHONPATH +export MC2=1 +export GLOG_v=0 +export FLAGS_npu_storage_format=1 +export HCCL_INTRA_PCIE_ENABLE=0 +export HCCL_INTRA_ROCE_ENABLE=1 +export FLAGS_allocator_strategy=naive_best_fit +export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +export FLAGS_NPU_MC2=1 +export MC2_Recompute=1 +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS + +#240411新增 +export FLAGS_use_stride_kernel=0 +export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE +export MULTI_STREAM_MEMORY_REUSE=1 + +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +python -u -m paddle.distributed.launch \ + --log_dir "./log_8.0" \ + run_pretrain.py \ + --model_name_or_path "meta-llama/Llama-2-13b" \ + --tokenizer_name_or_path "meta-llama/Llama-2-13b" \ + --input_dir "./pre-data" \ + --output_dir "./output" \ + --split 949,50,1 \ + --max_seq_length 4096 \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 32 \ + --per_device_eval_batch_size 1 \ + --use_flash_attention 1 \ + --use_fused_rms_norm 1 \ + --virtual_pp_degree 1 \ + --learning_rate 0.00001 \ + --min_learning_rate 0.000001 \ + --max_steps ${max_steps} \ + --decay_steps 2000 \ + --save_steps 2000 \ + --seed 100 \ + --weight_decay 0.01 \ + --warmup_steps 20 \ + --max_grad_norm 1.0 \ + --logging_steps 1 \ + --dataloader_num_workers 1 \ + --eval_steps 1001 \ + --tensor_parallel_degree 4 \ + --disable_tqdm true \ + --continue_training 0 \ + --do_train \ + --device "npu" \ + --enable_linear_fused_grad_add false \ + --fuse_attention_qkv true \ + --fuse_attention_ffn true \ + --use_fused_rope true \ + --recompute_use_reentrant true \ + --data_cache "./data_cache" \ + --bf16 \ + --fp16_opt_level "O2" \ + --amp_master_grad \ + --load_sharded_model true \ + --save_sharded_model true \ + --pipeline_parallel_degree 1 \ + --ignore_data_skip 0 \ + --force_reshard_pp 
true \ + --tensor_parallel_config "enable_mp_async_allreduce enable_mp_skip_c_identity" \ + --sequence_parallel 1 \ + --pipeline_parallel_config "disable_partial_send_recv" \ + --sharding "stage1" \ + --sharding_parallel_degree 2 diff --git a/llm/llama/npu/dev_opt_sft.sh b/llm/llama/npu/dev_opt_sft.sh new file mode 100644 index 000000000000..3a72d24721b6 --- /dev/null +++ b/llm/llama/npu/dev_opt_sft.sh @@ -0,0 +1,81 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +export FLAGS_use_stride_kernel=0 +export FLAGS_npu_storage_format=1 +export HCCL_INTRA_PCIE_ENABLE=0 +export HCCL_INTRA_ROCE_ENABLE=1 +export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" + +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +export GLOG_v=0 +export FLAGS_NPU_MC2=1 +export MC2_Recompute=1 +export MC2=1 +export FLAGS_allocator_strategy=naive_best_fit +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +#240411新增 +# export MC2=1 +export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE +export MULTI_STREAM_MEMORY_REUSE=1 + +export PYTHONPATH=../../:$PYTHONPATH +rm -rf sft_bf16_llama_N1C8 +rm -rf output/sft_bf16_llama_N1C8 +ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +python -u -m paddle.distributed.launch \ + --devices "0,1,2,3,4,5,6,7" \ + --log_dir "./sft_bf16_llama_N1C8" \ + ../finetune_generation.py \ + --device "npu" \ + --model_name_or_path "meta-llama/Llama-2-13b" \ + --dataset_name_or_path "data/" \ + --output_dir "./output/sft_bf16_llama_N1C8" \ + --logging_dir "./sft_logs" \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 32 \ + --per_device_eval_batch_size 1 \ + --eval_accumulation_steps 1 \ + --max_steps 2000 \ + --learning_rate 3e-06 \ + --warmup_steps 2 \ + --save_steps 1000 \ + --logging_steps 1 \ + --evaluation_strategy "epoch" \ + --src_length 1024 \ + --max_length 4096 \ + --fp16 true \ + --fp16_opt_level "O2" \ + --do_train true \ + --disable_tqdm true \ + --eval_with_do_generation false \ + --metric_for_best_model "accuracy" \ + --recompute false \ + --tensor_parallel_degree 4 \ + --pipeline_parallel_degree 1 \ + --zero_padding 0 \ + --amp_master_grad true \ + --fuse_attention_qkv true \ + --fuse_attention_ffn true \ + --sequence_parallel 1 \ + --use_flash_attention 1 \ + --use_fused_rope 1 \ + --use_fused_rms_norm 1 \ + --sharding_parallel_degree 2 \ + --pad_to_multiple_of 4096 \ + --sharding 
"stage1" \ + --sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \ No newline at end of file From cc2413268e35b31f23c64c202845ec73d4688586 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Thu, 9 May 2024 13:52:27 +0800 Subject: [PATCH 03/12] Update dev_opt_lora.sh --- llm/llama/npu/dev_opt_lora.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm/llama/npu/dev_opt_lora.sh b/llm/llama/npu/dev_opt_lora.sh index 09719c572eee..8da29cbbb788 100644 --- a/llm/llama/npu/dev_opt_lora.sh +++ b/llm/llama/npu/dev_opt_lora.sh @@ -42,7 +42,7 @@ rm -rf lora_bf16_llama_N1C8 rm -rf output/lora_bf16_llama_N1C8 ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9 -export PYTHONPATH=../../:$PYTHONPATH +export PYTHONPATH=../../../:$PYTHONPATH python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ --log_dir "./lora_bf16_llama_N1C8" \ @@ -83,4 +83,4 @@ python -u -m paddle.distributed.launch \ --use_fused_rms_norm 1 \ --lora true \ --lora_rank 32 \ - --pad_to_multiple_of 4096 \ No newline at end of file + --pad_to_multiple_of 4096 From 036d03c401b811c7295ac47ff1dc47bbef2a8fa2 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Thu, 9 May 2024 13:52:57 +0800 Subject: [PATCH 04/12] Update dev_opt_ppt.sh --- llm/llama/npu/dev_opt_ppt.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/dev_opt_ppt.sh index b0cfb12f3223..81365289bef7 100644 --- a/llm/llama/npu/dev_opt_ppt.sh +++ b/llm/llama/npu/dev_opt_ppt.sh @@ -18,7 +18,7 @@ set -x ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9 rm -rf ./log_8.0 rm -rf output -export PYTHONPATH=../:$PYTHONPATH +export PYTHONPATH=../../../:$PYTHONPATH export MC2=1 export GLOG_v=0 export 
FLAGS_npu_storage_format=1 From 8dd2d020ac6c180368952a2f2f62426a1b16ad8f Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Thu, 9 May 2024 13:54:05 +0800 Subject: [PATCH 05/12] Update dev_opt_lora.sh --- llm/llama/npu/dev_opt_lora.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llama/npu/dev_opt_lora.sh b/llm/llama/npu/dev_opt_lora.sh index 8da29cbbb788..981a833d9c56 100644 --- a/llm/llama/npu/dev_opt_lora.sh +++ b/llm/llama/npu/dev_opt_lora.sh @@ -46,7 +46,7 @@ export PYTHONPATH=../../../:$PYTHONPATH python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ --log_dir "./lora_bf16_llama_N1C8" \ - ../finetune_generation.py \ + ../../finetune_generation.py \ --device "npu" \ --model_name_or_path "meta-llama/Llama-2-13b" \ --dataset_name_or_path "data/" \ From 96e69aa7faa8c58f5e003bdc658ab7a970dad1a6 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Thu, 9 May 2024 13:54:33 +0800 Subject: [PATCH 06/12] Update dev_opt_ppt.sh --- llm/llama/npu/dev_opt_ppt.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/dev_opt_ppt.sh index 81365289bef7..4db9d6a728a1 100644 --- a/llm/llama/npu/dev_opt_ppt.sh +++ b/llm/llama/npu/dev_opt_ppt.sh @@ -40,7 +40,7 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh python -u -m paddle.distributed.launch \ --log_dir "./log_8.0" \ - run_pretrain.py \ + ../run_pretrain.py \ --model_name_or_path "meta-llama/Llama-2-13b" \ --tokenizer_name_or_path "meta-llama/Llama-2-13b" \ --input_dir "./pre-data" \ From a35ba59e291b631b5ee37d40da07fed8a3c2561d Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Thu, 9 May 2024 13:55:20 +0800 Subject: [PATCH 07/12] Update dev_opt_sft.sh --- llm/llama/npu/dev_opt_sft.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llm/llama/npu/dev_opt_sft.sh 
b/llm/llama/npu/dev_opt_sft.sh index 3a72d24721b6..bce6867d234a 100644 --- a/llm/llama/npu/dev_opt_sft.sh +++ b/llm/llama/npu/dev_opt_sft.sh @@ -32,7 +32,7 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE export MULTI_STREAM_MEMORY_REUSE=1 -export PYTHONPATH=../../:$PYTHONPATH +export PYTHONPATH=../../../:$PYTHONPATH rm -rf sft_bf16_llama_N1C8 rm -rf output/sft_bf16_llama_N1C8 ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 @@ -40,7 +40,7 @@ ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill - python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ --log_dir "./sft_bf16_llama_N1C8" \ - ../finetune_generation.py \ + ../../finetune_generation.py \ --device "npu" \ --model_name_or_path "meta-llama/Llama-2-13b" \ --dataset_name_or_path "data/" \ @@ -78,4 +78,4 @@ python -u -m paddle.distributed.launch \ --sharding_parallel_degree 2 \ --pad_to_multiple_of 4096 \ --sharding "stage1" \ - --sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \ No newline at end of file + --sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" From 68388a7f09c5ed8197161f9f9b23044c2ade2b3b Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Sat, 11 May 2024 18:48:02 +0800 Subject: [PATCH 08/12] Rename dev_opt_lora.sh to llama_npu_opt_lora.sh --- llm/llama/npu/{dev_opt_lora.sh => llama_npu_opt_lora.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llm/llama/npu/{dev_opt_lora.sh => llama_npu_opt_lora.sh} (100%) diff --git a/llm/llama/npu/dev_opt_lora.sh b/llm/llama/npu/llama_npu_opt_lora.sh similarity index 100% rename from llm/llama/npu/dev_opt_lora.sh rename to llm/llama/npu/llama_npu_opt_lora.sh From fee8f04a006de61a589a7ff9888b8ee414f5a2e5 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Sat, 11 May 2024 18:48:41 +0800 Subject: 
[PATCH 09/12] Update dev_opt_ppt.sh --- llm/llama/npu/dev_opt_ppt.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/dev_opt_ppt.sh index 4db9d6a728a1..d3a082f49c0d 100644 --- a/llm/llama/npu/dev_opt_ppt.sh +++ b/llm/llama/npu/dev_opt_ppt.sh @@ -31,7 +31,6 @@ export MC2_Recompute=1 unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS -#240411新增 export FLAGS_use_stride_kernel=0 export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE export MULTI_STREAM_MEMORY_REUSE=1 From 783de3b70952f519d56709110fbe2a227511f34a Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Sat, 11 May 2024 18:49:19 +0800 Subject: [PATCH 10/12] Rename dev_opt_ppt.sh to llama_npu_opt_ppt.sh --- llm/llama/npu/{dev_opt_ppt.sh => llama_npu_opt_ppt.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llm/llama/npu/{dev_opt_ppt.sh => llama_npu_opt_ppt.sh} (100%) diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/llama_npu_opt_ppt.sh similarity index 100% rename from llm/llama/npu/dev_opt_ppt.sh rename to llm/llama/npu/llama_npu_opt_ppt.sh From 10f94155fdce1397389e7daa66e80b52f29eceef Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Sat, 11 May 2024 18:50:01 +0800 Subject: [PATCH 11/12] Update llama_npu_opt_lora.sh --- llm/llama/npu/llama_npu_opt_lora.sh | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/llm/llama/npu/llama_npu_opt_lora.sh b/llm/llama/npu/llama_npu_opt_lora.sh index 981a833d9c56..fd1b004a8af5 100644 --- a/llm/llama/npu/llama_npu_opt_lora.sh +++ b/llm/llama/npu/llama_npu_opt_lora.sh @@ -13,17 +13,6 @@ # limitations under the License. 
max_steps=${1:-1000} -lock_seed_flag=${2:-close} -if [[ ${lock_seed_flag} =~ "open_lock_seed" ]];then - export npu_deterministic=true - export ACL_OP_DETERMINISTIC=true - export ACL_OPT_DETERMINISTIC=true - export HCCL_DETERMINISTIC=true -fi -echo lock_seed_flag -echo $lock_seed_flag -echo npu_deterministic ACL_OP_DETERMINISTIC ACL_OPT_DETERMINISTIC HCCL_DETERMINISTIC -echo $npu_deterministic $ACL_OP_DETERMINISTIC $ACL_OPT_DETERMINISTIC $HCCL_DETERMINISTIC export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" export FLAGS_use_stride_kernel=0 @@ -35,13 +24,10 @@ export MC2=1 export FLAGS_allocator_strategy=naive_best_fit source /usr/local/Ascend/ascend-toolkit/set_env.sh -#240411新增 -# export MC2=1 rm -rf lora_bf16_llama_N1C8 rm -rf output/lora_bf16_llama_N1C8 -ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 -ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9 export PYTHONPATH=../../../:$PYTHONPATH python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ From f3d96e519e7907f38ae1dff3d94bd77c8698f8ae Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Sat, 11 May 2024 18:51:04 +0800 Subject: [PATCH 12/12] Update and rename dev_opt_sft.sh to llama_npu_opt_sft.sh --- llm/llama/npu/{dev_opt_sft.sh => llama_npu_opt_sft.sh} | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) rename llm/llama/npu/{dev_opt_sft.sh => llama_npu_opt_sft.sh} (93%) diff --git a/llm/llama/npu/dev_opt_sft.sh b/llm/llama/npu/llama_npu_opt_sft.sh similarity index 93% rename from llm/llama/npu/dev_opt_sft.sh rename to llm/llama/npu/llama_npu_opt_sft.sh index bce6867d234a..786e6cf835aa 100644 --- a/llm/llama/npu/dev_opt_sft.sh +++ b/llm/llama/npu/llama_npu_opt_sft.sh @@ -27,16 +27,13 @@ export MC2=1 export FLAGS_allocator_strategy=naive_best_fit source /usr/local/Ascend/ascend-toolkit/set_env.sh 
-#240411新增 -# export MC2=1 export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE export MULTI_STREAM_MEMORY_REUSE=1 export PYTHONPATH=../../../:$PYTHONPATH rm -rf sft_bf16_llama_N1C8 rm -rf output/sft_bf16_llama_N1C8 -ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 -ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9 python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ --log_dir "./sft_bf16_llama_N1C8" \