From a5ed9ed35056b273a1d18095acc96778d6bca962 Mon Sep 17 00:00:00 2001
From: Yangrl <2535184404@qq.com>
Date: Thu, 9 May 2024 12:03:32 +0800
Subject: [PATCH 01/12] llama: pass npu_is_casual as an argument instead of a module-level global

---
 paddlenlp/transformers/llama/modeling.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py
index ac3074aae7ce..be88d6af74dc 100755
--- a/paddlenlp/transformers/llama/modeling.py
+++ b/paddlenlp/transformers/llama/modeling.py
@@ -98,9 +98,6 @@ def swiglu(x, y=None):
 ]
 
 
-npu_is_casual = False
-
-
 def _get_interleave(n):
     def _get_interleave_power_of_2(n):
         start = 2 ** (-(2 ** -(math.log2(n) - 3)))
@@ -212,6 +209,7 @@ def scaled_dot_product_attention(
     alibi=None,
     sequence_parallel=False,
     reshard_layer=None,
+    npu_is_casual=False,
 ):
     bsz, q_len, num_heads, head_dim = query_states.shape
     _, kv_seq_len, _, _ = value_states.shape
@@ -851,6 +849,7 @@ def forward(
         output_attentions: bool = False,
         use_cache: bool = False,
         alibi: Optional[paddle.Tensor] = None,
+        npu_is_casual: bool = False,
     ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
         """Input shape: Batch x Time x Channel"""
         # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism)
@@ -1078,6 +1077,7 @@ def forward(
                 alibi,
                 self.sequence_parallel,
                 reshard_layer=self.reshard_layer,
+                npu_is_casual=npu_is_casual,
             )
         if output_attentions:
             attn_output, attn_weights = outputs
@@ -1130,6 +1130,7 @@ def forward(
         past_key_value: Optional[Tuple[paddle.Tensor]] = None,
         use_cache: Optional[bool] = False,
         alibi: Optional[paddle.Tensor] = None,
+        npu_is_casual: bool = False,
     ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]:
         """
         Args:
@@ -1177,6 +1178,7 @@ def forward(
                 output_attentions,
                 use_cache,
                 alibi,
+                npu_is_casual=npu_is_casual,
             )
 
         if type(outputs) is tuple:
@@ -1614,6 +1616,7 @@ def forward(
         attention_mask = self._prepare_decoder_attention_mask(
             attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype
         )  # [bs, 1, seq_len, seq_len]
+        is_casual = False
         if self.config.use_flash_attention:
             is_casual = is_casual_mask(attention_mask)
             if get_env_device() != "npu":
@@ -1658,6 +1661,7 @@ def forward(
                     past_key_value,
                     use_cache,
                     alibi=alibi,
+                    npu_is_casual=is_casual,
                 )
 
             # NOTE: clear outdate cache after it has been used for memory saving

From bd0aa8773f6df5b4b783b6561b6ec23efdbaf006 Mon Sep 17 00:00:00 2001
From: Yangrl <2535184404@qq.com>
Date: Thu, 9 May 2024 13:33:51 +0800
Subject: [PATCH 02/12] add llama-npu-opt-script

---
 llm/llama/npu/dev_opt_lora.sh | 86 +++++++++++++++++++++++++++++++++
 llm/llama/npu/dev_opt_ppt.sh  | 91 +++++++++++++++++++++++++++++++++++
 llm/llama/npu/dev_opt_sft.sh  | 81 +++++++++++++++++++++++++++++++
 3 files changed, 258 insertions(+)
 create mode 100644 llm/llama/npu/dev_opt_lora.sh
 create mode 100644 llm/llama/npu/dev_opt_ppt.sh
 create mode 100644 llm/llama/npu/dev_opt_sft.sh

diff --git a/llm/llama/npu/dev_opt_lora.sh b/llm/llama/npu/dev_opt_lora.sh
new file mode 100644
index 000000000000..09719c572eee
--- /dev/null
+++ b/llm/llama/npu/dev_opt_lora.sh
@@ -0,0 +1,86 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+max_steps=${1:-1000}
+lock_seed_flag=${2:-close}
+if [[ ${lock_seed_flag} =~ "open_lock_seed" ]];then
+    export npu_deterministic=true
+    export ACL_OP_DETERMINISTIC=true
+    export ACL_OPT_DETERMINISTIC=true
+    export HCCL_DETERMINISTIC=true
+fi
+echo lock_seed_flag 
+echo $lock_seed_flag
+echo npu_deterministic ACL_OP_DETERMINISTIC ACL_OPT_DETERMINISTIC HCCL_DETERMINISTIC
+echo $npu_deterministic $ACL_OP_DETERMINISTIC $ACL_OPT_DETERMINISTIC $HCCL_DETERMINISTIC
+
+export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+export FLAGS_use_stride_kernel=0
+unset PADDLE_TRAINER_ENDPOINTS
+unset DISTRIBUTED_TRAINER_ENDPOINTS
+export FLAGS_NPU_MC2=1
+export MC2_Recompute=1
+export MC2=1
+export FLAGS_allocator_strategy=naive_best_fit
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+
+#240411新增
+# export MC2=1
+
+rm -rf lora_bf16_llama_N1C8
+rm -rf output/lora_bf16_llama_N1C8
+ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9
+export PYTHONPATH=../../:$PYTHONPATH
+python -u  -m paddle.distributed.launch \
+    --devices "0,1,2,3,4,5,6,7" \
+    --log_dir "./lora_bf16_llama_N1C8" \
+    ../finetune_generation.py \
+    --device "npu" \
+    --model_name_or_path "meta-llama/Llama-2-13b" \
+    --dataset_name_or_path "data/" \
+    --output_dir "./output/lora_bf16_llama_N1C8" \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 16 \
+    --per_device_eval_batch_size 1 \
+    --eval_accumulation_steps 1 \
+    --max_steps ${max_steps} \
+    --decay_steps 2000 \
+    --learning_rate 3e-06 \
+    --warmup_steps 2 \
+    --save_steps 1000 \
+    --logging_steps 1 \
+    --evaluation_strategy "epoch" \
+    --src_length 1024 \
+    --max_length 4096 \
+    --bf16 true \
+    --fp16_opt_level "O2" \
+    --do_train true \
+    --disable_tqdm true \
+    --eval_with_do_generation false \
+    --metric_for_best_model "accuracy" \
+    --recompute false \
+    --tensor_parallel_degree 8 \
+    --pipeline_parallel_degree 1 \
+    --zero_padding 0 \
+    --sequence_parallel 1 \
+    --amp_master_grad true \
+    --fuse_attention_qkv true \
+    --fuse_attention_ffn true \
+    --use_flash_attention 1 \
+    --use_fused_rope 1 \
+    --use_fused_rms_norm 1 \
+    --lora true \
+    --lora_rank 32 \
+    --pad_to_multiple_of 4096
\ No newline at end of file
diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/dev_opt_ppt.sh
new file mode 100644
index 000000000000..b0cfb12f3223
--- /dev/null
+++ b/llm/llama/npu/dev_opt_ppt.sh
@@ -0,0 +1,91 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+max_steps=${1:-800}
+
+set -x
+ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9
+rm -rf ./log_8.0
+rm -rf output
+export PYTHONPATH=../:$PYTHONPATH
+export MC2=1
+export GLOG_v=0
+export FLAGS_npu_storage_format=1
+export HCCL_INTRA_PCIE_ENABLE=0
+export HCCL_INTRA_ROCE_ENABLE=1
+export FLAGS_allocator_strategy=naive_best_fit
+export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+export FLAGS_NPU_MC2=1
+export MC2_Recompute=1
+unset PADDLE_TRAINER_ENDPOINTS
+unset DISTRIBUTED_TRAINER_ENDPOINTS
+
+#240411新增
+export FLAGS_use_stride_kernel=0
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export MULTI_STREAM_MEMORY_REUSE=1
+
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+
+python -u  -m paddle.distributed.launch \
+    --log_dir "./log_8.0" \
+    run_pretrain.py \
+    --model_name_or_path "meta-llama/Llama-2-13b" \
+    --tokenizer_name_or_path "meta-llama/Llama-2-13b" \
+    --input_dir "./pre-data" \
+    --output_dir "./output" \
+    --split 949,50,1 \
+    --max_seq_length 4096 \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 32 \
+    --per_device_eval_batch_size 1 \
+    --use_flash_attention 1 \
+    --use_fused_rms_norm 1 \
+    --virtual_pp_degree 1 \
+    --learning_rate 0.00001 \
+    --min_learning_rate 0.000001 \
+    --max_steps ${max_steps} \
+    --decay_steps 2000 \
+    --save_steps 2000 \
+    --seed 100 \
+    --weight_decay 0.01 \
+    --warmup_steps 20 \
+    --max_grad_norm 1.0 \
+    --logging_steps 1 \
+    --dataloader_num_workers 1 \
+    --eval_steps 1001 \
+    --tensor_parallel_degree 4 \
+    --disable_tqdm true \
+    --continue_training 0 \
+    --do_train \
+    --device "npu" \
+    --enable_linear_fused_grad_add false \
+    --fuse_attention_qkv true \
+    --fuse_attention_ffn true \
+    --use_fused_rope true \
+    --recompute_use_reentrant true \
+    --data_cache "./data_cache" \
+    --bf16 \
+    --fp16_opt_level "O2" \
+    --amp_master_grad \
+    --load_sharded_model true \
+    --save_sharded_model true \
+    --pipeline_parallel_degree 1 \
+    --ignore_data_skip 0 \
+    --force_reshard_pp true \
+    --tensor_parallel_config "enable_mp_async_allreduce enable_mp_skip_c_identity" \
+    --sequence_parallel 1 \
+    --pipeline_parallel_config "disable_partial_send_recv" \
+    --sharding "stage1" \
+    --sharding_parallel_degree 2
diff --git a/llm/llama/npu/dev_opt_sft.sh b/llm/llama/npu/dev_opt_sft.sh
new file mode 100644
index 000000000000..3a72d24721b6
--- /dev/null
+++ b/llm/llama/npu/dev_opt_sft.sh
@@ -0,0 +1,81 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export FLAGS_use_stride_kernel=0
+export FLAGS_npu_storage_format=1
+export HCCL_INTRA_PCIE_ENABLE=0
+export HCCL_INTRA_ROCE_ENABLE=1
+export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+
+unset PADDLE_TRAINER_ENDPOINTS
+unset DISTRIBUTED_TRAINER_ENDPOINTS
+export GLOG_v=0
+export FLAGS_NPU_MC2=1
+export MC2_Recompute=1
+export MC2=1
+export FLAGS_allocator_strategy=naive_best_fit
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+
+#240411新增
+# export MC2=1
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export MULTI_STREAM_MEMORY_REUSE=1
+
+export PYTHONPATH=../../:$PYTHONPATH
+rm -rf sft_bf16_llama_N1C8
+rm -rf output/sft_bf16_llama_N1C8
+ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9
+python -u  -m paddle.distributed.launch \
+    --devices "0,1,2,3,4,5,6,7" \
+    --log_dir "./sft_bf16_llama_N1C8" \
+    ../finetune_generation.py \
+    --device "npu" \
+    --model_name_or_path "meta-llama/Llama-2-13b" \
+    --dataset_name_or_path "data/" \
+    --output_dir "./output/sft_bf16_llama_N1C8" \
+    --logging_dir "./sft_logs" \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 32 \
+    --per_device_eval_batch_size 1 \
+    --eval_accumulation_steps 1 \
+    --max_steps 2000 \
+    --learning_rate 3e-06 \
+    --warmup_steps 2 \
+    --save_steps 1000 \
+    --logging_steps 1 \
+    --evaluation_strategy "epoch" \
+    --src_length 1024 \
+    --max_length 4096 \
+    --fp16 true \
+    --fp16_opt_level "O2" \
+    --do_train true \
+    --disable_tqdm true \
+    --eval_with_do_generation false \
+    --metric_for_best_model "accuracy" \
+    --recompute false \
+    --tensor_parallel_degree 4 \
+    --pipeline_parallel_degree 1 \
+    --zero_padding 0 \
+    --amp_master_grad true \
+    --fuse_attention_qkv true \
+    --fuse_attention_ffn true \
+    --sequence_parallel 1 \
+    --use_flash_attention 1 \
+    --use_fused_rope 1 \
+    --use_fused_rms_norm 1 \
+        --sharding_parallel_degree 2 \
+    --pad_to_multiple_of 4096 \
+        --sharding "stage1" \
+        --sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap"
\ No newline at end of file

From cc2413268e35b31f23c64c202845ec73d4688586 Mon Sep 17 00:00:00 2001
From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com>
Date: Thu, 9 May 2024 13:52:27 +0800
Subject: [PATCH 03/12] Update dev_opt_lora.sh

---
 llm/llama/npu/dev_opt_lora.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llm/llama/npu/dev_opt_lora.sh b/llm/llama/npu/dev_opt_lora.sh
index 09719c572eee..8da29cbbb788 100644
--- a/llm/llama/npu/dev_opt_lora.sh
+++ b/llm/llama/npu/dev_opt_lora.sh
@@ -42,7 +42,7 @@ rm -rf lora_bf16_llama_N1C8
 rm -rf output/lora_bf16_llama_N1C8
 ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9
-export PYTHONPATH=../../:$PYTHONPATH
+export PYTHONPATH=../../../:$PYTHONPATH
 python -u  -m paddle.distributed.launch \
     --devices "0,1,2,3,4,5,6,7" \
     --log_dir "./lora_bf16_llama_N1C8" \
@@ -83,4 +83,4 @@ python -u  -m paddle.distributed.launch \
     --use_fused_rms_norm 1 \
     --lora true \
     --lora_rank 32 \
-    --pad_to_multiple_of 4096
\ No newline at end of file
+    --pad_to_multiple_of 4096

From 036d03c401b811c7295ac47ff1dc47bbef2a8fa2 Mon Sep 17 00:00:00 2001
From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com>
Date: Thu, 9 May 2024 13:52:57 +0800
Subject: [PATCH 04/12] Update dev_opt_ppt.sh

---
 llm/llama/npu/dev_opt_ppt.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/dev_opt_ppt.sh
index b0cfb12f3223..81365289bef7 100644
--- a/llm/llama/npu/dev_opt_ppt.sh
+++ b/llm/llama/npu/dev_opt_ppt.sh
@@ -18,7 +18,7 @@ set -x
 ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9
 rm -rf ./log_8.0
 rm -rf output
-export PYTHONPATH=../:$PYTHONPATH
+export PYTHONPATH=../../../:$PYTHONPATH
 export MC2=1
 export GLOG_v=0
 export FLAGS_npu_storage_format=1

From 8dd2d020ac6c180368952a2f2f62426a1b16ad8f Mon Sep 17 00:00:00 2001
From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com>
Date: Thu, 9 May 2024 13:54:05 +0800
Subject: [PATCH 05/12] Update dev_opt_lora.sh

---
 llm/llama/npu/dev_opt_lora.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm/llama/npu/dev_opt_lora.sh b/llm/llama/npu/dev_opt_lora.sh
index 8da29cbbb788..981a833d9c56 100644
--- a/llm/llama/npu/dev_opt_lora.sh
+++ b/llm/llama/npu/dev_opt_lora.sh
@@ -46,7 +46,7 @@ export PYTHONPATH=../../../:$PYTHONPATH
 python -u  -m paddle.distributed.launch \
     --devices "0,1,2,3,4,5,6,7" \
     --log_dir "./lora_bf16_llama_N1C8" \
-    ../finetune_generation.py \
+    ../../finetune_generation.py \
     --device "npu" \
     --model_name_or_path "meta-llama/Llama-2-13b" \
     --dataset_name_or_path "data/" \

From 96e69aa7faa8c58f5e003bdc658ab7a970dad1a6 Mon Sep 17 00:00:00 2001
From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com>
Date: Thu, 9 May 2024 13:54:33 +0800
Subject: [PATCH 06/12] Update dev_opt_ppt.sh

---
 llm/llama/npu/dev_opt_ppt.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/dev_opt_ppt.sh
index 81365289bef7..4db9d6a728a1 100644
--- a/llm/llama/npu/dev_opt_ppt.sh
+++ b/llm/llama/npu/dev_opt_ppt.sh
@@ -40,7 +40,7 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh
 
 python -u  -m paddle.distributed.launch \
     --log_dir "./log_8.0" \
-    run_pretrain.py \
+    ../run_pretrain.py \
     --model_name_or_path "meta-llama/Llama-2-13b" \
     --tokenizer_name_or_path "meta-llama/Llama-2-13b" \
     --input_dir "./pre-data" \

From a35ba59e291b631b5ee37d40da07fed8a3c2561d Mon Sep 17 00:00:00 2001
From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com>
Date: Thu, 9 May 2024 13:55:20 +0800
Subject: [PATCH 07/12] Update dev_opt_sft.sh

---
 llm/llama/npu/dev_opt_sft.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llm/llama/npu/dev_opt_sft.sh b/llm/llama/npu/dev_opt_sft.sh
index 3a72d24721b6..bce6867d234a 100644
--- a/llm/llama/npu/dev_opt_sft.sh
+++ b/llm/llama/npu/dev_opt_sft.sh
@@ -32,7 +32,7 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh
 export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
 export MULTI_STREAM_MEMORY_REUSE=1
 
-export PYTHONPATH=../../:$PYTHONPATH
+export PYTHONPATH=../../../:$PYTHONPATH
 rm -rf sft_bf16_llama_N1C8
 rm -rf output/sft_bf16_llama_N1C8
 ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9
@@ -40,7 +40,7 @@ ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -
 python -u  -m paddle.distributed.launch \
     --devices "0,1,2,3,4,5,6,7" \
     --log_dir "./sft_bf16_llama_N1C8" \
-    ../finetune_generation.py \
+    ../../finetune_generation.py \
     --device "npu" \
     --model_name_or_path "meta-llama/Llama-2-13b" \
     --dataset_name_or_path "data/" \
@@ -78,4 +78,4 @@ python -u  -m paddle.distributed.launch \
         --sharding_parallel_degree 2 \
     --pad_to_multiple_of 4096 \
         --sharding "stage1" \
-        --sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap"
\ No newline at end of file
+        --sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap"

From 68388a7f09c5ed8197161f9f9b23044c2ade2b3b Mon Sep 17 00:00:00 2001
From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com>
Date: Sat, 11 May 2024 18:48:02 +0800
Subject: [PATCH 08/12] Rename dev_opt_lora.sh to llama_npu_opt_lora.sh

---
 llm/llama/npu/{dev_opt_lora.sh => llama_npu_opt_lora.sh} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llm/llama/npu/{dev_opt_lora.sh => llama_npu_opt_lora.sh} (100%)

diff --git a/llm/llama/npu/dev_opt_lora.sh b/llm/llama/npu/llama_npu_opt_lora.sh
similarity index 100%
rename from llm/llama/npu/dev_opt_lora.sh
rename to llm/llama/npu/llama_npu_opt_lora.sh

From fee8f04a006de61a589a7ff9888b8ee414f5a2e5 Mon Sep 17 00:00:00 2001
From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com>
Date: Sat, 11 May 2024 18:48:41 +0800
Subject: [PATCH 09/12] Update dev_opt_ppt.sh

---
 llm/llama/npu/dev_opt_ppt.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/dev_opt_ppt.sh
index 4db9d6a728a1..d3a082f49c0d 100644
--- a/llm/llama/npu/dev_opt_ppt.sh
+++ b/llm/llama/npu/dev_opt_ppt.sh
@@ -31,7 +31,6 @@ export MC2_Recompute=1
 unset PADDLE_TRAINER_ENDPOINTS
 unset DISTRIBUTED_TRAINER_ENDPOINTS
 
-#240411新增
 export FLAGS_use_stride_kernel=0
 export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
 export MULTI_STREAM_MEMORY_REUSE=1

From 783de3b70952f519d56709110fbe2a227511f34a Mon Sep 17 00:00:00 2001
From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com>
Date: Sat, 11 May 2024 18:49:19 +0800
Subject: [PATCH 10/12] Rename dev_opt_ppt.sh to llama_npu_opt_ppt.sh

---
 llm/llama/npu/{dev_opt_ppt.sh => llama_npu_opt_ppt.sh} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llm/llama/npu/{dev_opt_ppt.sh => llama_npu_opt_ppt.sh} (100%)

diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/llama_npu_opt_ppt.sh
similarity index 100%
rename from llm/llama/npu/dev_opt_ppt.sh
rename to llm/llama/npu/llama_npu_opt_ppt.sh

From 10f94155fdce1397389e7daa66e80b52f29eceef Mon Sep 17 00:00:00 2001
From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com>
Date: Sat, 11 May 2024 18:50:01 +0800
Subject: [PATCH 11/12] Update llama_npu_opt_lora.sh

---
 llm/llama/npu/llama_npu_opt_lora.sh | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/llm/llama/npu/llama_npu_opt_lora.sh b/llm/llama/npu/llama_npu_opt_lora.sh
index 981a833d9c56..fd1b004a8af5 100644
--- a/llm/llama/npu/llama_npu_opt_lora.sh
+++ b/llm/llama/npu/llama_npu_opt_lora.sh
@@ -13,17 +13,6 @@
 # limitations under the License.
 
 max_steps=${1:-1000}
-lock_seed_flag=${2:-close}
-if [[ ${lock_seed_flag} =~ "open_lock_seed" ]];then
-    export npu_deterministic=true
-    export ACL_OP_DETERMINISTIC=true
-    export ACL_OPT_DETERMINISTIC=true
-    export HCCL_DETERMINISTIC=true
-fi
-echo lock_seed_flag 
-echo $lock_seed_flag
-echo npu_deterministic ACL_OP_DETERMINISTIC ACL_OPT_DETERMINISTIC HCCL_DETERMINISTIC
-echo $npu_deterministic $ACL_OP_DETERMINISTIC $ACL_OPT_DETERMINISTIC $HCCL_DETERMINISTIC
 
 export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
 export FLAGS_use_stride_kernel=0
@@ -35,13 +24,10 @@ export MC2=1
 export FLAGS_allocator_strategy=naive_best_fit
 source /usr/local/Ascend/ascend-toolkit/set_env.sh
 
-#240411新增
-# export MC2=1
 
 rm -rf lora_bf16_llama_N1C8
 rm -rf output/lora_bf16_llama_N1C8
-ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9
-ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9
 export PYTHONPATH=../../../:$PYTHONPATH
 python -u  -m paddle.distributed.launch \
     --devices "0,1,2,3,4,5,6,7" \

From f3d96e519e7907f38ae1dff3d94bd77c8698f8ae Mon Sep 17 00:00:00 2001
From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com>
Date: Sat, 11 May 2024 18:51:04 +0800
Subject: [PATCH 12/12] Update and rename dev_opt_sft.sh to
 llama_npu_opt_sft.sh

---
 llm/llama/npu/{dev_opt_sft.sh => llama_npu_opt_sft.sh} | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)
 rename llm/llama/npu/{dev_opt_sft.sh => llama_npu_opt_sft.sh} (93%)

diff --git a/llm/llama/npu/dev_opt_sft.sh b/llm/llama/npu/llama_npu_opt_sft.sh
similarity index 93%
rename from llm/llama/npu/dev_opt_sft.sh
rename to llm/llama/npu/llama_npu_opt_sft.sh
index bce6867d234a..786e6cf835aa 100644
--- a/llm/llama/npu/dev_opt_sft.sh
+++ b/llm/llama/npu/llama_npu_opt_sft.sh
@@ -27,16 +27,13 @@ export MC2=1
 export FLAGS_allocator_strategy=naive_best_fit
 source /usr/local/Ascend/ascend-toolkit/set_env.sh
 
-#240411新增
-# export MC2=1
 export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
 export MULTI_STREAM_MEMORY_REUSE=1
 
 export PYTHONPATH=../../../:$PYTHONPATH
 rm -rf sft_bf16_llama_N1C8
 rm -rf output/sft_bf16_llama_N1C8
-ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9
-ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9
 python -u  -m paddle.distributed.launch \
     --devices "0,1,2,3,4,5,6,7" \
     --log_dir "./sft_bf16_llama_N1C8" \