From a5ed9ed35056b273a1d18095acc96778d6bca962 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Thu, 9 May 2024 12:03:32 +0800 Subject: [PATCH 01/31] update --- paddlenlp/transformers/llama/modeling.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index ac3074aae7ce..be88d6af74dc 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -98,9 +98,6 @@ def swiglu(x, y=None): ] -npu_is_casual = False - - def _get_interleave(n): def _get_interleave_power_of_2(n): start = 2 ** (-(2 ** -(math.log2(n) - 3))) @@ -212,6 +209,7 @@ def scaled_dot_product_attention( alibi=None, sequence_parallel=False, reshard_layer=None, + npu_is_casual=False, ): bsz, q_len, num_heads, head_dim = query_states.shape _, kv_seq_len, _, _ = value_states.shape @@ -851,6 +849,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, alibi: Optional[paddle.Tensor] = None, + npu_is_casual: bool = False, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) @@ -1078,6 +1077,7 @@ def forward( alibi, self.sequence_parallel, reshard_layer=self.reshard_layer, + npu_is_casual=npu_is_casual, ) if output_attentions: attn_output, attn_weights = outputs @@ -1130,6 +1130,7 @@ def forward( past_key_value: Optional[Tuple[paddle.Tensor]] = None, use_cache: Optional[bool] = False, alibi: Optional[paddle.Tensor] = None, + npu_is_casual: bool = False, ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: """ Args: @@ -1177,6 +1178,7 @@ def forward( output_attentions, use_cache, alibi, + npu_is_casual=npu_is_casual, ) if type(outputs) is tuple: @@ -1614,6 +1616,7 @@ def forward( attention_mask = self._prepare_decoder_attention_mask( attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype ) # [bs, 1, seq_len, seq_len] + is_casual = False if self.config.use_flash_attention: is_casual = is_casual_mask(attention_mask) if get_env_device() != "npu": @@ -1658,6 +1661,7 @@ def forward( past_key_value, use_cache, alibi=alibi, + npu_is_casual=is_casual, ) # NOTE: clear outdate cache after it has been used for memory saving From bd0aa8773f6df5b4b783b6561b6ec23efdbaf006 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Thu, 9 May 2024 13:33:51 +0800 Subject: [PATCH 02/31] add llama-npu-opt-script --- llm/llama/npu/dev_opt_lora.sh | 86 +++++++++++++++++++++++++++++++++ llm/llama/npu/dev_opt_ppt.sh | 91 +++++++++++++++++++++++++++++++++++ llm/llama/npu/dev_opt_sft.sh | 81 +++++++++++++++++++++++++++++++ 3 files changed, 258 insertions(+) create mode 100644 llm/llama/npu/dev_opt_lora.sh create mode 100644 llm/llama/npu/dev_opt_ppt.sh create mode 100644 llm/llama/npu/dev_opt_sft.sh diff --git a/llm/llama/npu/dev_opt_lora.sh b/llm/llama/npu/dev_opt_lora.sh new file mode 100644 index 000000000000..09719c572eee --- /dev/null +++ b/llm/llama/npu/dev_opt_lora.sh @@ -0,0 +1,86 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +max_steps=${1:-1000} +lock_seed_flag=${2:-close} +if [[ ${lock_seed_flag} =~ "open_lock_seed" ]];then + export npu_deterministic=true + export ACL_OP_DETERMINISTIC=true + export ACL_OPT_DETERMINISTIC=true + export HCCL_DETERMINISTIC=true +fi +echo lock_seed_flag +echo $lock_seed_flag +echo npu_deterministic ACL_OP_DETERMINISTIC ACL_OPT_DETERMINISTIC HCCL_DETERMINISTIC +echo $npu_deterministic $ACL_OP_DETERMINISTIC $ACL_OPT_DETERMINISTIC $HCCL_DETERMINISTIC + +export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +export FLAGS_use_stride_kernel=0 +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +export FLAGS_NPU_MC2=1 +export MC2_Recompute=1 +export MC2=1 +export FLAGS_allocator_strategy=naive_best_fit +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +#240411新增 +# export MC2=1 + +rm -rf lora_bf16_llama_N1C8 +rm -rf output/lora_bf16_llama_N1C8 +ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +export PYTHONPATH=../../:$PYTHONPATH +python -u -m paddle.distributed.launch \ + --devices "0,1,2,3,4,5,6,7" \ + --log_dir "./lora_bf16_llama_N1C8" \ + ../finetune_generation.py \ + --device "npu" \ + --model_name_or_path "meta-llama/Llama-2-13b" \ + --dataset_name_or_path "data/" \ + --output_dir "./output/lora_bf16_llama_N1C8" \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 16 \ + --per_device_eval_batch_size 1 \ + --eval_accumulation_steps 1 \ + --max_steps ${max_steps} \ + --decay_steps 2000 \ + --learning_rate 3e-06 \ + --warmup_steps 2 \ + --save_steps 1000 \ + --logging_steps 1 \ + --evaluation_strategy "epoch" \ + --src_length 1024 \ + --max_length 4096 \ + --bf16 true \ + --fp16_opt_level "O2" \ + --do_train true \ + --disable_tqdm true \ + --eval_with_do_generation false \ + --metric_for_best_model "accuracy" \ + --recompute false \ + --tensor_parallel_degree 8 \ + --pipeline_parallel_degree 1 \ + --zero_padding 0 \ + --sequence_parallel 1 \ + --amp_master_grad true \ + --fuse_attention_qkv true \ + --fuse_attention_ffn true \ + --use_flash_attention 1 \ + --use_fused_rope 1 \ + --use_fused_rms_norm 1 \ + --lora true \ + --lora_rank 32 \ + --pad_to_multiple_of 4096 \ No newline at end of file diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/dev_opt_ppt.sh new file mode 100644 index 000000000000..b0cfb12f3223 --- /dev/null +++ b/llm/llama/npu/dev_opt_ppt.sh @@ -0,0 +1,91 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +max_steps=${1:-800} + +set -x +ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9 +rm -rf ./log_8.0 +rm -rf output +export PYTHONPATH=../:$PYTHONPATH +export MC2=1 +export GLOG_v=0 +export FLAGS_npu_storage_format=1 +export HCCL_INTRA_PCIE_EHABLE=0 +export HCCL_INTRA_ROCE_ENABLE=1 +export FLAGS_allocator_strategy=naive_best_fit +export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +export FLAGS_NPU_MC2=1 +export MC2_Recompute=1 +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS + +#240411新增 +export FLAGS_use_stride_kernel=0 +export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE +export MULTI_STREAM_MEMORY_REUSE=1 + +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +python -u -m paddle.distributed.launch \ + --log_dir "./log_8.0" \ + run_pretrain.py \ + --model_name_or_path "meta-llama/Llama-2-13b" \ + --tokenizer_name_or_path "meta-llama/Llama-2-13b" \ + --input_dir "./pre-data" \ + --output_dir "./output" \ + --split 949,50,1 \ + --max_seq_length 4096 \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 32 \ + --per_device_eval_batch_size 1 \ + --use_flash_attention 1 \ + --use_fused_rms_norm 1 \ + --virtual_pp_degree 1 \ + --learning_rate 0.00001 \ + --min_learning_rate 0.000001 \ + --max_steps ${max_steps} \ + --decay_steps 2000 \ + --save_steps 2000 \ + --seed 100 \ + --weight_decay 0.01 \ + --warmup_steps 20 \ + --max_grad_norm 1.0 \ + --logging_steps 1 \ + --dataloader_num_workers 1 \ + --eval_steps 1001 \ + --tensor_parallel_degree 4 \ + --disable_tqdm true \ + --continue_training 0 \ + --do_train \ + --device "npu" \ + --enable_linear_fused_grad_add false \ + --fuse_attention_qkv true \ + --fuse_attention_ffn true \ + --use_fused_rope true \ + --recompute_use_reentrant true \ + --data_cache "./data_cache" \ + --bf16 \ + --fp16_opt_level "O2" \ + --amp_master_grad \ + --load_sharded_model true \ + --save_sharded_model true \ + --pipeline_parallel_degree 1 \ + --ignore_data_skip 0 \ + --force_reshard_pp true \ + --tensor_parallel_config "enable_mp_async_allreduce enable_mp_skip_c_identity" \ + --sequence_parallel 1 \ + --pipeline_parallel_config "disable_partial_send_recv" \ + --sharding "stage1" \ + --sharding_parallel_degree 2 diff --git a/llm/llama/npu/dev_opt_sft.sh b/llm/llama/npu/dev_opt_sft.sh new file mode 100644 index 000000000000..3a72d24721b6 --- /dev/null +++ b/llm/llama/npu/dev_opt_sft.sh @@ -0,0 +1,81 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +export FLAGS_use_stride_kernel=0 +export FLAGS_npu_storage_format=1 +export HCCL_INTRA_PCIE_EHABLE=0 +export HCCL_INTRA_ROCE_ENABLE=1 +export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" + +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +export GLOG_v=0 +export FLAGS_NPU_MC2=1 +export MC2_Recompute=1 +export MC2=1 +export FLAGS_allocator_strategy=naive_best_fit +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +#240411新增 +# export MC2=1 +export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE +export MULTI_STREAM_MEMORY_REUSE=1 + +export PYTHONPATH=../../:$PYTHONPATH +rm -rf sft_bf16_llama_N1C8 +rm -rf output/sft_bf16_llama_N1C8 +ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +python -u -m paddle.distributed.launch \ + --devices "0,1,2,3,4,5,6,7" \ + --log_dir "./sft_bf16_llama_N1C8" \ + ../finetune_generation.py \ + --device "npu" \ + --model_name_or_path "meta-llama/Llama-2-13b" \ + --dataset_name_or_path "data/" \ + --output_dir "./output/sft_bf16_llama_N1C8" \ + --logging_dir "./sft_logs" \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 32 \ + --per_device_eval_batch_size 1 \ + --eval_accumulation_steps 1 \ + --max_steps 2000 \ + --learning_rate 3e-06 \ + --warmup_steps 2 \ + --save_steps 1000 \ + --logging_steps 1 \ + --evaluation_strategy "epoch" \ + --src_length 1024 \ + --max_length 4096 \ + --fp16 true \ + --fp16_opt_level "O2" \ + --do_train true \ + --disable_tqdm true \ + --eval_with_do_generation false \ + --metric_for_best_model "accuracy" \ + --recompute false \ + --tensor_parallel_degree 4 \ + --pipeline_parallel_degree 1 \ + --zero_padding 0 \ + --amp_master_grad true \ + --fuse_attention_qkv true \ + --fuse_attention_ffn true \ + --sequence_parallel 1 \ + --use_flash_attention 1 \ + --use_fused_rope 1 \ + --use_fused_rms_norm 1 \ + --sharding_parallel_degree 2 \ + --pad_to_multiple_of 4096 \ + --sharding "stage1" \ + --sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \ No newline at end of file From cc2413268e35b31f23c64c202845ec73d4688586 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Thu, 9 May 2024 13:52:27 +0800 Subject: [PATCH 03/31] Update dev_opt_lora.sh --- llm/llama/npu/dev_opt_lora.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm/llama/npu/dev_opt_lora.sh b/llm/llama/npu/dev_opt_lora.sh index 09719c572eee..8da29cbbb788 100644 --- a/llm/llama/npu/dev_opt_lora.sh +++ b/llm/llama/npu/dev_opt_lora.sh @@ -42,7 +42,7 @@ rm -rf lora_bf16_llama_N1C8 rm -rf output/lora_bf16_llama_N1C8 ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9 -export PYTHONPATH=../../:$PYTHONPATH +export PYTHONPATH=../../../:$PYTHONPATH python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ --log_dir "./lora_bf16_llama_N1C8" \ @@ -83,4 +83,4 @@ python -u -m paddle.distributed.launch \ --use_fused_rms_norm 1 \ --lora true \ --lora_rank 32 \ - --pad_to_multiple_of 4096 \ No newline at end of file + --pad_to_multiple_of 4096 From 036d03c401b811c7295ac47ff1dc47bbef2a8fa2 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Thu, 9 May 2024 13:52:57 +0800 Subject: [PATCH 04/31] Update dev_opt_ppt.sh --- llm/llama/npu/dev_opt_ppt.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/dev_opt_ppt.sh index b0cfb12f3223..81365289bef7 100644 --- a/llm/llama/npu/dev_opt_ppt.sh +++ b/llm/llama/npu/dev_opt_ppt.sh @@ -18,7 +18,7 @@ set -x ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9 rm -rf ./log_8.0 rm -rf output -export PYTHONPATH=../:$PYTHONPATH +export PYTHONPATH=../../../:$PYTHONPATH export MC2=1 export GLOG_v=0 export FLAGS_npu_storage_format=1 From 8dd2d020ac6c180368952a2f2f62426a1b16ad8f Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Thu, 9 May 2024 13:54:05 +0800 Subject: [PATCH 05/31] Update dev_opt_lora.sh --- llm/llama/npu/dev_opt_lora.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llama/npu/dev_opt_lora.sh b/llm/llama/npu/dev_opt_lora.sh index 8da29cbbb788..981a833d9c56 100644 --- a/llm/llama/npu/dev_opt_lora.sh +++ b/llm/llama/npu/dev_opt_lora.sh @@ -46,7 +46,7 @@ export PYTHONPATH=../../../:$PYTHONPATH python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ --log_dir "./lora_bf16_llama_N1C8" \ - ../finetune_generation.py \ + ../../finetune_generation.py \ --device "npu" \ --model_name_or_path "meta-llama/Llama-2-13b" \ --dataset_name_or_path "data/" \ From 96e69aa7faa8c58f5e003bdc658ab7a970dad1a6 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Thu, 9 May 2024 13:54:33 +0800 Subject: [PATCH 06/31] Update dev_opt_ppt.sh --- llm/llama/npu/dev_opt_ppt.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/dev_opt_ppt.sh index 81365289bef7..4db9d6a728a1 100644 --- a/llm/llama/npu/dev_opt_ppt.sh +++ b/llm/llama/npu/dev_opt_ppt.sh @@ -40,7 +40,7 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh python -u -m paddle.distributed.launch \ --log_dir "./log_8.0" \ - run_pretrain.py \ + ../run_pretrain.py \ --model_name_or_path "meta-llama/Llama-2-13b" \ --tokenizer_name_or_path "meta-llama/Llama-2-13b" \ --input_dir "./pre-data" \ From a35ba59e291b631b5ee37d40da07fed8a3c2561d Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Thu, 9 May 2024 13:55:20 +0800 Subject: [PATCH 07/31] Update dev_opt_sft.sh --- llm/llama/npu/dev_opt_sft.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llm/llama/npu/dev_opt_sft.sh b/llm/llama/npu/dev_opt_sft.sh index 3a72d24721b6..bce6867d234a 100644 --- a/llm/llama/npu/dev_opt_sft.sh +++ b/llm/llama/npu/dev_opt_sft.sh @@ -32,7 +32,7 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE export MULTI_STREAM_MEMORY_REUSE=1 -export PYTHONPATH=../../:$PYTHONPATH +export PYTHONPATH=../../../:$PYTHONPATH rm -rf sft_bf16_llama_N1C8 rm -rf output/sft_bf16_llama_N1C8 ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 @@ -40,7 +40,7 @@ ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill - python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ --log_dir "./sft_bf16_llama_N1C8" \ - ../finetune_generation.py \ + ../../finetune_generation.py \ --device "npu" \ --model_name_or_path "meta-llama/Llama-2-13b" \ --dataset_name_or_path "data/" \ @@ -78,4 +78,4 @@ python -u -m paddle.distributed.launch \ --sharding_parallel_degree 2 \ --pad_to_multiple_of 4096 \ --sharding "stage1" \ - --sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \ No newline at end of file + --sharding_parallel_config 
"enable_stage1_tensor_fusion enable_stage1_overlap" From 68388a7f09c5ed8197161f9f9b23044c2ade2b3b Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Sat, 11 May 2024 18:48:02 +0800 Subject: [PATCH 08/31] Rename dev_opt_lora.sh to llama_npu_opt_lora.sh --- llm/llama/npu/{dev_opt_lora.sh => llama_npu_opt_lora.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llm/llama/npu/{dev_opt_lora.sh => llama_npu_opt_lora.sh} (100%) diff --git a/llm/llama/npu/dev_opt_lora.sh b/llm/llama/npu/llama_npu_opt_lora.sh similarity index 100% rename from llm/llama/npu/dev_opt_lora.sh rename to llm/llama/npu/llama_npu_opt_lora.sh From fee8f04a006de61a589a7ff9888b8ee414f5a2e5 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Sat, 11 May 2024 18:48:41 +0800 Subject: [PATCH 09/31] Update dev_opt_ppt.sh --- llm/llama/npu/dev_opt_ppt.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/dev_opt_ppt.sh index 4db9d6a728a1..d3a082f49c0d 100644 --- a/llm/llama/npu/dev_opt_ppt.sh +++ b/llm/llama/npu/dev_opt_ppt.sh @@ -31,7 +31,6 @@ export MC2_Recompute=1 unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS -#240411新增 export FLAGS_use_stride_kernel=0 export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE export MULTI_STREAM_MEMORY_REUSE=1 From 783de3b70952f519d56709110fbe2a227511f34a Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Sat, 11 May 2024 18:49:19 +0800 Subject: [PATCH 10/31] Rename dev_opt_ppt.sh to llama_npu_opt_ppt.sh --- llm/llama/npu/{dev_opt_ppt.sh => llama_npu_opt_ppt.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llm/llama/npu/{dev_opt_ppt.sh => llama_npu_opt_ppt.sh} (100%) diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/llama_npu_opt_ppt.sh similarity index 100% rename from llm/llama/npu/dev_opt_ppt.sh rename to llm/llama/npu/llama_npu_opt_ppt.sh From 10f94155fdce1397389e7daa66e80b52f29eceef Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Sat, 11 May 2024 18:50:01 +0800 Subject: [PATCH 11/31] Update llama_npu_opt_lora.sh --- llm/llama/npu/llama_npu_opt_lora.sh | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/llm/llama/npu/llama_npu_opt_lora.sh b/llm/llama/npu/llama_npu_opt_lora.sh index 981a833d9c56..fd1b004a8af5 100644 --- a/llm/llama/npu/llama_npu_opt_lora.sh +++ b/llm/llama/npu/llama_npu_opt_lora.sh @@ -13,17 +13,6 @@ # limitations under the License. 
max_steps=${1:-1000} -lock_seed_flag=${2:-close} -if [[ ${lock_seed_flag} =~ "open_lock_seed" ]];then - export npu_deterministic=true - export ACL_OP_DETERMINISTIC=true - export ACL_OPT_DETERMINISTIC=true - export HCCL_DETERMINISTIC=true -fi -echo lock_seed_flag -echo $lock_seed_flag -echo npu_deterministic ACL_OP_DETERMINISTIC ACL_OPT_DETERMINISTIC HCCL_DETERMINISTIC -echo $npu_deterministic $ACL_OP_DETERMINISTIC $ACL_OPT_DETERMINISTIC $HCCL_DETERMINISTIC export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" export FLAGS_use_stride_kernel=0 @@ -35,13 +24,10 @@ export MC2=1 export FLAGS_allocator_strategy=naive_best_fit source /usr/local/Ascend/ascend-toolkit/set_env.sh -#240411新增 -# export MC2=1 rm -rf lora_bf16_llama_N1C8 rm -rf output/lora_bf16_llama_N1C8 -ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 -ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9 export PYTHONPATH=../../../:$PYTHONPATH python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ From f3d96e519e7907f38ae1dff3d94bd77c8698f8ae Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Sat, 11 May 2024 18:51:04 +0800 Subject: [PATCH 12/31] Update and rename dev_opt_sft.sh to llama_npu_opt_sft.sh --- llm/llama/npu/{dev_opt_sft.sh => llama_npu_opt_sft.sh} | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) rename llm/llama/npu/{dev_opt_sft.sh => llama_npu_opt_sft.sh} (93%) diff --git a/llm/llama/npu/dev_opt_sft.sh b/llm/llama/npu/llama_npu_opt_sft.sh similarity index 93% rename from llm/llama/npu/dev_opt_sft.sh rename to llm/llama/npu/llama_npu_opt_sft.sh index bce6867d234a..786e6cf835aa 100644 --- a/llm/llama/npu/dev_opt_sft.sh +++ b/llm/llama/npu/llama_npu_opt_sft.sh @@ -27,16 +27,13 @@ export MC2=1 export FLAGS_allocator_strategy=naive_best_fit source /usr/local/Ascend/ascend-toolkit/set_env.sh -#240411新增 -# export MC2=1 export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE export MULTI_STREAM_MEMORY_REUSE=1 export PYTHONPATH=../../../:$PYTHONPATH rm -rf sft_bf16_llama_N1C8 rm -rf output/sft_bf16_llama_N1C8 -ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 -ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9 python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ --log_dir "./sft_bf16_llama_N1C8" \ From 6771aa9163162a30e87c75fe0c13b6ed0ff6db31 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 14:51:39 +0800 Subject: [PATCH 13/31] add funsion ops --- paddlenlp/transformers/llama/fusion_ops.py | 120 +++++++++++++++++++++ paddlenlp/transformers/llama/modeling.py | 72 +++---------- 2 files changed, 134 insertions(+), 58 deletions(-) create mode 100644 paddlenlp/transformers/llama/fusion_ops.py diff --git a/paddlenlp/transformers/llama/fusion_ops.py b/paddlenlp/transformers/llama/fusion_ops.py new file mode 100644 index 000000000000..4bc8ea2d0bd2 --- /dev/null +++ b/paddlenlp/transformers/llama/fusion_ops.py @@ -0,0 +1,120 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle +import paddle.nn.functional as F + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.incubate.nn.functional import swiglu +except ImportError: + + def swiglu(x, y=None): + if y is None: + x, y = paddle.chunk(x, chunks=2, axis=-1) + return F.silu(x) * y + + +from paddlenlp.utils.tools import get_env_device + +try: + if get_env_device() == "npu": + from paddle.base import core + + for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): + if lib.endswith(".so"): + paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib) + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + + +def fusion_flash_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi=None, + sequence_parallel=False, + reshard_layer=None, + npu_is_casual=False, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + if alibi is not None: + raise ValueError("Flash Attention doesn't support alibi") + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + return_softmax=output_attentions, + ) + else: + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attention_mask = attention_mask.cast(alibi.dtype) + alibi + if get_env_device() == "npu": + attn_output = core.eager._run_custom_op( + "flash_attention_npu", + query_states, + key_states, + value_states, + None, + attention_mask, + 0.0, + attention_mask is None, + True, + False, + npu_is_casual, + )[0] + else: + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + ) + attn_weights = None + + if reshard_layer is not None: + # attn_output shape: [bs, seqlen, num_head/sep, head_dim] + attn_output = reshard_layer( + attn_output, + split_axis=1, + concat_axis=2, + ) + # attn_output shape: [bs, seqlen/sep, num_head, head_dim] + assert ( + config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 + ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" + q_len = q_len // config.sep_parallel_degree + num_heads = num_heads * config.sep_parallel_degree + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index be88d6af74dc..76e1cb80c59e 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -89,6 +89,7 @@ def swiglu(x, y=None): from paddle.nn.functional.flash_attention import flash_attention except: flash_attention = None 
+from funsion_ops import fusion_flash_attention __all__ = [ "LlamaModel", @@ -215,67 +216,22 @@ def scaled_dot_product_attention( _, kv_seq_len, _, _ = value_states.shape if config.use_flash_attention and flash_attention: + fusion_flash_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi, + sequence_parallel, + reshard_layer, + npu_is_casual, + ) + # Paddle Flash Attention input [ bz, seqlen, nhead, head_dim] # Torch Flash Attention input [ bz, nhead, seqlen, head_dim] - version = paddle.version.full_version - if version != "0.0.0" and version <= "2.5.2": - if alibi is not None: - raise ValueError("Flash Attention doesn't support alibi") - attn_output, attn_weights = flash_attention( - query_states, - key_states, - value_states, - causal=True, - return_softmax=output_attentions, - ) - else: - if alibi is not None: - alibi = alibi.reshape([bsz, num_heads, 1, -1]) - attention_mask = attention_mask.cast(alibi.dtype) + alibi - if get_env_device() == "npu": - attn_output = core.eager._run_custom_op( - "flash_attention_npu", - query_states, - key_states, - value_states, - None, - attention_mask, - 0.0, - attention_mask is None, - True, - False, - npu_is_casual, - )[0] - else: - attn_output = F.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - is_causal=attention_mask is None, - ) - attn_weights = None - - if reshard_layer is not None: - # attn_output shape: [bs, seqlen, num_head/sep, head_dim] - attn_output = reshard_layer( - attn_output, - split_axis=1, - concat_axis=2, - ) - # attn_output shape: [bs, seqlen/sep, num_head, head_dim] - assert ( - config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 - ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" - q_len = q_len // config.sep_parallel_degree - num_heads = num_heads * config.sep_parallel_degree - - if sequence_parallel: - attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) - else: - attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) - return (attn_output, attn_weights) if output_attentions else attn_output else: # [ bz, seqlen, nhead, head_dim] -> [bs, nhead, seq_len, head_dim] query_states = paddle.transpose(query_states, [0, 2, 1, 3]) From 61dc79c70a44ac08b7b8179bd226fad51785f73d Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 15:04:57 +0800 Subject: [PATCH 14/31] add funsion ops --- paddlenlp/transformers/llama/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 76e1cb80c59e..47fc0adb60e0 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -89,7 +89,7 @@ def swiglu(x, y=None): from paddle.nn.functional.flash_attention import flash_attention except: flash_attention = None -from funsion_ops import fusion_flash_attention +from fusion_ops import fusion_flash_attention __all__ = [ "LlamaModel", From 558200f62363fb47bd4d60df59e4c3328f8ce9a0 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 15:10:25 +0800 Subject: [PATCH 15/31] add funsion ops --- paddlenlp/transformers/llama/modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 47fc0adb60e0..306258ae4aed 100755 --- 
a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -89,7 +89,7 @@ def swiglu(x, y=None): from paddle.nn.functional.flash_attention import flash_attention except: flash_attention = None -from fusion_ops import fusion_flash_attention +import fusion_ops __all__ = [ "LlamaModel", @@ -216,7 +216,7 @@ def scaled_dot_product_attention( _, kv_seq_len, _, _ = value_states.shape if config.use_flash_attention and flash_attention: - fusion_flash_attention( + fusion_ops.fusion_flash_attention( query_states, config, key_states, From f387c3007669885d58391677b6c0a963e9f1c8b5 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 15:38:26 +0800 Subject: [PATCH 16/31] add funsion ops --- paddlenlp/transformers/llama/fusion_ops.py | 120 --------------------- 1 file changed, 120 deletions(-) delete mode 100644 paddlenlp/transformers/llama/fusion_ops.py diff --git a/paddlenlp/transformers/llama/fusion_ops.py b/paddlenlp/transformers/llama/fusion_ops.py deleted file mode 100644 index 4bc8ea2d0bd2..000000000000 --- a/paddlenlp/transformers/llama/fusion_ops.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import paddle -import paddle.nn.functional as F - -try: - from paddle.incubate.nn.functional import fused_rotary_position_embedding -except ImportError: - fused_rotary_position_embedding = None - -try: - from paddle.incubate.nn.functional import swiglu -except ImportError: - - def swiglu(x, y=None): - if y is None: - x, y = paddle.chunk(x, chunks=2, axis=-1) - return F.silu(x) * y - - -from paddlenlp.utils.tools import get_env_device - -try: - if get_env_device() == "npu": - from paddle.base import core - - for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): - if lib.endswith(".so"): - paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib) - from paddle.nn.functional.flash_attention import flash_attention -except: - flash_attention = None - - -def fusion_flash_attention( - query_states, - config, - key_states, - value_states, - attention_mask, - output_attentions, - alibi=None, - sequence_parallel=False, - reshard_layer=None, - npu_is_casual=False, -): - bsz, q_len, num_heads, head_dim = query_states.shape - _, kv_seq_len, _, _ = value_states.shape - version = paddle.version.full_version - if version != "0.0.0" and version <= "2.5.2": - if alibi is not None: - raise ValueError("Flash Attention doesn't support alibi") - attn_output, attn_weights = flash_attention( - query_states, - key_states, - value_states, - causal=True, - return_softmax=output_attentions, - ) - else: - if alibi is not None: - alibi = alibi.reshape([bsz, num_heads, 1, -1]) - attention_mask = attention_mask.cast(alibi.dtype) + alibi - if get_env_device() == "npu": - attn_output = core.eager._run_custom_op( - "flash_attention_npu", - query_states, - key_states, - value_states, - None, - attention_mask, - 0.0, - attention_mask is None, - 
True, - False, - npu_is_casual, - )[0] - else: - attn_output = F.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - is_causal=attention_mask is None, - ) - attn_weights = None - - if reshard_layer is not None: - # attn_output shape: [bs, seqlen, num_head/sep, head_dim] - attn_output = reshard_layer( - attn_output, - split_axis=1, - concat_axis=2, - ) - # attn_output shape: [bs, seqlen/sep, num_head, head_dim] - assert ( - config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 - ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" - q_len = q_len // config.sep_parallel_degree - num_heads = num_heads * config.sep_parallel_degree - - if sequence_parallel: - attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) - else: - attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) - return (attn_output, attn_weights) if output_attentions else attn_output From a12947b603962e25dd9c0c4eab77ec24cc64c2a8 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 15:39:43 +0800 Subject: [PATCH 17/31] add funsion ops --- paddlenlp/transformers/fusion_ops.py | 120 +++++++++++++++++++++++ paddlenlp/transformers/llama/modeling.py | 2 +- 2 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 paddlenlp/transformers/fusion_ops.py diff --git a/paddlenlp/transformers/fusion_ops.py b/paddlenlp/transformers/fusion_ops.py new file mode 100644 index 000000000000..4bc8ea2d0bd2 --- /dev/null +++ b/paddlenlp/transformers/fusion_ops.py @@ -0,0 +1,120 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import paddle +import paddle.nn.functional as F + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.incubate.nn.functional import swiglu +except ImportError: + + def swiglu(x, y=None): + if y is None: + x, y = paddle.chunk(x, chunks=2, axis=-1) + return F.silu(x) * y + + +from paddlenlp.utils.tools import get_env_device + +try: + if get_env_device() == "npu": + from paddle.base import core + + for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): + if lib.endswith(".so"): + paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib) + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + + +def fusion_flash_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi=None, + sequence_parallel=False, + reshard_layer=None, + npu_is_casual=False, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + if alibi is not None: + raise ValueError("Flash Attention doesn't support alibi") + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + return_softmax=output_attentions, + ) + else: + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attention_mask = attention_mask.cast(alibi.dtype) + alibi + if get_env_device() == "npu": + attn_output = core.eager._run_custom_op( + "flash_attention_npu", + query_states, + key_states, + value_states, + None, + attention_mask, + 0.0, + attention_mask is None, + True, + False, + npu_is_casual, + )[0] + else: + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + ) + attn_weights = None + + if reshard_layer is not None: + # attn_output shape: [bs, seqlen, num_head/sep, head_dim] + attn_output = reshard_layer( + attn_output, + split_axis=1, + concat_axis=2, + ) + # attn_output shape: [bs, seqlen/sep, num_head, head_dim] + assert ( + config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 + ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" + q_len = q_len // config.sep_parallel_degree + num_heads = num_heads * config.sep_parallel_degree + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 306258ae4aed..55fdff304887 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -89,7 +89,7 @@ def swiglu(x, y=None): from paddle.nn.functional.flash_attention import flash_attention except: flash_attention = None -import fusion_ops +from .. 
import fusion_ops __all__ = [ "LlamaModel", From aff105e670e9990c8db5b7854c0e122e63058148 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 16:12:53 +0800 Subject: [PATCH 18/31] add funsion ops --- paddlenlp/transformers/llama/fusion_ops.py | 142 +++++++++++++++++++++ paddlenlp/transformers/llama/modeling.py | 2 +- 2 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 paddlenlp/transformers/llama/fusion_ops.py diff --git a/paddlenlp/transformers/llama/fusion_ops.py b/paddlenlp/transformers/llama/fusion_ops.py new file mode 100644 index 000000000000..1fc762e9c2e6 --- /dev/null +++ b/paddlenlp/transformers/llama/fusion_ops.py @@ -0,0 +1,142 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle +import paddle.nn.functional as F + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.incubate.nn.functional import swiglu +except ImportError: + + def swiglu(x, y=None): + if y is None: + x, y = paddle.chunk(x, chunks=2, axis=-1) + return F.silu(x) * y + + +from paddle.utils import try_import + +from paddlenlp.utils.tools import get_env_device + +try: + if get_env_device() == "npu": + from paddle.base import core + + for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): + if lib.endswith(".so"): + paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib) + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + + +def rms_norm_fused(x_in, w, eps): + fused_ln = try_import("fused_ln") + return fused_ln.fused_rms_norm(x_in, w, eps)[0] + + +def fusion_rms_norm(hidden_states, weight, variance_epsilon): + if get_env_device() == "npu": + return core.eager._run_custom_op("rms_norm_npu", hidden_states, weight, variance_epsilon)[0] + elif get_env_device() == "xpu": + try: + import paddle_xpu_nn # noqa: F821 + + return paddle_xpu_nn.xpu_rms_norm(hidden_states, weight, variance_epsilon)[0] + except ImportError: + raise NotImplementedError( + f"Implementation of fused_rms_norm is not available on {get_env_device()}. 
Please install paddle_xpu to use this feature" + ) + return rms_norm_fused(hidden_states, weight, variance_epsilon) + + +def fusion_flash_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi=None, + sequence_parallel=False, + reshard_layer=None, + npu_is_casual=False, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + if alibi is not None: + raise ValueError("Flash Attention doesn't support alibi") + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + return_softmax=output_attentions, + ) + else: + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attention_mask = attention_mask.cast(alibi.dtype) + alibi + if get_env_device() == "npu": + attn_output = core.eager._run_custom_op( + "flash_attention_npu", + query_states, + key_states, + value_states, + None, + attention_mask, + 0.0, + attention_mask is None, + True, + False, + npu_is_casual, + )[0] + else: + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + ) + attn_weights = None + + if reshard_layer is not None: + # attn_output shape: [bs, seqlen, num_head/sep, head_dim] + attn_output = reshard_layer( + attn_output, + split_axis=1, + concat_axis=2, + ) + # attn_output shape: [bs, seqlen/sep, num_head, head_dim] + assert ( + config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 + ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" + q_len = q_len // config.sep_parallel_degree + num_heads = num_heads * config.sep_parallel_degree + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 55fdff304887..99a8431095d4 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -216,7 +216,7 @@ def scaled_dot_product_attention( _, kv_seq_len, _, _ = value_states.shape if config.use_flash_attention and flash_attention: - fusion_ops.fusion_flash_attention( + return fusion_ops.fusion_flash_attention( query_states, config, key_states, From 075c8de78732641c3f5a32395a5f5e4b131c0d3f Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 16:39:45 +0800 Subject: [PATCH 19/31] add funsion ops --- paddlenlp/transformers/llama/fusion_ops.py | 4 ++++ paddlenlp/transformers/llama/modeling.py | 13 +------------ 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/paddlenlp/transformers/llama/fusion_ops.py b/paddlenlp/transformers/llama/fusion_ops.py index 1fc762e9c2e6..0ee27057a3e8 100644 --- a/paddlenlp/transformers/llama/fusion_ops.py +++ b/paddlenlp/transformers/llama/fusion_ops.py @@ -48,6 +48,10 @@ def swiglu(x, y=None): flash_attention = None +def fusion_rope(): + pass + + def rms_norm_fused(x_in, w, eps): fused_ln = try_import("fused_ln") return fused_ln.fused_rms_norm(x_in, w, eps)[0] diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 99a8431095d4..803c527d20ed 100755 --- 
a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -363,18 +363,7 @@ def __init__(self, config): def forward(self, hidden_states): if self.config.use_fused_rms_norm: - if get_env_device() == "npu": - return core.eager._run_custom_op("rms_norm_npu", hidden_states, self.weight, self.variance_epsilon)[0] - elif get_env_device() == "xpu": - try: - import paddle_xpu_nn # noqa: F821 - - return paddle_xpu_nn.xpu_rms_norm(hidden_states, self.weight, self.variance_epsilon)[0] - except ImportError: - raise NotImplementedError( - f"Implementation of fused_rms_norm is not available on {get_env_device()}. Please install paddle_xpu to use this feature" - ) - return rms_norm_fused(hidden_states, self.weight, self.variance_epsilon) + return fusion_ops.fusion_rms_norm(hidden_states, self.weight, self.variance_epsilon) if paddle.in_dynamic_mode(): with paddle.amp.auto_cast(False): From 15f2fe35e6c6cca1321cfa8b4aa992df84343a97 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 16:55:14 +0800 Subject: [PATCH 20/31] add funsion ops --- paddlenlp/transformers/llama/fusion_ops.py | 11 ++++++++++- paddlenlp/transformers/llama/modeling.py | 13 ++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/paddlenlp/transformers/llama/fusion_ops.py b/paddlenlp/transformers/llama/fusion_ops.py index 0ee27057a3e8..044669be2e56 100644 --- a/paddlenlp/transformers/llama/fusion_ops.py +++ b/paddlenlp/transformers/llama/fusion_ops.py @@ -48,7 +48,16 @@ def swiglu(x, y=None): flash_attention = None -def fusion_rope(): +def fusion_rope( + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + alibi, + npu_is_casual: bool = False, +): pass diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 803c527d20ed..99a8431095d4 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -363,7 +363,18 @@ def __init__(self, config): def forward(self, hidden_states): if self.config.use_fused_rms_norm: - return fusion_ops.fusion_rms_norm(hidden_states, self.weight, self.variance_epsilon) + if get_env_device() == "npu": + return core.eager._run_custom_op("rms_norm_npu", hidden_states, self.weight, self.variance_epsilon)[0] + elif get_env_device() == "xpu": + try: + import paddle_xpu_nn # noqa: F821 + + return paddle_xpu_nn.xpu_rms_norm(hidden_states, self.weight, self.variance_epsilon)[0] + except ImportError: + raise NotImplementedError( + f"Implementation of fused_rms_norm is not available on {get_env_device()}. 
Please install paddle_xpu to use this feature" + ) + return rms_norm_fused(hidden_states, self.weight, self.variance_epsilon) if paddle.in_dynamic_mode(): with paddle.amp.auto_cast(False): From 27417693078910136dbf67c49578391ae04a80b1 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 17:15:22 +0800 Subject: [PATCH 21/31] add funsion ops --- paddlenlp/transformers/llama/fusion_ops.py | 56 +++++++++++--- paddlenlp/transformers/llama/modeling.py | 87 ++++++++++++---------- 2 files changed, 93 insertions(+), 50 deletions(-) diff --git a/paddlenlp/transformers/llama/fusion_ops.py b/paddlenlp/transformers/llama/fusion_ops.py index 044669be2e56..96f160534fc7 100644 --- a/paddlenlp/transformers/llama/fusion_ops.py +++ b/paddlenlp/transformers/llama/fusion_ops.py @@ -36,6 +36,10 @@ def swiglu(x, y=None): from paddlenlp.utils.tools import get_env_device +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None try: if get_env_device() == "npu": from paddle.base import core @@ -48,17 +52,47 @@ def swiglu(x, y=None): flash_attention = None -def fusion_rope( - hidden_states, - position_ids, - past_key_value, - attention_mask, - output_attentions, - use_cache, - alibi, - npu_is_casual: bool = False, -): - pass +def fusion_rope(query_states, key_states, value_states, hidden_states, position_ids, past_key_value, rotary_emb): + assert past_key_value is None, "fuse rotary not support cache kv for now" + batch_size, seq_length, num_heads, head_dim = query_states.shape + _, kv_seq_len, num_key_value_heads, _ = key_states.shape + cos, sin = rotary_emb(value_states, seq_len=kv_seq_len) + if get_env_device() == "npu": + query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[0] + key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] + else: + # paddle version > 2.6 or develop support q and k/v with different num_heads + paddle_version = float(paddle.__version__[:3]) + if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): + query_states, _, _ = fused_rotary_position_embedding( + query_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + key_states, _, _ = fused_rotary_position_embedding( + key_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + query_states, key_states, _ = fused_rotary_position_embedding( + query_states, + key_states, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + return query_states, key_states def rms_norm_fused(x_in, w, eps): diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 99a8431095d4..6d54fde21348 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -930,45 +930,54 @@ def forward( batch_size, seq_length, _, _ = query_states.shape position_ids = paddle.arange(seq_length, dtype="int64").expand((batch_size, seq_length)) if self.use_fused_rope: - assert past_key_value is None, "fuse rotary not support cache kv for now" - batch_size, seq_length, num_heads, head_dim = query_states.shape - _, kv_seq_len, num_key_value_heads, _ = key_states.shape - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - if get_env_device() == "npu": - query_states = core.eager._run_custom_op("fused_rope", query_states, cos, 
sin)[0] - key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] - else: - # paddle version > 2.6 or develop support q and k/v with different num_heads - paddle_version = float(paddle.__version__[:3]) - if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): - query_states, _, _ = fused_rotary_position_embedding( - query_states, - None, - None, - sin=sin, - cos=cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) - key_states, _, _ = fused_rotary_position_embedding( - key_states, - None, - None, - sin=sin, - cos=cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) - else: - query_states, key_states, _ = fused_rotary_position_embedding( - query_states, - key_states, - v=None, - sin=sin, - cos=cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) + query_states, key_states = fusion_ops.fusion_rope( + query_states, + key_states, + value_states, + hidden_states, + position_ids, + past_key_value, + self.rotary_emb, + ) + # assert past_key_value is None, "fuse rotary not support cache kv for now" + # batch_size, seq_length, num_heads, head_dim = query_states.shape + # _, kv_seq_len, num_key_value_heads, _ = key_states.shape + # cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + # if get_env_device() == "npu": + # query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[0] + # key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] + # else: + # # paddle version > 2.6 or develop support q and k/v with different num_heads + # paddle_version = float(paddle.__version__[:3]) + # if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): + # query_states, _, _ = fused_rotary_position_embedding( + # query_states, + # None, + # None, + # sin=sin, + # cos=cos, + # position_ids=position_ids, + # use_neox_rotary_style=False, + # ) + # key_states, _, _ = fused_rotary_position_embedding( + # key_states, + # None, + # None, + # sin=sin, + # cos=cos, + # position_ids=position_ids, + # use_neox_rotary_style=False, + # ) + # else: + # query_states, key_states, _ = fused_rotary_position_embedding( + # query_states, + # key_states, + # v=None, + # sin=sin, + # cos=cos, + # position_ids=position_ids, + # use_neox_rotary_style=False, + # ) else: if self.config.use_long_sequence_strategies: cos, sin = self.rotary_emb(seq_len=kv_seq_len) From 12fc0489d7beca11679697d1095a6c1dae058257 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 17:33:16 +0800 Subject: [PATCH 22/31] add funsion ops --- paddlenlp/transformers/fusion_ops.py | 69 ++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/paddlenlp/transformers/fusion_ops.py b/paddlenlp/transformers/fusion_ops.py index 4bc8ea2d0bd2..96f160534fc7 100644 --- a/paddlenlp/transformers/fusion_ops.py +++ b/paddlenlp/transformers/fusion_ops.py @@ -32,8 +32,14 @@ def swiglu(x, y=None): return F.silu(x) * y +from paddle.utils import try_import + from paddlenlp.utils.tools import get_env_device +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None try: if get_env_device() == "npu": from paddle.base import core @@ -46,6 +52,69 @@ def swiglu(x, y=None): flash_attention = None +def fusion_rope(query_states, key_states, value_states, hidden_states, position_ids, past_key_value, rotary_emb): + assert past_key_value is None, "fuse rotary not support 
cache kv for now" + batch_size, seq_length, num_heads, head_dim = query_states.shape + _, kv_seq_len, num_key_value_heads, _ = key_states.shape + cos, sin = rotary_emb(value_states, seq_len=kv_seq_len) + if get_env_device() == "npu": + query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[0] + key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] + else: + # paddle version > 2.6 or develop support q and k/v with different num_heads + paddle_version = float(paddle.__version__[:3]) + if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): + query_states, _, _ = fused_rotary_position_embedding( + query_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + key_states, _, _ = fused_rotary_position_embedding( + key_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + query_states, key_states, _ = fused_rotary_position_embedding( + query_states, + key_states, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + return query_states, key_states + + +def rms_norm_fused(x_in, w, eps): + fused_ln = try_import("fused_ln") + return fused_ln.fused_rms_norm(x_in, w, eps)[0] + + +def fusion_rms_norm(hidden_states, weight, variance_epsilon): + if get_env_device() == "npu": + return core.eager._run_custom_op("rms_norm_npu", hidden_states, weight, variance_epsilon)[0] + elif get_env_device() == "xpu": + try: + import paddle_xpu_nn # noqa: F821 + + return paddle_xpu_nn.xpu_rms_norm(hidden_states, weight, variance_epsilon)[0] + except ImportError: + raise NotImplementedError( + f"Implementation of fused_rms_norm is not available on {get_env_device()}. Please install paddle_xpu to use this feature" + ) + return rms_norm_fused(hidden_states, weight, variance_epsilon) + + def fusion_flash_attention( query_states, config, From f678361ff3fc0c7e2227d8705150e34f61c0a1aa Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 17:45:24 +0800 Subject: [PATCH 23/31] add funsion ops --- paddlenlp/transformers/llama/modeling.py | 66 ++++++------------------ 1 file changed, 15 insertions(+), 51 deletions(-) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 6d54fde21348..62e39bf0ba97 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -81,7 +81,7 @@ def swiglu(x, y=None): try: if get_env_device() == "npu": - from paddle.base import core + # from paddle.base import core for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): if lib.endswith(".so"): @@ -363,18 +363,20 @@ def __init__(self, config): def forward(self, hidden_states): if self.config.use_fused_rms_norm: - if get_env_device() == "npu": - return core.eager._run_custom_op("rms_norm_npu", hidden_states, self.weight, self.variance_epsilon)[0] - elif get_env_device() == "xpu": - try: - import paddle_xpu_nn # noqa: F821 + return fusion_ops.fusion_rms_norm(hidden_states, self.weight, self.variance_epsilon) - return paddle_xpu_nn.xpu_rms_norm(hidden_states, self.weight, self.variance_epsilon)[0] - except ImportError: - raise NotImplementedError( - f"Implementation of fused_rms_norm is not available on {get_env_device()}. 
Please install paddle_xpu to use this feature" - ) - return rms_norm_fused(hidden_states, self.weight, self.variance_epsilon) + # if get_env_device() == "npu": + # return core.eager._run_custom_op("rms_norm_npu", hidden_states, self.weight, self.variance_epsilon)[0] + # elif get_env_device() == "xpu": + # try: + # import paddle_xpu_nn # noqa: F821 + + # return paddle_xpu_nn.xpu_rms_norm(hidden_states, self.weight, self.variance_epsilon)[0] + # except ImportError: + # raise NotImplementedError( + # f"Implementation of fused_rms_norm is not available on {get_env_device()}. Please install paddle_xpu to use this feature" + # ) + # return rms_norm_fused(hidden_states, self.weight, self.variance_epsilon) if paddle.in_dynamic_mode(): with paddle.amp.auto_cast(False): @@ -939,45 +941,7 @@ def forward( past_key_value, self.rotary_emb, ) - # assert past_key_value is None, "fuse rotary not support cache kv for now" - # batch_size, seq_length, num_heads, head_dim = query_states.shape - # _, kv_seq_len, num_key_value_heads, _ = key_states.shape - # cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - # if get_env_device() == "npu": - # query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[0] - # key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] - # else: - # # paddle version > 2.6 or develop support q and k/v with different num_heads - # paddle_version = float(paddle.__version__[:3]) - # if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): - # query_states, _, _ = fused_rotary_position_embedding( - # query_states, - # None, - # None, - # sin=sin, - # cos=cos, - # position_ids=position_ids, - # use_neox_rotary_style=False, - # ) - # key_states, _, _ = fused_rotary_position_embedding( - # key_states, - # None, - # None, - # sin=sin, - # cos=cos, - # position_ids=position_ids, - # use_neox_rotary_style=False, - # ) - # else: - # query_states, key_states, _ = fused_rotary_position_embedding( - # query_states, - # key_states, - # v=None, - # sin=sin, - # cos=cos, - # position_ids=position_ids, - # use_neox_rotary_style=False, - # ) + else: if self.config.use_long_sequence_strategies: cos, sin = self.rotary_emb(seq_len=kv_seq_len) From 9b2ca6bcbc2fa3eb8fcbf2456adba9d313eddd7b Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 17:54:17 +0800 Subject: [PATCH 24/31] add funsion ops --- paddlenlp/transformers/llama/modeling.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 62e39bf0ba97..f726cbe7ff86 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -81,7 +81,6 @@ def swiglu(x, y=None): try: if get_env_device() == "npu": - # from paddle.base import core for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): if lib.endswith(".so"): @@ -365,19 +364,6 @@ def forward(self, hidden_states): if self.config.use_fused_rms_norm: return fusion_ops.fusion_rms_norm(hidden_states, self.weight, self.variance_epsilon) - # if get_env_device() == "npu": - # return core.eager._run_custom_op("rms_norm_npu", hidden_states, self.weight, self.variance_epsilon)[0] - # elif get_env_device() == "xpu": - # try: - # import paddle_xpu_nn # noqa: F821 - - # return paddle_xpu_nn.xpu_rms_norm(hidden_states, self.weight, self.variance_epsilon)[0] - # except ImportError: - # raise NotImplementedError( - # f"Implementation of fused_rms_norm is not 
available on {get_env_device()}. Please install paddle_xpu to use this feature" - # ) - # return rms_norm_fused(hidden_states, self.weight, self.variance_epsilon) - if paddle.in_dynamic_mode(): with paddle.amp.auto_cast(False): hidden_states = hidden_states.astype("float32") From cac0f8e605f094221bd95960f27a7ee4d4b6deae Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 18:13:41 +0800 Subject: [PATCH 25/31] add funsion ops --- paddlenlp/transformers/llama/fusion_ops.py | 189 --------------------- 1 file changed, 189 deletions(-) delete mode 100644 paddlenlp/transformers/llama/fusion_ops.py diff --git a/paddlenlp/transformers/llama/fusion_ops.py b/paddlenlp/transformers/llama/fusion_ops.py deleted file mode 100644 index 96f160534fc7..000000000000 --- a/paddlenlp/transformers/llama/fusion_ops.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import paddle -import paddle.nn.functional as F - -try: - from paddle.incubate.nn.functional import fused_rotary_position_embedding -except ImportError: - fused_rotary_position_embedding = None - -try: - from paddle.incubate.nn.functional import swiglu -except ImportError: - - def swiglu(x, y=None): - if y is None: - x, y = paddle.chunk(x, chunks=2, axis=-1) - return F.silu(x) * y - - -from paddle.utils import try_import - -from paddlenlp.utils.tools import get_env_device - -try: - from paddle.incubate.nn.functional import fused_rotary_position_embedding -except ImportError: - fused_rotary_position_embedding = None -try: - if get_env_device() == "npu": - from paddle.base import core - - for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): - if lib.endswith(".so"): - paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib) - from paddle.nn.functional.flash_attention import flash_attention -except: - flash_attention = None - - -def fusion_rope(query_states, key_states, value_states, hidden_states, position_ids, past_key_value, rotary_emb): - assert past_key_value is None, "fuse rotary not support cache kv for now" - batch_size, seq_length, num_heads, head_dim = query_states.shape - _, kv_seq_len, num_key_value_heads, _ = key_states.shape - cos, sin = rotary_emb(value_states, seq_len=kv_seq_len) - if get_env_device() == "npu": - query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[0] - key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] - else: - # paddle version > 2.6 or develop support q and k/v with different num_heads - paddle_version = float(paddle.__version__[:3]) - if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): - query_states, _, _ = fused_rotary_position_embedding( - query_states, - None, - None, - sin=sin, - cos=cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) - key_states, _, _ = fused_rotary_position_embedding( - key_states, - None, - None, - sin=sin, - cos=cos, 
- position_ids=position_ids, - use_neox_rotary_style=False, - ) - else: - query_states, key_states, _ = fused_rotary_position_embedding( - query_states, - key_states, - v=None, - sin=sin, - cos=cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) - return query_states, key_states - - -def rms_norm_fused(x_in, w, eps): - fused_ln = try_import("fused_ln") - return fused_ln.fused_rms_norm(x_in, w, eps)[0] - - -def fusion_rms_norm(hidden_states, weight, variance_epsilon): - if get_env_device() == "npu": - return core.eager._run_custom_op("rms_norm_npu", hidden_states, weight, variance_epsilon)[0] - elif get_env_device() == "xpu": - try: - import paddle_xpu_nn # noqa: F821 - - return paddle_xpu_nn.xpu_rms_norm(hidden_states, weight, variance_epsilon)[0] - except ImportError: - raise NotImplementedError( - f"Implementation of fused_rms_norm is not available on {get_env_device()}. Please install paddle_xpu to use this feature" - ) - return rms_norm_fused(hidden_states, weight, variance_epsilon) - - -def fusion_flash_attention( - query_states, - config, - key_states, - value_states, - attention_mask, - output_attentions, - alibi=None, - sequence_parallel=False, - reshard_layer=None, - npu_is_casual=False, -): - bsz, q_len, num_heads, head_dim = query_states.shape - _, kv_seq_len, _, _ = value_states.shape - version = paddle.version.full_version - if version != "0.0.0" and version <= "2.5.2": - if alibi is not None: - raise ValueError("Flash Attention doesn't support alibi") - attn_output, attn_weights = flash_attention( - query_states, - key_states, - value_states, - causal=True, - return_softmax=output_attentions, - ) - else: - if alibi is not None: - alibi = alibi.reshape([bsz, num_heads, 1, -1]) - attention_mask = attention_mask.cast(alibi.dtype) + alibi - if get_env_device() == "npu": - attn_output = core.eager._run_custom_op( - "flash_attention_npu", - query_states, - key_states, - value_states, - None, - attention_mask, - 0.0, - attention_mask is None, - True, - False, - npu_is_casual, - )[0] - else: - attn_output = F.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - is_causal=attention_mask is None, - ) - attn_weights = None - - if reshard_layer is not None: - # attn_output shape: [bs, seqlen, num_head/sep, head_dim] - attn_output = reshard_layer( - attn_output, - split_axis=1, - concat_axis=2, - ) - # attn_output shape: [bs, seqlen/sep, num_head, head_dim] - assert ( - config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 - ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" - q_len = q_len // config.sep_parallel_degree - num_heads = num_heads * config.sep_parallel_degree - - if sequence_parallel: - attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) - else: - attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) - return (attn_output, attn_weights) if output_attentions else attn_output From 73866a297c9aeb0e3356e25c9991b083c63fd358 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 18:42:26 +0800 Subject: [PATCH 26/31] add funsion ops --- paddlenlp/transformers/llama/modeling.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index f726cbe7ff86..154dbad6117c 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -55,7 +55,6 @@ def swiglu(x, y=None): ) except: pass -from 
paddle.utils import try_import from paddlenlp.transformers.conversion_utils import ( StateDictNameMapping, @@ -340,11 +339,6 @@ def _expand_2d_mask(mask, dtype, tgt_length): return expanded_mask -def rms_norm_fused(x_in, w, eps): - fused_ln = try_import("fused_ln") - return fused_ln.fused_rms_norm(x_in, w, eps)[0] - - class LlamaRMSNorm(nn.Layer): def __init__(self, config): super().__init__() From d8f19500d7d75235ef4bd5b6841344cf271fba76 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 18:44:22 +0800 Subject: [PATCH 27/31] add funsion ops --- paddlenlp/transformers/llama/modeling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 154dbad6117c..82786f3383b3 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -89,6 +89,8 @@ def swiglu(x, y=None): flash_attention = None from .. import fusion_ops +rms_norm_fused = fusion_ops.fusion_ops + __all__ = [ "LlamaModel", "LlamaPretrainedModel", From 9a2f1c53dc243f16c6e8acb8727c53ceee6f24fc Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 18:45:33 +0800 Subject: [PATCH 28/31] add funsion ops --- paddlenlp/transformers/llama/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 82786f3383b3..634d532753f4 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -89,7 +89,7 @@ def swiglu(x, y=None): flash_attention = None from .. import fusion_ops -rms_norm_fused = fusion_ops.fusion_ops +rms_norm_fused = fusion_ops.rms_norm_fused __all__ = [ "LlamaModel", From df78b71dc327e98ad8e2d2579b5cea3903dcab26 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Tue, 14 May 2024 11:28:02 +0800 Subject: [PATCH 29/31] update --- paddlenlp/transformers/llama/fusion_ops.py | 189 +++++++++++++++++++++ paddlenlp/transformers/llama/modeling.py | 2 +- 2 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 paddlenlp/transformers/llama/fusion_ops.py diff --git a/paddlenlp/transformers/llama/fusion_ops.py b/paddlenlp/transformers/llama/fusion_ops.py new file mode 100644 index 000000000000..96f160534fc7 --- /dev/null +++ b/paddlenlp/transformers/llama/fusion_ops.py @@ -0,0 +1,189 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import paddle +import paddle.nn.functional as F + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.incubate.nn.functional import swiglu +except ImportError: + + def swiglu(x, y=None): + if y is None: + x, y = paddle.chunk(x, chunks=2, axis=-1) + return F.silu(x) * y + + +from paddle.utils import try_import + +from paddlenlp.utils.tools import get_env_device + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None +try: + if get_env_device() == "npu": + from paddle.base import core + + for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): + if lib.endswith(".so"): + paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib) + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + + +def fusion_rope(query_states, key_states, value_states, hidden_states, position_ids, past_key_value, rotary_emb): + assert past_key_value is None, "fuse rotary not support cache kv for now" + batch_size, seq_length, num_heads, head_dim = query_states.shape + _, kv_seq_len, num_key_value_heads, _ = key_states.shape + cos, sin = rotary_emb(value_states, seq_len=kv_seq_len) + if get_env_device() == "npu": + query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[0] + key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] + else: + # paddle version > 2.6 or develop support q and k/v with different num_heads + paddle_version = float(paddle.__version__[:3]) + if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): + query_states, _, _ = fused_rotary_position_embedding( + query_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + key_states, _, _ = fused_rotary_position_embedding( + key_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + query_states, key_states, _ = fused_rotary_position_embedding( + query_states, + key_states, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + return query_states, key_states + + +def rms_norm_fused(x_in, w, eps): + fused_ln = try_import("fused_ln") + return fused_ln.fused_rms_norm(x_in, w, eps)[0] + + +def fusion_rms_norm(hidden_states, weight, variance_epsilon): + if get_env_device() == "npu": + return core.eager._run_custom_op("rms_norm_npu", hidden_states, weight, variance_epsilon)[0] + elif get_env_device() == "xpu": + try: + import paddle_xpu_nn # noqa: F821 + + return paddle_xpu_nn.xpu_rms_norm(hidden_states, weight, variance_epsilon)[0] + except ImportError: + raise NotImplementedError( + f"Implementation of fused_rms_norm is not available on {get_env_device()}. 
Please install paddle_xpu to use this feature" + ) + return rms_norm_fused(hidden_states, weight, variance_epsilon) + + +def fusion_flash_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi=None, + sequence_parallel=False, + reshard_layer=None, + npu_is_casual=False, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + if alibi is not None: + raise ValueError("Flash Attention doesn't support alibi") + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + return_softmax=output_attentions, + ) + else: + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attention_mask = attention_mask.cast(alibi.dtype) + alibi + if get_env_device() == "npu": + attn_output = core.eager._run_custom_op( + "flash_attention_npu", + query_states, + key_states, + value_states, + None, + attention_mask, + 0.0, + attention_mask is None, + True, + False, + npu_is_casual, + )[0] + else: + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + ) + attn_weights = None + + if reshard_layer is not None: + # attn_output shape: [bs, seqlen, num_head/sep, head_dim] + attn_output = reshard_layer( + attn_output, + split_axis=1, + concat_axis=2, + ) + # attn_output shape: [bs, seqlen/sep, num_head, head_dim] + assert ( + config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 + ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" + q_len = q_len // config.sep_parallel_degree + num_heads = num_heads * config.sep_parallel_degree + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 634d532753f4..548a4e0a8cb3 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -87,7 +87,7 @@ def swiglu(x, y=None): from paddle.nn.functional.flash_attention import flash_attention except: flash_attention = None -from .. import fusion_ops +from . import fusion_ops rms_norm_fused = fusion_ops.rms_norm_fused From 8c3cd0d2f307b38391770bde8e8bb09acf0a6b62 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Tue, 14 May 2024 11:30:17 +0800 Subject: [PATCH 30/31] Update fusion_ops.py --- paddlenlp/transformers/fusion_ops.py | 188 --------------------------- 1 file changed, 188 deletions(-) diff --git a/paddlenlp/transformers/fusion_ops.py b/paddlenlp/transformers/fusion_ops.py index 96f160534fc7..8b137891791f 100644 --- a/paddlenlp/transformers/fusion_ops.py +++ b/paddlenlp/transformers/fusion_ops.py @@ -1,189 +1 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os - -import paddle -import paddle.nn.functional as F - -try: - from paddle.incubate.nn.functional import fused_rotary_position_embedding -except ImportError: - fused_rotary_position_embedding = None - -try: - from paddle.incubate.nn.functional import swiglu -except ImportError: - - def swiglu(x, y=None): - if y is None: - x, y = paddle.chunk(x, chunks=2, axis=-1) - return F.silu(x) * y - - -from paddle.utils import try_import - -from paddlenlp.utils.tools import get_env_device - -try: - from paddle.incubate.nn.functional import fused_rotary_position_embedding -except ImportError: - fused_rotary_position_embedding = None -try: - if get_env_device() == "npu": - from paddle.base import core - - for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): - if lib.endswith(".so"): - paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib) - from paddle.nn.functional.flash_attention import flash_attention -except: - flash_attention = None - - -def fusion_rope(query_states, key_states, value_states, hidden_states, position_ids, past_key_value, rotary_emb): - assert past_key_value is None, "fuse rotary not support cache kv for now" - batch_size, seq_length, num_heads, head_dim = query_states.shape - _, kv_seq_len, num_key_value_heads, _ = key_states.shape - cos, sin = rotary_emb(value_states, seq_len=kv_seq_len) - if get_env_device() == "npu": - query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[0] - key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] - else: - # paddle version > 2.6 or develop support q and k/v with different num_heads - paddle_version = float(paddle.__version__[:3]) - if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): - query_states, _, _ = fused_rotary_position_embedding( - query_states, - None, - None, - sin=sin, - cos=cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) - key_states, _, _ = fused_rotary_position_embedding( - key_states, - None, - None, - sin=sin, - cos=cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) - else: - query_states, key_states, _ = fused_rotary_position_embedding( - query_states, - key_states, - v=None, - sin=sin, - cos=cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) - return query_states, key_states - - -def rms_norm_fused(x_in, w, eps): - fused_ln = try_import("fused_ln") - return fused_ln.fused_rms_norm(x_in, w, eps)[0] - - -def fusion_rms_norm(hidden_states, weight, variance_epsilon): - if get_env_device() == "npu": - return core.eager._run_custom_op("rms_norm_npu", hidden_states, weight, variance_epsilon)[0] - elif get_env_device() == "xpu": - try: - import paddle_xpu_nn # noqa: F821 - - return paddle_xpu_nn.xpu_rms_norm(hidden_states, weight, variance_epsilon)[0] - except ImportError: - raise NotImplementedError( - f"Implementation of fused_rms_norm is not available on {get_env_device()}. 
Please install paddle_xpu to use this feature" - ) - return rms_norm_fused(hidden_states, weight, variance_epsilon) - - -def fusion_flash_attention( - query_states, - config, - key_states, - value_states, - attention_mask, - output_attentions, - alibi=None, - sequence_parallel=False, - reshard_layer=None, - npu_is_casual=False, -): - bsz, q_len, num_heads, head_dim = query_states.shape - _, kv_seq_len, _, _ = value_states.shape - version = paddle.version.full_version - if version != "0.0.0" and version <= "2.5.2": - if alibi is not None: - raise ValueError("Flash Attention doesn't support alibi") - attn_output, attn_weights = flash_attention( - query_states, - key_states, - value_states, - causal=True, - return_softmax=output_attentions, - ) - else: - if alibi is not None: - alibi = alibi.reshape([bsz, num_heads, 1, -1]) - attention_mask = attention_mask.cast(alibi.dtype) + alibi - if get_env_device() == "npu": - attn_output = core.eager._run_custom_op( - "flash_attention_npu", - query_states, - key_states, - value_states, - None, - attention_mask, - 0.0, - attention_mask is None, - True, - False, - npu_is_casual, - )[0] - else: - attn_output = F.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - is_causal=attention_mask is None, - ) - attn_weights = None - - if reshard_layer is not None: - # attn_output shape: [bs, seqlen, num_head/sep, head_dim] - attn_output = reshard_layer( - attn_output, - split_axis=1, - concat_axis=2, - ) - # attn_output shape: [bs, seqlen/sep, num_head, head_dim] - assert ( - config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 - ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" - q_len = q_len // config.sep_parallel_degree - num_heads = num_heads * config.sep_parallel_degree - - if sequence_parallel: - attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) - else: - attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) - return (attn_output, attn_weights) if output_attentions else attn_output From 0a6d6b8196d9588401af32a6e9b63e6314352fd8 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Tue, 14 May 2024 11:32:20 +0800 Subject: [PATCH 31/31] update --- paddlenlp/transformers/fusion_ops.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 paddlenlp/transformers/fusion_ops.py diff --git a/paddlenlp/transformers/fusion_ops.py b/paddlenlp/transformers/fusion_ops.py deleted file mode 100644 index 8b137891791f..000000000000 --- a/paddlenlp/transformers/fusion_ops.py +++ /dev/null @@ -1 +0,0 @@ -
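
Illustrative usage sketch (not part of any patch above): after PATCH 29-31 the fused helpers live in paddlenlp/transformers/llama/fusion_ops.py and the old top-level paddlenlp/transformers/fusion_ops.py is removed. The snippet below is a minimal smoke test of the relocated fusion_rms_norm against a plain-Paddle RMSNorm reference. It assumes a build with this series applied and an environment where the fused kernel can actually dispatch (the fused_ln custom op on GPU, or the NPU/XPU branches shown in the diff); the tensor shapes, epsilon, and tolerance check are arbitrary illustration values, not values taken from the patches.

    # Minimal sketch, assuming the patch series is applied and a fused RMSNorm
    # kernel is available on the current device (fused_ln / NPU / XPU).
    import paddle

    # New module location introduced by PATCH 29 and finalized by PATCH 30/31.
    from paddlenlp.transformers.llama import fusion_ops


    def reference_rms_norm(x, weight, eps):
        # Plain-Paddle RMSNorm: x * rsqrt(mean(x^2) + eps) * weight,
        # computed in float32 and cast back to the input dtype.
        x32 = x.astype("float32")
        variance = x32.pow(2).mean(-1, keepdim=True)
        return (x32 * paddle.rsqrt(variance + eps) * weight).astype(x.dtype)


    # Illustrative shapes: [batch, seq_len, hidden_size].
    x = paddle.randn([2, 8, 64], dtype="float32")
    w = paddle.ones([64], dtype="float32")
    eps = 1e-6

    ref = reference_rms_norm(x, w, eps)
    fused = fusion_ops.fusion_rms_norm(x, w, eps)

    # Expected to be close to zero when the fused kernel is present;
    # on a device without it, fusion_rms_norm raises instead.
    print(float((ref - fused).abs().max()))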