From a5ed9ed35056b273a1d18095acc96778d6bca962 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Thu, 9 May 2024 12:03:32 +0800 Subject: [PATCH 01/31] update --- paddlenlp/transformers/llama/modeling.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index ac3074aae7ce..be88d6af74dc 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -98,9 +98,6 @@ def swiglu(x, y=None): ] -npu_is_casual = False - - def _get_interleave(n): def _get_interleave_power_of_2(n): start = 2 ** (-(2 ** -(math.log2(n) - 3))) @@ -212,6 +209,7 @@ def scaled_dot_product_attention( alibi=None, sequence_parallel=False, reshard_layer=None, + npu_is_casual=False, ): bsz, q_len, num_heads, head_dim = query_states.shape _, kv_seq_len, _, _ = value_states.shape @@ -851,6 +849,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, alibi: Optional[paddle.Tensor] = None, + npu_is_casual: bool = False, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) @@ -1078,6 +1077,7 @@ def forward( alibi, self.sequence_parallel, reshard_layer=self.reshard_layer, + npu_is_casual=npu_is_casual, ) if output_attentions: attn_output, attn_weights = outputs @@ -1130,6 +1130,7 @@ def forward( past_key_value: Optional[Tuple[paddle.Tensor]] = None, use_cache: Optional[bool] = False, alibi: Optional[paddle.Tensor] = None, + npu_is_casual: bool = False, ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: """ Args: @@ -1177,6 +1178,7 @@ def forward( output_attentions, use_cache, alibi, + npu_is_casual=npu_is_casual, ) if type(outputs) is tuple: @@ -1614,6 +1616,7 @@ def forward( attention_mask = self._prepare_decoder_attention_mask( attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype ) # [bs, 1, seq_len, seq_len] + is_casual = False if self.config.use_flash_attention: is_casual = is_casual_mask(attention_mask) if get_env_device() != "npu": @@ -1658,6 +1661,7 @@ def forward( past_key_value, use_cache, alibi=alibi, + npu_is_casual=is_casual, ) # NOTE: clear outdate cache after it has been used for memory saving From bd0aa8773f6df5b4b783b6561b6ec23efdbaf006 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Thu, 9 May 2024 13:33:51 +0800 Subject: [PATCH 02/31] add llama-npu-opt-script --- llm/llama/npu/dev_opt_lora.sh | 86 +++++++++++++++++++++++++++++++++ llm/llama/npu/dev_opt_ppt.sh | 91 +++++++++++++++++++++++++++++++++++ llm/llama/npu/dev_opt_sft.sh | 81 +++++++++++++++++++++++++++++++ 3 files changed, 258 insertions(+) create mode 100644 llm/llama/npu/dev_opt_lora.sh create mode 100644 llm/llama/npu/dev_opt_ppt.sh create mode 100644 llm/llama/npu/dev_opt_sft.sh diff --git a/llm/llama/npu/dev_opt_lora.sh b/llm/llama/npu/dev_opt_lora.sh new file mode 100644 index 000000000000..09719c572eee --- /dev/null +++ b/llm/llama/npu/dev_opt_lora.sh @@ -0,0 +1,86 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +max_steps=${1:-1000} +lock_seed_flag=${2:-close} +if [[ ${lock_seed_flag} =~ "open_lock_seed" ]];then + export npu_deterministic=true + export ACL_OP_DETERMINISTIC=true + export ACL_OPT_DETERMINISTIC=true + export HCCL_DETERMINISTIC=true +fi +echo lock_seed_flag +echo $lock_seed_flag +echo npu_deterministic ACL_OP_DETERMINISTIC ACL_OPT_DETERMINISTIC HCCL_DETERMINISTIC +echo $npu_deterministic $ACL_OP_DETERMINISTIC $ACL_OPT_DETERMINISTIC $HCCL_DETERMINISTIC + +export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +export FLAGS_use_stride_kernel=0 +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +export FLAGS_NPU_MC2=1 +export MC2_Recompute=1 +export MC2=1 +export FLAGS_allocator_strategy=naive_best_fit +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +#240411新增 +# export MC2=1 + +rm -rf lora_bf16_llama_N1C8 +rm -rf output/lora_bf16_llama_N1C8 +ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +export PYTHONPATH=../../:$PYTHONPATH +python -u -m paddle.distributed.launch \ + --devices "0,1,2,3,4,5,6,7" \ + --log_dir "./lora_bf16_llama_N1C8" \ + ../finetune_generation.py \ + --device "npu" \ + --model_name_or_path "meta-llama/Llama-2-13b" \ + --dataset_name_or_path "data/" \ + --output_dir "./output/lora_bf16_llama_N1C8" \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 16 \ + --per_device_eval_batch_size 1 \ + --eval_accumulation_steps 1 \ + --max_steps ${max_steps} \ + --decay_steps 2000 \ + --learning_rate 3e-06 \ + --warmup_steps 2 \ + --save_steps 1000 \ + --logging_steps 1 \ + --evaluation_strategy "epoch" \ + --src_length 1024 \ + --max_length 4096 \ + --bf16 true \ + --fp16_opt_level "O2" \ + --do_train true \ + --disable_tqdm true \ + --eval_with_do_generation false \ + --metric_for_best_model "accuracy" \ + --recompute false \ + --tensor_parallel_degree 8 \ + --pipeline_parallel_degree 1 \ + --zero_padding 0 \ + --sequence_parallel 1 \ + --amp_master_grad true \ + --fuse_attention_qkv true \ + --fuse_attention_ffn true \ + --use_flash_attention 1 \ + --use_fused_rope 1 \ + --use_fused_rms_norm 1 \ + --lora true \ + --lora_rank 32 \ + --pad_to_multiple_of 4096 \ No newline at end of file diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/dev_opt_ppt.sh new file mode 100644 index 000000000000..b0cfb12f3223 --- /dev/null +++ b/llm/llama/npu/dev_opt_ppt.sh @@ -0,0 +1,91 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +max_steps=${1:-800} + +set -x +ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9 +rm -rf ./log_8.0 +rm -rf output +export PYTHONPATH=../:$PYTHONPATH +export MC2=1 +export GLOG_v=0 +export FLAGS_npu_storage_format=1 +export HCCL_INTRA_PCIE_EHABLE=0 +export HCCL_INTRA_ROCE_ENABLE=1 +export FLAGS_allocator_strategy=naive_best_fit +export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +export FLAGS_NPU_MC2=1 +export MC2_Recompute=1 +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS + +#240411新增 +export FLAGS_use_stride_kernel=0 +export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE +export MULTI_STREAM_MEMORY_REUSE=1 + +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +python -u -m paddle.distributed.launch \ + --log_dir "./log_8.0" \ + run_pretrain.py \ + --model_name_or_path "meta-llama/Llama-2-13b" \ + --tokenizer_name_or_path "meta-llama/Llama-2-13b" \ + --input_dir "./pre-data" \ + --output_dir "./output" \ + --split 949,50,1 \ + --max_seq_length 4096 \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 32 \ + --per_device_eval_batch_size 1 \ + --use_flash_attention 1 \ + --use_fused_rms_norm 1 \ + --virtual_pp_degree 1 \ + --learning_rate 0.00001 \ + --min_learning_rate 0.000001 \ + --max_steps ${max_steps} \ + --decay_steps 2000 \ + --save_steps 2000 \ + --seed 100 \ + --weight_decay 0.01 \ + --warmup_steps 20 \ + --max_grad_norm 1.0 \ + --logging_steps 1 \ + --dataloader_num_workers 1 \ + --eval_steps 1001 \ + --tensor_parallel_degree 4 \ + --disable_tqdm true \ + --continue_training 0 \ + --do_train \ + --device "npu" \ + --enable_linear_fused_grad_add false \ + --fuse_attention_qkv true \ + --fuse_attention_ffn true \ + --use_fused_rope true \ + --recompute_use_reentrant true \ + --data_cache "./data_cache" \ + --bf16 \ + --fp16_opt_level "O2" \ + --amp_master_grad \ + --load_sharded_model true \ + --save_sharded_model true \ + --pipeline_parallel_degree 1 \ + --ignore_data_skip 0 \ + --force_reshard_pp true \ + --tensor_parallel_config "enable_mp_async_allreduce enable_mp_skip_c_identity" \ + --sequence_parallel 1 \ + --pipeline_parallel_config "disable_partial_send_recv" \ + --sharding "stage1" \ + --sharding_parallel_degree 2 diff --git a/llm/llama/npu/dev_opt_sft.sh b/llm/llama/npu/dev_opt_sft.sh new file mode 100644 index 000000000000..3a72d24721b6 --- /dev/null +++ b/llm/llama/npu/dev_opt_sft.sh @@ -0,0 +1,81 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +export FLAGS_use_stride_kernel=0 +export FLAGS_npu_storage_format=1 +export HCCL_INTRA_PCIE_EHABLE=0 +export HCCL_INTRA_ROCE_ENABLE=1 +export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" + +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +export GLOG_v=0 +export FLAGS_NPU_MC2=1 +export MC2_Recompute=1 +export MC2=1 +export FLAGS_allocator_strategy=naive_best_fit +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +#240411新增 +# export MC2=1 +export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE +export MULTI_STREAM_MEMORY_REUSE=1 + +export PYTHONPATH=../../:$PYTHONPATH +rm -rf sft_bf16_llama_N1C8 +rm -rf output/sft_bf16_llama_N1C8 +ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +python -u -m paddle.distributed.launch \ + --devices "0,1,2,3,4,5,6,7" \ + --log_dir "./sft_bf16_llama_N1C8" \ + ../finetune_generation.py \ + --device "npu" \ + --model_name_or_path "meta-llama/Llama-2-13b" \ + --dataset_name_or_path "data/" \ + --output_dir "./output/sft_bf16_llama_N1C8" \ + --logging_dir "./sft_logs" \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 32 \ + --per_device_eval_batch_size 1 \ + --eval_accumulation_steps 1 \ + --max_steps 2000 \ + --learning_rate 3e-06 \ + --warmup_steps 2 \ + --save_steps 1000 \ + --logging_steps 1 \ + --evaluation_strategy "epoch" \ + --src_length 1024 \ + --max_length 4096 \ + --fp16 true \ + --fp16_opt_level "O2" \ + --do_train true \ + --disable_tqdm true \ + --eval_with_do_generation false \ + --metric_for_best_model "accuracy" \ + --recompute false \ + --tensor_parallel_degree 4 \ + --pipeline_parallel_degree 1 \ + --zero_padding 0 \ + --amp_master_grad true \ + --fuse_attention_qkv true \ + --fuse_attention_ffn true \ + --sequence_parallel 1 \ + --use_flash_attention 1 \ + --use_fused_rope 1 \ + --use_fused_rms_norm 1 \ + --sharding_parallel_degree 2 \ + --pad_to_multiple_of 4096 \ + --sharding "stage1" \ + --sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \ No newline at end of file From cc2413268e35b31f23c64c202845ec73d4688586 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Thu, 9 May 2024 13:52:27 +0800 Subject: [PATCH 03/31] Update dev_opt_lora.sh --- llm/llama/npu/dev_opt_lora.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm/llama/npu/dev_opt_lora.sh b/llm/llama/npu/dev_opt_lora.sh index 09719c572eee..8da29cbbb788 100644 --- a/llm/llama/npu/dev_opt_lora.sh +++ b/llm/llama/npu/dev_opt_lora.sh @@ -42,7 +42,7 @@ rm -rf lora_bf16_llama_N1C8 rm -rf output/lora_bf16_llama_N1C8 ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9 -export PYTHONPATH=../../:$PYTHONPATH +export PYTHONPATH=../../../:$PYTHONPATH python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ --log_dir "./lora_bf16_llama_N1C8" \ @@ -83,4 +83,4 @@ python -u -m paddle.distributed.launch \ --use_fused_rms_norm 1 \ --lora true \ --lora_rank 32 \ - --pad_to_multiple_of 4096 \ No newline at end of file + --pad_to_multiple_of 4096 From 036d03c401b811c7295ac47ff1dc47bbef2a8fa2 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Thu, 9 May 2024 13:52:57 +0800 Subject: [PATCH 04/31] Update dev_opt_ppt.sh --- llm/llama/npu/dev_opt_ppt.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/dev_opt_ppt.sh index b0cfb12f3223..81365289bef7 100644 --- a/llm/llama/npu/dev_opt_ppt.sh +++ b/llm/llama/npu/dev_opt_ppt.sh @@ -18,7 +18,7 @@ set -x ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9 rm -rf ./log_8.0 rm -rf output -export PYTHONPATH=../:$PYTHONPATH +export PYTHONPATH=../../../:$PYTHONPATH export MC2=1 export GLOG_v=0 export FLAGS_npu_storage_format=1 From 8dd2d020ac6c180368952a2f2f62426a1b16ad8f Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Thu, 9 May 2024 13:54:05 +0800 Subject: [PATCH 05/31] Update dev_opt_lora.sh --- llm/llama/npu/dev_opt_lora.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llama/npu/dev_opt_lora.sh b/llm/llama/npu/dev_opt_lora.sh index 8da29cbbb788..981a833d9c56 100644 --- a/llm/llama/npu/dev_opt_lora.sh +++ b/llm/llama/npu/dev_opt_lora.sh @@ -46,7 +46,7 @@ export PYTHONPATH=../../../:$PYTHONPATH python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ --log_dir "./lora_bf16_llama_N1C8" \ - ../finetune_generation.py \ + ../../finetune_generation.py \ --device "npu" \ --model_name_or_path "meta-llama/Llama-2-13b" \ --dataset_name_or_path "data/" \ From 96e69aa7faa8c58f5e003bdc658ab7a970dad1a6 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Thu, 9 May 2024 13:54:33 +0800 Subject: [PATCH 06/31] Update dev_opt_ppt.sh --- llm/llama/npu/dev_opt_ppt.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/dev_opt_ppt.sh index 81365289bef7..4db9d6a728a1 100644 --- a/llm/llama/npu/dev_opt_ppt.sh +++ b/llm/llama/npu/dev_opt_ppt.sh @@ -40,7 +40,7 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh python -u -m paddle.distributed.launch \ --log_dir "./log_8.0" \ - run_pretrain.py \ + ../run_pretrain.py \ --model_name_or_path "meta-llama/Llama-2-13b" \ --tokenizer_name_or_path "meta-llama/Llama-2-13b" \ --input_dir "./pre-data" \ From a35ba59e291b631b5ee37d40da07fed8a3c2561d Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Thu, 9 May 2024 13:55:20 +0800 Subject: [PATCH 07/31] Update dev_opt_sft.sh --- llm/llama/npu/dev_opt_sft.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llm/llama/npu/dev_opt_sft.sh b/llm/llama/npu/dev_opt_sft.sh index 3a72d24721b6..bce6867d234a 100644 --- a/llm/llama/npu/dev_opt_sft.sh +++ b/llm/llama/npu/dev_opt_sft.sh @@ -32,7 +32,7 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE export MULTI_STREAM_MEMORY_REUSE=1 -export PYTHONPATH=../../:$PYTHONPATH +export PYTHONPATH=../../../:$PYTHONPATH rm -rf sft_bf16_llama_N1C8 rm -rf output/sft_bf16_llama_N1C8 ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 @@ -40,7 +40,7 @@ ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill - python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ --log_dir "./sft_bf16_llama_N1C8" \ - ../finetune_generation.py \ + ../../finetune_generation.py \ --device "npu" \ --model_name_or_path "meta-llama/Llama-2-13b" \ --dataset_name_or_path "data/" \ @@ -78,4 +78,4 @@ python -u -m paddle.distributed.launch \ --sharding_parallel_degree 2 \ --pad_to_multiple_of 4096 \ --sharding "stage1" \ - --sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \ No newline at end of file + --sharding_parallel_config 
"enable_stage1_tensor_fusion enable_stage1_overlap" From 68388a7f09c5ed8197161f9f9b23044c2ade2b3b Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Sat, 11 May 2024 18:48:02 +0800 Subject: [PATCH 08/31] Rename dev_opt_lora.sh to llama_npu_opt_lora.sh --- llm/llama/npu/{dev_opt_lora.sh => llama_npu_opt_lora.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llm/llama/npu/{dev_opt_lora.sh => llama_npu_opt_lora.sh} (100%) diff --git a/llm/llama/npu/dev_opt_lora.sh b/llm/llama/npu/llama_npu_opt_lora.sh similarity index 100% rename from llm/llama/npu/dev_opt_lora.sh rename to llm/llama/npu/llama_npu_opt_lora.sh From fee8f04a006de61a589a7ff9888b8ee414f5a2e5 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Sat, 11 May 2024 18:48:41 +0800 Subject: [PATCH 09/31] Update dev_opt_ppt.sh --- llm/llama/npu/dev_opt_ppt.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/dev_opt_ppt.sh index 4db9d6a728a1..d3a082f49c0d 100644 --- a/llm/llama/npu/dev_opt_ppt.sh +++ b/llm/llama/npu/dev_opt_ppt.sh @@ -31,7 +31,6 @@ export MC2_Recompute=1 unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS -#240411新增 export FLAGS_use_stride_kernel=0 export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE export MULTI_STREAM_MEMORY_REUSE=1 From 783de3b70952f519d56709110fbe2a227511f34a Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Sat, 11 May 2024 18:49:19 +0800 Subject: [PATCH 10/31] Rename dev_opt_ppt.sh to llama_npu_opt_ppt.sh --- llm/llama/npu/{dev_opt_ppt.sh => llama_npu_opt_ppt.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llm/llama/npu/{dev_opt_ppt.sh => llama_npu_opt_ppt.sh} (100%) diff --git a/llm/llama/npu/dev_opt_ppt.sh b/llm/llama/npu/llama_npu_opt_ppt.sh similarity index 100% rename from llm/llama/npu/dev_opt_ppt.sh rename to llm/llama/npu/llama_npu_opt_ppt.sh From 10f94155fdce1397389e7daa66e80b52f29eceef Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Sat, 11 May 2024 18:50:01 +0800 Subject: [PATCH 11/31] Update llama_npu_opt_lora.sh --- llm/llama/npu/llama_npu_opt_lora.sh | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/llm/llama/npu/llama_npu_opt_lora.sh b/llm/llama/npu/llama_npu_opt_lora.sh index 981a833d9c56..fd1b004a8af5 100644 --- a/llm/llama/npu/llama_npu_opt_lora.sh +++ b/llm/llama/npu/llama_npu_opt_lora.sh @@ -13,17 +13,6 @@ # limitations under the License. 
max_steps=${1:-1000} -lock_seed_flag=${2:-close} -if [[ ${lock_seed_flag} =~ "open_lock_seed" ]];then - export npu_deterministic=true - export ACL_OP_DETERMINISTIC=true - export ACL_OPT_DETERMINISTIC=true - export HCCL_DETERMINISTIC=true -fi -echo lock_seed_flag -echo $lock_seed_flag -echo npu_deterministic ACL_OP_DETERMINISTIC ACL_OPT_DETERMINISTIC HCCL_DETERMINISTIC -echo $npu_deterministic $ACL_OP_DETERMINISTIC $ACL_OPT_DETERMINISTIC $HCCL_DETERMINISTIC export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" export FLAGS_use_stride_kernel=0 @@ -35,13 +24,10 @@ export MC2=1 export FLAGS_allocator_strategy=naive_best_fit source /usr/local/Ascend/ascend-toolkit/set_env.sh -#240411新增 -# export MC2=1 rm -rf lora_bf16_llama_N1C8 rm -rf output/lora_bf16_llama_N1C8 -ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 -ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9 export PYTHONPATH=../../../:$PYTHONPATH python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ From f3d96e519e7907f38ae1dff3d94bd77c8698f8ae Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Sat, 11 May 2024 18:51:04 +0800 Subject: [PATCH 12/31] Update and rename dev_opt_sft.sh to llama_npu_opt_sft.sh --- llm/llama/npu/{dev_opt_sft.sh => llama_npu_opt_sft.sh} | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) rename llm/llama/npu/{dev_opt_sft.sh => llama_npu_opt_sft.sh} (93%) diff --git a/llm/llama/npu/dev_opt_sft.sh b/llm/llama/npu/llama_npu_opt_sft.sh similarity index 93% rename from llm/llama/npu/dev_opt_sft.sh rename to llm/llama/npu/llama_npu_opt_sft.sh index bce6867d234a..786e6cf835aa 100644 --- a/llm/llama/npu/dev_opt_sft.sh +++ b/llm/llama/npu/llama_npu_opt_sft.sh @@ -27,16 +27,13 @@ export MC2=1 export FLAGS_allocator_strategy=naive_best_fit source /usr/local/Ascend/ascend-toolkit/set_env.sh -#240411新增 -# export MC2=1 export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE export MULTI_STREAM_MEMORY_REUSE=1 export PYTHONPATH=../../../:$PYTHONPATH rm -rf sft_bf16_llama_N1C8 rm -rf output/sft_bf16_llama_N1C8 -ps aux | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9 -ps aux | grep "run_pretrain.py" | grep -v grep | awk '{print $2}' | xargs kill -9 +ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9 python -u -m paddle.distributed.launch \ --devices "0,1,2,3,4,5,6,7" \ --log_dir "./sft_bf16_llama_N1C8" \ From 6771aa9163162a30e87c75fe0c13b6ed0ff6db31 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 14:51:39 +0800 Subject: [PATCH 13/31] add funsion ops --- paddlenlp/transformers/llama/fusion_ops.py | 120 +++++++++++++++++++++ paddlenlp/transformers/llama/modeling.py | 72 +++---------- 2 files changed, 134 insertions(+), 58 deletions(-) create mode 100644 paddlenlp/transformers/llama/fusion_ops.py diff --git a/paddlenlp/transformers/llama/fusion_ops.py b/paddlenlp/transformers/llama/fusion_ops.py new file mode 100644 index 000000000000..4bc8ea2d0bd2 --- /dev/null +++ b/paddlenlp/transformers/llama/fusion_ops.py @@ -0,0 +1,120 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle +import paddle.nn.functional as F + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.incubate.nn.functional import swiglu +except ImportError: + + def swiglu(x, y=None): + if y is None: + x, y = paddle.chunk(x, chunks=2, axis=-1) + return F.silu(x) * y + + +from paddlenlp.utils.tools import get_env_device + +try: + if get_env_device() == "npu": + from paddle.base import core + + for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): + if lib.endswith(".so"): + paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib) + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + + +def fusion_flash_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi=None, + sequence_parallel=False, + reshard_layer=None, + npu_is_casual=False, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + if alibi is not None: + raise ValueError("Flash Attention doesn't support alibi") + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + return_softmax=output_attentions, + ) + else: + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attention_mask = attention_mask.cast(alibi.dtype) + alibi + if get_env_device() == "npu": + attn_output = core.eager._run_custom_op( + "flash_attention_npu", + query_states, + key_states, + value_states, + None, + attention_mask, + 0.0, + attention_mask is None, + True, + False, + npu_is_casual, + )[0] + else: + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + ) + attn_weights = None + + if reshard_layer is not None: + # attn_output shape: [bs, seqlen, num_head/sep, head_dim] + attn_output = reshard_layer( + attn_output, + split_axis=1, + concat_axis=2, + ) + # attn_output shape: [bs, seqlen/sep, num_head, head_dim] + assert ( + config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 + ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" + q_len = q_len // config.sep_parallel_degree + num_heads = num_heads * config.sep_parallel_degree + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index be88d6af74dc..76e1cb80c59e 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -89,6 +89,7 @@ def swiglu(x, y=None): from paddle.nn.functional.flash_attention import flash_attention except: flash_attention = None 
+from funsion_ops import fusion_flash_attention __all__ = [ "LlamaModel", @@ -215,67 +216,22 @@ def scaled_dot_product_attention( _, kv_seq_len, _, _ = value_states.shape if config.use_flash_attention and flash_attention: + fusion_flash_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi, + sequence_parallel, + reshard_layer, + npu_is_casual, + ) + # Paddle Flash Attention input [ bz, seqlen, nhead, head_dim] # Torch Flash Attention input [ bz, nhead, seqlen, head_dim] - version = paddle.version.full_version - if version != "0.0.0" and version <= "2.5.2": - if alibi is not None: - raise ValueError("Flash Attention doesn't support alibi") - attn_output, attn_weights = flash_attention( - query_states, - key_states, - value_states, - causal=True, - return_softmax=output_attentions, - ) - else: - if alibi is not None: - alibi = alibi.reshape([bsz, num_heads, 1, -1]) - attention_mask = attention_mask.cast(alibi.dtype) + alibi - if get_env_device() == "npu": - attn_output = core.eager._run_custom_op( - "flash_attention_npu", - query_states, - key_states, - value_states, - None, - attention_mask, - 0.0, - attention_mask is None, - True, - False, - npu_is_casual, - )[0] - else: - attn_output = F.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - is_causal=attention_mask is None, - ) - attn_weights = None - - if reshard_layer is not None: - # attn_output shape: [bs, seqlen, num_head/sep, head_dim] - attn_output = reshard_layer( - attn_output, - split_axis=1, - concat_axis=2, - ) - # attn_output shape: [bs, seqlen/sep, num_head, head_dim] - assert ( - config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 - ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" - q_len = q_len // config.sep_parallel_degree - num_heads = num_heads * config.sep_parallel_degree - - if sequence_parallel: - attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) - else: - attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) - return (attn_output, attn_weights) if output_attentions else attn_output else: # [ bz, seqlen, nhead, head_dim] -> [bs, nhead, seq_len, head_dim] query_states = paddle.transpose(query_states, [0, 2, 1, 3]) From 61dc79c70a44ac08b7b8179bd226fad51785f73d Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 15:04:57 +0800 Subject: [PATCH 14/31] add funsion ops --- paddlenlp/transformers/llama/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 76e1cb80c59e..47fc0adb60e0 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -89,7 +89,7 @@ def swiglu(x, y=None): from paddle.nn.functional.flash_attention import flash_attention except: flash_attention = None -from funsion_ops import fusion_flash_attention +from fusion_ops import fusion_flash_attention __all__ = [ "LlamaModel", From 558200f62363fb47bd4d60df59e4c3328f8ce9a0 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 15:10:25 +0800 Subject: [PATCH 15/31] add funsion ops --- paddlenlp/transformers/llama/modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 47fc0adb60e0..306258ae4aed 100755 --- 
a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -89,7 +89,7 @@ def swiglu(x, y=None): from paddle.nn.functional.flash_attention import flash_attention except: flash_attention = None -from fusion_ops import fusion_flash_attention +import fusion_ops __all__ = [ "LlamaModel", @@ -216,7 +216,7 @@ def scaled_dot_product_attention( _, kv_seq_len, _, _ = value_states.shape if config.use_flash_attention and flash_attention: - fusion_flash_attention( + fusion_ops.fusion_flash_attention( query_states, config, key_states, From f387c3007669885d58391677b6c0a963e9f1c8b5 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 15:38:26 +0800 Subject: [PATCH 16/31] add funsion ops --- paddlenlp/transformers/llama/fusion_ops.py | 120 --------------------- 1 file changed, 120 deletions(-) delete mode 100644 paddlenlp/transformers/llama/fusion_ops.py diff --git a/paddlenlp/transformers/llama/fusion_ops.py b/paddlenlp/transformers/llama/fusion_ops.py deleted file mode 100644 index 4bc8ea2d0bd2..000000000000 --- a/paddlenlp/transformers/llama/fusion_ops.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import paddle -import paddle.nn.functional as F - -try: - from paddle.incubate.nn.functional import fused_rotary_position_embedding -except ImportError: - fused_rotary_position_embedding = None - -try: - from paddle.incubate.nn.functional import swiglu -except ImportError: - - def swiglu(x, y=None): - if y is None: - x, y = paddle.chunk(x, chunks=2, axis=-1) - return F.silu(x) * y - - -from paddlenlp.utils.tools import get_env_device - -try: - if get_env_device() == "npu": - from paddle.base import core - - for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): - if lib.endswith(".so"): - paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib) - from paddle.nn.functional.flash_attention import flash_attention -except: - flash_attention = None - - -def fusion_flash_attention( - query_states, - config, - key_states, - value_states, - attention_mask, - output_attentions, - alibi=None, - sequence_parallel=False, - reshard_layer=None, - npu_is_casual=False, -): - bsz, q_len, num_heads, head_dim = query_states.shape - _, kv_seq_len, _, _ = value_states.shape - version = paddle.version.full_version - if version != "0.0.0" and version <= "2.5.2": - if alibi is not None: - raise ValueError("Flash Attention doesn't support alibi") - attn_output, attn_weights = flash_attention( - query_states, - key_states, - value_states, - causal=True, - return_softmax=output_attentions, - ) - else: - if alibi is not None: - alibi = alibi.reshape([bsz, num_heads, 1, -1]) - attention_mask = attention_mask.cast(alibi.dtype) + alibi - if get_env_device() == "npu": - attn_output = core.eager._run_custom_op( - "flash_attention_npu", - query_states, - key_states, - value_states, - None, - attention_mask, - 0.0, - attention_mask is None, - 
True, - False, - npu_is_casual, - )[0] - else: - attn_output = F.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - is_causal=attention_mask is None, - ) - attn_weights = None - - if reshard_layer is not None: - # attn_output shape: [bs, seqlen, num_head/sep, head_dim] - attn_output = reshard_layer( - attn_output, - split_axis=1, - concat_axis=2, - ) - # attn_output shape: [bs, seqlen/sep, num_head, head_dim] - assert ( - config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 - ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" - q_len = q_len // config.sep_parallel_degree - num_heads = num_heads * config.sep_parallel_degree - - if sequence_parallel: - attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) - else: - attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) - return (attn_output, attn_weights) if output_attentions else attn_output From a12947b603962e25dd9c0c4eab77ec24cc64c2a8 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 15:39:43 +0800 Subject: [PATCH 17/31] add funsion ops --- paddlenlp/transformers/fusion_ops.py | 120 +++++++++++++++++++++++ paddlenlp/transformers/llama/modeling.py | 2 +- 2 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 paddlenlp/transformers/fusion_ops.py diff --git a/paddlenlp/transformers/fusion_ops.py b/paddlenlp/transformers/fusion_ops.py new file mode 100644 index 000000000000..4bc8ea2d0bd2 --- /dev/null +++ b/paddlenlp/transformers/fusion_ops.py @@ -0,0 +1,120 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import paddle +import paddle.nn.functional as F + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.incubate.nn.functional import swiglu +except ImportError: + + def swiglu(x, y=None): + if y is None: + x, y = paddle.chunk(x, chunks=2, axis=-1) + return F.silu(x) * y + + +from paddlenlp.utils.tools import get_env_device + +try: + if get_env_device() == "npu": + from paddle.base import core + + for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): + if lib.endswith(".so"): + paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib) + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + + +def fusion_flash_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi=None, + sequence_parallel=False, + reshard_layer=None, + npu_is_casual=False, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + if alibi is not None: + raise ValueError("Flash Attention doesn't support alibi") + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + return_softmax=output_attentions, + ) + else: + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attention_mask = attention_mask.cast(alibi.dtype) + alibi + if get_env_device() == "npu": + attn_output = core.eager._run_custom_op( + "flash_attention_npu", + query_states, + key_states, + value_states, + None, + attention_mask, + 0.0, + attention_mask is None, + True, + False, + npu_is_casual, + )[0] + else: + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + ) + attn_weights = None + + if reshard_layer is not None: + # attn_output shape: [bs, seqlen, num_head/sep, head_dim] + attn_output = reshard_layer( + attn_output, + split_axis=1, + concat_axis=2, + ) + # attn_output shape: [bs, seqlen/sep, num_head, head_dim] + assert ( + config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 + ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" + q_len = q_len // config.sep_parallel_degree + num_heads = num_heads * config.sep_parallel_degree + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 306258ae4aed..55fdff304887 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -89,7 +89,7 @@ def swiglu(x, y=None): from paddle.nn.functional.flash_attention import flash_attention except: flash_attention = None -import fusion_ops +from .. 
import fusion_ops __all__ = [ "LlamaModel", From aff105e670e9990c8db5b7854c0e122e63058148 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 16:12:53 +0800 Subject: [PATCH 18/31] add funsion ops --- paddlenlp/transformers/llama/fusion_ops.py | 142 +++++++++++++++++++++ paddlenlp/transformers/llama/modeling.py | 2 +- 2 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 paddlenlp/transformers/llama/fusion_ops.py diff --git a/paddlenlp/transformers/llama/fusion_ops.py b/paddlenlp/transformers/llama/fusion_ops.py new file mode 100644 index 000000000000..1fc762e9c2e6 --- /dev/null +++ b/paddlenlp/transformers/llama/fusion_ops.py @@ -0,0 +1,142 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle +import paddle.nn.functional as F + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.incubate.nn.functional import swiglu +except ImportError: + + def swiglu(x, y=None): + if y is None: + x, y = paddle.chunk(x, chunks=2, axis=-1) + return F.silu(x) * y + + +from paddle.utils import try_import + +from paddlenlp.utils.tools import get_env_device + +try: + if get_env_device() == "npu": + from paddle.base import core + + for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): + if lib.endswith(".so"): + paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib) + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + + +def rms_norm_fused(x_in, w, eps): + fused_ln = try_import("fused_ln") + return fused_ln.fused_rms_norm(x_in, w, eps)[0] + + +def fusion_rms_norm(hidden_states, weight, variance_epsilon): + if get_env_device() == "npu": + return core.eager._run_custom_op("rms_norm_npu", hidden_states, weight, variance_epsilon)[0] + elif get_env_device() == "xpu": + try: + import paddle_xpu_nn # noqa: F821 + + return paddle_xpu_nn.xpu_rms_norm(hidden_states, weight, variance_epsilon)[0] + except ImportError: + raise NotImplementedError( + f"Implementation of fused_rms_norm is not available on {get_env_device()}. 
Please install paddle_xpu to use this feature" + ) + return rms_norm_fused(hidden_states, weight, variance_epsilon) + + +def fusion_flash_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi=None, + sequence_parallel=False, + reshard_layer=None, + npu_is_casual=False, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + if alibi is not None: + raise ValueError("Flash Attention doesn't support alibi") + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + return_softmax=output_attentions, + ) + else: + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attention_mask = attention_mask.cast(alibi.dtype) + alibi + if get_env_device() == "npu": + attn_output = core.eager._run_custom_op( + "flash_attention_npu", + query_states, + key_states, + value_states, + None, + attention_mask, + 0.0, + attention_mask is None, + True, + False, + npu_is_casual, + )[0] + else: + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + ) + attn_weights = None + + if reshard_layer is not None: + # attn_output shape: [bs, seqlen, num_head/sep, head_dim] + attn_output = reshard_layer( + attn_output, + split_axis=1, + concat_axis=2, + ) + # attn_output shape: [bs, seqlen/sep, num_head, head_dim] + assert ( + config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 + ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" + q_len = q_len // config.sep_parallel_degree + num_heads = num_heads * config.sep_parallel_degree + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 55fdff304887..99a8431095d4 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -216,7 +216,7 @@ def scaled_dot_product_attention( _, kv_seq_len, _, _ = value_states.shape if config.use_flash_attention and flash_attention: - fusion_ops.fusion_flash_attention( + return fusion_ops.fusion_flash_attention( query_states, config, key_states, From 075c8de78732641c3f5a32395a5f5e4b131c0d3f Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 16:39:45 +0800 Subject: [PATCH 19/31] add funsion ops --- paddlenlp/transformers/llama/fusion_ops.py | 4 ++++ paddlenlp/transformers/llama/modeling.py | 13 +------------ 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/paddlenlp/transformers/llama/fusion_ops.py b/paddlenlp/transformers/llama/fusion_ops.py index 1fc762e9c2e6..0ee27057a3e8 100644 --- a/paddlenlp/transformers/llama/fusion_ops.py +++ b/paddlenlp/transformers/llama/fusion_ops.py @@ -48,6 +48,10 @@ def swiglu(x, y=None): flash_attention = None +def fusion_rope(): + pass + + def rms_norm_fused(x_in, w, eps): fused_ln = try_import("fused_ln") return fused_ln.fused_rms_norm(x_in, w, eps)[0] diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 99a8431095d4..803c527d20ed 100755 --- 
a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -363,18 +363,7 @@ def __init__(self, config): def forward(self, hidden_states): if self.config.use_fused_rms_norm: - if get_env_device() == "npu": - return core.eager._run_custom_op("rms_norm_npu", hidden_states, self.weight, self.variance_epsilon)[0] - elif get_env_device() == "xpu": - try: - import paddle_xpu_nn # noqa: F821 - - return paddle_xpu_nn.xpu_rms_norm(hidden_states, self.weight, self.variance_epsilon)[0] - except ImportError: - raise NotImplementedError( - f"Implementation of fused_rms_norm is not available on {get_env_device()}. Please install paddle_xpu to use this feature" - ) - return rms_norm_fused(hidden_states, self.weight, self.variance_epsilon) + return fusion_ops.fusion_rms_norm(hidden_states, self.weight, self.variance_epsilon) if paddle.in_dynamic_mode(): with paddle.amp.auto_cast(False): From 15f2fe35e6c6cca1321cfa8b4aa992df84343a97 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 16:55:14 +0800 Subject: [PATCH 20/31] add funsion ops --- paddlenlp/transformers/llama/fusion_ops.py | 11 ++++++++++- paddlenlp/transformers/llama/modeling.py | 13 ++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/paddlenlp/transformers/llama/fusion_ops.py b/paddlenlp/transformers/llama/fusion_ops.py index 0ee27057a3e8..044669be2e56 100644 --- a/paddlenlp/transformers/llama/fusion_ops.py +++ b/paddlenlp/transformers/llama/fusion_ops.py @@ -48,7 +48,16 @@ def swiglu(x, y=None): flash_attention = None -def fusion_rope(): +def fusion_rope( + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + alibi, + npu_is_casual: bool = False, +): pass diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 803c527d20ed..99a8431095d4 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -363,7 +363,18 @@ def __init__(self, config): def forward(self, hidden_states): if self.config.use_fused_rms_norm: - return fusion_ops.fusion_rms_norm(hidden_states, self.weight, self.variance_epsilon) + if get_env_device() == "npu": + return core.eager._run_custom_op("rms_norm_npu", hidden_states, self.weight, self.variance_epsilon)[0] + elif get_env_device() == "xpu": + try: + import paddle_xpu_nn # noqa: F821 + + return paddle_xpu_nn.xpu_rms_norm(hidden_states, self.weight, self.variance_epsilon)[0] + except ImportError: + raise NotImplementedError( + f"Implementation of fused_rms_norm is not available on {get_env_device()}. 
Please install paddle_xpu to use this feature" + ) + return rms_norm_fused(hidden_states, self.weight, self.variance_epsilon) if paddle.in_dynamic_mode(): with paddle.amp.auto_cast(False): From 27417693078910136dbf67c49578391ae04a80b1 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 17:15:22 +0800 Subject: [PATCH 21/31] add funsion ops --- paddlenlp/transformers/llama/fusion_ops.py | 56 +++++++++++--- paddlenlp/transformers/llama/modeling.py | 87 ++++++++++++---------- 2 files changed, 93 insertions(+), 50 deletions(-) diff --git a/paddlenlp/transformers/llama/fusion_ops.py b/paddlenlp/transformers/llama/fusion_ops.py index 044669be2e56..96f160534fc7 100644 --- a/paddlenlp/transformers/llama/fusion_ops.py +++ b/paddlenlp/transformers/llama/fusion_ops.py @@ -36,6 +36,10 @@ def swiglu(x, y=None): from paddlenlp.utils.tools import get_env_device +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None try: if get_env_device() == "npu": from paddle.base import core @@ -48,17 +52,47 @@ def swiglu(x, y=None): flash_attention = None -def fusion_rope( - hidden_states, - position_ids, - past_key_value, - attention_mask, - output_attentions, - use_cache, - alibi, - npu_is_casual: bool = False, -): - pass +def fusion_rope(query_states, key_states, value_states, hidden_states, position_ids, past_key_value, rotary_emb): + assert past_key_value is None, "fuse rotary not support cache kv for now" + batch_size, seq_length, num_heads, head_dim = query_states.shape + _, kv_seq_len, num_key_value_heads, _ = key_states.shape + cos, sin = rotary_emb(value_states, seq_len=kv_seq_len) + if get_env_device() == "npu": + query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[0] + key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] + else: + # paddle version > 2.6 or develop support q and k/v with different num_heads + paddle_version = float(paddle.__version__[:3]) + if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): + query_states, _, _ = fused_rotary_position_embedding( + query_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + key_states, _, _ = fused_rotary_position_embedding( + key_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + query_states, key_states, _ = fused_rotary_position_embedding( + query_states, + key_states, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + return query_states, key_states def rms_norm_fused(x_in, w, eps): diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 99a8431095d4..6d54fde21348 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -930,45 +930,54 @@ def forward( batch_size, seq_length, _, _ = query_states.shape position_ids = paddle.arange(seq_length, dtype="int64").expand((batch_size, seq_length)) if self.use_fused_rope: - assert past_key_value is None, "fuse rotary not support cache kv for now" - batch_size, seq_length, num_heads, head_dim = query_states.shape - _, kv_seq_len, num_key_value_heads, _ = key_states.shape - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - if get_env_device() == "npu": - query_states = core.eager._run_custom_op("fused_rope", query_states, cos, 
sin)[0] - key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] - else: - # paddle version > 2.6 or develop support q and k/v with different num_heads - paddle_version = float(paddle.__version__[:3]) - if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): - query_states, _, _ = fused_rotary_position_embedding( - query_states, - None, - None, - sin=sin, - cos=cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) - key_states, _, _ = fused_rotary_position_embedding( - key_states, - None, - None, - sin=sin, - cos=cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) - else: - query_states, key_states, _ = fused_rotary_position_embedding( - query_states, - key_states, - v=None, - sin=sin, - cos=cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) + query_states, key_states = fusion_ops.fusion_rope( + query_states, + key_states, + value_states, + hidden_states, + position_ids, + past_key_value, + self.rotary_emb, + ) + # assert past_key_value is None, "fuse rotary not support cache kv for now" + # batch_size, seq_length, num_heads, head_dim = query_states.shape + # _, kv_seq_len, num_key_value_heads, _ = key_states.shape + # cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + # if get_env_device() == "npu": + # query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[0] + # key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] + # else: + # # paddle version > 2.6 or develop support q and k/v with different num_heads + # paddle_version = float(paddle.__version__[:3]) + # if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): + # query_states, _, _ = fused_rotary_position_embedding( + # query_states, + # None, + # None, + # sin=sin, + # cos=cos, + # position_ids=position_ids, + # use_neox_rotary_style=False, + # ) + # key_states, _, _ = fused_rotary_position_embedding( + # key_states, + # None, + # None, + # sin=sin, + # cos=cos, + # position_ids=position_ids, + # use_neox_rotary_style=False, + # ) + # else: + # query_states, key_states, _ = fused_rotary_position_embedding( + # query_states, + # key_states, + # v=None, + # sin=sin, + # cos=cos, + # position_ids=position_ids, + # use_neox_rotary_style=False, + # ) else: if self.config.use_long_sequence_strategies: cos, sin = self.rotary_emb(seq_len=kv_seq_len) From 12fc0489d7beca11679697d1095a6c1dae058257 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 17:33:16 +0800 Subject: [PATCH 22/31] add funsion ops --- paddlenlp/transformers/fusion_ops.py | 69 ++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/paddlenlp/transformers/fusion_ops.py b/paddlenlp/transformers/fusion_ops.py index 4bc8ea2d0bd2..96f160534fc7 100644 --- a/paddlenlp/transformers/fusion_ops.py +++ b/paddlenlp/transformers/fusion_ops.py @@ -32,8 +32,14 @@ def swiglu(x, y=None): return F.silu(x) * y +from paddle.utils import try_import + from paddlenlp.utils.tools import get_env_device +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None try: if get_env_device() == "npu": from paddle.base import core @@ -46,6 +52,69 @@ def swiglu(x, y=None): flash_attention = None +def fusion_rope(query_states, key_states, value_states, hidden_states, position_ids, past_key_value, rotary_emb): + assert past_key_value is None, "fuse rotary not support 
cache kv for now" + batch_size, seq_length, num_heads, head_dim = query_states.shape + _, kv_seq_len, num_key_value_heads, _ = key_states.shape + cos, sin = rotary_emb(value_states, seq_len=kv_seq_len) + if get_env_device() == "npu": + query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[0] + key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] + else: + # paddle version > 2.6 or develop support q and k/v with different num_heads + paddle_version = float(paddle.__version__[:3]) + if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): + query_states, _, _ = fused_rotary_position_embedding( + query_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + key_states, _, _ = fused_rotary_position_embedding( + key_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + query_states, key_states, _ = fused_rotary_position_embedding( + query_states, + key_states, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + return query_states, key_states + + +def rms_norm_fused(x_in, w, eps): + fused_ln = try_import("fused_ln") + return fused_ln.fused_rms_norm(x_in, w, eps)[0] + + +def fusion_rms_norm(hidden_states, weight, variance_epsilon): + if get_env_device() == "npu": + return core.eager._run_custom_op("rms_norm_npu", hidden_states, weight, variance_epsilon)[0] + elif get_env_device() == "xpu": + try: + import paddle_xpu_nn # noqa: F821 + + return paddle_xpu_nn.xpu_rms_norm(hidden_states, weight, variance_epsilon)[0] + except ImportError: + raise NotImplementedError( + f"Implementation of fused_rms_norm is not available on {get_env_device()}. Please install paddle_xpu to use this feature" + ) + return rms_norm_fused(hidden_states, weight, variance_epsilon) + + def fusion_flash_attention( query_states, config, From f678361ff3fc0c7e2227d8705150e34f61c0a1aa Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 17:45:24 +0800 Subject: [PATCH 23/31] add funsion ops --- paddlenlp/transformers/llama/modeling.py | 66 ++++++------------------ 1 file changed, 15 insertions(+), 51 deletions(-) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 6d54fde21348..62e39bf0ba97 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -81,7 +81,7 @@ def swiglu(x, y=None): try: if get_env_device() == "npu": - from paddle.base import core + # from paddle.base import core for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): if lib.endswith(".so"): @@ -363,18 +363,20 @@ def __init__(self, config): def forward(self, hidden_states): if self.config.use_fused_rms_norm: - if get_env_device() == "npu": - return core.eager._run_custom_op("rms_norm_npu", hidden_states, self.weight, self.variance_epsilon)[0] - elif get_env_device() == "xpu": - try: - import paddle_xpu_nn # noqa: F821 + return fusion_ops.fusion_rms_norm(hidden_states, self.weight, self.variance_epsilon) - return paddle_xpu_nn.xpu_rms_norm(hidden_states, self.weight, self.variance_epsilon)[0] - except ImportError: - raise NotImplementedError( - f"Implementation of fused_rms_norm is not available on {get_env_device()}. 
Please install paddle_xpu to use this feature" - ) - return rms_norm_fused(hidden_states, self.weight, self.variance_epsilon) + # if get_env_device() == "npu": + # return core.eager._run_custom_op("rms_norm_npu", hidden_states, self.weight, self.variance_epsilon)[0] + # elif get_env_device() == "xpu": + # try: + # import paddle_xpu_nn # noqa: F821 + + # return paddle_xpu_nn.xpu_rms_norm(hidden_states, self.weight, self.variance_epsilon)[0] + # except ImportError: + # raise NotImplementedError( + # f"Implementation of fused_rms_norm is not available on {get_env_device()}. Please install paddle_xpu to use this feature" + # ) + # return rms_norm_fused(hidden_states, self.weight, self.variance_epsilon) if paddle.in_dynamic_mode(): with paddle.amp.auto_cast(False): @@ -939,45 +941,7 @@ def forward( past_key_value, self.rotary_emb, ) - # assert past_key_value is None, "fuse rotary not support cache kv for now" - # batch_size, seq_length, num_heads, head_dim = query_states.shape - # _, kv_seq_len, num_key_value_heads, _ = key_states.shape - # cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - # if get_env_device() == "npu": - # query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[0] - # key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] - # else: - # # paddle version > 2.6 or develop support q and k/v with different num_heads - # paddle_version = float(paddle.__version__[:3]) - # if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): - # query_states, _, _ = fused_rotary_position_embedding( - # query_states, - # None, - # None, - # sin=sin, - # cos=cos, - # position_ids=position_ids, - # use_neox_rotary_style=False, - # ) - # key_states, _, _ = fused_rotary_position_embedding( - # key_states, - # None, - # None, - # sin=sin, - # cos=cos, - # position_ids=position_ids, - # use_neox_rotary_style=False, - # ) - # else: - # query_states, key_states, _ = fused_rotary_position_embedding( - # query_states, - # key_states, - # v=None, - # sin=sin, - # cos=cos, - # position_ids=position_ids, - # use_neox_rotary_style=False, - # ) + else: if self.config.use_long_sequence_strategies: cos, sin = self.rotary_emb(seq_len=kv_seq_len) From 9b2ca6bcbc2fa3eb8fcbf2456adba9d313eddd7b Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 17:54:17 +0800 Subject: [PATCH 24/31] add funsion ops --- paddlenlp/transformers/llama/modeling.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 62e39bf0ba97..f726cbe7ff86 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -81,7 +81,6 @@ def swiglu(x, y=None): try: if get_env_device() == "npu": - # from paddle.base import core for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): if lib.endswith(".so"): @@ -365,19 +364,6 @@ def forward(self, hidden_states): if self.config.use_fused_rms_norm: return fusion_ops.fusion_rms_norm(hidden_states, self.weight, self.variance_epsilon) - # if get_env_device() == "npu": - # return core.eager._run_custom_op("rms_norm_npu", hidden_states, self.weight, self.variance_epsilon)[0] - # elif get_env_device() == "xpu": - # try: - # import paddle_xpu_nn # noqa: F821 - - # return paddle_xpu_nn.xpu_rms_norm(hidden_states, self.weight, self.variance_epsilon)[0] - # except ImportError: - # raise NotImplementedError( - # f"Implementation of fused_rms_norm is not 
available on {get_env_device()}. Please install paddle_xpu to use this feature" - # ) - # return rms_norm_fused(hidden_states, self.weight, self.variance_epsilon) - if paddle.in_dynamic_mode(): with paddle.amp.auto_cast(False): hidden_states = hidden_states.astype("float32") From cac0f8e605f094221bd95960f27a7ee4d4b6deae Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 18:13:41 +0800 Subject: [PATCH 25/31] add funsion ops --- paddlenlp/transformers/llama/fusion_ops.py | 189 --------------------- 1 file changed, 189 deletions(-) delete mode 100644 paddlenlp/transformers/llama/fusion_ops.py diff --git a/paddlenlp/transformers/llama/fusion_ops.py b/paddlenlp/transformers/llama/fusion_ops.py deleted file mode 100644 index 96f160534fc7..000000000000 --- a/paddlenlp/transformers/llama/fusion_ops.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import paddle -import paddle.nn.functional as F - -try: - from paddle.incubate.nn.functional import fused_rotary_position_embedding -except ImportError: - fused_rotary_position_embedding = None - -try: - from paddle.incubate.nn.functional import swiglu -except ImportError: - - def swiglu(x, y=None): - if y is None: - x, y = paddle.chunk(x, chunks=2, axis=-1) - return F.silu(x) * y - - -from paddle.utils import try_import - -from paddlenlp.utils.tools import get_env_device - -try: - from paddle.incubate.nn.functional import fused_rotary_position_embedding -except ImportError: - fused_rotary_position_embedding = None -try: - if get_env_device() == "npu": - from paddle.base import core - - for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): - if lib.endswith(".so"): - paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib) - from paddle.nn.functional.flash_attention import flash_attention -except: - flash_attention = None - - -def fusion_rope(query_states, key_states, value_states, hidden_states, position_ids, past_key_value, rotary_emb): - assert past_key_value is None, "fuse rotary not support cache kv for now" - batch_size, seq_length, num_heads, head_dim = query_states.shape - _, kv_seq_len, num_key_value_heads, _ = key_states.shape - cos, sin = rotary_emb(value_states, seq_len=kv_seq_len) - if get_env_device() == "npu": - query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[0] - key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] - else: - # paddle version > 2.6 or develop support q and k/v with different num_heads - paddle_version = float(paddle.__version__[:3]) - if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): - query_states, _, _ = fused_rotary_position_embedding( - query_states, - None, - None, - sin=sin, - cos=cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) - key_states, _, _ = fused_rotary_position_embedding( - key_states, - None, - None, - sin=sin, - cos=cos, 
- position_ids=position_ids, - use_neox_rotary_style=False, - ) - else: - query_states, key_states, _ = fused_rotary_position_embedding( - query_states, - key_states, - v=None, - sin=sin, - cos=cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) - return query_states, key_states - - -def rms_norm_fused(x_in, w, eps): - fused_ln = try_import("fused_ln") - return fused_ln.fused_rms_norm(x_in, w, eps)[0] - - -def fusion_rms_norm(hidden_states, weight, variance_epsilon): - if get_env_device() == "npu": - return core.eager._run_custom_op("rms_norm_npu", hidden_states, weight, variance_epsilon)[0] - elif get_env_device() == "xpu": - try: - import paddle_xpu_nn # noqa: F821 - - return paddle_xpu_nn.xpu_rms_norm(hidden_states, weight, variance_epsilon)[0] - except ImportError: - raise NotImplementedError( - f"Implementation of fused_rms_norm is not available on {get_env_device()}. Please install paddle_xpu to use this feature" - ) - return rms_norm_fused(hidden_states, weight, variance_epsilon) - - -def fusion_flash_attention( - query_states, - config, - key_states, - value_states, - attention_mask, - output_attentions, - alibi=None, - sequence_parallel=False, - reshard_layer=None, - npu_is_casual=False, -): - bsz, q_len, num_heads, head_dim = query_states.shape - _, kv_seq_len, _, _ = value_states.shape - version = paddle.version.full_version - if version != "0.0.0" and version <= "2.5.2": - if alibi is not None: - raise ValueError("Flash Attention doesn't support alibi") - attn_output, attn_weights = flash_attention( - query_states, - key_states, - value_states, - causal=True, - return_softmax=output_attentions, - ) - else: - if alibi is not None: - alibi = alibi.reshape([bsz, num_heads, 1, -1]) - attention_mask = attention_mask.cast(alibi.dtype) + alibi - if get_env_device() == "npu": - attn_output = core.eager._run_custom_op( - "flash_attention_npu", - query_states, - key_states, - value_states, - None, - attention_mask, - 0.0, - attention_mask is None, - True, - False, - npu_is_casual, - )[0] - else: - attn_output = F.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - is_causal=attention_mask is None, - ) - attn_weights = None - - if reshard_layer is not None: - # attn_output shape: [bs, seqlen, num_head/sep, head_dim] - attn_output = reshard_layer( - attn_output, - split_axis=1, - concat_axis=2, - ) - # attn_output shape: [bs, seqlen/sep, num_head, head_dim] - assert ( - config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 - ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" - q_len = q_len // config.sep_parallel_degree - num_heads = num_heads * config.sep_parallel_degree - - if sequence_parallel: - attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) - else: - attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) - return (attn_output, attn_weights) if output_attentions else attn_output From 73866a297c9aeb0e3356e25c9991b083c63fd358 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 18:42:26 +0800 Subject: [PATCH 26/31] add funsion ops --- paddlenlp/transformers/llama/modeling.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index f726cbe7ff86..154dbad6117c 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -55,7 +55,6 @@ def swiglu(x, y=None): ) except: pass -from 
paddle.utils import try_import from paddlenlp.transformers.conversion_utils import ( StateDictNameMapping, @@ -340,11 +339,6 @@ def _expand_2d_mask(mask, dtype, tgt_length): return expanded_mask -def rms_norm_fused(x_in, w, eps): - fused_ln = try_import("fused_ln") - return fused_ln.fused_rms_norm(x_in, w, eps)[0] - - class LlamaRMSNorm(nn.Layer): def __init__(self, config): super().__init__() From d8f19500d7d75235ef4bd5b6841344cf271fba76 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 18:44:22 +0800 Subject: [PATCH 27/31] add funsion ops --- paddlenlp/transformers/llama/modeling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 154dbad6117c..82786f3383b3 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -89,6 +89,8 @@ def swiglu(x, y=None): flash_attention = None from .. import fusion_ops +rms_norm_fused = fusion_ops.fusion_ops + __all__ = [ "LlamaModel", "LlamaPretrainedModel", From 9a2f1c53dc243f16c6e8acb8727c53ceee6f24fc Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Mon, 13 May 2024 18:45:33 +0800 Subject: [PATCH 28/31] add funsion ops --- paddlenlp/transformers/llama/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 82786f3383b3..634d532753f4 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -89,7 +89,7 @@ def swiglu(x, y=None): flash_attention = None from .. import fusion_ops -rms_norm_fused = fusion_ops.fusion_ops +rms_norm_fused = fusion_ops.rms_norm_fused __all__ = [ "LlamaModel", From df78b71dc327e98ad8e2d2579b5cea3903dcab26 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Tue, 14 May 2024 11:28:02 +0800 Subject: [PATCH 29/31] update --- paddlenlp/transformers/llama/fusion_ops.py | 189 +++++++++++++++++++++ paddlenlp/transformers/llama/modeling.py | 2 +- 2 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 paddlenlp/transformers/llama/fusion_ops.py diff --git a/paddlenlp/transformers/llama/fusion_ops.py b/paddlenlp/transformers/llama/fusion_ops.py new file mode 100644 index 000000000000..96f160534fc7 --- /dev/null +++ b/paddlenlp/transformers/llama/fusion_ops.py @@ -0,0 +1,189 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import paddle +import paddle.nn.functional as F + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.incubate.nn.functional import swiglu +except ImportError: + + def swiglu(x, y=None): + if y is None: + x, y = paddle.chunk(x, chunks=2, axis=-1) + return F.silu(x) * y + + +from paddle.utils import try_import + +from paddlenlp.utils.tools import get_env_device + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None +try: + if get_env_device() == "npu": + from paddle.base import core + + for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): + if lib.endswith(".so"): + paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib) + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + + +def fusion_rope(query_states, key_states, value_states, hidden_states, position_ids, past_key_value, rotary_emb): + assert past_key_value is None, "fuse rotary not support cache kv for now" + batch_size, seq_length, num_heads, head_dim = query_states.shape + _, kv_seq_len, num_key_value_heads, _ = key_states.shape + cos, sin = rotary_emb(value_states, seq_len=kv_seq_len) + if get_env_device() == "npu": + query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[0] + key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] + else: + # paddle version > 2.6 or develop support q and k/v with different num_heads + paddle_version = float(paddle.__version__[:3]) + if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): + query_states, _, _ = fused_rotary_position_embedding( + query_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + key_states, _, _ = fused_rotary_position_embedding( + key_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + query_states, key_states, _ = fused_rotary_position_embedding( + query_states, + key_states, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + return query_states, key_states + + +def rms_norm_fused(x_in, w, eps): + fused_ln = try_import("fused_ln") + return fused_ln.fused_rms_norm(x_in, w, eps)[0] + + +def fusion_rms_norm(hidden_states, weight, variance_epsilon): + if get_env_device() == "npu": + return core.eager._run_custom_op("rms_norm_npu", hidden_states, weight, variance_epsilon)[0] + elif get_env_device() == "xpu": + try: + import paddle_xpu_nn # noqa: F821 + + return paddle_xpu_nn.xpu_rms_norm(hidden_states, weight, variance_epsilon)[0] + except ImportError: + raise NotImplementedError( + f"Implementation of fused_rms_norm is not available on {get_env_device()}. 
Please install paddle_xpu to use this feature" + ) + return rms_norm_fused(hidden_states, weight, variance_epsilon) + + +def fusion_flash_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi=None, + sequence_parallel=False, + reshard_layer=None, + npu_is_casual=False, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + if alibi is not None: + raise ValueError("Flash Attention doesn't support alibi") + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + return_softmax=output_attentions, + ) + else: + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attention_mask = attention_mask.cast(alibi.dtype) + alibi + if get_env_device() == "npu": + attn_output = core.eager._run_custom_op( + "flash_attention_npu", + query_states, + key_states, + value_states, + None, + attention_mask, + 0.0, + attention_mask is None, + True, + False, + npu_is_casual, + )[0] + else: + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + ) + attn_weights = None + + if reshard_layer is not None: + # attn_output shape: [bs, seqlen, num_head/sep, head_dim] + attn_output = reshard_layer( + attn_output, + split_axis=1, + concat_axis=2, + ) + # attn_output shape: [bs, seqlen/sep, num_head, head_dim] + assert ( + config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 + ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" + q_len = q_len // config.sep_parallel_degree + num_heads = num_heads * config.sep_parallel_degree + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 634d532753f4..548a4e0a8cb3 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -87,7 +87,7 @@ def swiglu(x, y=None): from paddle.nn.functional.flash_attention import flash_attention except: flash_attention = None -from .. import fusion_ops +from . import fusion_ops rms_norm_fused = fusion_ops.rms_norm_fused From 8c3cd0d2f307b38391770bde8e8bb09acf0a6b62 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Tue, 14 May 2024 11:30:17 +0800 Subject: [PATCH 30/31] Update fusion_ops.py --- paddlenlp/transformers/fusion_ops.py | 188 --------------------------- 1 file changed, 188 deletions(-) diff --git a/paddlenlp/transformers/fusion_ops.py b/paddlenlp/transformers/fusion_ops.py index 96f160534fc7..8b137891791f 100644 --- a/paddlenlp/transformers/fusion_ops.py +++ b/paddlenlp/transformers/fusion_ops.py @@ -1,189 +1 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os - -import paddle -import paddle.nn.functional as F - -try: - from paddle.incubate.nn.functional import fused_rotary_position_embedding -except ImportError: - fused_rotary_position_embedding = None - -try: - from paddle.incubate.nn.functional import swiglu -except ImportError: - - def swiglu(x, y=None): - if y is None: - x, y = paddle.chunk(x, chunks=2, axis=-1) - return F.silu(x) * y - - -from paddle.utils import try_import - -from paddlenlp.utils.tools import get_env_device - -try: - from paddle.incubate.nn.functional import fused_rotary_position_embedding -except ImportError: - fused_rotary_position_embedding = None -try: - if get_env_device() == "npu": - from paddle.base import core - - for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): - if lib.endswith(".so"): - paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib) - from paddle.nn.functional.flash_attention import flash_attention -except: - flash_attention = None - - -def fusion_rope(query_states, key_states, value_states, hidden_states, position_ids, past_key_value, rotary_emb): - assert past_key_value is None, "fuse rotary not support cache kv for now" - batch_size, seq_length, num_heads, head_dim = query_states.shape - _, kv_seq_len, num_key_value_heads, _ = key_states.shape - cos, sin = rotary_emb(value_states, seq_len=kv_seq_len) - if get_env_device() == "npu": - query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[0] - key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] - else: - # paddle version > 2.6 or develop support q and k/v with different num_heads - paddle_version = float(paddle.__version__[:3]) - if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): - query_states, _, _ = fused_rotary_position_embedding( - query_states, - None, - None, - sin=sin, - cos=cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) - key_states, _, _ = fused_rotary_position_embedding( - key_states, - None, - None, - sin=sin, - cos=cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) - else: - query_states, key_states, _ = fused_rotary_position_embedding( - query_states, - key_states, - v=None, - sin=sin, - cos=cos, - position_ids=position_ids, - use_neox_rotary_style=False, - ) - return query_states, key_states - - -def rms_norm_fused(x_in, w, eps): - fused_ln = try_import("fused_ln") - return fused_ln.fused_rms_norm(x_in, w, eps)[0] - - -def fusion_rms_norm(hidden_states, weight, variance_epsilon): - if get_env_device() == "npu": - return core.eager._run_custom_op("rms_norm_npu", hidden_states, weight, variance_epsilon)[0] - elif get_env_device() == "xpu": - try: - import paddle_xpu_nn # noqa: F821 - - return paddle_xpu_nn.xpu_rms_norm(hidden_states, weight, variance_epsilon)[0] - except ImportError: - raise NotImplementedError( - f"Implementation of fused_rms_norm is not available on {get_env_device()}. 
Please install paddle_xpu to use this feature" - ) - return rms_norm_fused(hidden_states, weight, variance_epsilon) - - -def fusion_flash_attention( - query_states, - config, - key_states, - value_states, - attention_mask, - output_attentions, - alibi=None, - sequence_parallel=False, - reshard_layer=None, - npu_is_casual=False, -): - bsz, q_len, num_heads, head_dim = query_states.shape - _, kv_seq_len, _, _ = value_states.shape - version = paddle.version.full_version - if version != "0.0.0" and version <= "2.5.2": - if alibi is not None: - raise ValueError("Flash Attention doesn't support alibi") - attn_output, attn_weights = flash_attention( - query_states, - key_states, - value_states, - causal=True, - return_softmax=output_attentions, - ) - else: - if alibi is not None: - alibi = alibi.reshape([bsz, num_heads, 1, -1]) - attention_mask = attention_mask.cast(alibi.dtype) + alibi - if get_env_device() == "npu": - attn_output = core.eager._run_custom_op( - "flash_attention_npu", - query_states, - key_states, - value_states, - None, - attention_mask, - 0.0, - attention_mask is None, - True, - False, - npu_is_casual, - )[0] - else: - attn_output = F.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - is_causal=attention_mask is None, - ) - attn_weights = None - - if reshard_layer is not None: - # attn_output shape: [bs, seqlen, num_head/sep, head_dim] - attn_output = reshard_layer( - attn_output, - split_axis=1, - concat_axis=2, - ) - # attn_output shape: [bs, seqlen/sep, num_head, head_dim] - assert ( - config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 - ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" - q_len = q_len // config.sep_parallel_degree - num_heads = num_heads * config.sep_parallel_degree - - if sequence_parallel: - attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) - else: - attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) - return (attn_output, attn_weights) if output_attentions else attn_output From 0a6d6b8196d9588401af32a6e9b63e6314352fd8 Mon Sep 17 00:00:00 2001 From: Yangrl <2535184404@qq.com> Date: Tue, 14 May 2024 11:32:20 +0800 Subject: [PATCH 31/31] update --- paddlenlp/transformers/fusion_ops.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 paddlenlp/transformers/fusion_ops.py diff --git a/paddlenlp/transformers/fusion_ops.py b/paddlenlp/transformers/fusion_ops.py deleted file mode 100644 index 8b137891791f..000000000000 --- a/paddlenlp/transformers/fusion_ops.py +++ /dev/null @@ -1 +0,0 @@ -
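
Illustrative usage sketch (not part of any patch above): after PATCH 29-31 the fused helpers live in paddlenlp/transformers/llama/fusion_ops.py and the old top-level paddlenlp/transformers/fusion_ops.py is removed. The snippet below is a minimal smoke test of the relocated fusion_rms_norm against a plain-Paddle RMSNorm reference. It assumes a build with this series applied and an environment where the fused kernel can actually dispatch (the fused_ln custom op on GPU, or the NPU/XPU branches shown in the diff); the tensor shapes, epsilon, and tolerance check are arbitrary illustration values, not values taken from the patches.

    # Minimal sketch, assuming the patch series is applied and a fused RMSNorm
    # kernel is available on the current device (fused_ln / NPU / XPU).
    import paddle

    # New module location introduced by PATCH 29 and finalized by PATCH 30/31.
    from paddlenlp.transformers.llama import fusion_ops


    def reference_rms_norm(x, weight, eps):
        # Plain-Paddle RMSNorm: x * rsqrt(mean(x^2) + eps) * weight,
        # computed in float32 and cast back to the input dtype.
        x32 = x.astype("float32")
        variance = x32.pow(2).mean(-1, keepdim=True)
        return (x32 * paddle.rsqrt(variance + eps) * weight).astype(x.dtype)


    # Illustrative shapes: [batch, seq_len, hidden_size].
    x = paddle.randn([2, 8, 64], dtype="float32")
    w = paddle.ones([64], dtype="float32")
    eps = 1e-6

    ref = reference_rms_norm(x, w, eps)
    fused = fusion_ops.fusion_rms_norm(x, w, eps)

    # Expected to be close to zero when the fused kernel is present;
    # on a device without it, fusion_rms_norm raises instead.
    print(float((ref - fused).abs().max()))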