
Commit 17fb497

add npu-llama-opt0-script (#8401)
* update
* add llama-npu-opt-script
* Update dev_opt_lora.sh
* Update dev_opt_ppt.sh
* Update dev_opt_lora.sh
* Update dev_opt_ppt.sh
* Update dev_opt_sft.sh
* Rename dev_opt_lora.sh to llama_npu_opt_lora.sh
* Update dev_opt_ppt.sh
* Rename dev_opt_ppt.sh to llama_npu_opt_ppt.sh
* Update llama_npu_opt_lora.sh
* Update and rename dev_opt_sft.sh to llama_npu_opt_sft.sh
1 parent 474aaaa commit 17fb497

File tree (3 files changed: +240 additions, -0 deletions)

* llm/llama/npu/llama_npu_opt_lora.sh
* llm/llama/npu/llama_npu_opt_ppt.sh
* llm/llama/npu/llama_npu_opt_sft.sh

llm/llama/npu/llama_npu_opt_lora.sh

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Number of training steps; can be overridden via the first positional argument.
max_steps=${1:-1000}

# NPU device selection and Paddle runtime flags; source the CANN toolkit environment.
export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
export FLAGS_use_stride_kernel=0
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
export FLAGS_NPU_MC2=1
export MC2_Recompute=1
export MC2=1
export FLAGS_allocator_strategy=naive_best_fit
source /usr/local/Ascend/ascend-toolkit/set_env.sh

# Remove logs/checkpoints from previous runs and kill any stray training processes.
rm -rf lora_bf16_llama_N1C8
rm -rf output/lora_bf16_llama_N1C8
ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9

# Launch distributed LoRA fine-tuning of Llama-2-13b on 8 NPUs (tensor parallel degree 8).
export PYTHONPATH=../../../:$PYTHONPATH
python -u -m paddle.distributed.launch \
    --devices "0,1,2,3,4,5,6,7" \
    --log_dir "./lora_bf16_llama_N1C8" \
    ../../finetune_generation.py \
    --device "npu" \
    --model_name_or_path "meta-llama/Llama-2-13b" \
    --dataset_name_or_path "data/" \
    --output_dir "./output/lora_bf16_llama_N1C8" \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 16 \
    --per_device_eval_batch_size 1 \
    --eval_accumulation_steps 1 \
    --max_steps ${max_steps} \
    --decay_steps 2000 \
    --learning_rate 3e-06 \
    --warmup_steps 2 \
    --save_steps 1000 \
    --logging_steps 1 \
    --evaluation_strategy "epoch" \
    --src_length 1024 \
    --max_length 4096 \
    --bf16 true \
    --fp16_opt_level "O2" \
    --do_train true \
    --disable_tqdm true \
    --eval_with_do_generation false \
    --metric_for_best_model "accuracy" \
    --recompute false \
    --tensor_parallel_degree 8 \
    --pipeline_parallel_degree 1 \
    --zero_padding 0 \
    --sequence_parallel 1 \
    --amp_master_grad true \
    --fuse_attention_qkv true \
    --fuse_attention_ffn true \
    --use_flash_attention 1 \
    --use_fused_rope 1 \
    --use_fused_rms_norm 1 \
    --lora true \
    --lora_rank 32 \
    --pad_to_multiple_of 4096
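A minimal invocation sketch (not part of the commit), assuming the LoRA dataset has already been placed under data/ as the script expects; the optional first argument overrides max_steps, and 500 below is just an illustrative value:

    # run from the script's directory; defaults to 1000 steps when no argument is given
    cd llm/llama/npu
    bash llama_npu_opt_lora.sh 500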

llm/llama/npu/llama_npu_opt_ppt.sh

Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Number of training steps; can be overridden via the first positional argument.
max_steps=${1:-800}

# Echo commands, kill any stray pretraining processes, and clean previous logs/output.
set -x
ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9
rm -rf ./log_8.0
rm -rf output

# NPU device selection plus Paddle and HCCL runtime flags; source the CANN toolkit environment.
export PYTHONPATH=../../../:$PYTHONPATH
export MC2=1
export GLOG_v=0
export FLAGS_npu_storage_format=1
export HCCL_INTRA_PCIE_ENABLE=0
export HCCL_INTRA_ROCE_ENABLE=1
export FLAGS_allocator_strategy=naive_best_fit
export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
export FLAGS_NPU_MC2=1
export MC2_Recompute=1
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS

export FLAGS_use_stride_kernel=0
export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
export MULTI_STREAM_MEMORY_REUSE=1

source /usr/local/Ascend/ascend-toolkit/set_env.sh

# Launch distributed pretraining of Llama-2-13b on 8 NPUs (tensor parallel degree 4, sharding degree 2).
python -u -m paddle.distributed.launch \
    --log_dir "./log_8.0" \
    ../run_pretrain.py \
    --model_name_or_path "meta-llama/Llama-2-13b" \
    --tokenizer_name_or_path "meta-llama/Llama-2-13b" \
    --input_dir "./pre-data" \
    --output_dir "./output" \
    --split 949,50,1 \
    --max_seq_length 4096 \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 32 \
    --per_device_eval_batch_size 1 \
    --use_flash_attention 1 \
    --use_fused_rms_norm 1 \
    --virtual_pp_degree 1 \
    --learning_rate 0.00001 \
    --min_learning_rate 0.000001 \
    --max_steps ${max_steps} \
    --decay_steps 2000 \
    --save_steps 2000 \
    --seed 100 \
    --weight_decay 0.01 \
    --warmup_steps 20 \
    --max_grad_norm 1.0 \
    --logging_steps 1 \
    --dataloader_num_workers 1 \
    --eval_steps 1001 \
    --tensor_parallel_degree 4 \
    --disable_tqdm true \
    --continue_training 0 \
    --do_train \
    --device "npu" \
    --enable_linear_fused_grad_add false \
    --fuse_attention_qkv true \
    --fuse_attention_ffn true \
    --use_fused_rope true \
    --recompute_use_reentrant true \
    --data_cache "./data_cache" \
    --bf16 \
    --fp16_opt_level "O2" \
    --amp_master_grad \
    --load_sharded_model true \
    --save_sharded_model true \
    --pipeline_parallel_degree 1 \
    --ignore_data_skip 0 \
    --force_reshard_pp true \
    --tensor_parallel_config "enable_mp_async_allreduce enable_mp_skip_c_identity" \
    --sequence_parallel 1 \
    --pipeline_parallel_config "disable_partial_send_recv" \
    --sharding "stage1" \
    --sharding_parallel_degree 2
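A minimal invocation sketch (not part of the commit), assuming pretraining data has already been prepared under ./pre-data as the script expects; the optional first argument overrides max_steps, which defaults to 800:

    cd llm/llama/npu
    bash llama_npu_opt_ppt.sh         # pretrain for the default 800 steps
    bash llama_npu_opt_ppt.sh 1600    # or pass an explicit step count (illustrative value)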

llm/llama/npu/llama_npu_opt_sft.sh

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# NPU device selection plus Paddle and HCCL runtime flags; source the CANN toolkit environment.
export FLAGS_use_stride_kernel=0
export FLAGS_npu_storage_format=1
export HCCL_INTRA_PCIE_ENABLE=0
export HCCL_INTRA_ROCE_ENABLE=1
export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"

unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
export GLOG_v=0
export FLAGS_NPU_MC2=1
export MC2_Recompute=1
export MC2=1
export FLAGS_allocator_strategy=naive_best_fit
source /usr/local/Ascend/ascend-toolkit/set_env.sh

export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
export MULTI_STREAM_MEMORY_REUSE=1

# Remove logs/checkpoints from previous runs and kill any stray training processes.
export PYTHONPATH=../../../:$PYTHONPATH
rm -rf sft_bf16_llama_N1C8
rm -rf output/sft_bf16_llama_N1C8
ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9

# Launch distributed SFT of Llama-2-13b on 8 NPUs (tensor parallel degree 4, sharding degree 2).
python -u -m paddle.distributed.launch \
    --devices "0,1,2,3,4,5,6,7" \
    --log_dir "./sft_bf16_llama_N1C8" \
    ../../finetune_generation.py \
    --device "npu" \
    --model_name_or_path "meta-llama/Llama-2-13b" \
    --dataset_name_or_path "data/" \
    --output_dir "./output/sft_bf16_llama_N1C8" \
    --logging_dir "./sft_logs" \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 32 \
    --per_device_eval_batch_size 1 \
    --eval_accumulation_steps 1 \
    --max_steps 2000 \
    --learning_rate 3e-06 \
    --warmup_steps 2 \
    --save_steps 1000 \
    --logging_steps 1 \
    --evaluation_strategy "epoch" \
    --src_length 1024 \
    --max_length 4096 \
    --fp16 true \
    --fp16_opt_level "O2" \
    --do_train true \
    --disable_tqdm true \
    --eval_with_do_generation false \
    --metric_for_best_model "accuracy" \
    --recompute false \
    --tensor_parallel_degree 4 \
    --pipeline_parallel_degree 1 \
    --zero_padding 0 \
    --amp_master_grad true \
    --fuse_attention_qkv true \
    --fuse_attention_ffn true \
    --sequence_parallel 1 \
    --use_flash_attention 1 \
    --use_fused_rope 1 \
    --use_fused_rms_norm 1 \
    --sharding_parallel_degree 2 \
    --pad_to_multiple_of 4096 \
    --sharding "stage1" \
    --sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap"
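A minimal invocation sketch (not part of the commit), assuming the SFT dataset has already been placed under data/ as the script expects; unlike the other two scripts, this one takes no arguments and always runs 2000 steps:

    cd llm/llama/npu
    bash llama_npu_opt_sft.sh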
