diff --git a/llm/llama/npu/llama_npu_opt_lora.sh b/llm/llama/npu/llama_npu_opt_lora.sh
new file mode 100644
index 000000000000..fd1b004a8af5
--- /dev/null
+++ b/llm/llama/npu/llama_npu_opt_lora.sh
@@ -0,0 +1,72 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+max_steps=${1:-1000}
+
+export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+export FLAGS_use_stride_kernel=0
+unset PADDLE_TRAINER_ENDPOINTS
+unset DISTRIBUTED_TRAINER_ENDPOINTS
+export FLAGS_NPU_MC2=1
+export MC2_Recompute=1
+export MC2=1
+export FLAGS_allocator_strategy=naive_best_fit
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+
+
+rm -rf lora_bf16_llama_N1C8
+rm -rf output/lora_bf16_llama_N1C8
+ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9
+export PYTHONPATH=../../../:$PYTHONPATH
+python -u -m paddle.distributed.launch \
+    --devices "0,1,2,3,4,5,6,7" \
+    --log_dir "./lora_bf16_llama_N1C8" \
+    ../../finetune_generation.py \
+    --device "npu" \
+    --model_name_or_path "meta-llama/Llama-2-13b" \
+    --dataset_name_or_path "data/" \
+    --output_dir "./output/lora_bf16_llama_N1C8" \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 16 \
+    --per_device_eval_batch_size 1 \
+    --eval_accumulation_steps 1 \
+    --max_steps ${max_steps} \
+    --decay_steps 2000 \
+    --learning_rate 3e-06 \
+    --warmup_steps 2 \
+    --save_steps 1000 \
+    --logging_steps 1 \
+    --evaluation_strategy "epoch" \
+    --src_length 1024 \
+    --max_length 4096 \
+    --bf16 true \
+    --fp16_opt_level "O2" \
+    --do_train true \
+    --disable_tqdm true \
+    --eval_with_do_generation false \
+    --metric_for_best_model "accuracy" \
+    --recompute false \
+    --tensor_parallel_degree 8 \
+    --pipeline_parallel_degree 1 \
+    --zero_padding 0 \
+    --sequence_parallel 1 \
+    --amp_master_grad true \
+    --fuse_attention_qkv true \
+    --fuse_attention_ffn true \
+    --use_flash_attention 1 \
+    --use_fused_rope 1 \
+    --use_fused_rms_norm 1 \
+    --lora true \
+    --lora_rank 32 \
+    --pad_to_multiple_of 4096
diff --git a/llm/llama/npu/llama_npu_opt_ppt.sh b/llm/llama/npu/llama_npu_opt_ppt.sh
new file mode 100644
index 000000000000..d3a082f49c0d
--- /dev/null
+++ b/llm/llama/npu/llama_npu_opt_ppt.sh
@@ -0,0 +1,90 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+max_steps=${1:-800}
+
+set -x
+ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9
+rm -rf ./log_8.0
+rm -rf output
+export PYTHONPATH=../../../:$PYTHONPATH
+export MC2=1
+export GLOG_v=0
+export FLAGS_npu_storage_format=1
+export HCCL_INTRA_PCIE_ENABLE=0
+export HCCL_INTRA_ROCE_ENABLE=1
+export FLAGS_allocator_strategy=naive_best_fit
+export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+export FLAGS_NPU_MC2=1
+export MC2_Recompute=1
+unset PADDLE_TRAINER_ENDPOINTS
+unset DISTRIBUTED_TRAINER_ENDPOINTS
+
+export FLAGS_use_stride_kernel=0
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export MULTI_STREAM_MEMORY_REUSE=1
+
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+
+python -u -m paddle.distributed.launch \
+    --log_dir "./log_8.0" \
+    ../run_pretrain.py \
+    --model_name_or_path "meta-llama/Llama-2-13b" \
+    --tokenizer_name_or_path "meta-llama/Llama-2-13b" \
+    --input_dir "./pre-data" \
+    --output_dir "./output" \
+    --split 949,50,1 \
+    --max_seq_length 4096 \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 32 \
+    --per_device_eval_batch_size 1 \
+    --use_flash_attention 1 \
+    --use_fused_rms_norm 1 \
+    --virtual_pp_degree 1 \
+    --learning_rate 0.00001 \
+    --min_learning_rate 0.000001 \
+    --max_steps ${max_steps} \
+    --decay_steps 2000 \
+    --save_steps 2000 \
+    --seed 100 \
+    --weight_decay 0.01 \
+    --warmup_steps 20 \
+    --max_grad_norm 1.0 \
+    --logging_steps 1 \
+    --dataloader_num_workers 1 \
+    --eval_steps 1001 \
+    --tensor_parallel_degree 4 \
+    --disable_tqdm true \
+    --continue_training 0 \
+    --do_train \
+    --device "npu" \
+    --enable_linear_fused_grad_add false \
+    --fuse_attention_qkv true \
+    --fuse_attention_ffn true \
+    --use_fused_rope true \
+    --recompute_use_reentrant true \
+    --data_cache "./data_cache" \
+    --bf16 \
+    --fp16_opt_level "O2" \
+    --amp_master_grad \
+    --load_sharded_model true \
+    --save_sharded_model true \
+    --pipeline_parallel_degree 1 \
+    --ignore_data_skip 0 \
+    --force_reshard_pp true \
+    --tensor_parallel_config "enable_mp_async_allreduce enable_mp_skip_c_identity" \
+    --sequence_parallel 1 \
+    --pipeline_parallel_config "disable_partial_send_recv" \
+    --sharding "stage1" \
+    --sharding_parallel_degree 2
diff --git a/llm/llama/npu/llama_npu_opt_sft.sh b/llm/llama/npu/llama_npu_opt_sft.sh
new file mode 100644
index 000000000000..786e6cf835aa
--- /dev/null
+++ b/llm/llama/npu/llama_npu_opt_sft.sh
@@ -0,0 +1,78 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export FLAGS_use_stride_kernel=0
+export FLAGS_npu_storage_format=1
+export HCCL_INTRA_PCIE_ENABLE=0
+export HCCL_INTRA_ROCE_ENABLE=1
+export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+
+unset PADDLE_TRAINER_ENDPOINTS
+unset DISTRIBUTED_TRAINER_ENDPOINTS
+export GLOG_v=0
+export FLAGS_NPU_MC2=1
+export MC2_Recompute=1
+export MC2=1
+export FLAGS_allocator_strategy=naive_best_fit
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export MULTI_STREAM_MEMORY_REUSE=1
+
+export PYTHONPATH=../../../:$PYTHONPATH
+rm -rf sft_bf16_llama_N1C8
+rm -rf output/sft_bf16_llama_N1C8
+ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9
+python -u -m paddle.distributed.launch \
+    --devices "0,1,2,3,4,5,6,7" \
+    --log_dir "./sft_bf16_llama_N1C8" \
+    ../../finetune_generation.py \
+    --device "npu" \
+    --model_name_or_path "meta-llama/Llama-2-13b" \
+    --dataset_name_or_path "data/" \
+    --output_dir "./output/sft_bf16_llama_N1C8" \
+    --logging_dir "./sft_logs" \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 32 \
+    --per_device_eval_batch_size 1 \
+    --eval_accumulation_steps 1 \
+    --max_steps 2000 \
+    --learning_rate 3e-06 \
+    --warmup_steps 2 \
+    --save_steps 1000 \
+    --logging_steps 1 \
+    --evaluation_strategy "epoch" \
+    --src_length 1024 \
+    --max_length 4096 \
+    --fp16 true \
+    --fp16_opt_level "O2" \
+    --do_train true \
+    --disable_tqdm true \
+    --eval_with_do_generation false \
+    --metric_for_best_model "accuracy" \
+    --recompute false \
+    --tensor_parallel_degree 4 \
+    --pipeline_parallel_degree 1 \
+    --zero_padding 0 \
+    --amp_master_grad true \
+    --fuse_attention_qkv true \
+    --fuse_attention_ffn true \
+    --sequence_parallel 1 \
+    --use_flash_attention 1 \
+    --use_fused_rope 1 \
+    --use_fused_rms_norm 1 \
+    --sharding_parallel_degree 2 \
+    --pad_to_multiple_of 4096 \
+    --sharding "stage1" \
+    --sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap"
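
The LoRA and pretraining scripts read an optional first positional argument that overrides their default --max_steps (1000 and 800 respectively), while the SFT script fixes --max_steps at 2000. A minimal invocation sketch, assuming the scripts are launched from llm/llama/npu/ on a single machine with 8 Ascend NPUs and that the data/ (fine-tuning) and pre-data/ (pretraining) directories referenced above are already prepared:

    # The optional argument overrides the default max_steps baked into the script.
    bash llama_npu_opt_lora.sh 500     # LoRA fine-tuning, 500 steps instead of the default 1000
    bash llama_npu_opt_ppt.sh 1600     # pretraining, 1600 steps instead of the default 800
    bash llama_npu_opt_sft.sh          # SFT; max_steps is fixed at 2000 inside the script

Note that each script first kills any running finetune_generation.py or run_pretrain.py processes and deletes its previous log and output directories before launching.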