
Commit 17fb497

add npu-llama-opt0-script (#8401)
* update
* add llama-npu-opt-script
* Update dev_opt_lora.sh
* Update dev_opt_ppt.sh
* Update dev_opt_lora.sh
* Update dev_opt_ppt.sh
* Update dev_opt_sft.sh
* Rename dev_opt_lora.sh to llama_npu_opt_lora.sh
* Update dev_opt_ppt.sh
* Rename dev_opt_ppt.sh to llama_npu_opt_ppt.sh
* Update llama_npu_opt_lora.sh
* Update and rename dev_opt_sft.sh to llama_npu_opt_sft.sh
1 parent 474aaaa commit 17fb497

File tree (3 files changed: +240 additions, -0 deletions)

* llm/llama/npu/llama_npu_opt_lora.sh
* llm/llama/npu/llama_npu_opt_ppt.sh
* llm/llama/npu/llama_npu_opt_sft.sh

llm/llama/npu/llama_npu_opt_lora.sh

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Number of training steps; can be overridden via the first positional argument.
max_steps=${1:-1000}

# NPU device selection and Paddle runtime flags; source the CANN toolkit environment.
export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
export FLAGS_use_stride_kernel=0
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
export FLAGS_NPU_MC2=1
export MC2_Recompute=1
export MC2=1
export FLAGS_allocator_strategy=naive_best_fit
source /usr/local/Ascend/ascend-toolkit/set_env.sh

# Remove logs/checkpoints from previous runs and kill any stray training processes.
rm -rf lora_bf16_llama_N1C8
rm -rf output/lora_bf16_llama_N1C8
ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9

# Launch distributed LoRA fine-tuning of Llama-2-13b on 8 NPUs (tensor parallel degree 8).
export PYTHONPATH=../../../:$PYTHONPATH
python -u -m paddle.distributed.launch \
    --devices "0,1,2,3,4,5,6,7" \
    --log_dir "./lora_bf16_llama_N1C8" \
    ../../finetune_generation.py \
    --device "npu" \
    --model_name_or_path "meta-llama/Llama-2-13b" \
    --dataset_name_or_path "data/" \
    --output_dir "./output/lora_bf16_llama_N1C8" \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 16 \
    --per_device_eval_batch_size 1 \
    --eval_accumulation_steps 1 \
    --max_steps ${max_steps} \
    --decay_steps 2000 \
    --learning_rate 3e-06 \
    --warmup_steps 2 \
    --save_steps 1000 \
    --logging_steps 1 \
    --evaluation_strategy "epoch" \
    --src_length 1024 \
    --max_length 4096 \
    --bf16 true \
    --fp16_opt_level "O2" \
    --do_train true \
    --disable_tqdm true \
    --eval_with_do_generation false \
    --metric_for_best_model "accuracy" \
    --recompute false \
    --tensor_parallel_degree 8 \
    --pipeline_parallel_degree 1 \
    --zero_padding 0 \
    --sequence_parallel 1 \
    --amp_master_grad true \
    --fuse_attention_qkv true \
    --fuse_attention_ffn true \
    --use_flash_attention 1 \
    --use_fused_rope 1 \
    --use_fused_rms_norm 1 \
    --lora true \
    --lora_rank 32 \
    --pad_to_multiple_of 4096
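A minimal invocation sketch (not part of the commit), assuming the LoRA dataset has already been placed under data/ as the script expects; the optional first argument overrides max_steps, and 500 below is just an illustrative value:

    # run from the script's directory; defaults to 1000 steps when no argument is given
    cd llm/llama/npu
    bash llama_npu_opt_lora.sh 500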

llm/llama/npu/llama_npu_opt_ppt.sh

Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Number of training steps; can be overridden via the first positional argument.
max_steps=${1:-800}

# Echo commands, kill any stray pretraining processes, and clean previous logs/output.
set -x
ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9
rm -rf ./log_8.0
rm -rf output

# NPU device selection plus Paddle and HCCL runtime flags; source the CANN toolkit environment.
export PYTHONPATH=../../../:$PYTHONPATH
export MC2=1
export GLOG_v=0
export FLAGS_npu_storage_format=1
export HCCL_INTRA_PCIE_ENABLE=0
export HCCL_INTRA_ROCE_ENABLE=1
export FLAGS_allocator_strategy=naive_best_fit
export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
export FLAGS_NPU_MC2=1
export MC2_Recompute=1
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS

export FLAGS_use_stride_kernel=0
export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
export MULTI_STREAM_MEMORY_REUSE=1

source /usr/local/Ascend/ascend-toolkit/set_env.sh

# Launch distributed pretraining of Llama-2-13b on 8 NPUs (tensor parallel degree 4, sharding degree 2).
python -u -m paddle.distributed.launch \
    --log_dir "./log_8.0" \
    ../run_pretrain.py \
    --model_name_or_path "meta-llama/Llama-2-13b" \
    --tokenizer_name_or_path "meta-llama/Llama-2-13b" \
    --input_dir "./pre-data" \
    --output_dir "./output" \
    --split 949,50,1 \
    --max_seq_length 4096 \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 32 \
    --per_device_eval_batch_size 1 \
    --use_flash_attention 1 \
    --use_fused_rms_norm 1 \
    --virtual_pp_degree 1 \
    --learning_rate 0.00001 \
    --min_learning_rate 0.000001 \
    --max_steps ${max_steps} \
    --decay_steps 2000 \
    --save_steps 2000 \
    --seed 100 \
    --weight_decay 0.01 \
    --warmup_steps 20 \
    --max_grad_norm 1.0 \
    --logging_steps 1 \
    --dataloader_num_workers 1 \
    --eval_steps 1001 \
    --tensor_parallel_degree 4 \
    --disable_tqdm true \
    --continue_training 0 \
    --do_train \
    --device "npu" \
    --enable_linear_fused_grad_add false \
    --fuse_attention_qkv true \
    --fuse_attention_ffn true \
    --use_fused_rope true \
    --recompute_use_reentrant true \
    --data_cache "./data_cache" \
    --bf16 \
    --fp16_opt_level "O2" \
    --amp_master_grad \
    --load_sharded_model true \
    --save_sharded_model true \
    --pipeline_parallel_degree 1 \
    --ignore_data_skip 0 \
    --force_reshard_pp true \
    --tensor_parallel_config "enable_mp_async_allreduce enable_mp_skip_c_identity" \
    --sequence_parallel 1 \
    --pipeline_parallel_config "disable_partial_send_recv" \
    --sharding "stage1" \
    --sharding_parallel_degree 2
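A minimal invocation sketch (not part of the commit), assuming pretraining data has already been prepared under ./pre-data as the script expects; the optional first argument overrides max_steps, which defaults to 800:

    cd llm/llama/npu
    bash llama_npu_opt_ppt.sh         # pretrain for the default 800 steps
    bash llama_npu_opt_ppt.sh 1600    # or pass an explicit step count (illustrative value)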

llm/llama/npu/llama_npu_opt_sft.sh

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# NPU device selection plus Paddle and HCCL runtime flags; source the CANN toolkit environment.
export FLAGS_use_stride_kernel=0
export FLAGS_npu_storage_format=1
export HCCL_INTRA_PCIE_ENABLE=0
export HCCL_INTRA_ROCE_ENABLE=1
export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"

unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
export GLOG_v=0
export FLAGS_NPU_MC2=1
export MC2_Recompute=1
export MC2=1
export FLAGS_allocator_strategy=naive_best_fit
source /usr/local/Ascend/ascend-toolkit/set_env.sh

export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
export MULTI_STREAM_MEMORY_REUSE=1

# Remove logs/checkpoints from previous runs and kill any stray training processes.
export PYTHONPATH=../../../:$PYTHONPATH
rm -rf sft_bf16_llama_N1C8
rm -rf output/sft_bf16_llama_N1C8
ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9

# Launch distributed SFT of Llama-2-13b on 8 NPUs (tensor parallel degree 4, sharding degree 2).
python -u -m paddle.distributed.launch \
    --devices "0,1,2,3,4,5,6,7" \
    --log_dir "./sft_bf16_llama_N1C8" \
    ../../finetune_generation.py \
    --device "npu" \
    --model_name_or_path "meta-llama/Llama-2-13b" \
    --dataset_name_or_path "data/" \
    --output_dir "./output/sft_bf16_llama_N1C8" \
    --logging_dir "./sft_logs" \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 32 \
    --per_device_eval_batch_size 1 \
    --eval_accumulation_steps 1 \
    --max_steps 2000 \
    --learning_rate 3e-06 \
    --warmup_steps 2 \
    --save_steps 1000 \
    --logging_steps 1 \
    --evaluation_strategy "epoch" \
    --src_length 1024 \
    --max_length 4096 \
    --fp16 true \
    --fp16_opt_level "O2" \
    --do_train true \
    --disable_tqdm true \
    --eval_with_do_generation false \
    --metric_for_best_model "accuracy" \
    --recompute false \
    --tensor_parallel_degree 4 \
    --pipeline_parallel_degree 1 \
    --zero_padding 0 \
    --amp_master_grad true \
    --fuse_attention_qkv true \
    --fuse_attention_ffn true \
    --sequence_parallel 1 \
    --use_flash_attention 1 \
    --use_fused_rope 1 \
    --use_fused_rms_norm 1 \
    --sharding_parallel_degree 2 \
    --pad_to_multiple_of 4096 \
    --sharding "stage1" \
    --sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap"
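A minimal invocation sketch (not part of the commit), assuming the SFT dataset has already been placed under data/ as the script expects; unlike the other two scripts, this one takes no arguments and always runs 2000 steps:

    cd llm/llama/npu
    bash llama_npu_opt_sft.sh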
