@@ -55,6 +55,8 @@ function llama_case_list_auto() {
 
     llama_align_dygraph_dy2st_auto_bs2_bf16_DP2-MP1-PP1
     llama_convert_hybrid_ckpt_to_auto_parallel_bs2_fp32_DP2-MP1-PP1
+    llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP1-SP
+    llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP2-SP
 }
 
 function llm_gpt_case_list_auto() {
@@ -971,6 +973,187 @@ function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2-VPP3_split_bw() {
     echo "=========== $FUNCNAME run end ==========="
 }
 
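+# Dygraph vs. dy2st (PIR) loss-alignment case: bf16 auto-parallel, DP2-MP2-PP1 with sequence parallel on 4 GPUs.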
+function llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP1-SP() {
+    echo "=========== $FUNCNAME run begin ==========="
+    export PYTHONPATH=$root_path/:$PYTHONPATH
+    export PYTHONPATH=/paddle/Paddle/build_gpu/python/:$PYTHONPATH
+    export FLAGS_call_stack_level=3
+    export FLAGS_enable_pir_api=1
+    export FLAGS_dynamic_static_unified_comm=1
+
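+    # Disable TF32 and force deterministic kernels so dygraph and dy2st losses can be compared exactly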
+    export NVIDIA_TF32_OVERRIDE=0
+    export FLAGS_cudnn_deterministic=1
+    export FLAGS_embedding_deterministic=1
+
+    task_name="llama_align_dygraph_dy2st_pir_auto_bs2_bf16_dp2mp2pp1_sp"
+    case_out_dir="output/$task_name"
+    case_log_dir="output/$task_name""_log"
+
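+    # Run the same config twice: dygraph (to_static=0) and dynamic-to-static (to_static=1)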
+    for to_static in "0" "1"; do
+        rm -rf $case_out_dir
+        rm -rf $case_log_dir
+        python -u -m paddle.distributed.launch \
+            --gpus "0,1,2,3" \
+            --log_dir $case_log_dir \
+            run_pretrain_auto.py \
+            --model_type "llama" \
+            --model_name_or_path "facebook/llama-7b" \
+            --tokenizer_name_or_path "facebook/llama-7b" \
+            --input_dir "./data" \
+            --output_dir $case_out_dir \
+            --split 949,50,1 \
+            --weight_decay 0.01 \
+            --warmup_ratio 0.01 \
+            --max_grad_norm 0.0 \
+            --learning_rate 3e-05 \
+            --min_learning_rate 3e-06 \
+            --max_steps 10 \
+            --logging_steps 10 \
+            --eval_steps 1000 \
+            --save_steps 50000 \
+            --continue_training 0 \
+            --do_train true \
+            --do_eval false \
+            --do_predict false \
+            --disable_tqdm true \
+            --skip_profile_timer true \
+            --save_total_limit 2 \
+            --device gpu \
+            --disable_tqdm true \
+            --dataloader_num_workers 1 \
+            --enable_auto_parallel 1 \
+            --per_device_train_batch_size 1 \
+            --gradient_accumulation_steps 1 \
+            --per_device_eval_batch_size 2 \
+            --recompute false \
+            --bf16 1 \
+            --fp16_opt_level "O2" \
+            --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
+            --amp_custom_white_list "lookup_table" "lookup_table_v2" \
+            --amp_master_grad 1 \
+            --fuse_attention_ffn false \
+            --fuse_attention_qkv false \
+            --fuse_sequence_parallel_allreduce false \
+            --use_flash_attention 0 \
+            --use_fused_rope false \
+            --use_fused_rms_norm 0 \
+            --max_seq_length 4096 \
+            --sep_parallel_degree 1 \
+            --sequence_parallel true \
+            --pipeline_parallel_degree 1 \
+            --sharding_parallel_degree 1 \
+            --tensor_parallel_degree 2 \
+            --virtual_pp_degree 1 \
+            --sharding "" \
+            --to_static ${to_static} \
+            --num_hidden_layers 4 \
+            >>${log_path}/$FUNCNAME 2>&1
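+        # Parse loss and loss_md5 at global_step 10 from the rank-0 worker log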
+        loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+        loss_md5=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
+        ips=-1
+        mem=-1
+        echo "result: to_static=$to_static loss=$loss loss_md5=$loss_md5 ips=$ips mem=$mem"
+        loss_base=9.34753036
+        loss_md5_base=8ea72495fba4e1b9ba004b4431e27218
+        ips_base=-1
+        mem_base=-1
+        check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
+        check_md5_result $FUNCNAME ${loss_md5_base} ${loss_md5}
+    done
+    echo "=========== $FUNCNAME run end ==========="
+}
+
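+# Same alignment case with pipeline parallelism added: bf16, DP2-MP2-PP2 with sequence parallel on 8 GPUs.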
+function llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP2-SP() {
+    echo "=========== $FUNCNAME run begin ==========="
+    export PYTHONPATH=$root_path/:$PYTHONPATH
+    export PYTHONPATH=/paddle/Paddle/build_gpu/python/:$PYTHONPATH
+    export FLAGS_call_stack_level=3
+    export FLAGS_enable_pir_api=1
+    export FLAGS_dynamic_static_unified_comm=1
+
+    export NVIDIA_TF32_OVERRIDE=0
+    export FLAGS_cudnn_deterministic=1
+    export FLAGS_embedding_deterministic=1
+
+    task_name="llama_align_dygraph_dy2st_pir_auto_bs2_bf16_dp2mp2pp2_sp"
+    case_out_dir="output/$task_name"
+    case_log_dir="output/$task_name""_log"
+
+    for to_static in "0" "1"; do
+        rm -rf $case_out_dir
+        rm -rf $case_log_dir
+        python -u -m paddle.distributed.launch \
+            --gpus "0,1,2,3,4,5,6,7" \
+            --log_dir $case_log_dir \
+            run_pretrain_auto.py \
+            --model_type "llama" \
+            --model_name_or_path "facebook/llama-7b" \
+            --tokenizer_name_or_path "facebook/llama-7b" \
+            --input_dir "./data" \
+            --output_dir $case_out_dir \
+            --split 949,50,1 \
+            --weight_decay 0.01 \
+            --warmup_ratio 0.01 \
+            --max_grad_norm 0.0 \
+            --learning_rate 3e-05 \
+            --min_learning_rate 3e-06 \
+            --max_steps 10 \
+            --logging_steps 10 \
+            --eval_steps 1000 \
+            --save_steps 50000 \
+            --continue_training 0 \
+            --do_train true \
+            --do_eval false \
+            --do_predict false \
+            --disable_tqdm true \
+            --skip_profile_timer true \
+            --save_total_limit 2 \
+            --device gpu \
+            --disable_tqdm true \
+            --dataloader_num_workers 1 \
+            --enable_auto_parallel 1 \
+            --per_device_train_batch_size 1 \
+            --gradient_accumulation_steps 1 \
+            --per_device_eval_batch_size 2 \
+            --recompute false \
+            --bf16 1 \
+            --fp16_opt_level "O2" \
+            --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
+            --amp_custom_white_list "lookup_table" "lookup_table_v2" \
+            --amp_master_grad 1 \
+            --fuse_attention_ffn false \
+            --fuse_attention_qkv false \
+            --fuse_sequence_parallel_allreduce false \
+            --use_flash_attention 0 \
+            --use_fused_rope false \
+            --use_fused_rms_norm 0 \
+            --max_seq_length 4096 \
+            --sep_parallel_degree 1 \
+            --sequence_parallel true \
+            --pipeline_parallel_degree 2 \
+            --sharding_parallel_degree 1 \
+            --tensor_parallel_degree 2 \
+            --virtual_pp_degree 1 \
+            --sharding "" \
+            --to_static ${to_static} \
+            --num_hidden_layers 4 \
+            >>${log_path}/$FUNCNAME 2>&1
+        loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+        loss_md5=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
+        ips=-1
+        mem=-1
+        echo "result: to_static=$to_static loss=$loss loss_md5=$loss_md5 ips=$ips mem=$mem"
+        loss_base=9.25199432
+        loss_md5_base=83531e98ee11cd271db175150ab254bb
+        ips_base=-1
+        mem_base=-1
+        check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
+        check_md5_result $FUNCNAME ${loss_md5_base} ${loss_md5}
+    done
+    echo "=========== $FUNCNAME run end ==========="
+}
+
+
 function llama_align_dygraph_dy2st_auto_bs2_bf16_DP2-MP1-PP1() {
     echo "=========== $FUNCNAME run begin ==========="
     export PYTHONPATH=$root_path/:$PYTHONPATH
@@ -1886,6 +2069,21 @@ EOF
 
 # ########### case end ############
 
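+# check_md5_result <case_name> <loss_md5_base> <loss_md5>
+# Compares the observed loss_md5 against the recorded baseline and fails the case on any mismatch.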
+function check_md5_result() {
+    echo -e "$1" >> ${log_path}/result.log
+
+    if [ $# -ne 3 ]; then
+        echo -e "\033[31m $1 parameter transfer failed: $@ \033[0m" | tee -a ${log_path}/result.log
+        exit -1
+    fi
+
+    echo -e "loss_md5_base: $2 loss_md5: $3" | tee -a ${log_path}/result.log
+    if [ "$2" != "$3" ]; then
+        echo -e "\033[31m $1 loss_md5 diff check failed! \033[0m" | tee -a ${log_path}/result.log
+        exit -1
+    fi
+}
+
 function check_result() {
     echo -e "$1" >> ${log_path}/result.log
     if [ $? -ne 0 ]; then