@@ -667,80 +667,84 @@ function llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP1-SP() {
     case_log_dir="output/$task_name""_log"
 
     for to_static in "0" "1"; do
-        rm -rf $case_out_dir
-        rm -rf $case_log_dir
-        python -u -m paddle.distributed.launch \
-            --gpus "0,1,2,3" \
-            --log_dir $case_log_dir \
-            run_pretrain_auto.py \
-            --model_type "llama" \
-            --model_name_or_path "facebook/llama-7b" \
-            --tokenizer_name_or_path "facebook/llama-7b" \
-            --input_dir "./data" \
-            --output_dir $case_out_dir \
-            --split 949,50,1 \
-            --weight_decay 0.01 \
-            --warmup_ratio 0.01 \
-            --max_grad_norm 0.0 \
-            --learning_rate 3e-05 \
-            --min_learning_rate 3e-06 \
-            --max_steps 10 \
-            --logging_steps 10 \
-            --eval_steps 1000 \
-            --save_steps 50000 \
-            --continue_training 0 \
-            --do_train true \
-            --do_eval false \
-            --do_predict false \
-            --disable_tqdm true \
-            --skip_profile_timer true \
-            --save_total_limit 2 \
-            --device gpu \
-            --disable_tqdm true \
-            --dataloader_num_workers 1 \
-            --enable_auto_parallel 1 \
-            --per_device_train_batch_size 1 \
-            --gradient_accumulation_steps 1 \
-            --per_device_eval_batch_size 2 \
-            --recompute false \
-            --bf16 1 \
-            --fp16_opt_level "O2" \
-            --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
-            --amp_custom_white_list "lookup_table" "lookup_table_v2" \
-            --amp_master_grad 1 \
-            --fuse_attention_ffn false \
-            --fuse_attention_qkv false \
-            --fuse_sequence_parallel_allreduce false \
-            --use_flash_attention 0 \
-            --use_fused_rope false \
-            --use_fused_rms_norm 0 \
-            --max_seq_length 4096 \
-            --sep_parallel_degree 1 \
-            --sequence_parallel true \
-            --pipeline_parallel_degree 1 \
-            --sharding_parallel_degree 1 \
-            --tensor_parallel_degree 2 \
-            --virtual_pp_degree 1 \
-            --sharding "" \
-            --to_static ${to_static} \
-            --num_hidden_layers 4 \
-            >>${log_path}/$FUNCNAME 2>&1
-        loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
-        loss_md5=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
-        ips=-1
-        mem=-1
-        echo "result: to_static=$to_static loss=$loss ips=$ips mem=$mem"
-        loss_base=9.16783295
-        loss_md5_base=8ea72495fba4e1b9ba004b4431e27218
-        if [ $IS_A100 -ne 0 ] && [ $to_static -eq 0 ]; then
-            loss_base=9.37966919
-        elif [ $IS_A100 -ne 0 ] && [ $to_static -eq 1 ]; then
-            loss_base=9.38012543
-        fi
-        ips_base=-1
-        mem_base=-1
-        check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
-        # check_md5_result $FUNCNAME ${loss_md5_base} ${loss_md5}
+        for use_recompute in "1" "0"; do
+            rm -rf $case_out_dir
+            rm -rf $case_log_dir
+            python -u -m paddle.distributed.launch \
+                --gpus "0,1,2,3" \
+                --log_dir $case_log_dir \
+                run_pretrain_auto.py \
+                --model_type "llama" \
+                --model_name_or_path "facebook/llama-7b" \
+                --tokenizer_name_or_path "facebook/llama-7b" \
+                --input_dir "./data" \
+                --output_dir $case_out_dir \
+                --split 949,50,1 \
+                --weight_decay 0.01 \
+                --warmup_ratio 0.01 \
+                --max_grad_norm 0.0 \
+                --learning_rate 3e-05 \
+                --min_learning_rate 3e-06 \
+                --max_steps 10 \
+                --logging_steps 10 \
+                --eval_steps 1000 \
+                --save_steps 50000 \
+                --continue_training 0 \
+                --do_train true \
+                --do_eval false \
+                --do_predict false \
+                --disable_tqdm true \
+                --skip_profile_timer true \
+                --save_total_limit 2 \
+                --device gpu \
+                --disable_tqdm true \
+                --dataloader_num_workers 1 \
+                --enable_auto_parallel 1 \
+                --per_device_train_batch_size 1 \
+                --gradient_accumulation_steps 1 \
+                --per_device_eval_batch_size 2 \
+                --recompute ${use_recompute} \
+                --bf16 1 \
+                --fp16_opt_level "O2" \
+                --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
+                --amp_custom_white_list "lookup_table" "lookup_table_v2" \
+                --amp_master_grad 1 \
+                --fuse_attention_ffn false \
+                --fuse_attention_qkv false \
+                --fuse_sequence_parallel_allreduce false \
+                --use_flash_attention 0 \
+                --use_fused_rope false \
+                --use_fused_rms_norm 0 \
+                --max_seq_length 4096 \
+                --sep_parallel_degree 1 \
+                --sequence_parallel true \
+                --pipeline_parallel_degree 1 \
+                --sharding_parallel_degree 1 \
+                --tensor_parallel_degree 2 \
+                --virtual_pp_degree 1 \
+                --sharding "" \
+                --to_static ${to_static} \
+                --num_hidden_layers 4 \
+                >>${log_path}/$FUNCNAME 2>&1
+            loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+            loss_md5=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
+            ips=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
+            mem=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
+            echo "result: to_static=$to_static loss=$loss ips=$ips mem=$mem"
+            loss_base=9.16783295
+            loss_md5_base=8ea72495fba4e1b9ba004b4431e27218
+            if [ $IS_A100 -ne 0 ] && [ $to_static -eq 0 ]; then
+                loss_base=9.37966919
+            elif [ $IS_A100 -ne 0 ] && [ $to_static -eq 1 ]; then
+                loss_base=9.38012543
+            fi
+            ips=-1
+            mem=-1
+            ips_base=-1
+            mem_base=-1
+            check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
+            # check_md5_result $FUNCNAME ${loss_md5_base} ${loss_md5}
+        done
     done
     echo "=========== $FUNCNAME run end ==========="
 }
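
Note on the metric extraction used above: each value is pulled from workerlog.0 by splitting the step-10 log line on the metric name and then on the following comma. A minimal sketch of that grep/awk pattern, using a hypothetical log line (the exact trainer log format is not shown in this diff):

# hypothetical example of the final-step line a worker writes to workerlog.0
sample_line='global_step: 10, loss: 9.16783295, interval_tokens_per_second_per_device: 1234.5, max_memory_reserved: 12345678,'
# same pattern as the script: take everything after 'loss: ' up to the next comma
echo "$sample_line" | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'   # prints 9.16783295
# swapping the first field separator extracts the other metrics in the same way
echo "$sample_line" | grep 'global_step: 10' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'   # prints 12345678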