@@ -60,6 +60,7 @@ function llama_case_list_auto() {
     llama_dygraph_auto_bs8_fp32_DP2-MP2
     llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2
     llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2
+    llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2-VPP3_split_bw
     llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2
 
     llama_static_auto_recompute_bs8_fp32_DP1-MP1-PP1
@@ -1668,6 +1669,12 @@ function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2() {
     export FLAGS_call_stack_level=3
     export NVIDIA_TF32_OVERRIDE=0
 
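+    # keep cuDNN / embedding kernels deterministic so the loss check against loss_base stays stable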
+    export FLAGS_cudnn_deterministic=1
+    export FLAGS_embedding_deterministic=1
+
+    export CUDA_DEVICE_MAX_CONNECTIONS=1
+    export PARALLEL_CROSS_ENTROPY=true
+
     task_name="llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2"
     case_out_dir="output/$task_name"
     case_log_dir="output/$task_name""_log"
@@ -1724,7 +1731,7 @@ function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2() {
         --max_seq_length 4096 \
         --sep_parallel_degree 1 \
         --sequence_parallel false \
-        --pipeline_parallel_degree 2 \
+        --pipeline_parallel_degree 4 \
         --sharding_parallel_degree 2 \
         --tensor_parallel_degree 1 \
         --virtual_pp_degree 3 \
@@ -1741,12 +1748,111 @@ function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2() {
         --skip_memory_metrics 0 \
         >> ${log_path}/$FUNCNAME 2>&1
     loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
-    ips=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'interval_samples_per_second: ' '{print $2}' | awk -F ',' '{print $1}'`
-    mem=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'current_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
+    ips=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
+    mem=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
+    echo "result: loss=$loss ips=$ips mem=$mem"
+    loss_base=7.5364624
+    ips_base=5442.5208
+    mem_base=22.387750148773193
+    check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
+    echo "=========== $FUNCNAME run end ==========="
+}
+
+function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2-VPP3_split_bw() {
+    # Only A100 supports this case.
+    if [ $IS_A100 -eq 0 ]; then
+        return
+    fi
+    echo "=========== $FUNCNAME run begin ==========="
+    export PYTHONPATH=$root_path/:$PYTHONPATH
+    export FLAGS_call_stack_level=3
+    export NVIDIA_TF32_OVERRIDE=0
+
+    export FLAGS_cudnn_deterministic=1
+    export FLAGS_embedding_deterministic=1
+
+    export CUDA_DEVICE_MAX_CONNECTIONS=1
+    export PARALLEL_CROSS_ENTROPY=true
+
+    task_name="llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2-VPP3_split_bw"
+    case_out_dir="output/$task_name"
+    case_log_dir="output/$task_name""_log"
+    rm -rf $case_out_dir
+    rm -rf $case_log_dir
+
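+    # 8-GPU run: PP4 x SD2 (stage2 sharding), VPP schedule with 3 virtual stages and split backward enabled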
+    python -u -m paddle.distributed.launch \
+        --gpus "0,1,2,3,4,5,6,7" \
+        --log_dir "output/$task_name""_log" \
+        ./run_pretrain_auto.py \
+        --model_name_or_path "meta-llama/Llama-2-13b" \
+        --tokenizer_name_or_path "meta-llama/Llama-2-13b" \
+        --input_dir "./data" \
+        --output_dir "./output" \
+        --split 949,50,1 \
+        --weight_decay 0.01 \
+        --warmup_ratio 0.01 \
+        --max_grad_norm 1.0 \
+        --learning_rate 3e-05 \
+        --min_learning_rate 3e-06 \
+        --max_steps 30 \
+        --logging_steps 10 \
+        --eval_steps 1000 \
+        --save_steps 50000 \
+        --continue_training 0 \
+        --do_train true \
+        --do_eval false \
+        --do_predict false \
+        --disable_tqdm true \
+        --skip_profile_timer true \
+        --save_total_limit 2 \
+        --device gpu \
+        --disable_tqdm true \
+        --dataloader_num_workers 1 \
+        --distributed_dataloader 0 \
+        --enable_auto_parallel 1 \
+        --per_device_train_batch_size 1 \
+        --gradient_accumulation_steps 4 \
+        --per_device_eval_batch_size 1 \
+        --recompute false \
+        --recompute_use_reentrant true \
+        --recompute_granularity full \
+        --pp_recompute_interval 0 \
+        --bf16 true \
+        --fp16_opt_level "O2" \
+        --amp_master_grad true \
+        --fuse_attention_ffn false \
+        --fuse_attention_qkv true \
+        --fused_linear_param_grad_add 1 \
+        --fuse_sequence_parallel_allreduce false \
+        --use_flash_attention true \
+        --use_fused_rope true \
+        --use_fused_rms_norm true \
+        --max_seq_length 4096 \
+        --sep_parallel_degree 1 \
+        --sequence_parallel false \
+        --pipeline_parallel_degree 4 \
+        --sharding_parallel_degree 2 \
+        --tensor_parallel_degree 1 \
+        --virtual_pp_degree 3 \
+        --pipeline_schedule_mode "VPP" \
+        --sharding "stage2" \
+        --pipeline_parallel_config "enable_send_recv_overlap enable_split_backward" \
+        --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
+        --sharding_parallel_config "enable_stage2_overlap" \
+        --tensor_parallel_config "enable_mp_async_allreduce" \
+        --to_static 1 \
+        --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
+        --amp_custom_white_list "lookup_table" "lookup_table_v2" \
+        --num_hidden_layers 12 \
+        --skip_memory_metrics 0 \
+        >> ${log_path}/$FUNCNAME 2>&1
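+    # parse loss, tokens/s per device, and max reserved memory from the step-30 line of worker 0's log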
+    loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+    ips=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
+    mem=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
     echo "result: loss=$loss ips=$ips mem=$mem"
-    loss_base=7.52383575
-    ips_base=12.4135
-    mem_base=29.140248775482178
+    loss_base=7.5364624
+    ips_base=5864.2898
+    mem_base=23.745134115219116
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run end ==========="
 }