From 47b79c2050d490695eadb9693c5eab823570ffdc Mon Sep 17 00:00:00 2001 From: liujie44 Date: Mon, 13 May 2024 10:00:23 +0800 Subject: [PATCH 1/5] fix ci requirements --- scripts/distribute/ci_case_auto.sh | 2 ++ scripts/distribute/ci_case_dy.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh index 56d8cad73525..a211ce19c9af 100755 --- a/scripts/distribute/ci_case_auto.sh +++ b/scripts/distribute/ci_case_auto.sh @@ -2108,6 +2108,8 @@ function before_hook_for_gpt() { if [[ $FLAGS_install_deps == 0 ]];then echo -e "\033[31m ---- Install requirements for GPT auto cases \033[0m" python -m pip install -r requirements.txt --force-reinstall + python -m pip install -r $root_path/requirements.txt + python -m pip install -r $root_path/requirements-dev.txt python -m pip install --no-cache-dir https://paddlenlp.bj.bcebos.com/wheels/paddlenlp-ci-py3-none-any.whl --force-reinstall --no-dependencies python -c "import paddlenlp; print('paddlenlp commit:',paddlenlp.version.commit)"; else diff --git a/scripts/distribute/ci_case_dy.sh b/scripts/distribute/ci_case_dy.sh index 4e9697f26403..ad6979f07902 100644 --- a/scripts/distribute/ci_case_dy.sh +++ b/scripts/distribute/ci_case_dy.sh @@ -515,6 +515,8 @@ function before_hook_for_gpt() { if [[ $FLAGS_install_deps == 0 ]];then echo -e "\033[31m ---- Install requirements for GPT dygraph cases \033[0m" python -m pip install -r requirements.txt --force-reinstall + python -m pip install -r $root_path/requirements.txt + python -m pip install -r $root_path/requirements-dev.txt python -m pip install --no-cache-dir https://paddlenlp.bj.bcebos.com/wheels/paddlenlp-ci-py3-none-any.whl --force-reinstall --no-dependencies python -c "import paddlenlp; print('paddlenlp commit:',paddlenlp.version.commit)"; else From ae73d7697acc9511f0912ebaf2f1dc3aa66e7dea Mon Sep 17 00:00:00 2001 From: liujie44 Date: Mon, 13 May 2024 10:38:45 +0800 Subject: [PATCH 2/5] add llama2_auto benchmark --- ..._bf16_DP1_MP1_PP4_VPP5_Sharding8_Stage2.sh | 25 +++ .../llama2/benchmark_common/prepare.sh | 36 ++++ .../llama2/benchmark_common/run_benchmark.sh | 164 ++++++++++++++++++ .../pretrain-llama2_13b.json | 53 ++++++ 4 files changed, 278 insertions(+) create mode 100644 tests/test_tipc/static/auto_parallel/llama2/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_VPP5_Sharding8_Stage2.sh create mode 100644 tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh create mode 100644 tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh create mode 100644 tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json diff --git a/tests/test_tipc/static/auto_parallel/llama2/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_VPP5_Sharding8_Stage2.sh b/tests/test_tipc/static/auto_parallel/llama2/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_VPP5_Sharding8_Stage2.sh new file mode 100644 index 000000000000..6e0413f39a6a --- /dev/null +++ b/tests/test_tipc/static/auto_parallel/llama2/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_VPP5_Sharding8_Stage2.sh @@ -0,0 +1,25 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +param="model_item=meta-llama-Llama-2-13b_pretrain_dy2st " +param+="run_mode=DP1_MP1_PP4_VPP5_Sharding8_Stage2 " +param+="device_num=N4C32 " +param+="global_batch_size=32 " +param+="nnodes=4 " +param+="model_type=llama2_13b " + +cd ./tests +bash ./test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh + +bash -c "${param} bash ./test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh" diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh new file mode 100644 index 000000000000..ff12fc5b0346 --- /dev/null +++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh @@ -0,0 +1,36 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +python -m pip install -r ../requirements.txt +python -m pip install -r ../requirements-dev.txt + +# install fused_ln custom ops +cd ../model_zoo/gpt-3/external_ops/ +python setup.py install + +# install tool_helpers +cd ../../../llm/llama +python -m pip install tool_helpers + +# download data +cd auto_parallel +wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy +wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz +mkdir data +mv llama_openwebtext_100k_ids.npy ./data +mv llama_openwebtext_100k_idx.npz ./data + +# mv pretrain_config +rm -rf pretrain_config_* +cp -r ../../tests/test_tipc/static/auto_parallel/llama2/pretrain_config_* ./ diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh new file mode 100644 index 000000000000..c03ad99c99f1 --- /dev/null +++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh @@ -0,0 +1,164 @@ +#!/usr/bin/env bash + +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Test training benchmark for a model. 
+# Usage:bash benchmark/run_benchmark.sh ${model_name_or_path} ${per_device_train_batch_size} ${tensor_parallel_degree} ${pipeline_parallel_degree} ${virtual_pp_degree} ${sequence_parallel} ${sharding_parallel_degree} ${sharding} ${recompute} ${run_mode} ${device_num} +function _set_params(){ + model_item=${model_item:-"meta-llama-Llama-2-7b_pretrain"} + run_mode=${run_mode:-"MP2-PP1"} + device_num=${device_num:-"N1C8"} + global_batch_size=${global_batch_size:-64} + fp_item="bf16" + MODEL_TYPE=${model_type:-"llama2_7b"} + + ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' ')) + master_ip=${ip_lists[0]} + nnodes=${nnodes:-1} + + base_batch_size=${global_batch_size} + profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 + model_repo="PaddleNLP" # (必选) 模型套件的名字 + speed_unit="tokens/s" # (必选)速度指标单位 + skip_steps=10 # (必选)解析日志,跳过模型前几个性能不稳定的step + keyword="interval_tokens_per_second_per_device:" # (必选)解析日志,筛选出性能数据所在行的关键字 + convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" + model_mode=5 # 获取ips数据及单位,仅跳过skip_steps后计算均值,单位保持token/s不变 + + # 以下为通用执行命令,无特殊可不用修改 + model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 + device=${CUDA_VISIBLE_DEVICES//,/ } + arr=(${device}) + num_gpu_devices=${#arr[*]} + run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 + profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 + speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} + train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log + mkdir -p $(dirname ${train_log_file}) + + profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling + mkdir -p $(dirname ${profiling_log_file}) + + speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed + mkdir -p $(dirname ${speed_log_file}) + + OUTPUT_PATH=${run_log_path}/output +} + +function _train(){ + batch_size=${per_device_train_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs + + if [ -d $OUTPUT_PATH ]; then + rm -rf $OUTPUT_PATH + fi + mkdir $OUTPUT_PATH + + echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" + + if [ ${profiling} == "true" ];then + add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" + log_file=${profiling_log_file} + else + add_options="" + log_file=${train_log_file} + fi + + # Disable for hanging bug + # if [ "${tensor_parallel_degree}" != "1" ]; then + # export CUDA_DEVICE_MAX_CONNECTIONS=1 + # fi + + # if [ ${run_mode} == "autotuner" ]; then + # unset PADDLE_ELASTIC_JOB_ID + # unset PADDLE_TRAINER_ENDPOINTS + # unset DISTRIBUTED_TRAINER_ENDPOINTS + # unset FLAGS_START_PORT + # unset PADDLE_ELASTIC_TIMEOUT + # unset PADDLE_TRAINERS_NUM + # unset PADDLE_TRAINER_ID + # autoconfig_args="--auto_tuner_json ./auto_config_${MODEL_TYPE}/${MODEL_TYPE}_pretrain_autoconfig.json" + # else + # autoconfig_args="" + # fi + + if [ ${PADDLE_TRAINER_ID} ]; then + PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" + else + PADDLE_RANK_OPTION="" + fi + + # if [ "$autoconfig_args" != "" ]; then + # distributed_args="--master etcd://$master_ip:2379 --nnodes $nnodes:$nnodes" + # else + # distributed_args="--master $master_ip:36677 --nnodes $nnodes ${PADDLE_RANK_OPTION} --run_mode=collective" + # fi + + echo "==========System Env=============" + env + echo "=================================" + + # 以下为通用执行命令,无特殊可不用修改 + 
case ${device_num} in + N1C8) echo "Run with: device_num=${device_num}, run_mode=${run_mode}" + train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ + --nnodes 1 --nproc_per_node 8 \ + --log_dir mylog run_pretrain_auto.py \ + ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json" + ;; + N4C32) echo "Run with: device_num=${device_num} run_mode=${run_mode}" + train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ + --log_dir mylog run_pretrain_auto.py \ + ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json" + ;; + *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}" + train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ + --log_dir mylog run_pretrain_auto.py \ + ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json" + ;; + esac + cd ../llm/llama/auto_parallel/ + # rm -rf ./auto_config_${MODEL_TYPE}/*GBS* + # rm -rf ./auto_config_${MODEL_TYPE}/*auto_tuner.log + # rm -rf ./auto_config_${MODEL_TYPE}/*csv + # rm -rf ./auto_config_${MODEL_TYPE}/best_* + rm -rf mylog && rm -rf checkpoints + + echo "train_cmd: ${train_cmd} log_file: ${log_file}" + timeout 15m ${train_cmd} > ${log_file} 2>&1 + + if [ $? -ne 0 ];then + echo -e "${model_name}, FAIL" + else + echo -e "${model_name}, SUCCESS" + fi + + #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` + if [ ${device_num} != "N1C1" ]; then + case_path=$PWD && cd - && mkdir -p mylog # PaddleNLP/tests/mylog + cp -r ${case_path}/mylog/workerlog.* ./mylog/ + fi +} + +export FLAGS_selected_gpus="0,1,2,3,4,5,6,7" +export NCCL_IB_DISABLE=0 +export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export PARALLEL_CROSS_ENTROPY=true + +source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 +_set_params $@ +#_train # 如果只产出训练log,不解析,可取消注释 +_run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 diff --git a/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json b/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json new file mode 100644 index 000000000000..3645473961b3 --- /dev/null +++ b/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json @@ -0,0 +1,53 @@ +{ + "model_name_or_path": "meta-llama/Llama-2-13b", + "tokenizer_name_or_path": "meta-llama/Llama-2-13b", + "input_dir": "./data", + "output_dir": "./checkpoints/llama2_pretrain_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 4, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 4, + "sharding": "stage2", + "sharding_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate", + "tensor_parallel_config": "enable_mp_async_allreduce", + "pipeline_parallel_config": "enable_send_recv_overlap", + "virtual_pp_degree": 5, + "sequence_parallel": 0, + "use_flash_attention": true, + "use_fused_rms_norm": true, + "fuse_attention_ffn": true, + "fuse_attention_qkv": true, + "use_fused_rope": true, + "fused_linear_param_grad_add": true, + "max_seq_length": 4096, + "learning_rate": 3e-05, + "min_learning_rate": 3e-06, + "warmup_steps": 30, + "logging_steps": 1, + "max_steps": 50, + "save_steps": 5000, + "eval_steps": 1000, + "weight_decay": 0.01, + "bf16": true, + "fp16_opt_level": "O2", + "amp_custom_black_list": "reduce_sum c_softmax_with_cross_entropy", + "amp_custom_white_list": 
"lookup_table lookup_table_v2", + "amp_master_grad": true, + "warmup_ratio": 0.01, + "max_grad_norm": 1.0, + "dataloader_num_workers": 1, + "continue_training": 0, + "do_train": true, + "do_eval": false, + "do_predict": false, + "disable_tqdm": true, + "skip_profile_timer": true, + "recompute": false, + "recompute_use_reentrant": true, + "distributed_dataloader": 0, + "recompute_granularity": "full", + "save_total_limit": 2, + "device": "gpu", + "to_static": true +} From 45b282466276f993537711d8d1667143356f3aba Mon Sep 17 00:00:00 2001 From: liujie44 Date: Wed, 15 May 2024 09:49:42 +0800 Subject: [PATCH 3/5] fix --- .../static/auto_parallel/llama2/benchmark_common/prepare.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh index ff12fc5b0346..edb4590e2f15 100644 --- a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh +++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh @@ -33,4 +33,4 @@ mv llama_openwebtext_100k_idx.npz ./data # mv pretrain_config rm -rf pretrain_config_* -cp -r ../../tests/test_tipc/static/auto_parallel/llama2/pretrain_config_* ./ +cp -r ../../../tests/test_tipc/static/auto_parallel/llama2/pretrain_config_* ./ From 7c42a16f018e9429c89125288b9044d7e656c250 Mon Sep 17 00:00:00 2001 From: liujie44 Date: Wed, 15 May 2024 14:40:07 +0800 Subject: [PATCH 4/5] update config --- .../pretrain-llama2_13b.json | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json b/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json index 3645473961b3..0c323896d9fa 100644 --- a/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json +++ b/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json @@ -9,9 +9,11 @@ "tensor_parallel_degree": 1, "pipeline_parallel_degree": 4, "sharding": "stage2", - "sharding_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate", + "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate", + "sharding_parallel_config": "enable_stage2_overlap", "tensor_parallel_config": "enable_mp_async_allreduce", "pipeline_parallel_config": "enable_send_recv_overlap", + "pipeline_schedule_mode": "VPP", "virtual_pp_degree": 5, "sequence_parallel": 0, "use_flash_attention": true, @@ -31,8 +33,8 @@ "weight_decay": 0.01, "bf16": true, "fp16_opt_level": "O2", - "amp_custom_black_list": "reduce_sum c_softmax_with_cross_entropy", - "amp_custom_white_list": "lookup_table lookup_table_v2", + "amp_custom_black_list": ["reduce_sum", "c_softmax_with_cross_entropy"], + "amp_custom_white_list": ["lookup_table", "lookup_table_v2"], "amp_master_grad": true, "warmup_ratio": 0.01, "max_grad_norm": 1.0, @@ -49,5 +51,6 @@ "recompute_granularity": "full", "save_total_limit": 2, "device": "gpu", - "to_static": true + "to_static": true, + "enable_auto_parallel": true } From 5f45efd98ff087f18b3889ff3a546bd5c2eb30c7 Mon Sep 17 00:00:00 2001 From: liujie44 Date: Fri, 17 May 2024 09:47:19 +0800 Subject: [PATCH 5/5] Add ips log for per card --- llm/llama/auto_parallel/run_pretrain_auto.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llm/llama/auto_parallel/run_pretrain_auto.py 
b/llm/llama/auto_parallel/run_pretrain_auto.py index 6308af2aa06a..e7be917eecc7 100644 --- a/llm/llama/auto_parallel/run_pretrain_auto.py +++ b/llm/llama/auto_parallel/run_pretrain_auto.py @@ -377,6 +377,7 @@ def get_train_data_file(args): class PretrainingTrainer(AutoTrainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.is_pretraining = True def _wrap_for_dist_loader(self, train_dataloader): dist_loader = super()._wrap_for_dist_loader(train_dataloader)
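
For reference, below is a minimal sketch of how the new N4C32 case is expected to be driven end to end, assuming the PaddleNLP repo root as the working directory and a CI environment that already provides the benchmark variables consumed by `run_benchmark.sh` (`TRAINER_INSTANCES`, `TRAIN_LOG_DIR`, `BENCHMARK_ROOT`, and so on). The exported-variable form is only a stand-in for the `bash -c "${param} bash ./...run_benchmark.sh"` wrapper used in the N4C32 case script; it is not part of this patch series.

```bash
#!/usr/bin/env bash
# Sketch only: mirrors the flow of
# tests/test_tipc/static/auto_parallel/llama2/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_VPP5_Sharding8_Stage2.sh,
# with the case parameters spelled out as plain environment variables.
set -ex

cd ./tests

# prepare.sh installs requirements, builds the fused_ln custom ops, installs
# tool_helpers, downloads the llama_openwebtext_100k data into
# llm/llama/auto_parallel/data, and copies the pretrain_config_* directory.
bash ./test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh

# _set_params in run_benchmark.sh reads these via ${var:-default}, so exporting
# them is equivalent to the `bash -c "${param} bash run_benchmark.sh"` wrapper.
export model_item=meta-llama-Llama-2-13b_pretrain_dy2st
export run_mode=DP1_MP1_PP4_VPP5_Sharding8_Stage2
export device_num=N4C32
export global_batch_size=32
export nnodes=4
export model_type=llama2_13b

bash ./test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
```

In the CI pipeline, `run_model.sh` (sourced from `${BENCHMARK_ROOT}`) then calls `_run`, which wraps `_train` and parses the `interval_tokens_per_second_per_device:` lines from the training log — the per-card ips metric that PATCH 5/5's `is_pretraining = True` flag is meant to enable.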