From 47b79c2050d490695eadb9693c5eab823570ffdc Mon Sep 17 00:00:00 2001 From: liujie44 Date: Mon, 13 May 2024 10:00:23 +0800 Subject: [PATCH 1/5] fix ci requirements --- scripts/distribute/ci_case_auto.sh | 2 ++ scripts/distribute/ci_case_dy.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh index 56d8cad73525..a211ce19c9af 100755 --- a/scripts/distribute/ci_case_auto.sh +++ b/scripts/distribute/ci_case_auto.sh @@ -2108,6 +2108,8 @@ function before_hook_for_gpt() { if [[ $FLAGS_install_deps == 0 ]];then echo -e "\033[31m ---- Install requirements for GPT auto cases \033[0m" python -m pip install -r requirements.txt --force-reinstall + python -m pip install -r $root_path/requirements.txt + python -m pip install -r $root_path/requirements-dev.txt python -m pip install --no-cache-dir https://paddlenlp.bj.bcebos.com/wheels/paddlenlp-ci-py3-none-any.whl --force-reinstall --no-dependencies python -c "import paddlenlp; print('paddlenlp commit:',paddlenlp.version.commit)"; else diff --git a/scripts/distribute/ci_case_dy.sh b/scripts/distribute/ci_case_dy.sh index 4e9697f26403..ad6979f07902 100644 --- a/scripts/distribute/ci_case_dy.sh +++ b/scripts/distribute/ci_case_dy.sh @@ -515,6 +515,8 @@ function before_hook_for_gpt() { if [[ $FLAGS_install_deps == 0 ]];then echo -e "\033[31m ---- Install requirements for GPT dygraph cases \033[0m" python -m pip install -r requirements.txt --force-reinstall + python -m pip install -r $root_path/requirements.txt + python -m pip install -r $root_path/requirements-dev.txt python -m pip install --no-cache-dir https://paddlenlp.bj.bcebos.com/wheels/paddlenlp-ci-py3-none-any.whl --force-reinstall --no-dependencies python -c "import paddlenlp; print('paddlenlp commit:',paddlenlp.version.commit)"; else From ae73d7697acc9511f0912ebaf2f1dc3aa66e7dea Mon Sep 17 00:00:00 2001 From: liujie44 Date: Mon, 13 May 2024 10:38:45 +0800 Subject: [PATCH 2/5] add llama2_auto benchmark --- ..._bf16_DP1_MP1_PP4_VPP5_Sharding8_Stage2.sh | 25 +++ .../llama2/benchmark_common/prepare.sh | 36 ++++ .../llama2/benchmark_common/run_benchmark.sh | 164 ++++++++++++++++++ .../pretrain-llama2_13b.json | 53 ++++++ 4 files changed, 278 insertions(+) create mode 100644 tests/test_tipc/static/auto_parallel/llama2/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_VPP5_Sharding8_Stage2.sh create mode 100644 tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh create mode 100644 tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh create mode 100644 tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json diff --git a/tests/test_tipc/static/auto_parallel/llama2/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_VPP5_Sharding8_Stage2.sh b/tests/test_tipc/static/auto_parallel/llama2/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_VPP5_Sharding8_Stage2.sh new file mode 100644 index 000000000000..6e0413f39a6a --- /dev/null +++ b/tests/test_tipc/static/auto_parallel/llama2/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_VPP5_Sharding8_Stage2.sh @@ -0,0 +1,25 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +param="model_item=meta-llama-Llama-2-13b_pretrain_dy2st " +param+="run_mode=DP1_MP1_PP4_VPP5_Sharding8_Stage2 " +param+="device_num=N4C32 " +param+="global_batch_size=32 " +param+="nnodes=4 " +param+="model_type=llama2_13b " + +cd ./tests +bash ./test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh + +bash -c "${param} bash ./test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh" diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh new file mode 100644 index 000000000000..ff12fc5b0346 --- /dev/null +++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh @@ -0,0 +1,36 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +python -m pip install -r ../requirements.txt +python -m pip install -r ../requirements-dev.txt + +# install fused_ln custom ops +cd ../model_zoo/gpt-3/external_ops/ +python setup.py install + +# install tool_helpers +cd ../../../llm/llama +python -m pip install tool_helpers + +# download data +cd auto_parallel +wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy +wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz +mkdir data +mv llama_openwebtext_100k_ids.npy ./data +mv llama_openwebtext_100k_idx.npz ./data + +# mv pretrain_config +rm -rf pretrain_config_* +cp -r ../../tests/test_tipc/static/auto_parallel/llama2/pretrain_config_* ./ diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh new file mode 100644 index 000000000000..c03ad99c99f1 --- /dev/null +++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh @@ -0,0 +1,164 @@ +#!/usr/bin/env bash + +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Test training benchmark for a model. 
+# Usage:bash benchmark/run_benchmark.sh ${model_name_or_path} ${per_device_train_batch_size} ${tensor_parallel_degree} ${pipeline_parallel_degree} ${virtual_pp_degree} ${sequence_parallel} ${sharding_parallel_degree} ${sharding} ${recompute} ${run_mode} ${device_num} +function _set_params(){ + model_item=${model_item:-"meta-llama-Llama-2-7b_pretrain"} + run_mode=${run_mode:-"MP2-PP1"} + device_num=${device_num:-"N1C8"} + global_batch_size=${global_batch_size:-64} + fp_item="bf16" + MODEL_TYPE=${model_type:-"llama2_7b"} + + ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' ')) + master_ip=${ip_lists[0]} + nnodes=${nnodes:-1} + + base_batch_size=${global_batch_size} + profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 + model_repo="PaddleNLP" # (必选) 模型套件的名字 + speed_unit="tokens/s" # (必选)速度指标单位 + skip_steps=10 # (必选)解析日志,跳过模型前几个性能不稳定的step + keyword="interval_tokens_per_second_per_device:" # (必选)解析日志,筛选出性能数据所在行的关键字 + convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" + model_mode=5 # 获取ips数据及单位,仅跳过skip_steps后计算均值,单位保持token/s不变 + + # 以下为通用执行命令,无特殊可不用修改 + model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 + device=${CUDA_VISIBLE_DEVICES//,/ } + arr=(${device}) + num_gpu_devices=${#arr[*]} + run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 + profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 + speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} + train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log + mkdir -p $(dirname ${train_log_file}) + + profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling + mkdir -p $(dirname ${profiling_log_file}) + + speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed + mkdir -p $(dirname ${speed_log_file}) + + OUTPUT_PATH=${run_log_path}/output +} + +function _train(){ + batch_size=${per_device_train_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs + + if [ -d $OUTPUT_PATH ]; then + rm -rf $OUTPUT_PATH + fi + mkdir $OUTPUT_PATH + + echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" + + if [ ${profiling} == "true" ];then + add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" + log_file=${profiling_log_file} + else + add_options="" + log_file=${train_log_file} + fi + + # Disable for hanging bug + # if [ "${tensor_parallel_degree}" != "1" ]; then + # export CUDA_DEVICE_MAX_CONNECTIONS=1 + # fi + + # if [ ${run_mode} == "autotuner" ]; then + # unset PADDLE_ELASTIC_JOB_ID + # unset PADDLE_TRAINER_ENDPOINTS + # unset DISTRIBUTED_TRAINER_ENDPOINTS + # unset FLAGS_START_PORT + # unset PADDLE_ELASTIC_TIMEOUT + # unset PADDLE_TRAINERS_NUM + # unset PADDLE_TRAINER_ID + # autoconfig_args="--auto_tuner_json ./auto_config_${MODEL_TYPE}/${MODEL_TYPE}_pretrain_autoconfig.json" + # else + # autoconfig_args="" + # fi + + if [ ${PADDLE_TRAINER_ID} ]; then + PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" + else + PADDLE_RANK_OPTION="" + fi + + # if [ "$autoconfig_args" != "" ]; then + # distributed_args="--master etcd://$master_ip:2379 --nnodes $nnodes:$nnodes" + # else + # distributed_args="--master $master_ip:36677 --nnodes $nnodes ${PADDLE_RANK_OPTION} --run_mode=collective" + # fi + + echo "==========System Env=============" + env + echo "=================================" + + # 以下为通用执行命令,无特殊可不用修改 + 
case ${device_num} in + N1C8) echo "Run with: device_num=${device_num}, run_mode=${run_mode}" + train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ + --nnodes 1 --nproc_per_node 8 \ + --log_dir mylog run_pretrain_auto.py \ + ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json" + ;; + N4C32) echo "Run with: device_num=${device_num} run_mode=${run_mode}" + train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ + --log_dir mylog run_pretrain_auto.py \ + ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json" + ;; + *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}" + train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ + --log_dir mylog run_pretrain_auto.py \ + ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json" + ;; + esac + cd ../llm/llama/auto_parallel/ + # rm -rf ./auto_config_${MODEL_TYPE}/*GBS* + # rm -rf ./auto_config_${MODEL_TYPE}/*auto_tuner.log + # rm -rf ./auto_config_${MODEL_TYPE}/*csv + # rm -rf ./auto_config_${MODEL_TYPE}/best_* + rm -rf mylog && rm -rf checkpoints + + echo "train_cmd: ${train_cmd} log_file: ${log_file}" + timeout 15m ${train_cmd} > ${log_file} 2>&1 + + if [ $? -ne 0 ];then + echo -e "${model_name}, FAIL" + else + echo -e "${model_name}, SUCCESS" + fi + + #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` + if [ ${device_num} != "N1C1" ]; then + case_path=$PWD && cd - && mkdir -p mylog # PaddleNLP/tests/mylog + cp -r ${case_path}/mylog/workerlog.* ./mylog/ + fi +} + +export FLAGS_selected_gpus="0,1,2,3,4,5,6,7" +export NCCL_IB_DISABLE=0 +export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export PARALLEL_CROSS_ENTROPY=true + +source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 +_set_params $@ +#_train # 如果只产出训练log,不解析,可取消注释 +_run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 diff --git a/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json b/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json new file mode 100644 index 000000000000..3645473961b3 --- /dev/null +++ b/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json @@ -0,0 +1,53 @@ +{ + "model_name_or_path": "meta-llama/Llama-2-13b", + "tokenizer_name_or_path": "meta-llama/Llama-2-13b", + "input_dir": "./data", + "output_dir": "./checkpoints/llama2_pretrain_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 4, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 4, + "sharding": "stage2", + "sharding_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate", + "tensor_parallel_config": "enable_mp_async_allreduce", + "pipeline_parallel_config": "enable_send_recv_overlap", + "virtual_pp_degree": 5, + "sequence_parallel": 0, + "use_flash_attention": true, + "use_fused_rms_norm": true, + "fuse_attention_ffn": true, + "fuse_attention_qkv": true, + "use_fused_rope": true, + "fused_linear_param_grad_add": true, + "max_seq_length": 4096, + "learning_rate": 3e-05, + "min_learning_rate": 3e-06, + "warmup_steps": 30, + "logging_steps": 1, + "max_steps": 50, + "save_steps": 5000, + "eval_steps": 1000, + "weight_decay": 0.01, + "bf16": true, + "fp16_opt_level": "O2", + "amp_custom_black_list": "reduce_sum c_softmax_with_cross_entropy", + "amp_custom_white_list": 
"lookup_table lookup_table_v2", + "amp_master_grad": true, + "warmup_ratio": 0.01, + "max_grad_norm": 1.0, + "dataloader_num_workers": 1, + "continue_training": 0, + "do_train": true, + "do_eval": false, + "do_predict": false, + "disable_tqdm": true, + "skip_profile_timer": true, + "recompute": false, + "recompute_use_reentrant": true, + "distributed_dataloader": 0, + "recompute_granularity": "full", + "save_total_limit": 2, + "device": "gpu", + "to_static": true +} From 45b282466276f993537711d8d1667143356f3aba Mon Sep 17 00:00:00 2001 From: liujie44 Date: Wed, 15 May 2024 09:49:42 +0800 Subject: [PATCH 3/5] fix --- .../static/auto_parallel/llama2/benchmark_common/prepare.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh index ff12fc5b0346..edb4590e2f15 100644 --- a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh +++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh @@ -33,4 +33,4 @@ mv llama_openwebtext_100k_idx.npz ./data # mv pretrain_config rm -rf pretrain_config_* -cp -r ../../tests/test_tipc/static/auto_parallel/llama2/pretrain_config_* ./ +cp -r ../../../tests/test_tipc/static/auto_parallel/llama2/pretrain_config_* ./ From 7c42a16f018e9429c89125288b9044d7e656c250 Mon Sep 17 00:00:00 2001 From: liujie44 Date: Wed, 15 May 2024 14:40:07 +0800 Subject: [PATCH 4/5] update config --- .../pretrain-llama2_13b.json | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json b/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json index 3645473961b3..0c323896d9fa 100644 --- a/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json +++ b/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json @@ -9,9 +9,11 @@ "tensor_parallel_degree": 1, "pipeline_parallel_degree": 4, "sharding": "stage2", - "sharding_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate", + "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate", + "sharding_parallel_config": "enable_stage2_overlap", "tensor_parallel_config": "enable_mp_async_allreduce", "pipeline_parallel_config": "enable_send_recv_overlap", + "pipeline_schedule_mode": "VPP", "virtual_pp_degree": 5, "sequence_parallel": 0, "use_flash_attention": true, @@ -31,8 +33,8 @@ "weight_decay": 0.01, "bf16": true, "fp16_opt_level": "O2", - "amp_custom_black_list": "reduce_sum c_softmax_with_cross_entropy", - "amp_custom_white_list": "lookup_table lookup_table_v2", + "amp_custom_black_list": ["reduce_sum", "c_softmax_with_cross_entropy"], + "amp_custom_white_list": ["lookup_table", "lookup_table_v2"], "amp_master_grad": true, "warmup_ratio": 0.01, "max_grad_norm": 1.0, @@ -49,5 +51,6 @@ "recompute_granularity": "full", "save_total_limit": 2, "device": "gpu", - "to_static": true + "to_static": true, + "enable_auto_parallel": true } From 5f45efd98ff087f18b3889ff3a546bd5c2eb30c7 Mon Sep 17 00:00:00 2001 From: liujie44 Date: Fri, 17 May 2024 09:47:19 +0800 Subject: [PATCH 5/5] Add ips log for per card --- llm/llama/auto_parallel/run_pretrain_auto.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llm/llama/auto_parallel/run_pretrain_auto.py 
b/llm/llama/auto_parallel/run_pretrain_auto.py index 6308af2aa06a..e7be917eecc7 100644 --- a/llm/llama/auto_parallel/run_pretrain_auto.py +++ b/llm/llama/auto_parallel/run_pretrain_auto.py @@ -377,6 +377,7 @@ def get_train_data_file(args): class PretrainingTrainer(AutoTrainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.is_pretraining = True def _wrap_for_dist_loader(self, train_dataloader): dist_loader = super()._wrap_for_dist_loader(train_dataloader)
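
For reference, below is a minimal sketch of how the new N4C32 case is expected to be driven end to end, assuming the PaddleNLP repo root as the working directory and a CI environment that already provides the benchmark variables consumed by `run_benchmark.sh` (`TRAINER_INSTANCES`, `TRAIN_LOG_DIR`, `BENCHMARK_ROOT`, and so on). The exported-variable form is only a stand-in for the `bash -c "${param} bash ./...run_benchmark.sh"` wrapper used in the N4C32 case script; it is not part of this patch series.

```bash
#!/usr/bin/env bash
# Sketch only: mirrors the flow of
# tests/test_tipc/static/auto_parallel/llama2/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_VPP5_Sharding8_Stage2.sh,
# with the case parameters spelled out as plain environment variables.
set -ex

cd ./tests

# prepare.sh installs requirements, builds the fused_ln custom ops, installs
# tool_helpers, downloads the llama_openwebtext_100k data into
# llm/llama/auto_parallel/data, and copies the pretrain_config_* directory.
bash ./test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh

# _set_params in run_benchmark.sh reads these via ${var:-default}, so exporting
# them is equivalent to the `bash -c "${param} bash run_benchmark.sh"` wrapper.
export model_item=meta-llama-Llama-2-13b_pretrain_dy2st
export run_mode=DP1_MP1_PP4_VPP5_Sharding8_Stage2
export device_num=N4C32
export global_batch_size=32
export nnodes=4
export model_type=llama2_13b

bash ./test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
```

In the CI pipeline, `run_model.sh` (sourced from `${BENCHMARK_ROOT}`) then calls `_run`, which wraps `_train` and parses the `interval_tokens_per_second_per_device:` lines from the training log — the per-card ips metric that PATCH 5/5's `is_pretraining = True` flag is meant to enable.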