From ef5bd1de8e259d7552d7e536552d0a296e2a071a Mon Sep 17 00:00:00 2001 From: mmglove Date: Fri, 10 May 2024 17:39:17 +0800 Subject: [PATCH 01/12] add llama-7b_auto_dp2mp2pp2 benchmark sh --- ...lama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh | 12 ++ .../benchmark_common/analysis_log.py | 76 ++++++++ .../benchmark_common/prepare.sh | 16 ++ .../benchmark_common/run_benchmark.sh | 173 +++++++++++++++++ .../dygraph/llama-7b_auto_dp2mp2pp2/readme.MD | 4 + ...lama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh | 12 ++ .../benchmark_common/analysis_log.py | 76 ++++++++ .../benchmark_common/prepare.sh | 16 ++ .../benchmark_common/run_benchmark.sh | 177 ++++++++++++++++++ .../llama-7b_auto_dp2mp2pp2/readme.MD | 4 + 10 files changed, 566 insertions(+) create mode 100644 tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh create mode 100644 tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py create mode 100644 tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh create mode 100644 tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh create mode 100644 tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/readme.MD create mode 100644 tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh create mode 100644 tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py create mode 100644 tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh create mode 100644 tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh create mode 100644 tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/readme.MD diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh new file mode 100644 index 000000000000..947334833d05 --- /dev/null +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh @@ -0,0 +1,12 @@ +model_item=llama-7b_auto_dp2mp2pp2 +bs_item=1 +fp_item=fp16 +run_mode=DP +device_num=N1C1 + +max_iter=100 + +# prepare +# bash ./test_tipc/dygraph/${model_item}/benchmark_common/prepare.sh +# run +bash ./test_tipc/dygraph/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} 2>&1; diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py new file mode 100644 index 000000000000..f4fd00825eae --- /dev/null +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py @@ -0,0 +1,76 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + +import json +import os +import re +import sys + +import numpy as np + + +def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item): + with open(str(log_file), "r", encoding="utf8") as f: + data = f.readlines() + ips_lines = [] + for eachline in data: + if "train_samples_per_second:" in eachline: + ips = float(eachline.split("train_samples_per_second: ")[1].split()[0].replace(',', '')) + print("----ips: ", ips) + ips_lines.append(ips) + print("----ips_lines: ", ips_lines) + ips = np.round(np.mean(ips_lines), 3) + ngpus = int(re.findall("\d+", device_num)[-1]) + print("----ips: ", ips, "ngpus", ngpus) + ips *= ngpus + run_mode = "DP" + + model_name = model_item + "_" + "bs" + str(bs) + "_" + fp_item + "_" + run_mode + info = { + "model_branch": os.getenv("model_branch"), + "model_commit": os.getenv("model_commit"), + "model_name": model_name, + "batch_size": bs, + "fp_item": fp_item, + "run_mode": run_mode, + "convergence_value": 0, + "convergence_key": "", + "ips": ips, + "speed_unit": "sample/sec", + "device_num": device_num, + "model_run_time": os.getenv("model_run_time"), + "frame_commit": os.getenv("frame_commit"), + "frame_version": os.getenv("frame_version"), + } + json_info = json.dumps(info) + print(json_info) + with open(res_log_file, "w") as of: + of.write(json_info) + + +if __name__ == "__main__": + if len(sys.argv) != 7: + print("Usage:" + sys.argv[0] + " model_item path/to/log/file path/to/res/log/file") + sys.exit() + + model_item = sys.argv[1] + log_file = sys.argv[2] + res_log_file = sys.argv[3] + device_num = sys.argv[4] + bs = int(sys.argv[5]) + fp_item = sys.argv[6] + + analyze(model_item, log_file, res_log_file, device_num, bs, fp_item) \ No newline at end of file diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh new file mode 100644 index 000000000000..d9c47aeeb7c9 --- /dev/null +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh @@ -0,0 +1,16 @@ +# install可选 +cd ../ # PaddleNLP 根目录 +pwd +pip install -e . +cd - + +# 下载、解压、拷贝必要数据集 +cd ../llm/llama/auto_parallel/ +# llama 模型数据下载 +wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy +wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz + +mkdir data +mv llama_openwebtext_100k_ids.npy ./data +mv llama_openwebtext_100k_idx.npz ./data +cd - \ No newline at end of file diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh new file mode 100644 index 000000000000..7d32efac8430 --- /dev/null +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh @@ -0,0 +1,173 @@ +#!/usr/bin/env bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Test training benchmark for a model. +# Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${mp_degree} ${pp_degree} ${dp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} +function _set_params(){ + model_item=${1:-"llama-7b_auto_dp2mp2pp2"} # (必选) 模型 item + base_batch_size=${2:-"1"} # (必选) + fp_item=${3:-"fp32"} # (必选) fp32|fp16|bf16 + run_mode=${4:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1 + device_num=${5:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) + profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 + + model_repo="PaddleNLP" # (必选) 模型套件的名字 + speed_unit="sample/s" # (必选)速度指标单位 + skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step + keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 + convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" + max_iter=${6:-100} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 + num_workers=0 # (可选) + + # 以下为通用执行命令,无特殊可不用修改 + model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 + device=${CUDA_VISIBLE_DEVICES//,/ } + arr=(${device}) + num_gpu_devices=${#arr[*]} + run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 + profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 + speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} + + train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log + profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling + speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed +} +function _train(){ + batch_size=${base_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs + echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" + + if [ ${profiling} = "true" ];then + add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" + log_file=${profiling_log_file} + else + add_options="" + log_file=${train_log_file} + fi + export FLAGS_group_schedule_tiling_first=1 + export FLAGS_cinn_bucket_compile=1 + export FLAGS_cinn_new_cluster_op_method=1 + export FLAGS_deny_cinn_ops="gather" + export FLAGS_prim_forward_blacklist="pd_op.embedding" + export FLAGS_enable_prim_after_distribute=True + export FLAGS_disable_dyshape_in_train=True + export FLAGS_enable_pir_in_executor=True + export FLAGS_enable_prim_after_distribute=1 + export FLAGS_enable_cinn_compile_cache=0 + export PYTHONUNBUFFERED=1 + + use_fp16_cmd="" + if [ $fp_item = "fp16" ]; then + use_fp16_cmd="--fp16 1 --fp16_opt_level O2" + fi + to_static=0 # 是否开启动转静训练 + train_cmd="run_pretrain_auto.py \ + --model_type "llama" \ + --model_name_or_path "facebook/llama-7b" \ + --tokenizer_name_or_path "facebook/llama-7b" \ + --input_dir "./data" \ + --output_dir "output/$model_item" \ + --split 949,50,1 \ + --max_seq_length 2048 \ + --per_device_train_batch_size ${batch_size} \ + --per_device_eval_batch_size 2 \ + --gradient_accumulation_steps 1 \ + --use_flash_attention 1 \ + --use_fused_rms_norm 0 \ + --scale_loss 1024 \ + --pipeline_parallel_degree 1 \ + --tensor_parallel_degree 1 \ + --sharding_parallel_degree 1 \ + --learning_rate 0.0001 \ + --min_learning_rate 0.00001 \ + --max_steps ${max_iter} \ + --save_steps 5000000 \ + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --logging_steps 20 \ + --dataloader_num_workers 1 \ + --sharding '' \ + --eval_steps 1000000 \ + 
--disable_tqdm true \ + --continue_training 0\ + --recompute 0 \ + --do_train \ + --do_eval \ + --device 'gpu' \ + --data_impl 'mmap' \ + --enable_auto_parallel 1 \ + --max_grad_norm 1.0 \ + --num_hidden_layers 4 \ + --to_static ${to_static} \ + ${use_fp16_cmd} " + + # 以下为通用执行命令,无特殊可不用修改 + case ${run_mode} in + DP) + rm -rf ./mylog # 注意执行前删掉log目录 + rm -rf output/$model_item + train_cmd="python -u -m paddle.distributed.launch --log_dir=./mylog \ + --gpus $CUDA_VISIBLE_DEVICES ${train_cmd}" + ;; + DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;; + *) echo "choose run_mode "; exit 1; + esac + + cd ../llm/llama/auto_parallel/ + echo "train_cmd: ${train_cmd} log_file: ${log_file}" + + python -c "import paddlenlp" + timeout 15m ${train_cmd} > ${log_file} 2>&1 + if [ $? -ne 0 ];then + echo -e "${model_name}, FAIL" + else + echo -e "${model_name}, SUCCESS" + fi + #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` + if [ -d mylog ]; then + case_path=$PWD && cd - && mkdir -p mylog # PaddleNLP/tests/mylog + cp -r ${case_path}/mylog/workerlog.* ./mylog/ + rm ${log_file} + cp ${case_path}/mylog/workerlog.0 ${log_file} + fi + + echo ${train_cmd} >> ${log_file} + cat ${log_file} +} + +function _analysis_log(){ + # PaddleNLP/tests 目录 + analysis_log_cmd="python test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py \ + ${model_item} ${log_file} ${speed_log_file} ${device_num} ${base_batch_size} ${fp_item}" + echo ${analysis_log_cmd} + eval ${analysis_log_cmd} +} + +_set_params $@ +str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`) +export frame_version=${str_tmp%%.post*} +export frame_commit=$(echo `python -c "import paddle;print(paddle.version.commit)"`) +export model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3` +export model_commit=$(git log|head -n1|awk '{print $2}') +echo "---------frame_version is ${frame_version}" +echo "---------Paddle commit is ${frame_commit}" +echo "---------Model commit is ${model_commit}" +echo "---------model_branch is ${model_branch}" + +job_bt=`date '+%Y%m%d%H%M%S'` +_train +job_et=`date '+%Y%m%d%H%M%S'` +export model_run_time=$((${job_et}-${job_bt})) +_analysis_log diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/readme.MD b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/readme.MD new file mode 100644 index 000000000000..c58fd32f08c6 --- /dev/null +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/readme.MD @@ -0,0 +1,4 @@ +添加记录: +2024Q2在CINN推全项目中添加. 
+模型名llama_auto_dp2mp2pp2 +执行脚本:llm/llama/auto_parallel/run_pretrain_auto.py diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh new file mode 100644 index 000000000000..947334833d05 --- /dev/null +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh @@ -0,0 +1,12 @@ +model_item=llama-7b_auto_dp2mp2pp2 +bs_item=1 +fp_item=fp16 +run_mode=DP +device_num=N1C1 + +max_iter=100 + +# prepare +# bash ./test_tipc/dygraph/${model_item}/benchmark_common/prepare.sh +# run +bash ./test_tipc/dygraph/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} 2>&1; diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py new file mode 100644 index 000000000000..f4fd00825eae --- /dev/null +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py @@ -0,0 +1,76 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + +import json +import os +import re +import sys + +import numpy as np + + +def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item): + with open(str(log_file), "r", encoding="utf8") as f: + data = f.readlines() + ips_lines = [] + for eachline in data: + if "train_samples_per_second:" in eachline: + ips = float(eachline.split("train_samples_per_second: ")[1].split()[0].replace(',', '')) + print("----ips: ", ips) + ips_lines.append(ips) + print("----ips_lines: ", ips_lines) + ips = np.round(np.mean(ips_lines), 3) + ngpus = int(re.findall("\d+", device_num)[-1]) + print("----ips: ", ips, "ngpus", ngpus) + ips *= ngpus + run_mode = "DP" + + model_name = model_item + "_" + "bs" + str(bs) + "_" + fp_item + "_" + run_mode + info = { + "model_branch": os.getenv("model_branch"), + "model_commit": os.getenv("model_commit"), + "model_name": model_name, + "batch_size": bs, + "fp_item": fp_item, + "run_mode": run_mode, + "convergence_value": 0, + "convergence_key": "", + "ips": ips, + "speed_unit": "sample/sec", + "device_num": device_num, + "model_run_time": os.getenv("model_run_time"), + "frame_commit": os.getenv("frame_commit"), + "frame_version": os.getenv("frame_version"), + } + json_info = json.dumps(info) + print(json_info) + with open(res_log_file, "w") as of: + of.write(json_info) + + +if __name__ == "__main__": + if len(sys.argv) != 7: + print("Usage:" + sys.argv[0] + " model_item path/to/log/file path/to/res/log/file") + sys.exit() + + model_item = sys.argv[1] + log_file = sys.argv[2] + res_log_file = sys.argv[3] + device_num = sys.argv[4] + bs = int(sys.argv[5]) + fp_item = sys.argv[6] + + analyze(model_item, log_file, res_log_file, device_num, bs, fp_item) \ No newline at end of file diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh new file mode 100644 index 000000000000..d9c47aeeb7c9 --- /dev/null +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh @@ -0,0 +1,16 @@ +# install可选 +cd ../ # PaddleNLP 根目录 +pwd +pip install -e . +cd - + +# 下载、解压、拷贝必要数据集 +cd ../llm/llama/auto_parallel/ +# llama 模型数据下载 +wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy +wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz + +mkdir data +mv llama_openwebtext_100k_ids.npy ./data +mv llama_openwebtext_100k_idx.npz ./data +cd - \ No newline at end of file diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh new file mode 100644 index 000000000000..b0a958ffd17c --- /dev/null +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh @@ -0,0 +1,177 @@ +#!/usr/bin/env bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Test training benchmark for a model. +# Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${mp_degree} ${pp_degree} ${dp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} +function _set_params(){ + model_item=${1:-"llama-7b_auto_dp2mp2pp2"} # (必选) 模型 item + base_batch_size=${2:-"1"} # (必选) + fp_item=${3:-"fp32"} # (必选) fp32|fp16|bf16 + run_mode=${4:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1 + device_num=${5:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) + profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 + + model_repo="PaddleNLP" # (必选) 模型套件的名字 + speed_unit="sample/s" # (必选)速度指标单位 + skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step + keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 + convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" + max_iter=${6:-100} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 + num_workers=0 # (可选) + + # 以下为通用执行命令,无特殊可不用修改 + model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 + device=${CUDA_VISIBLE_DEVICES//,/ } + arr=(${device}) + num_gpu_devices=${#arr[*]} + run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 + profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 + speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} + + train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_d2sT_log + profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_d2sT_profiling + speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_d2sT_speed +} +function _train(){ + batch_size=${base_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs + echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" + + if [ ${profiling} = "true" ];then + add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" + log_file=${profiling_log_file} + else + add_options="" + log_file=${train_log_file} + fi + + export FLAGS_group_schedule_tiling_first=1 + export FLAGS_cinn_bucket_compile=1 + export FLAGS_cinn_new_cluster_op_method=1 + export FLAGS_deny_cinn_ops="gather" + export FLAGS_prim_forward_blacklist="pd_op.embedding" + export FLAGS_enable_prim_after_distribute=True + export FLAGS_disable_dyshape_in_train=True + export FLAGS_enable_pir_in_executor=True + export FLAGS_enable_prim_after_distribute=1 + export FLAGS_enable_cinn_compile_cache=0 + export PYTHONUNBUFFERED=1 + + export ENABLE_FALL_BACK=True # 开启SOT + # export FLAGS_use_cinn=True # 是否开启cinn ,在benchmark中设置 + + use_fp16_cmd="" + if [ $fp_item = "fp16" ]; then + use_fp16_cmd="--fp16 1 --fp16_opt_level O2" + fi + to_static=1 # 是否开启动转静训练 + train_cmd="run_pretrain_auto.py \ + --model_type "llama" \ + --model_name_or_path "facebook/llama-7b" \ + --tokenizer_name_or_path "facebook/llama-7b" \ + --input_dir "./data" \ + --output_dir "output/$model_item" \ + --split 949,50,1 \ + --max_seq_length 2048 \ + --per_device_train_batch_size ${batch_size} \ + --per_device_eval_batch_size 2 \ + --gradient_accumulation_steps 1 \ + --use_flash_attention 1 \ + --use_fused_rms_norm 0 \ + --scale_loss 1024 \ + --pipeline_parallel_degree 1 \ + --tensor_parallel_degree 1 \ + --sharding_parallel_degree 1 \ + --learning_rate 0.0001 \ + --min_learning_rate 
0.00001 \ + --max_steps ${max_iter} \ + --save_steps 5000000 \ + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --logging_steps 20 \ + --dataloader_num_workers 1 \ + --sharding '' \ + --eval_steps 1000000 \ + --disable_tqdm true \ + --continue_training 0\ + --recompute 0 \ + --do_train \ + --do_eval \ + --device 'gpu' \ + --data_impl 'mmap' \ + --enable_auto_parallel 1 \ + --max_grad_norm 1.0 \ + --num_hidden_layers 4 \ + --to_static ${to_static} \ + ${use_fp16_cmd} " + + # 以下为通用执行命令,无特殊可不用修改 + case ${run_mode} in + DP) + rm -rf ./mylog # 注意执行前删掉log目录 + rm -rf output/$model_item + train_cmd="python -u -m paddle.distributed.launch --log_dir=./mylog \ + --gpus $CUDA_VISIBLE_DEVICES ${train_cmd}" + ;; + DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;; + *) echo "choose run_mode "; exit 1; + esac + + cd ../llm/llama/auto_parallel/ + echo "train_cmd: ${train_cmd} log_file: ${log_file}" + + python -c "import paddlenlp" + timeout 15m ${train_cmd} > ${log_file} 2>&1 + if [ $? -ne 0 ];then + echo -e "${model_name}, FAIL" + else + echo -e "${model_name}, SUCCESS" + fi + #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` + if [ -d mylog ]; then + case_path=$PWD && cd - && mkdir -p mylog # PaddleNLP/tests/mylog + cp -r ${case_path}/mylog/workerlog.* ./mylog/ + rm ${log_file} + cp ${case_path}/mylog/workerlog.0 ${log_file} + fi + + echo ${train_cmd} >> ${log_file} + cat ${log_file} +} + +function _analysis_log(){ + # PaddleNLP/tests 目录 + analysis_log_cmd="python test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py \ + ${model_item} ${log_file} ${speed_log_file} ${device_num} ${base_batch_size} ${fp_item}" + echo ${analysis_log_cmd} + eval ${analysis_log_cmd} +} + +_set_params $@ +str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`) +export frame_version=${str_tmp%%.post*} +export frame_commit=$(echo `python -c "import paddle;print(paddle.version.commit)"`) +export model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3` +export model_commit=$(git log|head -n1|awk '{print $2}') +echo "---------frame_version is ${frame_version}" +echo "---------Paddle commit is ${frame_commit}" +echo "---------Model commit is ${model_commit}" +echo "---------model_branch is ${model_branch}" + +job_bt=`date '+%Y%m%d%H%M%S'` +_train +job_et=`date '+%Y%m%d%H%M%S'` +export model_run_time=$((${job_et}-${job_bt})) +_analysis_log diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/readme.MD b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/readme.MD new file mode 100644 index 000000000000..c58fd32f08c6 --- /dev/null +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/readme.MD @@ -0,0 +1,4 @@ +添加记录: +2024Q2在CINN推全项目中添加. 
+模型名llama_auto_dp2mp2pp2 +执行脚本:llm/llama/auto_parallel/run_pretrain_auto.py From fc252a0e8832debdc4aca5cf405af3884d8e5d96 Mon Sep 17 00:00:00 2001 From: mmglove Date: Sat, 11 May 2024 11:17:48 +0800 Subject: [PATCH 02/12] add llama-7b_auto_dp2mp2pp2 benchmark script for cinn --- .../N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh | 2 +- .../benchmark_common/run_benchmark.sh | 6 ++++-- .../N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh | 4 ++-- .../benchmark_common/run_benchmark.sh | 6 ++++-- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh index 947334833d05..d88157d6c2ce 100644 --- a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh @@ -7,6 +7,6 @@ device_num=N1C1 max_iter=100 # prepare -# bash ./test_tipc/dygraph/${model_item}/benchmark_common/prepare.sh +bash ./test_tipc/dygraph/${model_item}/benchmark_common/prepare.sh # run bash ./test_tipc/dygraph/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} 2>&1; diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh index 7d32efac8430..6db4da602b15 100644 --- a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh @@ -105,8 +105,8 @@ function _train(){ --recompute 0 \ --do_train \ --do_eval \ - --device 'gpu' \ - --data_impl 'mmap' \ + --device gpu \ + --data_impl mmap \ --enable_auto_parallel 1 \ --max_grad_norm 1.0 \ --num_hidden_layers 4 \ @@ -126,6 +126,8 @@ function _train(){ esac cd ../llm/llama/auto_parallel/ + rm -rf ./mylog # 注意执行前删掉log目录 + rm -rf output/$model_item echo "train_cmd: ${train_cmd} log_file: ${log_file}" python -c "import paddlenlp" diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh index 947334833d05..d83deec68bae 100644 --- a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh @@ -7,6 +7,6 @@ device_num=N1C1 max_iter=100 # prepare -# bash ./test_tipc/dygraph/${model_item}/benchmark_common/prepare.sh +bash ./test_tipc/dynamicTostatic/${model_item}/benchmark_common/prepare.sh # run -bash ./test_tipc/dygraph/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} 2>&1; +bash ./test_tipc/dynamicTostatic/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} 2>&1; diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh index b0a958ffd17c..73f63a0a65f3 100644 --- a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh +++ 
b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh @@ -109,8 +109,8 @@ function _train(){ --recompute 0 \ --do_train \ --do_eval \ - --device 'gpu' \ - --data_impl 'mmap' \ + --device gpu \ + --data_impl mmap \ --enable_auto_parallel 1 \ --max_grad_norm 1.0 \ --num_hidden_layers 4 \ @@ -130,6 +130,8 @@ function _train(){ esac cd ../llm/llama/auto_parallel/ + rm -rf ./mylog # 注意执行前删掉log目录 + rm -rf output/$model_item echo "train_cmd: ${train_cmd} log_file: ${log_file}" python -c "import paddlenlp" From bcee7ede9bdcc447fef6b7754b39bdaed13564c6 Mon Sep 17 00:00:00 2001 From: mmglove Date: Sat, 11 May 2024 14:53:52 +0800 Subject: [PATCH 03/12] update llama-7b_auto_dp2mp2pp2 benchmark script --- ...bs1_fp16_DP_N1C1.sh => llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh} | 0 ...bs1_fp16_DP_N1C1.sh => llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/{llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh => llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh} (100%) rename tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/{llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh => llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh} (100%) diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh similarity index 100% rename from tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh rename to tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh similarity index 100% rename from tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh rename to tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh From 25d068170d5a8fd40c371e27df6c676a31d98760 Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Tue, 28 May 2024 19:48:20 +0800 Subject: [PATCH 04/12] Update run_benchmark.sh --- .../llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh index 6db4da602b15..2637ada65c48 100644 --- a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh @@ -65,8 +65,7 @@ function _train(){ export FLAGS_disable_dyshape_in_train=True export FLAGS_enable_pir_in_executor=True export FLAGS_enable_prim_after_distribute=1 - export FLAGS_enable_cinn_compile_cache=0 - export PYTHONUNBUFFERED=1 + use_fp16_cmd="" if [ $fp_item = "fp16" ]; then From 09cd3378bfc25b38d566c080da2ff92136c003ba Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Tue, 28 May 2024 19:48:44 +0800 Subject: [PATCH 05/12] Update run_benchmark.sh --- .../llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh index 73f63a0a65f3..2e6ecd63b90a 100644 --- a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh @@ -66,8 +66,7 @@ function _train(){ export FLAGS_disable_dyshape_in_train=True export FLAGS_enable_pir_in_executor=True export FLAGS_enable_prim_after_distribute=1 - export FLAGS_enable_cinn_compile_cache=0 - export PYTHONUNBUFFERED=1 + export ENABLE_FALL_BACK=True # 开启SOT # export FLAGS_use_cinn=True # 是否开启cinn ,在benchmark中设置 From 0ea8f239b1985a8d438a742e2a61de3d5dbdba5f Mon Sep 17 00:00:00 2001 From: mmglove Date: Tue, 28 May 2024 20:13:56 +0800 Subject: [PATCH 06/12] fix llama-7b_auto_dp2mp2pp2/benchmark_common --- .../llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py | 5 ++--- .../llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py index f4fd00825eae..0fa28a3d7a8a 100644 --- a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py @@ -13,7 +13,6 @@ # limitations under the License. - import json import os import re @@ -28,7 +27,7 @@ def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item): ips_lines = [] for eachline in data: if "train_samples_per_second:" in eachline: - ips = float(eachline.split("train_samples_per_second: ")[1].split()[0].replace(',', '')) + ips = float(eachline.split("train_samples_per_second: ")[1].split()[0].replace(",", "")) print("----ips: ", ips) ips_lines.append(ips) print("----ips_lines: ", ips_lines) @@ -73,4 +72,4 @@ def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item): bs = int(sys.argv[5]) fp_item = sys.argv[6] - analyze(model_item, log_file, res_log_file, device_num, bs, fp_item) \ No newline at end of file + analyze(model_item, log_file, res_log_file, device_num, bs, fp_item) diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py index f4fd00825eae..0fa28a3d7a8a 100644 --- a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py @@ -13,7 +13,6 @@ # limitations under the License. 
- import json import os import re @@ -28,7 +27,7 @@ def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item): ips_lines = [] for eachline in data: if "train_samples_per_second:" in eachline: - ips = float(eachline.split("train_samples_per_second: ")[1].split()[0].replace(',', '')) + ips = float(eachline.split("train_samples_per_second: ")[1].split()[0].replace(",", "")) print("----ips: ", ips) ips_lines.append(ips) print("----ips_lines: ", ips_lines) @@ -73,4 +72,4 @@ def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item): bs = int(sys.argv[5]) fp_item = sys.argv[6] - analyze(model_item, log_file, res_log_file, device_num, bs, fp_item) \ No newline at end of file + analyze(model_item, log_file, res_log_file, device_num, bs, fp_item) From 26fabaae72c5bba59c3060ada006f47c44449f16 Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Fri, 31 May 2024 15:24:25 +0800 Subject: [PATCH 07/12] Update run_benchmark.sh --- .../llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh index 2e6ecd63b90a..5fe003b2cf2d 100644 --- a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh @@ -61,7 +61,7 @@ function _train(){ export FLAGS_cinn_bucket_compile=1 export FLAGS_cinn_new_cluster_op_method=1 export FLAGS_deny_cinn_ops="gather" - export FLAGS_prim_forward_blacklist="pd_op.embedding" + export FLAGS_prim_forward_blacklist="pd_op.embedding;pd_op.squared_l2_norm" export FLAGS_enable_prim_after_distribute=True export FLAGS_disable_dyshape_in_train=True export FLAGS_enable_pir_in_executor=True From c49b7e1dd13b41b9c12f4803fe73b77dc0f2e38a Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Thu, 20 Jun 2024 18:34:19 +0800 Subject: [PATCH 08/12] Update prepare.sh --- .../auto_tuner/llama_pretrain/benchmark_common/prepare.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh index 5472a36e94c4..3e51c5dc6fb7 100644 --- a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh +++ b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh @@ -16,11 +16,11 @@ python -m pip install -r ../requirements.txt python -m pip install -r ../requirements-dev.txt # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/llama +cd ../../../../llm/llama python -m pip install tool_helpers rm -rf data && mkdir data From f895c9ce6ee3f6c8c43804193c45c3bca220c548 Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Thu, 20 Jun 2024 18:35:03 +0800 Subject: [PATCH 09/12] Update prepare.sh --- .../static/auto_parallel/llama2/benchmark_common/prepare.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh index edb4590e2f15..697d5d1d92e0 100644 --- 
a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh +++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh @@ -16,11 +16,11 @@ python -m pip install -r ../requirements.txt python -m pip install -r ../requirements-dev.txt # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/llama +cd ../../../../llm/llama python -m pip install tool_helpers # download data From 55754a1a7330ff09720248fbced25eff2b9670e4 Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Thu, 20 Jun 2024 18:35:55 +0800 Subject: [PATCH 10/12] Update prepare.sh --- .../hybrid_parallelism/qwen/benchmark_common/prepare.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh index 1d8a79cc2a0e..bf6952c135ca 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh @@ -18,11 +18,11 @@ python -m pip install -r ../requirements-dev.txt python -m pip install tiktoken # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/qwen +cd ../../../../llm/qwen python -m pip install tool_helpers wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy @@ -30,4 +30,4 @@ wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwe mkdir data mv llama_openwebtext_100k_ids.npy ./data -mv llama_openwebtext_100k_idx.npz ./data \ No newline at end of file +mv llama_openwebtext_100k_idx.npz ./data From 564ea16839b7b8ce4834127f961f07694e7350bf Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Thu, 20 Jun 2024 18:36:39 +0800 Subject: [PATCH 11/12] Update prepare.sh --- .../hybrid_parallelism/llama/benchmark_common/prepare.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh index 0563a1aaabac..388b179e6905 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh @@ -16,11 +16,11 @@ python -m pip install -r ../requirements.txt python -m pip install -r ../requirements-dev.txt # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/llama +cd ../../../../llm/llama python -m pip install tool_helpers wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy @@ -28,4 +28,4 @@ wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwe mkdir data mv llama_openwebtext_100k_ids.npy ./data -mv llama_openwebtext_100k_idx.npz ./data \ No newline at end of file +mv llama_openwebtext_100k_idx.npz ./data From 41c33fc13505c6a60ec7beeaa7687839c38c17eb Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Thu, 20 Jun 2024 18:37:32 +0800 Subject: [PATCH 12/12] Update prepare.sh --- 
.../hybrid_parallelism/llama2/benchmark_common/prepare.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh index 45fd82fad914..9405521c7b3f 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh @@ -16,11 +16,11 @@ python -m pip install -r ../requirements.txt python -m pip install -r ../requirements-dev.txt # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/llama +cd ../../../../llm/llama python -m pip install tool_helpers # download data
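
Editor's note: for context on how the added cases fit together, the sketch below shows one plausible local invocation from the PaddleNLP/tests directory. It is an assumption-laden illustration, not part of the patch: the environment variables (CUDA_VISIBLE_DEVICES, PROFILING, TRAIN_LOG_DIR, PROFILING_LOG_DIR, LOG_PATH_INDEX_DIR) are normally exported by the benchmark framework and are only read, with pwd fallbacks, by run_benchmark.sh; the log-directory names used here are hypothetical.

```bash
# Minimal local-run sketch (assumptions: repository checked out, commands run
# from PaddleNLP/tests, one GPU available; the benchmark framework usually
# exports these variables itself).
cd PaddleNLP/tests

export CUDA_VISIBLE_DEVICES=0                    # N1C1 case: a single card
export PROFILING=false                           # profiling switch read by run_benchmark.sh
export TRAIN_LOG_DIR=$(pwd)/train_log            # hypothetical output dirs; the scripts
export PROFILING_LOG_DIR=$(pwd)/profiling_log    # fall back to $(pwd) when these are unset
export LOG_PATH_INDEX_DIR=$(pwd)/speed_log
mkdir -p "$TRAIN_LOG_DIR" "$PROFILING_LOG_DIR" "$LOG_PATH_INDEX_DIR"

# One-off dependency/data setup, then the dygraph N1C1 case
# (after PATCH 03/12 the case script no longer carries the _N1C1 suffix):
bash ./test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh
bash ./test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh

# analysis_log.py then writes a one-line JSON speed record containing, among
# other keys, model_name, batch_size, fp_item, run_mode, ips, speed_unit and
# device_num, into the file under LOG_PATH_INDEX_DIR.
```

The dynamicTostatic variant is launched the same way from its own directory; per the patch it differs mainly in setting to_static=1, exporting ENABLE_FALL_BACK=True, and using the _d2sT_ suffix in its log file names.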