From ef5bd1de8e259d7552d7e536552d0a296e2a071a Mon Sep 17 00:00:00 2001 From: mmglove Date: Fri, 10 May 2024 17:39:17 +0800 Subject: [PATCH 01/12] add llama-7b_auto_dp2mp2pp2 benchmark sh --- ...lama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh | 12 ++ .../benchmark_common/analysis_log.py | 76 ++++++++ .../benchmark_common/prepare.sh | 16 ++ .../benchmark_common/run_benchmark.sh | 173 +++++++++++++++++ .../dygraph/llama-7b_auto_dp2mp2pp2/readme.MD | 4 + ...lama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh | 12 ++ .../benchmark_common/analysis_log.py | 76 ++++++++ .../benchmark_common/prepare.sh | 16 ++ .../benchmark_common/run_benchmark.sh | 177 ++++++++++++++++++ .../llama-7b_auto_dp2mp2pp2/readme.MD | 4 + 10 files changed, 566 insertions(+) create mode 100644 tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh create mode 100644 tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py create mode 100644 tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh create mode 100644 tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh create mode 100644 tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/readme.MD create mode 100644 tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh create mode 100644 tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py create mode 100644 tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh create mode 100644 tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh create mode 100644 tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/readme.MD diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh new file mode 100644 index 000000000000..947334833d05 --- /dev/null +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh @@ -0,0 +1,12 @@ +model_item=llama-7b_auto_dp2mp2pp2 +bs_item=1 +fp_item=fp16 +run_mode=DP +device_num=N1C1 + +max_iter=100 + +# prepare +# bash ./test_tipc/dygraph/${model_item}/benchmark_common/prepare.sh +# run +bash ./test_tipc/dygraph/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} 2>&1; diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py new file mode 100644 index 000000000000..f4fd00825eae --- /dev/null +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py @@ -0,0 +1,76 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + +import json +import os +import re +import sys + +import numpy as np + + +def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item): + with open(str(log_file), "r", encoding="utf8") as f: + data = f.readlines() + ips_lines = [] + for eachline in data: + if "train_samples_per_second:" in eachline: + ips = float(eachline.split("train_samples_per_second: ")[1].split()[0].replace(',', '')) + print("----ips: ", ips) + ips_lines.append(ips) + print("----ips_lines: ", ips_lines) + ips = np.round(np.mean(ips_lines), 3) + ngpus = int(re.findall("\d+", device_num)[-1]) + print("----ips: ", ips, "ngpus", ngpus) + ips *= ngpus + run_mode = "DP" + + model_name = model_item + "_" + "bs" + str(bs) + "_" + fp_item + "_" + run_mode + info = { + "model_branch": os.getenv("model_branch"), + "model_commit": os.getenv("model_commit"), + "model_name": model_name, + "batch_size": bs, + "fp_item": fp_item, + "run_mode": run_mode, + "convergence_value": 0, + "convergence_key": "", + "ips": ips, + "speed_unit": "sample/sec", + "device_num": device_num, + "model_run_time": os.getenv("model_run_time"), + "frame_commit": os.getenv("frame_commit"), + "frame_version": os.getenv("frame_version"), + } + json_info = json.dumps(info) + print(json_info) + with open(res_log_file, "w") as of: + of.write(json_info) + + +if __name__ == "__main__": + if len(sys.argv) != 7: + print("Usage:" + sys.argv[0] + " model_item path/to/log/file path/to/res/log/file") + sys.exit() + + model_item = sys.argv[1] + log_file = sys.argv[2] + res_log_file = sys.argv[3] + device_num = sys.argv[4] + bs = int(sys.argv[5]) + fp_item = sys.argv[6] + + analyze(model_item, log_file, res_log_file, device_num, bs, fp_item) \ No newline at end of file diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh new file mode 100644 index 000000000000..d9c47aeeb7c9 --- /dev/null +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh @@ -0,0 +1,16 @@ +# install可选 +cd ../ # PaddleNLP 根目录 +pwd +pip install -e . +cd - + +# 下载、解压、拷贝必要数据集 +cd ../llm/llama/auto_parallel/ +# llama 模型数据下载 +wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy +wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz + +mkdir data +mv llama_openwebtext_100k_ids.npy ./data +mv llama_openwebtext_100k_idx.npz ./data +cd - \ No newline at end of file diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh new file mode 100644 index 000000000000..7d32efac8430 --- /dev/null +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh @@ -0,0 +1,173 @@ +#!/usr/bin/env bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Test training benchmark for a model. +# Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${mp_degree} ${pp_degree} ${dp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} +function _set_params(){ + model_item=${1:-"llama-7b_auto_dp2mp2pp2"} # (必选) 模型 item + base_batch_size=${2:-"1"} # (必选) + fp_item=${3:-"fp32"} # (必选) fp32|fp16|bf16 + run_mode=${4:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1 + device_num=${5:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) + profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 + + model_repo="PaddleNLP" # (必选) 模型套件的名字 + speed_unit="sample/s" # (必选)速度指标单位 + skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step + keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 + convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" + max_iter=${6:-100} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 + num_workers=0 # (可选) + + # 以下为通用执行命令,无特殊可不用修改 + model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 + device=${CUDA_VISIBLE_DEVICES//,/ } + arr=(${device}) + num_gpu_devices=${#arr[*]} + run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 + profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 + speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} + + train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log + profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling + speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed +} +function _train(){ + batch_size=${base_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs + echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" + + if [ ${profiling} = "true" ];then + add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" + log_file=${profiling_log_file} + else + add_options="" + log_file=${train_log_file} + fi + export FLAGS_group_schedule_tiling_first=1 + export FLAGS_cinn_bucket_compile=1 + export FLAGS_cinn_new_cluster_op_method=1 + export FLAGS_deny_cinn_ops="gather" + export FLAGS_prim_forward_blacklist="pd_op.embedding" + export FLAGS_enable_prim_after_distribute=True + export FLAGS_disable_dyshape_in_train=True + export FLAGS_enable_pir_in_executor=True + export FLAGS_enable_prim_after_distribute=1 + export FLAGS_enable_cinn_compile_cache=0 + export PYTHONUNBUFFERED=1 + + use_fp16_cmd="" + if [ $fp_item = "fp16" ]; then + use_fp16_cmd="--fp16 1 --fp16_opt_level O2" + fi + to_static=0 # 是否开启动转静训练 + train_cmd="run_pretrain_auto.py \ + --model_type "llama" \ + --model_name_or_path "facebook/llama-7b" \ + --tokenizer_name_or_path "facebook/llama-7b" \ + --input_dir "./data" \ + --output_dir "output/$model_item" \ + --split 949,50,1 \ + --max_seq_length 2048 \ + --per_device_train_batch_size ${batch_size} \ + --per_device_eval_batch_size 2 \ + --gradient_accumulation_steps 1 \ + --use_flash_attention 1 \ + --use_fused_rms_norm 0 \ + --scale_loss 1024 \ + --pipeline_parallel_degree 1 \ + --tensor_parallel_degree 1 \ + --sharding_parallel_degree 1 \ + --learning_rate 0.0001 \ + --min_learning_rate 0.00001 \ + --max_steps ${max_iter} \ + --save_steps 5000000 \ + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --logging_steps 20 \ + --dataloader_num_workers 1 \ + --sharding '' \ + --eval_steps 1000000 \ + 
--disable_tqdm true \ + --continue_training 0\ + --recompute 0 \ + --do_train \ + --do_eval \ + --device 'gpu' \ + --data_impl 'mmap' \ + --enable_auto_parallel 1 \ + --max_grad_norm 1.0 \ + --num_hidden_layers 4 \ + --to_static ${to_static} \ + ${use_fp16_cmd} " + + # 以下为通用执行命令,无特殊可不用修改 + case ${run_mode} in + DP) + rm -rf ./mylog # 注意执行前删掉log目录 + rm -rf output/$model_item + train_cmd="python -u -m paddle.distributed.launch --log_dir=./mylog \ + --gpus $CUDA_VISIBLE_DEVICES ${train_cmd}" + ;; + DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;; + *) echo "choose run_mode "; exit 1; + esac + + cd ../llm/llama/auto_parallel/ + echo "train_cmd: ${train_cmd} log_file: ${log_file}" + + python -c "import paddlenlp" + timeout 15m ${train_cmd} > ${log_file} 2>&1 + if [ $? -ne 0 ];then + echo -e "${model_name}, FAIL" + else + echo -e "${model_name}, SUCCESS" + fi + #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` + if [ -d mylog ]; then + case_path=$PWD && cd - && mkdir -p mylog # PaddleNLP/tests/mylog + cp -r ${case_path}/mylog/workerlog.* ./mylog/ + rm ${log_file} + cp ${case_path}/mylog/workerlog.0 ${log_file} + fi + + echo ${train_cmd} >> ${log_file} + cat ${log_file} +} + +function _analysis_log(){ + # PaddleNLP/tests 目录 + analysis_log_cmd="python test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py \ + ${model_item} ${log_file} ${speed_log_file} ${device_num} ${base_batch_size} ${fp_item}" + echo ${analysis_log_cmd} + eval ${analysis_log_cmd} +} + +_set_params $@ +str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`) +export frame_version=${str_tmp%%.post*} +export frame_commit=$(echo `python -c "import paddle;print(paddle.version.commit)"`) +export model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3` +export model_commit=$(git log|head -n1|awk '{print $2}') +echo "---------frame_version is ${frame_version}" +echo "---------Paddle commit is ${frame_commit}" +echo "---------Model commit is ${model_commit}" +echo "---------model_branch is ${model_branch}" + +job_bt=`date '+%Y%m%d%H%M%S'` +_train +job_et=`date '+%Y%m%d%H%M%S'` +export model_run_time=$((${job_et}-${job_bt})) +_analysis_log diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/readme.MD b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/readme.MD new file mode 100644 index 000000000000..c58fd32f08c6 --- /dev/null +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/readme.MD @@ -0,0 +1,4 @@ +添加记录: +2024Q2在CINN推全项目中添加. 
+模型名llama_auto_dp2mp2pp2 +执行脚本:llm/llama/auto_parallel/run_pretrain_auto.py diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh new file mode 100644 index 000000000000..947334833d05 --- /dev/null +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh @@ -0,0 +1,12 @@ +model_item=llama-7b_auto_dp2mp2pp2 +bs_item=1 +fp_item=fp16 +run_mode=DP +device_num=N1C1 + +max_iter=100 + +# prepare +# bash ./test_tipc/dygraph/${model_item}/benchmark_common/prepare.sh +# run +bash ./test_tipc/dygraph/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} 2>&1; diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py new file mode 100644 index 000000000000..f4fd00825eae --- /dev/null +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py @@ -0,0 +1,76 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + +import json +import os +import re +import sys + +import numpy as np + + +def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item): + with open(str(log_file), "r", encoding="utf8") as f: + data = f.readlines() + ips_lines = [] + for eachline in data: + if "train_samples_per_second:" in eachline: + ips = float(eachline.split("train_samples_per_second: ")[1].split()[0].replace(',', '')) + print("----ips: ", ips) + ips_lines.append(ips) + print("----ips_lines: ", ips_lines) + ips = np.round(np.mean(ips_lines), 3) + ngpus = int(re.findall("\d+", device_num)[-1]) + print("----ips: ", ips, "ngpus", ngpus) + ips *= ngpus + run_mode = "DP" + + model_name = model_item + "_" + "bs" + str(bs) + "_" + fp_item + "_" + run_mode + info = { + "model_branch": os.getenv("model_branch"), + "model_commit": os.getenv("model_commit"), + "model_name": model_name, + "batch_size": bs, + "fp_item": fp_item, + "run_mode": run_mode, + "convergence_value": 0, + "convergence_key": "", + "ips": ips, + "speed_unit": "sample/sec", + "device_num": device_num, + "model_run_time": os.getenv("model_run_time"), + "frame_commit": os.getenv("frame_commit"), + "frame_version": os.getenv("frame_version"), + } + json_info = json.dumps(info) + print(json_info) + with open(res_log_file, "w") as of: + of.write(json_info) + + +if __name__ == "__main__": + if len(sys.argv) != 7: + print("Usage:" + sys.argv[0] + " model_item path/to/log/file path/to/res/log/file") + sys.exit() + + model_item = sys.argv[1] + log_file = sys.argv[2] + res_log_file = sys.argv[3] + device_num = sys.argv[4] + bs = int(sys.argv[5]) + fp_item = sys.argv[6] + + analyze(model_item, log_file, res_log_file, device_num, bs, fp_item) \ No newline at end of file diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh new file mode 100644 index 000000000000..d9c47aeeb7c9 --- /dev/null +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh @@ -0,0 +1,16 @@ +# install可选 +cd ../ # PaddleNLP 根目录 +pwd +pip install -e . +cd - + +# 下载、解压、拷贝必要数据集 +cd ../llm/llama/auto_parallel/ +# llama 模型数据下载 +wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy +wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz + +mkdir data +mv llama_openwebtext_100k_ids.npy ./data +mv llama_openwebtext_100k_idx.npz ./data +cd - \ No newline at end of file diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh new file mode 100644 index 000000000000..b0a958ffd17c --- /dev/null +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh @@ -0,0 +1,177 @@ +#!/usr/bin/env bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Test training benchmark for a model. +# Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${mp_degree} ${pp_degree} ${dp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} +function _set_params(){ + model_item=${1:-"llama-7b_auto_dp2mp2pp2"} # (必选) 模型 item + base_batch_size=${2:-"1"} # (必选) + fp_item=${3:-"fp32"} # (必选) fp32|fp16|bf16 + run_mode=${4:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1 + device_num=${5:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) + profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 + + model_repo="PaddleNLP" # (必选) 模型套件的名字 + speed_unit="sample/s" # (必选)速度指标单位 + skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step + keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 + convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" + max_iter=${6:-100} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 + num_workers=0 # (可选) + + # 以下为通用执行命令,无特殊可不用修改 + model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 + device=${CUDA_VISIBLE_DEVICES//,/ } + arr=(${device}) + num_gpu_devices=${#arr[*]} + run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 + profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 + speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} + + train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_d2sT_log + profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_d2sT_profiling + speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_d2sT_speed +} +function _train(){ + batch_size=${base_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs + echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" + + if [ ${profiling} = "true" ];then + add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" + log_file=${profiling_log_file} + else + add_options="" + log_file=${train_log_file} + fi + + export FLAGS_group_schedule_tiling_first=1 + export FLAGS_cinn_bucket_compile=1 + export FLAGS_cinn_new_cluster_op_method=1 + export FLAGS_deny_cinn_ops="gather" + export FLAGS_prim_forward_blacklist="pd_op.embedding" + export FLAGS_enable_prim_after_distribute=True + export FLAGS_disable_dyshape_in_train=True + export FLAGS_enable_pir_in_executor=True + export FLAGS_enable_prim_after_distribute=1 + export FLAGS_enable_cinn_compile_cache=0 + export PYTHONUNBUFFERED=1 + + export ENABLE_FALL_BACK=True # 开启SOT + # export FLAGS_use_cinn=True # 是否开启cinn ,在benchmark中设置 + + use_fp16_cmd="" + if [ $fp_item = "fp16" ]; then + use_fp16_cmd="--fp16 1 --fp16_opt_level O2" + fi + to_static=1 # 是否开启动转静训练 + train_cmd="run_pretrain_auto.py \ + --model_type "llama" \ + --model_name_or_path "facebook/llama-7b" \ + --tokenizer_name_or_path "facebook/llama-7b" \ + --input_dir "./data" \ + --output_dir "output/$model_item" \ + --split 949,50,1 \ + --max_seq_length 2048 \ + --per_device_train_batch_size ${batch_size} \ + --per_device_eval_batch_size 2 \ + --gradient_accumulation_steps 1 \ + --use_flash_attention 1 \ + --use_fused_rms_norm 0 \ + --scale_loss 1024 \ + --pipeline_parallel_degree 1 \ + --tensor_parallel_degree 1 \ + --sharding_parallel_degree 1 \ + --learning_rate 0.0001 \ + --min_learning_rate 
0.00001 \ + --max_steps ${max_iter} \ + --save_steps 5000000 \ + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --logging_steps 20 \ + --dataloader_num_workers 1 \ + --sharding '' \ + --eval_steps 1000000 \ + --disable_tqdm true \ + --continue_training 0\ + --recompute 0 \ + --do_train \ + --do_eval \ + --device 'gpu' \ + --data_impl 'mmap' \ + --enable_auto_parallel 1 \ + --max_grad_norm 1.0 \ + --num_hidden_layers 4 \ + --to_static ${to_static} \ + ${use_fp16_cmd} " + + # 以下为通用执行命令,无特殊可不用修改 + case ${run_mode} in + DP) + rm -rf ./mylog # 注意执行前删掉log目录 + rm -rf output/$model_item + train_cmd="python -u -m paddle.distributed.launch --log_dir=./mylog \ + --gpus $CUDA_VISIBLE_DEVICES ${train_cmd}" + ;; + DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;; + *) echo "choose run_mode "; exit 1; + esac + + cd ../llm/llama/auto_parallel/ + echo "train_cmd: ${train_cmd} log_file: ${log_file}" + + python -c "import paddlenlp" + timeout 15m ${train_cmd} > ${log_file} 2>&1 + if [ $? -ne 0 ];then + echo -e "${model_name}, FAIL" + else + echo -e "${model_name}, SUCCESS" + fi + #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` + if [ -d mylog ]; then + case_path=$PWD && cd - && mkdir -p mylog # PaddleNLP/tests/mylog + cp -r ${case_path}/mylog/workerlog.* ./mylog/ + rm ${log_file} + cp ${case_path}/mylog/workerlog.0 ${log_file} + fi + + echo ${train_cmd} >> ${log_file} + cat ${log_file} +} + +function _analysis_log(){ + # PaddleNLP/tests 目录 + analysis_log_cmd="python test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py \ + ${model_item} ${log_file} ${speed_log_file} ${device_num} ${base_batch_size} ${fp_item}" + echo ${analysis_log_cmd} + eval ${analysis_log_cmd} +} + +_set_params $@ +str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`) +export frame_version=${str_tmp%%.post*} +export frame_commit=$(echo `python -c "import paddle;print(paddle.version.commit)"`) +export model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3` +export model_commit=$(git log|head -n1|awk '{print $2}') +echo "---------frame_version is ${frame_version}" +echo "---------Paddle commit is ${frame_commit}" +echo "---------Model commit is ${model_commit}" +echo "---------model_branch is ${model_branch}" + +job_bt=`date '+%Y%m%d%H%M%S'` +_train +job_et=`date '+%Y%m%d%H%M%S'` +export model_run_time=$((${job_et}-${job_bt})) +_analysis_log diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/readme.MD b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/readme.MD new file mode 100644 index 000000000000..c58fd32f08c6 --- /dev/null +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/readme.MD @@ -0,0 +1,4 @@ +添加记录: +2024Q2在CINN推全项目中添加. 
+模型名llama_auto_dp2mp2pp2 +执行脚本:llm/llama/auto_parallel/run_pretrain_auto.py From fc252a0e8832debdc4aca5cf405af3884d8e5d96 Mon Sep 17 00:00:00 2001 From: mmglove Date: Sat, 11 May 2024 11:17:48 +0800 Subject: [PATCH 02/12] add llama-7b_auto_dp2mp2pp2 benchmark script for cinn --- .../N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh | 2 +- .../benchmark_common/run_benchmark.sh | 6 ++++-- .../N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh | 4 ++-- .../benchmark_common/run_benchmark.sh | 6 ++++-- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh index 947334833d05..d88157d6c2ce 100644 --- a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh @@ -7,6 +7,6 @@ device_num=N1C1 max_iter=100 # prepare -# bash ./test_tipc/dygraph/${model_item}/benchmark_common/prepare.sh +bash ./test_tipc/dygraph/${model_item}/benchmark_common/prepare.sh # run bash ./test_tipc/dygraph/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} 2>&1; diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh index 7d32efac8430..6db4da602b15 100644 --- a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh @@ -105,8 +105,8 @@ function _train(){ --recompute 0 \ --do_train \ --do_eval \ - --device 'gpu' \ - --data_impl 'mmap' \ + --device gpu \ + --data_impl mmap \ --enable_auto_parallel 1 \ --max_grad_norm 1.0 \ --num_hidden_layers 4 \ @@ -126,6 +126,8 @@ function _train(){ esac cd ../llm/llama/auto_parallel/ + rm -rf ./mylog # 注意执行前删掉log目录 + rm -rf output/$model_item echo "train_cmd: ${train_cmd} log_file: ${log_file}" python -c "import paddlenlp" diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh index 947334833d05..d83deec68bae 100644 --- a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh @@ -7,6 +7,6 @@ device_num=N1C1 max_iter=100 # prepare -# bash ./test_tipc/dygraph/${model_item}/benchmark_common/prepare.sh +bash ./test_tipc/dynamicTostatic/${model_item}/benchmark_common/prepare.sh # run -bash ./test_tipc/dygraph/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} 2>&1; +bash ./test_tipc/dynamicTostatic/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} 2>&1; diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh index b0a958ffd17c..73f63a0a65f3 100644 --- a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh +++ 
b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh @@ -109,8 +109,8 @@ function _train(){ --recompute 0 \ --do_train \ --do_eval \ - --device 'gpu' \ - --data_impl 'mmap' \ + --device gpu \ + --data_impl mmap \ --enable_auto_parallel 1 \ --max_grad_norm 1.0 \ --num_hidden_layers 4 \ @@ -130,6 +130,8 @@ function _train(){ esac cd ../llm/llama/auto_parallel/ + rm -rf ./mylog # 注意执行前删掉log目录 + rm -rf output/$model_item echo "train_cmd: ${train_cmd} log_file: ${log_file}" python -c "import paddlenlp" From bcee7ede9bdcc447fef6b7754b39bdaed13564c6 Mon Sep 17 00:00:00 2001 From: mmglove Date: Sat, 11 May 2024 14:53:52 +0800 Subject: [PATCH 03/12] update llama-7b_auto_dp2mp2pp2 benchmark script --- ...bs1_fp16_DP_N1C1.sh => llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh} | 0 ...bs1_fp16_DP_N1C1.sh => llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/{llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh => llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh} (100%) rename tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/{llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh => llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh} (100%) diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh similarity index 100% rename from tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh rename to tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh similarity index 100% rename from tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP_N1C1.sh rename to tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh From 25d068170d5a8fd40c371e27df6c676a31d98760 Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Tue, 28 May 2024 19:48:20 +0800 Subject: [PATCH 04/12] Update run_benchmark.sh --- .../llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh index 6db4da602b15..2637ada65c48 100644 --- a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh @@ -65,8 +65,7 @@ function _train(){ export FLAGS_disable_dyshape_in_train=True export FLAGS_enable_pir_in_executor=True export FLAGS_enable_prim_after_distribute=1 - export FLAGS_enable_cinn_compile_cache=0 - export PYTHONUNBUFFERED=1 + use_fp16_cmd="" if [ $fp_item = "fp16" ]; then From 09cd3378bfc25b38d566c080da2ff92136c003ba Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Tue, 28 May 2024 19:48:44 +0800 Subject: [PATCH 05/12] Update run_benchmark.sh --- .../llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh index 73f63a0a65f3..2e6ecd63b90a 100644 --- a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh @@ -66,8 +66,7 @@ function _train(){ export FLAGS_disable_dyshape_in_train=True export FLAGS_enable_pir_in_executor=True export FLAGS_enable_prim_after_distribute=1 - export FLAGS_enable_cinn_compile_cache=0 - export PYTHONUNBUFFERED=1 + export ENABLE_FALL_BACK=True # 开启SOT # export FLAGS_use_cinn=True # 是否开启cinn ,在benchmark中设置 From 0ea8f239b1985a8d438a742e2a61de3d5dbdba5f Mon Sep 17 00:00:00 2001 From: mmglove Date: Tue, 28 May 2024 20:13:56 +0800 Subject: [PATCH 06/12] fix llama-7b_auto_dp2mp2pp2/benchmark_common --- .../llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py | 5 ++--- .../llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py index f4fd00825eae..0fa28a3d7a8a 100644 --- a/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py +++ b/tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py @@ -13,7 +13,6 @@ # limitations under the License. - import json import os import re @@ -28,7 +27,7 @@ def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item): ips_lines = [] for eachline in data: if "train_samples_per_second:" in eachline: - ips = float(eachline.split("train_samples_per_second: ")[1].split()[0].replace(',', '')) + ips = float(eachline.split("train_samples_per_second: ")[1].split()[0].replace(",", "")) print("----ips: ", ips) ips_lines.append(ips) print("----ips_lines: ", ips_lines) @@ -73,4 +72,4 @@ def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item): bs = int(sys.argv[5]) fp_item = sys.argv[6] - analyze(model_item, log_file, res_log_file, device_num, bs, fp_item) \ No newline at end of file + analyze(model_item, log_file, res_log_file, device_num, bs, fp_item) diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py index f4fd00825eae..0fa28a3d7a8a 100644 --- a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py @@ -13,7 +13,6 @@ # limitations under the License. 
- import json import os import re @@ -28,7 +27,7 @@ def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item): ips_lines = [] for eachline in data: if "train_samples_per_second:" in eachline: - ips = float(eachline.split("train_samples_per_second: ")[1].split()[0].replace(',', '')) + ips = float(eachline.split("train_samples_per_second: ")[1].split()[0].replace(",", "")) print("----ips: ", ips) ips_lines.append(ips) print("----ips_lines: ", ips_lines) @@ -73,4 +72,4 @@ def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item): bs = int(sys.argv[5]) fp_item = sys.argv[6] - analyze(model_item, log_file, res_log_file, device_num, bs, fp_item) \ No newline at end of file + analyze(model_item, log_file, res_log_file, device_num, bs, fp_item) From 26fabaae72c5bba59c3060ada006f47c44449f16 Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Fri, 31 May 2024 15:24:25 +0800 Subject: [PATCH 07/12] Update run_benchmark.sh --- .../llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh index 2e6ecd63b90a..5fe003b2cf2d 100644 --- a/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh @@ -61,7 +61,7 @@ function _train(){ export FLAGS_cinn_bucket_compile=1 export FLAGS_cinn_new_cluster_op_method=1 export FLAGS_deny_cinn_ops="gather" - export FLAGS_prim_forward_blacklist="pd_op.embedding" + export FLAGS_prim_forward_blacklist="pd_op.embedding;pd_op.squared_l2_norm" export FLAGS_enable_prim_after_distribute=True export FLAGS_disable_dyshape_in_train=True export FLAGS_enable_pir_in_executor=True From c49b7e1dd13b41b9c12f4803fe73b77dc0f2e38a Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Thu, 20 Jun 2024 18:34:19 +0800 Subject: [PATCH 08/12] Update prepare.sh --- .../auto_tuner/llama_pretrain/benchmark_common/prepare.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh index 5472a36e94c4..3e51c5dc6fb7 100644 --- a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh +++ b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh @@ -16,11 +16,11 @@ python -m pip install -r ../requirements.txt python -m pip install -r ../requirements-dev.txt # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/llama +cd ../../../../llm/llama python -m pip install tool_helpers rm -rf data && mkdir data From f895c9ce6ee3f6c8c43804193c45c3bca220c548 Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Thu, 20 Jun 2024 18:35:03 +0800 Subject: [PATCH 09/12] Update prepare.sh --- .../static/auto_parallel/llama2/benchmark_common/prepare.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh index edb4590e2f15..697d5d1d92e0 100644 --- 
a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh +++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh @@ -16,11 +16,11 @@ python -m pip install -r ../requirements.txt python -m pip install -r ../requirements-dev.txt # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/llama +cd ../../../../llm/llama python -m pip install tool_helpers # download data From 55754a1a7330ff09720248fbced25eff2b9670e4 Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Thu, 20 Jun 2024 18:35:55 +0800 Subject: [PATCH 10/12] Update prepare.sh --- .../hybrid_parallelism/qwen/benchmark_common/prepare.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh index 1d8a79cc2a0e..bf6952c135ca 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh @@ -18,11 +18,11 @@ python -m pip install -r ../requirements-dev.txt python -m pip install tiktoken # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/qwen +cd ../../../../llm/qwen python -m pip install tool_helpers wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy @@ -30,4 +30,4 @@ wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwe mkdir data mv llama_openwebtext_100k_ids.npy ./data -mv llama_openwebtext_100k_idx.npz ./data \ No newline at end of file +mv llama_openwebtext_100k_idx.npz ./data From 564ea16839b7b8ce4834127f961f07694e7350bf Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Thu, 20 Jun 2024 18:36:39 +0800 Subject: [PATCH 11/12] Update prepare.sh --- .../hybrid_parallelism/llama/benchmark_common/prepare.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh index 0563a1aaabac..388b179e6905 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh @@ -16,11 +16,11 @@ python -m pip install -r ../requirements.txt python -m pip install -r ../requirements-dev.txt # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/llama +cd ../../../../llm/llama python -m pip install tool_helpers wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy @@ -28,4 +28,4 @@ wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwe mkdir data mv llama_openwebtext_100k_ids.npy ./data -mv llama_openwebtext_100k_idx.npz ./data \ No newline at end of file +mv llama_openwebtext_100k_idx.npz ./data From 41c33fc13505c6a60ec7beeaa7687839c38c17eb Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Thu, 20 Jun 2024 18:37:32 +0800 Subject: [PATCH 12/12] Update prepare.sh --- 
.../hybrid_parallelism/llama2/benchmark_common/prepare.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh index 45fd82fad914..9405521c7b3f 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh @@ -16,11 +16,11 @@ python -m pip install -r ../requirements.txt python -m pip install -r ../requirements-dev.txt # install fused_ln custom ops -cd ../model_zoo/gpt-3/external_ops/ +cd ../legacy/model_zoo/gpt-3/external_ops/ python setup.py install # install tool_helpers -cd ../../../llm/llama +cd ../../../../llm/llama python -m pip install tool_helpers # download data
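
Editor's note: for context on how the added cases fit together, the sketch below shows one plausible local invocation from the PaddleNLP/tests directory. It is an assumption-laden illustration, not part of the patch: the environment variables (CUDA_VISIBLE_DEVICES, PROFILING, TRAIN_LOG_DIR, PROFILING_LOG_DIR, LOG_PATH_INDEX_DIR) are normally exported by the benchmark framework and are only read, with pwd fallbacks, by run_benchmark.sh; the log-directory names used here are hypothetical.

```bash
# Minimal local-run sketch (assumptions: repository checked out, commands run
# from PaddleNLP/tests, one GPU available; the benchmark framework usually
# exports these variables itself).
cd PaddleNLP/tests

export CUDA_VISIBLE_DEVICES=0                    # N1C1 case: a single card
export PROFILING=false                           # profiling switch read by run_benchmark.sh
export TRAIN_LOG_DIR=$(pwd)/train_log            # hypothetical output dirs; the scripts
export PROFILING_LOG_DIR=$(pwd)/profiling_log    # fall back to $(pwd) when these are unset
export LOG_PATH_INDEX_DIR=$(pwd)/speed_log
mkdir -p "$TRAIN_LOG_DIR" "$PROFILING_LOG_DIR" "$LOG_PATH_INDEX_DIR"

# One-off dependency/data setup, then the dygraph N1C1 case
# (after PATCH 03/12 the case script no longer carries the _N1C1 suffix):
bash ./test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh
bash ./test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh

# analysis_log.py then writes a one-line JSON speed record containing, among
# other keys, model_name, batch_size, fp_item, run_mode, ips, speed_unit and
# device_num, into the file under LOG_PATH_INDEX_DIR.
```

The dynamicTostatic variant is launched the same way from its own directory; per the patch it differs mainly in setting to_static=1, exporting ENABLE_FALL_BACK=True, and using the _d2sT_ suffix in its log file names.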