From d03d564186f6f8b48f38b549fffc64e5691d48f3 Mon Sep 17 00:00:00 2001 From: XieYunshen <1084314248@qq.com> Date: Thu, 2 Jan 2025 15:55:28 +0800 Subject: [PATCH] Modify the environment variables and model configuration of the benchmark test case. --- .../hybrid_parallelism/gpt3/benchmark_common/run_benchmark.sh | 2 ++ ...etrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh | 2 +- .../auto_parallel/baichuan2/benchmark_common/run_benchmark.sh | 2 ++ .../static/auto_parallel/gpt3/benchmark_common/run_benchmark.sh | 2 ++ .../static/auto_parallel/qwen/benchmark_common/run_benchmark.sh | 2 ++ 5 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/benchmark_common/run_benchmark.sh index 2b5a830a7c52..abbec3f3387f 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/benchmark_common/run_benchmark.sh @@ -127,6 +127,8 @@ function _train(){ export FLAGS_selected_gpus="0,1,2,3,4,5,6,7" export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH +# benchmark框架中会默认设置CUDA_MODULE_LOADING=LAZY,影响case执行,修复框架问题后再移除该变量 +unset CUDA_MODULE_LOADING source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ diff --git a/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/baichuan-inc-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh b/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/baichuan-inc-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh index 07cd78f92054..ee1ebfa5c8de 100644 --- a/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/baichuan-inc-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh +++ b/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/baichuan-inc-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -param="model_item=baichuan-inc-baichuan-2-13b_pretrain " +param="model_item=baichuan-inc-baichuan-2-13b_pretrain_dy2st " param+="run_mode=DP1_MP4_PP2_1F1B_Sharding4_Stage1 " param+="device_num=N4C32 " param+="global_batch_size=128 " diff --git a/tests/test_tipc/static/auto_parallel/baichuan2/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/baichuan2/benchmark_common/run_benchmark.sh index e550b7256f7f..c29841c3db5b 100644 --- a/tests/test_tipc/static/auto_parallel/baichuan2/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/static/auto_parallel/baichuan2/benchmark_common/run_benchmark.sh @@ -242,6 +242,8 @@ export FLAGS_enable_sharding_stage1_tensor_fusion=1 # 只有13b的任务需要打开CUDA_DEVICE_MAX_CONNECTIONS,7b与13b关闭 export CUDA_DEVICE_MAX_CONNECTIONS=1 export PARALLEL_CROSS_ENTROPY=true +# benchmark框架中会默认设置CUDA_MODULE_LOADING=LAZY,影响case执行,修复框架问题后再移除该变量 +unset CUDA_MODULE_LOADING source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ diff --git a/tests/test_tipc/static/auto_parallel/gpt3/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/gpt3/benchmark_common/run_benchmark.sh index f11a624ad854..9eb16e442663 100644 --- a/tests/test_tipc/static/auto_parallel/gpt3/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/static/auto_parallel/gpt3/benchmark_common/run_benchmark.sh @@ -246,6 +246,8 @@ export FLAGS_enable_sharding_stage1_tensor_fusion=1 # 只有13b的任务需要打开CUDA_DEVICE_MAX_CONNECTIONS,7b与13b关闭 export CUDA_DEVICE_MAX_CONNECTIONS=1 export PARALLEL_CROSS_ENTROPY=true +# benchmark框架中会默认设置CUDA_MODULE_LOADING=LAZY,影响case执行,修复框架问题后再移除该变量 +unset CUDA_MODULE_LOADING source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ diff --git a/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/run_benchmark.sh index 704081a29214..0c3de2c56904 100644 --- a/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/run_benchmark.sh @@ -246,6 +246,8 @@ export FLAGS_enable_sharding_stage1_tensor_fusion=1 # 只有13b的任务需要打开CUDA_DEVICE_MAX_CONNECTIONS,7b与13b关闭 export CUDA_DEVICE_MAX_CONNECTIONS=1 export PARALLEL_CROSS_ENTROPY=true +# benchmark框架中会默认设置CUDA_MODULE_LOADING=LAZY,影响case执行,修复框架问题后再移除该变量 +unset CUDA_MODULE_LOADING source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@