Commit a90f163
fix run_benchmark for llama2_70b in auto_parallel (#8484)

* remove tsinghua pypi
* modify gpt dataset addr for benchmark
* fix run_benchmark for llama2_70b in auto_parallel

1 parent 0cd8fe7

File tree: 1 file changed, +6 -2 lines


tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh

Lines changed: 6 additions & 2 deletions
@@ -74,7 +74,11 @@ function _train(){
         add_options=""
         log_file=${train_log_file}
     fi
-
+
+    # 70b needs this switch turned off, otherwise it will hang
+    if [[ "${MODEL_TYPE}" =~ "70b" ]]; then
+        unset CUDA_DEVICE_MAX_CONNECTIONS
+    fi
     # Disable for hanging bug
     # if [ "${tensor_parallel_degree}" != "1" ]; then
     #     export CUDA_DEVICE_MAX_CONNECTIONS=1
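
Note on the guard above: in bash, a quoted right-hand side of `=~` is matched literally, so the new test is a substring check that fires for any MODEL_TYPE containing "70b". A minimal standalone sketch of the same pattern (the MODEL_TYPE value here is illustrative; in run_benchmark.sh it comes from the benchmark config):

    #!/usr/bin/env bash
    # Illustrative value, not from the script itself.
    MODEL_TYPE="llama2-70b"

    export CUDA_DEVICE_MAX_CONNECTIONS=1

    # Quoted pattern on the right of =~ is matched literally,
    # so this is a substring check ("llama2-70b", "70b-base", ...).
    if [[ "${MODEL_TYPE}" =~ "70b" ]]; then
        # Remove the variable from the environment entirely,
        # rather than setting it to an empty string.
        unset CUDA_DEVICE_MAX_CONNECTIONS
    fi

    echo "CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS:-<unset>}"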
@@ -136,7 +140,7 @@ function _train(){
     rm -rf mylog && rm -rf checkpoints

     echo "train_cmd: ${train_cmd} log_file: ${log_file}"
-    timeout 15m ${train_cmd} > ${log_file} 2>&1
+    timeout 40m ${train_cmd} > ${log_file} 2>&1

     if [ $? -ne 0 ];then
         echo -e "${model_name}, FAIL"
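
For context on the timeout change: GNU coreutils `timeout` kills the command when the limit expires and exits with status 124, which the `$? -ne 0` check above then reports as a FAIL. A minimal sketch of that pattern; `sleep 5` stands in for `${train_cmd}`, and the 2s limit is shortened from 40m purely so the timeout actually triggers when run:

    #!/usr/bin/env bash
    # 'sleep 5' exceeds the 2-second limit, so timeout kills it
    # and returns 124 (GNU timeout's timed-out status).
    timeout 2s sleep 5
    status=$?
    if [ ${status} -eq 124 ]; then
        echo "train command timed out"
    elif [ ${status} -ne 0 ]; then
        echo "train command failed (exit ${status})"
    fi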
