We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 0cd8fe7 commit a90f163 (Copy full SHA for a90f163)
tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
@@ -74,7 +74,11 @@ function _train(){
74
add_options=""
75
log_file=${train_log_file}
76
fi
77
-
+
78
+ # 70b models must have CUDA_DEVICE_MAX_CONNECTIONS unset; otherwise training hangs
79
+ if [[ "${MODEL_TYPE}" =~ "70b" ]]; then
80
+ unset CUDA_DEVICE_MAX_CONNECTIONS
81
+ fi
82
# Disable for hanging bug
83
# if [ "${tensor_parallel_degree}" != "1" ]; then
84
# export CUDA_DEVICE_MAX_CONNECTIONS=1
@@ -136,7 +140,7 @@ function _train(){
136
140
rm -rf mylog && rm -rf checkpoints
137
141
138
142
echo "train_cmd: ${train_cmd} log_file: ${log_file}"
139
- timeout 15m ${train_cmd} > ${log_file} 2>&1
143
+ timeout 40m ${train_cmd} > ${log_file} 2>&1
144
145
if [ $? -ne 0 ];then
146
echo -e "${model_name}, FAIL"
0 commit comments