
Commit 9250d64

Merge commit '525eef76f0513f205f5d6e6122cdb53f52505386' into xpullama2
2 parents: 81d7e07 + 525eef7

File tree: 6 files changed (+13 / -21 lines)

paddlenlp/transformers/qwen2/modeling.py

Lines changed: 6 additions & 13 deletions
```diff
@@ -18,6 +18,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Paddle Qwen2 model."""
+from __future__ import annotations
 
 import math
 import warnings
```
```diff
@@ -187,11 +188,11 @@ def scaled_dot_product_attention(
     else:
         # [ bz, seqlen, nhead, head_dim] -> [bs, nhead, seq_len, head_dim]
         query_states = paddle.transpose(query_states, [0, 2, 1, 3])
-        # merge with the next tranpose
+        # merge with the next transpose
         key_states = paddle.transpose(key_states, [0, 2, 1, 3])
         value_states = paddle.transpose(value_states, [0, 2, 1, 3])
 
-        # matmul and devide by sqrt(head_dim)
+        # matmul and divide by sqrt(head_dim)
         attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2]))
 
         if attn_weights.shape != [bsz, num_heads, q_len, kv_seq_len]:
```
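For reference, this branch computes plain scaled dot-product attention scores. A minimal standalone sketch of the same transpose-then-matmul shape flow (tensor shapes invented for illustration, not taken from the commit):

```python
import math

import paddle

# Illustrative shapes only: [batch, seq_len, num_heads, head_dim]
bsz, q_len, num_heads, head_dim = 2, 8, 4, 16
query_states = paddle.rand([bsz, q_len, num_heads, head_dim])
key_states = paddle.rand([bsz, q_len, num_heads, head_dim])

# [bs, seq_len, nhead, head_dim] -> [bs, nhead, seq_len, head_dim]
query_states = paddle.transpose(query_states, [0, 2, 1, 3])
key_states = paddle.transpose(key_states, [0, 2, 1, 3])

# matmul and divide by sqrt(head_dim); scores land at [bs, nhead, q_len, kv_seq_len]
attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2]))
print(attn_weights.shape)  # [2, 4, 8, 8]
```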
```diff
@@ -1127,7 +1128,7 @@ def forward(self, prediction_scores, masked_lm_labels):
         if self.enable_parallel_cross_entropy:
             if prediction_scores.shape[-1] == self.config.vocab_size:
                 warnings.warn(
-                    f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}"
+                    f"enable_parallel_cross_entropy, the vocab_size should be splitted: {prediction_scores.shape[-1]}, {self.config.vocab_size}"
                 )
                 self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index)
 
```
```diff
@@ -1202,14 +1203,7 @@ def get_decoder(self):
         return self.qwen2
 
     def prepare_inputs_for_generation(
-        self,
-        input_ids,
-        use_cache=False,
-        past_key_values=None,
-        attention_mask=None,
-        inputs_embeds=None,
-        output_router_logits=False,
-        **kwargs
+        self, input_ids, use_cache=False, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
         batch_size, seq_length = input_ids.shape
         position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length)))
@@ -1230,7 +1224,6 @@ def prepare_inputs_for_generation(
                 "past_key_values": past_key_values,
                 "use_cache": use_cache,
                 "attention_mask": attention_mask,
-                "output_router_logits": output_router_logits,
             }
         )
         return model_inputs
```
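The default `position_ids` built in this method broadcasts a single `arange` across the batch. A quick standalone check of that expression (toy sizes, illustration only):

```python
import paddle

batch_size, seq_length = 2, 5
position_ids = paddle.arange(seq_length).expand((batch_size, seq_length))
print(position_ids)
# [[0, 1, 2, 3, 4],
#  [0, 1, 2, 3, 4]]
```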
```diff
@@ -1325,7 +1318,7 @@ def forward(
         hidden_states = outputs[0]
 
         # if labels is None,means we need full output, instead of tensor_parallel_output
-        # tensor_parallel_output is togather with ParallelCrossEntropy
+        # tensor_parallel_output is together with ParallelCrossEntropy
         tensor_parallel_output = (
             self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1
         )
```

tests/test_tipc/auto_tuner/autoconfig/check_mem_usage.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -19,7 +19,7 @@ auto_log_file=./autoconfig/${model_name}_auto_tuner.log
 
 log="./llama7b_pretrain_auto_tuner.log"
 launch_best_cfg=$(sed -n "s/.*Launch best cfg: \(.*\)}/\1/p" "$auto_log_file")
-cfg_max_mem_usage=$(echo "$launch_best_cfg" | awk -F"max_mem_usage': " '{print $2}' | awk -F, '{print $1}')
+cfg_max_mem_usage=$(echo "$launch_best_cfg" | awk -F"\"max_mem_usage\":" '{print $2}' | awk -F, '{print $1}')
 
 buffer=$(sed -n 's/.*"buffer":\([^,}]*\).*/\1/p' $autoconfig_json_file | awk '{print $1}')
 max_mem_usage=$(sed -n 's/.*"max_mem_usage":\([^,}]*\).*/\1/p' $autoconfig_json_file | awk '{print $1}')
```
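The new `awk` field separator assumes the "Launch best cfg" payload is serialized with JSON-style quoting (`"max_mem_usage":`) rather than a Python dict repr (`'max_mem_usage': `). A minimal sketch of the same extraction in Python, with a made-up log line (format assumed, not taken from the commit):

```python
import json
import re

# Hypothetical log line in the JSON-style format the updated awk expects.
line = 'INFO Launch best cfg: {"mp_degree": 4, "max_mem_usage": 26130, "use_recompute": true}'

match = re.search(r"Launch best cfg: (\{.*\})", line)
if match:
    cfg = json.loads(match.group(1))
    print(cfg["max_mem_usage"])  # 26130
```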
Lines changed: 3 additions & 4 deletions
```diff
@@ -1,6 +1,5 @@
 job_id,mp_degree,sharding_degree,pp_degree,dp_degree,sharding_stage,micro_batch_size,vpp_degree,use_recompute,recompute_granularity,acc_steps,global_batch_size,exec_time,interval_samples_per_second,max_mem_usage,error_info
-5,4,2,1,1,2,4,1,True,full,1,8,72.38,4.74138,26130,
-2,4,1,2,1,1,8,1,True,full,1,8,57.35,2.58279,27722,
+4,4,2,1,1,2,4,1,True,full,1,8,72.38,4.74138,26130,
+2,4,1,2,1,2,4,1,True,full,2,8,40.17,3.3612,24590,
 1,8,1,1,1,2,1,1,True,full,8,8,55.32,2.3064,19050,
-3,4,1,2,1,1,8,1,True,full_attn,1,8,45.29,,OOM,['Out of memory']
-4,4,1,2,1,1,4,1,True,full_attn,2,8,55.15,,OOM,['Out of memory']
+3,4,1,2,1,2,4,1,True,full_attn,2,8,39.38,,OOM,['Out of memory']
```
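The auto-tuner history above is plain CSV, so it is easy to post-process. An illustrative parse that picks the fastest non-OOM job (column meanings read off the header; the file name is a placeholder, not from the commit):

```python
import csv

# "history.csv" is a placeholder name for the auto-tuner history file shown above.
with open("history.csv", newline="") as f:
    rows = list(csv.DictReader(f))

# Skip failed runs, then take the job with the smallest execution time.
ok = [r for r in rows if r["max_mem_usage"] != "OOM"]
best = min(ok, key=lambda r: float(r["exec_time"]))
print(best["job_id"], best["exec_time"], best["max_mem_usage"])
```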

tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -34,7 +34,7 @@ else
     rank=$PADDLE_TRAINER_ID
     echo $master_ip $rank
     if [ $rank == 0 ]; then
-        net=$(netstat -anp | grep 2379 | grep "LISTEN")
+        net=$(netstat -anp | grep :2379 | grep "LISTEN")
         if [ ${#net} == 0 ]; then
             apt-get install -y --allow-downgrades etcd
             nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
```
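The tightened pattern matters because a bare `2379` matches those digits anywhere in a `netstat` line, including inside a PID, while `:2379` pins them to a port position. A toy demonstration (sample lines invented for illustration):

```python
# Two invented netstat-style lines: only the first is an etcd listener on port 2379.
lines = [
    "tcp  0  0 0.0.0.0:2379  0.0.0.0:*  LISTEN  999/etcd",
    "tcp  0  0 0.0.0.0:8080  0.0.0.0:*  LISTEN  12379/python",  # PID happens to contain 2379
]
print([l for l in lines if "2379" in l and "LISTEN" in l])   # matches both lines
print([l for l in lines if ":2379" in l and "LISTEN" in l])  # matches only the etcd line
```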

tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -44,7 +44,7 @@ else
     rank=$PADDLE_TRAINER_ID
     echo $master_ip $rank
     if [ $rank == 0 ]; then
-        net=$(netstat -anp | grep 2379 | grep "LISTEN")
+        net=$(netstat -anp | grep :2379 | grep "LISTEN")
         if [ ${#net} == 0 ]; then
             apt-get install -y --allow-downgrades etcd
             nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
```
Lines changed: 1 addition & 1 deletion
```diff
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 param="model_item=meta-llama-Llama-2-7b_pretrain_dy2st "
-param+="run_mode=Sharding_Stage2 "
+param+="run_mode=Sharding32_Stage2 "
 param+="device_num=N4C32 "
 param+="global_batch_size=32 "
 param+="nnodes=4 "
```
