@@ -52,9 +52,18 @@ def swiglu(x, y=None):
    flash_attention = None

from paddlenlp.transformers.ring_flash_attention import RingFlashAttention
-from paddlenlp.transformers.context_parallel_utils import split_inputs_sequence_dim_load_balance

-def fusion_rope(query_states, key_states, value_states, hidden_states, position_ids, past_key_value, rotary_emb, cp_parallel_degree=-1):
+
+def fusion_rope(
+    query_states,
+    key_states,
+    value_states,
+    hidden_states,
+    position_ids,
+    past_key_value,
+    rotary_emb,
+    cp_parallel_degree=-1,
+):
    if get_env_device() != "gcu":
        assert past_key_value is None, "fuse rotary not support cache kv for now"
    batch_size, seq_length, num_heads, head_dim = query_states.shape
@@ -64,9 +73,6 @@ def fusion_rope(query_states, key_states, value_states, hidden_states, position_
        kv_seq_len *= cp_parallel_degree
    if get_env_device() != "gcu":
        cos, sin = rotary_emb(value_states, seq_len=kv_seq_len)
-    if cp_parallel_degree > 1:
-        cos = split_inputs_sequence_dim_load_balance(cos)
-        sin = split_inputs_sequence_dim_load_balance(sin)
    if get_env_device() == "npu":
        query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[0]
        key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0]
@@ -165,7 +171,7 @@ def fusion_flash_attention(
            attention_mask = attention_mask.cast(alibi.dtype) + alibi
        if get_env_device() == "npu":
            if config.cp_parallel_degree > 1:
-                raise ValueError(f"Context parallel is not implemented for npu")
+                raise ValueError("Context parallel is not implemented for npu")
            attn_output = core.eager._run_custom_op(
                "flash_attention_npu",
                query_states,
@@ -181,7 +187,7 @@ def fusion_flash_attention(
            )[0]
        elif get_env_device() == "gcu":
            if config.cp_parallel_degree > 1:
-                raise ValueError(f"Context parallel is not implemented for gcu")
+                raise ValueError("Context parallel is not implemented for gcu")
            attn_output = core.eager._run_custom_op(
                "fused_sdp_flash_attention_gcu",
                query_states,