Skip to content

Commit acc025f

Browse files
authored
Merge pull request #4 from lizhenyun01/fix_rope
Fix rope & fix precision
2 parents feabdb8 + 9657580 commit acc025f

File tree

2 files changed

+34
-21
lines changed

2 files changed

+34
-21
lines changed

paddlenlp/experimental/transformers/deepseek_v2/modeling.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,9 @@ def _compute_cos_sin_cache(self) -> paddle.Tensor:
114114
inv_freq = self._compute_inv_freq(self.scaling_factor)
115115
t = paddle.arange(self.max_position_embeddings * self.scaling_factor, dtype=paddle.float32)
116116
freqs = paddle.einsum("i,j -> ij", t, inv_freq)
117-
cos = freqs.cos() * self.mscale
118-
sin = freqs.sin() * self.mscale
117+
emb = paddle.concat((freqs, freqs), axis=-1)
118+
cos = emb.cos() * self.mscale
119+
sin = emb.sin() * self.mscale
119120
cache = paddle.concat((cos, sin), axis=-1)
120121
return cache
121122

@@ -125,28 +126,28 @@ def forward(
125126
query: paddle.Tensor,
126127
key: paddle.Tensor,
127128
) -> Tuple[paddle.Tensor, paddle.Tensor]:
128-
query_rot = query[..., : self.rotary_dim]
129-
key_rot = key[..., : self.rotary_dim]
129+
q = query[..., : self.rotary_dim]
130+
k = key[..., : self.rotary_dim]
130131
if self.rotary_dim < self.head_size:
131132
query_pass = query[..., self.rotary_dim :]
132133
key_pass = key[..., self.rotary_dim :]
133-
134-
cos_sin = self.cos_sin_cache[position_ids]
134+
cos_sin = self.cos_sin_cache[position_ids].unsqueeze(1)
135135
cos, sin = cos_sin.chunk(2, axis=-1)
136136

137-
cos = cos.repeat_interleave(2, axis=-1).unsqueeze(-2)
138-
sin = sin.repeat_interleave(2, axis=-1).unsqueeze(-2)
137+
s, h, d = q.shape
138+
q = q.reshape([s, h, d // 2, 2]).transpose([0, 1, 3, 2]).reshape([s, h, d])
139139

140-
def _rotate_gptj(x: paddle.Tensor) -> paddle.Tensor:
141-
x1 = x[..., ::2]
142-
x2 = x[..., 1::2]
143-
x = paddle.stack((-x2, x1), axis=-1)
144-
return x.flatten(-2)
140+
s, h, d = k.shape
141+
k = k.reshape([s, h, d // 2, 2]).transpose([0, 1, 3, 2]).reshape([s, h, d])
145142

146-
rotate_fn = _rotate_gptj
143+
def rotate_half(x):
144+
"""Rotates half the hidden axes of the input."""
145+
x1 = x[..., : x.shape[-1] // 2]
146+
x2 = x[..., x.shape[-1] // 2 :]
147+
return paddle.concat([-x2, x1], axis=-1) # shape is the same as x
147148

148-
query_rot = query_rot * cos + rotate_fn(query_rot) * sin
149-
key_rot = key_rot * cos + rotate_fn(key_rot) * sin
149+
query_rot = (q * cos) + (rotate_half(q) * sin)
150+
key_rot = (k * cos) + (rotate_half(k) * sin)
150151

151152
if self.rotary_dim < self.head_size:
152153
query = paddle.concat((query_rot, query_pass), axis=-1)

paddlenlp/experimental/transformers/fused_transformer_layers.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -949,7 +949,7 @@ def compute_layernorm_before_qkv(self, src, i):
949949

950950
return ln_out
951951

952-
def compute_qkv_linear(self, ln_out, i):
952+
def compute_qkv_linear(self, ln_out, i, position_ids=None):
953953
if self.config.mla_config.use_mla():
954954
if self.config.mla_config.q_lora_rank is not None:
955955
query = paddle.matmul(ln_out, self.q_a_proj_weights[i])
@@ -989,7 +989,6 @@ def compute_qkv_linear(self, ln_out, i):
989989
key_value, [self.config.mla_config.qk_nope_head_dim, self.config.mla_config.v_head_dim], axis=-1
990990
)
991991

992-
position_ids = paddle.arange(ln_out.shape[0]).reshape((1, -1))
993992
query_pe, key_pe = self.config.rotary_emb(position_ids, query_pe, key_pe)
994993

995994
query[..., self.config.mla_config.qk_nope_head_dim :] = query_pe
@@ -1018,9 +1017,9 @@ def compute_qkv_linear(self, ln_out, i):
10181017

10191018
return qkv_out
10201019

1021-
def compute_qkv(self, src, residual_input, i):
1020+
def compute_qkv(self, src, residual_input, i, position_ids=None):
10221021
ln_out = self.compute_layernorm_before_qkv(src, i)
1023-
qkv_out = self.compute_qkv_linear(ln_out, i)
1022+
qkv_out = self.compute_qkv_linear(ln_out, i, position_ids)
10241023
return qkv_out, residual_input
10251024

10261025
def compute_max_len(self, seq_lens_encoder, seq_lens_decoder, cum_offsets):
@@ -1406,10 +1405,23 @@ def forward(
14061405
kwargs.get("block_size", 64),
14071406
self.config.speculate_config.speculate_max_draft_token_num,
14081407
)
1408+
seq_lens_this_time = kwargs.get("seq_lens_this_time", None)
1409+
bsz = seq_lens_this_time.shape[0]
1410+
position_ids = []
1411+
for i in range(bsz):
1412+
cur_seq_len = kwargs.get("seq_lens_encoder", None)[i]
1413+
if cur_seq_len > 0:
1414+
for j in range(cur_seq_len):
1415+
position_ids.append(j)
1416+
else:
1417+
ids = kwargs.get("seq_lens_decoder", None)[i].item()
14091418

1419+
if ids > 0:
1420+
position_ids.append(ids)
1421+
# print("position_ids;", position_ids)
14101422
residual_input = src
14111423
for i in range(self.num_layers):
1412-
qkv_out, residual_input = self.compute_qkv(src, residual_input, i)
1424+
qkv_out, residual_input = self.compute_qkv(src, residual_input, i, position_ids)
14131425
out_linear_out = self.compute_attn(
14141426
time_step,
14151427
qkv_out,

0 commit comments

Comments
 (0)