@@ -474,47 +474,50 @@ def set_state_dict(self, state_dict):
         unfused_state_dict = {}
         head_size = self.hidden_size // self.num_attention_heads

-        self.embed_tokens.weight.set_value(paddle.to_tensor(state_dict["llama.embed_tokens.weight"]))
+        self.embed_tokens.weight.set_value(paddle.to_tensor(state_dict["llama.embed_tokens.weight"], dtype=self.embed_tokens.weight.dtype))
         self.norm.weight.set_value(paddle.to_tensor(state_dict["llama.norm.weight"], dtype=self.norm.weight.dtype))

         for idx in range(self.config.num_hidden_layers):
             logger.info(f"set state for layer {idx}")

             if self.use_weight_only:
                 logger.info("weight only is enabled")
-            unfused_state_dict = {}
-            unfused_state_dict["self_attn.q_proj.weight"] = state_dict[
-                "llama.layers.{}.self_attn.q_proj.weight".format(idx)
-            ]
-            unfused_state_dict["self_attn.k_proj.weight"] = state_dict[
-                "llama.layers.{}.self_attn.k_proj.weight".format(idx)
-            ]
-            unfused_state_dict["self_attn.v_proj.weight"] = state_dict[
-                "llama.layers.{}.self_attn.v_proj.weight".format(idx)
-            ]
-
-            concated_qkv_weight = (
-                np.concatenate(
-                    [
-                        unfused_state_dict["self_attn.q_proj.weight"],
-                        unfused_state_dict["self_attn.k_proj.weight"],
-                        unfused_state_dict["self_attn.v_proj.weight"],
-                    ],
-                    axis=-1,
-                )
-                .transpose(1, 0)
-                .reshape(
-                    3 * (self.num_attention_heads // self.config.tensor_parallel_degree) * (head_size),
-                    self.hidden_size,
+            if "llama.layers.{}.self_attn.qkv_proj.weight".format(idx) in state_dict.keys():
+                concated_qkv_weight = state_dict["llama.layers.{}.self_attn.qkv_proj.weight".format(idx)].transpose([1, 0])
+            else:
+                unfused_state_dict = {}
+                unfused_state_dict["self_attn.q_proj.weight"] = state_dict[
+                    "llama.layers.{}.self_attn.q_proj.weight".format(idx)
+                ]
+                unfused_state_dict["self_attn.k_proj.weight"] = state_dict[
+                    "llama.layers.{}.self_attn.k_proj.weight".format(idx)
+                ]
+                unfused_state_dict["self_attn.v_proj.weight"] = state_dict[
+                    "llama.layers.{}.self_attn.v_proj.weight".format(idx)
+                ]
+                concated_qkv_weight = (
+                    np.concatenate(
+                        [
+                            unfused_state_dict["self_attn.q_proj.weight"],
+                            unfused_state_dict["self_attn.k_proj.weight"],
+                            unfused_state_dict["self_attn.v_proj.weight"],
+                        ],
+                        axis=-1,
+                    )
+                    .transpose(1, 0)
+                    .reshape(
+                        3 * (self.num_attention_heads // self.config.tensor_parallel_degree) * (head_size),
+                        self.hidden_size,
+                    )
+                )  # reshape(3, self.num_attention_heads // self.config.tensor_parallel_degree, head_size, self.hidden_size)
+            if "llama.layers.{}.mlp.gate_up_fused_proj.weight".format(idx) in state_dict.keys():
+                concated_ffn1_weight = state_dict["llama.layers.{}.mlp.gate_up_fused_proj.weight".format(idx)]
+            else:
+                unfused_state_dict["mlp.gate_proj.weight"] = state_dict["llama.layers.{}.mlp.gate_proj.weight".format(idx)]
+                unfused_state_dict["mlp.up_proj.weight"] = state_dict["llama.layers.{}.mlp.up_proj.weight".format(idx)]
+                concated_ffn1_weight = np.concatenate(
+                    [unfused_state_dict["mlp.gate_proj.weight"], unfused_state_dict["mlp.up_proj.weight"]], axis=-1
                 )
-            )  # reshape(3, self.num_attention_heads // self.config.tensor_parallel_degree, head_size, self.hidden_size)
-
-            unfused_state_dict["mlp.gate_proj.weight"] = state_dict["llama.layers.{}.mlp.gate_proj.weight".format(idx)]
-            unfused_state_dict["mlp.up_proj.weight"] = state_dict["llama.layers.{}.mlp.up_proj.weight".format(idx)]
-
-            concated_ffn1_weight = np.concatenate(
-                [unfused_state_dict["mlp.gate_proj.weight"], unfused_state_dict["mlp.up_proj.weight"]], axis=-1
-            )

             ffn1_weight_tensor = paddle.to_tensor(concated_ffn1_weight)

             qkv_weight_tensor = paddle.to_tensor(concated_qkv_weight)
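Note: the new branch accepts either a pre-fused qkv_proj weight or separate q/k/v projections; in the unfused case the three [hidden_size, heads_per_rank * head_size] matrices are concatenated, transposed, and reshaped into the [3 * heads_per_rank * head_size, hidden_size] layout the fused transformer block expects. A minimal NumPy-only sketch of that shape manipulation (illustrative sizes, not values from the diff):

import numpy as np

# Illustrative sizes -- assumptions for the sketch, not taken from the diff.
hidden_size = 8
num_attention_heads = 2
tensor_parallel_degree = 1
head_size = hidden_size // num_attention_heads
heads_per_rank = num_attention_heads // tensor_parallel_degree

# Unfused projection weights are stored as [hidden_size, heads_per_rank * head_size].
q = np.random.rand(hidden_size, heads_per_rank * head_size).astype("float32")
k = np.random.rand(hidden_size, heads_per_rank * head_size).astype("float32")
v = np.random.rand(hidden_size, heads_per_rank * head_size).astype("float32")

# Concatenate along the output dimension, transpose, and reshape into the
# fused [3 * heads_per_rank * head_size, hidden_size] layout expected above.
concated_qkv_weight = (
    np.concatenate([q, k, v], axis=-1)
    .transpose(1, 0)
    .reshape(3 * heads_per_rank * head_size, hidden_size)
)
assert concated_qkv_weight.shape == (3 * heads_per_rank * head_size, hidden_size)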
@@ -534,7 +537,7 @@ def set_state_dict(self, state_dict):
                     paddle.cast(paddle.to_tensor(concated_qkv_weight), "int8")
                 )
             else:
-                self.transformer_block.qkv_weights[idx].set_value(qkv_weight_tensor)
+                self.transformer_block.qkv_weights[idx].set_value(qkv_weight_tensor.cast(self.transformer_block.qkv_weights[idx].dtype))

             linear_weight_tensor = paddle.to_tensor(state_dict["llama.layers.{}.self_attn.o_proj.weight".format(idx)])
             if self.use_weight_only:
@@ -556,7 +559,7 @@ def set_state_dict(self, state_dict):
                     )
                 )
             else:
-                self.transformer_block.linear_weights[idx].set_value(linear_weight_tensor)
+                self.transformer_block.linear_weights[idx].set_value(linear_weight_tensor.cast(self.transformer_block.linear_weights[idx].dtype))

             if self.use_weight_only:
                 ffn1_quanted_weight_tensor, ffn1_weight_scale_tensor = weight_quantize(
@@ -572,7 +575,7 @@ def set_state_dict(self, state_dict):
                     paddle.cast(paddle.to_tensor(concated_ffn1_weight).transpose((1, 0)), "int8")
                 )
             else:
-                self.transformer_block.ffn1_weights[idx].set_value(ffn1_weight_tensor)
+                self.transformer_block.ffn1_weights[idx].set_value(ffn1_weight_tensor.cast(self.transformer_block.ffn1_weights[idx].dtype))

             ffn2_weight_tensor = paddle.to_tensor(state_dict["llama.layers.{}.mlp.down_proj.weight".format(idx)])
             if self.use_weight_only:
@@ -594,7 +597,7 @@ def set_state_dict(self, state_dict):
                     )
                 )
             else:
-                self.transformer_block.ffn2_weights[idx].set_value(ffn2_weight_tensor)
+                self.transformer_block.ffn2_weights[idx].set_value(ffn2_weight_tensor.cast(self.transformer_block.ffn2_weights[idx].dtype))

             if self.quant_type == "a8w8":
                 if self.shift_smooth_all_linears:
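Note: the four else-branches above now cast each loaded tensor to the destination parameter's dtype before set_value, so a float32 checkpoint can populate float16/bfloat16 inference weights without a dtype mismatch. A minimal sketch of that pattern, using a hypothetical standalone float16 parameter rather than the model's own weights:

import numpy as np
import paddle

# Hypothetical destination parameter standing in for e.g. qkv_weights[idx];
# inference parameters are typically created in float16/bfloat16.
dst = paddle.create_parameter(
    shape=[4, 4],
    dtype="float16",
    default_initializer=paddle.nn.initializer.Constant(0.0),
)

# Checkpoint weights usually arrive as float32 numpy arrays.
checkpoint_weight = np.random.rand(4, 4).astype("float32")

# Cast to the destination dtype before set_value, as in the else-branches above.
dst.set_value(paddle.to_tensor(checkpoint_weight).cast(dst.dtype))
assert dst.dtype == paddle.float16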
@@ -1264,7 +1267,7 @@ def forward(
     @paddle.no_grad()
     def set_state_dict(self, state_dict):
         if "lm_head.weight" in state_dict:
-            self.lm_head.weight.set_value(state_dict["lm_head.weight"])
+            self.lm_head.weight.set_value(paddle.to_tensor(state_dict["lm_head.weight"], dtype=self.lm_head.weight.dtype))
         self.llama.set_state_dict({k: state_dict[k] for k in state_dict.keys()})


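Note: for lm_head.weight (and embed_tokens.weight in the first hunk) the dtype is instead passed to paddle.to_tensor, so the conversion happens when the tensor is built rather than via a later cast. A small sketch contrasting the two styles, using hypothetical shapes and float16 as the assumed target dtype:

import numpy as np
import paddle

weight = np.random.rand(8, 8).astype("float32")

# Convert while building the tensor (the embed_tokens / lm_head style above).
t1 = paddle.to_tensor(weight, dtype="float16")

# Build first, then cast (the qkv/linear/ffn style above).
t2 = paddle.to_tensor(weight).cast("float16")

assert t1.dtype == t2.dtype == paddle.float16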