
Commit 88e23ee

Reduce padding overhead for sharedMoE (#3606)
1 parent b80f700

File tree

3 files changed: +151 additions, -10 deletions

  csrc/cpu/aten/DSMoE.cpp
  intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py
  tests/cpu/test_deepseek_ops.py

csrc/cpu/aten/DSMoE.cpp

Lines changed: 101 additions & 0 deletions
@@ -9,6 +9,98 @@ namespace torch_ipex {
 namespace cpu {
 
 IPEX_DEFINE_DISPATCH(fused_experts_impl_stub);
+template <typename T>
+inline void copy_and_fill(
+    T* __restrict__ out,
+    const T* __restrict__ input,
+    int size,
+    int pad_size,
+    T fill_value) {
+  using Vec = at::vec::Vectorized<T>;
+  int d = 0;
+  if (size >= Vec::size()) {
+#pragma GCC unroll 4
+    for (; d < size; d += Vec::size()) {
+      Vec data = Vec::loadu(input + d);
+      data.store(out + d);
+    }
+  }
+  for (; d < size; ++d) {
+    out[d] = input[d];
+  }
+  // using scalar padding as pad_size is less than vec size here
+  for (; d < pad_size; ++d) {
+    out[d] = fill_value;
+  }
+}
+
+at::Tensor fused_experts_with_shared(
+    const at::Tensor& hidden_states,
+    const at::Tensor& w1,
+    const at::Tensor& w2,
+    const at::Tensor& topk_weights,
+    const at::Tensor& topk_ids,
+    bool inplace,
+    bool is_vnni,
+    bool is_distributed,
+    bool is_woq,
+    int64_t woq_weight_dtype,
+    int64_t woq_group_size,
+    int64_t woq_lowp_mode,
+    const std::optional<at::Tensor>& w1_scale,
+    const std::optional<at::Tensor>& w1_zp,
+    const std::optional<at::Tensor>& w1_compensation,
+    const std::optional<at::Tensor>& w2_scale,
+    const std::optional<at::Tensor>& w2_zp,
+    const std::optional<at::Tensor>& w2_compensation) {
+  RECORD_FUNCTION(
+      "ipex::fused_experts_with_shared", c10::ArrayRef<c10::IValue>({}));
+  int32_t num_tokens = topk_weights.size(0);
+  int32_t num_topk_experts = topk_weights.size(1);
+  int32_t num_topk_experts_pad = num_topk_experts + 1;
+  int32_t num_experts = w1.size(0);
+  auto pad_weight =
+      at::empty({num_tokens, num_topk_experts_pad}, topk_weights.options());
+  auto pad_ids =
+      at::empty({num_tokens, num_topk_experts_pad}, topk_ids.options());
+  // padding 1 shared expert to routed expert
+  // topk_id is num_experts - 1, and topk weights is 1.0
+  for (int id = 0; id < num_tokens; id++) {
+    copy_and_fill<int32_t>(
+        pad_ids.data_ptr<int32_t>() + id * num_topk_experts_pad,
+        topk_ids.data_ptr<int32_t>() + id * num_topk_experts,
+        num_topk_experts,
+        num_topk_experts_pad,
+        num_experts - 1);
+    copy_and_fill<float>(
+        pad_weight.data_ptr<float>() + id * num_topk_experts_pad,
+        topk_weights.data_ptr<float>() + id * num_topk_experts,
+        num_topk_experts,
+        num_topk_experts_pad,
+        1.0);
+  }
+  return fused_experts_impl_stub(
+      kCPU,
+      hidden_states,
+      w1,
+      w2,
+      pad_weight,
+      pad_ids,
+      inplace,
+      is_vnni,
+      is_distributed,
+      is_woq,
+      woq_weight_dtype,
+      woq_group_size,
+      woq_lowp_mode,
+      w1_scale,
+      w1_zp,
+      w1_compensation,
+      w2_scale,
+      w2_zp,
+      w2_compensation);
+}
+
 at::Tensor fused_experts(
     const at::Tensor& hidden_states,
     const at::Tensor& w1,
@@ -334,6 +426,15 @@ TORCH_LIBRARY_FRAGMENT(torch_ipex, m) {
       Tensor? w1_scale, Tensor? w1_zp, Tensor? w1_compensation, Tensor? w2_scale, Tensor? w2_zp, Tensor? w2_compensation) -> Tensor");
   m.impl(
       "fused_experts", c10::DispatchKey::CPU, torch_ipex::cpu::fused_experts);
+  m.def(
+      "fused_experts_with_shared(Tensor hidden_states, Tensor w1, Tensor w2, Tensor topk_weights, \
+      Tensor topk_ids, bool inplace, bool is_vnni, \
+      bool is_distributed, bool is_woq, int woq_weight_dtype, int woq_group_size, int woq_lowp_mode, \
+      Tensor? w1_scale, Tensor? w1_zp, Tensor? w1_compensation, Tensor? w2_scale, Tensor? w2_zp, Tensor? w2_compensation) -> Tensor");
+  m.impl(
+      "fused_experts_with_shared",
+      c10::DispatchKey::CPU,
+      torch_ipex::cpu::fused_experts_with_shared);
   m.def(
       "grouped_topk(Tensor hidden_states, Tensor gating_output, \
       int topk, bool renormalize, int num_expert_group, int topk_group, Tensor e_score_correction_bias, Tensor routed_scaling_factor) -> (Tensor, Tensor)");
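
For reference, the padding the new op performs internally is equivalent to the following Python sketch (the standalone helper and its name are illustrative, not part of this commit): each token's routed top-k list gains one extra slot whose expert id is num_experts - 1 (the shared expert) and whose weight is fixed at 1.0.

import torch

def pad_shared_expert(topk_ids, topk_weights, num_experts):
    # Illustrative equivalent of the per-token copy_and_fill calls above:
    # append one column holding the shared expert id and a weight of 1.0.
    num_tokens = topk_ids.size(0)
    shared_ids = torch.full((num_tokens, 1), num_experts - 1, dtype=topk_ids.dtype)
    shared_weights = torch.ones((num_tokens, 1), dtype=topk_weights.dtype)
    pad_ids = torch.cat((topk_ids, shared_ids), dim=-1)
    pad_weight = torch.cat((topk_weights, shared_weights), dim=-1)
    return pad_ids, pad_weight

Performing this once inside the C++ op, writing directly into preallocated [num_tokens, num_topk_experts + 1] buffers, removes the per-forward Python tensor allocations, casts, and concatenations deleted from decoder.py below.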

intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py

Lines changed: 1 addition & 9 deletions
@@ -1956,15 +1956,7 @@ def JambaMambaDecoderLayer_forward(
 def moe_infer(self, x, topk_ids, topk_weight):
     if self.use_fused_moe or self.use_fused_moe_woq:
         if self.unify_experts:
-            pad_weights = torch.ones(x.size(0), 1)
-            pad_ids = torch.full((x.size(0), 1), self.unify_shared_expert_id - 1).to(
-                torch.int
-            )
-            topk_weight = torch.cat((topk_weight.to(torch.float), pad_weights), -1).to(
-                torch.float
-            )
-            topk_ids = torch.cat((topk_ids.to(torch.int), pad_ids), -1).to(torch.int)
-            final_out = torch.ops.torch_ipex.fused_experts(
+            final_out = torch.ops.torch_ipex.fused_experts_with_shared(
                 x,
                 self.w13_weight,
                 self.w2_weight,
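
A note on the simplified call site: the kernel reads the router outputs through data_ptr<int32_t>() and data_ptr<float>() and indexes them as dense [num_tokens, topk] buffers, so the caller still has to supply int32 ids and float32 weights in contiguous layout. A minimal sanity check one could place before the call (the helper name is hypothetical, not part of the commit):

import torch

def check_router_outputs(topk_ids, topk_weights):
    # Layout the C++ kernel above assumes: 2-D, same shape, contiguous,
    # int32 expert ids and float32 routing weights.
    assert topk_ids.dtype == torch.int32
    assert topk_weights.dtype == torch.float32
    assert topk_ids.dim() == 2 and topk_ids.shape == topk_weights.shape
    assert topk_ids.is_contiguous() and topk_weights.is_contiguous()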

tests/cpu/test_deepseek_ops.py

Lines changed: 49 additions & 1 deletion
@@ -815,6 +815,51 @@ def fuse_moe_with_sharedmoe(a, w1, w2, score, topk, renormalize):
         w2_comp,
     )
 
+def fuse_moe_with_sharedmoe_v2(a, w1, w2, score, topk, renormalize):
+
+    G = 1
+    topk_group = 1
+
+    B, D = a.shape
+    topk_weights = torch.empty(B, topk, dtype=torch.float32)
+    topk_ids = torch.empty(B, topk, dtype=torch.int32)
+    topk_weights, topk_ids = grouped_topk_native(
+        a, score, topk, renormalize, G, topk_group
+    )
+
+    packed_w1 = torch.ops.torch_ipex.convert_weight_packed_bf16(w1)
+    packed_w2 = torch.ops.torch_ipex.convert_weight_packed_bf16(w2)
+    w13_scale = None
+    w13_zp = None
+    w13_comp = None
+    w2_scale = None
+    w2_zp = None
+    w2_comp = None
+    inplace = False
+    group_size = -1
+    weight_dtype = WoqWeightDtype.INT8
+    lowp_mode = WoqLowpMode.BF16
+    return torch.ops.torch_ipex.fused_experts_with_shared(
+        a,
+        packed_w1,
+        packed_w2,
+        topk_weights,
+        topk_ids,
+        inplace,
+        True,
+        False,
+        False,
+        weight_dtype,
+        group_size,
+        lowp_mode,
+        w13_scale,
+        w13_zp,
+        w13_comp,
+        w2_scale,
+        w2_zp,
+        w2_comp,
+    )
+
 def run_single_test(m, n, k, e, topk, dtype, renormalize=False):
 
     a = torch.randn((m, k), device="cpu", dtype=dtype) / 10
@@ -841,8 +886,11 @@ def run_single_test(m, n, k, e, topk, dtype, renormalize=False):
     fused_output = fuse_moe_with_sharedmoe(
         a, w1_, w2_, score, topk, renormalize
     )
-
+    fused_output_v2 = fuse_moe_with_sharedmoe_v2(
+        a, w1_, w2_, score, topk, renormalize
+    )
     compare(torch_output, fused_output)
+    compare(torch_output, fused_output_v2)
 
 run_single_test(2, 2048, 2048, 4, 2, torch.bfloat16, renormalize=True)
 run_single_test(2, 128, 32, 4, 2, torch.bfloat16, renormalize=True)
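
Because the schema registered in DSMoE.cpp names every argument, the op can also be called with keywords, which makes the bare True/False flags in fuse_moe_with_sharedmoe_v2 easier to read. A sketch under the same non-WOQ assumptions as the test (the wrapper name and the placeholder WOQ integers are illustrative; the test passes the real WoqWeightDtype/WoqLowpMode enum values instead):

import torch
import intel_extension_for_pytorch  # noqa: F401  (registers the torch_ipex ops)

def call_shared_moe(a, packed_w1, packed_w2, topk_weights, topk_ids):
    # Same call as in fuse_moe_with_sharedmoe_v2, spelled with schema argument names.
    return torch.ops.torch_ipex.fused_experts_with_shared(
        hidden_states=a,
        w1=packed_w1,
        w2=packed_w2,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        inplace=False,
        is_vnni=True,        # weights packed with convert_weight_packed_bf16
        is_distributed=False,
        is_woq=False,
        woq_weight_dtype=1,  # placeholder; assumed unused when is_woq=False
        woq_group_size=-1,
        woq_lowp_mode=0,     # placeholder; assumed unused when is_woq=False
        w1_scale=None,
        w1_zp=None,
        w1_compensation=None,
        w2_scale=None,
        w2_zp=None,
        w2_compensation=None,
    )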
