// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>

#include <stdio.h>
#include <cstdint>
| 24 | + |
// Compile-time constants describing the wint2 weight-only packing scheme.
struct WeightOnlyTraits {
  // Number of consecutive rows sharing one dequantization scale.
  static constexpr int32_t kGroupSize = 64;
  // Logical rows packed into one stored (zipped) row.
  static constexpr int32_t kPackNum = 4;
  // Mask extracting one 6-bit sub-value from a decoded code word.
  static constexpr int16_t kWeightMask = 0x3F;
  // Zero point subtracted from each unpacked 6-bit value.
  static constexpr int32_t kBBZip = 32;
};
| 31 | + |
// Unzips one (TileRows x TileColumns) tile of packed wint2 weights into
// dequantized values of type T, writing them to args.out_ptr (a row-major
// buffer with stride kTileColumns — shared memory in the caller).
//
// Each stored uint8 in w_ptr encodes kPackNum logical rows: the byte is first
// mapped back to an integer code word via the per-column affine code
// (code = floor(w * code_scale + code_zp + 0.5)), then 6-bit sub-values are
// extracted at bit offsets {9, 6, 3, 0} and dequantized as
// scale * (value - kBBZip).
template <typename T, int64_t TileRows, int64_t TileColumns>
struct Wint2UnzipFunctor {
  using ScaleComputeT = float;

  static constexpr int64_t kTileRows = TileRows;
  static constexpr int64_t kTileColumns = TileColumns;

  struct Arguments {
    const uint8_t* w_ptr;           // packed weights; kPackNum logical rows per stored row
    const T* w_scale_ptr;           // per-group scales (one row per kGroupSize logical rows)
    const float* w_code_scale_ptr;  // per-column code scale
    const float* w_code_zp_ptr;     // per-column code zero point
    const T* w_super_scale_ptr;     // optional per-column super scale; may be nullptr
    T* out_ptr;                     // destination tile, row-major, stride kTileColumns
    const int in_stride;            // column stride (elements) of w_ptr and w_scale_ptr
  };

  // Cooperative unzip: threads stride over columns, each walking every row of
  // its columns. Must be called by ALL threads of the block — it ends with a
  // __syncthreads(), after which out_ptr is fully populated for everyone.
  __device__ void operator()(const Arguments& args, const int tid, const int num_threads) {
    // Bit offsets of the kPackNum sub-values inside one decoded code word.
    const int16_t shift_bits[4] = {9, 6, 3, 0};

    for (int col = tid; col < kTileColumns; col += num_threads) {
      // Per-column code parameters are invariant over rows; load them once
      // instead of re-reading global memory on every row iteration.
      const ScaleComputeT w_code_scale =
          static_cast<ScaleComputeT>(args.w_code_scale_ptr[col]);
      const ScaleComputeT w_code_zp =
          static_cast<ScaleComputeT>(args.w_code_zp_ptr[col]);

      for (int row = 0; row < kTileRows; ++row) {
        const int w_row = row / WeightOnlyTraits::kPackNum;
        const int w_offset = w_row * args.in_stride + col;
        const ScaleComputeT w = static_cast<ScaleComputeT>(args.w_ptr[w_offset]);

        // Recover the zipped integer code. floorf/0.5f keep the whole
        // computation in single precision; the previous floor(... + 0.5)
        // silently promoted the expression to double on the device.
        const int16_t w_zipped_value =
            static_cast<int16_t>(floorf(w * w_code_scale + w_code_zp + 0.5f));
        const int16_t shift_bit = shift_bits[row % WeightOnlyTraits::kPackNum];
        const int16_t w_shifted_value =
            (w_zipped_value >> shift_bit) & WeightOnlyTraits::kWeightMask;

        const int w_scale_row = row / WeightOnlyTraits::kGroupSize;
        const int w_scale_offset = w_scale_row * args.in_stride + col;
        T w_scale = static_cast<T>(args.w_scale_ptr[w_scale_offset]);

        if (args.w_super_scale_ptr) {
          const T w_super_scale = static_cast<T>(args.w_super_scale_ptr[col]);
          w_scale = w_scale * w_super_scale;
        }

        // Dequantize: scale * (code - zero_point).
        args.out_ptr[row * kTileColumns + col] =
            w_scale * (static_cast<T>(w_shifted_value) -
                       static_cast<T>(WeightOnlyTraits::kBBZip));
      }
    }
    __syncthreads();
  }
};
| 79 | + |
// Unzips packed wint2 weights tile-by-tile into `output_tensor_ptr`.
//
// Grid layout: x -> column tiles, y -> row tiles, z -> batch index.
// Block layout: 1-D, any thread count (threads cooperate via strided loops).
// Shared memory: TileRows * TileColumns elements of T (static allocation).
//
// NOTE(review): the unzip stage still assumes full tiles — for edge tiles it
// computes (and may read) past the logical bounds of w_ptr; only the final
// global-memory write-back below is bounds-guarded. Confirm the packed
// buffers are padded to tile multiples, as the launcher's ceil-div grid implies.
template <typename T, int64_t TileRows, int64_t TileColumns>
__global__ void Wint2UnzipKernel(
    const uint8_t* w_ptr,
    const T* w_scale_ptr,
    const float* w_code_scale_ptr,
    const float* w_code_zp_ptr,
    const T* w_super_scale_ptr,
    T* output_tensor_ptr,
    const int64_t batch,  // kept for interface symmetry; gridDim.z encodes it
    const int64_t num_rows,
    const int64_t num_columns) {
  __shared__ T smem[TileRows * TileColumns];

  const int64_t block_start_column = blockIdx.x * TileColumns;
  const int64_t block_start_row = blockIdx.z * num_rows + blockIdx.y * TileRows;

  const int64_t block_start_w_row = block_start_row / WeightOnlyTraits::kPackNum;
  const int64_t block_w_offset = block_start_w_row * num_columns + block_start_column;
  const uint8_t* block_w_ptr = w_ptr + block_w_offset;

  const int64_t block_start_w_scale_row = block_start_row / WeightOnlyTraits::kGroupSize;
  const int64_t block_w_scale_offset =
      block_start_w_scale_row * num_columns + block_start_column;
  const T* block_w_scale_ptr = w_scale_ptr + block_w_scale_offset;

  const int64_t col_param_offset = blockIdx.z * num_columns + block_start_column;
  const float* block_w_code_scale_ptr = w_code_scale_ptr + col_param_offset;
  const float* block_w_code_zp_ptr = w_code_zp_ptr + col_param_offset;
  const T* block_w_super_scale_ptr =
      w_super_scale_ptr ? w_super_scale_ptr + col_param_offset : nullptr;

  // Unzip this tile into shared memory (block-cooperative; ends with a
  // __syncthreads(), so smem is fully populated below).
  typename Wint2UnzipFunctor<T, TileRows, TileColumns>::Arguments args{
      block_w_ptr, block_w_scale_ptr, block_w_code_scale_ptr,
      block_w_code_zp_ptr, block_w_super_scale_ptr, smem,
      static_cast<int>(num_columns)};

  Wint2UnzipFunctor<T, TileRows, TileColumns> winx_unzipper;
  winx_unzipper(args, threadIdx.x, blockDim.x);

  // Write back to global memory. The work is strided over the block's threads
  // (the previous version had every thread redundantly store the entire tile)
  // and bounds-guarded so edge tiles of non-multiple shapes do not write OOB.
  constexpr int64_t kTileElems = TileRows * TileColumns;
  for (int64_t idx = threadIdx.x; idx < kTileElems; idx += blockDim.x) {
    const int64_t row = idx / TileColumns;
    const int64_t col = idx % TileColumns;
    const int64_t logical_row = blockIdx.y * TileRows + row;   // row within this batch
    const int64_t global_col = block_start_column + col;
    if (logical_row < num_rows && global_col < num_columns) {
      const int64_t global_row = blockIdx.z * num_rows + logical_row;
      output_tensor_ptr[global_row * num_columns + global_col] = smem[idx];
    }
  }
}
| 125 | + |
// Host-side launcher for Wint2UnzipKernel.
//
// Launches one thread block of 128 threads per (64 x 128) output tile, with
// gridDim.z covering the batch dimension. `stream` defaults to the legacy
// default stream, keeping existing call sites source-compatible.
//
// Checks for launch-configuration errors via cudaGetLastError(); asynchronous
// execution errors surface at the caller's next synchronizing call.
template <typename T>
void Wint2UnzipKernelLauncher(
    const uint8_t* w_ptr,
    const T* w_scale_ptr,
    const float* w_code_scale_ptr,
    const float* w_code_zp_ptr,
    const T* w_super_scale_ptr,
    T* output_tensor_ptr,
    const int64_t batch,
    const int64_t num_rows,
    const int64_t num_columns,
    cudaStream_t stream = 0) {
  constexpr int kTileRows = 64;
  constexpr int kTileColumns = 128;
  constexpr int kNumThreads = 128;

  // Ceil-div in 64-bit first to avoid int overflow on very large shapes.
  const int64_t grid_x = (num_columns + kTileColumns - 1) / kTileColumns;
  const int64_t grid_y = (num_rows + kTileRows - 1) / kTileRows;

  dim3 block_dim(kNumThreads, 1, 1);
  dim3 grid_dim(static_cast<unsigned int>(grid_x),
                static_cast<unsigned int>(grid_y),
                static_cast<unsigned int>(batch));

  Wint2UnzipKernel<T, kTileRows, kTileColumns><<<grid_dim, block_dim, 0, stream>>>(
      w_ptr, w_scale_ptr, w_code_scale_ptr, w_code_zp_ptr, w_super_scale_ptr,
      output_tensor_ptr, batch, num_rows, num_columns);

  // Kernel launches do not return a status; poll for configuration errors.
  const cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "[Wint2UnzipKernelLauncher] kernel launch failed: %s\n",
            cudaGetErrorString(err));
  }
}