update deep_gemm #10724

Open
wants to merge 7 commits into base: dsv3_dev
4 changes: 3 additions & 1 deletion ops/csrc/fp8/deep_gemm/__init__.py
@@ -23,8 +23,10 @@
get_col_major_tma_aligned_tensor,
get_m_alignment_for_contiguous_layout,
get_num_sms,
k_grouped_wgrad_gemm_fp8_fp8_fp32_nt,
m_grouped_gemm_fp8_fp8_bf16_nt_contiguous,
m_grouped_gemm_fp8_fp8_bf16_nt_masked,
set_num_sms,
wgrad_gemm_fp8_fp8_fp32_nt,
)
from .utils import bench, calc_diff, get_cuda_home
from .utils import calc_diff
261 changes: 74 additions & 187 deletions ops/csrc/fp8/deep_gemm/include/deep_gemm/fp8_gemm.cuh

Large diffs are not rendered by default.

381 changes: 381 additions & 0 deletions ops/csrc/fp8/deep_gemm/include/deep_gemm/fp8_wgrad_gemm.cuh

Large diffs are not rendered by default.

32 changes: 30 additions & 2 deletions ops/csrc/fp8/deep_gemm/include/deep_gemm/mma_utils.cuh
@@ -12,13 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.

// The file has been adapted from DeepSeek DeepEP project
// The file has been adapted from DeepSeek DeepGEMM project
// Copyright (c) 2025 DeepSeek
// Licensed under the MIT License - https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE
// Licensed under the MIT License - https://github.com/deepseek-ai/DeepGEMM/blob/main/LICENSE

#pragma once

#ifndef __CUDACC_RTC__
#include <cuda.h>
#endif

#include <cute/arch/mma_sm90_gmma.hpp>
#include <cute/arch/mma_sm90_gmma_ext.hpp>
@@ -84,6 +86,12 @@ __device__ __forceinline__ float ld_shared(const float* __restrict__ ptr) {
return ret;
}

__device__ __forceinline__ float2 ld_shared(const float2* __restrict__ ptr) {
float2 ret;
asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : "l"(ptr));
return ret;
}

__device__ __forceinline__ void st_shared(const float* ptr, float val) {
asm volatile("st.shared.f32 [%0], %1;" :: "l"(ptr), "f"(val));
}
@@ -92,6 +100,10 @@ __device__ __forceinline__ void st_shared(const uint32_t* ptr, uint32_t val) {
asm volatile("st.shared.u32 [%0], %1;" :: "l"(ptr), "r"(val));
}

__device__ __forceinline__ void st_shared(const float2* ptr, float2 val) {
asm volatile("st.shared.v2.f32 [%0], {%1, %2};" :: "l"(ptr), "f"(val.x), "f"(val.y));
}

template <int N>
__device__ void warpgroup_wait() {
DG_STATIC_ASSERT(N >= 0 and N <= 7, "WGMMA wait: N must be in range [0, 7]");
@@ -186,6 +198,7 @@ struct FP8MMASelector {
if constexpr (N == 112) return MMA_64x112x32_F32E4M3E4M3_SS_TN();
if constexpr (N == 120) return MMA_64x120x32_F32E4M3E4M3_SS_TN();
if constexpr (N == 128) return MMA_64x128x32_F32E4M3E4M3_SS_TN();
if constexpr (N == 136) return MMA_64x136x32_F32E4M3E4M3_SS_TN();
if constexpr (N == 144) return MMA_64x144x32_F32E4M3E4M3_SS_TN();
if constexpr (N == 152) return MMA_64x152x32_F32E4M3E4M3_SS_TN();
if constexpr (N == 160) return MMA_64x160x32_F32E4M3E4M3_SS_TN();
@@ -199,4 +212,19 @@
using type = decltype(select_type());
};

enum class Layout {
RowMajor,
ColMajor
};

__device__ __host__ constexpr int get_num_math_warpgroups(int block_m) {
return block_m == 64 ? 1 : 2;
}

template <uint32_t kNumTMAThreads, uint32_t kNumMathThreadsPerGroup>
__device__ __host__ constexpr int get_num_threads_per_sm(int block_m) {
DG_STATIC_ASSERT(kNumMathThreadsPerGroup == 128, "Only support 128 threads per math group");
return get_num_math_warpgroups(block_m) * kNumMathThreadsPerGroup + kNumTMAThreads;
}

} // namespace deep_gemm
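
Note on the two new launch-configuration helpers above: each math warpgroup is 128 threads, and a fixed pool of TMA producer threads sits on top of one or two such warpgroups depending on BLOCK_M. The standalone sketch below spells out that arithmetic; the value of 128 TMA threads is an assumption chosen for illustration (this hunk does not fix it), and DG_STATIC_ASSERT is swapped for a plain static_assert so the snippet compiles on its own.

    // Standalone sketch of the thread-count arithmetic behind
    // get_num_math_warpgroups / get_num_threads_per_sm (hypothetical constants).
    #include <cstdio>

    constexpr int get_num_math_warpgroups(int block_m) {
        // BLOCK_M == 64 keeps a single math warpgroup; larger tiles use two.
        return block_m == 64 ? 1 : 2;
    }

    template <unsigned kNumTMAThreads, unsigned kNumMathThreadsPerGroup>
    constexpr int get_num_threads_per_sm(int block_m) {
        static_assert(kNumMathThreadsPerGroup == 128, "Only support 128 threads per math group");
        return get_num_math_warpgroups(block_m) * kNumMathThreadsPerGroup + kNumTMAThreads;
    }

    int main() {
        std::printf("%d %d\n",
                    get_num_threads_per_sm<128, 128>(64),    // 1 * 128 + 128 = 256 threads/SM
                    get_num_threads_per_sm<128, 128>(128));  // 2 * 128 + 128 = 384 threads/SM
        return 0;
    }
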
104 changes: 104 additions & 0 deletions ops/csrc/fp8/deep_gemm/include/deep_gemm/nvrtc_std.cuh
@@ -0,0 +1,104 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// The file has been adapted from DeepSeek DeepGEMM project
// Copyright (c) 2025 DeepSeek
// Licensed under the MIT License - https://github.com/deepseek-ai/DeepGEMM/blob/main/LICENSE

#pragma once

#ifdef __CUDACC_RTC__

using int8_t = signed char;
using uint8_t = unsigned char;
using int16_t = signed short;
using uint16_t = unsigned short;
using int32_t = signed int;
using uint32_t = unsigned int;
using int64_t = signed long long;
using uint64_t = unsigned long long;
using cuuint64_t = unsigned long long;

#ifndef CU_TENSOR_MAP_NUM_QWORDS
#define CU_TENSOR_MAP_NUM_QWORDS 16

struct CUtensorMap_st {
#if defined(__cplusplus) && (__cplusplus >= 201103L)
alignas(64)
#elif __STDC_VERSION__ >= 201112L
_Alignas(64)
#endif
cuuint64_t opaque[CU_TENSOR_MAP_NUM_QWORDS];
};

using CUtensorMap = CUtensorMap_st;
#endif

namespace std {

template <class T, T v> struct integral_constant {
static constexpr T value = v;

using value_type = T;
using type = integral_constant;

__device__ constexpr operator value_type() const noexcept { return value; }

__device__ constexpr value_type operator()() const noexcept { return value; }
};

using false_type = integral_constant<bool, false>;
using true_type = integral_constant<bool, true>;

template <class T, class U> struct is_same : false_type {};

template <class T> struct is_same<T, T> : true_type {};

template <class T, class U>
inline constexpr bool is_same_v = is_same<T, U>::value;

namespace index_sequence_impl {

// Based on https://stackoverflow.com/a/32223343/11717224
template <size_t... Ints> struct index_sequence {
using type = index_sequence;
using value_type = size_t;
static constexpr size_t size() noexcept { return sizeof...(Ints); }
};

template <class Sequence1, class Sequence2> struct _merge_and_renumber;

template <size_t... I1, size_t... I2>
struct _merge_and_renumber<index_sequence<I1...>, index_sequence<I2...>>
: index_sequence<I1..., (sizeof...(I1) + I2)...> {};

template <size_t N>
struct make_index_sequence
: _merge_and_renumber<typename make_index_sequence<N / 2>::type,
typename make_index_sequence<N - N / 2>::type> {};

template <> struct make_index_sequence<0> : index_sequence<> {};
template <> struct make_index_sequence<1> : index_sequence<0> {};

} // namespace index_sequence_impl

template <size_t... Ns>
using index_sequence = index_sequence_impl::index_sequence<Ns...>;

template <size_t N>
using make_index_sequence = index_sequence_impl::make_index_sequence<N>;

} // namespace std

#endif
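
This new header re-declares, under __CUDACC_RTC__, the handful of std names the NVRTC-compiled kernels touch: fixed-width integer aliases, a CUtensorMap fallback, integral_constant/is_same, and a merge-and-renumber make_index_sequence. As a hedged illustration of what index_sequence-style machinery is typically used for, here is a host-side sketch of compile-time loop unrolling; unrolled_for and the accumulator loop are hypothetical, and they use the real <utility>, which this shim stands in for under NVRTC.

    // Hypothetical illustration of index_sequence-driven unrolling on the host.
    #include <cstddef>
    #include <cstdio>
    #include <utility>

    template <class F, std::size_t... Is>
    void unrolled_for(F&& f, std::index_sequence<Is...>) {
        (f(Is), ...);  // fold expression expands to f(0); f(1); ...; f(N - 1);
    }

    int main() {
        float acc[4] = {0.f, 0.f, 0.f, 0.f};
        // Equivalent to a fully unrolled loop over the four accumulators.
        unrolled_for([&](std::size_t i) { acc[i] += 2.0f * static_cast<float>(i); },
                     std::make_index_sequence<4>{});
        std::printf("%.1f %.1f %.1f %.1f\n", acc[0], acc[1], acc[2], acc[3]);
        return 0;
    }

For reference, the recursion in the shim halves N, builds both halves, and renumbers the second one, so make_index_sequence<4> expands to index_sequence<0, 1, 2, 3>, matching what the standard library produces.
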
92 changes: 69 additions & 23 deletions ops/csrc/fp8/deep_gemm/include/deep_gemm/scheduler.cuh
@@ -12,10 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

// The file has been adapted from DeepSeek DeepEP project
// The file has been adapted from DeepSeek DeepGEMM project
// Copyright (c) 2025 DeepSeek
// Licensed under the MIT License - https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE
// Licensed under the MIT License - https://github.com/deepseek-ai/DeepGEMM/blob/main/LICENSE

#pragma once
#include "utils.cuh"

namespace deep_gemm {
@@ -41,13 +42,16 @@ struct Scheduler {
// For normal GEMM
// Maybe not used in the masked grouped GEMM
uint32_t num_blocks;
uint32_t num_blocks_in_group;
bool is_peer_cta_alive = true;

// For grouped GEMM
int* grouped_layout;

// Only used for masked layout
uint32_t curr_group_idx, curr_cumsum;

__device__ __forceinline__ explicit Scheduler(const uint32_t shape_m,
__device__ __forceinline__ explicit Scheduler(const uint32_t& shape_m,
int* grouped_layout = nullptr) {
num_aligned_m_blocks = ceil_div(shape_m, BLOCK_M);
if constexpr (kGemmType == GemmType::Normal) {
@@ -61,39 +65,77 @@
}
}

__device__ __forceinline__ void get_swizzled_block_idx(const uint32_t num_m_blocks, int block_idx, uint32_t& m_block_idx, uint32_t& n_block_idx) {
// ReSharper disable once CppNotAllPathsReturnValue
__device__ __forceinline__ bool is_computation_valid(const uint32_t& m_block_idx, const uint32_t& m_offset) const {
if constexpr (kGemmType == GemmType::Normal) {
return true;
} else if constexpr (kGemmType == GemmType::GroupedContiguous) {
return __ldg(grouped_layout + m_offset + m_block_idx * BLOCK_M) >= 0;
} else if constexpr (kGemmType == GemmType::GroupedMasked) {
return m_offset + m_block_idx * BLOCK_M < __ldg(grouped_layout + curr_group_idx);
}
}

__device__ __forceinline__ bool is_tma_multicast_valid(const uint32_t& m_block_idx) const {
if (num_blocks_in_group == 1)
return false;
if constexpr (kGemmType == GemmType::Normal or kGemmType == GemmType::GroupedMasked) {
return true;
} else {
DG_STATIC_ASSERT(kGemmType == GemmType::GroupedContiguous, "Invalid Gemm type");
if constexpr (kIsTMAMulticastOnA) {
return true;
} else {
auto group_idx = __ldg(grouped_layout + m_block_idx * BLOCK_M);
auto peer_group_idx = __ldg(grouped_layout + (m_block_idx ^ 1) * BLOCK_M);
return group_idx == peer_group_idx;
}
}
}

__device__ __forceinline__ void get_swizzled_block_idx(const uint32_t& num_m_blocks, const uint32_t& block_idx,
uint32_t& m_block_idx, uint32_t& n_block_idx) {
DG_STATIC_ASSERT(kNum1DBlocksPerGroup % kNumTMAMulticast == 0, "Invalid group size");

// Swizzle for better L2 usages
// TODO: unify these 2 branches
auto primary_num_blocks = kIsTMAMulticastOnA ? kNumNBlocks : num_m_blocks;
auto secondary_num_blocks = kIsTMAMulticastOnA ? num_m_blocks : kNumNBlocks;
auto num_blocks_per_group = secondary_num_blocks * kNum1DBlocksPerGroup;
auto group_idx = block_idx / num_blocks_per_group;
auto first_block_idx = group_idx * kNum1DBlocksPerGroup;
auto in_group_idx = block_idx % num_blocks_per_group;
num_blocks_in_group = min(kNum1DBlocksPerGroup, primary_num_blocks - first_block_idx);

// Fix unaligned TMA multicast
if (kNumTMAMulticast > 1 and num_blocks_in_group % 2 != 0) {
if (in_group_idx < (num_blocks_in_group ^ 1) * secondary_num_blocks) {
num_blocks_in_group = num_blocks_in_group ^ 1;
} else {
in_group_idx = in_group_idx - (num_blocks_in_group ^ 1) * secondary_num_blocks;
first_block_idx += num_blocks_in_group ^ 1;
num_blocks_in_group = 1;
}
}

// Convert to final M/N block indices
if constexpr (kIsTMAMulticastOnA) {
auto num_blocks_per_group = num_m_blocks * kNum1DBlocksPerGroup;
auto group_idx = block_idx / num_blocks_per_group;
auto first_n_block_idx = group_idx * kNum1DBlocksPerGroup;
auto num_n_blocks_in_group = min(kNum1DBlocksPerGroup, kNumNBlocks - first_n_block_idx);
auto in_group_idx = block_idx % num_blocks_per_group;
m_block_idx = in_group_idx / num_n_blocks_in_group;
n_block_idx = first_n_block_idx + in_group_idx % num_n_blocks_in_group;
m_block_idx = in_group_idx / num_blocks_in_group;
n_block_idx = first_block_idx + in_group_idx % num_blocks_in_group;
} else {
auto num_blocks_per_group = kNumNBlocks * kNum1DBlocksPerGroup;
auto group_idx = block_idx / num_blocks_per_group;
auto first_m_block_idx = group_idx * kNum1DBlocksPerGroup;
auto num_m_blocks_in_group = min(kNum1DBlocksPerGroup, num_m_blocks - first_m_block_idx);
auto in_group_idx = block_idx % num_blocks_per_group;
m_block_idx = first_m_block_idx + in_group_idx % num_m_blocks_in_group;
n_block_idx = in_group_idx / num_m_blocks_in_group;
m_block_idx = first_block_idx + in_group_idx % num_blocks_in_group;
n_block_idx = in_group_idx / num_blocks_in_group;
}
}

template <bool kIgnoreGroupedForGroupedContiguous=true>
__device__ __forceinline__ uint32_t get_global_idx(const uint32_t shape_dim, const uint32_t block_size,
__device__ __forceinline__ uint32_t get_global_idx(const uint32_t& shape_dim, const uint32_t& block_size,
const uint32_t& block_idx, const uint32_t& m_block_idx=0) {
if constexpr (kGemmType == GemmType::Normal) {
return block_idx * block_size;
} else if (kGemmType == GemmType::GroupedContiguous) {
} else if constexpr (kGemmType == GemmType::GroupedContiguous) {
auto offset = kIgnoreGroupedForGroupedContiguous ? 0 : __ldg(grouped_layout + m_block_idx * BLOCK_M);
return offset * shape_dim + block_idx * block_size;
} else if (kGemmType == GemmType::GroupedMasked) {
} else if constexpr (kGemmType == GemmType::GroupedMasked) {
return curr_group_idx * shape_dim + block_idx * block_size;
}
}
@@ -108,7 +150,7 @@
if (curr_group_idx == kNumGroups)
return false;

// Within current group
// Within the current group
num_m_blocks = ceil_div(static_cast<uint32_t>(__ldg(grouped_layout + curr_group_idx)), BLOCK_M);
auto current_m_block_cumsum = curr_cumsum + num_m_blocks;
if (next_block_idx < current_m_block_cumsum * kNumNBlocks)
@@ -123,6 +165,10 @@
if (next_block_idx >= num_blocks)
return false;

// NOTES: we don't have to set `is_peer_cta_alive` for masked grouped GEMM, as it must be aligned
is_peer_cta_alive = kNumNBlocks % kNumTMAMulticast == 0 or // Always aligned on N (constant bypass)
num_aligned_m_blocks % kNumTMAMulticast == 0 or // Always aligned on M (constant bypass)
(next_block_idx ^ 1) < num_blocks; // Peer CTA in bound
get_swizzled_block_idx(num_aligned_m_blocks, next_block_idx, m_block_idx, n_block_idx);
}
return true;
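
The reworked get_swizzled_block_idx folds the two former branches into one path: the primary dimension (N when multicasting on A, M otherwise) is cut into groups of kNum1DBlocksPerGroup blocks, the other dimension is swept inside each group, and the new num_blocks_in_group / is_peer_cta_alive bookkeeping handles shapes that do not divide evenly under TMA multicast. Below is a host-side sketch of the aligned case; the constants and block counts are assumptions chosen for illustration, and the odd-group fix-up is left out.

    // Host-side sketch of the L2 swizzle in Scheduler::get_swizzled_block_idx,
    // aligned case only (the odd-group fix-up for TMA multicast is omitted).
    // kIsTMAMulticastOnA, kNum1DBlocksPerGroup and the block counts below are
    // assumptions for illustration, not values taken from this PR.
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    constexpr bool     kIsTMAMulticastOnA   = true;
    constexpr uint32_t kNum1DBlocksPerGroup = 16;

    void swizzle(uint32_t block_idx, uint32_t num_m_blocks, uint32_t num_n_blocks,
                 uint32_t& m_block_idx, uint32_t& n_block_idx) {
        // The "primary" dimension is chunked into groups of kNum1DBlocksPerGroup
        // blocks; the "secondary" dimension is swept inside each group.
        const uint32_t primary   = kIsTMAMulticastOnA ? num_n_blocks : num_m_blocks;
        const uint32_t secondary = kIsTMAMulticastOnA ? num_m_blocks : num_n_blocks;
        const uint32_t num_blocks_per_group = secondary * kNum1DBlocksPerGroup;
        const uint32_t group_idx            = block_idx / num_blocks_per_group;
        const uint32_t first_block_idx      = group_idx * kNum1DBlocksPerGroup;
        const uint32_t in_group_idx         = block_idx % num_blocks_per_group;
        const uint32_t num_blocks_in_group  =
            std::min(kNum1DBlocksPerGroup, primary - first_block_idx);

        if (kIsTMAMulticastOnA) {
            m_block_idx = in_group_idx / num_blocks_in_group;
            n_block_idx = first_block_idx + in_group_idx % num_blocks_in_group;
        } else {
            m_block_idx = first_block_idx + in_group_idx % num_blocks_in_group;
            n_block_idx = in_group_idx / num_blocks_in_group;
        }
    }

    int main() {
        // With 8 M-blocks and 32 N-blocks, the first 8 * 16 = 128 linear indices all
        // stay inside N-blocks 0..15, and every run of 16 consecutive CTAs shares one
        // M-block, so both operands keep a small, reusable footprint in L2.
        uint32_t m, n;
        for (uint32_t idx = 0; idx < 20; ++idx) {
            swizzle(idx, /*num_m_blocks=*/8, /*num_n_blocks=*/32, m, n);
            std::printf("block %2u -> (m=%u, n=%2u)\n",
                        static_cast<unsigned>(idx), static_cast<unsigned>(m),
                        static_cast<unsigned>(n));
        }
        return 0;
    }
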