SM100 Cutlass MLA decode with unrestricted num_heads (< 128) for DeepSeek TP (#20769)

Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-07-14 21:06:38 -04:00
parent 61e20828da
commit 8cdc371217
12 changed files with 3283 additions and 2 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -553,7 +553,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
    set(SRCS
-      "csrc/attention/mla/cutlass_mla_kernels.cu")
+      "csrc/attention/mla/cutlass_mla_kernels.cu"
+      "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${MLA_ARCHS}")