Update Cutlass to v3.1 (#94188)

Now that we are on CUDA 11+ exclusively, we can update NVIDIA's CUTLASS to the next version (v3.1).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/94188
Approved by: https://github.com/ezyang, https://github.com/jansel, https://github.com/malfet
2025-10-20 21:14:14 +08:00 · 2023-04-25 22:02:42 +00:00
parent 15e1bee269
commit dfba65be8b
4 changed files with 5 additions and 5 deletions
--- a/aten/src/ATen/native/cuda/KernelUtils.cuh
+++ b/aten/src/ATen/native/cuda/KernelUtils.cuh
@@ -49,14 +49,14 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(

  if (low_byte && index < (numel - 1)) {
    __half2 value2;
-    value2.x = value;
+    value2.x = static_cast<__half>(value);
    value2.y = __int2half_rz(0);
    atomicAdd(reinterpret_cast<__half2*>(target_addr), value2);

  } else if (!low_byte && index > 0) {
    __half2 value2;
    value2.x = __int2half_rz(0);
-    value2.y = value;
+    value2.y = static_cast<__half>(value);
    atomicAdd(reinterpret_cast<__half2*>(target_addr - 1), value2);

  } else {
--- a/aten/src/ATen/test/cuda_half_test.cu
+++ b/aten/src/ATen/test/cuda_half_test.cu
@@ -21,7 +21,7 @@ __device__ void test(){

  __half a = __float2half(3.0f);
  __half b = __float2half(2.0f);
-  __half c = a - Half(b);
+  __half c = Half(a) - Half(b);
  assert(static_cast<Half>(c) == Half(1.0));

  // asserting if the  functions used on
--- a/third_party/cutlass
+++ b/third_party/cutlass
--- a/third_party/cutlass.BUILD
+++ b/third_party/cutlass.BUILD
@@ -5,7 +5,7 @@ load("@rules_cc//cc:defs.bzl", "cc_library")

 cc_library(
    name = "cutlass",
-    hdrs = glob(["include/**/*.h"]),
+    hdrs = glob(["include/**/*.h", "include/**/*.hpp"]),
    includes = ["include/"],
    visibility = ["//visibility:public"],
 )