Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
Add scaled_mm python API, test (#164142)
Summary:

* Add `torch.nn.functional.scaled_mm` as an abstraction around the C++ methods.
* Wraps the `torch._scaled_mm_v2` API by default, but the user can force use of the older `torch._scaled_mm` interface.
* Scaled MM tests now run on the new API.

Test Plan: `pytest test/test_scaled_matmul_cuda.py`

Reviewers:
Subscribers:
Tasks:
Tags:

Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164142
Approved by: https://github.com/drisspg
ghstack dependencies: #164141
Committed by PyTorch MergeBot
parent 512b6b59f0
commit 6a7f5c0d21
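The commit message above describes the new public entry point; a minimal usage sketch follows (not part of the diff — the device, shapes, and dtypes are illustrative assumptions; an fp8-capable GPU such as SM89+/SM90+ or ROCm MI300+ is required):

# Minimal sketch (not part of the diff): tensor-wise scaled fp8 matmul via the
# new torch.nn.functional.scaled_mm API. Assumes a CUDA device with fp8 support.
import torch
from torch.nn.functional import scaled_mm, ScalingType, SwizzleType

device = "cuda"
M, K, N = 64, 128, 32
mat_a = torch.randn(M, K, device=device).to(torch.float8_e4m3fn)
mat_b = torch.randn(N, K, device=device).to(torch.float8_e4m3fn).t()  # column-major second operand
scale_a = torch.tensor(1.0, device=device)  # one fp32 scale for the whole tensor
scale_b = torch.tensor(1.0, device=device)

out = scaled_mm(
    mat_a, mat_b,
    scale_a, ScalingType.TensorWise,
    scale_b, ScalingType.TensorWise,
    swizzle_a=SwizzleType.NO_SWIZZLE,
    swizzle_b=SwizzleType.NO_SWIZZLE,
    output_dtype=torch.bfloat16,
)
print(out.shape)  # torch.Size([64, 32])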
@@ -28,4 +28,19 @@ inline std::ostream& operator<<(std::ostream& stream, at::BlasBackend backend) {
   return stream << BlasBackendToString(backend);
 }
 
+namespace blas {
+
+enum class ScalingType : std::uint8_t {
+  TensorWise, // fp32 scales
+  RowWise, // fp32 scales
+  BlockWise1x16, // fp8_e4m3fn scales
+  BlockWise1x32, // fp8_e8m0fnu scales
+  BlockWise1x128, // fp32 scales
+  BlockWise128x128, // fp32 scales
+};
+
+enum class SwizzleType : std::uint8_t { NO_SWIZZLE = 0, SWIZZLE_32_4_4 = 1 };
+
+} // namespace blas
+
 } // namespace at
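The comments on each enumerator above note the scale dtype each recipe expects; an illustrative Python summary (not part of the diff; it uses the Python-visible enum that later hunks add to torch.nn.functional):

# Illustrative only: scale dtypes noted in the enum comments above, keyed by
# the Python-visible enum exported from torch.nn.functional later in this diff.
import torch
from torch.nn.functional import ScalingType

EXPECTED_SCALE_DTYPE = {
    ScalingType.TensorWise: torch.float32,
    ScalingType.RowWise: torch.float32,
    ScalingType.BlockWise1x16: torch.float8_e4m3fn,   # nvfp4-style block scales
    ScalingType.BlockWise1x32: torch.float8_e8m0fnu,  # mxfp8-style block scales
    ScalingType.BlockWise1x128: torch.float32,
    ScalingType.BlockWise128x128: torch.float32,
}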
@@ -1861,6 +1861,8 @@ template bool gemm_and_bias(
     int64_t result_ld,
     GEMMAndBiasActivationEpilogue activation);
 
+using at::blas::ScalingType;
+
 int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fast_accum) {
   switch (scaling_type) {
     case ScalingType::BlockWise1x32:

@@ -14,6 +14,7 @@
  */
 
 #include <ATen/cuda/CUDAContext.h>
+#include <ATen/BlasBackend.h>
 #include <ATen/OpMathType.h>
 
 namespace at::cuda::blas {

@@ -136,15 +137,6 @@ void int8_gemm(
     int32_t* result_ptr,
     int64_t result_ld);
 
-enum class ScalingType : std::uint8_t {
-  TensorWise, // fp32 scales
-  RowWise, // fp32 scales
-  BlockWise1x16, // fp8_e4m3fn scales
-  BlockWise1x32, // fp8_e8m0fnu scales
-  BlockWise1x128, // fp32 scales
-  BlockWise128x128, // fp32 scales
-};
-
 void scaled_gemm(
     char transa,
     char transb,

@@ -156,13 +148,13 @@ void scaled_gemm(
     int64_t mat1_ld,
     ScalarType mat1_dtype,
     ScalarType mat1_scale_dtype,
-    ScalingType mat1_scaling_type,
+    at::blas::ScalingType mat1_scaling_type,
     const void* mat2_ptr,
     const void* mat2_scale_ptr,
     int64_t mat2_ld,
     ScalarType mat2_dtype,
     ScalarType mat2_scale_dtype,
-    ScalingType mat2_scaling_type,
+    at::blas::ScalingType mat2_scaling_type,
     const void* bias_ptr,
     ScalarType bias_dtype,
     void* result_ptr,
@@ -29,7 +29,7 @@
 
 namespace at::cuda::tunable {
 
-using at::cuda::blas::ScalingType;
+using at::blas::ScalingType;
 
 enum class BlasOp {
   N = 0,

@@ -106,7 +106,8 @@ c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, b
   }
 }
 
-using at::cuda::blas::ScalingType;
+using at::blas::ScalingType;
+using at::blas::SwizzleType;
 
 /**
  * @brief Prepares matrices for CUBLAS operation

@@ -1093,7 +1094,7 @@ namespace{
  * - Returns Error.
  */
 
-using at::cuda::blas::ScalingType;
+using at::blas::ScalingType;
 
 bool is_tensorwise_scaling(const at::Tensor& t, const at::Tensor& scale) {
   return isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && scale.numel() == 1;

@@ -1546,7 +1547,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
         args.scale_result_ptr,
         args.result_ld,
         out_dtype_,
-        false /*use_fast_accum*/);
+        use_fast_accum);
   }
 
   return out;

@@ -1664,11 +1665,6 @@ _scaled_mm_cuda(const Tensor& mat_a, const Tensor& mat_b,
   return _scaled_mm_out_cuda(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out);
 }
 
-enum class SwizzleType {
-  NO_SWIZZLE = 0,
-  SWIZZLE_32_4_4 = 1
-};
-
 /**
  * Track concrete implementations available
  */
@@ -1890,9 +1886,9 @@ std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 8>
       ScaledGemmImplementation::BLOCK_128x128_1x128},
     { "block_1x128_1x128", std::bind(check_deepseek_recipe, ScalingType::BlockWise1x128, ScalingType::BlockWise1x128, _1, _2, _3, _4, _5, _6),
       ScaledGemmImplementation::BLOCK_1x128_1x128},
-    { "nvfp4", check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4},
-    { "nvfp4_single_scale", check_nvfp4_recipe_single_scale, ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE },
-    { "mxfp8", check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8}}};
+    { "nvfp4_nvfp4", check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4},
+    { "nvfp4_nvfp4_single_scale", check_nvfp4_recipe_single_scale, ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE },
+    { "mxfp8_mxfp8", check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8}}};
 
 Tensor&
 _cutlass_scaled_gemm(

@@ -218,3 +218,13 @@ DataParallel functions (multi-GPU, distributed)
     :nosignatures:
 
     torch.nn.parallel.data_parallel
+
+Low-Precision functions
+-----------------------
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+
+    ScalingType
+    SwizzleType
+    scaled_mm

@@ -35,7 +35,6 @@ and supported quantized modules and functions.
 
     quantization-support
 
-
 .. torch.ao is missing documentation. Since part of it is mentioned here, adding them here for now.
 .. They are here for tracking purposes until they are more permanently fixed.
 .. py:module:: torch.ao
@@ -11,15 +11,17 @@ from typing import Optional
 import torch
 
 
+from torch.nn.functional import scaled_mm, ScalingType, SwizzleType
 from torch.testing._internal.common_cuda import (
-    SM89OrLater,
-    SM90OrLater,
+    IS_SM90,
     _get_torch_cuda_version,
     PLATFORM_SUPPORTS_FP8,
     PLATFORM_SUPPORTS_FP8_GROUPED_GEMM,
     PLATFORM_SUPPORTS_MX_GEMM,
     PLATFORM_SUPPORTS_MXFP8_GROUPED_GEMM,
-    IS_SM90,
+    SM100OrLater,
+    SM89OrLater,
+    SM90OrLater,
     with_tf32_off,
 )
 from torch.testing._internal.common_device_type import (
@@ -107,6 +109,112 @@ def tensor_to_scale_block(
     scale = scale.flatten(2, 3).flatten(0, 1)
     return x, scale
 
+
+def round_up(x: int, y: int) -> int:
+    return ((x + y - 1) // y) * y
+
+
+def infer_scale_swizzle(mat, scale):
+    # Tensor-wise
+    if scale.numel() == 1:
+        return ScalingType.TensorWise, SwizzleType.NO_SWIZZLE
+
+    # Row-wise
+    if (scale.shape[0] == mat.shape[0] and scale.shape[1] == 1) or (
+        scale.shape[0] == 1 and scale.shape[1] == mat.shape[1]
+    ):
+        return ScalingType.RowWise, SwizzleType.NO_SWIZZLE
+
+    # deepgemm 1x128 / 128x1
+    if len(scale.shape) > 1:
+        if (
+            scale.shape[0] == mat.shape[0]
+            and scale.shape[1] == math.ceil(mat.shape[1] // 128)
+            or scale.shape[1] == mat.shape[1]
+            and scale.shape[0] == math.ceil(mat.shape[0] // 128)
+        ):
+            return ScalingType.BlockWise1x128, SwizzleType.NO_SWIZZLE
+
+    # deepgemm 128x128
+    if scale.shape[0] == math.ceil(mat.shape[0] // 128) and scale.shape[
+        1
+    ] == math.ceil(mat.shape[1] // 128):
+        return ScalingType.BlockWise128x128, SwizzleType.NO_SWIZZLE
+
+    # NVFP4
+    if (
+        scale.numel()
+        == round_up(mat.shape[0], 128) * round_up(math.ceil(2 * mat.shape[1] // 16), 4)
+        or scale.numel()
+        == round_up(mat.shape[1], 128) * round_up(math.ceil(2 * mat.shape[0] // 16), 4)
+        and mat.dtype == torch.float4_e2m1fn_x2
+        and scale.dtype == torch.float8_e4m3fn
+    ):
+        return ScalingType.BlockWise1x16, SwizzleType.SWIZZLE_32_4_4
+
+    # MX
+    if (
+        scale.numel()
+        == round_up(mat.shape[0], 128) * round_up(math.ceil(mat.shape[1] // 32), 4)
+        or scale.numel()
+        == round_up(mat.shape[1], 128) * round_up(math.ceil(mat.shape[0] // 32), 4)
+        and scale.dtype == torch.float8_e8m0fnu
+    ):
+        return ScalingType.BlockWise1x32, SwizzleType.SWIZZLE_32_4_4
+
+    return None, None
+
+
+wrap: bool = True
+
+def scaled_mm_wrap(
+    a,
+    b,
+    scale_a,
+    scale_b,
+    scale_recipe_a=None,
+    scale_recipe_b=None,
+    swizzle_a=SwizzleType.NO_SWIZZLE,
+    swizzle_b=SwizzleType.NO_SWIZZLE,
+    scale_result=None,
+    out_dtype=torch.bfloat16,
+    use_fast_accum=False,
+    bias=None,
+    wrap_v2=wrap,
+):
+    if not wrap_v2:
+        return torch._scaled_mm(
+            a,
+            b,
+            scale_a,
+            scale_b,
+            scale_result=scale_result,
+            out_dtype=out_dtype,
+            bias=bias,
+            use_fast_accum=use_fast_accum,
+        )
+    else:
+        # infer scalingtype and swizzle from scales
+        if scale_recipe_a is None:
+            scale_recipe_a, swizzle_a = infer_scale_swizzle(a, scale_a)
+        if scale_recipe_b is None:
+            scale_recipe_b, swizzle_b = infer_scale_swizzle(b, scale_b)
+
+        out = scaled_mm(
+            a,
+            b,
+            scale_a,
+            scale_recipe_a,
+            scale_b,
+            scale_recipe_b,
+            swizzle_a=swizzle_a,
+            swizzle_b=swizzle_b,
+            bias=bias,
+            output_dtype=out_dtype,
+            use_fast_accum=use_fast_accum,
+        )
+        return out
+
 def mm_float8_emulated(x, x_scale, y, y_scale, out_dtype) -> torch.Tensor:
     # naive implementation: dq -> op -> q
     x_fp32 = x.to(torch.float) / x_scale
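The infer_scale_swizzle helper added above derives a recipe purely from the scale tensor's shape and dtype relative to the operand; a rough illustrative check (not part of the diff; assumes infer_scale_swizzle is in scope as in test/test_scaled_matmul_cuda.py):

# Rough sketch of the shape-based inference above; infer_scale_swizzle is the
# test-local helper defined in test/test_scaled_matmul_cuda.py.
import torch
from torch.nn.functional import ScalingType, SwizzleType

M, K = 256, 512
mat = torch.randn(M, K, device="cuda").to(torch.float8_e4m3fn)

scalar_scale = torch.tensor(1.0, device="cuda")             # numel == 1 -> TensorWise
row_scale = torch.ones(M, 1, device="cuda")                 # one scale per row -> RowWise
tile_scale = torch.ones(M // 128, K // 128, device="cuda")  # one scale per 128x128 tile

assert infer_scale_swizzle(mat, scalar_scale) == (ScalingType.TensorWise, SwizzleType.NO_SWIZZLE)
assert infer_scale_swizzle(mat, row_scale) == (ScalingType.RowWise, SwizzleType.NO_SWIZZLE)
assert infer_scale_swizzle(mat, tile_scale) == (ScalingType.BlockWise128x128, SwizzleType.NO_SWIZZLE)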
@@ -139,7 +247,7 @@ def addmm_float8_unwrapped(
     b_inverse_scale = b_scale.reciprocal()
     if output_dtype == torch.float32 and bias is not None:
         # Bias is not supported by _scaled_mm when output is fp32
-        output = torch._scaled_mm(
+        output = scaled_mm_wrap(
             a_data,
             b_data,
             scale_a=a_inverse_scale,

@@ -149,7 +257,7 @@ def addmm_float8_unwrapped(
         )
         output += bias
         return output
-    output = torch._scaled_mm(
+    output = scaled_mm_wrap(
         a_data,
         b_data,
         bias=bias,

@@ -290,7 +398,7 @@ class TestFP8Matmul(TestCase):
         out_fp32 = torch.mm(x_fp8.to(torch.float), y_fp8.to(torch.float))
         scale_a = torch.tensor(1.0, device=device)
         scale_b = torch.tensor(1.0, device=device)
-        out_fp8 = torch._scaled_mm(x_fp8, y_fp8, scale_a, scale_b, out_dtype=out_dtype)
+        out_fp8 = scaled_mm_wrap(x_fp8, y_fp8, scale_a, scale_b, out_dtype=out_dtype)
         if out_dtype is not None:
             self.assertEqual(out_dtype, out_fp8.dtype)
         self.assertEqual(out_fp32, out_fp8.to(torch.float))

@@ -301,7 +409,7 @@ class TestFP8Matmul(TestCase):
         self._test_tautological_mm(device, e4m3_type, e4m3_type, size=16)
         # According to https://docs.nvidia.com/cuda/cublas/#id99 8F_E5M2 MM is unsupported
         # supported on ROCm but fails on CUDA
-        ctx = self.assertRaises(RuntimeError) if torch.version.hip is None and device != "cpu" else contextlib.nullcontext()
+        ctx = self.assertRaises(ValueError) if torch.version.hip is None and device != "cpu" else contextlib.nullcontext()
         with ctx:
             self._test_tautological_mm(device, e5m2_type, e5m2_type)
 

@@ -326,9 +434,9 @@ class TestFP8Matmul(TestCase):
         scale_one = torch.tensor(1.0, device=device)
         scale_a = torch.tensor(1.5, device=device)
         scale_b = torch.tensor(0.66, device=device)
-        out_fp8 = torch._scaled_mm(x, y, scale_a=scale_one, scale_b=scale_one)
+        out_fp8 = scaled_mm_wrap(x, y, scale_a=scale_one, scale_b=scale_one)
         self.assertEqual(out_fp8.to(torch.float), torch.full(size, 4., device=device))
-        out_fp8_s = torch._scaled_mm(x, y, scale_a=scale_a, scale_b=scale_b)
+        out_fp8_s = scaled_mm_wrap(x, y, scale_a=scale_a, scale_b=scale_b)
         self.assertEqual(out_fp8, out_fp8_s)
 
     @unittest.skipIf(not PLATFORM_SUPPORTS_MXFP8_GROUPED_GEMM, mxfp8_grouped_mm_skip_msg)

@@ -506,6 +614,7 @@ class TestFP8Matmul(TestCase):
         x_scale = tensor_to_scale(x, input_dtype).float()
         y_scale = tensor_to_scale(y, input_dtype).float()
 
+
         x_fp8 = to_fp8_saturated(x * x_scale, input_dtype)
         y_fp8 = to_fp8_saturated(y * y_scale, input_dtype)
 

@@ -600,11 +709,11 @@ class TestFP8Matmul(TestCase):
         (k, l, m) = (16, 48, 32)
         x = torch.ones((k, l), device=device).to(e4m3_type)
         y = torch.full((m, l), .25, device=device, dtype=e4m3_type).t()
-        bias = torch.full((m,), 4.0, device=device, dtype=torch.half)
+        bias = torch.full((m,), 4.0, device=device, dtype=torch.bfloat16)
         scale_a = torch.tensor(1.0, device=device)
         scale_b = torch.tensor(1.0, device=device)
-        out_fp8 = torch._scaled_mm(x, y, scale_a=scale_a, scale_b=scale_b)
-        outb_fp8 = torch._scaled_mm(x, y, scale_a=scale_a, scale_b=scale_b, bias=bias)
+        out_fp8 = scaled_mm_wrap(x, y, scale_a=scale_a, scale_b=scale_b)
+        outb_fp8 = scaled_mm_wrap(x, y, scale_a=scale_a, scale_b=scale_b, bias=bias)
         # this fails on ROCm currently because hipblaslt doesn't have amax op
         out_fp32 = out_fp8.to(torch.float32)
         outb_fp32 = outb_fp8.to(torch.float32)

@@ -621,8 +730,8 @@ class TestFP8Matmul(TestCase):
         scale_b = torch.tensor(1.0, device=device)
         input_bias = None
         if bias:
-            input_bias = torch.rand((16,), device=device).to(torch.half)
-        _ = torch._scaled_mm(x, y, scale_a, scale_b, bias=input_bias)
+            input_bias = torch.rand((16,), device=device).to(torch.bfloat16)
+        _ = scaled_mm_wrap(x, y, scale_a, scale_b, bias=input_bias)
 
     @onlyCUDA
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg)

@@ -630,10 +739,10 @@ class TestFP8Matmul(TestCase):
         (k, l, m) = (16, 48, 32)
         x = torch.full((k, l), 0.0, device=device).to(e4m3_type)
         y = torch.full((m, l), 1.0, device=device, dtype=e4m3_type).t()
-        bias = torch.full((m,), -3.0, device=device, dtype=torch.half)
+        bias = torch.full((m,), -3.0, device=device, dtype=torch.bfloat16)
         scale_a = torch.tensor(1.0, device=device)
         scale_b = torch.tensor(1.0, device=device)
-        outb_fp8 = torch._scaled_mm(x, y, scale_a, scale_b, bias=bias)
+        outb_fp8 = scaled_mm_wrap(x, y, scale_a, scale_b, bias=bias)
         outb_fp32 = outb_fp8.to(torch.float32)
         self.assertEqual(outb_fp32, torch.tensor(-3.0, device=device).expand_as(outb_fp32))
 

@@ -647,9 +756,9 @@ class TestFP8Matmul(TestCase):
         scale_b = torch.tensor(1.0, device=device)
         bias = torch.full((m,), 4.0, device=device, dtype=torch.bfloat16)
         self.assertRaisesRegex(
-            RuntimeError,
+            ValueError,
             "Bias is not supported when out_dtype is set to Float32",
-            lambda: torch._scaled_mm(x, y, scale_a, scale_b, bias=bias, out_dtype=torch.float32),
+            lambda: scaled_mm_wrap(x, y, scale_a, scale_b, bias=bias, out_dtype=torch.float32),
         )
 
     @onlyCUDA

@@ -663,10 +772,11 @@ class TestFP8Matmul(TestCase):
         self.assertRaisesRegex(
             RuntimeError,
             r"torch\.\_scaled\_mm is only supported on CUDA devices with compute capability \>\= 9\.0 or 8\.9, or ROCm MI300\+",
-            lambda: torch._scaled_mm(x, y, scale_a, scale_b, out_dtype=torch.float32),
+            lambda: scaled_mm_wrap(x, y, scale_a, scale_b, out_dtype=torch.float32),
         )
 
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg)
+    @unittest.skipIf(SM100OrLater, "fast_accum is SM90-only")
     def test_float8_scale_fast_accum(self, device) -> None:
         size = (16, 16)
         x = torch.full(size, .5, device=device, dtype=e4m3_type)

@@ -675,9 +785,9 @@ class TestFP8Matmul(TestCase):
         y = torch.full(size, .5, device=device, dtype=y_type).t()
         scale_a = torch.tensor(1.5, device=device)
         scale_b = torch.tensor(0.66, device=device)
-        out_fp8 = torch._scaled_mm(x, y, scale_a, scale_b, use_fast_accum=True)
+        out_fp8 = scaled_mm_wrap(x, y, scale_a, scale_b, out_dtype=torch.float8_e4m3fn, use_fast_accum=True)
         self.assertEqual(out_fp8.to(torch.float), torch.full(size, 4., device=device))
-        out_fp8_s = torch._scaled_mm(x, y, scale_a=scale_a, scale_b=scale_b, use_fast_accum=True)
+        out_fp8_s = scaled_mm_wrap(x, y, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.float8_e4m3fn, use_fast_accum=True)
         self.assertEqual(out_fp8, out_fp8_s)
 
     @onlyCUDA

@@ -696,7 +806,7 @@ class TestFP8Matmul(TestCase):
         x_fp8 = x.to(e4m3_type)
         y_fp8 = y.to(e4m3_type).t()
 
-        out_fp8 = torch._scaled_mm(
+        out_fp8 = scaled_mm_wrap(
             x_fp8,
             y_fp8,
             scale_a=x_scales,
@@ -720,50 +830,58 @@ class TestFP8Matmul(TestCase):
         y_fp8 = y.to(e4m3_type).t()
 
         with self.assertRaisesRegex(
-            RuntimeError, re.escape("Invalid scaling configuration")
+            ValueError, re.escape("scale_b must have 1 Float element")
         ):
-            torch._scaled_mm(
+            scaled_mm_wrap(
                 x_fp8,
                 y_fp8,
                 scale_a=torch.ones((1, 1), device="cuda"),
                 scale_b=torch.ones((1, 2), device="cuda"),
+                scale_recipe_a=ScalingType.TensorWise,
+                scale_recipe_b=ScalingType.TensorWise,
                 out_dtype=torch.bfloat16,
             )
 
         with self.assertRaisesRegex(
-            RuntimeError, re.escape("Invalid scaling configuration")
+            ValueError, re.escape(f"scale_b must have {N} Float elements, got {N + 1}"),
         ):
-            torch._scaled_mm(
+            scaled_mm_wrap(
                 x_fp8,
                 y_fp8,
                 scale_a=torch.ones((M, 1), device="cuda"),
                 scale_b=torch.ones((1, N + 1), device="cuda"),
+                scale_recipe_a=ScalingType.RowWise,
+                scale_recipe_b=ScalingType.RowWise,
                 out_dtype=torch.bfloat16,
             )
         with self.assertRaisesRegex(
-            RuntimeError, re.escape("Invalid scaling configuration")
+            IndexError, re.escape("Dimension out of range")
         ):
-            torch._scaled_mm(
+            scaled_mm_wrap(
                 x_fp8,
                 y_fp8,
                 scale_a=torch.ones((M), device="cuda"),
                 scale_b=torch.ones((N, 1), device="cuda"),
+                scale_recipe_a=ScalingType.RowWise,
+                scale_recipe_b=ScalingType.RowWise,
                 out_dtype=torch.bfloat16,
             )
 
         with self.assertRaisesRegex(
-            RuntimeError, re.escape("Invalid scaling configuration")
+            ValueError, re.escape("expected scale_b.stride(1) to be 1, but got 2"),
         ):
-            torch._scaled_mm(
+            scaled_mm_wrap(
                 x_fp8,
                 y_fp8,
                 scale_a=torch.ones((M, 1), device="cuda"),
                 scale_b=torch.ones((1, N * 2), device="cuda")[:, ::2],
+                scale_recipe_a=ScalingType.RowWise,
+                scale_recipe_b=ScalingType.RowWise,
                 out_dtype=torch.bfloat16,
             )
 
         def e5m2():
-            out = torch._scaled_mm(
+            out = scaled_mm_wrap(
                 x_fp8,
                 y_fp8.to(e5m2_type),
                 scale_a=torch.ones((M, 1), device="cuda"),
@@ -838,6 +956,7 @@ class TestFP8Matmul(TestCase):
         else:
             test()
 
+    # Note: Removed parameterization over M,N,K from #163829 as it failed tests as-is
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8 or IS_WINDOWS, f8_msg)
     @unittest.skipIf(not IS_SM90, "cuBLAS blockwise scaling requires sm90+")
     @unittest.skipIf(

@@ -859,12 +978,20 @@ class TestFP8Matmul(TestCase):
         # 1x128 blocks need scales to be outer-dim-major
         if lhs_block == 1:
             x_scales = x_scales.t().contiguous().t()
+            lhs_recipe = ScalingType.BlockWise1x128
+        else:
+            lhs_recipe = ScalingType.BlockWise128x128
         if rhs_block == 1:
             y_scales = y_scales.t().contiguous().t()
+            rhs_recipe = ScalingType.BlockWise1x128
+        else:
+            rhs_recipe = ScalingType.BlockWise128x128
+
 
         # Calculate actual F8 mm
-        out_scaled_mm = mm_float8(
-            x_fp8, y_fp8.t(), a_scale=x_scales, b_scale=y_scales.t(), output_dtype=output_dtype
+        out_scaled_mm = scaled_mm_wrap(
+            x_fp8, y_fp8.t(), scale_a=x_scales.reciprocal(), scale_b=y_scales.reciprocal().t(), out_dtype=output_dtype,
+            scale_recipe_a=lhs_recipe, scale_recipe_b=rhs_recipe
         )
 
         # Calculate emulated F8 mm

@@ -912,9 +1039,9 @@ class TestFP8Matmul(TestCase):
         out_fp32 = torch.mm(x_fp8.to(torch.float), y_fp8.to(torch.float))
         scale_a = torch.tensor(float('-inf'), device=device)
         scale_b = torch.tensor(float('-inf'), device=device)
-        f = torch._scaled_mm
+        f = scaled_mm_wrap
         if use_torch_compile:
-            f = torch.compile(torch._scaled_mm)
+            f = torch.compile(scaled_mm_wrap)
         out_fp8 = f(x_fp8, y_fp8, scale_a, scale_b, out_dtype=out_dtype)
         self.assertEqual(out_dtype, out_fp8.dtype)
         self.assertEqual(out_fp32, out_fp8.to(torch.float))

@@ -938,16 +1065,16 @@ class TestFP8Matmul(TestCase):
         with tempfile.NamedTemporaryFile() as f:
             with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof:
                 self.assertIsNone(torch._C._get_sm_carveout_experimental())
-                torch._scaled_mm(x_fp8, y_fp8, scale_a=x_scales, scale_b=y_scales, out_dtype=torch.bfloat16)
+                scaled_mm_wrap(x_fp8, y_fp8, scale_a=x_scales, scale_b=y_scales, out_dtype=torch.bfloat16)
                 torch._C._set_sm_carveout_experimental(0)
                 self.assertEqual(torch._C._get_sm_carveout_experimental(), 0)
-                torch._scaled_mm(x_fp8, y_fp8, scale_a=x_scales, scale_b=y_scales, out_dtype=torch.bfloat16)
+                scaled_mm_wrap(x_fp8, y_fp8, scale_a=x_scales, scale_b=y_scales, out_dtype=torch.bfloat16)
                 torch._C._set_sm_carveout_experimental(66)
                 self.assertEqual(torch._C._get_sm_carveout_experimental(), 66)
-                torch._scaled_mm(x_fp8, y_fp8, scale_a=x_scales, scale_b=y_scales, out_dtype=torch.bfloat16)
+                scaled_mm_wrap(x_fp8, y_fp8, scale_a=x_scales, scale_b=y_scales, out_dtype=torch.bfloat16)
                 torch._C._set_sm_carveout_experimental(None)
                 self.assertIsNone(torch._C._get_sm_carveout_experimental())
-                torch._scaled_mm(x_fp8, y_fp8, scale_a=x_scales, scale_b=y_scales, out_dtype=torch.bfloat16)
+                scaled_mm_wrap(x_fp8, y_fp8, scale_a=x_scales, scale_b=y_scales, out_dtype=torch.bfloat16)
 
             prof.export_chrome_trace(f.name)
             if torch.version.hip:
@@ -1244,7 +1371,7 @@ class TestFP8Matmul(TestCase):
         A_scale = to_blocked(A_scale)
         B_scale = to_blocked(B_scale)
 
-        C = torch._scaled_mm(
+        C = scaled_mm_wrap(
             A,
             B.t(),
             A_scale,

@@ -1283,49 +1410,65 @@ class TestFP8Matmul(TestCase):
         expected_a_size = BLOCK_SIZE_MN * ceil_div(M, BLOCK_SIZE_MN) * padded_num_k_blocks
         expected_b_size = BLOCK_SIZE_MN * ceil_div(N, BLOCK_SIZE_MN) * padded_num_k_blocks
 
+        block = (
+            ScalingType.BlockWise1x16
+            if recipe == "nvfp4"
+            else ScalingType.BlockWise1x32
+        )
+        swizzle = SwizzleType.SWIZZLE_32_4_4
+
         # Test wrong scale tensor size for scale_a with correct dtype
         with self.assertRaisesRegex(
-            RuntimeError,
+            ValueError,
            f".*For Block[W,w]ise.*scaling.*scale_a should have {expected_a_size} "
            f"elements.*"
            ,
         ):
             incorrect_size_a = torch.ones(expected_a_size - 1, device=device, dtype=scale_dtype)
             correct_size_b = torch.ones(expected_b_size, device=device, dtype=scale_dtype)
-            torch._scaled_mm(
+            scaled_mm_wrap(
                 x_lowp,
                 y_lowp,
                 scale_a=incorrect_size_a,
+                scale_recipe_a=block,
                 scale_b=correct_size_b,
+                scale_recipe_b=block,
+                swizzle_a=swizzle,
+                swizzle_b=swizzle,
                 out_dtype=torch.bfloat16,
             )
 
         # Test wrong scale tensor size for scale_b with correct dtype
         with self.assertRaisesRegex(
-            RuntimeError,
+            ValueError,
            f"For Block[W,w]ise.*scaling.*scale_b should have {expected_b_size} "
            f"elements.*"
            ,
         ):
             correct_size_a = torch.ones(expected_a_size, device=device, dtype=scale_dtype)
             incorrect_size_b = torch.ones(expected_b_size + 1, device=device, dtype=scale_dtype)
-            torch._scaled_mm(
+            scaled_mm_wrap(
                 x_lowp,
                 y_lowp,
                 scale_a=correct_size_a,
+                scale_recipe_a=block,
                 scale_b=incorrect_size_b,
+                scale_recipe_b=block,
+                swizzle_a=swizzle,
+                swizzle_b=swizzle,
                 out_dtype=torch.bfloat16,
             )
 
         # Test non-contiguous scale tensors with correct dtype
         with self.assertRaisesRegex(
-            RuntimeError,
-            "For Block[W,w]ise.*scaling.*both should be contiguous"
+            ValueError,
+            "For Block[W,w]ise.*scaling.*both scales should be contiguous"
            ,
         ):
             non_contiguous_a = torch.ones(expected_a_size * 2, device=device, dtype=scale_dtype)[::2]
             contiguous_b = torch.ones(expected_b_size, device=device, dtype=scale_dtype)
-            torch._scaled_mm(
+            scaled_mm_wrap(
                 x_lowp,
                 y_lowp,
                 scale_a=non_contiguous_a,
@@ -1335,8 +1478,8 @@ class TestFP8Matmul(TestCase):
 
     def scaled_grouped_mm_helper(self, alist, blist, ascalelist, bscalelist, outlist, use_fast_accum):
         for a, b, ascale, bscale, out in zip(alist, blist, ascalelist, bscalelist, outlist):
-            out_ref = torch._scaled_mm(a, b.t(), ascale.view(-1, 1), bscale.view(1, -1),
+            out_ref = scaled_mm_wrap(a, b.t(), ascale.view(-1, 1), bscale.view(1, -1),
                                        out_dtype=torch.bfloat16, use_fast_accum=use_fast_accum)
             self.assertEqual(out, out_ref, atol=5e-2, rtol=5e-4)
 
     # Testing only _scaled_grouped_mm() with multiple shapes, as

@@ -1485,7 +1628,7 @@ class TestFP8Matmul(TestCase):
         B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
         C_ref = A_ref @ B_ref.t()
 
-        compiled_scaled_mm = torch.compile(torch._scaled_mm, backend="inductor")
+        compiled_scaled_mm = torch.compile(scaled_mm_wrap, backend="inductor")
         C = compiled_scaled_mm(
             A,
             B.t(),

@@ -1514,8 +1657,8 @@ class TestFP8Matmul(TestCase):
         B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=fp4_scaling_dtype)
         C_ref = A_ref @ B_ref.t()
 
-        compiled_scaled_mm = torch.compile(torch._scaled_mm, backend="inductor")
-        # C = torch._scaled_mm(
+        compiled_scaled_mm = torch.compile(scaled_mm_wrap, backend="inductor")
+        # C = scaled_mm_wrap(
         C = compiled_scaled_mm(
             A,
             B.t(),

@@ -113,6 +113,7 @@
 
 #ifdef USE_CUDA
 #include <ATen/ROCmFABackend.h>
+#include <ATen/cuda/CUDABlas.h>
 #include <ATen/cuda/CUDAConfig.h>
 #include <ATen/native/transformers/cuda/sdp_utils.h>
 #include <torch/csrc/inductor/static_cuda_launcher.h>
@@ -2504,6 +2505,39 @@ Call this whenever a new thread is created in order to propagate values from
         return at::globalContext().blasPreferredBackend();
       });
 
+  py::enum_<at::blas::ScalingType>(
+      py_module, "_ScalingType", "Supported Tensor scaling types")
+      .value(
+          "TensorWise",
+          at::blas::ScalingType::TensorWise,
+          "Single scale per-tensor")
+      .value(
+          "RowWise", at::blas::ScalingType::RowWise, "Scale per-row of tensor")
+      .value(
+          "BlockWise1x16",
+          at::blas::ScalingType::BlockWise1x16,
+          "Scale per 16 contiguous values")
+      .value(
+          "BlockWise1x32",
+          at::blas::ScalingType::BlockWise1x32,
+          "Scale per 32 contiguous values")
+      .value(
+          "BlockWise1x128",
+          at::blas::ScalingType::BlockWise1x128,
+          "Scale per 128 contiguous values")
+      .value(
+          "BlockWise128x128",
+          at::blas::ScalingType::BlockWise128x128,
+          "Scale per 128x128 tile");
+
+  py::enum_<at::blas::SwizzleType>(
+      py_module, "_SwizzleType", "Supported scale swizzle types")
+      .value("NO_SWIZZLE", at::blas::SwizzleType::NO_SWIZZLE, "No swizzling")
+      .value(
+          "SWIZZLE_32_4_4",
+          at::blas::SwizzleType::SWIZZLE_32_4_4,
+          "Blackwell-stype 32x4x4 swizzle");
+
   py::enum_<at::ROCmFABackend>(py_module, "_ROCmFABackend")
       .value("Default", at::ROCmFABackend::Default)
       .value("AOTriton", at::ROCmFABackend::AOTriton)
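The bindings above surface the C++ enums to Python as torch._C._ScalingType and torch._C._SwizzleType; the torch/nn/functional.py hunks below re-export them. An illustrative check (not part of the diff):

# Illustrative only: the bound enums and their integral values; the .value
# attribute is what torch.nn.functional.scaled_mm forwards to torch._scaled_mm_v2.
import torch

recipe = torch._C._ScalingType.BlockWise1x32
swizzle = torch._C._SwizzleType.SWIZZLE_32_4_4
print(int(recipe.value), int(swizzle.value))  # 3 1, given the declaration order above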
@@ -4,11 +4,16 @@ import importlib
 import math
 import warnings
 from collections.abc import Callable
-from typing import Optional, TYPE_CHECKING, Union
+from typing import Any as _Any, Optional, TYPE_CHECKING, Union
 
 import torch
 from torch import _VF, sym_int as _sym_int, Tensor
-from torch._C import _add_docstr, _infer_size
+from torch._C import (
+    _add_docstr,
+    _infer_size,
+    _ScalingType as ScalingType,
+    _SwizzleType as SwizzleType,
+)
 from torch._jit_internal import (
     _overload,
     boolean_dispatch,

@@ -27,6 +32,10 @@ from torch.overrides import (
 )
 
 
+# Set visibility of the bound enums to this module
+ScalingType.__module__ = "torch.nn.functional"
+SwizzleType.__module__ = "torch.nn.functional"
+
 if TYPE_CHECKING:
     from torch.types import _dtype as DType
 else:
@@ -6618,3 +6627,87 @@ def multi_head_attention_forward(
         # squeeze the output if input was unbatched
         attn_output = attn_output.squeeze(1)
         return attn_output, None
+
+
+def scaled_mm(
+    mat_a: Tensor,
+    mat_b: Tensor,
+    scale_a: Tensor | list[Tensor],
+    scale_recipe_a: ScalingType | list[ScalingType],
+    scale_b: Tensor | list[Tensor],
+    scale_recipe_b: ScalingType | list[ScalingType],
+    swizzle_a: SwizzleType | list[SwizzleType] | None = None,
+    swizzle_b: SwizzleType | list[SwizzleType] | None = None,
+    bias: Optional[Tensor] = None,
+    output_dtype: Optional[torch.dtype] = torch.bfloat16,
+    contraction_dim: list[int] | tuple[int] = (),
+    use_fast_accum: bool = False,
+) -> Tensor:
+    r"""
+    scaled_mm(mat_a, mat_b, scale_a, scale_recipe_a, scale_b, scale_recipe_b, swizzle_a, swizzle_b, bias, output_dtype,
+              contraction_dim, use_fast_accum)
+
+    Applies a scaled matrix-multiply, mm(mat_a, mat_b) where the scaling of mat_a and mat_b are described by
+    scale_recipe_a and scale_recipe_b respectively.
+
+    Args:
+        scale_a: Tensor containing decoding scaling factors for mat_a
+        scale_recipe_a: Enum describing how mat_a has been scaled
+        scale_b: Tensor containing decoding scaling factors for mat_b
+        scale_recipe_b: Enum describing how mat_b has been scaled
+        swizzle_a: Enum describing the swizzling pattern (if any) of scale_a
+        swizzle_b: Enum describing the swizzling pattern (if any) of scale_b
+        bias: optional bias term to be added to the output
+        output_dtype: dtype used for the output tensor
+        contraction_dim: describe which dimensions are :math:`K` in the matmul.
+        use_fast_accum: enable/disable tensor-core fast accumulation (Hopper-GPUs only)
+    """
+
+    def expand_single_value(v: _Any | list[_Any] | None) -> list[_Any]:
+        if v is None:
+            return []
+        elif not isinstance(v, (list)):
+            return [
+                v,
+            ]
+        else:
+            return v
+
+    scale_a = expand_single_value(scale_a)
+    scale_recipe_a = expand_single_value(scale_recipe_a)
+    scale_b = expand_single_value(scale_b)
+    scale_recipe_b = expand_single_value(scale_recipe_b)
+    swizzle_a = expand_single_value(swizzle_a)
+    swizzle_b = expand_single_value(swizzle_b)
+
+    # native_functions has restrictions on what can be defined
+    # & passed through - std::optional<ArrayRef<Tensor>> for instance
+    # *cannot* be passed, but an empty vector (list) can.
+    # So, we need to convert None arguments for lists in python
+    # explicitly into empty lists.
+    def list_or_empty(l: list[_Any] | None) -> list[_Any]:
+        return [] if not l else l
+
+    def enum_list_as_int_list(l: _Any | list[_Any]) -> list[_Any]:
+        if not isinstance(l, list):
+            l = [
+                l,
+            ]
+        return [li.value for li in l]
+
+    out = torch._scaled_mm_v2(
+        mat_a,
+        mat_b,
+        scale_a,
+        enum_list_as_int_list(scale_recipe_a),
+        enum_list_as_int_list(list_or_empty(swizzle_a)),
+        scale_b,
+        enum_list_as_int_list(scale_recipe_b),
+        enum_list_as_int_list(list_or_empty(swizzle_b)),
+        bias,
+        output_dtype,
+        contraction_dim,
+        use_fast_accum,
+    )
+
+    return out
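To complement the docstring above, an illustrative row-wise call through the new wrapper (not part of the diff; shapes, device, and dtypes are assumptions, and an fp8-capable GPU is required). scale_a carries one fp32 scale per row of mat_a and scale_b one per column of mat_b, matching ScalingType.RowWise:

# Illustrative only: row-wise scaled fp8 matmul through the new public API; the
# wrapper normalizes the single enum arguments to lists and calls torch._scaled_mm_v2.
import torch
from torch.nn.functional import scaled_mm, ScalingType

M, K, N = 128, 256, 64
mat_a = torch.randn(M, K, device="cuda").to(torch.float8_e4m3fn)
mat_b = torch.randn(N, K, device="cuda").to(torch.float8_e4m3fn).t()
scale_a = torch.ones(M, 1, device="cuda")  # one scale per row of mat_a
scale_b = torch.ones(1, N, device="cuda")  # one scale per column of mat_b

out = scaled_mm(
    mat_a, mat_b,
    scale_a, ScalingType.RowWise,
    scale_b, ScalingType.RowWise,
    output_dtype=torch.bfloat16,
)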
@@ -249,6 +249,7 @@ def get_ignored_functions() -> set[Callable]:
         torch.nn.functional.has_torch_function_unary,
         torch.nn.functional.has_torch_function_variadic,
         torch.nn.functional.handle_torch_function,
+        torch.nn.functional.scaled_mm,
         torch.nn.functional.sigmoid,
         torch.nn.functional.hardsigmoid,
         torch.nn.functional.tanh,