Compare commits

..

72 Commits

Author SHA1 Message Date
26f67ef050 Add an option to store large mmap weights on disk (#164526)
Summary:
As title

On Windows, we cannot modify the .dll to append weights at the end; the Windows .dll loader will complain that it is not a valid .dll file. So we store the weight blob as a separate file.

1. We add the following APIs, which allow querying the size of the weight blob and passing in a pointer to it:

```cpp
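// Get the size of the weight blob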
AOTI_API AOTIRuntimeError AOTInductorModelContainerGetConstantsBlobSize(
    AOTInductorModelContainerHandle container_handle,
    uint64_t* ret_size);

// Load weights from a single blob in weight_blob_ptr
AOTI_API AOTIRuntimeError AOTInductorModelUpdateConstantsFromBlob(
    AOTInductorModelContainerHandle container_handle,
    const uint8_t* weight_blob_ptr);
```

2. We also add a method in ModelContainerRunner to load the weights:

If the runner sees that there is a `.blob` file in the package, it will mmap the `.blob` file and use its contents to load the constants.

3. We also add the `USE_MMAP_EXTERNAL` macro. When this macro is defined, the model expects to load its weights from an external mmap'd weight blob.


Test Plan:
```
buck run mode/dev-nosan caffe2/test/inductor:test_aot_inductor -- -r test_large_mmaped_weights_on_disk
```

Also tested Windows cross-compilation with 6542566585/demo/main_voxtral.cpp

```
Loaded model.dll
audio_encoder loaded
C:\Users\shangdiy\source\repos\torchnative\demo\token_embedding\data\aotinductor\model\model.wrapper.so
Loaded model.dll
token_embedding loaded
C:\Users\shangdiy\source\repos\torchnative\demo\text_decoder\data\aotinductor\model\model.wrapper.so
Loaded model.dll
Loading weights from C:\Users\shangdiy\source\repos\torchnative\demo\text_decoder\data\aotinductor\model\model.wrapper_weights.blob
text_decoder loaded
Load latency (ms):
  audio_encoder: 1011.234
    archive extraction: 0.000
    .so loading: 1011.197
  token_embedding: 525.773
    archive extraction: 0.000
    .so loading: 525.704
  text_decoder: 3324.130
    archive extraction: 0.000
    .so loading: 3323.979
Run latency (ms):
  audio_encoder: 285.958
    audio_encoder output: dtype=bfloat16, shape=[1, 1125, 3072], numel=3456000
  token_embedding: 6.676
    token_embedding output: dtype=bfloat16, shape=[1, 1138, 3072], numel=3495936
  text_decoder: 576.519
    text_decoder output: dtype=bfloat16, shape=[1, 1138, 131072], numel=149159936
```


cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov coconutruben

Differential Revision: D84093310

Pulled By: yushangdi
2025-10-09 15:21:58 -07:00
6d27a8e509 [CD] Do not propagate download.pytorch.org IP into container (#165075)
Followup after https://github.com/pytorch/pytorch/pull/164969

Should fix binary build test failures
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165075
Approved by: https://github.com/seemethere, https://github.com/huydhn
ghstack dependencies: #164968, #164969
2025-10-09 21:59:31 +00:00
cd62a73dcb [cuDNN][SDPA] Handle noncontig nested tensors in cuDNN SDPA (#164958)
Previously we hardcoded the assumption in cuDNN SDPA that the inputs would be dense, which breaks when, e.g., the user chunks tensors, yielding noncontiguous inputs.
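A minimal illustration of that scenario (assumed shapes and a CUDA device; whether the cuDNN nested-tensor path is actually taken also depends on the `TORCH_CUDNN_SDPA_NESTED_TENSOR_ENABLED=1` flag mentioned below):

```python
import torch
import torch.nn.functional as F

# Chunking a packed QKV projection yields non-contiguous views, the kind of
# input the cuDNN SDPA path previously assumed was dense.
qkv = torch.randn(2, 8, 128, 3 * 64, device="cuda", dtype=torch.float16)
q, k, v = qkv.chunk(3, dim=-1)   # each chunk is a non-contiguous view
print(q.is_contiguous())         # False
out = F.scaled_dot_product_attention(q, k, v)
```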

A new test is added in `test/test_transformers.py` to check this when `TORCH_CUDNN_SDPA_NESTED_TENSOR_ENABLED=1` is set.

One issue I noticed is that the old gating of nested tensors in `sdp_utils.cpp` seems to be a no-op: all of the inputs are reported as "dense" by the time that function is called in the nested tensor tests in `test/test_nestedtensor.py -k sdpa`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164958
Approved by: https://github.com/Skylion007, https://github.com/drisspg
2025-10-09 21:58:54 +00:00
4d7f9f3aed Revert "[ATen] Fix CUDA reduction warp shuffle order (#164790)"
This reverts commit 8e1f409b8ccf64b2cf3933ece13587ad57e9d8a9.

Reverted https://github.com/pytorch/pytorch/pull/164790 on behalf of https://github.com/jeffdaily due to broke cuda and rocm ci ([comment](https://github.com/pytorch/pytorch/pull/164790#issuecomment-3387558806))
2025-10-09 21:36:10 +00:00
2b9ff99535 [flex attention] change "==" to "is" in inspect parameter comparison (#165003)
Patch for https://github.com/pytorch/pytorch/issues/164760.

This doesn't actually fix the underlying torch function issue though.

Explanation: `is` is traced differently compared to `__eq__`, so we end up avoiding the issue where we attempt to evaluate `torch.eq(tensor, inspect._empty)` in the first place.
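A small standalone sketch of the pattern (hypothetical function `f`; `inspect.Parameter.empty` is the same object as `inspect._empty`):

```python
import inspect

def f(x, y=None):
    return x

param = inspect.signature(f).parameters["x"]
# `is` checks identity and never calls __eq__, so tracing it cannot turn into
# an attempted torch.eq(tensor, inspect._empty) the way "==" can.
has_no_default = param.default is inspect.Parameter.empty
print(has_no_default)  # True
```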

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165003
Approved by: https://github.com/mlazos
2025-10-09 21:18:05 +00:00
98a081a24c Call internal log_compilation_event if it exists (#164855)
Summary: For internal conda on mast jobs, call the internal version of log_compilation_event if it exists.

Test Plan: Ran a simple test job that just calls the API: https://fburl.com/scuba/dynamo_compile/dqx8d10g
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164855
Approved by: https://github.com/c00w
2025-10-09 21:15:11 +00:00
6c0125dbc0 Mark functions const in CUDACachingAllocator (#165007)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165007
Approved by: https://github.com/eqy
2025-10-09 20:53:58 +00:00
0fd976b65c Enable mimalloc on non-Windows platforms and make default for AArch64 builds (#164741)
This change removes the Windows requirement for mimalloc builds, and makes mimalloc the default c10 system allocator for AArch64 builds. This significantly improves the performance of AArch64 builds of PyTorch as large allocations are better cached by mimalloc than glibc.

**Updated Results**

Torchbench FP32 eager Inference, 16 threads:
<img width="1510" height="733" alt="mimalloc-v2-fp32-diff" src="https://github.com/user-attachments/assets/7fe3ea0c-3b52-42e7-879b-612444479c90" />

Torchbench BF16 eager Inference, 16 threads:
<img width="1510" height="733" alt="mimalloc-v2-bf16-diff" src="https://github.com/user-attachments/assets/56469a72-9e06-4d57-ae2a-aeb139ca79a3" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164741
Approved by: https://github.com/fadara01, https://github.com/aditew01, https://github.com/malfet
2025-10-09 20:49:46 +00:00
9944cac6e6 Add suppressions to torch/_inductor (#165062)
Adds suppressions so pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283

Split this directory into two PRs to keep them from being too large.

Test plan:
dmypy restart && python3 scripts/lintrunner.py -a
pyrefly check

step 1: delete lines in the pyrefly.toml file from the project-excludes field
step 2: run pyrefly check
step 3: add suppressions, clean up unused suppressions
before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199

after:
INFO 0 errors (6,884 ignored)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165062
Approved by: https://github.com/oulgen, https://github.com/mlazos
2025-10-09 20:34:20 +00:00
e7fd296930 [CI] Add full debug build to trunk (#164974)
But do not run tests, just import torch, as a regression test for https://github.com/pytorch/pytorch/issues/164297

Test plan: Re-apply #164974 on top of this change and observe the failure in the workflows: https://github.com/pytorch/pytorch/actions/runs/18383302153/job/52375282838
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164974
Approved by: https://github.com/seemethere, https://github.com/clee2000, https://github.com/atalman
ghstack dependencies: #164968, #164969
2025-10-09 20:12:16 +00:00
fac85fcfb5 [inductor] custom_graph_pass.get_hash_for_files: don't hash paths (#165020)
Summary: We have an internal user where caching broke because the paths that are unzipped are probably different per host. We can't think of a use case where a path change matters when the file content has not changed, so we are removing this part.
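A minimal sketch of the content-only hashing idea (not the actual `torch._inductor.custom_graph_pass` implementation):

```python
import hashlib

def get_hash_for_files(paths):
    # Hash only the bytes the files contain, not where they live on disk, so
    # the same pass source unzipped to a different path on another host
    # produces the same cache key.
    h = hashlib.sha256()
    for path in sorted(paths):
        with open(path, "rb") as f:
            h.update(f.read())
    return h.hexdigest()
```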

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165020
Approved by: https://github.com/oulgen
2025-10-09 20:07:53 +00:00
228973df7f Fix channels-last dimension mapping in CUDA parallel_cat (#165023)
Fixes #164849
`dimension` was updated in-place, so for more than one batch of channels-last tensors the concat `dimension` for the second kernel launch was wrong

## Testing
- python -m compileall test/test_tensor_creation_ops.py

------
https://chatgpt.com/codex/tasks/task_e_68e708879b30832f89b10ae55faa68e8
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165023
Approved by: https://github.com/ezyang
2025-10-09 20:04:32 +00:00
ed2d514ad8 Revert "Fix truediv numerics between eager and compile (#164144)"
This reverts commit 724463d5a2fba369cd14e89215b84d1b01435df7.

Reverted https://github.com/pytorch/pytorch/pull/164144 on behalf of https://github.com/malfet due to Not sure if it's related, but looks it triggered fuzzer compiler test failure, see a2f29bcd63/1 ([comment](https://github.com/pytorch/pytorch/pull/164144#issuecomment-3387288464))
2025-10-09 19:53:38 +00:00
a2f29bcd63 [inductor] Remove Repeated Code in Subgraph (#164892)
Discovered some repeated code blocks in subgraph.py.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164892
Approved by: https://github.com/PaulZhang12
2025-10-09 19:16:02 +00:00
5390324984 [CodeClean] Replace std::runtime_error with TORCH_CHECK (#164129)
As the title stated.

**Changes**:
- torch/csrc/Module.cpp
- torch/csrc/utils.cpp
- torch/csrc/stable
- torch/lib/libshm
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164129
Approved by: https://github.com/albanD
2025-10-09 19:01:07 +00:00
ae25ec569c reorder wrappers in aot_stage2_inference to match forward compile in aot_stage2_autograd (#165016)
In aot_stage2_autograd:
Before calling fw_compiler, we run pre_compile for the following wrappers:
* FakifiedOutWrapper
* FunctionalizedRngRuntimeWrapper

After, we run post_compile for the following wrappers:
 * EffectTokensWrapper
 * AOTDispatchSubclassWrapper
 * FunctionalizedRngRuntimeWrapper
 * FakifiedOutWrapper

In aot_stage2_inference:
Before calling inference compiler, we run pre_compile for the following wrappers (same as above):
 * FakifiedOutWrapper
 * FunctionalizedRngRuntimeWrapper

After, we run post_compile for the following wrappers  (different than above):
 * FunctionalizedRngRuntimeWrapper
 * FakifiedOutWrapper
 * EffectTokensWrapper
 * AOTDispatchSubclassWrapper

This PR makes both do the post_compiles in the same order.

Differential Revision: D84213657

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165016
Approved by: https://github.com/zhxchen17, https://github.com/bdhirsh
2025-10-09 18:36:04 +00:00
8e1f409b8c [ATen] Fix CUDA reduction warp shuffle order (#164790)
Typical warp shuffle reduction has the following pattern:
<img width="1138" height="501" alt="image" src="https://github.com/user-attachments/assets/3bd176dc-0ad2-4df6-90c7-06e467337166" />

which is exhibited in Triton generated by torch.compile:
<img width="663" height="403" alt="image" src="https://github.com/user-attachments/assets/7f9f36cd-b9eb-44c1-879e-b469668a2ea8" />

Switch the warp shuffle order to make bitwise equivalence between the two easier.
PTX difference between old and new (we see a few extra instructions): https://www.diffchecker.com/h6ly3INC/
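A pure-Python illustration of a tree-style warp reduction, the general pattern whose lane ordering this PR changes to match Triton (illustrative only; not the ATen kernel or the Triton code):

```python
# Each step folds the upper half of the "warp" onto the lower half, halving
# the number of active lanes until lane 0 holds the result.
vals = list(range(32))          # one value per lane of a 32-lane warp
offset = len(vals) // 2
while offset > 0:
    for lane in range(offset):
        vals[lane] += vals[lane + offset]
    offset //= 2
print(vals[0])                  # 496 == sum(range(32))
```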

Comparing the performance on different reduction operations, we see minimal differences. New represents the changes in this PR, old represents the past warp shuffle order:
```
Tensor Shape              Operation            New all dims (ms)       New dim=0 (ms)      New dim=1 (ms)     Old all dims (ms)    Old dim=0 (ms)      Old dim=1 (ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1024, 1024)              mean                 0.015817             0.016259             0.013642             0.015990             0.016258             0.013631
(1024, 1024)              sum                  0.015917             0.015906             0.013359             0.015707             0.016266             0.013226
(1024, 1024)              min                  0.016021             0.024625             0.015631             0.015761             0.024485             0.015317
(1024, 1024)              max                  0.016349             0.024971             0.015972             0.015771             0.025001             0.015314
(1024, 1024)              argmin               0.018070             0.024448             0.015578             0.018135             0.025370             0.015322
(1024, 1024)              argmax               0.018427             0.024859             0.015932             0.018164             0.024452             0.015639
(1024, 1024)              var                  0.020078             0.026413             0.020295             0.020199             0.026381             0.020214
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(2048, 2048)              mean                 0.023826             0.023726             0.022273             0.023236             0.023776             0.022248
(2048, 2048)              sum                  0.023840             0.023355             0.021974             0.023294             0.023354             0.021884
(2048, 2048)              min                  0.024519             0.041263             0.024620             0.023292             0.041491             0.024358
(2048, 2048)              max                  0.024509             0.041670             0.024277             0.023334             0.041231             0.024395
(2048, 2048)              argmin               0.026125             0.041282             0.024567             0.026772             0.041773             0.024296
(2048, 2048)              argmax               0.026117             0.041487             0.024572             0.026412             0.041477             0.024273
(2048, 2048)              var                  0.026603             0.048581             0.031308             0.027587             0.048603             0.030860
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(4096, 4096)              mean                 0.053927             0.057070             0.054073             0.053028             0.057544             0.053935
(4096, 4096)              sum                  0.053604             0.057410             0.054451             0.053076             0.057033             0.054266
(4096, 4096)              min                  0.054293             0.109122             0.058363             0.053821             0.108689             0.058382
(4096, 4096)              max                  0.054258             0.108035             0.058703             0.053492             0.110552             0.058376
(4096, 4096)              argmin               0.056805             0.111167             0.058301             0.056836             0.112325             0.058292
(4096, 4096)              argmax               0.056488             0.110958             0.058636             0.056844             0.111000             0.057928
(4096, 4096)              var                  0.058936             0.141755             0.068693             0.059735             0.141284             0.068500
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(8192, 8192)              mean                 0.145552             0.148082             0.138647             0.145364             0.147818             0.138207
(8192, 8192)              sum                  0.145985             0.147900             0.138714             0.145755             0.148031             0.138616
(8192, 8192)              min                  0.146566             0.205359             0.192739             0.145611             0.205237             0.182335
(8192, 8192)              max                  0.146526             0.204844             0.193050             0.146073             0.205457             0.182697
(8192, 8192)              argmin               0.150190             0.206605             0.192543             0.150654             0.206847             0.182007
(8192, 8192)              argmax               0.150481             0.206368             0.192535             0.150845             0.206430             0.182022
(8192, 8192)              var                  0.150884             0.184546             0.203900             0.151594             0.184172             0.197983
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1, 1024, 128)            mean                 0.014293             0.008119             0.014533             0.013861             0.008022             0.014449
(1, 1024, 128)            sum                  0.014039             0.007877             0.014111             0.014219             0.008227             0.014045
(1, 1024, 128)            min                  0.014159             0.011354             0.023493             0.014271             0.010862             0.023644
(1, 1024, 128)            max                  0.014154             0.011027             0.023368             0.014259             0.011234             0.023692
(1, 1024, 128)            argmin               0.016403             0.005677             0.023328             0.016273             0.005683             0.024073
(1, 1024, 128)            argmax               0.016734             0.005675             0.023437             0.016580             0.005318             0.023331
(1, 1024, 128)            var                  0.018338             0.009549             0.025538             0.018528             0.009391             0.024777
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(5, 1024, 128)            mean                 0.014873             0.010131             0.015546             0.015123             0.010131             0.015481
(5, 1024, 128)            sum                  0.015334             0.009673             0.015824             0.014736             0.009671             0.015438
(5, 1024, 128)            min                  0.015047             0.013252             0.024573             0.014803             0.013163             0.024551
(5, 1024, 128)            max                  0.015050             0.013339             0.024197             0.014810             0.013525             0.024230
(5, 1024, 128)            argmin               0.017341             0.012737             0.024306             0.017471             0.012379             0.024991
(5, 1024, 128)            argmax               0.017345             0.012411             0.024421             0.017422             0.012471             0.024237
(5, 1024, 128)            var                  0.019973             0.011453             0.026188             0.020050             0.011438             0.026282
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(10, 1024, 128)           mean                 0.016976             0.011575             0.016831             0.016722             0.011927             0.017173
(10, 1024, 128)           sum                  0.017039             0.011841             0.017159             0.016385             0.011860             0.016753
(10, 1024, 128)           min                  0.017036             0.015331             0.026770             0.016944             0.015205             0.027166
(10, 1024, 128)           max                  0.017369             0.015348             0.027077             0.016531             0.015716             0.026819
(10, 1024, 128)           argmin               0.019203             0.014447             0.026813             0.018994             0.014497             0.027313
(10, 1024, 128)           argmax               0.019563             0.014795             0.027140             0.019460             0.014912             0.026733
(10, 1024, 128)           var                  0.020529             0.014316             0.030405             0.020719             0.013960             0.029964
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(100, 1024, 128)          mean                 0.045046             0.039168             0.046082             0.044839             0.039217             0.045782
(100, 1024, 128)          sum                  0.045094             0.039150             0.045777             0.044496             0.039542             0.046083
(100, 1024, 128)          min                  0.045768             0.054466             0.076244             0.044915             0.053943             0.076599
(100, 1024, 128)          max                  0.045748             0.054459             0.076188             0.044931             0.053949             0.076856
(100, 1024, 128)          argmin               0.048275             0.054046             0.076647             0.048694             0.054105             0.077004
(100, 1024, 128)          argmax               0.048267             0.054395             0.077401             0.048691             0.054131             0.076751
(100, 1024, 128)          var                  0.049710             0.043254             0.083077             0.050971             0.043251             0.082378
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1000, 1000, 100)         mean                 0.202312             0.196723             0.197765             0.201774             0.196641             0.197459
(1000, 1000, 100)         sum                  0.202651             0.196682             0.197736             0.202175             0.196313             0.197523
(1000, 1000, 100)         min                  0.203022             0.264762             0.269200             0.202729             0.264129             0.268694
(1000, 1000, 100)         max                  0.202864             0.264396             0.269388             0.202486             0.263896             0.268720
(1000, 1000, 100)         argmin               0.226727             0.263781             0.268651             0.226597             0.264676             0.268983
(1000, 1000, 100)         argmax               0.226412             0.264469             0.269090             0.226570             0.264595             0.269178
(1000, 1000, 100)         var                  0.243223             0.204079             0.216096             0.241942             0.204079             0.215925
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(10000, 100)              mean                 0.016193             0.020277             0.014316             0.016152             0.020324             0.013712
(10000, 100)              sum                  0.016289             0.020237             0.014034             0.016168             0.020265             0.013708
(10000, 100)              min                  0.016046             0.030872             0.019609             0.016208             0.030867             0.018627
(10000, 100)              max                  0.016369             0.030835             0.019257             0.016218             0.030861             0.018209
(10000, 100)              argmin               0.017957             0.031171             0.019517             0.018050             0.031556             0.018077
(10000, 100)              argmax               0.017961             0.031658             0.019521             0.018060             0.031564             0.018087
(10000, 100)              var                  0.020393             0.035652             0.019339             0.020144             0.035987             0.019171
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(100000, 10)              mean                 0.015718             0.016576             0.016555             0.015999             0.016246             0.014869
(100000, 10)              sum                  0.015833             0.016247             0.016572             0.016007             0.016627             0.014872
(100000, 10)              min                  0.015888             0.020510             0.023920             0.015671             0.020821             0.021417
(100000, 10)              max                  0.015889             0.020479             0.023918             0.016077             0.020386             0.021421
(100000, 10)              argmin               0.018233             0.020863             0.023647             0.017574             0.020864             0.021103
(100000, 10)              argmax               0.017896             0.020527             0.023296             0.017569             0.020447             0.021098
(100000, 10)              var                  0.020005             0.024198             0.024372             0.020075             0.024167             0.022415
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1023, 1023, 1023)        mean                 1.874816             1.963506             1.903909             1.873279             1.963859             1.903230
(1023, 1023, 1023)        sum                  1.875030             1.965716             1.902458             1.873566             1.960730             1.901642
(1023, 1023, 1023)        min                  1.878563             2.473455             2.179092             1.875174             2.482086             2.183027
(1023, 1023, 1023)        max                  1.879128             2.474803             2.178895             1.874831             2.482253             2.183884
(1023, 1023, 1023)        argmin               1.921800             2.476629             2.174831             1.923987             2.472641             2.170453
(1023, 1023, 1023)        argmax               1.922605             2.476688             2.177927             1.923366             2.472808             2.172979
(1023, 1023, 1023)        var                  1.972606             3.088695             2.758797             1.978679             3.095658             2.762243
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1023, 1023, 255)         mean                 0.489984             0.500954             0.492957             0.489891             0.500654             0.491971
(1023, 1023, 255)         sum                  0.490228             0.500764             0.492289             0.489624             0.501089             0.492824
(1023, 1023, 255)         min                  0.491457             0.563560             0.553334             0.490355             0.564709             0.554754
(1023, 1023, 255)         max                  0.491396             0.563628             0.553345             0.490017             0.565004             0.554947
(1023, 1023, 255)         argmin               0.503666             0.561512             0.551831             0.503845             0.560972             0.551017
(1023, 1023, 255)         argmax               0.503602             0.561185             0.551407             0.504328             0.561267             0.551448
(1023, 1023, 255)         var                  0.510844             0.709452             0.701630             0.512693             0.710365             0.701965
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1023, 1023, 377)         mean                 0.707439             0.727646             0.712019             0.706769             0.727101             0.711632
(1023, 1023, 377)         sum                  0.707780             0.727453             0.711554             0.706807             0.726656             0.711729
(1023, 1023, 377)         min                  0.709423             0.819809             0.794379             0.707847             0.822086             0.796664
(1023, 1023, 377)         max                  0.709297             0.819780             0.794308             0.707566             0.821913             0.796690
(1023, 1023, 377)         argmin               0.725028             0.817088             0.791695             0.726039             0.816445             0.790828
(1023, 1023, 377)         argmax               0.725301             0.817011             0.791420             0.726040             0.816917             0.791143
(1023, 1023, 377)         var                  0.740859             1.034165             1.006712             0.743413             1.035506             1.007638
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164790
Approved by: https://github.com/ngimel, https://github.com/eqy
2025-10-09 18:08:30 +00:00
ee6a1ecb0a [ROCm] Enable MI355 CI on PRs, and run full set of UTs on PRs (#160215)
Useful to have PR testing for PRs such as https://github.com/pytorch/pytorch/pull/151360

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160215
Approved by: https://github.com/malfet, https://github.com/atalman

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-09 18:03:12 +00:00
3c0577bd15 Remove shared_ptr from MHAGraphCache (#164895)
This commit makes several cleanup changes to MHA.cpp, the main
one of which is removal of shared_ptr from MHAGraphCache as the
cache does not actually intend to share ownership. The changes are:

1. Remove shared_ptr from MHAGraphCache
2. Remove template arguments from MHAGraphCache
3. Remove unnecessary optional<shared_ptr<...>> vars
4. Change some functions with auto return type to the actual type

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164895
Approved by: https://github.com/eqy
2025-10-09 17:44:28 +00:00
688efd9741 Revert "Enable mimalloc on non-Windows platforms and make default for AArch64 builds (#164741)"
This reverts commit 87eccf10e8484c9e59ef81ae7bdee68d3db4f605.

Reverted https://github.com/pytorch/pytorch/pull/164741 on behalf of https://github.com/malfet due to But it breaks MacOS builds, see https://github.com/pytorch/pytorch/actions/runs/18382886648/job/52373781138 ([comment](https://github.com/pytorch/pytorch/pull/164741#issuecomment-3386859778))
2025-10-09 17:30:25 +00:00
91040f4934 Revert "[Code Clean] Remove support of python3.9 (#163846)"
This reverts commit bc1690c7e859dee8c47a7f0bbd3c43cc27c6fd2a.

Reverted https://github.com/pytorch/pytorch/pull/163846 on behalf of https://github.com/izaitsevfb due to breaks distributed tests ([comment](https://github.com/pytorch/pytorch/pull/163846#issuecomment-3386855437))
2025-10-09 17:27:08 +00:00
87eccf10e8 Enable mimalloc on non-Windows platforms and make default for AArch64 builds (#164741)
This change removes the Windows requirement for mimalloc builds, and makes mimalloc the default c10 system allocator for AArch64 builds. This significantly improves the performance of AArch64 builds of PyTorch as large allocations are better cached by mimalloc than glibc.

**Updated Results**

Torchbench FP32 eager Inference, 16 threads:
<img width="1510" height="733" alt="mimalloc-v2-fp32-diff" src="https://github.com/user-attachments/assets/7fe3ea0c-3b52-42e7-879b-612444479c90" />

Torchbench BF16 eager Inference, 16 threads:
<img width="1510" height="733" alt="mimalloc-v2-bf16-diff" src="https://github.com/user-attachments/assets/56469a72-9e06-4d57-ae2a-aeb139ca79a3" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164741
Approved by: https://github.com/fadara01, https://github.com/aditew01, https://github.com/malfet
2025-10-09 16:45:31 +00:00
5d459dd609 avoid bit cast for bfloat16_t (#159946)
Using `bit_cast<bfloat16_t>` triggers a static_assert, so replace it with intrinsics.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159946
Approved by: https://github.com/aditew01, https://github.com/malfet
2025-10-09 16:42:49 +00:00
24d69c57cb Add view support for library custom Function (#164520)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164520
Approved by: https://github.com/soulitzer, https://github.com/ezyang
2025-10-09 16:17:48 +00:00
eaa02655ea [CI] Run cpp tests on windows in one run_tests call (#164861)
The Windows cpp tests take ~1 hour according to logs. Each has run_test called on it individually, so I tried batching them together so it's just one run_test call for all of them. I believe it now takes 30 min. I turned off TD since I don't think cpp tests are included in TD stuff.

As always with batch scripts, I'm not sure if the errorlevel/error-surfacing handling is correct.

This code was written with a lot of help from ChatGPT and Copilot.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164861
Approved by: https://github.com/huydhn
2025-10-09 16:07:28 +00:00
aea57b3aa3 AOTI MPS Shim Implementation (#163865)
## MPS Shim API

*   Updated MPS shimification API with handles and function declarations:
    *   `AOTIMetalShaderLibraryHandle` and `AOTIMetalKernelFunctionHandle` types
    *   Library management: `aoti_torch_mps_create_shader_library`, `aoti_torch_mps_delete_shader_library`, `aoti_torch_mps_get_kernel_function`
    *   Kernel execution: `aoti_torch_mps_run_command_block`, `aoti_torch_mps_start_encoding`, `aoti_torch_mps_dispatch` variants, etc

## MPS Shader Codegen

*   Modified to generate source constants instead of direct `DynamicMetalShaderLibrary` instantiation:
    *   **Before**: `at::native::mps::DynamicMetalShaderLibrary mps_lib_0(R"MTL(...)MTL");`
    *   **After**: `const char* mps_lib_0_source = R"MTL(...)MTL";`
*   Updated kernel call generation  to use shimified functions:
    *   Generates calls to shimified API instead of direct libtorch calls

## Before vs After Comparison

### Section 1: Shader Library
**Before (Direct Library Object)**
```cpp
at::native::mps::DynamicMetalShaderLibrary mps_lib_0(R"MTL(
    ...
)MTL");
```
**After (Source String)**
```cpp
const char* mps_lib_0_source = (R"MTL(
    ...
)MTL");
```

### Section 2: Getter Functions & RAII Management

**Before (Direct Library Access)**
```cpp
const std::shared_ptr<at::native::mps::MetalKernelFunction> get_mps_lib_0() {
    static const auto func = mps_lib_0.getKernelFunction("generated_kernel");
    return func;
}

AOTIMetalKernelFunctionHandle get_mps_lib_0_handle() {
    static const auto handle = AOTIMetalKernelFunctionHandle(get_mps_lib_0().get());
    return handle;
}
```

**After (Shim API + RAII Wrapper)**
```cpp
AOTIMetalKernelFunctionHandle get_mps_lib_0_handle() {
    static auto kernel_handle = []() {
        AOTIMetalShaderLibraryHandle lib_handle = nullptr;
        AOTIMetalKernelFunctionHandle kern_handle = nullptr;

        aoti_torch_mps_create_shader_library(mps_lib_0_source, &lib_handle);
        aoti_torch_mps_get_kernel_function(lib_handle, "generated_kernel", &kern_handle);

        // RAII wrapper with custom deleter
        auto lib_deleter = [](AOTIMetalShaderLibraryHandle h) {
            if (h) aoti_torch_mps_delete_shader_library(h);
        };

        using LibDeleter = decltype(lib_deleter);
        using LibPtr = std::unique_ptr<AOTIMetalShaderLibraryOpaque, LibDeleter>;

        // Return pair of kernel handle and library smart pointer for cleanup
        return std::make_pair(kern_handle, LibPtr(lib_handle, lib_deleter));
    }();
    return kernel_handle.first;
}
```

### Section 3: Runtime Execution

**Before (Direct Library Methods)**
```cpp
void AOTInductorModel::run_impl(...) {

    ...

    get_mps_lib_0()->runCommandBlock([&] {
        get_mps_lib_0()->startEncoding();
        aoti_torch_mps_set_arg_tensor(get_mps_lib_0_handle(), 0, buf0);
        aoti_torch_mps_set_arg_tensor(get_mps_lib_0_handle(), 1, arg0_1);
        aoti_torch_mps_set_arg_tensor(get_mps_lib_0_handle(), 2, arg1_1);
        get_mps_lib_0()->dispatch({static_cast<uint64_t>(10LL)});

    });

    ...

} // AOTInductorModel::run_impl
```

**After (Shim API with Lambda Pattern)**
```cpp
void AOTInductorModel::run_impl(...) {

    ...

    auto mps_lib_0_lambda_0 = [&](AOTIMetalKernelFunctionHandle handle) {
        aoti_torch_mps_start_encoding(handle);
        aoti_torch_mps_set_arg_tensor(handle, 0, buf0);
        aoti_torch_mps_set_arg_tensor(handle, 1, arg0_1);
        aoti_torch_mps_set_arg_tensor(handle, 2, arg1_1);
        aoti_torch_mps_dispatch_single(handle, static_cast<uint64_t>(10LL));
    };

    std::function<void(AOTIMetalKernelFunctionHandle)> mps_lib_0_func_wrapper_0 = mps_lib_0_lambda_0;
    aoti_torch_mps_run_command_block(get_mps_lib_0_handle(), aoti_torch_mps_shared_callback, &mps_lib_0_func_wrapper_0);

    ...

} // AOTInductorModel::run_impl
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163865
Approved by: https://github.com/angelayi, https://github.com/desertfire
2025-10-09 16:06:36 +00:00
3d1fa40ae1 Revert "[BC-Breaking] Remove long-deprecated casting functions from native_functions.yaml (#164641)"
This reverts commit 64108bdbed2f099d527060b4c9fdd5a11cad2afc.

Reverted https://github.com/pytorch/pytorch/pull/164641 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/164641#issuecomment-3386346474))
2025-10-09 15:42:51 +00:00
a7fa1a91e3 fix flex attention eager bwd: more rounding (#164317)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164317
Approved by: https://github.com/drisspg
ghstack dependencies: #163986
2025-10-09 15:40:49 +00:00
afeec56a5a Fix replacement reconstruct (#164937)
If we return a DTensor, the object is created via an fx graph call, so we never needed to reconstruct it. But if there is a side effect, we do need to reconstruct it.

Differential Revision: [D84159000](https://our.internmc.facebook.com/intern/diff/D84159000)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164937
Approved by: https://github.com/StrongerXi
2025-10-09 15:31:23 +00:00
724463d5a2 Fix truediv numerics between eager and compile (#164144)
Addresses numeric differences between eager and compile in https://github.com/pytorch/pytorch/issues/141753

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164144
Approved by: https://github.com/eellison, https://github.com/jansel, https://github.com/ngimel
ghstack dependencies: #164997
2025-10-09 14:31:33 +00:00
f79e212733 Revert "[CUDA][cuBLAS] addmm -- some refactoring for easier navigation between the Lt and non-Lt paths (#163955)"
This reverts commit ab94a0d544503b5c27e889b45e45ef8cf75c8183.

Reverted https://github.com/pytorch/pytorch/pull/163955 on behalf of https://github.com/jeffdaily due to broke on cuda and rocm after landing though this PR had a clean signal initially ([comment](https://github.com/pytorch/pytorch/pull/163955#issuecomment-3386127145))
2025-10-09 14:24:56 +00:00
b28b24a9fc Switch build jobs that use linux.12xlarge to c7i (#164941)
This PR updates build jobs that currently use linux.12xlarge to the
c7i variant, which should improve build times by 15% - 20% depending
on the job and reduce the costs of these jobs by 10% - 15%.

Signed-off-by: Thanh Ha <thanh.ha@linuxfoundation.org>
2025-10-09 09:58:52 -04:00
17c7170ca6 Fix Avoid DDE in item numel check (#164934)
address https://github.com/pytorch/pytorch/issues/164725 and https://github.com/pytorch/pytorch/issues/164704

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164934
Approved by: https://github.com/ezyang, https://github.com/aorenste, https://github.com/Skylion007
2025-10-09 13:09:06 +00:00
6a7f5c0d21 Add scaled_mm python API, test (#164142)
Summary:

* Add `torch.nn.functional.scaled_mm` as an abstraction around the C++
  methods
* Wraps `torch._scaled_mm_v2` API by default, but user can force use of
  the older `torch._scaled_mm` interface.
* Scaled MM tests now run on the new API

Test Plan:

`pytest test/test_scaled_matmul_cuda.py`

Reviewers:

Subscribers:

Tasks:

Tags:
Signed-off-by: Simon Layton <simonlayton@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164142
Approved by: https://github.com/drisspg
ghstack dependencies: #164141
2025-10-09 12:43:18 +00:00
512b6b59f0 Add _scaled_mm_v2 API (#164141)
Summary:

* Add new scaled-MM API to future-proof / clean-up existing code.
* Scaling is explicitly described rather than inferred
* Swizzling of scales must now be defined (vs. inferred)
* Adds API support for multi-level scaling
* Refactor dispatch logic to make it easier to add new implementations

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
Signed-off-by: Simon Layton <simonlayton@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164141
Approved by: https://github.com/drisspg
2025-10-09 12:43:18 +00:00
bc1690c7e8 [Code Clean] Remove support of python3.9 (#163846)
As the title stated.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163846
Approved by: https://github.com/ezyang
2025-10-09 11:54:10 +00:00
53f5af8c92 Update torch-xpu-ops commit pin (#164237)
Update the torch-xpu-ops commit to [intel/torch-xpu-ops@f30173](f301733b03), which includes:

- Install xpu internal headers to PyTorch
- Fix error handling for BatchLinearAlgebra Ops
- Fix unnecessary double data type conversion
- Fix overflow when calculating workgroups count
- Fix segmentation fault and calculation error in AveragePool2dKernel
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164237
Approved by: https://github.com/EikanWang
2025-10-09 10:38:59 +00:00
4412026949 Revert "AOTI MPS Shim Implementation (#163865)"
This reverts commit 874efa2d72d83b00894097130f18062ce331a265.

Reverted https://github.com/pytorch/pytorch/pull/163865 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/163865#issuecomment-3385196387))
2025-10-09 10:26:01 +00:00
06d86e58d0 Revert "Do not decompose in functionalization/proxy tensor if autograd wouldn't have decomposed (#164939)"
This reverts commit d40a9bfb8da0dc1ac1e6e56b33a25979112874de.

Reverted https://github.com/pytorch/pytorch/pull/164939 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/164939#issuecomment-3385056722))
2025-10-09 09:50:59 +00:00
874efa2d72 AOTI MPS Shim Implementation (#163865)
## MPS Shim API

*   Updated MPS shimification API with handles and function declarations:
    *   `AOTIMetalShaderLibraryHandle` and `AOTIMetalKernelFunctionHandle` types
    *   Library management: `aoti_torch_mps_create_shader_library`, `aoti_torch_mps_delete_shader_library`, `aoti_torch_mps_get_kernel_function`
    *   Kernel execution: `aoti_torch_mps_run_command_block`, `aoti_torch_mps_start_encoding`, `aoti_torch_mps_dispatch` variants, etc

## MPS Shader Codegen

*   Modified to generate source constants instead of direct `DynamicMetalShaderLibrary` instantiation:
    *   **Before**: `at::native::mps::DynamicMetalShaderLibrary mps_lib_0(R"MTL(...)MTL");`
    *   **After**: `const char* mps_lib_0_source = R"MTL(...)MTL";`
*   Updated kernel call generation  to use shimified functions:
    *   Generates calls to shimified API instead of direct libtorch calls

## Before vs After Comparison

### Section 1: Shader Library
**Before (Direct Library Object)**
```cpp
at::native::mps::DynamicMetalShaderLibrary mps_lib_0(R"MTL(
    ...
)MTL");
```
**After (Source String)**
```cpp
const char* mps_lib_0_source = (R"MTL(
    ...
)MTL");
```

### Section 2: Getter Functions & RAII Management

**Before (Direct Library Access)**
```cpp
const std::shared_ptr<at::native::mps::MetalKernelFunction> get_mps_lib_0() {
    static const auto func = mps_lib_0.getKernelFunction("generated_kernel");
    return func;
}

AOTIMetalKernelFunctionHandle get_mps_lib_0_handle() {
    static const auto handle = AOTIMetalKernelFunctionHandle(get_mps_lib_0().get());
    return handle;
}
```

**After (Shim API + RAII Wrapper)**
```cpp
AOTIMetalKernelFunctionHandle get_mps_lib_0_handle() {
    static auto kernel_handle = []() {
        AOTIMetalShaderLibraryHandle lib_handle = nullptr;
        AOTIMetalKernelFunctionHandle kern_handle = nullptr;

        aoti_torch_mps_create_shader_library(mps_lib_0_source, &lib_handle);
        aoti_torch_mps_get_kernel_function(lib_handle, "generated_kernel", &kern_handle);

        // RAII wrapper with custom deleter
        auto lib_deleter = [](AOTIMetalShaderLibraryHandle h) {
            if (h) aoti_torch_mps_delete_shader_library(h);
        };

        using LibDeleter = decltype(lib_deleter);
        using LibPtr = std::unique_ptr<AOTIMetalShaderLibraryOpaque, LibDeleter>;

        // Return pair of kernel handle and library smart pointer for cleanup
        return std::make_pair(kern_handle, LibPtr(lib_handle, lib_deleter));
    }();
    return kernel_handle.first;
}
```

### Section 3: Runtime Execution

**Before (Direct Library Methods)**
```cpp
void AOTInductorModel::run_impl(...) {

    ...

    get_mps_lib_0()->runCommandBlock([&] {
        get_mps_lib_0()->startEncoding();
        aoti_torch_mps_set_arg_tensor(get_mps_lib_0_handle(), 0, buf0);
        aoti_torch_mps_set_arg_tensor(get_mps_lib_0_handle(), 1, arg0_1);
        aoti_torch_mps_set_arg_tensor(get_mps_lib_0_handle(), 2, arg1_1);
        get_mps_lib_0()->dispatch({static_cast<uint64_t>(10LL)});

    });

    ...

} // AOTInductorModel::run_impl
```

**After (Shim API with Lambda Pattern)**
```cpp
void AOTInductorModel::run_impl(...) {

    ...

    auto mps_lib_0_lambda_0 = [&](AOTIMetalKernelFunctionHandle handle) {
        aoti_torch_mps_start_encoding(handle);
        aoti_torch_mps_set_arg_tensor(handle, 0, buf0);
        aoti_torch_mps_set_arg_tensor(handle, 1, arg0_1);
        aoti_torch_mps_set_arg_tensor(handle, 2, arg1_1);
        aoti_torch_mps_dispatch_single(handle, static_cast<uint64_t>(10LL));
    };

    std::function<void(AOTIMetalKernelFunctionHandle)> mps_lib_0_func_wrapper_0 = mps_lib_0_lambda_0;
    aoti_torch_mps_run_command_block(get_mps_lib_0_handle(), aoti_torch_mps_shared_callback, &mps_lib_0_func_wrapper_0);

    ...

} // AOTInductorModel::run_impl
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163865
Approved by: https://github.com/angelayi, https://github.com/desertfire
2025-10-09 09:28:10 +00:00
e09fb44ef1 Revert "Fix truediv numerics between eager and compile (#164144)"
This reverts commit d386325ca9a142419f45b987391f4bb175dd7d0b.

Reverted https://github.com/pytorch/pytorch/pull/164144 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/164144#issuecomment-3384769092))
2025-10-09 08:40:52 +00:00
5b8174bc28 Revert "[vllm hash update] update the pinned vllm hash (#164628)"
This reverts commit 7b691546d2949790ffc8f6bd3c674faa6a46ff7c.

Reverted https://github.com/pytorch/pytorch/pull/164628 on behalf of https://github.com/huydhn due to There are some broken vLLM tests ([comment](https://github.com/pytorch/pytorch/pull/164628#issuecomment-3384560957))
2025-10-09 07:43:02 +00:00
5209c8ce07 Revert "Fix Avoid DDE in item numel check (#164934)"
This reverts commit a9a9a3438a374f96a308b707a1718036aaec790d.

Reverted https://github.com/pytorch/pytorch/pull/164934 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/164934#issuecomment-3384390621))
2025-10-09 06:57:03 +00:00
f231be25c6 Mark unused parameters in C++ code (#164912)
This PR adds unused parameter name comments in C++ declarations to improve code readability.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164912
Approved by: https://github.com/Skylion007
2025-10-09 06:23:25 +00:00
a753ffa9af Revert "Use runner with more memory for ASAN builds (#165000)"
This reverts commit f5fd18f7e24378bd9eb91404f697f1c81a8187d5.

Reverted https://github.com/pytorch/pytorch/pull/165000 on behalf of https://github.com/izaitsevfb due to not sure how, but this broke lint ([comment](https://github.com/pytorch/pytorch/pull/165000#issuecomment-3384286412))
2025-10-09 06:22:28 +00:00
a9a9a3438a Fix Avoid DDE in item numel check (#164934)
address https://github.com/pytorch/pytorch/issues/164725 and https://github.com/pytorch/pytorch/issues/164704

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164934
Approved by: https://github.com/ezyang, https://github.com/aorenste, https://github.com/Skylion007
2025-10-09 06:06:25 +00:00
263db92563 Add knobs in FR dump by watchdog (stacktrace and only active collectives) and trigger FR even on any exceptions (#164591)
This PR includes a couple of changes to extend the FlightRecorder (FR) dump triggered by the PyTorch watchdog:

- New knobs to control the FR dump, as suggested in the public documentation, even for the watchdog
  (TORCH_INCLUDE_STACK_TRACE, TORCH_INCLUDE_ONLY_ACTIVE)
- Trigger the flight recorder dump on exceptions, which could be raised by any CUDA / host-side error
  (TORCH_NCCL_EXTRA_DUMP_ON_EXEC)
  -> Can be used as a snapshot of the workload's progress for post-mortem analysis
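A minimal sketch of opting in from Python; the knob names are taken from the list above, and when exactly they are read is an assumption of this sketch, so they are set before process-group initialization:

```python
import os

# Knob names copied from the description above; semantics are as described there.
os.environ["TORCH_INCLUDE_STACK_TRACE"] = "1"      # include stack traces in FR entries
os.environ["TORCH_INCLUDE_ONLY_ACTIVE"] = "1"      # record only active collectives
os.environ["TORCH_NCCL_EXTRA_DUMP_ON_EXEC"] = "1"  # also dump FR on exceptions
```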

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164591
Approved by: https://github.com/fduwjj
2025-10-09 05:33:35 +00:00
ed6156e3ea non-fb impls + unit tests (#164722)
Test Plan:
```
buck test fbcode//mode/opt caffe2/test/inductor:caching
```

Differential Revision: D83714692

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164722
Approved by: https://github.com/NikhilAPatel, https://github.com/adamomainz
2025-10-09 05:10:57 +00:00
d40a9bfb8d Do not decompose in functionalization/proxy tensor if autograd wouldn't have decomposed (#164939)
This fixes AOTAutograd rms_norm not being bitwise equivalent to
eager, because it avoids a decomposition. You can force the
decomposition by having the decomposition in the dispatch table,
but if eager mode wouldn't have decomposed (because it went to the fused
one), we now preserve the fused call by default.
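A hedged sketch of how one might check the eager/compile agreement this targets (illustrative only; whether the outputs match bitwise depends on backend and build):

```python
import torch
import torch.nn.functional as F

x = torch.randn(8, 64)
w = torch.randn(64)

def f(t):
    return F.rms_norm(t, (64,), w)

eager_out = f(x)
compiled_out = torch.compile(f)(x)
# The PR's goal is that the fused rms_norm call is preserved through
# AOTAutograd, so eager and compiled outputs can agree exactly.
print(torch.equal(eager_out, compiled_out))
```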

This largely reverts https://github.com/pytorch/pytorch/pull/103275/ for view ops. This means that in inference mode we could hit the wrong C++ kernel; if this occurs we should just SymInt'ify the C++ kernel.

Another neat side effect of this change is that Inductor's generated kernels for rms_norm now have rms_norm in their name.

Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164939
Approved by: https://github.com/bdhirsh
ghstack dependencies: #164573
2025-10-09 04:49:44 +00:00
e532f62e0d Introduce joint_custom_pass callback (#164981)
```
import torch
import torch._functorch.config

def joint_custom_pass(joint_gm: torch.fx.GraphModule, joint_inputs):
    # apply your pass to the joint graph here
    return joint_gm

class M(torch.nn.Module):
    def forward(self, x):
        return x.sin()

x = torch.randn(10, requires_grad=False)
compiled_fn = torch.compile(M(), backend="aot_eager")

with torch._functorch.config.patch("joint_custom_pass", joint_custom_pass):
    out = compiled_fn(x)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164981
Approved by: https://github.com/ezyang, https://github.com/anijain2305
2025-10-09 04:40:54 +00:00
1f73b96668 [PGO] log missing sources in allowlist (#164881)
Summary:
- logs missing dynamic sources
- emits MLHub insight only on size mismatch recompiles

Test Plan: test_pgo

Differential Revision: D84098898

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164881
Approved by: https://github.com/bobrenjc93
2025-10-09 04:39:09 +00:00
7b691546d2 [vllm hash update] update the pinned vllm hash (#164628)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164628
Approved by: https://github.com/pytorchbot
2025-10-09 04:35:36 +00:00
f05e23e1bc Add less warps config to inner reductions (#162447)
Add configs with fewer warps to ensure proper vectorization + memory coalescing for inner reductions, preferring more work per thread.

<img width="1717" height="731" alt="Screenshot 2025-09-17 at 10 03 25 AM" src="https://github.com/user-attachments/assets/7b1f4a30-62f2-4bee-bb9c-122501bde63e" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162447
Approved by: https://github.com/v0i0, https://github.com/eellison, https://github.com/shunting314
2025-10-09 04:22:16 +00:00
d386325ca9 Fix truediv numerics between eager and compile (#164144)
Addresses numeric differences between eager and compile in https://github.com/pytorch/pytorch/issues/141753

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164144
Approved by: https://github.com/eellison, https://github.com/jansel, https://github.com/ngimel
ghstack dependencies: #164997
2025-10-09 04:22:03 +00:00
7457d139c5 Add pyrefly suppressions to torch/distributed (7/n) (#165002)
Adds suppressions so pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283

One more PR after this one.

Test plan:
dmypy restart && python3 scripts/lintrunner.py -a
pyrefly check

step 1: delete lines in the pyrefly.toml file from the project-excludes field
step 2: run pyrefly check
step 3: add suppressions, clean up unused suppressions
before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199

after:
INFO 0 errors (6,884 ignored)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165002
Approved by: https://github.com/oulgen
2025-10-09 04:08:25 +00:00
ab94a0d544 [CUDA][cuBLAS] addmm -- some refactoring for easier navigation between the Lt and non-Lt paths (#163955)
As per title. Additionally, some Lt selection conditions are revisited, and some redundancy removed (especially in the ROCm vs non-ROCm paths).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163955
Approved by: https://github.com/ngimel, https://github.com/eqy
2025-10-09 04:07:45 +00:00
0e9b3a772a [export] Turn on install_free_tensors flag (#164691)
The final step in removing the discrepancy between
torch.compile(fullgraph=True) and torch.export(strict=True).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164691
Approved by: https://github.com/avikchaudhuri
ghstack dependencies: #164721
2025-10-09 03:25:15 +00:00
af7ca55ced [export][dynamo] Fallback to slowpath for MultiHeadAttention for strict export (#164721)
In https://github.com/pytorch/pytorch/pull/106824, export decided to take the slow path for the MultiHeadAttention module (see that PR's description as to why). But that PR eventually caused a divergence between Dynamo and export.

Today, strict-export does not inline into builtin modules (like MultiHeadAttention), and therefore make_fx sees the original nn.Module and takes the slow path. But compile inlines into the nn module, and at this time the condition `_is_make_fx_tracing` is False. As a result, Dynamo takes a fast path, resulting in a different op being called.

This divergence is undesirable. There are 2 ways to fix it

1) Make export take the fast path - as explained in https://github.com/pytorch/pytorch/pull/106824, this might be difficult, so we go with (2).
2) Make compile take the slow path as well - this is easy to implement. The con is that PyTorch eager and compile will use different operators, which can cause numerical issues, etc.

Since (2) is easy to do, we will follow this path. We are tracking the issue in  https://github.com/pytorch/pytorch/issues/164062
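A minimal sketch, assuming a plain nn.MultiheadAttention module: after this change, compile follows the same (slow) path as eager/export, so the outputs of the two should line up.

```python
import torch

mha = torch.nn.MultiheadAttention(embed_dim=16, num_heads=4, batch_first=True).eval()
x = torch.randn(2, 8, 16)

with torch.no_grad():
    eager_out, _ = mha(x, x, x)
    compiled_out, _ = torch.compile(mha)(x, x, x)

# Both paths now use the same operator, so the results should match closely.
print(torch.allclose(eager_out, compiled_out, atol=1e-6))
```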

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164721
Approved by: https://github.com/avikchaudhuri, https://github.com/tugsbayasgalan
2025-10-09 03:25:15 +00:00
a029675f6f More ruff SIM fixes (#164695)
This PR applies ruff `SIM` rules to more files. Most changes are about simplifying `dict.get` because `None` is already the default value.
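Illustrative only: the kind of rewrite ruff's SIM rules flag, since `dict.get` already returns `None` when the key is missing.

```python
config = {"device": "cuda"}

before = config.get("dtype", None)  # redundant explicit default
after = config.get("dtype")         # equivalent: None is already the default
assert before is None and after is None
```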

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164695
Approved by: https://github.com/ezyang
2025-10-09 03:24:50 +00:00
54ae61c573 Change test_emulate_precision_casts_mean_ratio_chain from gelu to relu (#164997)
gelu can be unstable on local builds due to libdevice differences, as we lower to libdevice.erf. That, combined with the semantics of the test, can lead to catastrophic cancellation. We switch this test from gelu to relu to fix the instability.
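A small sketch of the instability being avoided (not the test itself): the erf-based gelu subtracts two nearly equal quantities for large negative inputs, so tiny libdevice differences in erf get hugely amplified.

```python
import math

x = -8.0
erf_term = math.erf(x / math.sqrt(2.0))  # very close to -1.0
gelu = 0.5 * x * (1.0 + erf_term)        # (1 + erf_term) is ~1e-15: cancellation
print(erf_term, gelu)
```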

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164997
Approved by: https://github.com/eellison, https://github.com/jansel
2025-10-09 03:14:05 +00:00
2fe37b5fde [RecSys][Combo Kernel] skip combo kernel generation if partition group is empty (#164918)
Summary: Noticed that the combo kernel partition sometimes contains an empty group. Skip kernel generation in this case to unblock head model launching. The change in this diff is safe, but it would be better to root-cause why an empty group is being created.
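Hedged sketch with hypothetical names (`partition_groups`, `codegen_combo_kernel` are stand-ins, not the real Inductor helpers): the guard simply skips code generation when a partition group comes back empty.

```python
def codegen_combo_kernel(group):     # stand-in for the real codegen
    return f"combo_kernel({len(group)} nodes)"

def generate_combo_kernels(partition_groups):
    kernels = []
    for group in partition_groups:
        if not group:                # empty group: nothing to fuse, skip codegen
            continue
        kernels.append(codegen_combo_kernel(group))
    return kernels

print(generate_combo_kernels([["add", "mul"], [], ["relu"]]))
```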

Test Plan:
Lowering passed after applying the diff

Differential Revision: D84134471

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164918
Approved by: https://github.com/mlazos
2025-10-09 02:55:23 +00:00
96d91da792 [dynamo] allow placement subclass to be traceable (#164985)
This PR unblocks SimpleFSDP+`gradient_divide_factor` [here](https://github.com/pytorch/torchtitan/pull/1793). We need to create a subclass of the DTensor `Partial` placement. When tracing `SimpleFSDPPartial`, I hit an assertion error because `SimpleFSDPPartial` is not in `ok_types`. I'm updating the code to check the placement type via `isinstance` instead of `type(val)`.
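Illustrative only, with stand-in classes rather than the real DTensor placements: an `isinstance` check accepts subclasses, while a `type(val)` lookup does not.

```python
class Partial:                      # stands in for DTensor's Partial placement
    pass

class SimpleFSDPPartial(Partial):   # hypothetical subclass from the linked PR
    pass

ok_types = (Partial,)
val = SimpleFSDPPartial()

print(type(val) in ok_types)        # False: exact-type match rejects subclasses
print(isinstance(val, ok_types))    # True: subclasses pass
```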

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164985
Approved by: https://github.com/ezyang, https://github.com/eellison
2025-10-09 01:44:21 +00:00
f5fd18f7e2 Use runner with more memory for ASAN builds (#165000)
An attempt to [address OOM here](aed5ed1076/1).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165000
Approved by: https://github.com/seemethere, https://github.com/malfet, https://github.com/huydhn
2025-10-09 01:09:28 +00:00
8ca986ee60 [fr] Enable reset the FR recording for fault tolerance (#164988)
We also want a Python-side API for users to reset the FR recording entries. We don't need to reset PGNCCL's member counter since we are creating a new PGNCCL anyway, but FR is a global ring buffer, so we do need to reset it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164988
Approved by: https://github.com/tushar00jain
ghstack dependencies: #164752
2025-10-09 01:03:01 +00:00
81dbeb06f4 CUDA aarch64 12.6 and 12.8 builds fix triton constraints (#165013)
Since we have introduced CUDA aarch64 builds for all CUDA versions, we need to remove this constraint.
This was missed by https://github.com/pytorch/pytorch/pull/162364

Proper constraint on triton should be:
```
Requires-Dist: triton==3.5.0; platform_system == "Linux"
```

not:
```
Requires-Dist: triton==3.5.0; platform_system == "Linux" and platform_machine == "x86_64"
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165013
Approved by: https://github.com/Camyll, https://github.com/nWEIdia, https://github.com/tinglvv
2025-10-09 00:49:28 +00:00
7a1ead755f [DeviceMesh] Add a warning for slicing flattened dim from root mesh and types for _get_slice_mesh_layout (#164993)
As the title says, we want to add a deprecation warning for slicing a flattened dim from the root mesh. Also, cosmetic changes adding types for `_get_slice_mesh_layout`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164993
Approved by: https://github.com/fegin
ghstack dependencies: #164750, #164954
2025-10-09 00:47:08 +00:00
90b4e130d6 [Benchmark] cleanup torchbench models (#164816)
Prune models from the TorchInductor dashboard to reduce CI cost. This PR prunes torchbench models according to the [doc](https://docs.google.com/document/d/1nLPNNAU-_M9Clx9FMrJ1ycdPxe-xRA54olPnsFzdpoU/edit?tab=t.0), removing the timm and huggingface models from torchbench.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164816
Approved by: https://github.com/anijain2305, https://github.com/seemethere, https://github.com/huydhn, https://github.com/malfet
2025-10-09 00:31:25 +00:00
4308b8a28f [dynamo] Support torch.fx.traceback.annotate (#164678)
Builds on top of https://github.com/pytorch/pytorch/pull/163673 and https://github.com/pytorch/pytorch/pull/164174. This will be used in the followup PRs to apply regional inductor compilation.

The existing implementation let Dynamo trace into `torch.fx.traceback.annotate`, but that's not what we want. We want Dynamo to essentially run the `torch.fx.traceback.annotate` function in eager, so that every FX node created in the Dynamo FX graph carries the custom meta (a usage sketch follows the list below).

What does not work?
* We still have to set the context manager `torch.fx.traceback.preserve_node_meta()` in the user code because CI was unhappy without it. This can be fixed, but with some perseverance.
* This does not work with graph breaks yet. But we can solve that problem, if needed, in a separate PR.
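Hedged sketch of the intended usage: the exact signature of `torch.fx.traceback.annotate` (here, a context manager taking a metadata dict) is an assumption, not taken from the PR; `preserve_node_meta()` is the context manager the message mentions.

```python
import torch
import torch.fx.traceback as fx_traceback

def f(x):
    with fx_traceback.annotate({"region": "pointwise"}):  # assumed signature
        return torch.relu(x) + 1

with fx_traceback.preserve_node_meta():
    gm = torch.fx.symbolic_trace(f)  # nodes created under annotate carry the custom meta
print(gm.graph)
```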

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164678
Approved by: https://github.com/SherlockNoMad, https://github.com/jansel, https://github.com/xmfan
2025-10-08 22:41:00 +00:00
94b1ec8c7c [BE] Use torch check the way its intended (#164987)
Replace
`if (!foo) TORCH_CHECK(false, "bar");` with `TORCH_CHECK(foo, "bar");`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164987
Approved by: https://github.com/albanD, https://github.com/Skylion007
2025-10-08 22:28:08 +00:00
054268c9eb Consider collective inputs to be deallocated only when wait is completed (#164945)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164945
Approved by: https://github.com/IvanKobzarev
ghstack dependencies: #164738, #164783, #164944
2025-10-08 22:19:25 +00:00
af40828bbb Limit coll bucketing within node idxs (#164944)
Respect max_coll_distance from the overlap scheduler in bucketing; also, add an optimization in path searching.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164944
Approved by: https://github.com/IvanKobzarev
ghstack dependencies: #164738, #164783
2025-10-08 22:18:53 +00:00
5a1fbf45ad [ez] remove unnecessary wrapper (#164720)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164720
Approved by: https://github.com/ydwu4
2025-10-08 22:12:29 +00:00
493 changed files with 5789 additions and 4326 deletions

View File

@ -233,7 +233,9 @@ if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
fi
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
if [[ "$BUILD_ENVIRONMENT" == *-full-debug* ]]; then
export CMAKE_BUILD_TYPE=Debug
elif [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
export CMAKE_BUILD_TYPE=RelWithAssert
fi
@ -299,6 +301,11 @@ else
python -m build --wheel --no-isolation
fi
pip_install_whl "$(echo dist/*.whl)"
if [[ "$BUILD_ENVIRONMENT" == *full-debug* ]]; then
# Regression test for https://github.com/pytorch/pytorch/issues/164297
# Torch should be importable and that's about it
pushd /; python -c "import torch;print(torch.__config__.show(), torch.randn(5) + 1.7)"; popd
fi
if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *vision* ]]; then
install_torchvision

View File

@ -256,7 +256,7 @@ test_torchbench_smoketest() {
local device=mps
local dtypes=(undefined float16 bfloat16 notset)
local dtype=${dtypes[$1]}
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
for backend in eager inductor; do
@ -319,7 +319,7 @@ test_aoti_torchbench_smoketest() {
local device=mps
local dtypes=(undefined float16 bfloat16 notset)
local dtype=${dtypes[$1]}
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}"
local dtype_arg="--${dtype}"

View File

@ -838,7 +838,7 @@ test_dynamo_benchmark() {
elif [[ "${suite}" == "timm_models" ]]; then
export TORCHBENCH_ONLY_MODELS="inception_v3"
elif [[ "${suite}" == "torchbench" ]]; then
export TORCHBENCH_ONLY_MODELS="hf_Bert"
export TORCHBENCH_ONLY_MODELS="BERT_pytorch"
fi
fi
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
@ -869,13 +869,13 @@ test_inductor_torchbench_smoketest_perf() {
mkdir -p "$TEST_REPORTS_DIR"
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only BERT_pytorch \
--output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
# The threshold value needs to be actively maintained to make this check useful
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4
# Check memory compression ratio for a few models
for test in hf_Albert timm_vision_transformer; do
for test in BERT_pytorch yolov3; do
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
--disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \
--only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv"

View File

@ -15,37 +15,35 @@ if errorlevel 1 exit /b 1
if not errorlevel 0 exit /b 1
cd %TMP_DIR_WIN%\build\torch\test
:: Enable delayed variable expansion to make the list
setlocal enabledelayedexpansion
set EXE_LIST=
for /r "." %%a in (*.exe) do (
call :libtorch_check "%%~na" "%%~fa"
if "%%~na" == "c10_intrusive_ptr_benchmark" (
@REM NB: This is not a gtest executable file, thus couldn't be handled by
@REM pytest-cpp and is excluded from test discovery by run_test
call "%%~fa"
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
) else (
if "%%~na" == "verify_api_visibility" (
@REM Skip verify_api_visibility as it is a compile-level test
) else (
set EXE_LIST=!EXE_LIST! cpp/%%~na
)
)
)
goto :eof
:libtorch_check
cd %CWD%
set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\test
:: Skip verify_api_visibility as it a compile level test
if "%~1" == "verify_api_visibility" goto :eof
:: Run python test\run_test.py on the list
set NO_TD=True && python test\run_test.py --cpp --verbose -i !EXE_LIST!
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
echo Running "%~2"
if "%~1" == "c10_intrusive_ptr_benchmark" (
:: NB: This is not a gtest executable file, thus couldn't be handled by pytest-cpp
call "%~2"
goto :eof
)
python test\run_test.py --cpp --verbose -i "cpp/%~1"
if errorlevel 1 (
echo %1 failed with exit code %errorlevel%
goto fail
)
if not errorlevel 0 (
echo %1 failed with exit code %errorlevel%
goto fail
)
goto :eof
:eof
exit /b 0

View File

@ -71,14 +71,7 @@ export PYTORCH_BUILD_NUMBER=1
# Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries.
if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then
TRITON_CONSTRAINT="platform_system == 'Linux'"
fi
TRITON_CONSTRAINT="platform_system == 'Linux'"
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"

View File

@ -274,8 +274,6 @@ runs:
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
# Propagate download.pytorch.org IP to container
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"

View File

@ -33,10 +33,6 @@ runs:
)
echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" && "${GPU_ARCH_TYPE}" != "xpu" ]]; then
# Propagate download.pytorch.org IP to container. This is only needed on Linux non aarch64 runner
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts"
fi
docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
# Generate test script

View File

@ -30,6 +30,7 @@ ciflow_push_tags:
- ciflow/riscv64
- ciflow/rocm
- ciflow/rocm-mi300
- ciflow/rocm-mi355
- ciflow/s390
- ciflow/slow
- ciflow/torchbench

View File

@ -389,8 +389,6 @@ jobs:
"${DOCKER_IMAGE}" \
${DOCKER_SHELL_CMD}
)
# Propagate download.pytorch.org IP to container
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then

View File

@ -37,7 +37,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: "linux.12xlarge"
runner: "linux.c7i.12xlarge"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '9.0'

View File

@ -128,7 +128,6 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.2xlarge.memory
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
test-matrix: |

View File

@ -1,6 +1,9 @@
name: rocm-mi355
on:
push:
tags:
- ciflow/rocm-mi355/*
workflow_dispatch:
schedule:
- cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT
@ -64,5 +67,7 @@ jobs:
build-environment: linux-noble-rocm-py3.12-mi355
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
tests-to-include: >-
${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor test_matmul_cuda test_scaled_matmul_cuda'
|| '' }}
secrets: inherit

View File

@ -249,3 +249,14 @@ jobs:
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_10-gcc11-full-debug-build-only:
name: linux-jammy-py3.10-gcc11-full-debug-build-only
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.2xlarge.memory
build-environment: linux-jammy-py3.10-gcc11-full-debug-build-only
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
secrets: inherit

View File

@ -35,7 +35,7 @@ jobs:
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-1-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3
runner: linux.12xlarge
runner: linux.c7i.12xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
@ -56,7 +56,7 @@ jobs:
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
runner: linux.12xlarge
runner: linux.c7i.12xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },

View File

@ -388,9 +388,9 @@ cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker
option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance
# on Windows.
# on Windows and AArch64.
option(USE_MIMALLOC_ON_MKL "Use mimalloc on MKL" OFF)
if(WIN32)
if(WIN32 OR (CPU_AARCH64 AND NOT APPLE))
set(USE_MIMALLOC ON)
# Not enable USE_MIMALLOC_ON_MKL due to it caused issue:

View File

@ -28,4 +28,19 @@ inline std::ostream& operator<<(std::ostream& stream, at::BlasBackend backend) {
return stream << BlasBackendToString(backend);
}
namespace blas {
enum class ScalingType : std::uint8_t {
TensorWise, // fp32 scales
RowWise, // fp32 scales
BlockWise1x16, // fp8_e4m3fn scales
BlockWise1x32, // fp8_e8m0fnu scales
BlockWise1x128, // fp32 scales
BlockWise128x128, // fp32 scales
};
enum class SwizzleType : std::uint8_t { NO_SWIZZLE = 0, SWIZZLE_32_4_4 = 1 };
} // namespace blas
} // namespace at

View File

@ -144,8 +144,7 @@ inline std::string _all_equal_numel_error(at::ArrayRef<Tensor> tensors) {
inline bool _apply_preamble(ArrayRef<Tensor> tensors) {
checkDeviceType("CPU_tensor_apply", tensors, kCPU);
checkLayout("CPU_tensor_apply", tensors, kStrided);
if (!_all_equal_numel(tensors))
TORCH_CHECK(false, _all_equal_numel_error(tensors));
TORCH_CHECK(_all_equal_numel(tensors), _all_equal_numel_error(tensors));
// An empty tensor has no elements
for (auto& t : tensors)
if (t.numel() == 0)

View File

@ -226,15 +226,15 @@ class TORCH_API Context {
bool userEnabledMkldnn() const;
void setUserEnabledMkldnn(bool e);
bool benchmarkCuDNN() const;
void setBenchmarkCuDNN(bool);
void setBenchmarkCuDNN(bool /*b*/);
int benchmarkLimitCuDNN() const;
void setBenchmarkLimitCuDNN(int);
void setBenchmarkLimitCuDNN(int /*b*/);
bool immediateMiopen() const;
void setImmediateMiopen(bool);
void setImmediateMiopen(bool /*b*/);
bool deterministicCuDNN() const;
void setDeterministicCuDNN(bool);
void setDeterministicCuDNN(bool /*b*/);
bool deterministicMkldnn() const;
void setDeterministicMkldnn(bool);
void setDeterministicMkldnn(bool /*b*/);
bool userEnabledNNPACK() const;
void setUserEnabledNNPACK(bool e);
@ -252,32 +252,32 @@ class TORCH_API Context {
void setSDPPriorityOrder(const std::vector<int64_t>& order);
std::array<at::SDPBackend, at::num_sdp_backends> sDPPriorityOrder();
void setSDPUseFlash(bool);
void setSDPUseFlash(bool /*e*/);
bool userEnabledFlashSDP() const;
void setSDPUseMemEfficient(bool);
void setSDPUseMemEfficient(bool /*e*/);
bool userEnabledMemEfficientSDP() const;
void setSDPUseMath(bool);
void setSDPUseMath(bool /*e*/);
bool userEnabledMathSDP() const;
void setSDPUseCuDNN(bool);
void setSDPUseCuDNN(bool /*e*/);
bool userEnabledCuDNNSDP() const;
void setAllowFP16BF16ReductionMathSDP(bool);
void setAllowFP16BF16ReductionMathSDP(bool /*e*/);
bool allowFP16BF16ReductionMathSDP() const;
void setSDPUseOverrideable(bool);
void setSDPUseOverrideable(bool /*e*/);
bool userEnabledOverrideableSDP() const;
at::LinalgBackend linalgPreferredBackend() const;
void setLinalgPreferredBackend(at::LinalgBackend);
void setLinalgPreferredBackend(at::LinalgBackend /*b*/);
at::BlasBackend blasPreferredBackend();
void setBlasPreferredBackend(at::BlasBackend);
void setBlasPreferredBackend(at::BlasBackend /*b*/);
at::ROCmFABackend getROCmFAPreferredBackend();
void setROCmFAPreferredBackend(at::ROCmFABackend);
void setROCmFAPreferredBackend(at::ROCmFABackend /*b*/);
// Note [Enabling Deterministic Operations]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -310,9 +310,9 @@ class TORCH_API Context {
bool deterministicAlgorithms() const;
bool deterministicAlgorithmsWarnOnly() const;
void setDeterministicAlgorithms(bool, bool);
void setDeterministicAlgorithms(bool /*b*/, bool /*warn_only*/);
bool deterministicFillUninitializedMemory() const;
void setDeterministicFillUninitializedMemory(bool);
void setDeterministicFillUninitializedMemory(bool /*b*/);
// Note [Writing Nondeterministic Operations]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -356,11 +356,11 @@ class TORCH_API Context {
Float32Op op,
Float32Precision p);
bool allowTF32CuDNN(std::optional<Float32Op> op = std::nullopt) const;
void setAllowTF32CuDNN(bool);
void setAllowTF32CuDNN(bool /*b*/);
bool allowTF32OneDNN() const;
void setAllowTF32OneDNN(bool);
void setAllowTF32OneDNN(bool /*b*/);
bool allowTF32CuBLAS() const;
void setAllowTF32CuBLAS(bool);
void setAllowTF32CuBLAS(bool /*b*/);
Float32MatmulPrecision float32MatmulPrecision() const;
Float32Precision float32Precision(Float32Backend backend, Float32Op op) const;
CuBLASReductionOption allowFP16ReductionCuBLAS() const;
@ -372,7 +372,7 @@ class TORCH_API Context {
bool allow_reduced_precision,
bool allow_splitk = true);
bool allowFP16AccumulationCuBLAS() const;
void setAllowFP16AccumulationCuBLAS(bool);
void setAllowFP16AccumulationCuBLAS(bool /*b*/);
// Matmuls can use a so-called "persistent" kernel which launches one CUDA
// block for each SM on the GPU, and each block then iterates over multiple
@ -384,7 +384,7 @@ class TORCH_API Context {
// to make matmuls target only a subset of the SMs, so they can fully schedule
// even next to a comms kernel, and only be a few percent slower.
std::optional<int32_t> _SMCarveout_EXPERIMENTAL() const;
void _setSMCarveout_EXPERIMENTAL(std::optional<int32_t>);
void _setSMCarveout_EXPERIMENTAL(std::optional<int32_t> /*c*/);
at::QEngine qEngine() const;
void setQEngine(at::QEngine e);
@ -405,7 +405,7 @@ class TORCH_API Context {
void setDefaultMobileCPUAllocator();
void unsetDefaultMobileCPUAllocator();
bool allowFP16ReductionCPU() const;
void setAllowFP16ReductionCPU(bool);
void setAllowFP16ReductionCPU(bool /*b*/);
// Preserved for BC
void lazyInitCUDA() {

View File

@ -62,7 +62,7 @@ constexpr const char* unknown_eventname = "eventname not specified";
#endif
} // namespace (anonymous)
MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags, size_t size)
MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd, int flags, size_t size)
: filename_(filename.empty() ? unknown_filename : filename)
, size_(0) // to be filled later
#ifdef _WIN32
@ -494,7 +494,7 @@ RefcountedMapAllocator::RefcountedMapAllocator(const char *filename, int flags,
initializeAlloc();
}
RefcountedMapAllocator::RefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size)
RefcountedMapAllocator::RefcountedMapAllocator(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size)
: RefcountedMapAllocatorArgCheck(flags)
, MapAllocator(WITH_FD, filename, flags, fd, size + map_alloc_alignment) {
@ -614,7 +614,7 @@ at::DataPtr MapAllocator::makeDataPtr(std::string_view filename, int flags, size
return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU};
}
at::DataPtr MapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
at::DataPtr MapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
auto* context = new MapAllocator(WITH_FD, filename, fd, flags, size);
if (actual_size_out) *actual_size_out = context->size();
return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU};
@ -626,7 +626,7 @@ at::DataPtr RefcountedMapAllocator::makeDataPtr(const char *filename, int flags,
return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU};
}
at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
auto* context = new RefcountedMapAllocator(WITH_FD, filename, fd, flags, size);
if (actual_size_out) *actual_size_out = context->size() - map_alloc_alignment;
return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU};

View File

@ -25,7 +25,7 @@ class TORCH_API MapAllocator {
public:
MapAllocator(std::string_view filename, int flags, size_t size);
MapAllocator(
WithFd,
WithFd /*unused*/,
std::string_view filename,
int fd,
int flags,
@ -59,14 +59,14 @@ class TORCH_API MapAllocator {
return flags_;
}
static MapAllocator* fromDataPtr(const at::DataPtr&);
static MapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/);
static at::DataPtr makeDataPtr(
std::string_view filename,
int flags,
size_t size,
size_t* actual_size_out);
static at::DataPtr makeDataPtr(
WithFd,
WithFd /*unused*/,
const char* filename,
int fd,
int flags,
@ -105,13 +105,13 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck,
public:
RefcountedMapAllocator(const char* filename, int flags, size_t size);
RefcountedMapAllocator(
WithFd,
WithFd /*unused*/,
const char* filename,
int fd,
int flags,
size_t size);
static RefcountedMapAllocator* fromDataPtr(const at::DataPtr&);
static RefcountedMapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/);
RefcountedMapAllocator(const RefcountedMapAllocator&) = delete;
RefcountedMapAllocator(RefcountedMapAllocator&&) = delete;
RefcountedMapAllocator& operator=(const RefcountedMapAllocator&) = delete;
@ -122,7 +122,7 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck,
size_t size,
size_t* actual_size_out);
static at::DataPtr makeDataPtr(
WithFd,
WithFd /*unused*/,
const char* filename,
int fd,
int flags,

View File

@ -273,7 +273,7 @@ c10::SymInt NestedTensorImpl::sym_numel_custom() const {
return NestedTensorImpl::numel_custom();
}
c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat) const {
c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const {
return nested_tensor_impl_is_contiguous(this);
}
IntArrayRef NestedTensorImpl::sizes_custom() const {

View File

@ -115,7 +115,8 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl {
// with real implementations
int64_t numel_custom() const override;
c10::SymInt sym_numel_custom() const override;
c10::SymBool sym_is_contiguous_custom(MemoryFormat) const override;
c10::SymBool sym_is_contiguous_custom(
MemoryFormat /*memory_format*/) const override;
int64_t size_custom(int64_t d) const override {
return this->size(d);
}

View File

@ -14,7 +14,7 @@ inline int64_t divup(int64_t x, int64_t y) {
TORCH_API void init_num_threads();
// Sets the number of threads to be used in parallel region
TORCH_API void set_num_threads(int);
TORCH_API void set_num_threads(int /*nthreads*/);
// Returns the maximum number of threads that may be used in a parallel region
TORCH_API int get_num_threads();
@ -37,7 +37,7 @@ inline void lazy_init_num_threads() {
}
}
TORCH_API void set_thread_num(int);
TORCH_API void set_thread_num(int /*id*/);
class TORCH_API ThreadIdGuard {
public:
@ -130,7 +130,7 @@ inline scalar_t parallel_reduce(
TORCH_API std::string get_parallel_info();
// Sets number of threads used for inter-op parallelism
TORCH_API void set_num_interop_threads(int);
TORCH_API void set_num_interop_threads(int /*nthreads*/);
// Returns the number of threads used for inter-op parallelism
TORCH_API size_t get_num_interop_threads();

View File

@ -252,7 +252,7 @@ void SparseCsrTensorImpl::set_stride(int64_t dim, int64_t new_stride) {
void SparseCsrTensorImpl::set_storage_offset(int64_t storage_offset) {
TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_storage_offset.");
}
c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat) const {
c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const {
TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have is_contiguous");
}
} // namespace at

View File

@ -32,10 +32,10 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
public:
explicit SparseCsrTensorImpl(
at::DispatchKeySet,
at::DispatchKeySet /*key_set*/,
at::Device device,
Layout layout,
const caffe2::TypeMeta);
const caffe2::TypeMeta /*data_type*/);
void resize_(int64_t nnz, IntArrayRef size);
void resize_and_clear_(
@ -86,7 +86,8 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
protected:
IntArrayRef strides_custom() const override;
SymIntArrayRef sym_strides_custom() const override;
SymBool sym_is_contiguous_custom(MemoryFormat) const override;
SymBool sym_is_contiguous_custom(
MemoryFormat /*memory_format*/) const override;
public:
void set_size(int64_t dim, int64_t new_size) override;

View File

@ -46,7 +46,9 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
public:
// Public for now...
explicit SparseTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta);
explicit SparseTensorImpl(
at::DispatchKeySet /*key_set*/,
const caffe2::TypeMeta /*data_type*/);
void release_resources() override;
@ -384,8 +386,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
private:
explicit SparseTensorImpl(
at::DispatchKeySet,
const caffe2::TypeMeta,
at::DispatchKeySet /*key_set*/,
const caffe2::TypeMeta /*data_type*/,
at::Tensor indices,
at::Tensor values);

View File

@ -112,10 +112,10 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const Slice& slice);
// `torch.tensor([1, 2])`) | `torch::tensor({1, 2})`
struct TORCH_API TensorIndex final {
// Case 1: `at::indexing::None`
TensorIndex(std::nullopt_t) : type_(TensorIndexType::None) {}
TensorIndex(std::nullopt_t /*unused*/) : type_(TensorIndexType::None) {}
// Case 2: "..." / `at::indexing::Ellipsis`
TensorIndex(at::indexing::EllipsisIndexType)
TensorIndex(at::indexing::EllipsisIndexType /*unused*/)
: type_(TensorIndexType::Ellipsis) {}
TensorIndex(const char* str) : TensorIndex(at::indexing::Ellipsis) {
TORCH_CHECK_VALUE(

View File

@ -250,7 +250,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
using PtrVector = SmallVector<char*, 4>;
using StrideVector = SmallVector<int64_t, 6>;
void build(TensorIteratorConfig&);
void build(TensorIteratorConfig& /*config*/);
// The inner-loop function operates on the fastest moving dimension. It
// implements element-wise operations in terms of 1-d strided tensors.
@ -618,20 +618,20 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
#undef TORCH_DISALLOW_TEMPORARIES
protected:
// Mutable reference as it moves tensors out of TensorIteratorConfig
void populate_operands(TensorIteratorConfig&);
void populate_operands(TensorIteratorConfig& /*config*/);
void mark_outputs();
void mark_resize_outputs(const TensorIteratorConfig&);
void compute_mem_overlaps(const TensorIteratorConfig&);
void compute_shape(const TensorIteratorConfig&);
void compute_strides(const TensorIteratorConfig&);
void mark_resize_outputs(const TensorIteratorConfig& /*config*/);
void compute_mem_overlaps(const TensorIteratorConfig& /*config*/);
void compute_shape(const TensorIteratorConfig& /*config*/);
void compute_strides(const TensorIteratorConfig& /*config*/);
void reorder_dimensions();
void permute_dimensions(IntArrayRef perm);
void compute_types(const TensorIteratorConfig&);
void compute_types(const TensorIteratorConfig& /*config*/);
ScalarType compute_common_dtype();
void allocate_or_resize_outputs();
bool fast_set_up(const TensorIteratorConfig&);
FastSetupType compute_fast_setup_type(const TensorIteratorConfig&);
void compute_names(const TensorIteratorConfig&);
bool fast_set_up(const TensorIteratorConfig& /*config*/);
FastSetupType compute_fast_setup_type(const TensorIteratorConfig& /*config*/);
void compute_names(const TensorIteratorConfig& /*config*/);
void propagate_names_to_outputs();
void coalesce_dimensions();

View File

@ -20,7 +20,7 @@
namespace at {
TORCH_API int _crash_if_asan(int);
TORCH_API int _crash_if_asan(int /*arg*/);
// Converts a TensorList (i.e. ArrayRef<Tensor> to vector of TensorImpl*)
// NB: This is ONLY used by legacy TH bindings, and ONLY used by cat.

View File

@ -148,7 +148,7 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg, DeviceType device_
Banned functions
*******************************/
static Tensor binary_cross_entropy_banned(const Tensor &, const Tensor &, const std::optional<Tensor>&, int64_t) {
static Tensor binary_cross_entropy_banned(const Tensor & /*unused*/, const Tensor & /*unused*/, const std::optional<Tensor>& /*unused*/, int64_t /*unused*/) {
TORCH_CHECK(false, "torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast.\n"
"Many models use a sigmoid layer right before the binary cross entropy layer.\n"
"In this case, combine the two layers using torch.nn.functional.binary_cross_entropy_with_logits\n"

View File

@ -27,11 +27,11 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface {
HasNonWildcard
};
explicit NamedTensorMeta(HAS_NON_WILDCARD, DimnameList names)
explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, DimnameList names)
: names_(names.vec()) {
check_invariants();
}
explicit NamedTensorMeta(HAS_NON_WILDCARD, std::vector<Dimname>&& names)
explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, std::vector<Dimname>&& names)
: names_(std::move(names)) {
check_invariants();
}
@ -52,13 +52,13 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface {
std::any_of(names_.begin(), names_.end(), [](const Dimname& n) { return !n.isWildcard(); }));
}
void set_names(HAS_NON_WILDCARD, DimnameList new_names) {
void set_names(HAS_NON_WILDCARD /*unused*/, DimnameList new_names) {
TORCH_INTERNAL_ASSERT(new_names.size() == names_.size());
std::copy(new_names.begin(), new_names.end(), names_.begin());
check_invariants();
}
void set_names(HAS_NON_WILDCARD, std::vector<Dimname>&& new_names) {
void set_names(HAS_NON_WILDCARD /*unused*/, std::vector<Dimname>&& new_names) {
TORCH_INTERNAL_ASSERT(new_names.size() == names_.size());
names_ = std::move(new_names);
check_invariants();

View File

@ -13,7 +13,7 @@ class TORCH_API PythonOpRegistrationTrampoline final {
public:
// Returns true if you successfully registered yourself (that means
// you are in the hot seat for doing the operator registrations!)
static bool registerInterpreter(c10::impl::PyInterpreter*);
static bool registerInterpreter(c10::impl::PyInterpreter* /*interp*/);
// Returns nullptr if no interpreter has been registered yet.
static c10::impl::PyInterpreter* getInterpreter();

View File

@ -100,7 +100,7 @@ class TORCH_API TensorBase {
// Create a Tensor with a +0 reference count. Special care must be
// taken to avoid decrementing this reference count at destruction
// time. Intended to support MaybeOwnedTraits<Tensor>.
explicit TensorBase(unsafe_borrow_t, const TensorBase& rhs)
explicit TensorBase(unsafe_borrow_t /*unused*/, const TensorBase& rhs)
: impl_(c10::intrusive_ptr<at::TensorImpl, UndefinedTensorImpl>(rhs.impl_.get(), c10::raw::DontIncreaseRefcount{})) {}
friend MaybeOwnedTraits<TensorBase>;
@ -954,7 +954,7 @@ protected:
c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> impl_;
private:
TensorBase __dispatch_contiguous(c10::MemoryFormat) const;
TensorBase __dispatch_contiguous(c10::MemoryFormat /*memory_format*/) const;
};
inline DeviceIndex get_device(const TensorBase& self) {

View File

@ -18,10 +18,10 @@ class KernelFunction;
// implementation notes; notably, this does NOT actually go through the
// boxing/unboxing codepath.
TORCH_API void fallthrough_kernel(
OperatorKernel*,
const OperatorHandle&,
DispatchKeySet,
Stack*);
OperatorKernel* /*unused*/,
const OperatorHandle& /*unused*/,
DispatchKeySet /*unused*/,
Stack* /*unused*/);
// Note [Ambiguity in AutogradOther kernel]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -62,10 +62,10 @@ TORCH_API void fallthrough_kernel(
// than arbitrarily pick one or the other, we just register a kernel that raises
// an error and let the user decide how to proceed.
TORCH_API void ambiguous_autogradother_kernel(
OperatorKernel*,
const OperatorHandle&,
DispatchKeySet,
Stack*);
OperatorKernel* /*unused*/,
const OperatorHandle& /*op*/,
DispatchKeySet /*unused*/,
Stack* /*unused*/);
// Note [named_not_supported_kernel]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -75,10 +75,10 @@ TORCH_API void ambiguous_autogradother_kernel(
// give a good error message in cases when boxing is not supported). When
// boxing is universally supported this can be removed.
[[noreturn]] TORCH_API void named_not_supported_kernel(
OperatorKernel*,
const OperatorHandle&,
DispatchKeySet,
Stack*);
OperatorKernel* /*unused*/,
const OperatorHandle& /*op*/,
DispatchKeySet /*unused*/,
Stack* /*unused*/);
/**
* BoxedKernel is similar to a std::function storing a boxed kernel.
@ -185,16 +185,16 @@ class TORCH_API BoxedKernel final {
template <BoxedKernelFunction* func>
static void make_boxed_function(
OperatorKernel*,
OperatorKernel* /*unused*/,
const OperatorHandle& opHandle,
DispatchKeySet,
DispatchKeySet /*unused*/,
Stack* stack);
template <BoxedKernelFunction_withDispatchKeys* func>
static void make_boxed_function(
OperatorKernel*,
OperatorKernel* /*unused*/,
const OperatorHandle& opHandle,
DispatchKeySet,
DispatchKeySet /*ks*/,
Stack* stack);
explicit BoxedKernel(

View File

@ -11,9 +11,9 @@ inline BoxedKernel::BoxedKernel(
template <BoxedKernel::BoxedKernelFunction* func>
inline void BoxedKernel::make_boxed_function(
OperatorKernel*,
OperatorKernel* /*unused*/,
const OperatorHandle& opHandle,
DispatchKeySet,
DispatchKeySet /*unused*/,
Stack* stack) {
// Note that we're dropping the DispatchKeySet argument.
// See Note [Plumbing Keys Through The Dispatcher 2] for details.
@ -22,7 +22,7 @@ inline void BoxedKernel::make_boxed_function(
template <BoxedKernel::BoxedKernelFunction_withDispatchKeys* func>
inline void BoxedKernel::make_boxed_function(
OperatorKernel*,
OperatorKernel* /*unused*/,
const OperatorHandle& opHandle,
DispatchKeySet ks,
Stack* stack) {

View File

@ -10,7 +10,7 @@ namespace c10 {
// be handled specially. Its semantics is that it redispatches to the
// *next* dispatch key that would have been processed, skipping the current
// one.
void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*) {
void fallthrough_kernel(OperatorKernel* /*unused*/, const OperatorHandle& /*unused*/, DispatchKeySet /*unused*/, Stack* /*unused*/) {
TORCH_INTERNAL_ASSERT(0,
"fallthrough_kernel was executed but it should have been short-circuited by the dispatcher. "
"This could occur if you registered a fallthrough kernel as a override for a specific operator "
@ -19,7 +19,7 @@ void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet,
"let us know in the bug tracker.");
}
void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) {
void ambiguous_autogradother_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) {
TORCH_INTERNAL_ASSERT(0,
op.operator_name(), " has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. "
"This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering "
@ -32,7 +32,7 @@ void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, D
"\nCanonical state\n~~~~~~~~~~~\n", op.dumpState(), "\n\n");
}
void named_not_supported_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) {
void named_not_supported_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) {
// DO NOT LOOK AT STACK, YOU HAVE SHORT CIRCUITED BOXING
// See Note [named_not_supported_kernel]
TORCH_CHECK(0,

View File

@ -229,7 +229,7 @@ class TORCH_API KernelFunction final {
* &unboxed_func>();
*/
template <class FuncPtr, bool AllowLegacyTypes = false>
static KernelFunction makeFromUnboxedFunction(FuncPtr);
static KernelFunction makeFromUnboxedFunction(FuncPtr /*func_ptr*/);
/**
* Create a KernelFunction from an unboxed function.
@ -271,7 +271,7 @@ class TORCH_API KernelFunction final {
std::string dumpState() const;
// For testing internal invariants only
bool _equalsBoxedAndUnboxed(const KernelFunction&) const;
bool _equalsBoxedAndUnboxed(const KernelFunction& /*other*/) const;
// Register a token to be invalidated when this KernelFunction is destroyed
void registerToken(std::weak_ptr<KernelToken> token) const;

View File

@ -131,7 +131,7 @@ C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack(
new (dest++) IValue(options.pinned_memory());
}
inline void boxArgsToStack(IValue*&) {}
inline void boxArgsToStack(IValue*& /*unused*/) {}
template <typename T, typename... Args>
C10_ALWAYS_INLINE_UNLESS_MOBILE void boxArgsToStack(
@ -185,7 +185,7 @@ struct PopResult<std::tuple<Types...>> final {
template <size_t... indices>
static Result pop_to_tuple_impl(
Stack& stack,
std::index_sequence<indices...>) {
std::index_sequence<indices...> /*unused*/) {
return std::make_tuple((std::move(stack[indices]).template to<Types>())...);
}
};

View File

@ -561,7 +561,7 @@ struct wrap_kernel_functor_unboxed_<
// doesn't use &&
static ReturnType call(
OperatorKernel* functor,
DispatchKeySet,
DispatchKeySet /*unused*/,
ParameterTypes... args) {
KernelFunctor* functor_ = static_cast<KernelFunctor*>(functor);
// Note [Plumbing Keys Through The Dispatcher 2]
@ -629,8 +629,8 @@ call_functor_with_args_from_stack_(
OperatorKernel* functor,
DispatchKeySet dispatchKeySet,
Stack* stack,
std::index_sequence<ivalue_arg_indices...>,
guts::typelist::typelist<ArgTypes...>*) {
std::index_sequence<ivalue_arg_indices...> /*unused*/,
guts::typelist::typelist<ArgTypes...>* /*unused*/) {
(void)(stack); // when sizeof...(ivalue_arg_indices) == 0, this argument would
// be unused and we have to silence the compiler warning.
@ -708,7 +708,7 @@ struct push_outputs<std::tuple<OutputTypes...>, AllowDeprecatedTypes> final {
static void call_(
std::tuple<OutputTypes...>&& output,
Stack* stack,
std::index_sequence<indices...>) {
std::index_sequence<indices...> /*unused*/) {
torch::jit::push(
*stack,
return_to_ivalue<OutputTypes, AllowDeprecatedTypes>::call(
@ -718,7 +718,7 @@ struct push_outputs<std::tuple<OutputTypes...>, AllowDeprecatedTypes> final {
static void copy_(
const std::tuple<OutputTypes...>& output,
Stack* stack,
std::index_sequence<indices...>) {
std::index_sequence<indices...> /*unused*/) {
torch::jit::push(
*stack,
return_to_ivalue<OutputTypes, AllowDeprecatedTypes>::copy(
@ -741,7 +741,7 @@ struct make_boxed_from_unboxed_functor final {
static void call(
OperatorKernel* functor,
const OperatorHandle&,
const OperatorHandle& /*unused*/,
DispatchKeySet dispatchKeySet,
Stack* stack) {
using ReturnType =

View File

@ -63,13 +63,13 @@ struct BuiltinOpFunction : public Function {
bool call(
Stack& stack,
std::optional<size_t>,
c10::function_ref<void(const Code&)>) override {
std::optional<size_t> /*unused*/,
c10::function_ref<void(const Code&)> /*unused*/) override {
run(stack);
return false;
}
bool call(Stack& stack, c10::function_ref<void(const mobile::Code&)>)
bool call(Stack& stack, c10::function_ref<void(const mobile::Code&)> /*unused*/)
override {
run(stack);
return false;

View File

@ -80,7 +80,8 @@ struct MultiDispatchKeySet : at::IterArgs<MultiDispatchKeySet> {
ts = ts | x.key_set();
}
}
[[noreturn]] void operator()(at::ArrayRef<std::optional<at::Tensor>>) {
[[noreturn]] void operator()(
at::ArrayRef<std::optional<at::Tensor>> /*unused*/) {
// Just checking that the handling of Tensor?[] didn't change.
TORCH_INTERNAL_ASSERT(false);
}
@ -95,7 +96,7 @@ struct MultiDispatchKeySet : at::IterArgs<MultiDispatchKeySet> {
}
}
template <typename T>
void operator()(const T&) {
void operator()(const T& /*unused*/) {
// do nothing
}
};

View File

@ -633,7 +633,7 @@ class TypedOperatorHandle<Return(Args...)> final : public OperatorHandle {
namespace detail {
template <class... Args>
inline void unused_arg_(const Args&...) {}
inline void unused_arg_(const Args&... /*unused*/) {}
// CaptureKernelCall is intended to capture return values from Dispatcher
// unboxed kernel calls. A record function may request to get outputs from the

View File

@ -105,7 +105,7 @@ class TORCH_API OperatorEntry final {
// versa that is an error. (Refcounting for the registrations is
// handled in the OperatorHandle in Dispatcher)
void registerSchema(
FunctionSchema&&,
FunctionSchema&& /*schema*/,
std::string&& debug,
std::vector<at::Tag> tags = {});
void deregisterSchema();

View File

@ -177,7 +177,7 @@ bool DynamicType::equals(const Type& rhs) const {
return equals(*create(rhs));
}
bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream*) const {
bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream* /*why_not*/) const {
auto other = create(rhs);
if (tag_ == other->tag_) {
if (equals(*other)) {
@ -371,7 +371,7 @@ DynamicTypePtr ivalue::TupleTypeFactory<c10::DynamicType>::create(
}
DynamicTypePtr ivalue::TupleTypeFactory<c10::DynamicType>::fallback(
const Type&) {
const Type& /*unused*/) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
return nullptr;
}

View File

@ -138,8 +138,8 @@ class DynamicType : public SharedType {
struct Arguments {
Arguments() = default;
Arguments(c10::ArrayRef<TypePtr>);
Arguments(const std::vector<std::string_view>&, c10::ArrayRef<TypePtr>);
Arguments(c10::ArrayRef<TypePtr> /*args*/);
Arguments(const std::vector<std::string_view>& /*names*/, c10::ArrayRef<TypePtr> /*args*/);
std::vector<LabeledDynamicType> elems;
};
@ -156,15 +156,15 @@ class DynamicType : public SharedType {
static const TypeKind Kind = TypeKind::DynamicType;
static TORCH_API DynamicTypePtr create(Type& ty);
explicit DynamicType(Tag, Arguments);
explicit DynamicType(Tag, std::string_view, Arguments);
explicit DynamicType(Tag /*tag*/, Arguments /*arguments*/);
explicit DynamicType(Tag /*tag*/, std::string_view /*name*/, Arguments /*arguments*/);
DynamicType(DynamicType&& other) = delete;
DynamicType(const DynamicType&) = delete;
DynamicType& operator=(const DynamicType&) = delete;
DynamicType& operator=(DynamicType&&) = delete;
TypePtr containedType(size_t) const override;
TypePtr containedType(size_t /*i*/) const override;
size_t containedTypeSize() const override;
Tag tag() const {
return tag_;

View File

@ -96,15 +96,15 @@ struct TORCH_API Function {
// Overload for server interpreter, a bailout size is needed for graph
// executor.
virtual bool call(
Stack&,
std::optional<size_t>,
c10::function_ref<void(const Code&)>) {
Stack& /*unused*/,
std::optional<size_t> /*unused*/,
c10::function_ref<void(const Code&)> /*unused*/) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
return false;
}
// Overload for mobile interpreter.
virtual bool call(Stack&, c10::function_ref<void(const mobile::Code&)>) {
virtual bool call(Stack& /*unused*/, c10::function_ref<void(const mobile::Code&)> /*unused*/) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
return false;
}

View File

@ -847,7 +847,7 @@ struct TORCH_API IValue final {
IValue(std::optional<T> v);
template <class T, enable_if_list_is_ivalue_constructible<T> = nullptr>
IValue(c10::OptionalArrayRef<T> v);
IValue(std::nullopt_t);
IValue(std::nullopt_t /*unused*/);
// ClassType
IValue(c10::intrusive_ptr<ivalue::Object> v);

View File

@ -660,7 +660,7 @@ struct TORCH_API TupleTypeFactory<TupleType> {
template <>
struct TORCH_API TupleTypeFactory<c10::DynamicType> {
static DynamicTypePtr create(const std::vector<TypePtr>& elemTypes);
static DynamicTypePtr fallback(const Type&);
static DynamicTypePtr fallback(const Type& /*unused*/);
};
struct TORCH_API Tuple : c10::intrusive_ptr_target {
@ -1682,7 +1682,7 @@ struct ivalue::EnumHolder : c10::intrusive_ptr_target {
namespace detail {
struct _guarded_unsigned_long_unique_dummy final {
_guarded_unsigned_long_unique_dummy(int64_t){}
_guarded_unsigned_long_unique_dummy(int64_t /*unused*/){}
};
using _guarded_unsigned_long = std::conditional_t<
std::is_same_v<unsigned long, uint32_t> ||
@ -1776,7 +1776,7 @@ template <class Elem>
// native_functions.yaml still return std::vector.
// C10_DEPRECATED_MESSAGE("IValues based on std::vector<T> are potentially slow
// and deprecated. Please use torch::List<T> instead.")
std::vector<Elem> generic_to(IValue ivalue, _fake_type<std::vector<Elem>>) {
std::vector<Elem> generic_to(IValue ivalue, _fake_type<std::vector<Elem>> /*unused*/) {
// We need to do a deep copy of the vector because there might be other
// references to this same IValue that also use the list. We can't just
// move the elements out.
@ -1826,18 +1826,18 @@ c10::intrusive_ptr<T> IValue::toCustomClass() const& {
}
template <typename T>
T generic_to(IValue ivalue, _fake_type<T>) {
T generic_to(IValue ivalue, _fake_type<T> /*unused*/) {
using ElemType = typename std::remove_pointer<T>::type::element_type;
return std::move(ivalue).template toCustomClass<ElemType>();
}
template <typename T>
tagged_capsule<T> generic_to(IValue ivalue, _fake_type<tagged_capsule<T>>) {
tagged_capsule<T> generic_to(IValue ivalue, _fake_type<tagged_capsule<T>> /*unused*/) {
return tagged_capsule<T>{std::move(ivalue)};
}
template <typename Elem>
c10::List<Elem> generic_to(IValue ivalue, _fake_type<c10::List<Elem>>) {
c10::List<Elem> generic_to(IValue ivalue, _fake_type<c10::List<Elem>> /*unused*/) {
return impl::toTypedList<Elem>(std::move(ivalue).toList());
}
@ -1867,7 +1867,7 @@ std::vector<T> createVectorFromList(const c10::List<T>& impl) {
}
template <typename T>
OptionalArray<T> generic_to(IValue ivalue, _fake_type<OptionalArray<T>>) {
OptionalArray<T> generic_to(IValue ivalue, _fake_type<OptionalArray<T>> /*unused*/) {
if (ivalue.isNone()) {
return {};
}
@ -1880,8 +1880,8 @@ namespace detail {
template <typename Elem, size_t... I>
std::array<Elem, sizeof...(I)> generic_to_array(
IValue ivalue,
_fake_type<std::array<Elem, sizeof...(I)>>,
std::index_sequence<I...>) {
_fake_type<std::array<Elem, sizeof...(I)>> /*unused*/,
std::index_sequence<I...> /*unused*/) {
// We need to do a deep copy of the array because there might be other
// references to this same IValue that also use the list. We can't just
// move the elements out.
@ -1906,7 +1906,7 @@ std::array<Elem, N> generic_to(
template <typename Key, typename Value>
c10::Dict<Key, Value> generic_to(
IValue ivalue,
_fake_type<c10::Dict<Key, Value>>) {
_fake_type<c10::Dict<Key, Value>> /*unused*/) {
return impl::toTypedDict<Key, Value>(std::move(ivalue).toGenericDict());
}
@ -1915,7 +1915,7 @@ C10_DEPRECATED_MESSAGE(
"IValues based on std::unordered_map are slow and deprecated. Please use c10::Dict<K, V> instead.")
std::unordered_map<K, V> generic_to(
IValue ivalue,
_fake_type<std::unordered_map<K, V>>) {
_fake_type<std::unordered_map<K, V>> /*unused*/) {
std::unordered_map<K, V> specialized_dict;
for (const auto& item : std::move(ivalue).toGenericDict()) {
@ -1926,7 +1926,7 @@ std::unordered_map<K, V> generic_to(
}
template <typename T>
std::optional<T> generic_to(IValue ivalue, _fake_type<std::optional<T>>) {
std::optional<T> generic_to(IValue ivalue, _fake_type<std::optional<T>> /*unused*/) {
if (ivalue.isNone()) {
return std::nullopt;
}
@ -1937,7 +1937,7 @@ namespace detail {
template <typename Tuple, std::size_t... INDEX>
Tuple generic_to_tuple_impl(
const ivalue::TupleElements& t,
std::index_sequence<INDEX...>) {
std::index_sequence<INDEX...> /*unused*/) {
return std::make_tuple(
t[INDEX].to<typename std::tuple_element<INDEX, Tuple>::type>()...);
}
@ -1951,7 +1951,7 @@ template <
std::is_lvalue_reference<Args>...,
std::negation<std::is_constructible<IValue, Args>>...>,
std::nullptr_t> = nullptr>
std::tuple<Args...> generic_to(const IValue& ivalue, _fake_type<std::tuple<Args...>>) {
std::tuple<Args...> generic_to(const IValue& ivalue, _fake_type<std::tuple<Args...>> /*unused*/) {
const auto& vals = ivalue.toTupleRef().elements();
TORCH_CHECK(vals.size() == sizeof...(Args));
return detail::generic_to_tuple_impl<std::tuple<Args...>>(vals, Indices{});
@ -2311,7 +2311,7 @@ inline IValue::IValue(std::optional<T> v) : IValue() {
}
}
inline IValue::IValue(std::nullopt_t) : IValue() {}
inline IValue::IValue(std::nullopt_t /*unused*/) : IValue() {}
inline IValue::IValue(c10::intrusive_ptr<ivalue::Object> v)
: tag(Tag::Object) {
@ -2482,15 +2482,15 @@ namespace ivalue {
namespace detail {
template <typename T>
IValue from_(T&& x, std::true_type) {
IValue from_(T&& x, std::true_type /*unused*/) {
return IValue(std::forward<T>(x));
}
template <typename T>
IValue from_(c10::intrusive_ptr<T> x, std::false_type) {
IValue from_(c10::intrusive_ptr<T> x, std::false_type /*unused*/) {
return IValue(std::move(x));
}
template <typename T>
IValue from_(T&& /*x*/, std::false_type) {
IValue from_(T&& /*x*/, std::false_type /*unused*/) {
static_assert(
guts::false_t<T>::value,
"You are calling from with a type that it doesn't support, and isn't a potential custom class (ie: is an intrusive_ptr)");
@ -2546,19 +2546,19 @@ struct MaybeOwnedTraits<IValue> {
return &borrow;
}
static bool debugBorrowIsValid(const borrow_type&) {
static bool debugBorrowIsValid(const borrow_type& /*unused*/) {
return true;
}
};
template <>
struct IValue::TagType<c10::Type> {
static TORCH_API c10::TypePtr get(const IValue&);
static TORCH_API c10::TypePtr get(const IValue& /*v*/);
};
template <>
struct IValue::TagType<c10::DynamicType> {
static TORCH_API c10::TypePtr get(const IValue&);
static TORCH_API c10::TypePtr get(const IValue& /*v*/);
};
template <typename T>

View File

@ -44,7 +44,7 @@ constexpr int checkStaticTypes() {
}
template <typename... Ts, size_t... Is>
constexpr std::array<ArgumentDef, sizeof...(Ts)> createArgumentVectorFromTypes(std::index_sequence<Is...>) {
constexpr std::array<ArgumentDef, sizeof...(Ts)> createArgumentVectorFromTypes(std::index_sequence<Is...> /*unused*/) {
return (
// Check types for common errors
checkStaticTypes<Ts...>(),

View File

@ -83,7 +83,7 @@ inline bool operator!=(const OperatorName& lhs, const OperatorName& rhs) {
}
TORCH_API std::string toString(const OperatorName& opName);
TORCH_API std::ostream& operator<<(std::ostream&, const OperatorName&);
TORCH_API std::ostream& operator<<(std::ostream& /*os*/, const OperatorName& /*opName*/);
} // namespace c10

View File

@ -16,7 +16,7 @@ class SingletonTypePtr {
/* implicit */ SingletonTypePtr(T* p) : repr_(p) {}
// We need this to satisfy Pybind11, but it shouldn't be hit.
explicit SingletonTypePtr(std::shared_ptr<T>) { TORCH_CHECK(false); }
explicit SingletonTypePtr(std::shared_ptr<T> /*unused*/) { TORCH_CHECK(false); }
using element_type = typename std::shared_ptr<T>::element_type;

View File

@ -307,8 +307,8 @@ Vectorized<c10::BFloat16> inline operator/(
}
inline Vectorized<BFloat16>::Vectorized() {
const short zero = 0;
values = svdup_n_bf16(c10::bit_cast<bfloat16_t>(zero));
auto vals_f = svdup_n_f32(0);
values = convert_float_bfloat16(vals_f, vals_f);
}
inline Vectorized<BFloat16>::Vectorized(int val) {

View File

@ -342,19 +342,19 @@ class Vectorized<c10::complex<double>> {
return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ);
}
Vectorized<c10::complex<double>> operator<(
const Vectorized<c10::complex<double>>&) const {
const Vectorized<c10::complex<double>>& /*unused*/) const {
TORCH_CHECK(false, "not supported for complex numbers");
}
Vectorized<c10::complex<double>> operator<=(
const Vectorized<c10::complex<double>>&) const {
const Vectorized<c10::complex<double>>& /*unused*/) const {
TORCH_CHECK(false, "not supported for complex numbers");
}
Vectorized<c10::complex<double>> operator>(
const Vectorized<c10::complex<double>>&) const {
const Vectorized<c10::complex<double>>& /*unused*/) const {
TORCH_CHECK(false, "not supported for complex numbers");
}
Vectorized<c10::complex<double>> operator>=(
const Vectorized<c10::complex<double>>&) const {
const Vectorized<c10::complex<double>>& /*unused*/) const {
TORCH_CHECK(false, "not supported for complex numbers");
}


@ -1861,6 +1861,8 @@ template bool gemm_and_bias(
int64_t result_ld,
GEMMAndBiasActivationEpilogue activation);
using at::blas::ScalingType;
int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fast_accum) {
switch (scaling_type) {
case ScalingType::BlockWise1x32:


@ -14,6 +14,7 @@
*/
#include <ATen/cuda/CUDAContext.h>
#include <ATen/BlasBackend.h>
#include <ATen/OpMathType.h>
namespace at::cuda::blas {
@ -136,15 +137,6 @@ void int8_gemm(
int32_t* result_ptr,
int64_t result_ld);
enum class ScalingType : std::uint8_t {
TensorWise, // fp32 scales
RowWise, // fp32 scales
BlockWise1x16, // fp8_e4m3fn scales
BlockWise1x32, // fp8_e8m0fnu scales
BlockWise1x128, // fp32 scales
BlockWise128x128, // fp32 scales
};
void scaled_gemm(
char transa,
char transb,
@ -156,13 +148,13 @@ void scaled_gemm(
int64_t mat1_ld,
ScalarType mat1_dtype,
ScalarType mat1_scale_dtype,
ScalingType mat1_scaling_type,
at::blas::ScalingType mat1_scaling_type,
const void* mat2_ptr,
const void* mat2_scale_ptr,
int64_t mat2_ld,
ScalarType mat2_dtype,
ScalarType mat2_scale_dtype,
ScalingType mat2_scaling_type,
at::blas::ScalingType mat2_scaling_type,
const void* bias_ptr,
ScalarType bias_dtype,
void* result_ptr,


@ -17,7 +17,7 @@ TORCH_CUDA_CPP_API void set_magma_init_fn(void (*magma_init_fn)());
// The real implementation of CUDAHooksInterface
struct CUDAHooks : public at::CUDAHooksInterface {
CUDAHooks(at::CUDAHooksArgs) {}
CUDAHooks(at::CUDAHooksArgs /*unused*/) {}
void init() const override;
Device getDeviceFromPtr(void* data) const override;
bool isPinnedPtr(const void* data) const override;


@ -29,7 +29,7 @@
namespace at::cuda::tunable {
using at::cuda::blas::ScalingType;
using at::blas::ScalingType;
enum class BlasOp {
N = 0,


@ -29,7 +29,7 @@ template <typename ParamsT>
class Callable {
public:
virtual ~Callable() = default;
virtual TuningStatus Call(const ParamsT*) {
virtual TuningStatus Call(const ParamsT* /*unused*/) {
return FAIL;
}
virtual TuningStatus IsSupported(const ParamsT* params) {


@ -25,7 +25,7 @@ struct TORCH_API HPUHooksInterface : AcceleratorHooksInterface {
false, "Cannot get device of pointer on HPU without HPU backend");
}
bool isPinnedPtr(const void*) const override {
bool isPinnedPtr(const void* /*data*/) const override {
return false;
}


@ -410,7 +410,7 @@ struct ExistingBdimBatchRuleHelper<F, Func, c10::guts::typelist::typelist<A, T..
template <typename F, F Method, typename... ExtraArgs>
Tensor& unary_inplace_batch_rule(Tensor& self, std::optional<int64_t>, ExtraArgs... extra_args) {
Tensor& unary_inplace_batch_rule(Tensor& self, std::optional<int64_t> /*unused*/, ExtraArgs... extra_args) {
INVOKE(self, Method)(std::forward<ExtraArgs>(extra_args)...);
return self;
}


@ -18,7 +18,7 @@ extern std::atomic<const MetalInterface*> g_metal_impl_registry;
class MetalImplRegistrar {
public:
explicit MetalImplRegistrar(MetalInterface*);
explicit MetalImplRegistrar(MetalInterface* /*impl*/);
};
at::Tensor& metal_copy_(at::Tensor& self, const at::Tensor& src);


@ -2060,7 +2060,7 @@ std::tuple<Tensor, Tensor> linalg_lu_factor(const Tensor& A, bool pivot) {
}
// TODO Deprecate this function in favour of linalg_lu_factor_ex
std::tuple<Tensor, Tensor, Tensor> _lu_with_info(const Tensor& self, bool compute_pivots, bool) {
std::tuple<Tensor, Tensor, Tensor> _lu_with_info(const Tensor& self, bool compute_pivots, bool /*unused*/) {
TORCH_WARN_ONCE(
"torch.lu is deprecated in favor of torch.linalg.lu_factor / torch.linalg.lu_factor_ex and will be ",
"removed in a future PyTorch release.\n",


@ -15,7 +15,11 @@ namespace at::native {
Scalar item(const Tensor& self) {
auto numel = self.sym_numel();
TORCH_CHECK(numel == 1, "a Tensor with ", numel, " elements cannot be converted to Scalar");
TORCH_SYM_CHECK(
numel.sym_eq(1),
"a Tensor with ",
numel,
" elements cannot be converted to Scalar");
if (self.is_sparse()) {
if (self._nnz() == 0) return Scalar(0);
if (self.is_coalesced()) return at::_local_scalar_dense(self._values());


@ -346,17 +346,17 @@ template<typename acc_t>
struct AbsSwitch {};
template<typename scalar_t, typename acc_t>
inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch<acc_t>) {
inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch<acc_t> /*unused*/) {
return static_cast<acc_t>(data);
}
template<typename scalar_t, typename acc_t>
inline C10_DEVICE acc_t abs_if_complex(std::complex<scalar_t> data, AbsSwitch<acc_t>) {
inline C10_DEVICE acc_t abs_if_complex(std::complex<scalar_t> data, AbsSwitch<acc_t> /*unused*/) {
return static_cast<acc_t>(std::abs(data));
}
template<typename scalar_t, typename acc_t>
inline C10_DEVICE acc_t abs_if_complex(c10::complex<scalar_t> data, AbsSwitch<acc_t>) {
inline C10_DEVICE acc_t abs_if_complex(c10::complex<scalar_t> data, AbsSwitch<acc_t> /*unused*/) {
return static_cast<acc_t>(std::abs(at::opmath_type<c10::complex<scalar_t>>(data)));
}


@ -846,7 +846,7 @@ TORCH_IMPL_FUNC(clamp_Tensor_out)
(const Tensor& self,
const OptionalTensorRef min,
const OptionalTensorRef max,
const Tensor&) {
const Tensor& /*unused*/) {
if (min && max) {
clamp_stub(device_type(), *this);
} else if (min) {


@ -23,6 +23,14 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_cast_Byte_native.h>
#include <ATen/ops/_cast_Char_native.h>
#include <ATen/ops/_cast_Double_native.h>
#include <ATen/ops/_cast_Float_native.h>
#include <ATen/ops/_cast_Half_native.h>
#include <ATen/ops/_cast_Int_native.h>
#include <ATen/ops/_cast_Long_native.h>
#include <ATen/ops/_cast_Short_native.h>
#include <ATen/ops/_dim_arange_native.h>
#include <ATen/ops/_efficientzerotensor_native.h>
#include <ATen/ops/_empty_affine_quantized.h>


@ -452,11 +452,11 @@ void convolution_depthwise3x3_winograd_impl(
#else
void convolution_depthwise3x3_winograd_impl(
const Arguments&,
const float* const,
const float* const,
const float* const,
float* const) {
const Arguments& /*unused*/,
const float* const /*unused*/,
const float* const /*unused*/,
const float* const /*unused*/,
float* const /*unused*/) {
}
#endif /* __ARM_NEON__ */


@ -46,7 +46,7 @@ using namespace vec;
template <typename traits, std::size_t... INDEX>
typename traits::ArgsTuple
dereference_impl(char* C10_RESTRICT data[], const int64_t* strides, int64_t i,
std::index_sequence<INDEX...>) {
std::index_sequence<INDEX...> /*unused*/) {
return std::make_tuple(
c10::load<typename traits::template arg<INDEX>::type>(
data[INDEX] + i * strides[INDEX])...);
@ -65,7 +65,7 @@ dereference_vec_impl(char* C10_RESTRICT data[],
const typename traits::result_type& opt_scalar,
size_t S,
int64_t i,
std::index_sequence<INDEX...>) {
std::index_sequence<INDEX...> /*unused*/) {
using Vec = typename traits::result_type;
using scalar_t = typename Vec::value_type;
return std::make_tuple(
@ -231,7 +231,7 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve
template <typename traits, typename cb_t>
inline void unroll_contiguous_scalar_checks(
const int64_t* /*strides*/,
std::index_sequence<>,
std::index_sequence<> /*unused*/,
cb_t&& cb) {
cb(0);
}
@ -239,7 +239,7 @@ inline void unroll_contiguous_scalar_checks(
template <typename traits, typename cb_t, size_t INDEX0, size_t ...INDEX>
inline void unroll_contiguous_scalar_checks(
const int64_t* strides,
std::index_sequence<INDEX0, INDEX...>,
std::index_sequence<INDEX0, INDEX...> /*unused*/,
cb_t&& cb) {
if (is_contiguous_scalar<traits, INDEX0 + 1>(strides)) {
cb(INDEX0 + 1);


@ -4,6 +4,7 @@
#include <c10/util/SmallVector.h>
#include <c10/core/Scalar.h>
#include <c10/core/ScalarType.h>
#include <c10/util/Exception.h>
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/core/NamedTensor.h>
@ -105,7 +106,8 @@ c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, b
}
}
using at::cuda::blas::ScalingType;
using at::blas::ScalingType;
using at::blas::SwizzleType;
/**
* @brief Prepares matrices for CUBLAS operation
@ -1112,7 +1114,7 @@ namespace{
* - Returns Error.
*/
using at::cuda::blas::ScalingType;
using at::blas::ScalingType;
bool is_tensorwise_scaling(const at::Tensor& t, const at::Tensor& scale) {
return isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && scale.numel() == 1;
@ -1679,9 +1681,890 @@ _scaled_mm_cuda(const Tensor& mat_a, const Tensor& mat_b,
bool use_fast_accum) {
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
return _scaled_mm_out_cuda(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out);
}
/**
* Track concrete implementations available
*/
enum class ScaledGemmImplementation {
NONE = 0,
TENSORWISE_TENSORWISE = 1,
ROWWISE_ROWWISE = 2,
BLOCK_128x128_1x128 = 3,
BLOCK_1x128_128x128 = 4,
BLOCK_1x128_1x128 = 5,
MXFP8_MXFP8 = 6,
NVFP4_NVFP4 = 7,
NVFP4_NVFP4_SINGLE_SCALE = 8,
};
/**
* Convert passed int (enum) from python back into a
* strictly-typed enum
*/
template <class EnumType, class ArrayType>
std::vector<EnumType> convert_int_to_enum(ArrayType& v) {
std::vector<EnumType> converted;
converted.reserve(v.size());
for (auto vi : v) {
converted.push_back(static_cast<EnumType>(vi));
}
return converted;
}
/**
* Both inputs must be fp8,
* Each needs a single scale, {Tensorwise (float)}
*/
bool check_tensorwise_recipe(c10::ScalarType type_a,
std::vector<ScalingType>& recipe_a,
ArrayRef<Tensor>& scales_a,
c10::ScalarType type_b,
std::vector<ScalingType>& recipe_b,
ArrayRef<Tensor>& scales_b) {
// both types must be fp8
if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) {
return false;
}
// 1 scale each, {Tensorwise, float}
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) {
return false;
}
// Need {TensorWise, float} for A & B
if (recipe_a[0] != ScalingType::TensorWise) return false;
if (scales_a[0].scalar_type() != ScalarType::Float) return false;
if (recipe_b[0] != ScalingType::TensorWise) return false;
if (scales_b[0].scalar_type() != ScalarType::Float) return false;
return true;
}
/**
* Both inputs must be fp8,
* Each needs scales, {Rowwise (float)}
*/
bool check_rowwise_recipe(c10::ScalarType type_a,
std::vector<ScalingType>& recipe_a,
ArrayRef<Tensor>& scales_a,
c10::ScalarType type_b,
std::vector<ScalingType>& recipe_b,
ArrayRef<Tensor>& scales_b) {
// both types must be fp8
if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) {
return false;
}
// 1 scale each, {RowWise, float}
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) {
return false;
}
// Need {RowWise, fp32} for A & B
if (recipe_a[0] != ScalingType::RowWise) return false;
if (scales_a[0].scalar_type() != ScalarType::Float) return false;
if (recipe_b[0] != ScalingType::RowWise) return false;
if (scales_b[0].scalar_type() != ScalarType::Float) return false;
return true;
}
/**
* Two-level scaling, canonical NVFP4
* Both inputs must be fp4
* A, B need 2 scales, {Blockwise_1x16 (e4m3), Tensorwise (fp32)}
*/
bool check_nvfp4_recipe(c10::ScalarType type_a,
std::vector<ScalingType>& recipe_a,
ArrayRef<Tensor>& scales_a,
c10::ScalarType type_b,
std::vector<ScalingType>& recipe_b,
ArrayRef<Tensor>& scales_b) {
// both types must be fp4
if (type_a != ScalarType::Float4_e2m1fn_x2 || type_b != ScalarType::Float4_e2m1fn_x2) {
return false;
}
// 2 scales, 2 recipes for each input
if (scales_a.size() != 2 || recipe_a.size() != 2 || scales_b.size() != 2 || recipe_b.size() != 2) {
return false;
}
// Need {Blockwise_1x16, e4m3} for scale[0], {Tensorwise, fp32} for scale[1]
if (recipe_a[0] != ScalingType::BlockWise1x16 || recipe_a[1] != ScalingType::TensorWise) return false;
if (scales_a[0].scalar_type() != ScalarType::Float8_e4m3fn || scales_a[1].scalar_type() != ScalarType::Float) return false;
if (recipe_b[0] != ScalingType::BlockWise1x16 || recipe_b[1] != ScalingType::TensorWise) return false;
if (scales_b[0].scalar_type() != ScalarType::Float8_e4m3fn || scales_b[1].scalar_type() != ScalarType::Float) return false;
return true;
}
/**
* Single-level scaling, what PyT currently understands
* Both inputs must be fp4
* A, B need 1 scale, {Blockwise_1x16 (e4m3)}
*/
bool check_nvfp4_recipe_single_scale
(c10::ScalarType type_a,
std::vector<ScalingType>& recipe_a,
ArrayRef<Tensor>& scales_a,
c10::ScalarType type_b,
std::vector<ScalingType>& recipe_b,
ArrayRef<Tensor>& scales_b) {
// both types must be fp4
if (type_a != ScalarType::Float4_e2m1fn_x2 || type_b != ScalarType::Float4_e2m1fn_x2) {
return false;
}
// 1 scale, 1 recipe for each input
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) {
return false;
}
// Need {Blockwise_1x16, e4m3} for A & B
if (recipe_a[0] != ScalingType::BlockWise1x16) return false;
if (scales_a[0].scalar_type() != ScalarType::Float8_e4m3fn) return false;
if (recipe_b[0] != ScalingType::BlockWise1x16) return false;
if (scales_b[0].scalar_type() != ScalarType::Float8_e4m3fn) return false;
return true;
}
/**
* Both inputs must be fp8
* A, B must only have 1 scale each, e.g. A: Blockwise_1x128 (float), B: Blockwise_128x128 (float)
*/
bool check_deepseek_recipe(ScalingType expected_recipe_a,
ScalingType expected_recipe_b,
c10::ScalarType type_a,
std::vector<ScalingType>& recipe_a,
ArrayRef<Tensor>& scales_a,
c10::ScalarType type_b,
std::vector<ScalingType>& recipe_b,
ArrayRef<Tensor>& scales_b) {
// both types must be fp8
if (type_a != ScalarType::Float8_e4m3fn || type_b != ScalarType::Float8_e4m3fn) {
return false;
}
// 1 scale, 1 recipe for each input
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) {
return false;
}
// Need {expected_recipe_a, float} for A, {expected_recipe_b, float} for B
if (recipe_a[0] != expected_recipe_a) return false;
if (scales_a[0].scalar_type() != ScalarType::Float) return false;
if (recipe_b[0] != expected_recipe_b) return false;
if (scales_b[0].scalar_type() != ScalarType::Float) return false;
return true;
}
/**
* Both inputs must be fp8
* A, B must have 1 scale each, {Blockwise_1x32, e8m0}
*/
bool check_mxfp8_recipe(c10::ScalarType type_a,
std::vector<ScalingType>& recipe_a,
ArrayRef<Tensor>& scales_a,
c10::ScalarType type_b,
std::vector<ScalingType>& recipe_b,
ArrayRef<Tensor>& scales_b) {
// both types must be fp8
if (type_a != ScalarType::Float8_e4m3fn || type_b != ScalarType::Float8_e4m3fn) {
return false;
}
// 1 scale, 1 recipe for each input
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) {
return false;
}
// Need {Blockwise_1x32, e8m0} for A & B
if (recipe_a[0] != ScalingType::BlockWise1x32) return false;
if (scales_a[0].scalar_type() != ScalarType::Float8_e8m0fnu) return false;
if (recipe_b[0] != ScalingType::BlockWise1x32) return false;
if (scales_b[0].scalar_type() != ScalarType::Float8_e8m0fnu) return false;
return true;
}
using acceptance_fn = std::function<bool(c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&, c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&)>;
using namespace std::placeholders;
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 8> scale_kernel_dispatch = {{
{ "tensorwise_tensorwise", check_tensorwise_recipe, ScaledGemmImplementation::TENSORWISE_TENSORWISE },
{ "rowwise_rowwise", check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE},
{ "block_1x128_128x128", std::bind(check_deepseek_recipe, ScalingType::BlockWise1x128, ScalingType::BlockWise128x128, _1, _2, _3, _4, _5, _6),
ScaledGemmImplementation::BLOCK_1x128_128x128},
{ "block_128x128_1x128", std::bind(check_deepseek_recipe, ScalingType::BlockWise128x128, ScalingType::BlockWise1x128, _1, _2, _3, _4, _5, _6),
ScaledGemmImplementation::BLOCK_128x128_1x128},
{ "block_1x128_1x128", std::bind(check_deepseek_recipe, ScalingType::BlockWise1x128, ScalingType::BlockWise1x128, _1, _2, _3, _4, _5, _6),
ScaledGemmImplementation::BLOCK_1x128_1x128},
{ "nvfp4_nvfp4", check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4},
{ "nvfp4_nvfp4_single_scale", check_nvfp4_recipe_single_scale, ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE },
{ "mxfp8_mxfp8", check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8}}};
Tensor&
_cutlass_scaled_gemm(
const Tensor& mat1, const Tensor& mat2,
const Tensor& scale_a, const Tensor& scale_b,
const ScalingType scaling_choice_a, const ScalingType scaling_choice_b,
const std::optional<Tensor>& bias,
const bool use_fast_accum,
Tensor& out) {
cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, std::nullopt, scaling_choice_a, scaling_choice_b);
const auto out_dtype_ = args.result->scalar_type();
TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
#ifdef USE_ROCM
auto tuning_ctx = at::cuda::tunable::getTuningContext();
if (tuning_ctx->IsTunableOpEnabled()) {
#define TUNABLE_DISPATCH(BLASOP_A, BLASOP_B) \
if (mat1.scalar_type() == ScalarType::Float8_e4m3fnuz) { \
if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \
static at::cuda::tunable::ScaledGemmTunableOp< \
at::Float8_e4m3fnuz, at::Float8_e4m3fnuz, scalar_t, \
BLASOP_A, BLASOP_B> scaledgemm{}; \
scaledgemm(&params); \
} \
else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \
static at::cuda::tunable::ScaledGemmTunableOp< \
at::Float8_e4m3fnuz, at::Float8_e5m2fnuz, scalar_t, \
BLASOP_A, BLASOP_B> scaledgemm{}; \
scaledgemm(&params); \
} \
} \
else if (mat1.scalar_type() == ScalarType::Float8_e5m2fnuz) { \
if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \
static at::cuda::tunable::ScaledGemmTunableOp< \
at::Float8_e5m2fnuz, at::Float8_e4m3fnuz, scalar_t, \
BLASOP_A, BLASOP_B> scaledgemm{}; \
scaledgemm(&params); \
} \
else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \
static at::cuda::tunable::ScaledGemmTunableOp< \
at::Float8_e5m2fnuz, at::Float8_e5m2fnuz, scalar_t, \
BLASOP_A, BLASOP_B> scaledgemm{}; \
scaledgemm(&params); \
} \
} \
else if (mat1.scalar_type() == ScalarType::Float8_e4m3fn) { \
if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \
static at::cuda::tunable::ScaledGemmTunableOp< \
at::Float8_e4m3fn, at::Float8_e4m3fn, scalar_t, \
BLASOP_A, BLASOP_B> scaledgemm{}; \
scaledgemm(&params); \
} \
else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \
static at::cuda::tunable::ScaledGemmTunableOp< \
at::Float8_e4m3fn, at::Float8_e5m2, scalar_t, \
BLASOP_A, BLASOP_B> scaledgemm{}; \
scaledgemm(&params); \
} \
} \
else if (mat1.scalar_type() == ScalarType::Float8_e5m2) { \
if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \
static at::cuda::tunable::ScaledGemmTunableOp< \
at::Float8_e5m2, at::Float8_e4m3fn, scalar_t, \
BLASOP_A, BLASOP_B> scaledgemm{}; \
scaledgemm(&params); \
} \
else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \
static at::cuda::tunable::ScaledGemmTunableOp< \
at::Float8_e5m2, at::Float8_e5m2, scalar_t, \
BLASOP_A, BLASOP_B> scaledgemm{}; \
scaledgemm(&params); \
} \
}
AT_DISPATCH_V2(out_dtype_, "_tunable_scaled_gemm", AT_WRAP([&] {
bool transa_ = ((args.transa != 'n') && (args.transa != 'N'));
bool transb_ = ((args.transb != 'n') && (args.transb != 'N'));
at::cuda::tunable::ScaledGemmParams<scalar_t> params;
params.transa = args.transa;
params.transb = args.transb;
params.m = args.m;
params.n = args.n;
params.k = args.k;
params.a = args.mata->data_ptr();
params.a_scale_ptr = args.scale_mata_ptr;
params.a_scale_dtype = args.scale_mata_dtype.value();
params.lda = args.lda;
params.a_dtype = args.mata->scalar_type();
params.a_scale_dtype = args.scale_mata_dtype.value();
params.a_scaling_type = args.scaling_mata_type.value();
params.b = args.matb->data_ptr();
params.b_scale_ptr = args.scale_matb_ptr;
params.b_scale_dtype = args.scale_matb_dtype.value();
params.ldb = args.ldb;
params.b_dtype = args.matb->scalar_type();
params.b_scale_dtype = args.scale_matb_dtype.value();
params.b_scaling_type = args.scaling_matb_type.value();
params.bias_ptr = bias ? bias->data_ptr(): nullptr;
params.bias_dtype = bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_;
params.c = args.result->data_ptr();
params.c_scale_ptr = args.scale_result_ptr;
params.ldc = args.result_ld;
params.c_dtype = out_dtype_;
params.use_fast_accum = use_fast_accum;
if (transa_ && transb_) {
TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::T)
}
else if (transa_ && !transb_) {
TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::N)
}
else if (!transa_ && transb_) {
TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::T)
}
else if (!transa_ && !transb_) {
TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::N)
}
else {
TORCH_CHECK(false, "unreachable");
}
}),
kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_FLOATING_TYPES));
#undef TUNABLE_DISPATCH
}
else
#endif
{
at::cuda::blas::scaled_gemm(
args.transa,
args.transb,
args.m,
args.n,
args.k,
args.mata->data_ptr(),
args.scale_mata_ptr,
args.lda,
args.mata->scalar_type(),
args.scale_mata_dtype.value(),
args.scaling_mata_type.value(),
args.matb->data_ptr(),
args.scale_matb_ptr,
args.ldb,
args.matb->scalar_type(),
args.scale_matb_dtype.value(),
args.scaling_matb_type.value(),
bias ? bias->data_ptr(): nullptr,
bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_,
args.result->data_ptr(),
args.scale_result_ptr,
args.result_ld,
out_dtype_,
use_fast_accum);
}
return out;
}
Tensor&
_scaled_tensorwise_tensorwise(
const Tensor& mat_a, const Tensor& mat_b,
const Tensor& scale_a, const Tensor& scale_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
bool use_fast_accum,
Tensor& out) {
// Restrictions:
// A, B are FP8, scales are fp32
//
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
TORCH_CHECK_VALUE(scale_a.numel() == 1 && scale_a.scalar_type() == kFloat, "scale_a must have 1 Float element")
TORCH_CHECK_VALUE(scale_b.numel() == 1 && scale_b.scalar_type() == kFloat, "scale_b must have 1 Float element")
auto scaling_choice_a = ScalingType::TensorWise;
auto scaling_choice_b = ScalingType::TensorWise;
_cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
return out;
}
Tensor&
_scaled_rowwise_rowwise(
const Tensor& mat_a, const Tensor& mat_b,
const Tensor& scale_a, const Tensor& scale_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
bool use_fast_accum,
Tensor& out) {
// Restrictions:
// A, B are FP8, scales are fp32, shape M/N for A/B
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
TORCH_CHECK_VALUE(scale_a.size(0) == mat_a.size(0) && scale_a.size(1) == 1, "scale_a must have shape [", mat_a.size(0), ", 1], got [", scale_a.sizes(), "]");
TORCH_CHECK_VALUE(scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat, "scale_a must have ", mat_a.size(0), " Float elements, got ", scale_a.numel())
TORCH_CHECK_VALUE(scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat, "scale_b must have ", mat_b.size(1), " Float elements, got ", scale_b.numel())
TORCH_CHECK_VALUE(scale_a.stride(1) == 1, "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1));
TORCH_CHECK_VALUE(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1));
auto scaling_choice_a = ScalingType::RowWise;
auto scaling_choice_b = ScalingType::RowWise;
//
// NVIDIA's cuBLAS only started supporting row-wise scaling in version 12.9,
// and only for compute capability 9.0+. In other cases we use CUTLASS.
#ifndef USE_ROCM
// We are doing row-wise scaling
auto dprops = at::cuda::getCurrentDeviceProperties();
if (((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
// cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
|| (dprops->major == 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
at::cuda::detail::f8f8bf16_rowwise(
mat_a,
mat_b,
scale_a,
scale_b,
bias,
use_fast_accum,
out);
return out;
}
#else
// For ROCm, match behavior of f8f8bf16_rowwise type checking, for unit test purposes.
//Tensor b = mat_b;
if (_scaled_mm_is_fnuz()) {
TORCH_CHECK_VALUE(mat_b.dtype() == at::kFloat8_e4m3fnuz, "expected mat_b.dtype() to be at::kFloat8_e4m3fnuz, but got ", mat_b.dtype());
}
else {
TORCH_CHECK_VALUE(mat_b.dtype() == at::kFloat8_e4m3fn, "expected mat_b.dtype() to be at::kFloat8_e4m3fn, but got ", mat_b.dtype());
}
// Until more than bf16 is supported.
TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16,
"hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type());
#endif
_cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
return out;
}
Tensor&
_scaled_block1x128_block1x128(
const Tensor& mat_a, const Tensor& mat_b,
const Tensor& scale_a, const Tensor& scale_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
const bool use_fast_accum,
Tensor& out) {
// Restrictions:
// A, B are FP8, scales are fp32, shape K//128
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,
"scale_a must have shape ", mat_a.sizes()[0], " x ", mat_a.sizes()[1] / 128, " Float elements, got ", scale_a.sizes())
TORCH_CHECK_VALUE(scale_b.sizes()[0] == ceil_div<int64_t>(mat_b.sizes()[0], 128) && scale_b.sizes()[1] == mat_b.sizes()[1] && scale_b.scalar_type() == kFloat,
"scale_b must have shape ", ceil_div<int64_t>(mat_b.sizes()[0], 128), " x ", mat_b.sizes()[1], " Float elements, got ", scale_b.sizes())
TORCH_CHECK(scale_a.stride(0) == 1, "expected scale_a.stride(0) to be 1, but got ", scale_a.stride(0));
TORCH_CHECK(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1));
TORCH_CHECK(scale_b.stride(0) == scale_b.size(1),
"expected scale_b.stride(0) to be ", scale_b.size(1), ", but got ", scale_b.stride(0));
auto scaling_choice_a = ScalingType::BlockWise1x128;
auto scaling_choice_b = ScalingType::BlockWise1x128;
_cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
return out;
}
Tensor&
_scaled_block128x128_block1x128(
const Tensor& mat_a, const Tensor& mat_b,
const Tensor& scale_a, const Tensor& scale_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
const bool use_fast_accum,
Tensor& out) {
// Restrictions:
// A, B are FP8, scales are fp32, shape K//128
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
TORCH_CHECK_VALUE(scale_a.sizes()[0] == ceil_div<int64_t>(mat_a.sizes()[0], 128) && scale_a.sizes()[1] == ceil_div<int64_t>(mat_a.sizes()[1], 128) && scale_a.scalar_type() == kFloat,
"scale_a must have shape ", ceil_div<int64_t>(mat_a.sizes()[0], 128), " x ", ceil_div<int64_t>(mat_a.sizes()[1], 128), " Float elements, got ", scale_a.sizes())
TORCH_CHECK_VALUE(scale_b.sizes()[0] == ceil_div<int64_t>(mat_b.sizes()[0], 128) && scale_b.sizes()[1] == mat_b.sizes()[1] && scale_b.scalar_type() == kFloat,
"scale_b must have shape ", ceil_div<int64_t>(mat_b.sizes()[0], 128), " x ", mat_b.sizes()[1], " Float elements, got ", scale_b.sizes())
TORCH_CHECK_VALUE(scale_a.stride(1) == 1, "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1));
TORCH_CHECK_VALUE(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1));
TORCH_CHECK_VALUE(scale_b.stride(0) == scale_b.size(1),
"expected scale_b.stride(0) to be ", scale_b.size(1), ", but got ", scale_b.stride(0));
auto scaling_choice_a = ScalingType::BlockWise128x128;
auto scaling_choice_b = ScalingType::BlockWise1x128;
_cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
return out;
}
Tensor&
_scaled_block1x128_block128x128(
const Tensor& mat_a, const Tensor& mat_b,
const Tensor& scale_a, const Tensor& scale_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
const bool use_fast_accum,
Tensor& out) {
// Restrictions:
// A, B are FP8, scales are fp32, A: shape K//128, B: K//128, N//128
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,
"scale_a must have shape ", mat_a.sizes()[0], " x ", mat_a.sizes()[1] / 128, " Float elements, got ", scale_a.sizes())
TORCH_CHECK_VALUE(scale_b.sizes()[0] == mat_b.sizes()[0] / 128 && scale_b.sizes()[1] == mat_b.sizes()[1] / 128 && scale_b.scalar_type() == kFloat,
"scale_b must have shape ", mat_b.sizes()[0] / 128, " x ", mat_b.sizes()[1] / 128, " Float elements, got ", scale_b.sizes())
TORCH_CHECK_VALUE(scale_a.stride(0) == 1, "expected scale_a.stride(0) to be 1, but got ", scale_a.stride(0));
TORCH_CHECK_VALUE(scale_b.stride(0) == 1, "expected scale_b.stride(0) to be 1, but got ", scale_b.stride(0));
TORCH_CHECK_VALUE(scale_b.stride(1) == scale_b.size(0),
"expected scale_b.stride(1) to be ", scale_b.size(0), ", but got ", scale_b.stride(1));
auto scaling_choice_a = ScalingType::BlockWise1x128;
auto scaling_choice_b = ScalingType::BlockWise128x128;
_cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
return out;
}
Tensor&
_scaled_mxfp8_mxfp8(
const Tensor& mat_a, const Tensor& mat_b,
const Tensor& scale_a, const SwizzleType swizzle_a,
const Tensor& scale_b, const SwizzleType swizzle_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
Tensor& out) {
// Restrictions:
// A, B are FP8, scales are e8m0, A: shape K//32, B: K, N//32
// Scales must be swizzled
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
auto scale_a_elems = round_up<int64_t>(mat_a.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_a.size(1), 32), 4);
auto scale_b_elems = round_up<int64_t>(mat_b.size(1), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_b.size(0), 32), 4);
TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(),
"For Blockwise scaling scale_a should have ", scale_a_elems, " elements, got: ", scale_a.numel());
TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(),
"For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel());
TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4, "scale_a must be swizzled to SWIZZLE_32_4_4 format");
TORCH_CHECK_VALUE(swizzle_b == SwizzleType::SWIZZLE_32_4_4, "scale_b must be swizzled to SWIZZLE_32_4_4 format");
TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(),
"For Blockwise scaling both scales should be contiguous");
TORCH_CHECK_VALUE(out.scalar_type() == out_dtype, "expected out.scalar_type() to be ", out_dtype, ", but got ", out.scalar_type());
auto scaling_choice_a = ScalingType::BlockWise1x32;
auto scaling_choice_b = ScalingType::BlockWise1x32;
#ifdef USE_ROCM
#if ROCM_VERSION >= 70000
TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
TORCH_CHECK_VALUE(mat_a.size(0) % 32 == 0 && mat_a.size(1) % 32 == 0 &&
mat_b.size(0) % 32 == 0 && mat_b.size(1) % 32 == 0,
"Matrix dimensions must be multiples of 32 for block-wise scaling");
TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16 ||
out.scalar_type() == ScalarType::Half,
"Block-wise scaling only supports BFloat16 or Half output types");
#else
TORCH_CHECK_NOT_IMPLEMENTED(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later");
#endif
#endif
return _cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
}
Tensor&
_scaled_nvfp4_nvfp4(
const Tensor& mat_a, const Tensor& mat_b,
const Tensor& scale_a, const SwizzleType swizzle_a,
const Tensor& scale_b, const SwizzleType swizzle_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
const bool single_scale,
Tensor& out) {
#ifdef USE_ROCM
TORCH_CHECK_NOT_IMPLEMENTED(false, "NVFP4 scaling not supported on ROCM");
#endif
TORCH_CHECK_VALUE(single_scale, "Only single-scaled NVFP4 currently supported");
// Restrictions:
// A, B are FP4, scales are e4m3, blocked 1x16 along K
// Scales must be swizzled
TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
// Note: fp4x2 format, need to double the K dimension for checking purposes.
auto scale_a_elems = round_up<int64_t>(mat_a.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_a.size(1) * 2, 16), 4);
auto scale_b_elems = round_up<int64_t>(mat_b.size(1), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_b.size(0) * 2, 16), 4);
TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(),
"For Blockwise scaling scale_a should have ", scale_a_elems, " elements, got: ", scale_a.numel());
TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(),
"For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel());
TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4, "scale_a must be swizzled to SWIZZLE_32_4_4 format");
TORCH_CHECK_VALUE(swizzle_b == SwizzleType::SWIZZLE_32_4_4, "scale_b must be swizzled to SWIZZLE_32_4_4 format");
TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(),
"For Blockwise scaling both scales should be contiguous");
auto scaling_choice_a = ScalingType::BlockWise1x16;
auto scaling_choice_b = ScalingType::BlockWise1x16;
return _cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
}
// V2: Computes matrix multiply + bias while applying scaling to input and output matrices
// Scales are only applicable when matrices are of Float8 type and are assumed to be 1.0 by default.
// If the output matrix type is a 16- or 32-bit type, scale_result is not applied.
// Known limitations:
//  - Only works if mat1 is row-major and mat2 is column-major
//  - Only works if matrix sizes are divisible by 32
//  - If 1-dimensional tensors are used then scale_a should have size = mat1.size(0)
//    and scale_b should have size = mat2.size(1)
// Arguments:
// - `mat1`: the first operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2`
// - `mat2`: the second operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2`
// - `scale_a`: a tensor with the inverse scale of `mat1`, whose shape/strides/dtype depend on the scaling scheme
// - `scale_recipe_a`: An integer corresponding to an enum describing the scaling scheme used for `scale_a`
// - `swizzle_a`: An integer corresponding to a `SwizzleType` enum describing the swizzling scheme for `scale_a`
// - `scale_b`: a tensor with the inverse scale of `mat2`, whose shape/strides/dtype depend on the scaling scheme
// - `scale_recipe_b`: An integer corresponding to an enum describing the scaling scheme used for `scale_b`
// - `swizzle_b`: An integer corresponding to a `SwizzleType` enum describing the swizzling scheme for `scale_b`
// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16`
// - `out_dtype`: the output dtype, can either be a float8 or a higher precision floating point type
// - `use_fast_accum`: if true, enables fast float8 accumulation. Backends may ignore this option if not applicable.
// - `out`: a reference to the output tensor
Tensor&
_scaled_mm_cuda_v2_out(
const Tensor& mat_a, const Tensor& mat_b,
ArrayRef<Tensor> scale_a,
IntArrayRef scale_recipe_a,
IntArrayRef swizzle_a,
ArrayRef<Tensor> scale_b,
IntArrayRef scale_recipe_b,
IntArrayRef swizzle_b,
const std::optional<Tensor>& bias,
const std::optional<c10::ScalarType> out_dtype,
IntArrayRef contraction_dim,
bool use_fast_accum,
Tensor& out) {
// Check sizes
bool allowed_device = _scaled_mm_allowed_device();
TORCH_CHECK_NOT_IMPLEMENTED(allowed_device,
"torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+");
TORCH_CHECK_VALUE(mat_a.dim() == 2, "mat_a must be a matrix");
TORCH_CHECK_VALUE(mat_b.dim() == 2, "mat_b must be a matrix");
// If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm kernels
// do not support this case).
if (mat_a.size(0) == 0 || mat_a.size(1) == 0 || mat_b.size(1) == 0) {
// `out` was created with `at::empty`. In the case where we are multiplying
// MxK by KxN and K is the zero dim, we need to initialize here to properly
// return a tensor of zeros.
at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
if (mat_a.size(1) == 0) {
out.zero_();
}
return out;
}
// Check if the input matrix sizes can be multiplied
// - if optional contraction dims are provided, use those
// -- mostly for sub-byte formats (i.e. nvfp4x2) where a cheap .t() is not available.
if (contraction_dim.size() > 0) {
TORCH_CHECK_VALUE(contraction_dim.size() == 2, "contraction_dim must have exactly 2 elements");
auto mat_a_dim = contraction_dim[0];
auto mat_b_dim = contraction_dim[1];
TORCH_CHECK_VALUE(
mat_a.size(mat_a_dim) == mat_b.size(mat_b_dim), "mat_a and mat_b shapes cannot be multiplied (",
mat_a.size(0), "x", mat_a.size(1), " and ", mat_b.size(0), "x", mat_b.size(1), ") ",
"with contraction dims mat_a: ", mat_a_dim, ", mat_b: ", mat_b_dim);
} else {
TORCH_CHECK_VALUE(
mat_a.size(1) == mat_b.size(0), "mat_a and mat_b shapes cannot be multiplied (",
mat_a.size(0), "x", mat_a.size(1), " and ", mat_b.size(0), "x", mat_b.size(1), ")");
}
TORCH_CHECK_VALUE(!bias || bias->numel() == mat_b.sizes()[1], "Bias must be size ", mat_b.sizes()[1],
" but got ", bias->numel());
TORCH_CHECK_VALUE(
mat_a.sizes()[1] % 16 == 0,
"Expected trailing dimension of mat1 to be divisible by 16 ",
"but got mat1 shape: (",
mat_a.sizes()[0],
"x",
mat_a.sizes()[1],
").");
TORCH_CHECK_VALUE(mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x",
mat_b.sizes()[1], ") must be divisible by 16");
// TODO(slayton): Existing checks, not sure if they should really be here.
TORCH_CHECK_VALUE(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type");
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) || mat_a.scalar_type() == ScalarType::Float4_e2m1fn_x2,
"Expected mat_a to be Float8 or Float4_x2 matrix got ", mat_a.scalar_type());
TORCH_CHECK_VALUE(isFloat8Type(mat_b.scalar_type()) || mat_b.scalar_type() == ScalarType::Float4_e2m1fn_x2,
"Expected mat_b to be Float8 or Float4_x2 matrix got ", mat_b.scalar_type());
#ifndef USE_ROCM
// Type restrictions imposed by CuBLASLt as of CUDA-12.1
TORCH_CHECK_VALUE(mat_a.scalar_type() != ScalarType::Float8_e5m2 || mat_b.scalar_type() != ScalarType::Float8_e5m2,
"Multiplication of two Float8_e5m2 matrices is not supported");
#endif
if (use_fast_accum) {
TORCH_CHECK_VALUE(mat_a.scalar_type() != ScalarType::Float4_e2m1fn_x2 && mat_b.scalar_type() != ScalarType::Float4_e2m1fn_x2, "`use_fast_accum` is not supported when `mat_a` or `mat_b` tensors have the `Float4_e2m1fn_x2` dtype.");
}
#ifdef USE_ROCM
if (mat_a.scalar_type() == ScalarType::Float4_e2m1fn_x2 || mat_b.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
TORCH_CHECK_NOT_IMPLEMENTED(ROCM_VERSION >= 70000,
"Float4_e2m1fn_x2 is only supported for ROCm 7.0 and above");
}
if (mat_a.scalar_type() == ScalarType::Float8_e5m2 || mat_b.scalar_type() == ScalarType::Float8_e5m2) {
TORCH_CHECK_NOT_IMPLEMENTED(ROCM_VERSION >= 60500,
"Float8_e5m2 is only supported for ROCm 6.5 and above");
}
if (mat_a.scalar_type() == ScalarType::Float8_e4m3fn || mat_b.scalar_type() == ScalarType::Float8_e4m3fn) {
TORCH_CHECK_NOT_IMPLEMENTED(ROCM_VERSION >= 60500,
"Float8_e4m3fn is only supported for ROCm 6.5 and above");
}
#endif
if (bias) {
TORCH_CHECK_VALUE(out.scalar_type() != kFloat,
"Bias is not supported when out_dtype is set to Float32");
TORCH_CHECK_VALUE(bias->scalar_type() == ScalarType::BFloat16 ||
bias->scalar_type() == ScalarType::Half,
"Bias must be BFloat16 or Half, but got ", bias->scalar_type());
TORCH_CHECK_VALUE((out.scalar_type() != kFloat &&
out.scalar_type() != ScalarType::BFloat16) ||
bias->scalar_type() == ScalarType::BFloat16,
"Bias must be BFloat16 to compute ", out.scalar_type(),
" output, but got ", bias->scalar_type());
TORCH_CHECK_VALUE(out.scalar_type() != ScalarType::Half ||
bias->scalar_type() == ScalarType::Half,
"Bias must be Float16 to compute ", out.scalar_type(),
" output, but got ", bias->scalar_type());
}
{
auto bias_ = bias.value_or(Tensor());
// NOLINTNEXTLINE(*c-array*)
TensorArg targs[]{{out, "out", 0}, {mat_a, "mat_a", 1}, {mat_b, "mat_b", 2},
{bias_, "bias", 3}, {scale_a[0], "scale_a", 4}, {scale_b[0], "scale_b", 5}};
checkAllSameGPU(__func__, targs);
}
auto out_dtype_ = out_dtype.value_or(at::ScalarType::BFloat16);
// Conversion of implicitly-defined enums to explicit
auto scale_recipe_a_enum = convert_int_to_enum<ScalingType>(scale_recipe_a);
auto swizzle_a_enum = convert_int_to_enum<SwizzleType>(swizzle_a);
auto scale_recipe_b_enum = convert_int_to_enum<ScalingType>(scale_recipe_b);
auto swizzle_b_enum = convert_int_to_enum<SwizzleType>(swizzle_b);
// at this point we can start working out what we want to be doing
// Try to do as few steps as possible.
// NOTE: support is deliberately sparse, can explicitly enumerate all combinations allowed.
// Do this via a list of defined (name, acceptance, concrete_impl) tuples.
bool found_impl = false;
ScaledGemmImplementation gemm_impl = ScaledGemmImplementation::NONE;
for (const auto& fn_entry : scale_kernel_dispatch) {
const auto [name, accept_fn, scaled_gemm_impl] = fn_entry;
bool ok = accept_fn(mat_a.scalar_type(),
scale_recipe_a_enum,
scale_a,
mat_b.scalar_type(),
scale_recipe_b_enum,
scale_b);
if (ok) {
gemm_impl = scaled_gemm_impl;
found_impl = true;
break;
}
}
TORCH_CHECK_VALUE(
found_impl,
"Invalid scaling configuration.\n"
"- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n"
"- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (", mat_a.size(0), ", 1) and scale_b should be (1, ", mat_b.size(1), "), and both should be contiguous.\n"
"- For BlockWise 1x128 scaling, a and b should be float8, scales should be float, scale_a should be (", mat_a.size(0), ", ", ceil_div<int64_t>(mat_a.size(1), 128), ") and scale_b should be (", ceil_div<int64_t>(mat_b.size(0), 128), ", ", mat_b.size(1), "), and both should be outer-dim-major.\n"
"- For BlockWise 128x128 scaling, a and b should be float8, scales should be float, scale_a should be (", ceil_div<int64_t>(mat_a.size(0), 128), ", ", ceil_div<int64_t>(mat_a.size(1), 128), ") and scale_b should be (", ceil_div<int64_t>(mat_b.size(0), 128), ", ", ceil_div<int64_t>(mat_b.size(1), 128), "), and both should be near-inner-dim-major (with 16-byte aligned strides).\n"
"- For Blockwise 1x32 scaling, a and b should be float8, scales should be float8_e8m0fnu, scale_a should have ", round_up<int64_t>(mat_a.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_a.size(1), 32), 4), " elements and scale_b should have ", round_up<int64_t>(mat_b.size(1), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_b.size(0), 32), 4), " elements, and both should be contiguous.\n"
"- For Blockwise 1x16 scaling, a and b should be float4 (packed 2x), scales should be float8_e4m3fn, scale_a should have ", round_up<int64_t>(mat_a.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_a.size(1) * 2, 16), 4), " elements and scale_b should have ", round_up<int64_t>(mat_b.size(1), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_b.size(0) * 2, 16), 4), " elements, and both should be contiguous.\n"
"Got mat_a.dtype()=", mat_a.scalar_type(), ", scale_a[0].dtype()=", scale_a[0].scalar_type(), ", scale_a[0].size()=", scale_a[0].sizes(), ", scale_a[0].stride()=", scale_a[0].strides(), ", ",
"mat_b.dtype()=", mat_b.scalar_type(), ", scale_b[0].dtype()=", scale_b[0].scalar_type(), ", scale_b[0].size()=", scale_b[0].sizes(), " and scale_b[0].stride()=", scale_b[0].strides()
);
at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
auto bias_ = bias.value_or(Tensor());
// dispatch to appropriate lower-level calls for error checking & execution
if (gemm_impl == ScaledGemmImplementation::TENSORWISE_TENSORWISE) {
return _scaled_tensorwise_tensorwise(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out);
} else if (gemm_impl == ScaledGemmImplementation::ROWWISE_ROWWISE) {
return _scaled_rowwise_rowwise(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out);
} else if (gemm_impl == ScaledGemmImplementation::BLOCK_128x128_1x128) {
return _scaled_block128x128_block1x128(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out);
} else if (gemm_impl == ScaledGemmImplementation::BLOCK_1x128_128x128) {
return _scaled_block1x128_block128x128(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out);
} else if (gemm_impl == ScaledGemmImplementation::BLOCK_1x128_1x128) {
return _scaled_block1x128_block1x128(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out);
} else if (gemm_impl == ScaledGemmImplementation::MXFP8_MXFP8) {
return _scaled_mxfp8_mxfp8(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
} else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4) {
TORCH_CHECK_NOT_IMPLEMENTED(false, "Only single-scale NVFP4 currently supported");
} else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE) {
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, true /* single_scale */, out);
} else {
TORCH_CHECK_VALUE(false, "Invalid state - found an implementation, but not really");
}
}
Tensor
_scaled_mm_cuda_v2(
const Tensor& mat_a, const Tensor& mat_b,
ArrayRef<Tensor> scale_a,
IntArrayRef scale_recipe_a,
IntArrayRef swizzle_a,
ArrayRef<Tensor> scale_b,
IntArrayRef scale_recipe_b,
IntArrayRef swizzle_b,
const std::optional<Tensor>& bias,
const std::optional<c10::ScalarType> out_dtype,
IntArrayRef contraction_dim,
bool use_fast_accum) {
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
return _scaled_mm_cuda_v2_out(
mat_a, mat_b,
scale_a, scale_recipe_a, swizzle_a,
scale_b, scale_recipe_b, swizzle_b,
bias,
out_dtype,
contraction_dim,
use_fast_accum,
out);
}
Tensor
_scaled_grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b,


@ -488,15 +488,16 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
}
}
int cat_dim = dimension;
if (memory_format != c10::MemoryFormat::Contiguous) {
switch (dimension) {
switch (cat_dim) {
case 0:
break;
case 1:
dimension = nDims - dimension;
cat_dim = nDims - cat_dim;
break;
default:
dimension--;
cat_dim--;
}
}
// Template Declarations for dim = 1, 2, 3, 4
@ -505,23 +506,23 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \
CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
(char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\
(char*)data, catMetaData, kernelOutputParam, cat_dim, trailingSize);\
} else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\
data, catMetaData, outputParam, cat_dim, outputParam.tensorStride[cat_dim]);\
} else if (isContig && isAligned && sizeof(scalar_t) == 2) { \
CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_8><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\
data, catMetaData, outputParam, cat_dim, outputParam.tensorStride[cat_dim]);\
} else if (isContig) {\
CatArrayBatchedCopy_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\
data, catMetaData, outputParam, cat_dim, outputParam.tensorStride[cat_dim]);\
} else {\
CatArrayBatchedCopy<scalar_t, unsigned int, DIMS, batch_size, stride_size><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\
data, catMetaData, outputParam, cat_dim, outputParam.tensorStride[cat_dim]);\
}\
C10_CUDA_KERNEL_LAUNCH_CHECK();
switch (nDims) {


@ -127,7 +127,7 @@ inline __host__ __device__ uint32_t getAlignmentRoundUp(const void* p) {
return diff == 0 ? 0 : uint32_t(Align) - diff;
}
#if defined (__gfx90a__) || defined(__gfx942__)
#if defined (__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)
#define CDNA2_OR_LATER 1
#else
#define CDNA2_OR_LATER 0
@ -143,7 +143,7 @@ template<typename T, uint32_t Rank>
using VecT = T __attribute__((ext_vector_type(Rank)));
static bool isCDNA2orLater(int index) {
return at::detail::getCUDAHooks().isGPUArch({"gfx90a", "gfx942"}, index);
return at::detail::getCUDAHooks().isGPUArch({"gfx90a", "gfx942", "gfx950"}, index);
}
#else


@ -86,7 +86,7 @@ namespace cuda { namespace detail {
struct LinalgDispatch {
Tensor (*cholesky_solve_helper)(const Tensor& self, const Tensor& A, bool upper);
};
C10_EXPORT void registerLinalgDispatch(const LinalgDispatch&);
C10_EXPORT void registerLinalgDispatch(const LinalgDispatch& /*disp_*/);
}} // namespace cuda::detail
#endif


@ -341,16 +341,22 @@ struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
}
};
template <typename T, typename KeyType>
struct MHAGraphCache {
std::unordered_map<KeyType, T, ParamsWrapperHash<KeyType>> engine_cache;
using KeyType = MHACacheKeyWrapper;
using ValueType = std::unique_ptr<fe::graph::Graph>;
using MapType =
std::unordered_map<KeyType, ValueType, ParamsWrapperHash<KeyType>>;
using iterator = typename MapType::iterator;
using const_iterator = typename MapType::const_iterator;
MapType engine_cache;
int count = 0;
int hits = 0;
// no mutexes here as caches are now thread local for v8, can also return a
// pointer to the Execution Plan if we know it will not be invalidated by
// another thread
T* find(const KeyType& key) {
iterator find(const KeyType& key) {
static bool flag =
c10::utils::check_env("TORCH_CUDNN_SDPA_CACHE_DEBUG") == true;
if (flag && count) {
@ -363,15 +369,19 @@ struct MHAGraphCache {
}
count++;
auto it = engine_cache.find(key);
if (it == engine_cache.end()) {
return nullptr;
if (it != engine_cache.end()) {
hits++;
}
hits++;
return &(it->second);
return it;
}
void update(const KeyType& key, T& results) {
engine_cache.insert_or_assign(key, std::move(results));
const_iterator end() const {
return engine_cache.end();
}
template <typename... Args>
std::pair<iterator, bool> try_emplace(const KeyType& key, Args&&... args) {
return engine_cache.try_emplace(key, std::forward<Args>(args)...);
}
};
@ -380,16 +390,14 @@ struct MHAGraphCache {
// https://docs.nvidia.com/deeplearning/cudnn/backend/latest/release-notes.html
// We also leak the caches to workaround potential teardown race issues.
auto& getMHAGraphCache_() {
thread_local auto& instance =
*new MHAGraphCache<std::shared_ptr<fe::graph::Graph>, MHACacheKeyWrapper>;
return instance;
MHAGraphCache& getMHAGraphCache_() {
thread_local MHAGraphCache* instance{new MHAGraphCache()};
return *instance;
}
auto& getMHAGraphBackwardCache_() {
thread_local auto& instance =
*new MHAGraphCache<std::shared_ptr<fe::graph::Graph>, MHACacheKeyWrapper>;
return instance;
MHAGraphCache& getMHAGraphBackwardCache_() {
thread_local MHAGraphCache* instance{new MHAGraphCache()};
return *instance;
}
namespace {
@ -437,7 +445,7 @@ auto fixSizeOneDimStrideSDPA(
} // namespace
auto build_graph(
std::unique_ptr<fe::graph::Graph> build_graph(
int64_t b,
int64_t h,
int64_t s_q,
@ -461,7 +469,7 @@ auto build_graph(
if (q.scalar_type() == kBFloat16) {
dtype = fe::DataType_t::BFLOAT16;
}
auto mha_graph = std::make_shared<fe::graph::Graph>();
auto mha_graph = std::make_unique<fe::graph::Graph>();
// We're baking in float accumulation and scale types
// in theory the graph may support other types, but they
// have not been tested
@ -531,15 +539,13 @@ auto build_graph(
fe::graph::Tensor_attributes().set_uid(K).set_name("K"));
auto V_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(V).set_name("V"));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
bias =
scaled_dot_product_flash_attention_options.set_bias(
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(BIAS)
.set_name("bias")
.set_dim(attn_bias.value().sizes().vec())
.set_stride(attn_bias.value().strides().vec()));
scaled_dot_product_flash_attention_options.set_bias(bias.value());
.set_stride(attn_bias.value().strides().vec())));
}
auto [O_, Stats] =
@ -640,7 +646,7 @@ auto build_graph(
return mha_graph;
}
auto build_graph_nestedtensor(
std::unique_ptr<fe::graph::Graph> build_graph_nestedtensor(
int64_t b,
int64_t h_q,
int64_t h_k,
@ -668,7 +674,7 @@ auto build_graph_nestedtensor(
if (q.scalar_type() == kBFloat16) {
dtype = fe::DataType_t::BFLOAT16;
}
auto mha_graph = std::make_shared<fe::graph::Graph>();
auto mha_graph = std::make_unique<fe::graph::Graph>();
// We're baking in float accumulation and scale types
// in theory the graph may support other types, but they
// have not been tested
@ -766,18 +772,16 @@ auto build_graph_nestedtensor(
v_strides[strideidx0],
v_strides[strideidx1],
v_strides[strideidx2]}));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
TORCH_CHECK(
false,
"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
bias =
scaled_dot_product_flash_attention_options.set_bias(
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(BIAS)
.set_name("bias")
.set_dim(attn_bias.value().sizes().vec())
.set_stride(attn_bias.value().strides().vec()));
scaled_dot_product_flash_attention_options.set_bias(bias.value());
.set_stride(attn_bias.value().strides().vec())));
}
auto RAG_Q_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
@ -847,7 +851,7 @@ auto build_graph_nestedtensor(
return mha_graph;
}
auto build_graph_backward(
std::unique_ptr<fe::graph::Graph> build_graph_backward(
int64_t b,
int64_t h,
int64_t s_q,
@ -874,7 +878,7 @@ auto build_graph_backward(
if (q.scalar_type() == kBFloat16) {
dtype = fe::DataType_t::BFLOAT16;
}
auto mha_graph = std::make_shared<fe::graph::Graph>();
auto mha_graph = std::make_unique<fe::graph::Graph>();
// We're baking in float accumulation and scale types
// in theory the graph may support other types, but they
// have not been tested
@ -919,15 +923,13 @@ auto build_graph_backward(
fe::graph::Tensor_attributes().set_uid(K).set_name("K"));
auto V_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(V).set_name("V"));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
bias =
sdpa_backward_options.set_bias(
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(BIAS)
.set_name("bias")
.set_dim(attn_bias.value().sizes().vec())
.set_stride(attn_bias.value().strides().vec()));
sdpa_backward_options.set_bias(bias.value());
.set_stride(attn_bias.value().strides().vec())));
}
if (dropout_probability != 0.0f) {
auto seed = mha_graph->tensor(fe::graph::Tensor_attributes()
@ -1061,7 +1063,7 @@ auto build_graph_backward(
return mha_graph;
}
auto build_graph_backward_nestedtensor(
std::unique_ptr<fe::graph::Graph> build_graph_backward_nestedtensor(
int64_t b,
int64_t h_q,
int64_t h_k,
@ -1092,7 +1094,7 @@ auto build_graph_backward_nestedtensor(
if (q.scalar_type() == kBFloat16) {
dtype = fe::DataType_t::BFLOAT16;
}
auto mha_graph = std::make_shared<fe::graph::Graph>();
auto mha_graph = std::make_unique<fe::graph::Graph>();
// We're baking in float accumulation and scale types
// in theory the graph may support other types, but they
// have not been tested
@ -1195,18 +1197,16 @@ auto build_graph_backward_nestedtensor(
o_strides[strideidx1],
o_strides[strideidx2]}));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
TORCH_CHECK(
false,
"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
bias =
sdpa_backward_options.set_bias(
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(BIAS)
.set_name("bias")
.set_dim(attn_bias.value().sizes().vec())
.set_stride(attn_bias.value().strides().vec()));
sdpa_backward_options.set_bias(bias.value());
.set_stride(attn_bias.value().strides().vec())));
}
auto RAG_Q_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
@ -1378,7 +1378,7 @@ void run_cudnn_SDP_fprop(
// NB: The key initialization will round up sequence length, stride data etc.
// if use_ragged_in_dense is enabled (to allow multiple sequence lengths to
// reuse the same cached value/graph)
auto key = MHACacheKeyWrapper(
MHACacheKeyWrapper key(
b,
h,
s_q,
@ -1393,12 +1393,9 @@ void run_cudnn_SDP_fprop(
is_causal,
return_softmaxstats,
false);
auto graph_ptr = getMHAGraphCache_().find(key);
std::shared_ptr<fe::graph::Graph> mha_graph;
if (graph_ptr) {
mha_graph = *graph_ptr;
} else {
mha_graph = build_graph(
auto [cache_it, not_found] = getMHAGraphCache_().try_emplace(key, nullptr);
if (not_found) {
cache_it->second = build_graph(
b,
h,
s_q,
@ -1419,39 +1416,39 @@ void run_cudnn_SDP_fprop(
_dropoutoffset,
handle);
}
const fe::graph::Graph& mha_graph = *cache_it->second;
std::unordered_map<int64_t, void*> variant_pack = {
{Q, q.data_ptr()},
{K, k.data_ptr()},
{V, v.data_ptr()},
{Q, q.mutable_data_ptr()},
{K, k.mutable_data_ptr()},
{V, v.mutable_data_ptr()},
{SCALE, &scaling_factor},
{O, o.data_ptr()}};
{O, o.mutable_data_ptr()}};
if (return_softmaxstats) {
variant_pack[LSE] = softmaxstats.data_ptr();
variant_pack[LSE] = softmaxstats.mutable_data_ptr();
}
if (attn_bias.has_value()) {
variant_pack[BIAS] = attn_bias.value().data_ptr();
variant_pack[BIAS] = attn_bias.value().mutable_data_ptr();
}
if (dropout_probability != 0.0f) {
variant_pack[SEED] = _dropoutseed.data_ptr();
variant_pack[OFFSET] = _dropoutoffset.data_ptr();
variant_pack[SEED] = _dropoutseed.mutable_data_ptr();
variant_pack[OFFSET] = _dropoutoffset.mutable_data_ptr();
}
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
variant_pack[SEQ_LEN_Q] = seqlen_q.data_ptr();
variant_pack[SEQ_LEN_KV] = seqlen_kv.data_ptr();
variant_pack[RAG_Q_OFF] = rag_off_q.data_ptr();
variant_pack[RAG_K_OFF] = rag_off_k.data_ptr();
variant_pack[RAG_V_OFF] = rag_off_v.data_ptr();
variant_pack[RAG_O_OFF] = rag_off_o.data_ptr();
variant_pack[SEQ_LEN_Q] = seqlen_q.mutable_data_ptr();
variant_pack[SEQ_LEN_KV] = seqlen_kv.mutable_data_ptr();
variant_pack[RAG_Q_OFF] = rag_off_q.mutable_data_ptr();
variant_pack[RAG_K_OFF] = rag_off_k.mutable_data_ptr();
variant_pack[RAG_V_OFF] = rag_off_v.mutable_data_ptr();
variant_pack[RAG_O_OFF] = rag_off_o.mutable_data_ptr();
if (return_softmaxstats) {
variant_pack[RAG_LSE_OFF] = rag_off_lse.data_ptr();
variant_pack[RAG_LSE_OFF] = rag_off_lse.mutable_data_ptr();
}
}
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_size = mha_graph.get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
TORCH_CHECK(
mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good());
getMHAGraphCache_().update(key, mha_graph);
mha_graph.execute(handle, variant_pack, workspace_ptr.get()).is_good());
}
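For readers following the forward-path change above: the graph cache is a thread_local map keyed by MHACacheKeyWrapper, and try_emplace both looks up and reserves the slot, so the cuDNN graph is built exactly once per key (per the NB comment, the key rounds up sequence lengths and strides when use_ragged_in_dense is enabled, so several shapes can share one cached graph). A minimal standalone sketch of that build-once-per-key idiom, with hypothetical Key/Graph/build stand-ins rather than the real PyTorch types:

```cpp
#include <memory>
#include <string>
#include <unordered_map>

// Hypothetical stand-ins for MHACacheKeyWrapper and fe::graph::Graph.
struct Graph { /* expensive-to-build object */ };
using Key = std::string;

std::unique_ptr<Graph> build(const Key& /*key*/) {
  return std::make_unique<Graph>();
}

Graph& get_or_build(const Key& key) {
  // One cache per thread, intentionally leaked so it survives static
  // teardown, mirroring the thread_local-pointer idiom in the diff.
  thread_local auto& cache =
      *new std::unordered_map<Key, std::unique_ptr<Graph>>();
  // try_emplace inserts {key, nullptr} only if the key is absent; the bool
  // plays the role of `not_found` in the code above.
  auto [it, inserted] = cache.try_emplace(key, nullptr);
  if (inserted) {
    it->second = build(key);  // built exactly once per key
  }
  return *it->second;
}
```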
void run_cudnn_SDP_fprop_nestedtensor(
@ -1491,7 +1488,7 @@ void run_cudnn_SDP_fprop_nestedtensor(
softmaxstats = at::empty({q.size(0), h_q, 1}, q.options().dtype(kFloat));
}
auto key = MHACacheKeyWrapper(
MHACacheKeyWrapper key(
b,
h_q,
s_q, // max-seqlen-q
@ -1506,13 +1503,12 @@ void run_cudnn_SDP_fprop_nestedtensor(
is_causal,
return_softmaxstats,
true);
auto graph_ptr = getMHAGraphCache_().find(key);
std::shared_ptr<fe::graph::Graph> mha_graph;
if (graph_ptr) {
mha_graph = *graph_ptr;
} else {
mha_graph = build_graph_nestedtensor(
MHAGraphCache& cache = getMHAGraphCache_();
auto cache_it = cache.find(key);
std::unique_ptr<fe::graph::Graph> mha_graph_storage;
if (cache_it == cache.end()) {
mha_graph_storage = build_graph_nestedtensor(
b,
h_q,
h_k,
@ -1537,40 +1533,44 @@ void run_cudnn_SDP_fprop_nestedtensor(
dropoutoffset,
handle);
}
const fe::graph::Graph& mha_graph =
mha_graph_storage ? *mha_graph_storage : *cache_it->second;
auto seqlen_q = at::diff(cum_seqlen_q, 1, 0);
auto seqlen_kv = at::diff(cum_seqlen_kv, 1, 0);
auto rag_q_off = cum_seqlen_q.mul(h_q * d_qk);
auto rag_k_off = cum_seqlen_kv.mul(h_k * d_v);
auto rag_v_off = cum_seqlen_kv.mul(h_v * d_v);
auto rag_q_off = cum_seqlen_q.mul(q.stride(-3));
auto rag_k_off = cum_seqlen_kv.mul(k.stride(-3));
auto rag_v_off = cum_seqlen_kv.mul(v.stride(-3));
auto rag_o_off = cum_seqlen_q.mul(o.stride(-3));
auto rag_stats_off = cum_seqlen_q.mul(h_q);
std::unordered_map<int64_t, void*> variant_pack = {
{Q, q.data_ptr()},
{K, k.data_ptr()},
{V, v.data_ptr()},
{Q, q.mutable_data_ptr()},
{K, k.mutable_data_ptr()},
{V, v.mutable_data_ptr()},
{SCALE, &scaling_factor},
{O, o.data_ptr()},
{RAG_Q_OFF, rag_q_off.data_ptr()},
{RAG_O_OFF, rag_q_off.data_ptr()},
{RAG_K_OFF, rag_k_off.data_ptr()},
{RAG_V_OFF, rag_v_off.data_ptr()},
{SEQ_LEN_Q, seqlen_q.data_ptr()},
{SEQ_LEN_KV, seqlen_kv.data_ptr()}};
{O, o.mutable_data_ptr()},
{RAG_Q_OFF, rag_q_off.mutable_data_ptr()},
{RAG_O_OFF, rag_o_off.mutable_data_ptr()},
{RAG_K_OFF, rag_k_off.mutable_data_ptr()},
{RAG_V_OFF, rag_v_off.mutable_data_ptr()},
{SEQ_LEN_Q, seqlen_q.mutable_data_ptr()},
{SEQ_LEN_KV, seqlen_kv.mutable_data_ptr()}};
if (return_softmaxstats) {
variant_pack[LSE] = softmaxstats.data_ptr();
variant_pack[RAG_LSE_OFF] = rag_stats_off.data_ptr();
variant_pack[LSE] = softmaxstats.mutable_data_ptr();
variant_pack[RAG_LSE_OFF] = rag_stats_off.mutable_data_ptr();
}
if (dropout_probability != 0.0f) {
variant_pack[SEED] = dropoutseed.data_ptr();
variant_pack[OFFSET] = dropoutoffset.data_ptr();
variant_pack[SEED] = dropoutseed.mutable_data_ptr();
variant_pack[OFFSET] = dropoutoffset.mutable_data_ptr();
}
if (attn_bias.has_value()) {
TORCH_CHECK("bias not supported with nestedtensor");
}
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_size = mha_graph.get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
TORCH_CHECK(
mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good());
mha_graph.execute(handle, variant_pack, workspace_ptr.get()).is_good());
}
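One notable change in the nested-tensor forward path above: the ragged offsets are now derived from the tensors' actual strides (cum_seqlen multiplied by t.stride(-3)) instead of assuming dense packing (cum_seqlen multiplied by heads * head_dim), and the output offset uses o.stride(-3) rather than reusing the query offset. That is what lets non-contiguous inputs, such as chunks of a larger tensor, address the right rows. A rough ATen-style sketch of the distinction, assuming a [total_tokens, num_heads, head_dim] packing:

```cpp
#include <ATen/ATen.h>

// cum_seqlen: 1-D cumulative sequence lengths per batch entry.
// t: packed [total_tokens, num_heads, head_dim] tensor, possibly non-contiguous.
at::Tensor ragged_offsets(const at::Tensor& cum_seqlen, const at::Tensor& t) {
  // Old assumption (dense packing): advancing one token moves the base
  // pointer by num_heads * head_dim elements.
  //   return cum_seqlen.mul(t.size(-2) * t.size(-1));

  // Stride-based version: stays correct when t is a non-contiguous view,
  // e.g. one chunk of a wider tensor.
  return cum_seqlen.mul(t.stride(-3));
}
```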
void run_cudnn_SDP_bprop(
@ -1652,7 +1652,7 @@ void run_cudnn_SDP_bprop(
}
cudnnHandle_t handle = getCudnnHandle();
auto key = MHACacheKeyWrapper(
MHACacheKeyWrapper key(
b,
h,
s_q,
@ -1667,12 +1667,10 @@ void run_cudnn_SDP_bprop(
is_causal,
true,
false);
auto graph_backward_ptr = getMHAGraphBackwardCache_().find(key);
std::shared_ptr<fe::graph::Graph> mha_graph;
if (graph_backward_ptr) {
mha_graph = *graph_backward_ptr;
} else {
mha_graph = build_graph_backward(
auto [cache_it, not_found] =
getMHAGraphBackwardCache_().try_emplace(key, nullptr);
if (not_found) {
cache_it->second = build_graph_backward(
b,
h,
s_q,
@ -1696,43 +1694,44 @@ void run_cudnn_SDP_bprop(
_dropoutoffset,
handle);
}
const fe::graph::Graph& mha_graph = *cache_it->second;
std::unordered_map<int64_t, void*> variant_pack = {
// inputs
{Q, q.data_ptr()},
{K, k.data_ptr()},
{V, v.data_ptr()},
{O, o.data_ptr()},
{DO, dO_.data_ptr()},
{LSE, softmaxstats.data_ptr()},
{Q, q.mutable_data_ptr()},
{K, k.mutable_data_ptr()},
{V, v.mutable_data_ptr()},
{O, o.mutable_data_ptr()},
{DO, dO_.mutable_data_ptr()},
{LSE, softmaxstats.mutable_data_ptr()},
// outputs
{DQ, dQ.data_ptr()},
{DK, dK.data_ptr()},
{DV, dV.data_ptr()},
{DQ, dQ.mutable_data_ptr()},
{DK, dK.mutable_data_ptr()},
{DV, dV.mutable_data_ptr()},
{SCALE, &scaling_factor}};
if (dropout_probability != 0.0f) {
variant_pack[SEED] = _dropoutseed.data_ptr();
variant_pack[OFFSET] = _dropoutoffset.data_ptr();
variant_pack[SEED] = _dropoutseed.mutable_data_ptr();
variant_pack[OFFSET] = _dropoutoffset.mutable_data_ptr();
}
if (attn_bias.has_value()) {
variant_pack[BIAS] = attn_bias.value().data_ptr();
variant_pack[BIAS] = attn_bias.value().mutable_data_ptr();
}
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
variant_pack[SEQ_LEN_Q] = seqlen_q.data_ptr();
variant_pack[SEQ_LEN_KV] = seqlen_kv.data_ptr();
variant_pack[RAG_Q_OFF] = rag_off_q.data_ptr();
variant_pack[RAG_K_OFF] = rag_off_k.data_ptr();
variant_pack[RAG_V_OFF] = rag_off_v.data_ptr();
variant_pack[RAG_O_OFF] = rag_off_o.data_ptr();
variant_pack[RAG_LSE_OFF] = rag_off_lse.data_ptr();
variant_pack[SEQ_LEN_Q] = seqlen_q.mutable_data_ptr();
variant_pack[SEQ_LEN_KV] = seqlen_kv.mutable_data_ptr();
variant_pack[RAG_Q_OFF] = rag_off_q.mutable_data_ptr();
variant_pack[RAG_K_OFF] = rag_off_k.mutable_data_ptr();
variant_pack[RAG_V_OFF] = rag_off_v.mutable_data_ptr();
variant_pack[RAG_O_OFF] = rag_off_o.mutable_data_ptr();
variant_pack[RAG_LSE_OFF] = rag_off_lse.mutable_data_ptr();
}
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_size = mha_graph.get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
TORCH_CHECK(!workspace_size || workspace_ptr.get());
TORCH_CHECK(
mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good());
getMHAGraphBackwardCache_().update(key, mha_graph);
mha_graph.execute(handle, variant_pack, workspace_ptr.get()).is_good());
}
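A smaller, pervasive change in these functions is data_ptr() being replaced by mutable_data_ptr() when filling the variant pack. Both hand back a void* suitable for the cuDNN frontend; the mutable/const accessor pair just makes the write intent explicit. An illustrative snippet (not taken from the diff) of how the two accessors pair up:

```cpp
#include <ATen/ATen.h>
#include <cstdint>
#include <unordered_map>

// Illustrative only: assemble a variant-pack-style map of raw pointers.
std::unordered_map<int64_t, void*> make_pack(at::Tensor& out, const at::Tensor& in) {
  std::unordered_map<int64_t, void*> pack;
  // mutable_data_ptr(): we intend to write through this pointer (outputs).
  pack[0] = out.mutable_data_ptr();
  // const_data_ptr() returns const void*; the const_cast documents that the
  // downstream C API takes void* but only reads from this buffer.
  pack[1] = const_cast<void*>(in.const_data_ptr());
  return pack;
}
```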
void run_cudnn_SDP_bprop_nestedtensor(
@ -1775,9 +1774,10 @@ void run_cudnn_SDP_bprop_nestedtensor(
auto seqlen_q = at::diff(cum_seqlen_q, 1, 0);
auto seqlen_kv = at::diff(cum_seqlen_kv, 1, 0);
auto rag_q_off = cum_seqlen_q.mul(h_q * d_qk);
auto rag_k_off = cum_seqlen_kv.mul(h_k * d_v);
auto rag_v_off = cum_seqlen_kv.mul(h_v * d_v);
auto rag_q_off = cum_seqlen_q.mul(q.stride(-3));
auto rag_k_off = cum_seqlen_kv.mul(k.stride(-3));
auto rag_v_off = cum_seqlen_kv.mul(v.stride(-3));
auto rag_o_off = cum_seqlen_q.mul(o.stride(-3));
auto rag_stats_off = cum_seqlen_q.mul(h_q);
auto dprops = at::cuda::getCurrentDeviceProperties();
@ -1791,7 +1791,7 @@ void run_cudnn_SDP_bprop_nestedtensor(
cudnnHandle_t handle = getCudnnHandle();
auto key = MHACacheKeyWrapper(
MHACacheKeyWrapper key(
b,
h_q,
s_q, // max-seqlen-q
@ -1806,13 +1806,12 @@ void run_cudnn_SDP_bprop_nestedtensor(
is_causal,
true,
true);
auto graph_ptr = getMHAGraphCache_().find(key);
std::shared_ptr<fe::graph::Graph> mha_graph;
if (graph_ptr) {
mha_graph = *graph_ptr;
} else {
mha_graph = build_graph_backward_nestedtensor(
MHAGraphCache& cache = getMHAGraphCache_();
auto cache_it = cache.find(key);
std::unique_ptr<fe::graph::Graph> mha_graph_storage;
if (cache_it == cache.end()) {
mha_graph_storage = build_graph_backward_nestedtensor(
b,
h_q,
h_k,
@ -1840,41 +1839,43 @@ void run_cudnn_SDP_bprop_nestedtensor(
dropoutoffset,
handle);
}
const fe::graph::Graph& mha_graph =
mha_graph_storage ? *mha_graph_storage : *cache_it->second;
std::unordered_map<int64_t, void*> variant_pack = {
// inputs
{Q, q.data_ptr()},
{K, k.data_ptr()},
{V, v.data_ptr()},
{O, o.data_ptr()},
{DO, dO_.data_ptr()},
{LSE, softmaxstats.data_ptr()},
{Q, q.mutable_data_ptr()},
{K, k.mutable_data_ptr()},
{V, v.mutable_data_ptr()},
{O, o.mutable_data_ptr()},
{DO, dO_.mutable_data_ptr()},
{LSE, softmaxstats.mutable_data_ptr()},
// outputs
{DQ, dQ.data_ptr()},
{DK, dK.data_ptr()},
{DV, dV.data_ptr()},
{DQ, dQ.mutable_data_ptr()},
{DK, dK.mutable_data_ptr()},
{DV, dV.mutable_data_ptr()},
{SCALE, &scaling_factor},
{RAG_Q_OFF, rag_q_off.data_ptr()},
{RAG_O_OFF, rag_q_off.data_ptr()},
{RAG_K_OFF, rag_k_off.data_ptr()},
{RAG_V_OFF, rag_v_off.data_ptr()},
{RAG_LSE_OFF, rag_stats_off.data_ptr()},
{SEQ_LEN_Q, seqlen_q.data_ptr()},
{SEQ_LEN_KV, seqlen_kv.data_ptr()}};
{RAG_Q_OFF, rag_q_off.mutable_data_ptr()},
{RAG_O_OFF, rag_o_off.mutable_data_ptr()},
{RAG_K_OFF, rag_k_off.mutable_data_ptr()},
{RAG_V_OFF, rag_v_off.mutable_data_ptr()},
{RAG_LSE_OFF, rag_stats_off.mutable_data_ptr()},
{SEQ_LEN_Q, seqlen_q.mutable_data_ptr()},
{SEQ_LEN_KV, seqlen_kv.mutable_data_ptr()}};
if (dropout_probability != 0.0f) {
variant_pack[SEED] = _dropoutseed.data_ptr();
variant_pack[OFFSET] = _dropoutoffset.data_ptr();
variant_pack[SEED] = _dropoutseed.mutable_data_ptr();
variant_pack[OFFSET] = _dropoutoffset.mutable_data_ptr();
}
TORCH_CHECK(
!attn_bias.has_value(),
"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_size = mha_graph.get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
TORCH_CHECK(!workspace_size || workspace_ptr.get());
TORCH_CHECK(
mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good());
mha_graph.execute(handle, variant_pack, workspace_ptr.get()).is_good());
}
} // namespace native

View File

@ -116,6 +116,8 @@ class MetalShaderLibrary {
std::vector<std::string> getFunctionNames();
std::shared_ptr<MetalKernelFunction> getKernelFunction(
const std::string& name);
// Returns a raw pointer to the kernel function for use in C APIs
MetalKernelFunction* getCachedKernelFunctionPtr(const std::string& name);
inline MTLComputePipelineState_t getPipelineStateForFunc(
const std::string& fname) {
return getLibraryPipelineState(getLibrary(), fname).first;
@ -164,6 +166,9 @@ class MetalShaderLibrary {
std::string,
std::pair<MTLComputePipelineState_t, MTLFunction_t>>
cplMap;
// Cache for kernel functions returned by getCachedKernelFunctionPtr
std::unordered_map<std::string, std::unique_ptr<MetalKernelFunction>>
kernelCache;
};
class DynamicMetalShaderLibrary : public MetalShaderLibrary {

View File

@ -917,6 +917,22 @@ std::shared_ptr<MetalKernelFunction> MetalShaderLibrary::getKernelFunction(const
return std::make_shared<MetalKernelFunction>(cpl, func);
}
MetalKernelFunction* MetalShaderLibrary::getCachedKernelFunctionPtr(const std::string& name) {
// Check if kernel is already cached
auto it = kernelCache.find(name);
if (it != kernelCache.end()) {
return it->second.get();
}
// Create new kernel function and cache it
auto [cpl, func] = getLibraryPipelineState(getLibrary(), name);
auto kernel = std::make_unique<MetalKernelFunction>(cpl, func);
MetalKernelFunction* raw_ptr = kernel.get();
kernelCache[name] = std::move(kernel);
return raw_ptr;
}
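The new accessor keeps each MetalKernelFunction alive inside kernelCache (owned by a unique_ptr) and hands out a raw, non-owning pointer, which is exactly what a C API can hold without taking part in C++ ownership. A self-contained sketch of that general pattern, using a stand-in Kernel type rather than the real MetalKernelFunction:

```cpp
#include <memory>
#include <string>
#include <unordered_map>

// Stand-in for MetalKernelFunction; the point is the ownership pattern.
struct Kernel {
  explicit Kernel(std::string n) : name(std::move(n)) {}
  std::string name;
};

class KernelCache {
 public:
  // Returns a non-owning pointer that stays valid as long as the cache lives;
  // callers (e.g. a C shim) must not delete it.
  Kernel* get_or_create(const std::string& name) {
    auto it = cache_.find(name);
    if (it != cache_.end()) {
      return it->second.get();  // cache hit: reuse the existing object
    }
    auto kernel = std::make_unique<Kernel>(name);
    Kernel* raw = kernel.get();
    cache_[name] = std::move(kernel);
    return raw;
  }

 private:
  std::unordered_map<std::string, std::unique_ptr<Kernel>> cache_;
};
```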
class BundledShaderLibary : public MetalShaderLibrary {
public:
BundledShaderLibary() : MetalShaderLibrary("") {}

View File

@ -5,6 +5,38 @@
# representing ScalarType's. They are now superseded by usage of
# `aten::to()`. The ops remain here for backward compatibility purposes.
# DEPRECATED. DO NOT USE
- func: _cast_Byte(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Char(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Double(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Float(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Int(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Long(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Short(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Half(Tensor self, bool non_blocking=False) -> Tensor
variants: function
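As the comment above notes, these _cast_* entries survive only for backward compatibility and are superseded by aten::to(). A hedged C++ (ATen) sketch of the replacement for, say, the Float cast:

```cpp
#include <ATen/ATen.h>

// Deprecated form:   at::_cast_Float(x, /*non_blocking=*/false)
// Preferred form:    Tensor::to() with the target ScalarType.
at::Tensor to_float(const at::Tensor& x) {
  return x.to(at::kFloat, /*non_blocking=*/false, /*copy=*/false);
}
```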
# Computes the gradient of current tensor w.r.t. graph leaves.
- func: _backward(Tensor self, Tensor[] inputs, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()
manual_cpp_binding: True
@ -7125,6 +7157,7 @@
CUDA: _scaled_mm_cuda
tags: needs_exact_strides
- func: _scaled_mm.out(Tensor self, Tensor mat2, Tensor scale_a, Tensor scale_b, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None, bool use_fast_accum=False, *, Tensor(a!) out) -> Tensor(a!)
variants: function
dispatch:
@ -7132,6 +7165,16 @@
CUDA: _scaled_mm_out_cuda
tags: needs_exact_strides
- func: _scaled_mm_v2(Tensor self, Tensor mat2, Tensor[] scale_a, int[] recipe_a, int[] swizzle_a, Tensor[] scale_b, int[] recipe_b, int[] swizzle_b, Tensor? bias, ScalarType? out_dtype, int[] contraction_dim=[], bool use_fast_accum=False) -> Tensor
variants: function
dispatch:
CUDA: _scaled_mm_cuda_v2
- func: _scaled_mm_v2.out(Tensor self, Tensor mat2, Tensor[] scale_a, int[] recipe_a, int[] swizzle_a, Tensor[] scale_b, int[] recipe_b, int[] swizzle_b, Tensor? bias, ScalarType? out_dtype, int[] contraction_dim=[], bool use_fast_accum=False, *, Tensor(a!) out) -> Tensor(a!)
variants: function
dispatch:
CUDA: _scaled_mm_cuda_v2_out
- func: _scaled_grouped_mm(Tensor self, Tensor mat2, Tensor scale_a, Tensor scale_b, Tensor? offs=None, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None, bool use_fast_accum=False) -> Tensor
variants: function

View File

@ -31,7 +31,7 @@ TORCH_API float dequantize_vec(
float* dst,
size_t count = 8);
template <typename SRC_T, typename DST_T>
TORCH_API DST_T requantize_val(double, int64_t, double, int64_t, SRC_T src);
TORCH_API DST_T requantize_val(double /*src_scale*/, int64_t /*src_zero_point*/, double /*dst_scale*/, int64_t /*dst_zero_point*/, SRC_T src);
// Given a multiplier and a zero_point, requantize int32_t computed values back
// to quantized values. See comment above
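For context on the parameter names spelled out above: requantization maps a value quantized with (src_scale, src_zero_point) onto a new affine grid (dst_scale, dst_zero_point) by dequantizing and re-quantizing. A schematic scalar sketch of that arithmetic, not the actual vectorized kernel, which folds the scales into a single multiplier and clamps per destination dtype:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Schematic int8 -> int8 requantization; real kernels are vectorized and
// use the destination dtype's own limits.
std::int8_t requantize_scalar(
    std::int8_t src,
    double src_scale, std::int64_t src_zero_point,
    double dst_scale, std::int64_t dst_zero_point) {
  const double real = src_scale * (static_cast<double>(src) - src_zero_point);
  const double q = std::nearbyint(real / dst_scale) + dst_zero_point;
  return static_cast<std::int8_t>(std::min(127.0, std::max(-128.0, q)));
}
```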

View File

@ -104,27 +104,27 @@ Tensor empty_strided_unknown_quantized(
// Provide better error message if dtype is wrong
Tensor empty_affine_quantized_other_backends_stub(
IntArrayRef,
std::optional<ScalarType>,
std::optional<Layout>,
std::optional<Device>,
std::optional<bool>,
double,
int64_t,
std::optional<c10::MemoryFormat>) {
IntArrayRef /*unused*/,
std::optional<ScalarType> /*unused*/,
std::optional<Layout> /*unused*/,
std::optional<Device> /*unused*/,
std::optional<bool> /*unused*/,
double /*unused*/,
int64_t /*unused*/,
std::optional<c10::MemoryFormat> /*unused*/) {
TORCH_CHECK(false, "Creation of quantized tensor requires quantized dtype like torch.quint8");
}
Tensor empty_per_channel_affine_quantized_other_backends_stub(
IntArrayRef,
const Tensor&,
const Tensor&,
int64_t,
std::optional<ScalarType>,
std::optional<Layout>,
std::optional<Device>,
std::optional<bool>,
std::optional<c10::MemoryFormat>) {
IntArrayRef /*unused*/,
const Tensor& /*unused*/,
const Tensor& /*unused*/,
int64_t /*unused*/,
std::optional<ScalarType> /*unused*/,
std::optional<Layout> /*unused*/,
std::optional<Device> /*unused*/,
std::optional<bool> /*unused*/,
std::optional<c10::MemoryFormat> /*unused*/) {
TORCH_CHECK(false, "Creation of quantized tensor requires quantized dtype like torch.quint8");
}

View File

@ -637,13 +637,7 @@ bool check_for_nested_inputs(sdp_params const& params, bool debug) {
TORCH_WARN("Experimental cuDNN SDPA nested tensor support is not enabled.");
}
return false;
} else if (has_for_nested_inputs(params) && (params.query.requires_grad() || params.key.requires_grad() || params.value.requires_grad())) {
if (debug) {
TORCH_WARN("Experimental cuDNN SDPA nested tensor support does not support backward.");
return false;
}
}
const auto dprop = at::cuda::getCurrentDeviceProperties();
// Check that the input is nested
if (!(dprop->major == 9 || dprop->major == 10) && has_for_nested_inputs(params)) {

View File

@ -29,63 +29,63 @@ bool available() {
}
bool use_convolution2d(
const Tensor&,
const Tensor&,
const at::OptionalIntArrayRef,
const IntArrayRef,
const IntArrayRef,
const IntArrayRef,
const int64_t,
bool) {
const Tensor& /*unused*/,
const Tensor& /*unused*/,
const at::OptionalIntArrayRef /*unused*/,
const IntArrayRef /*unused*/,
const IntArrayRef /*unused*/,
const IntArrayRef /*unused*/,
const int64_t /*unused*/,
bool /*unused*/) {
return false;
}
Tensor convolution2d(
const Tensor&,
const Tensor&,
const Tensor&,
const IntArrayRef,
const IntArrayRef,
const IntArrayRef,
const int64_t) {
const Tensor& /*unused*/,
const Tensor& /*unused*/,
const Tensor& /*unused*/,
const IntArrayRef /*unused*/,
const IntArrayRef /*unused*/,
const IntArrayRef /*unused*/,
const int64_t /*unused*/) {
TORCH_CHECK(false, internal::kError);
}
bool use_linear(
const Tensor&,
const Tensor&,
const Tensor&) {
const Tensor& /*unused*/,
const Tensor& /*unused*/,
const Tensor& /*unused*/) {
return false;
}
Tensor linear(
const Tensor&,
const Tensor&,
const Tensor&) {
const Tensor& /*unused*/,
const Tensor& /*unused*/,
const Tensor& /*unused*/) {
TORCH_CHECK(false, internal::kError);
}
bool use_max_pool2d(
const Tensor&,
const IntArrayRef,
const IntArrayRef,
IntArrayRef,
const IntArrayRef,
const bool,
const float,
const float) {
const Tensor& /*unused*/,
const IntArrayRef /*unused*/,
const IntArrayRef /*unused*/,
IntArrayRef /*unused*/,
const IntArrayRef /*unused*/,
const bool /*unused*/,
const float /*unused*/,
const float /*unused*/) {
return false;
}
Tensor max_pool2d(
const Tensor&,
const IntArrayRef,
const IntArrayRef,
IntArrayRef,
const IntArrayRef,
const bool,
const float,
const float) {
const Tensor& /*unused*/,
const IntArrayRef /*unused*/,
const IntArrayRef /*unused*/,
IntArrayRef /*unused*/,
const IntArrayRef /*unused*/,
const bool /*unused*/,
const float /*unused*/,
const float /*unused*/) {
TORCH_CHECK(false, internal::kError);
}

View File

@ -5,7 +5,7 @@ namespace at {
namespace detail {
inline void noopDelete(void*) {}
inline void noopDelete(void* /*unused*/) {}
} // namespace detail

View File

@ -18,7 +18,7 @@ extern std::atomic<const VulkanImplInterface*> g_vulkan_impl_registry;
class VulkanImplRegistrar {
public:
explicit VulkanImplRegistrar(VulkanImplInterface*);
explicit VulkanImplRegistrar(VulkanImplInterface* /*impl*/);
};
at::Tensor& vulkan_copy_(at::Tensor& self, const at::Tensor& src);

View File

@ -25,15 +25,6 @@ drq
fambench_dlrm
fambench_xlmr
fastNLP_Bert
hf_Albert
hf_Bart
hf_Bert
hf_BigBird
hf_DistilBert
hf_GPT2
hf_Longformer
hf_Reformer
hf_T5
maml
maml_omniglot
mnasnet1_0
@ -60,13 +51,6 @@ soft_actor_critic
speech_transformer
squeezenet1_1
tacotron2
timm_efficientdet
timm_efficientnet
timm_nfnet
timm_regnet
timm_resnest
timm_vision_transformer
timm_vovnet
tts_angular
vgg16
vision_maskrcnn

View File

@ -23,7 +23,6 @@ TORCHBENCH_MODELS: list[str] = [
"resnet50",
"moco",
"llama",
"hf_T5",
]
HUGGINGFACE_MODELS: list[str] = [
"AllenaiLongformerBase",

View File

@ -11,7 +11,6 @@ import pandas as pd
flaky_models = {
"yolov3",
"detectron2_maskrcnn_r_101_c4",
"timm_efficientnet", # see https://github.com/pytorch/pytorch/issues/148699
"XGLMForCausalLM", # discovered in https://github.com/pytorch/pytorch/pull/128148
"moondream", # discovered in https://github.com/pytorch/pytorch/pull/159291
# discovered in https://github.com/pytorch/pytorch/issues/161419. Its not flaky but really hard to repro, so skipping it
@ -40,13 +39,9 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
"detectron2_fcos_r_50_fpn",
"doctr_det_predictor",
"doctr_reco_predictor",
"hf_BigBird",
"hf_Longformer",
"hf_Reformer",
"hf_Roberta_base",
"hf_T5",
"hf_T5_base",
"hf_T5_generate",
"dpn107",
"fbnetv3_b",
"levit_128",
"llava",
"microbench_unbacked_tolist_sum",
"mnasnet1_0",
@ -63,12 +58,7 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
"squeezenet1_1",
"stable_diffusion_text_encoder",
"stable_diffusion_unet",
"timm_efficientdet",
"timm_efficientnet",
"timm_nfnet",
"timm_regnet",
"timm_resnest",
"timm_vovnet",
"swsl_resnext101_32x16d",
"torchrec_dlrm",
"vgg16",
# LLM

View File

@ -36,12 +36,7 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
"detectron2_fcos_r_50_fpn",
"doctr_det_predictor",
"doctr_reco_predictor",
"hf_BigBird",
"hf_Longformer",
"hf_Reformer",
"hf_Roberta_base",
"hf_T5",
"hf_T5_base",
"levit_128",
"llava",
"microbench_unbacked_tolist_sum",
"resnet50",
@ -51,7 +46,6 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
"stable_diffusion_text_encoder",
"stable_diffusion_unet",
"timm_efficientdet",
"timm_nfnet",
"torchrec_dlrm",
"vgg16",
# LLM

View File

@ -130,70 +130,6 @@ functorch_maml_omniglot,pass,0
hf_Albert,pass,0
hf_Bart,pass,0
hf_Bert,pass,0
hf_Bert_large,pass,0
hf_BigBird,pass,0
hf_DistilBert,pass,0
hf_GPT2,pass,0
hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Roberta_base,pass,0
hf_T5,pass,0
hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_large,pass_due_to_skip,0
hf_Whisper,pass,0
hf_distil_whisper,pass,0
lennard_jones,pass,0
@ -342,30 +278,6 @@ stable_diffusion_unet,pass_due_to_skip,0
timm_efficientnet,pass,0
timm_regnet,pass,0
timm_resnest,pass,0
timm_vision_transformer,pass,0
timm_vision_transformer_large,pass_due_to_skip,0
timm_vovnet,pass,0
torch_multimodal_clip,pass,0


View File

@ -78,62 +78,6 @@ functorch_maml_omniglot,pass,7
hf_Albert,pass,6
hf_Bart,pass,6
hf_Bert,pass,6
hf_Bert_large,pass,6
hf_BigBird,pass,6
hf_DistilBert,pass,6
hf_GPT2,pass,8
hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Roberta_base,pass,6
hf_T5_base,eager_2nd_run_OOM,0
hf_T5_large,pass_due_to_skip,0
hf_Whisper,pass,6
hf_distil_whisper,model_fail_to_load,0
lennard_jones,pass,7
@ -250,30 +194,6 @@ stable_diffusion_unet,pass_due_to_skip,0
timm_efficientnet,fail_accuracy,7
timm_regnet,pass,7
timm_resnest,pass,6
timm_vision_transformer,pass,6
timm_vision_transformer_large,pass_due_to_skip,0
timm_vovnet,pass,6
torch_multimodal_clip,pass,7


View File

@ -118,62 +118,6 @@ functorch_maml_omniglot,pass,0
hf_Albert,pass,0
hf_Bart,pass,0
hf_Bert,pass,0
hf_Bert_large,pass,0
hf_BigBird,fail_accuracy,0
hf_DistilBert,pass,0
hf_GPT2,pass,0
hf_GPT2_large,pass_due_to_skip,0
hf_Roberta_base,pass,0
hf_T5,pass,0
hf_T5_base,eager_fail_to_run,0
hf_T5_large,pass_due_to_skip,0
hf_Whisper,pass,0
hf_distil_whisper,pass,0
lennard_jones,pass,0
@ -314,30 +258,6 @@ stable_diffusion_unet,pass_due_to_skip,0
timm_efficientnet,pass,0
timm_regnet,pass,0
timm_resnest,pass,0
timm_vision_transformer,pass,0
timm_vision_transformer_large,pass_due_to_skip,0
timm_vovnet,pass,0
torch_multimodal_clip,pass,0


View File

@ -114,58 +114,6 @@ functorch_maml_omniglot,pass,0
hf_Albert,pass,0
hf_Bart,pass,0
hf_Bert,pass,0
hf_Bert_large,pass,0
hf_BigBird,pass,0
hf_DistilBert,pass,0
hf_GPT2,pass,0
hf_GPT2_large,pass_due_to_skip,0
hf_Roberta_base,pass,0
hf_T5,pass,0
hf_T5_base,pass,0
hf_T5_large,pass_due_to_skip,0
hf_distil_whisper,pass,0
lennard_jones,pass,0
@ -278,38 +226,6 @@ stable_diffusion_unet,pass_due_to_skip,0
timm_efficientdet,model_fail_to_load,0
timm_efficientnet,pass,0
timm_nfnet,pass,0
timm_regnet,pass,0
timm_resnest,pass,0
timm_vision_transformer,pass,0
timm_vision_transformer_large,pass_due_to_skip,0
timm_vovnet,pass,0
torch_multimodal_clip,pass,0


View File

@ -114,58 +114,6 @@ functorch_maml_omniglot,pass,0
hf_Albert,pass,0
hf_Bart,pass,0
hf_Bert,pass,0
hf_Bert_large,pass,0
hf_BigBird,pass,0
hf_DistilBert,pass,0
hf_GPT2,pass,0
hf_GPT2_large,pass_due_to_skip,0
hf_Roberta_base,pass,0
hf_T5,pass,0
hf_T5_base,pass,0
hf_T5_large,pass_due_to_skip,0
hf_distil_whisper,pass,0
lennard_jones,pass,0
@ -278,38 +226,6 @@ stable_diffusion_unet,pass_due_to_skip,0
timm_efficientdet,model_fail_to_load,0
timm_efficientnet,pass,0
timm_nfnet,pass,0
timm_regnet,pass,0
timm_resnest,pass,0
timm_vision_transformer,pass,0
timm_vision_transformer_large,pass_due_to_skip,0
timm_vovnet,pass,0
torch_multimodal_clip,pass,0


View File

@ -122,66 +122,6 @@ functorch_maml_omniglot,pass,0
hf_Albert,pass,0
hf_Bart,pass,0
hf_Bert,pass,0
hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_DistilBert,pass,0
hf_GPT2,pass,0
hf_GPT2_large,pass_due_to_skip,0
hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Roberta_base,pass,0
hf_T5,pass,0
hf_T5_base,pass,0
hf_T5_large,pass_due_to_skip,0
hf_distil_whisper,pass,0
lennard_jones,pass,0
@ -302,38 +242,6 @@ stable_diffusion_unet,pass_due_to_skip,0
timm_efficientdet,model_fail_to_load,0
timm_efficientnet,pass,0
timm_nfnet,pass,0
timm_regnet,pass,0
timm_resnest,pass,0
timm_vision_transformer,pass,0
timm_vision_transformer_large,pass_due_to_skip,0
timm_vovnet,pass,0
torch_multimodal_clip,pass,0


View File

@ -122,66 +122,6 @@ functorch_maml_omniglot,pass,0
hf_Albert,pass,0
hf_Bart,pass,0
hf_Bert,pass,0
hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_DistilBert,pass,0
hf_GPT2,pass,0
hf_GPT2_large,pass_due_to_skip,0
hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Roberta_base,pass,0
hf_T5,pass,0
hf_T5_base,pass,0
hf_T5_large,pass_due_to_skip,0
hf_distil_whisper,pass,0
lennard_jones,pass,0
@ -302,38 +242,6 @@ stable_diffusion_unet,pass_due_to_skip,0
timm_efficientdet,model_fail_to_load,0
timm_efficientnet,pass,0
timm_nfnet,pass,0
timm_regnet,pass,0
timm_resnest,pass,0
timm_vision_transformer,pass,0
timm_vision_transformer_large,pass_due_to_skip,0
timm_vovnet,pass,0
torch_multimodal_clip,pass,0


View File

@ -122,66 +122,6 @@ functorch_maml_omniglot,pass,0
hf_Albert,pass,0
hf_Bart,pass,0
hf_Bert,pass,0
hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_DistilBert,pass,0
hf_GPT2,pass,0
hf_GPT2_large,pass_due_to_skip,0
hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Roberta_base,pass,0
hf_T5,pass,0
hf_T5_base,pass,0
hf_T5_large,pass_due_to_skip,0
hf_distil_whisper,pass,0
lennard_jones,pass,0
@ -302,38 +242,6 @@ stable_diffusion_unet,pass_due_to_skip,0
timm_efficientdet,model_fail_to_load,0
timm_efficientnet,pass,0
timm_nfnet,pass,0
timm_regnet,pass,0
timm_resnest,pass,0
timm_vision_transformer,pass,0
timm_vision_transformer_large,pass_due_to_skip,0
timm_vovnet,pass,0
torch_multimodal_clip,pass,0


View File

@ -130,70 +130,6 @@ functorch_maml_omniglot,pass,0
hf_Albert,pass,0
hf_Bart,pass,0
hf_Bert,pass,0
hf_Bert_large,pass,0
hf_BigBird,pass,0
hf_DistilBert,pass,0
hf_GPT2,pass,0
hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Roberta_base,pass,0
hf_T5,pass,0
hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_large,pass_due_to_skip,0
hf_Whisper,pass,0
hf_distil_whisper,pass,0
lennard_jones,pass,0
@ -342,30 +278,6 @@ stable_diffusion_unet,pass_due_to_skip,0
timm_efficientnet,pass,0
timm_regnet,pass,0
timm_resnest,pass,0
timm_vision_transformer,pass,0
timm_vision_transformer_large,pass_due_to_skip,0
timm_vovnet,pass,0
torch_multimodal_clip,pass,0


View File

@ -78,62 +78,6 @@ functorch_maml_omniglot,pass,7
hf_Albert,pass,6
hf_Bart,pass,6
hf_Bert,pass,6
hf_Bert_large,pass,6
hf_BigBird,pass,6
hf_DistilBert,pass,6
hf_GPT2,pass,8
hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Roberta_base,pass,6
hf_T5_base,eager_2nd_run_OOM,0
hf_T5_large,pass_due_to_skip,0
hf_Whisper,pass,6
hf_distil_whisper,model_fail_to_load,0
lennard_jones,pass,7
@ -246,30 +190,6 @@ stable_diffusion_unet,pass_due_to_skip,0
timm_efficientnet,pass,7
timm_regnet,pass,7
timm_resnest,pass,6
timm_vision_transformer,pass,6
timm_vision_transformer_large,pass_due_to_skip,0
timm_vovnet,pass,6
torch_multimodal_clip,pass,7


View File

@ -98,58 +98,6 @@ functorch_maml_omniglot,pass,0
hf_Albert,pass,0
hf_Bart,pass,0
hf_Bert,pass,0
hf_Bert_large,pass,0
hf_BigBird,pass,0
hf_DistilBert,pass,0
hf_GPT2,pass,0
hf_GPT2_large,pass_due_to_skip,0
hf_Roberta_base,pass,0
hf_T5,pass,0
hf_T5_base,pass,0
hf_T5_large,pass_due_to_skip,0
hf_distil_whisper,pass,0
lennard_jones,pass,0
@ -262,38 +210,6 @@ stable_diffusion_unet,pass_due_to_skip,0
timm_efficientdet,model_fail_to_load,0
timm_efficientnet,pass,0
timm_nfnet,pass,0
timm_regnet,pass,0
timm_resnest,pass,0
timm_vision_transformer,pass,0
timm_vision_transformer_large,pass_due_to_skip,0
timm_vovnet,pass,0
torch_multimodal_clip,pass,0


View File

@ -98,58 +98,6 @@ functorch_maml_omniglot,pass,0
hf_Albert,pass,0
hf_Bart,pass,0
hf_Bert,pass,0
hf_Bert_large,pass,0
hf_BigBird,pass,0
hf_DistilBert,pass,0
hf_GPT2,pass,0
hf_GPT2_large,pass_due_to_skip,0
hf_Roberta_base,pass,0
hf_T5,pass,0
hf_T5_base,pass,0
hf_T5_large,pass_due_to_skip,0
hf_distil_whisper,pass,0
lennard_jones,pass,0
@ -262,38 +210,6 @@ stable_diffusion_unet,pass_due_to_skip,0
timm_efficientdet,model_fail_to_load,0
timm_efficientnet,pass,0
timm_nfnet,pass,0
timm_regnet,pass,0
timm_resnest,pass,0
timm_vision_transformer,pass,0
timm_vision_transformer_large,pass_due_to_skip,0
timm_vovnet,pass,0
torch_multimodal_clip,pass,0


View File

@ -106,66 +106,6 @@ functorch_maml_omniglot,pass,0
hf_Albert,pass,0
hf_Bart,pass,0
hf_Bert,pass,0
hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_DistilBert,pass,0
hf_GPT2,pass,0
hf_GPT2_large,pass_due_to_skip,0
hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Roberta_base,pass,0
hf_T5,pass,0
hf_T5_base,pass,0
hf_T5_large,pass_due_to_skip,0
hf_distil_whisper,pass,0
lennard_jones,pass,0
@ -286,38 +226,6 @@ stable_diffusion_unet,pass_due_to_skip,0
timm_efficientdet,model_fail_to_load,0
timm_efficientnet,pass,0
timm_nfnet,pass,0
timm_regnet,pass,0
timm_resnest,pass,0
timm_vision_transformer,pass,0
timm_vision_transformer_large,pass_due_to_skip,0
timm_vovnet,pass,0
torch_multimodal_clip,pass,0


View File

@ -122,66 +122,6 @@ functorch_maml_omniglot,pass,0
hf_Albert,pass,0
hf_Bart,pass,0
hf_Bert,pass,0
hf_Bert_large,pass,0
hf_BigBird,pass,25
hf_DistilBert,pass,0
hf_GPT2,pass,0
hf_GPT2_large,pass_due_to_skip,0
hf_Longformer,pass,4
hf_Reformer,pass,8
hf_Roberta_base,pass,0
hf_T5,pass,0
hf_T5_base,pass,0
hf_T5_large,pass_due_to_skip,0
hf_distil_whisper,pass,0
lennard_jones,pass,0
@ -302,38 +242,6 @@ stable_diffusion_unet,pass_due_to_skip,0
timm_efficientdet,model_fail_to_load,0
timm_efficientnet,pass,0
timm_nfnet,pass,0
timm_regnet,pass,0
timm_resnest,pass,0
timm_vision_transformer,pass,0
timm_vision_transformer_large,pass_due_to_skip,0
timm_vovnet,pass,0
torch_multimodal_clip,pass,3


Some files were not shown because too many files have changed in this diff.