Compare commits

..

17 Commits

Author SHA1 Message Date
f70a6ac1a6 [user-streams] Fix stream graph output semantics
ghstack-source-id: a1761206efa02920945b94e1ff811abeed6e470b
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164819
2025-10-06 22:09:05 -07:00
020bd1f830 [dynamo] Remove retrieving objects by ID
ghstack-source-id: f09cb7bc515fb4f7e195d75ef0dff2340584c473
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162905
2025-10-06 22:09:00 -07:00
c84e73df6e [user-streams] Add basic stream tests
ghstack-source-id: a0860743dd23356a9b69889c0762799a6c848b47
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164523

merge into streams suite
2025-10-06 22:09:00 -07:00
6120d39fdb [User-streams] Make torch.Event weakref compatible
ghstack-source-id: 49e3de1c6f1f57bd33330bffb257e01cc28bdda1
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164522
2025-10-06 22:08:59 -07:00
17ed117a90 [user-streams] Make cuda streams weakref compatible
ghstack-source-id: 7f211d3af308e5924e4725efed914adf23613727
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164304
2025-10-06 22:08:59 -07:00
3e941aebb7 [user-cuda-streams] Add cuda streams test suite
ghstack-source-id: 782f3ac95798625c54d71a41249f4d31786831c9
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162901
2025-10-06 22:08:58 -07:00
e403203714 [user-streams] Support streams as contexts
ghstack-source-id: d95de5ef14d5be19d536d53b142aa4126dcdf1e0
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164507
2025-10-06 22:08:58 -07:00
1e80b1ad7d [user-streams] Have StreamVariable inherit from StreamContextVariable
ghstack-source-id: e97a7966236489f10bb71c0178f973b0790a8170
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164344
2025-10-06 22:08:57 -07:00
3dbe758856 [user-streams] Move StreamContextVariable into streams module
finish moving

ghstack-source-id: bc16a138acb41b68164322ce8748d1da74318332
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164343
2025-10-06 22:08:57 -07:00
7faa4842f8 [user-streams] Exclude non-tensor nodes from stream args
ghstack-source-id: bff24c337acc3c38d8c23c80543bf763b645e506
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164818
2025-10-06 22:08:56 -07:00
45200769d1 [user-streams] Track external/internal nodes for stream context
ghstack-source-id: 9bbbf8cbc520f0f7a52acee50ee99771e711434e
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162904
2025-10-06 22:08:52 -07:00
38695fbfb4 [user-streams] update stream context to use fork/join
ghstack-source-id: 47010b1cf4a6ff3fe80165fbbed29dddb3b33cc9
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162903
2025-10-06 22:08:52 -07:00
0c01bec755 [user-streams] Add stream state manager
ghstack-source-id: aff52f66920a421884232d4e96a2ac1a80ec68d5
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162902
2025-10-06 22:08:51 -07:00
02cbfe3469 [user-cuda-streams] Add fork/join custom ops
Make custom ops inplace

ghstack-source-id: 3e41664853ac60d49b60bdebd8b3859d227925ad
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162900
2025-10-06 22:08:50 -07:00
3e8fcf18ab [user-streams] Handle aliasing properly
ghstack-source-id: 9d1810e19ec99c3b148f446b56f85354549104a2
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163028
2025-10-06 22:08:50 -07:00
3e5b122936 [user-cuda-streams] Pass streams/events to the graph via lookup table
ghstack-source-id: 72a6321c9de91f6c6c5e8b27a998c60853ffe5d2
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162899

test fixes
2025-10-06 22:08:49 -07:00
422233fde2 [user-streams] Move stream code to streams module
ghstack-source-id: 627a90d386ed00706ad9d04f607732a3a6a79fc4
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163027
2025-10-06 22:08:49 -07:00
969 changed files with 15429 additions and 15676 deletions
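Taken together, the [user-streams]/[user-cuda-streams] commits above teach dynamo to trace user-created streams and events (stream contexts, fork/join handling, weakref-compatible stream/event objects, and a lookup table for passing them into the graph). As a rough illustration of the usage pattern they target, here is a minimal sketch built only on the public torch.cuda API; whether any given snippet compiles without graph breaks on this branch cannot be read off the commit titles alone.

import torch

s = torch.cuda.Stream()
ev = torch.cuda.Event()

@torch.compile
def f(x):
    with torch.cuda.stream(s):   # stream used as a context manager inside the compiled region
        y = x * 2
        ev.record(s)             # queue an event on the side stream
    ev.wait()                    # make the current stream wait for that event
    return y + 1

x = torch.randn(8, device="cuda")
print(f(x))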

View File

@ -344,7 +344,7 @@ docker build \
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
--build-arg "KATEX=${KATEX:-}" \
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx1100}" \
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \
--build-arg "IMAGE_NAME=${IMAGE_NAME}" \
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \

View File

@ -1 +1 @@
deb42f2a8e48f5032b4a98ee781a15fa87a157cf
e0dda9059d082537cee36be6c5e4fe3b18c880c0

View File

@ -1 +1 @@
7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd
27664085f804afc83df26f740bb46c365854f2c4

View File

@ -46,9 +46,9 @@ case ${DOCKER_TAG_PREFIX} in
BASE_TARGET=rocm
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950, gfx115x conditionally starting in ROCm 7.0
# add gfx950 conditionally starting in ROCm 7.0
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
fi
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
;;

View File

@ -115,9 +115,6 @@ RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
# cmake-3.28.0 from pip for onnxruntime
RUN python3 -mpip install cmake==3.28.0
ADD ./common/patch_libstdc.sh patch_libstdc.sh
RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
# build onnxruntime 1.21.0 from sources.
# it is not possible to build it from sources using pip,
# so just build it from upstream repository.

View File

@ -84,9 +84,9 @@ case ${image} in
DEVTOOLSET_VERSION="11"
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950, gfx115x conditionally starting in ROCm 7.0
# add gfx950 conditionally starting in ROCm 7.0
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
fi
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
;;

View File

@ -10,6 +10,11 @@ BAD_SSL = "https://self-signed.badssl.com"
print("Testing SSL certificate checking for Python:", sys.version)
if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4):
print("This version never checks SSL certs; skipping tests")
sys.exit(0)
EXC = OSError
print(f"Connecting to {GOOD_SSL} should work")

View File

@ -67,7 +67,7 @@ fi
# wheels with cxx11-abi
echo "Checking that the gcc ABI is what we expect"
if [[ "$(uname)" != 'Darwin' ]]; then
if [[ "$(uname)" != 'Darwin' && "$(uname -m)" != "s390x" ]]; then
# We also check that there are cxx11 symbols in libtorch
#
echo "Checking that symbols in libtorch.so have the right gcc abi"

View File

@ -256,7 +256,7 @@ test_torchbench_smoketest() {
local device=mps
local dtypes=(undefined float16 bfloat16 notset)
local dtype=${dtypes[$1]}
local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
for backend in eager inductor; do
@ -319,7 +319,7 @@ test_aoti_torchbench_smoketest() {
local device=mps
local dtypes=(undefined float16 bfloat16 notset)
local dtype=${dtypes[$1]}
local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}"
local dtype_arg="--${dtype}"

View File

@ -838,7 +838,7 @@ test_dynamo_benchmark() {
elif [[ "${suite}" == "timm_models" ]]; then
export TORCHBENCH_ONLY_MODELS="inception_v3"
elif [[ "${suite}" == "torchbench" ]]; then
export TORCHBENCH_ONLY_MODELS="BERT_pytorch"
export TORCHBENCH_ONLY_MODELS="hf_Bert"
fi
fi
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
@ -869,13 +869,13 @@ test_inductor_torchbench_smoketest_perf() {
mkdir -p "$TEST_REPORTS_DIR"
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only BERT_pytorch \
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
--output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
# The threshold value needs to be actively maintained to make this check useful
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4
# Check memory compression ratio for a few models
for test in BERT_pytorch yolov3; do
for test in hf_Albert timm_vision_transformer; do
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
--disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \
--only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv"
@ -886,7 +886,7 @@ test_inductor_torchbench_smoketest_perf() {
done
# Perform some "warm-start" runs for a few huggingface models.
for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
--only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
python benchmarks/dynamo/check_accuracy.py \

View File

@ -71,7 +71,14 @@ export PYTORCH_BUILD_NUMBER=1
# Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
TRITON_CONSTRAINT="platform_system == 'Linux'"
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for all the wheel builds, hence append TRITON_CONSTRAINT
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries.
if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then
TRITON_CONSTRAINT="platform_system == 'Linux'"
fi
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
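The TRITON_CONSTRAINT strings above are PEP 508 environment markers appended to the wheel's install requirements. One way to sanity-check the resulting requirement line, using the third-party packaging library; the pinned triton version is a placeholder, since triton_version.txt is not part of this excerpt.

from packaging.requirements import Requirement

req = Requirement(
    "triton==3.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'"
)
print(req.name, req.specifier)     # triton ==3.1.0
print(req.marker.evaluate())       # True only on a Linux x86_64 interpreter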

View File

@ -28,10 +28,6 @@ runs:
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: Print GPU info (if present)
shell: bash
run: if [ -f /usr/bin/nvidia-smi ]; then nvidia-smi; fi
- name: Check if in a container runner
shell: bash
id: check_container_runner
@ -86,6 +82,37 @@ runs:
# Prune all of the docker images
docker system prune -af
- name: Manually resolve download.pytorch.org
shell: bash
continue-on-error: true
run: |
set +e
set -x
PT_DOMAIN=download.pytorch.org
# TODO: Flaky access to download.pytorch.org https://github.com/pytorch/pytorch/issues/100400,
# cleaning this up once the issue is fixed. There is more than one resolved IP here, the last
# one is returned at random
RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1)
if [ -z "${RESOLVED_IP}" ]; then
echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..."
RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1)
if [ -z "${RESOLVED_IP}" ]; then
echo "Couldn't resolve ${PT_DOMAIN}, exiting..."
exit 1
fi
fi
if grep -r "${PT_DOMAIN}" /etc/hosts; then
# Clean up any old records first
sudo sed -i "/${PT_DOMAIN}/d" /etc/hosts
fi
echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
cat /etc/hosts
- name: Check that the docker daemon is running
shell: bash
continue-on-error: true

Binary file not shown.

View File

@ -18,7 +18,6 @@ class GitHubComment:
body_text: str
created_at: str
author_login: str
author_url: Optional[str]
author_association: str
editor_login: Optional[str]
database_id: int
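Since the hunks in this view carry no +/- markers, the shape GitHubComment appears to take once author_url is dropped is reconstructed below from the visible fields; the decorator and imports are assumptions.

from dataclasses import dataclass
from typing import Optional

@dataclass
class GitHubComment:
    body_text: str
    created_at: str
    author_login: str
    author_association: str
    editor_login: Optional[str]
    database_id: int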

Binary file not shown.

View File

@ -38,7 +38,6 @@ def mock_get_comments() -> list[GitHubComment]:
body_text="mock_body_text",
created_at="",
author_login="",
author_url=None,
author_association="",
editor_login=None,
database_id=1,
@ -49,7 +48,6 @@ def mock_get_comments() -> list[GitHubComment]:
body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""),
created_at="",
author_login=BOT_AUTHORS[1],
author_url=None,
author_association="",
editor_login=None,
database_id=2,

View File

@ -32,7 +32,6 @@ from trymerge import (
main as trymerge_main,
MandatoryChecksMissingError,
MergeRule,
PostCommentError,
RE_GHSTACK_DESC,
read_merge_rules,
remove_job_name_suffix,
@ -589,23 +588,6 @@ class TestTryMerge(TestCase):
self.assertEqual(mock_merge_base, pr.get_merge_base())
mocked_gh_fetch_merge_base.assert_called_once()
def test_app_can_revert(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 164660)
repo = DummyGitRepo()
app_comment_id, impostor_comment_id = 3375785595, 3377647892
# Check that app can revert
self.assertIsNotNone(validate_revert(repo, pr, comment_id=app_comment_id))
# But impostor can not
self.assertRaises(
PostCommentError,
lambda: validate_revert(repo, pr, comment_id=impostor_comment_id),
)
# Despite its name being the name of the bot
self.assertEqual(
pr.get_comment_by_id(impostor_comment_id).author_login,
"pytorch-auto-revert",
)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")

View File

@ -234,7 +234,6 @@ query ($owner: String!, $name: String!, $number: Int!) {
createdAt
author {
login
url
}
authorAssociation
editor {
@ -1094,7 +1093,6 @@ class GitHubPR:
body_text=node["bodyText"],
created_at=node["createdAt"] if "createdAt" in node else "",
author_login=node["author"]["login"],
author_url=node["author"].get("url", None),
author_association=node["authorAssociation"],
editor_login=editor["login"] if editor else None,
database_id=node["databaseId"],
@ -2031,11 +2029,6 @@ def validate_revert(
# For some reason, one can not be a member of private repo, only CONTRIBUTOR
if pr.is_base_repo_private():
allowed_reverters.append("CONTRIBUTOR")
# Special case the pytorch-auto-revert app, which does not have an association
# But should be able to issue revert command
if comment.author_url == "https://github.com/apps/pytorch-auto-revert":
allowed_reverters.append("NONE")
if author_association not in allowed_reverters:
raise PostCommentError(
f"Will not revert as @{author_login} is not one of "

View File

@ -37,7 +37,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: "linux.c7i.12xlarge"
runner: "linux.12xlarge"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '9.0'

View File

@ -2,7 +2,7 @@ name: inductor-perf-nightly-h100
on:
schedule:
- cron: 15 0 * * 1-6
- cron: 15 0,12 * * 1-6
- cron: 0 7 * * 0
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let's try to run torchao cudagraphs_low_precision as part of cudagraphs

View File

@ -63,7 +63,6 @@ jobs:
# Same as the build job
python-version: 3.12.7
test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }}
timeout-minutes: 300
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4

View File

@ -59,29 +59,3 @@ jobs:
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-gfx1100-test:
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3_10-gfx1100
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
]}
tests-to-include: >
test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
inductor/test_flex_attention inductor/test_max_autotune
secrets: inherit

View File

@ -35,7 +35,7 @@ jobs:
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-1-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3
runner: linux.c7i.12xlarge
runner: linux.12xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
@ -56,7 +56,7 @@ jobs:
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
runner: linux.c7i.12xlarge
runner: linux.12xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },

2 .gitignore vendored
View File

@ -88,7 +88,7 @@ torch_compile_debug/
# Listed manually because some files in this directory are not generated
torch/testing/_internal/generated/annotated_fn_args.py
torch/testing/_internal/data/*.pt
torch/headeronly/version.h
torch/csrc/api/include/torch/version.h
torch/csrc/cudnn/cuDNN.cpp
torch/csrc/generated
torch/csrc/generic/TensorMethods.cpp

View File

@ -28,7 +28,7 @@ exclude_patterns = [
'torch/lib/**',
'venv/**',
'**/*.pyi',
"tools/experimental/torchfuzz/**",
"tools/experimental/dynamic_shapes/torchfuzz/**",
'tools/test/test_selective_build.py',
]
command = [
@ -198,7 +198,7 @@ exclude_patterns = [
'tools/test/gen_operators_yaml_test.py',
'tools/test/gen_oplist_test.py',
'tools/test/test_selective_build.py',
'tools/experimental/torchfuzz/**',
'tools/experimental/dynamic_shapes/torchfuzz/**',
]
command = [
'python3',

View File

@ -13,9 +13,6 @@ load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libt
load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources")
load("//:tools/bazel.bzl", "rules")
# Export files for use by torch/headeronly (where version.h generation now lives)
exports_files(["version.txt"])
define_targets(rules = rules)
COMMON_COPTS = [
@ -693,9 +690,7 @@ cc_library(
"torch/csrc/*/generated/*.h",
"torch/csrc/jit/serialization/mobile_bytecode_generated.h",
] + torch_cuda_headers,
) + GENERATED_AUTOGRAD_CPP + [
"//torch/headeronly:version_h",
],
) + GENERATED_AUTOGRAD_CPP + [":version_h"],
includes = [
"third_party/kineto/libkineto/include",
"torch/csrc",

View File

@ -28,19 +28,4 @@ inline std::ostream& operator<<(std::ostream& stream, at::BlasBackend backend) {
return stream << BlasBackendToString(backend);
}
namespace blas {
enum class ScalingType : std::uint8_t {
TensorWise, // fp32 scales
RowWise, // fp32 scales
BlockWise1x16, // fp8_e4m3fn scales
BlockWise1x32, // fp8_e8m0fnu scales
BlockWise1x128, // fp32 scales
BlockWise128x128, // fp32 scales
};
enum class SwizzleType : std::uint8_t { NO_SWIZZLE = 0, SWIZZLE_32_4_4 = 1 };
} // namespace blas
} // namespace at

View File

@ -144,7 +144,8 @@ inline std::string _all_equal_numel_error(at::ArrayRef<Tensor> tensors) {
inline bool _apply_preamble(ArrayRef<Tensor> tensors) {
checkDeviceType("CPU_tensor_apply", tensors, kCPU);
checkLayout("CPU_tensor_apply", tensors, kStrided);
TORCH_CHECK(_all_equal_numel(tensors), _all_equal_numel_error(tensors));
if (!_all_equal_numel(tensors))
TORCH_CHECK(false, _all_equal_numel_error(tensors));
// An empty tensor has no elements
for (auto& t : tensors)
if (t.numel() == 0)

View File

@ -587,33 +587,20 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {
rocm_fa_preferred_backend = b;
}
CuBLASReductionOption Context::allowFP16ReductionCuBLAS() const {
bool Context::allowFP16ReductionCuBLAS() const {
return allow_fp16_reduction_cublas;
}
CuBLASReductionOption inline get_reduction_option(bool allow_reduced_precision, bool allow_splitk) {
TORCH_CHECK(
!(allow_reduced_precision && !allow_splitk),
"allow_splitk=False is not supported when reduced precision reductions are enabled");
if (allow_reduced_precision) {
return CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
} else if (allow_splitk) {
return CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK;
} else {
return CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK;
}
void Context::setAllowFP16ReductionCuBLAS(bool b) {
allow_fp16_reduction_cublas = b;
}
void Context::setAllowFP16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
allow_fp16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
}
CuBLASReductionOption Context::allowBF16ReductionCuBLAS() const {
bool Context::allowBF16ReductionCuBLAS() const {
return allow_bf16_reduction_cublas;
}
void Context::setAllowBF16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
allow_bf16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
void Context::setAllowBF16ReductionCuBLAS(bool b) {
allow_bf16_reduction_cublas = b;
}
bool Context::allowFP16AccumulationCuBLAS() const {

View File

@ -38,12 +38,6 @@ namespace at {
class Tensor;
enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM };
enum class CuBLASReductionOption : uint8_t {
AllowReducedPrecisionWithSplitK = 0,
DisallowReducedPrecisionAllowSplitK = 1,
DisallowReducedPrecisionDisallowSplitK = 2,
};
enum class TORCH_API Float32Backend { GENERIC, CUDA, MKLDNN };
enum class TORCH_API Float32Op { ALL, CONV, RNN, MATMUL };
enum class TORCH_API Float32Precision { NONE, IEEE, TF32, BF16 };
@ -226,15 +220,15 @@ class TORCH_API Context {
bool userEnabledMkldnn() const;
void setUserEnabledMkldnn(bool e);
bool benchmarkCuDNN() const;
void setBenchmarkCuDNN(bool /*b*/);
void setBenchmarkCuDNN(bool);
int benchmarkLimitCuDNN() const;
void setBenchmarkLimitCuDNN(int /*b*/);
void setBenchmarkLimitCuDNN(int);
bool immediateMiopen() const;
void setImmediateMiopen(bool /*b*/);
void setImmediateMiopen(bool);
bool deterministicCuDNN() const;
void setDeterministicCuDNN(bool /*b*/);
void setDeterministicCuDNN(bool);
bool deterministicMkldnn() const;
void setDeterministicMkldnn(bool /*b*/);
void setDeterministicMkldnn(bool);
bool userEnabledNNPACK() const;
void setUserEnabledNNPACK(bool e);
@ -252,32 +246,32 @@ class TORCH_API Context {
void setSDPPriorityOrder(const std::vector<int64_t>& order);
std::array<at::SDPBackend, at::num_sdp_backends> sDPPriorityOrder();
void setSDPUseFlash(bool /*e*/);
void setSDPUseFlash(bool);
bool userEnabledFlashSDP() const;
void setSDPUseMemEfficient(bool /*e*/);
void setSDPUseMemEfficient(bool);
bool userEnabledMemEfficientSDP() const;
void setSDPUseMath(bool /*e*/);
void setSDPUseMath(bool);
bool userEnabledMathSDP() const;
void setSDPUseCuDNN(bool /*e*/);
void setSDPUseCuDNN(bool);
bool userEnabledCuDNNSDP() const;
void setAllowFP16BF16ReductionMathSDP(bool /*e*/);
void setAllowFP16BF16ReductionMathSDP(bool);
bool allowFP16BF16ReductionMathSDP() const;
void setSDPUseOverrideable(bool /*e*/);
void setSDPUseOverrideable(bool);
bool userEnabledOverrideableSDP() const;
at::LinalgBackend linalgPreferredBackend() const;
void setLinalgPreferredBackend(at::LinalgBackend /*b*/);
void setLinalgPreferredBackend(at::LinalgBackend);
at::BlasBackend blasPreferredBackend();
void setBlasPreferredBackend(at::BlasBackend /*b*/);
void setBlasPreferredBackend(at::BlasBackend);
at::ROCmFABackend getROCmFAPreferredBackend();
void setROCmFAPreferredBackend(at::ROCmFABackend /*b*/);
void setROCmFAPreferredBackend(at::ROCmFABackend);
// Note [Enabling Deterministic Operations]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -310,9 +304,9 @@ class TORCH_API Context {
bool deterministicAlgorithms() const;
bool deterministicAlgorithmsWarnOnly() const;
void setDeterministicAlgorithms(bool /*b*/, bool /*warn_only*/);
void setDeterministicAlgorithms(bool, bool);
bool deterministicFillUninitializedMemory() const;
void setDeterministicFillUninitializedMemory(bool /*b*/);
void setDeterministicFillUninitializedMemory(bool);
// Note [Writing Nondeterministic Operations]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -356,23 +350,19 @@ class TORCH_API Context {
Float32Op op,
Float32Precision p);
bool allowTF32CuDNN(std::optional<Float32Op> op = std::nullopt) const;
void setAllowTF32CuDNN(bool /*b*/);
void setAllowTF32CuDNN(bool);
bool allowTF32OneDNN() const;
void setAllowTF32OneDNN(bool /*b*/);
void setAllowTF32OneDNN(bool);
bool allowTF32CuBLAS() const;
void setAllowTF32CuBLAS(bool /*b*/);
void setAllowTF32CuBLAS(bool);
Float32MatmulPrecision float32MatmulPrecision() const;
Float32Precision float32Precision(Float32Backend backend, Float32Op op) const;
CuBLASReductionOption allowFP16ReductionCuBLAS() const;
void setAllowFP16ReductionCuBLAS(
bool allow_reduced_precision,
bool allow_splitk = true);
CuBLASReductionOption allowBF16ReductionCuBLAS() const;
void setAllowBF16ReductionCuBLAS(
bool allow_reduced_precision,
bool allow_splitk = true);
bool allowFP16ReductionCuBLAS() const;
void setAllowFP16ReductionCuBLAS(bool);
bool allowBF16ReductionCuBLAS() const;
void setAllowBF16ReductionCuBLAS(bool);
bool allowFP16AccumulationCuBLAS() const;
void setAllowFP16AccumulationCuBLAS(bool /*b*/);
void setAllowFP16AccumulationCuBLAS(bool);
// Matmuls can use a so-called "persistent" kernel which launches one CUDA
// block for each SM on the GPU, and each block then iterates over multiple
@ -384,7 +374,7 @@ class TORCH_API Context {
// to make matmuls target only a subset of the SMs, so they can fully schedule
// even next to a comms kernel, and only be a few percent slower.
std::optional<int32_t> _SMCarveout_EXPERIMENTAL() const;
void _setSMCarveout_EXPERIMENTAL(std::optional<int32_t> /*c*/);
void _setSMCarveout_EXPERIMENTAL(std::optional<int32_t>);
at::QEngine qEngine() const;
void setQEngine(at::QEngine e);
@ -405,7 +395,7 @@ class TORCH_API Context {
void setDefaultMobileCPUAllocator();
void unsetDefaultMobileCPUAllocator();
bool allowFP16ReductionCPU() const;
void setAllowFP16ReductionCPU(bool /*b*/);
void setAllowFP16ReductionCPU(bool);
// Preserved for BC
void lazyInitCUDA() {
@ -462,10 +452,8 @@ class TORCH_API Context {
: at::Float32MatmulPrecision::HIGHEST;
int benchmark_limit_cudnn = 10;
bool allow_tf32_cudnn = true;
CuBLASReductionOption allow_fp16_reduction_cublas =
CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
CuBLASReductionOption allow_bf16_reduction_cublas =
CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
bool allow_fp16_reduction_cublas = true;
bool allow_bf16_reduction_cublas = true;
bool allow_fp16_accumulation_cublas = false;
std::optional<int32_t> sm_carveout = std::nullopt;
bool enabled_mkldnn = true;
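For orientation, the reduction knobs toggled in this header are surfaced in Python through torch.backends.cuda.matmul. A minimal sketch using the long-standing boolean properties; how (or whether) an allow_splitk-style option is exposed to Python is not visible in this hunk.

import torch

# Boolean switches corresponding to allow_fp16/bf16_reduction_cublas above
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False

a = torch.randn(256, 256, device="cuda", dtype=torch.float16)
b = torch.randn(256, 256, device="cuda", dtype=torch.float16)
c = a @ b   # cuBLAS is asked to keep the reduction in full precision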

View File

@ -62,7 +62,7 @@ constexpr const char* unknown_eventname = "eventname not specified";
#endif
} // namespace (anonymous)
MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd, int flags, size_t size)
MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags, size_t size)
: filename_(filename.empty() ? unknown_filename : filename)
, size_(0) // to be filled later
#ifdef _WIN32
@ -494,7 +494,7 @@ RefcountedMapAllocator::RefcountedMapAllocator(const char *filename, int flags,
initializeAlloc();
}
RefcountedMapAllocator::RefcountedMapAllocator(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size)
RefcountedMapAllocator::RefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size)
: RefcountedMapAllocatorArgCheck(flags)
, MapAllocator(WITH_FD, filename, flags, fd, size + map_alloc_alignment) {
@ -614,7 +614,7 @@ at::DataPtr MapAllocator::makeDataPtr(std::string_view filename, int flags, size
return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU};
}
at::DataPtr MapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
at::DataPtr MapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
auto* context = new MapAllocator(WITH_FD, filename, fd, flags, size);
if (actual_size_out) *actual_size_out = context->size();
return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU};
@ -626,7 +626,7 @@ at::DataPtr RefcountedMapAllocator::makeDataPtr(const char *filename, int flags,
return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU};
}
at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
auto* context = new RefcountedMapAllocator(WITH_FD, filename, fd, flags, size);
if (actual_size_out) *actual_size_out = context->size() - map_alloc_alignment;
return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU};

View File

@ -25,7 +25,7 @@ class TORCH_API MapAllocator {
public:
MapAllocator(std::string_view filename, int flags, size_t size);
MapAllocator(
WithFd /*unused*/,
WithFd,
std::string_view filename,
int fd,
int flags,
@ -59,14 +59,14 @@ class TORCH_API MapAllocator {
return flags_;
}
static MapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/);
static MapAllocator* fromDataPtr(const at::DataPtr&);
static at::DataPtr makeDataPtr(
std::string_view filename,
int flags,
size_t size,
size_t* actual_size_out);
static at::DataPtr makeDataPtr(
WithFd /*unused*/,
WithFd,
const char* filename,
int fd,
int flags,
@ -105,13 +105,13 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck,
public:
RefcountedMapAllocator(const char* filename, int flags, size_t size);
RefcountedMapAllocator(
WithFd /*unused*/,
WithFd,
const char* filename,
int fd,
int flags,
size_t size);
static RefcountedMapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/);
static RefcountedMapAllocator* fromDataPtr(const at::DataPtr&);
RefcountedMapAllocator(const RefcountedMapAllocator&) = delete;
RefcountedMapAllocator(RefcountedMapAllocator&&) = delete;
RefcountedMapAllocator& operator=(const RefcountedMapAllocator&) = delete;
@ -122,7 +122,7 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck,
size_t size,
size_t* actual_size_out);
static at::DataPtr makeDataPtr(
WithFd /*unused*/,
WithFd,
const char* filename,
int fd,
int flags,

View File

@ -273,7 +273,7 @@ c10::SymInt NestedTensorImpl::sym_numel_custom() const {
return NestedTensorImpl::numel_custom();
}
c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const {
c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat) const {
return nested_tensor_impl_is_contiguous(this);
}
IntArrayRef NestedTensorImpl::sizes_custom() const {

View File

@ -115,8 +115,7 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl {
// with real implementations
int64_t numel_custom() const override;
c10::SymInt sym_numel_custom() const override;
c10::SymBool sym_is_contiguous_custom(
MemoryFormat /*memory_format*/) const override;
c10::SymBool sym_is_contiguous_custom(MemoryFormat) const override;
int64_t size_custom(int64_t d) const override {
return this->size(d);
}

View File

@ -14,7 +14,7 @@ inline int64_t divup(int64_t x, int64_t y) {
TORCH_API void init_num_threads();
// Sets the number of threads to be used in parallel region
TORCH_API void set_num_threads(int /*nthreads*/);
TORCH_API void set_num_threads(int);
// Returns the maximum number of threads that may be used in a parallel region
TORCH_API int get_num_threads();
@ -37,7 +37,7 @@ inline void lazy_init_num_threads() {
}
}
TORCH_API void set_thread_num(int /*id*/);
TORCH_API void set_thread_num(int);
class TORCH_API ThreadIdGuard {
public:
@ -130,7 +130,7 @@ inline scalar_t parallel_reduce(
TORCH_API std::string get_parallel_info();
// Sets number of threads used for inter-op parallelism
TORCH_API void set_num_interop_threads(int /*nthreads*/);
TORCH_API void set_num_interop_threads(int);
// Returns the number of threads used for inter-op parallelism
TORCH_API size_t get_num_interop_threads();

View File

@ -252,7 +252,7 @@ void SparseCsrTensorImpl::set_stride(int64_t dim, int64_t new_stride) {
void SparseCsrTensorImpl::set_storage_offset(int64_t storage_offset) {
TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_storage_offset.");
}
c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const {
c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat) const {
TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have is_contiguous");
}
} // namespace at

View File

@ -32,10 +32,10 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
public:
explicit SparseCsrTensorImpl(
at::DispatchKeySet /*key_set*/,
at::DispatchKeySet,
at::Device device,
Layout layout,
const caffe2::TypeMeta /*data_type*/);
const caffe2::TypeMeta);
void resize_(int64_t nnz, IntArrayRef size);
void resize_and_clear_(
@ -86,8 +86,7 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
protected:
IntArrayRef strides_custom() const override;
SymIntArrayRef sym_strides_custom() const override;
SymBool sym_is_contiguous_custom(
MemoryFormat /*memory_format*/) const override;
SymBool sym_is_contiguous_custom(MemoryFormat) const override;
public:
void set_size(int64_t dim, int64_t new_size) override;

View File

@ -46,9 +46,7 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
public:
// Public for now...
explicit SparseTensorImpl(
at::DispatchKeySet /*key_set*/,
const caffe2::TypeMeta /*data_type*/);
explicit SparseTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta);
void release_resources() override;
@ -231,14 +229,14 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
}
void resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef<int64_t> size) {
_resize_(sparse_dim, dense_dim, size);
return _resize_(sparse_dim, dense_dim, size);
}
void resize_(
int64_t sparse_dim,
int64_t dense_dim,
ArrayRef<c10::SymInt> size) {
_resize_(sparse_dim, dense_dim, size);
return _resize_(sparse_dim, dense_dim, size);
}
// NOTE: this function will resize the sparse tensor and also set `indices`
@ -386,8 +384,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
private:
explicit SparseTensorImpl(
at::DispatchKeySet /*key_set*/,
const caffe2::TypeMeta /*data_type*/,
at::DispatchKeySet,
const caffe2::TypeMeta,
at::Tensor indices,
at::Tensor values);

View File

@ -59,7 +59,7 @@ static inline void set_item(const Tensor& self, ArrayRef<TensorIndex> indices, c
}
}
set_item(self, indices, value);
return set_item(self, indices, value);
}
} // namespace indexing

View File

@ -112,10 +112,10 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const Slice& slice);
// `torch.tensor([1, 2])`) | `torch::tensor({1, 2})`
struct TORCH_API TensorIndex final {
// Case 1: `at::indexing::None`
TensorIndex(std::nullopt_t /*unused*/) : type_(TensorIndexType::None) {}
TensorIndex(std::nullopt_t) : type_(TensorIndexType::None) {}
// Case 2: "..." / `at::indexing::Ellipsis`
TensorIndex(at::indexing::EllipsisIndexType /*unused*/)
TensorIndex(at::indexing::EllipsisIndexType)
: type_(TensorIndexType::Ellipsis) {}
TensorIndex(const char* str) : TensorIndex(at::indexing::Ellipsis) {
TORCH_CHECK_VALUE(
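These TensorIndex constructors back libtorch's Python-style indexing, and the header's own comment table maps Python syntax to C++. A small Python illustration, with the documented C++ equivalents noted in comments:

import torch

t = torch.arange(12).reshape(3, 4)
a = t[None, ...]   # C++: t.index({None, "..."}) -- TensorIndex(std::nullopt_t) / Ellipsis above
b = t[..., 1]      # C++: t.index({Ellipsis, 1})
c = t[1:3, 0]      # C++: t.index({Slice(1, 3), 0})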

View File

@ -765,8 +765,7 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) {
if (numel == 0) {
return;
} else if (numel < grain_size || at::get_num_threads() == 1) {
serial_for_each(loop, {0, numel});
return;
return serial_for_each(loop, {0, numel});
} else {
at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
serial_for_each(loop, {begin, end});

View File

@ -250,7 +250,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
using PtrVector = SmallVector<char*, 4>;
using StrideVector = SmallVector<int64_t, 6>;
void build(TensorIteratorConfig& /*config*/);
void build(TensorIteratorConfig&);
// The inner-loop function operates on the fastest moving dimension. It
// implements element-wise operations in terms of 1-d strided tensors.
@ -618,20 +618,20 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
#undef TORCH_DISALLOW_TEMPORARIES
protected:
// Mutable reference as it moves tensors out of TensorIteratorConfig
void populate_operands(TensorIteratorConfig& /*config*/);
void populate_operands(TensorIteratorConfig&);
void mark_outputs();
void mark_resize_outputs(const TensorIteratorConfig& /*config*/);
void compute_mem_overlaps(const TensorIteratorConfig& /*config*/);
void compute_shape(const TensorIteratorConfig& /*config*/);
void compute_strides(const TensorIteratorConfig& /*config*/);
void mark_resize_outputs(const TensorIteratorConfig&);
void compute_mem_overlaps(const TensorIteratorConfig&);
void compute_shape(const TensorIteratorConfig&);
void compute_strides(const TensorIteratorConfig&);
void reorder_dimensions();
void permute_dimensions(IntArrayRef perm);
void compute_types(const TensorIteratorConfig& /*config*/);
void compute_types(const TensorIteratorConfig&);
ScalarType compute_common_dtype();
void allocate_or_resize_outputs();
bool fast_set_up(const TensorIteratorConfig& /*config*/);
FastSetupType compute_fast_setup_type(const TensorIteratorConfig& /*config*/);
void compute_names(const TensorIteratorConfig& /*config*/);
bool fast_set_up(const TensorIteratorConfig&);
FastSetupType compute_fast_setup_type(const TensorIteratorConfig&);
void compute_names(const TensorIteratorConfig&);
void propagate_names_to_outputs();
void coalesce_dimensions();

View File

@ -20,7 +20,7 @@
namespace at {
TORCH_API int _crash_if_asan(int /*arg*/);
TORCH_API int _crash_if_asan(int);
// Converts a TensorList (i.e. ArrayRef<Tensor> to vector of TensorImpl*)
// NB: This is ONLY used by legacy TH bindings, and ONLY used by cat.

View File

@ -148,7 +148,7 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg, DeviceType device_
Banned functions
*******************************/
static Tensor binary_cross_entropy_banned(const Tensor & /*unused*/, const Tensor & /*unused*/, const std::optional<Tensor>& /*unused*/, int64_t /*unused*/) {
static Tensor binary_cross_entropy_banned(const Tensor &, const Tensor &, const std::optional<Tensor>&, int64_t) {
TORCH_CHECK(false, "torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast.\n"
"Many models use a sigmoid layer right before the binary cross entropy layer.\n"
"In this case, combine the two layers using torch.nn.functional.binary_cross_entropy_with_logits\n"

View File

@ -49,7 +49,7 @@ static void check_unique_names(DimnameList names) {
}
void check_names_valid_for(const TensorBase& tensor, DimnameList names) {
impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
return impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
}
void check_names_valid_for(size_t tensor_dim, DimnameList names) {

View File

@ -27,11 +27,11 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface {
HasNonWildcard
};
explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, DimnameList names)
explicit NamedTensorMeta(HAS_NON_WILDCARD, DimnameList names)
: names_(names.vec()) {
check_invariants();
}
explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, std::vector<Dimname>&& names)
explicit NamedTensorMeta(HAS_NON_WILDCARD, std::vector<Dimname>&& names)
: names_(std::move(names)) {
check_invariants();
}
@ -52,13 +52,13 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface {
std::any_of(names_.begin(), names_.end(), [](const Dimname& n) { return !n.isWildcard(); }));
}
void set_names(HAS_NON_WILDCARD /*unused*/, DimnameList new_names) {
void set_names(HAS_NON_WILDCARD, DimnameList new_names) {
TORCH_INTERNAL_ASSERT(new_names.size() == names_.size());
std::copy(new_names.begin(), new_names.end(), names_.begin());
check_invariants();
}
void set_names(HAS_NON_WILDCARD /*unused*/, std::vector<Dimname>&& new_names) {
void set_names(HAS_NON_WILDCARD, std::vector<Dimname>&& new_names) {
TORCH_INTERNAL_ASSERT(new_names.size() == names_.size());
names_ = std::move(new_names);
check_invariants();

View File

@ -13,7 +13,7 @@ class TORCH_API PythonOpRegistrationTrampoline final {
public:
// Returns true if you successfully registered yourself (that means
// you are in the hot seat for doing the operator registrations!)
static bool registerInterpreter(c10::impl::PyInterpreter* /*interp*/);
static bool registerInterpreter(c10::impl::PyInterpreter*);
// Returns nullptr if no interpreter has been registered yet.
static c10::impl::PyInterpreter* getInterpreter();

View File

@ -138,7 +138,7 @@ void Tensor::_backward(TensorList inputs,
const std::optional<Tensor>& gradient,
std::optional<bool> keep_graph,
bool create_graph) const {
impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
return impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
}
const TensorBase& TensorBase::requires_grad_(bool _requires_grad) const {
@ -173,12 +173,4 @@ unsigned TensorBase::_register_hook(std::function<TensorBase(const TensorBase&)>
return impl::GetVariableHooks()->_register_hook(*this, std::move(hook));
}
std::optional<ScalarType> TensorBase::grad_dtype() const {
return impl::GetVariableHooks()->grad_dtype(*this);
}
void TensorBase::set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const {
return impl::GetVariableHooks()->set_grad_dtype(*this, grad_dtype);
}
} // namespace at

View File

@ -100,7 +100,7 @@ class TORCH_API TensorBase {
// Create a Tensor with a +0 reference count. Special care must be
// taken to avoid decrementing this reference count at destruction
// time. Intended to support MaybeOwnedTraits<Tensor>.
explicit TensorBase(unsafe_borrow_t /*unused*/, const TensorBase& rhs)
explicit TensorBase(unsafe_borrow_t, const TensorBase& rhs)
: impl_(c10::intrusive_ptr<at::TensorImpl, UndefinedTensorImpl>(rhs.impl_.get(), c10::raw::DontIncreaseRefcount{})) {}
friend MaybeOwnedTraits<TensorBase>;
@ -930,10 +930,6 @@ public:
const TensorBase& requires_grad_(bool _requires_grad=true) const;
std::optional<ScalarType> grad_dtype() const;
void set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const;
// View Variables
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -954,7 +950,7 @@ protected:
c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> impl_;
private:
TensorBase __dispatch_contiguous(c10::MemoryFormat /*memory_format*/) const;
TensorBase __dispatch_contiguous(c10::MemoryFormat) const;
};
inline DeviceIndex get_device(const TensorBase& self) {

View File

@ -68,8 +68,6 @@ struct TORCH_API VariableHooksInterface {
const c10::OperatorHandle& op,
c10::DispatchKeySet dispatch_keys,
torch::jit::Stack* stack) const = 0;
virtual std::optional<c10::ScalarType> grad_dtype(const TensorBase&) const = 0;
virtual void set_grad_dtype(const TensorBase&, const std::optional<c10::ScalarType>&) const = 0;
};
TORCH_API void SetVariableHooks(VariableHooksInterface* hooks);

View File

@ -18,10 +18,10 @@ class KernelFunction;
// implementation notes; notably, this does NOT actually go through the
// boxing/unboxing codepath.
TORCH_API void fallthrough_kernel(
OperatorKernel* /*unused*/,
const OperatorHandle& /*unused*/,
DispatchKeySet /*unused*/,
Stack* /*unused*/);
OperatorKernel*,
const OperatorHandle&,
DispatchKeySet,
Stack*);
// Note [Ambiguity in AutogradOther kernel]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -62,10 +62,10 @@ TORCH_API void fallthrough_kernel(
// than arbitrarily pick one or the other, we just register a kernel that raises
// an error and let the user decide how to proceed.
TORCH_API void ambiguous_autogradother_kernel(
OperatorKernel* /*unused*/,
const OperatorHandle& /*op*/,
DispatchKeySet /*unused*/,
Stack* /*unused*/);
OperatorKernel*,
const OperatorHandle&,
DispatchKeySet,
Stack*);
// Note [named_not_supported_kernel]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -75,10 +75,10 @@ TORCH_API void ambiguous_autogradother_kernel(
// give a good error message in cases when boxing is not supported). When
// boxing is universally supported this can be removed.
[[noreturn]] TORCH_API void named_not_supported_kernel(
OperatorKernel* /*unused*/,
const OperatorHandle& /*op*/,
DispatchKeySet /*unused*/,
Stack* /*unused*/);
OperatorKernel*,
const OperatorHandle&,
DispatchKeySet,
Stack*);
/**
* BoxedKernel is similar to a std::function storing a boxed kernel.
@ -185,16 +185,16 @@ class TORCH_API BoxedKernel final {
template <BoxedKernelFunction* func>
static void make_boxed_function(
OperatorKernel* /*unused*/,
OperatorKernel*,
const OperatorHandle& opHandle,
DispatchKeySet /*unused*/,
DispatchKeySet,
Stack* stack);
template <BoxedKernelFunction_withDispatchKeys* func>
static void make_boxed_function(
OperatorKernel* /*unused*/,
OperatorKernel*,
const OperatorHandle& opHandle,
DispatchKeySet /*ks*/,
DispatchKeySet,
Stack* stack);
explicit BoxedKernel(

View File

@ -11,9 +11,9 @@ inline BoxedKernel::BoxedKernel(
template <BoxedKernel::BoxedKernelFunction* func>
inline void BoxedKernel::make_boxed_function(
OperatorKernel* /*unused*/,
OperatorKernel*,
const OperatorHandle& opHandle,
DispatchKeySet /*unused*/,
DispatchKeySet,
Stack* stack) {
// Note that we're dropping the DispatchKeySet argument.
// See Note [Plumbing Keys Through The Dispatcher 2] for details.
@ -22,7 +22,7 @@ inline void BoxedKernel::make_boxed_function(
template <BoxedKernel::BoxedKernelFunction_withDispatchKeys* func>
inline void BoxedKernel::make_boxed_function(
OperatorKernel* /*unused*/,
OperatorKernel*,
const OperatorHandle& opHandle,
DispatchKeySet ks,
Stack* stack) {

View File

@ -10,7 +10,7 @@ namespace c10 {
// be handled specially. Its semantics is that it redispatches to the
// *next* dispatch key that would have been processed, skipping the current
// one.
void fallthrough_kernel(OperatorKernel* /*unused*/, const OperatorHandle& /*unused*/, DispatchKeySet /*unused*/, Stack* /*unused*/) {
void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*) {
TORCH_INTERNAL_ASSERT(0,
"fallthrough_kernel was executed but it should have been short-circuited by the dispatcher. "
"This could occur if you registered a fallthrough kernel as a override for a specific operator "
@ -19,7 +19,7 @@ void fallthrough_kernel(OperatorKernel* /*unused*/, const OperatorHandle& /*unus
"let us know in the bug tracker.");
}
void ambiguous_autogradother_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) {
void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) {
TORCH_INTERNAL_ASSERT(0,
op.operator_name(), " has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. "
"This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering "
@ -32,7 +32,7 @@ void ambiguous_autogradother_kernel(OperatorKernel* /*unused*/, const OperatorHa
"\nCanonical state\n~~~~~~~~~~~\n", op.dumpState(), "\n\n");
}
void named_not_supported_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) {
void named_not_supported_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) {
// DO NOT LOOK AT STACK, YOU HAVE SHORT CIRCUITED BOXING
// See Note [named_not_supported_kernel]
TORCH_CHECK(0,

View File

@ -229,7 +229,7 @@ class TORCH_API KernelFunction final {
* &unboxed_func>();
*/
template <class FuncPtr, bool AllowLegacyTypes = false>
static KernelFunction makeFromUnboxedFunction(FuncPtr /*func_ptr*/);
static KernelFunction makeFromUnboxedFunction(FuncPtr);
/**
* Create a KernelFunction from an unboxed function.
@ -271,7 +271,7 @@ class TORCH_API KernelFunction final {
std::string dumpState() const;
// For testing internal invariants only
bool _equalsBoxedAndUnboxed(const KernelFunction& /*other*/) const;
bool _equalsBoxedAndUnboxed(const KernelFunction&) const;
// Register a token to be invalidated when this KernelFunction is destroyed
void registerToken(std::weak_ptr<KernelToken> token) const;

View File

@ -131,7 +131,7 @@ C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack(
new (dest++) IValue(options.pinned_memory());
}
inline void boxArgsToStack(IValue*& /*unused*/) {}
inline void boxArgsToStack(IValue*&) {}
template <typename T, typename... Args>
C10_ALWAYS_INLINE_UNLESS_MOBILE void boxArgsToStack(
@ -185,7 +185,7 @@ struct PopResult<std::tuple<Types...>> final {
template <size_t... indices>
static Result pop_to_tuple_impl(
Stack& stack,
std::index_sequence<indices...> /*unused*/) {
std::index_sequence<indices...>) {
return std::make_tuple((std::move(stack[indices]).template to<Types>())...);
}
};

View File

@ -561,7 +561,7 @@ struct wrap_kernel_functor_unboxed_<
// doesn't use &&
static ReturnType call(
OperatorKernel* functor,
DispatchKeySet /*unused*/,
DispatchKeySet,
ParameterTypes... args) {
KernelFunctor* functor_ = static_cast<KernelFunctor*>(functor);
// Note [Plumbing Keys Through The Dispatcher 2]
@ -629,8 +629,8 @@ call_functor_with_args_from_stack_(
OperatorKernel* functor,
DispatchKeySet dispatchKeySet,
Stack* stack,
std::index_sequence<ivalue_arg_indices...> /*unused*/,
guts::typelist::typelist<ArgTypes...>* /*unused*/) {
std::index_sequence<ivalue_arg_indices...>,
guts::typelist::typelist<ArgTypes...>*) {
(void)(stack); // when sizeof...(ivalue_arg_indices) == 0, this argument would
// be unused and we have to silence the compiler warning.
@ -708,7 +708,7 @@ struct push_outputs<std::tuple<OutputTypes...>, AllowDeprecatedTypes> final {
static void call_(
std::tuple<OutputTypes...>&& output,
Stack* stack,
std::index_sequence<indices...> /*unused*/) {
std::index_sequence<indices...>) {
torch::jit::push(
*stack,
return_to_ivalue<OutputTypes, AllowDeprecatedTypes>::call(
@ -718,7 +718,7 @@ struct push_outputs<std::tuple<OutputTypes...>, AllowDeprecatedTypes> final {
static void copy_(
const std::tuple<OutputTypes...>& output,
Stack* stack,
std::index_sequence<indices...> /*unused*/) {
std::index_sequence<indices...>) {
torch::jit::push(
*stack,
return_to_ivalue<OutputTypes, AllowDeprecatedTypes>::copy(
@ -741,7 +741,7 @@ struct make_boxed_from_unboxed_functor final {
static void call(
OperatorKernel* functor,
const OperatorHandle& /*unused*/,
const OperatorHandle&,
DispatchKeySet dispatchKeySet,
Stack* stack) {
using ReturnType =

View File

@ -63,13 +63,13 @@ struct BuiltinOpFunction : public Function {
bool call(
Stack& stack,
std::optional<size_t> /*unused*/,
c10::function_ref<void(const Code&)> /*unused*/) override {
std::optional<size_t>,
c10::function_ref<void(const Code&)>) override {
run(stack);
return false;
}
bool call(Stack& stack, c10::function_ref<void(const mobile::Code&)> /*unused*/)
bool call(Stack& stack, c10::function_ref<void(const mobile::Code&)>)
override {
run(stack);
return false;

View File

@ -80,8 +80,7 @@ struct MultiDispatchKeySet : at::IterArgs<MultiDispatchKeySet> {
ts = ts | x.key_set();
}
}
[[noreturn]] void operator()(
at::ArrayRef<std::optional<at::Tensor>> /*unused*/) {
[[noreturn]] void operator()(at::ArrayRef<std::optional<at::Tensor>>) {
// Just checking that the handling of Tensor?[] didn't change.
TORCH_INTERNAL_ASSERT(false);
}
@ -96,7 +95,7 @@ struct MultiDispatchKeySet : at::IterArgs<MultiDispatchKeySet> {
}
}
template <typename T>
void operator()(const T& /*unused*/) {
void operator()(const T&) {
// do nothing
}
};

View File

@ -496,7 +496,7 @@ class TORCH_API OperatorHandle {
}
void checkInvariants() const {
operatorDef_->op.checkInvariants();
return operatorDef_->op.checkInvariants();
}
c10::ArrayRef<at::Tag> getTags() const {
@ -633,7 +633,7 @@ class TypedOperatorHandle<Return(Args...)> final : public OperatorHandle {
namespace detail {
template <class... Args>
inline void unused_arg_(const Args&... /*unused*/) {}
inline void unused_arg_(const Args&...) {}
// CaptureKernelCall is intended to capture return values from Dispatcher
// unboxed kernel calls. A record function may request to get outputs from the
@ -932,7 +932,7 @@ inline void Dispatcher::redispatchBoxed(
}
#endif
const auto& kernel = entry.lookup(dispatchKeySet);
kernel.callBoxed(op, dispatchKeySet, stack);
return kernel.callBoxed(op, dispatchKeySet, stack);
}
} // namespace c10

View File

@ -105,7 +105,7 @@ class TORCH_API OperatorEntry final {
// versa that is an error. (Refcounting for the registrations is
// handled in the OperatorHandle in Dispatcher)
void registerSchema(
FunctionSchema&& /*schema*/,
FunctionSchema&&,
std::string&& debug,
std::vector<at::Tag> tags = {});
void deregisterSchema();

View File

@ -177,7 +177,7 @@ bool DynamicType::equals(const Type& rhs) const {
return equals(*create(rhs));
}
bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream* /*why_not*/) const {
bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream*) const {
auto other = create(rhs);
if (tag_ == other->tag_) {
if (equals(*other)) {
@ -371,7 +371,7 @@ DynamicTypePtr ivalue::TupleTypeFactory<c10::DynamicType>::create(
}
DynamicTypePtr ivalue::TupleTypeFactory<c10::DynamicType>::fallback(
const Type& /*unused*/) {
const Type&) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
return nullptr;
}

View File

@ -138,8 +138,8 @@ class DynamicType : public SharedType {
struct Arguments {
Arguments() = default;
Arguments(c10::ArrayRef<TypePtr> /*args*/);
Arguments(const std::vector<std::string_view>& /*names*/, c10::ArrayRef<TypePtr> /*args*/);
Arguments(c10::ArrayRef<TypePtr>);
Arguments(const std::vector<std::string_view>&, c10::ArrayRef<TypePtr>);
std::vector<LabeledDynamicType> elems;
};
@ -156,15 +156,15 @@ class DynamicType : public SharedType {
static const TypeKind Kind = TypeKind::DynamicType;
static TORCH_API DynamicTypePtr create(Type& ty);
explicit DynamicType(Tag /*tag*/, Arguments /*arguments*/);
explicit DynamicType(Tag /*tag*/, std::string_view /*name*/, Arguments /*arguments*/);
explicit DynamicType(Tag, Arguments);
explicit DynamicType(Tag, std::string_view, Arguments);
DynamicType(DynamicType&& other) = delete;
DynamicType(const DynamicType&) = delete;
DynamicType& operator=(const DynamicType&) = delete;
DynamicType& operator=(DynamicType&&) = delete;
TypePtr containedType(size_t /*i*/) const override;
TypePtr containedType(size_t) const override;
size_t containedTypeSize() const override;
Tag tag() const {
return tag_;

View File

@ -96,15 +96,15 @@ struct TORCH_API Function {
// Overload for server interpreter, a bailout size is needed for graph
// executor.
virtual bool call(
Stack& /*unused*/,
std::optional<size_t> /*unused*/,
c10::function_ref<void(const Code&)> /*unused*/) {
Stack&,
std::optional<size_t>,
c10::function_ref<void(const Code&)>) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
return false;
}
// Overload for mobile interpreter.
virtual bool call(Stack& /*unused*/, c10::function_ref<void(const mobile::Code&)> /*unused*/) {
virtual bool call(Stack&, c10::function_ref<void(const mobile::Code&)>) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
return false;
}

View File

@ -847,7 +847,7 @@ struct TORCH_API IValue final {
IValue(std::optional<T> v);
template <class T, enable_if_list_is_ivalue_constructible<T> = nullptr>
IValue(c10::OptionalArrayRef<T> v);
IValue(std::nullopt_t /*unused*/);
IValue(std::nullopt_t);
// ClassType
IValue(c10::intrusive_ptr<ivalue::Object> v);

View File

@ -660,7 +660,7 @@ struct TORCH_API TupleTypeFactory<TupleType> {
template <>
struct TORCH_API TupleTypeFactory<c10::DynamicType> {
static DynamicTypePtr create(const std::vector<TypePtr>& elemTypes);
static DynamicTypePtr fallback(const Type& /*unused*/);
static DynamicTypePtr fallback(const Type&);
};
struct TORCH_API Tuple : c10::intrusive_ptr_target {
@ -1682,7 +1682,7 @@ struct ivalue::EnumHolder : c10::intrusive_ptr_target {
namespace detail {
struct _guarded_unsigned_long_unique_dummy final {
_guarded_unsigned_long_unique_dummy(int64_t /*unused*/){}
_guarded_unsigned_long_unique_dummy(int64_t){}
};
using _guarded_unsigned_long = std::conditional_t<
std::is_same_v<unsigned long, uint32_t> ||
@ -1776,7 +1776,7 @@ template <class Elem>
// native_functions.yaml still return std::vector.
// C10_DEPRECATED_MESSAGE("IValues based on std::vector<T> are potentially slow
// and deprecated. Please use torch::List<T> instead.")
std::vector<Elem> generic_to(IValue ivalue, _fake_type<std::vector<Elem>> /*unused*/) {
std::vector<Elem> generic_to(IValue ivalue, _fake_type<std::vector<Elem>>) {
// We need to do a deep copy of the vector because there might be other
// references to this same IValue that also use the list. We can't just
// move the elements out.
@ -1826,18 +1826,18 @@ c10::intrusive_ptr<T> IValue::toCustomClass() const& {
}
template <typename T>
T generic_to(IValue ivalue, _fake_type<T> /*unused*/) {
T generic_to(IValue ivalue, _fake_type<T>) {
using ElemType = typename std::remove_pointer<T>::type::element_type;
return std::move(ivalue).template toCustomClass<ElemType>();
}
template <typename T>
tagged_capsule<T> generic_to(IValue ivalue, _fake_type<tagged_capsule<T>> /*unused*/) {
tagged_capsule<T> generic_to(IValue ivalue, _fake_type<tagged_capsule<T>>) {
return tagged_capsule<T>{std::move(ivalue)};
}
template <typename Elem>
c10::List<Elem> generic_to(IValue ivalue, _fake_type<c10::List<Elem>> /*unused*/) {
c10::List<Elem> generic_to(IValue ivalue, _fake_type<c10::List<Elem>>) {
return impl::toTypedList<Elem>(std::move(ivalue).toList());
}
@ -1867,7 +1867,7 @@ std::vector<T> createVectorFromList(const c10::List<T>& impl) {
}
template <typename T>
OptionalArray<T> generic_to(IValue ivalue, _fake_type<OptionalArray<T>> /*unused*/) {
OptionalArray<T> generic_to(IValue ivalue, _fake_type<OptionalArray<T>>) {
if (ivalue.isNone()) {
return {};
}
@ -1880,8 +1880,8 @@ namespace detail {
template <typename Elem, size_t... I>
std::array<Elem, sizeof...(I)> generic_to_array(
IValue ivalue,
_fake_type<std::array<Elem, sizeof...(I)>> /*unused*/,
std::index_sequence<I...> /*unused*/) {
_fake_type<std::array<Elem, sizeof...(I)>>,
std::index_sequence<I...>) {
// We need to do a deep copy of the array because there might be other
// references to this same IValue that also use the list. We can't just
// move the elements out.
@ -1906,7 +1906,7 @@ std::array<Elem, N> generic_to(
template <typename Key, typename Value>
c10::Dict<Key, Value> generic_to(
IValue ivalue,
_fake_type<c10::Dict<Key, Value>> /*unused*/) {
_fake_type<c10::Dict<Key, Value>>) {
return impl::toTypedDict<Key, Value>(std::move(ivalue).toGenericDict());
}
@ -1915,7 +1915,7 @@ C10_DEPRECATED_MESSAGE(
"IValues based on std::unordered_map are slow and deprecated. Please use c10::Dict<K, V> instead.")
std::unordered_map<K, V> generic_to(
IValue ivalue,
_fake_type<std::unordered_map<K, V>> /*unused*/) {
_fake_type<std::unordered_map<K, V>>) {
std::unordered_map<K, V> specialized_dict;
for (const auto& item : std::move(ivalue).toGenericDict()) {
@ -1926,7 +1926,7 @@ std::unordered_map<K, V> generic_to(
}
template <typename T>
std::optional<T> generic_to(IValue ivalue, _fake_type<std::optional<T>> /*unused*/) {
std::optional<T> generic_to(IValue ivalue, _fake_type<std::optional<T>>) {
if (ivalue.isNone()) {
return std::nullopt;
}
@ -1937,7 +1937,7 @@ namespace detail {
template <typename Tuple, std::size_t... INDEX>
Tuple generic_to_tuple_impl(
const ivalue::TupleElements& t,
std::index_sequence<INDEX...> /*unused*/) {
std::index_sequence<INDEX...>) {
return std::make_tuple(
t[INDEX].to<typename std::tuple_element<INDEX, Tuple>::type>()...);
}
@ -1951,7 +1951,7 @@ template <
std::is_lvalue_reference<Args>...,
std::negation<std::is_constructible<IValue, Args>>...>,
std::nullptr_t> = nullptr>
std::tuple<Args...> generic_to(const IValue& ivalue, _fake_type<std::tuple<Args...>> /*unused*/) {
std::tuple<Args...> generic_to(const IValue& ivalue, _fake_type<std::tuple<Args...>>) {
const auto& vals = ivalue.toTupleRef().elements();
TORCH_CHECK(vals.size() == sizeof...(Args));
return detail::generic_to_tuple_impl<std::tuple<Args...>>(vals, Indices{});
@ -2311,7 +2311,7 @@ inline IValue::IValue(std::optional<T> v) : IValue() {
}
}
inline IValue::IValue(std::nullopt_t /*unused*/) : IValue() {}
inline IValue::IValue(std::nullopt_t) : IValue() {}
inline IValue::IValue(c10::intrusive_ptr<ivalue::Object> v)
: tag(Tag::Object) {
@ -2482,15 +2482,15 @@ namespace ivalue {
namespace detail {
template <typename T>
IValue from_(T&& x, std::true_type /*unused*/) {
IValue from_(T&& x, std::true_type) {
return IValue(std::forward<T>(x));
}
template <typename T>
IValue from_(c10::intrusive_ptr<T> x, std::false_type /*unused*/) {
IValue from_(c10::intrusive_ptr<T> x, std::false_type) {
return IValue(std::move(x));
}
template <typename T>
IValue from_(T&& /*x*/, std::false_type /*unused*/) {
IValue from_(T&& /*x*/, std::false_type) {
static_assert(
guts::false_t<T>::value,
"You are calling from with a type that it doesn't support, and isn't a potential custom class (ie: is an intrusive_ptr)");
@ -2546,19 +2546,19 @@ struct MaybeOwnedTraits<IValue> {
return &borrow;
}
static bool debugBorrowIsValid(const borrow_type& /*unused*/) {
static bool debugBorrowIsValid(const borrow_type&) {
return true;
}
};
template <>
struct IValue::TagType<c10::Type> {
static TORCH_API c10::TypePtr get(const IValue& /*v*/);
static TORCH_API c10::TypePtr get(const IValue&);
};
template <>
struct IValue::TagType<c10::DynamicType> {
static TORCH_API c10::TypePtr get(const IValue& /*v*/);
static TORCH_API c10::TypePtr get(const IValue&);
};
template <typename T>

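For reference, the `generic_to(IValue, _fake_type<T>)` overloads above use their second parameter purely as a tag for overload resolution, which is why it can be left unnamed. A minimal, self-contained sketch of the same tag-dispatch pattern, using a hypothetical `fake_type` rather than the real c10 machinery:

#include <iostream>
#include <string>
#include <vector>

template <typename T>
struct fake_type {};  // hypothetical stand-in for a tag type like c10's _fake_type

// The tag parameter only selects the overload, so it needs no name.
std::string convert(int v, fake_type<std::string>) { return std::to_string(v); }
std::vector<int> convert(int v, fake_type<std::vector<int>>) { return {v}; }

int main() {
  auto s = convert(7, fake_type<std::string>{});        // picks the string overload
  auto vec = convert(7, fake_type<std::vector<int>>{}); // picks the vector overload
  std::cout << s << " " << vec.size() << "\n";
  return 0;
}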
View File

@ -44,7 +44,7 @@ constexpr int checkStaticTypes() {
}
template <typename... Ts, size_t... Is>
constexpr std::array<ArgumentDef, sizeof...(Ts)> createArgumentVectorFromTypes(std::index_sequence<Is...> /*unused*/) {
constexpr std::array<ArgumentDef, sizeof...(Ts)> createArgumentVectorFromTypes(std::index_sequence<Is...>) {
return (
// Check types for common errors
checkStaticTypes<Ts...>(),

View File

@ -83,7 +83,7 @@ inline bool operator!=(const OperatorName& lhs, const OperatorName& rhs) {
}
TORCH_API std::string toString(const OperatorName& opName);
TORCH_API std::ostream& operator<<(std::ostream& /*os*/, const OperatorName& /*opName*/);
TORCH_API std::ostream& operator<<(std::ostream&, const OperatorName&);
} // namespace c10

View File

@ -16,7 +16,7 @@ class SingletonTypePtr {
/* implicit */ SingletonTypePtr(T* p) : repr_(p) {}
// We need this to satisfy Pybind11, but it shouldn't be hit.
explicit SingletonTypePtr(std::shared_ptr<T> /*unused*/) { TORCH_CHECK(false); }
explicit SingletonTypePtr(std::shared_ptr<T>) { TORCH_CHECK(false); }
using element_type = typename std::shared_ptr<T>::element_type;

View File

@ -342,19 +342,19 @@ class Vectorized<c10::complex<double>> {
return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ);
}
Vectorized<c10::complex<double>> operator<(
const Vectorized<c10::complex<double>>& /*unused*/) const {
const Vectorized<c10::complex<double>>&) const {
TORCH_CHECK(false, "not supported for complex numbers");
}
Vectorized<c10::complex<double>> operator<=(
const Vectorized<c10::complex<double>>& /*unused*/) const {
const Vectorized<c10::complex<double>>&) const {
TORCH_CHECK(false, "not supported for complex numbers");
}
Vectorized<c10::complex<double>> operator>(
const Vectorized<c10::complex<double>>& /*unused*/) const {
const Vectorized<c10::complex<double>>&) const {
TORCH_CHECK(false, "not supported for complex numbers");
}
Vectorized<c10::complex<double>> operator>=(
const Vectorized<c10::complex<double>>& /*unused*/) const {
const Vectorized<c10::complex<double>>&) const {
TORCH_CHECK(false, "not supported for complex numbers");
}

View File

@ -422,34 +422,18 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
abType = CUDA_R_16F;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
#ifndef USE_ROCM
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
if (fp16_reduction !=
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
uint32_t mask =
fp16_reduction ==
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
CUBLASLT_REDUCTION_SCHEME_NONE)
: CUBLASLT_REDUCTION_SCHEME_NONE;
preference.setAttribute(
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
}
#endif
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
abType = CUDA_R_16BF;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
#ifndef USE_ROCM
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
if (bf16_reduction !=
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
uint32_t mask =
bf16_reduction ==
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
CUBLASLT_REDUCTION_SCHEME_NONE)
: CUBLASLT_REDUCTION_SCHEME_NONE;
preference.setAttribute(
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
}
#endif
} else {
@ -1136,15 +1120,8 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
}
if (prop->major >= 5) {
cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
TORCH_CHECK(fp16_reduction !=
at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
"torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
"..., allow_splitk=False) requires the cuBLASLt backend");
if (fp16_reduction !=
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
cublas_flags = static_cast<cublasMath_t>(
cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
}
// Disallow fp16 reductions that could lead to unexpected overflow issues.
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags));
@ -1203,15 +1180,8 @@ inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DT
GEMM_CHECK_ARGVALUES(at::BFloat16);
#ifndef USE_ROCM
cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
TORCH_CHECK(bf16_reduction !=
at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
"torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
"..., allow_splitk=False) requires the cuBLASLt backend");
if (bf16_reduction !=
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
cublas_flags = static_cast<cublasMath_t>(
cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
}
#endif
#if defined(USE_ROCM)
@ -1607,34 +1577,18 @@ bool gemm_and_bias(
abType = CUDA_R_16F;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
#ifndef USE_ROCM
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
if (fp16_reduction !=
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
uint32_t mask =
fp16_reduction ==
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
CUBLASLT_REDUCTION_SCHEME_NONE)
: CUBLASLT_REDUCTION_SCHEME_NONE;
preference.setAttribute(
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
}
#endif
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
abType = CUDA_R_16BF;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
#ifndef USE_ROCM
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
if (bf16_reduction !=
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
uint32_t mask =
bf16_reduction ==
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
CUBLASLT_REDUCTION_SCHEME_NONE)
: CUBLASLT_REDUCTION_SCHEME_NONE;
preference.setAttribute(
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
}
#endif
}
@ -1861,8 +1815,6 @@ template bool gemm_and_bias(
int64_t result_ld,
GEMMAndBiasActivationEpilogue activation);
using at::blas::ScalingType;
int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fast_accum) {
switch (scaling_type) {
case ScalingType::BlockWise1x32:

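For reference, a standalone sketch of the cuBLASLt preference pattern restored above, written against the raw cuBLASLt C API rather than the C++ preference wrapper used in this file. The `allow_reduced_precision` flag is an assumed stand-in for `allowFP16ReductionCuBLAS()` / `allowBF16ReductionCuBLAS()`:

#include <cublasLt.h>
#include <cstdint>

void restrict_reduction_schemes(cublasLtMatmulPreference_t pref,
                                bool allow_reduced_precision) {
  if (!allow_reduced_precision) {
    // Permit only reductions performed in the compute type, or no reduction at all.
    uint32_t mask = CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE;
    cublasLtMatmulPreferenceSetAttribute(
        pref, CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, &mask, sizeof(mask));
  }
}

This mirrors the boolean-gated logic above: when reduced-precision accumulation is disallowed, the heuristic search is constrained so cuBLASLt cannot pick a split-K kernel that reduces in fp16/bf16.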
View File

@ -14,7 +14,6 @@
*/
#include <ATen/cuda/CUDAContext.h>
#include <ATen/BlasBackend.h>
#include <ATen/OpMathType.h>
namespace at::cuda::blas {
@ -137,6 +136,15 @@ void int8_gemm(
int32_t* result_ptr,
int64_t result_ld);
enum class ScalingType : std::uint8_t {
TensorWise, // fp32 scales
RowWise, // fp32 scales
BlockWise1x16, // fp8_e4m3fn scales
BlockWise1x32, // fp8_e8m0fnu scales
BlockWise1x128, // fp32 scales
BlockWise128x128, // fp32 scales
};
void scaled_gemm(
char transa,
char transb,
@ -148,13 +156,13 @@ void scaled_gemm(
int64_t mat1_ld,
ScalarType mat1_dtype,
ScalarType mat1_scale_dtype,
at::blas::ScalingType mat1_scaling_type,
ScalingType mat1_scaling_type,
const void* mat2_ptr,
const void* mat2_scale_ptr,
int64_t mat2_ld,
ScalarType mat2_dtype,
ScalarType mat2_scale_dtype,
at::blas::ScalingType mat2_scaling_type,
ScalingType mat2_scaling_type,
const void* bias_ptr,
ScalarType bias_dtype,
void* result_ptr,

View File

@ -326,23 +326,6 @@ bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const {
#endif
}
bool CUDAHooks::supportsBFloat16RNNWithCuDNN() const {
#if AT_CUDNN_ENABLED() && (CUDNN_VERSION >= 91300)
if (!hasCUDA()) {
return false;
}
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
// Check for Volta cores
// Check for SM 8.0 (Ampere) or newer
if (prop->major >= 8) {
return true;
} else {
return false;
}
#else
return false;
#endif
}
long CUDAHooks::versionCuDNN() const {
#if AT_CUDNN_ENABLED()
return CUDNN_VERSION;

View File

@ -17,7 +17,7 @@ TORCH_CUDA_CPP_API void set_magma_init_fn(void (*magma_init_fn)());
// The real implementation of CUDAHooksInterface
struct CUDAHooks : public at::CUDAHooksInterface {
CUDAHooks(at::CUDAHooksArgs /*unused*/) {}
CUDAHooks(at::CUDAHooksArgs) {}
void init() const override;
Device getDeviceFromPtr(void* data) const override;
bool isPinnedPtr(const void* data) const override;
@ -45,7 +45,6 @@ struct CUDAHooks : public at::CUDAHooksInterface {
bool supportsDilatedConvolutionWithCuDNN() const override;
bool supportsDepthwiseConvolutionWithCuDNN() const override;
bool supportsBFloat16ConvolutionWithCuDNNv8() const override;
bool supportsBFloat16RNNWithCuDNN() const override;
bool hasCUDART() const override;
long versionCUDART() const override;
long versionCuDNN() const override;

View File

@ -29,7 +29,7 @@
namespace at::cuda::tunable {
using at::blas::ScalingType;
using at::cuda::blas::ScalingType;
enum class BlasOp {
N = 0,

View File

@ -29,7 +29,7 @@ template <typename ParamsT>
class Callable {
public:
virtual ~Callable() = default;
virtual TuningStatus Call(const ParamsT* /*unused*/) {
virtual TuningStatus Call(const ParamsT*) {
return FAIL;
}
virtual TuningStatus IsSupported(const ParamsT* params) {

View File

@ -166,10 +166,6 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
return false;
}
virtual bool supportsBFloat16RNNWithCuDNN() const {
return false;
}
virtual long versionCuDNN() const {
TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
}

View File

@ -25,7 +25,7 @@ struct TORCH_API HPUHooksInterface : AcceleratorHooksInterface {
false, "Cannot get device of pointer on HPU without HPU backend");
}
bool isPinnedPtr(const void* /*data*/) const override {
bool isPinnedPtr(const void*) const override {
return false;
}

View File

@ -410,7 +410,7 @@ struct ExistingBdimBatchRuleHelper<F, Func, c10::guts::typelist::typelist<A, T..
template <typename F, F Method, typename... ExtraArgs>
Tensor& unary_inplace_batch_rule(Tensor& self, std::optional<int64_t> /*unused*/, ExtraArgs... extra_args) {
Tensor& unary_inplace_batch_rule(Tensor& self, std::optional<int64_t>, ExtraArgs... extra_args) {
INVOKE(self, Method)(std::forward<ExtraArgs>(extra_args)...);
return self;
}

View File

@ -465,11 +465,11 @@ static void dynamicLayerBack(const c10::OperatorHandle& op, torch::jit::Stack* s
// used for functions that have aliasing operations but should be treated like they're out of place (i.e. lift_fresh)
static void dynamicLayerBackGradSpecialCase(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
dynamicLayerBack(op, stack, true);
return dynamicLayerBack(op, stack, true);
}
static void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
dynamicLayerBack(op, stack, false);
return dynamicLayerBack(op, stack, false);
}
TORCH_LIBRARY_IMPL(_, FuncTorchDynamicLayerFrontMode, m) {

View File

@ -18,7 +18,7 @@ extern std::atomic<const MetalInterface*> g_metal_impl_registry;
class MetalImplRegistrar {
public:
explicit MetalImplRegistrar(MetalInterface* /*impl*/);
explicit MetalImplRegistrar(MetalInterface*);
};
at::Tensor& metal_copy_(at::Tensor& self, const at::Tensor& src);

View File

@ -12,7 +12,7 @@
#define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled"
#define MPS_ERROR_RUNTIME_TOO_LOW \
"The MPS backend is supported on MacOS 14.0+. ", \
"The MPS backend is supported on MacOS 13.0+.", \
"Current OS version can be queried using `sw_vers`"
#define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \
"as the MPS framework doesn't support float64. Please use float32 instead."

View File

@ -2060,7 +2060,7 @@ std::tuple<Tensor, Tensor> linalg_lu_factor(const Tensor& A, bool pivot) {
}
// TODO Deprecate this function in favour of linalg_lu_factor_ex
std::tuple<Tensor, Tensor, Tensor> _lu_with_info(const Tensor& self, bool compute_pivots, bool /*unused*/) {
std::tuple<Tensor, Tensor, Tensor> _lu_with_info(const Tensor& self, bool compute_pivots, bool) {
TORCH_WARN_ONCE(
"torch.lu is deprecated in favor of torch.linalg.lu_factor / torch.linalg.lu_factor_ex and will be ",
"removed in a future PyTorch release.\n",

View File

@ -375,7 +375,7 @@ static void bf16_gemv_trans(
const at::BFloat16 beta,
at::BFloat16* y,
const int incy) {
bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
return bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
}
template <>

View File

@ -70,7 +70,7 @@ inline void searchsorted_maybe_trim_input_tensors(
const Tensor& raw_boundaries) {
Tensor trimmed_sorter;
Tensor raw_sorter;
searchsorted_maybe_trim_input_tensors(
return searchsorted_maybe_trim_input_tensors(
trimmed_input,
trimmed_boundaries,
trimmed_sorter,

View File

@ -93,12 +93,6 @@ inline bool cond_cudnn_grid_sampler(
const TensorBase& input,
const TensorBase& grid
) {
auto st = input.scalar_type();
if (!(st == kDouble || st == kFloat || st == kHalf))
return false;
st = grid.scalar_type();
if (!(st == kDouble || st == kFloat || st == kHalf))
return false;
return (
at::native::cudnn_is_acceptable(input) &&
at::native::cudnn_is_acceptable(grid) &&

View File

@ -108,13 +108,6 @@ bool use_mkldnn(const Tensor& input, TensorList params, TensorList hx) {
return false;
}
bool use_cudnn(const Tensor& t) {
bool acceptable = at::cudnn_is_acceptable(t);
auto st = t.scalar_type();
bool bfloat16_cond = st == kBFloat16 && at::detail::getCUDAHooks().supportsBFloat16RNNWithCuDNN();
return acceptable && (bfloat16_cond || st == kDouble || st == kFloat || st == kHalf);
}
template<typename T>
using pair_of = std::pair<T, T>;
@ -1207,7 +1200,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backwar
bool train, \
bool bidirectional, \
bool batch_first) { \
if (use_cudnn(_input)) { \
if (at::cudnn_is_acceptable(_input)) { \
Tensor output, hy; \
NAME##_cudnn_stub( \
_input.device().type(), \
@ -1269,7 +1262,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backwar
double dropout_p, \
bool train, \
bool bidirectional) { \
if (use_cudnn(data)) { \
if (at::cudnn_is_acceptable(data)) { \
Tensor output, hy; \
NAME##_packed_cudnn_stub( \
data.device().type(), \
@ -1437,7 +1430,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
TensorList _params, bool has_biases,
int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
if (use_cudnn(_input)) {
if (at::cudnn_is_acceptable(_input)) {
Tensor output, hy, cy;
lstm_cudnn_stub(_input.device().type(), output, hy, cy, _input, hx, _params, has_biases,
num_layers, dropout_p, train, bidirectional, batch_first);
@ -1498,7 +1491,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
TensorList _params, bool has_biases,
int64_t num_layers, double dropout_p, bool train, bool bidirectional) {
TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
if (use_cudnn(data)) {
if (at::cudnn_is_acceptable(data)) {
Tensor output, hy, cy;
lstm_packed_cudnn_stub(data.device().type(), output, hy, cy, data, batch_sizes, hx,
_params, has_biases, num_layers, dropout_p, train, bidirectional);

View File

@ -15,11 +15,7 @@ namespace at::native {
Scalar item(const Tensor& self) {
auto numel = self.sym_numel();
TORCH_SYM_CHECK(
numel.sym_eq(1),
"a Tensor with ",
numel,
" elements cannot be converted to Scalar");
TORCH_CHECK(numel == 1, "a Tensor with ", numel, " elements cannot be converted to Scalar");
if (self.is_sparse()) {
if (self._nnz() == 0) return Scalar(0);
if (self.is_coalesced()) return at::_local_scalar_dense(self._values());

View File

@ -346,17 +346,17 @@ template<typename acc_t>
struct AbsSwitch {};
template<typename scalar_t, typename acc_t>
inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch<acc_t> /*unused*/) {
inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch<acc_t>) {
return static_cast<acc_t>(data);
}
template<typename scalar_t, typename acc_t>
inline C10_DEVICE acc_t abs_if_complex(std::complex<scalar_t> data, AbsSwitch<acc_t> /*unused*/) {
inline C10_DEVICE acc_t abs_if_complex(std::complex<scalar_t> data, AbsSwitch<acc_t>) {
return static_cast<acc_t>(std::abs(data));
}
template<typename scalar_t, typename acc_t>
inline C10_DEVICE acc_t abs_if_complex(c10::complex<scalar_t> data, AbsSwitch<acc_t> /*unused*/) {
inline C10_DEVICE acc_t abs_if_complex(c10::complex<scalar_t> data, AbsSwitch<acc_t>) {
return static_cast<acc_t>(std::abs(at::opmath_type<c10::complex<scalar_t>>(data)));
}

View File

@ -846,7 +846,7 @@ TORCH_IMPL_FUNC(clamp_Tensor_out)
(const Tensor& self,
const OptionalTensorRef min,
const OptionalTensorRef max,
const Tensor& /*unused*/) {
const Tensor&) {
if (min && max) {
clamp_stub(device_type(), *this);
} else if (min) {

View File

@ -91,6 +91,9 @@ bool cudnn_is_acceptable(const TensorBase& self) {
return false;
if (!self.is_cuda())
return false;
auto st = self.scalar_type();
if (!(st == kDouble || st == kFloat || st == kHalf))
return false;
if (!detail::getCUDAHooks().compiledWithCuDNN())
return false;
// cuDNN functions like grid_sampler returns CUDNN_STATUS_BAD_PARAM on empty

View File

@ -25,11 +25,11 @@
namespace at::native {
void _backward(const Tensor& self, TensorList inputs, const std::optional<Tensor>& gradient_opt, std::optional<bool> keep_graph, bool create_graph) {
self._backward(inputs, gradient_opt, keep_graph, create_graph);
return self._backward(inputs, gradient_opt, keep_graph, create_graph);
}
void set_data(Tensor& self, const Tensor& new_data) {
self.set_data(new_data);
return self.set_data(new_data);
}
Tensor data(const Tensor& self) {
@ -54,7 +54,7 @@ Tensor& requires_grad_(Tensor& self, bool _requires_grad) {
}
void retain_grad(Tensor& self) {
self.retain_grad();
return self.retain_grad();
}
bool retains_grad(const Tensor& self) {

View File

@ -300,8 +300,7 @@ void div_floor_kernel(TensorIteratorBase& iter) {
// In the special case of unsigned integer division, floor division is
// equivalent to truncation division (since the signs of the divisor and
// dividend are always the same)
div_trunc_kernel(iter);
return;
return div_trunc_kernel(iter);
} else if (isIntegralType(dtype, /*includeBool*/ false)) {
// There's no SIMD integer division, so don't try to vectorize it.
AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_floor_cpu", [&]() {

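For reference, a small self-contained check of the reasoning in the comment above: unsigned quotients are never negative, so truncating division and floor division coincide, whereas signed operands can diverge on negative quotients.

#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
  uint32_t a = 7, b = 2;
  // Unsigned: truncation equals floor (3 == floor(3.5)).
  assert(a / b == static_cast<uint32_t>(std::floor(7.0 / 2.0)));
  // Signed: C++ integer division truncates toward zero, so -7 / 2 == -3,
  // while floor(-3.5) == -4; this is why the general path handles it separately.
  assert(-7 / 2 == -3);
  assert(static_cast<int>(std::floor(-7.0 / 2.0)) == -4);
  return 0;
}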
View File

@ -452,11 +452,11 @@ void convolution_depthwise3x3_winograd_impl(
#else
void convolution_depthwise3x3_winograd_impl(
const Arguments& /*unused*/,
const float* const /*unused*/,
const float* const /*unused*/,
const float* const /*unused*/,
float* const /*unused*/) {
const Arguments&,
const float* const,
const float* const,
const float* const,
float* const) {
}
#endif /* __ARM_NEON__ */

View File

@ -749,29 +749,21 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
// });
if (iter_dtype == kByte) {
cpu_hflip_vec<uint8_t>(iter);
return;
return cpu_hflip_vec<uint8_t>(iter);
} else if (iter_dtype == kChar) {
cpu_hflip_vec<int8_t>(iter);
return;
return cpu_hflip_vec<int8_t>(iter);
} else if (iter_dtype == kInt) {
cpu_hflip_vec<int32_t>(iter);
return;
return cpu_hflip_vec<int32_t>(iter);
} else if (iter_dtype == kLong) {
cpu_hflip_vec<int64_t>(iter);
return;
return cpu_hflip_vec<int64_t>(iter);
} else if (iter_dtype == kShort) {
cpu_hflip_vec<int16_t>(iter);
return;
return cpu_hflip_vec<int16_t>(iter);
} else if (iter_dtype == kBool) {
cpu_hflip_vec<bool>(iter);
return;
return cpu_hflip_vec<bool>(iter);
} else if (iter_dtype == kFloat) {
cpu_hflip_vec<float>(iter);
return;
return cpu_hflip_vec<float>(iter);
} else if (iter_dtype == kDouble) {
cpu_hflip_vec<double>(iter);
return;
return cpu_hflip_vec<double>(iter);
}
}
// other dtypes (float16, bfloat16, complex) are handled by cpu_kernel_vec (see below)
@ -786,12 +778,10 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
c == input_strides_2[1] &&
c == iter.element_size(0) * iter.shape()[0] // checks if dim=1 is contiguous as well
) {
cpu_hflip_channels_last_vec(iter);
return;
return cpu_hflip_channels_last_vec(iter);
}
// Special case: vertical flip using memcpy (faster than generic cpu_kernel_vec)
cpu_vflip_memcpy(iter);
return;
return cpu_vflip_memcpy(iter);
}
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(), "flip_cpu",

View File

@ -46,7 +46,7 @@ using namespace vec;
template <typename traits, std::size_t... INDEX>
typename traits::ArgsTuple
dereference_impl(char* C10_RESTRICT data[], const int64_t* strides, int64_t i,
std::index_sequence<INDEX...> /*unused*/) {
std::index_sequence<INDEX...>) {
return std::make_tuple(
c10::load<typename traits::template arg<INDEX>::type>(
data[INDEX] + i * strides[INDEX])...);
@ -65,7 +65,7 @@ dereference_vec_impl(char* C10_RESTRICT data[],
const typename traits::result_type& opt_scalar,
size_t S,
int64_t i,
std::index_sequence<INDEX...> /*unused*/) {
std::index_sequence<INDEX...>) {
using Vec = typename traits::result_type;
using scalar_t = typename Vec::value_type;
return std::make_tuple(
@ -231,7 +231,7 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve
template <typename traits, typename cb_t>
inline void unroll_contiguous_scalar_checks(
const int64_t* /*strides*/,
std::index_sequence<> /*unused*/,
std::index_sequence<>,
cb_t&& cb) {
cb(0);
}
@ -239,7 +239,7 @@ inline void unroll_contiguous_scalar_checks(
template <typename traits, typename cb_t, size_t INDEX0, size_t ...INDEX>
inline void unroll_contiguous_scalar_checks(
const int64_t* strides,
std::index_sequence<INDEX0, INDEX...> /*unused*/,
std::index_sequence<INDEX0, INDEX...>,
cb_t&& cb) {
if (is_contiguous_scalar<traits, INDEX0 + 1>(strides)) {
cb(INDEX0 + 1);

View File
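For reference, the unnamed `std::index_sequence<INDEX...>` parameters above exist only to carry a pack of indices for expansion. A minimal, self-contained sketch of the same idiom:

#include <array>
#include <cstddef>
#include <iostream>
#include <tuple>
#include <utility>

template <typename T, std::size_t N, std::size_t... I>
auto array_to_tuple_impl(const std::array<T, N>& a, std::index_sequence<I...>) {
  return std::make_tuple(a[I]...);  // expands to a[0], a[1], ..., a[N-1]
}

template <typename T, std::size_t N>
auto array_to_tuple(const std::array<T, N>& a) {
  return array_to_tuple_impl(a, std::make_index_sequence<N>{});
}

int main() {
  std::array<int, 3> a{1, 2, 3};
  auto t = array_to_tuple(a);
  std::cout << std::get<2>(t) << "\n";  // prints 3
  return 0;
}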

@ -96,14 +96,11 @@ static void pow_tensor_scalar_kernel(
dtype == kBFloat16 || isComplexType(dtype)) {
// Dispatch to fast specialization for sqrt, rsqrt and reciprocal
if (exp_scalar.equal(.5)) {
sqrt_kernel(iter);
return;
return sqrt_kernel(iter);
} else if (exp_scalar.equal(-0.5)) {
rsqrt_kernel(iter);
return;
return rsqrt_kernel(iter);
} else if (exp_scalar.equal(-1.0)) {
reciprocal_kernel(iter);
return;
return reciprocal_kernel(iter);
}
}

View File

@ -256,10 +256,10 @@ static void norm_kernel_tensor_iterator_impl(
} else {
if (iter.input_dtype() == kHalf && iter.dtype(0) == kFloat) {
// type promotion that does cast and reduction in a single kernel
norm_kernel_cpu_impl<at::Half, float>(iter, val); return;
return norm_kernel_cpu_impl<at::Half, float>(iter, val);
} else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) {
// type promotion that does cast and reduction in a single kernel
norm_kernel_cpu_impl<at::BFloat16, float>(iter, val); return;
return norm_kernel_cpu_impl<at::BFloat16, float>(iter, val);
}
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kComplexHalf, iter.input_dtype(), "norm_cpu", [&] {

View File

@ -428,11 +428,10 @@ void fp16_gemv_trans(
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0);
#if !defined(__aarch64__) || defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
if (at::globalContext().allowFP16ReductionCPU()) {
fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
return;
return fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
}
#endif
fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
return fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
}
float bf16_dot_with_fp32_arith(const at::BFloat16* vec1, const at::BFloat16* vec2, int64_t len) {
@ -466,7 +465,7 @@ void bf16_gemv_trans(
at::BFloat16* y,
const int incy) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0 && beta == 0.0);
bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy);
return bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy);
}
float fp16_dot(

View File

@ -4,7 +4,6 @@
#include <c10/util/SmallVector.h>
#include <c10/core/Scalar.h>
#include <c10/core/ScalarType.h>
#include <c10/util/Exception.h>
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/core/NamedTensor.h>
@ -106,8 +105,7 @@ c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, b
}
}
using at::blas::ScalingType;
using at::blas::SwizzleType;
using at::cuda::blas::ScalingType;
/**
* @brief Prepares matrices for CUBLAS operation
@ -1114,7 +1112,7 @@ namespace{
* - Returns Error.
*/
using at::blas::ScalingType;
using at::cuda::blas::ScalingType;
bool is_tensorwise_scaling(const at::Tensor& t, const at::Tensor& scale) {
return isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && scale.numel() == 1;
@ -1681,890 +1679,9 @@ _scaled_mm_cuda(const Tensor& mat_a, const Tensor& mat_b,
bool use_fast_accum) {
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
return _scaled_mm_out_cuda(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out);
}
/**
* Track concrete implementations available
*/
enum class ScaledGemmImplementation {
NONE = 0,
TENSORWISE_TENSORWISE = 1,
ROWWISE_ROWWISE = 2,
BLOCK_128x128_1x128 = 3,
BLOCK_1x128_128x128 = 4,
BLOCK_1x128_1x128 = 5,
MXFP8_MXFP8 = 6,
NVFP4_NVFP4 = 7,
NVFP4_NVFP4_SINGLE_SCALE = 8,
};
/**
* Convert passed int (enum) from python back into a
* strictly-typed enum
*/
template <class EnumType, class ArrayType>
std::vector<EnumType> convert_int_to_enum(ArrayType& v) {
std::vector<EnumType> converted;
converted.reserve(v.size());
for (auto vi : v) {
converted.push_back(static_cast<EnumType>(vi));
}
return converted;
}
/**
* Both inputs must be fp8,
* Each needs a single scale, {Tensorwise (float)}
*/
bool check_tensorwise_recipe(c10::ScalarType type_a,
std::vector<ScalingType>& recipe_a,
ArrayRef<Tensor>& scales_a,
c10::ScalarType type_b,
std::vector<ScalingType>& recipe_b,
ArrayRef<Tensor>& scales_b) {
// both types must be fp8
if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) {
return false;
}
// 1 scale each, {Tensorwise, float}
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) {
return false;
}
// Need {TensorWise, fp32} for A & B
if (recipe_a[0] != ScalingType::TensorWise) return false;
if (scales_a[0].scalar_type() != ScalarType::Float) return false;
if (recipe_b[0] != ScalingType::TensorWise) return false;
if (scales_b[0].scalar_type() != ScalarType::Float) return false;
return true;
}
/**
* Both inputs must be fp8,
* Each needs scales, {Rowwise (float)}
*/
bool check_rowwise_recipe(c10::ScalarType type_a,
std::vector<ScalingType>& recipe_a,
ArrayRef<Tensor>& scales_a,
c10::ScalarType type_b,
std::vector<ScalingType>& recipe_b,
ArrayRef<Tensor>& scales_b) {
// both types must be fp8
if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) {
return false;
}
// 1 scale each, {RowWise, float}
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) {
return false;
}
// Need {RowWise, fp32} for A & B
if (recipe_a[0] != ScalingType::RowWise) return false;
if (scales_a[0].scalar_type() != ScalarType::Float) return false;
if (recipe_b[0] != ScalingType::RowWise) return false;
if (scales_b[0].scalar_type() != ScalarType::Float) return false;
return true;
}
/**
* Two-level scaling, canonical NVFP4
* Both inputs must be fp4
* A, B need 2 scales, {Blockwise_1x16 (e4m3), Tensorwise (fp32)}
*/
bool check_nvfp4_recipe(c10::ScalarType type_a,
std::vector<ScalingType>& recipe_a,
ArrayRef<Tensor>& scales_a,
c10::ScalarType type_b,
std::vector<ScalingType>& recipe_b,
ArrayRef<Tensor>& scales_b) {
// both types must be fp4
if (type_a != ScalarType::Float4_e2m1fn_x2 || type_b != ScalarType::Float4_e2m1fn_x2) {
return false;
}
// 2 scales, 2 recipes for each input
if (scales_a.size() != 2 || recipe_a.size() != 2 || scales_b.size() != 2 || recipe_b.size() != 2) {
return false;
}
// Need {Blockwise_1x16, e4m3 for scale[0], Tensorwise, fp32 for scale[1]}
if (recipe_a[0] != ScalingType::BlockWise1x16 || recipe_a[1] != ScalingType::TensorWise) return false;
if (scales_a[0].scalar_type() != ScalarType::Float8_e4m3fn || scales_a[1].scalar_type() != ScalarType::Float) return false;
if (recipe_b[0] != ScalingType::BlockWise1x16 || recipe_b[1] != ScalingType::TensorWise) return false;
if (scales_b[0].scalar_type() != ScalarType::Float8_e4m3fn || scales_b[1].scalar_type() != ScalarType::Float) return false;
return true;
}
/**
* Single-level scaling, what PyT currently understands
* Both inputs must be fp4
* A, B need 1 scale, {Blockwise_1x16 (e4m3)}
*/
bool check_nvfp4_recipe_single_scale
(c10::ScalarType type_a,
std::vector<ScalingType>& recipe_a,
ArrayRef<Tensor>& scales_a,
c10::ScalarType type_b,
std::vector<ScalingType>& recipe_b,
ArrayRef<Tensor>& scales_b) {
// both types must be fp4
if (type_a != ScalarType::Float4_e2m1fn_x2 || type_b != ScalarType::Float4_e2m1fn_x2) {
return false;
}
// 1 scale, 1 recipe for each input
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) {
return false;
}
// Need {Blockwise_1x16, e4m3} for the single scale on A & B
if (recipe_a[0] != ScalingType::BlockWise1x16) return false;
if (scales_a[0].scalar_type() != ScalarType::Float8_e4m3fn) return false;
if (recipe_b[0] != ScalingType::BlockWise1x16) return false;
if (scales_b[0].scalar_type() != ScalarType::Float8_e4m3fn) return false;
return true;
}
/**
* Both inputs must be fp8
* A, B must only have 1 scale each, matching the expected blockwise recipes (float scales),
* e.g. A: {Blockwise_1x128 (float)}, B: {Blockwise_128x128 (float)}
*/
bool check_deepseek_recipe(ScalingType expected_recipe_a,
ScalingType expected_recipe_b,
c10::ScalarType type_a,
std::vector<ScalingType>& recipe_a,
ArrayRef<Tensor>& scales_a,
c10::ScalarType type_b,
std::vector<ScalingType>& recipe_b,
ArrayRef<Tensor>& scales_b) {
// both types must be fp8
if (type_a != ScalarType::Float8_e4m3fn || type_b != ScalarType::Float8_e4m3fn) {
return false;
}
// 1 scale, 1 recipe for each input
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) {
return false;
}
// Need {expected_recipe_a, float} for A, {expected_recipe_b, float} for B
if (recipe_a[0] != expected_recipe_a) return false;
if (scales_a[0].scalar_type() != ScalarType::Float) return false;
if (recipe_b[0] != expected_recipe_b) return false;
if (scales_b[0].scalar_type() != ScalarType::Float) return false;
return true;
}
/**
* Both inputs must be fp8
* A, B must have 1 scale each, {Blockwise_1x32, e8m0}
*/
bool check_mxfp8_recipe(c10::ScalarType type_a,
std::vector<ScalingType>& recipe_a,
ArrayRef<Tensor>& scales_a,
c10::ScalarType type_b,
std::vector<ScalingType>& recipe_b,
ArrayRef<Tensor>& scales_b) {
// both types must be fp8
if (type_a != ScalarType::Float8_e4m3fn || type_b != ScalarType::Float8_e4m3fn) {
return false;
}
// 1 scale, 1 recipe for each input
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) {
return false;
}
// Need {Blockwise_1x32, e8m0} for A & B
if (recipe_a[0] != ScalingType::BlockWise1x32) return false;
if (scales_a[0].scalar_type() != ScalarType::Float8_e8m0fnu) return false;
if (recipe_b[0] != ScalingType::BlockWise1x32) return false;
if (scales_b[0].scalar_type() != ScalarType::Float8_e8m0fnu) return false;
return true;
}
using acceptance_fn = std::function<bool(c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&, c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&)>;
using namespace std::placeholders;
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 8> scale_kernel_dispatch = {{
{ "tensorwise_tensorwise", check_tensorwise_recipe, ScaledGemmImplementation::TENSORWISE_TENSORWISE },
{ "rowwise_rowwise", check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE},
{ "block_1x128_128x128", std::bind(check_deepseek_recipe, ScalingType::BlockWise1x128, ScalingType::BlockWise128x128, _1, _2, _3, _4, _5, _6),
ScaledGemmImplementation::BLOCK_1x128_128x128},
{ "block_128x128_1x128", std::bind(check_deepseek_recipe, ScalingType::BlockWise128x128, ScalingType::BlockWise1x128, _1, _2, _3, _4, _5, _6),
ScaledGemmImplementation::BLOCK_128x128_1x128},
{ "block_1x128_1x128", std::bind(check_deepseek_recipe, ScalingType::BlockWise1x128, ScalingType::BlockWise1x128, _1, _2, _3, _4, _5, _6),
ScaledGemmImplementation::BLOCK_1x128_1x128},
{ "nvfp4_nvfp4", check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4},
{ "nvfp4_nvfp4_single_scale", check_nvfp4_recipe_single_scale, ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE },
{ "mxfp8_mxfp8", check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8}}};
Tensor&
_cutlass_scaled_gemm(
const Tensor& mat1, const Tensor& mat2,
const Tensor& scale_a, const Tensor& scale_b,
const ScalingType scaling_choice_a, const ScalingType scaling_choice_b,
const std::optional<Tensor>& bias,
const bool use_fast_accum,
Tensor& out) {
cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, std::nullopt, scaling_choice_a, scaling_choice_b);
const auto out_dtype_ = args.result->scalar_type();
TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
#ifdef USE_ROCM
auto tuning_ctx = at::cuda::tunable::getTuningContext();
if (tuning_ctx->IsTunableOpEnabled()) {
#define TUNABLE_DISPATCH(BLASOP_A, BLASOP_B) \
if (mat1.scalar_type() == ScalarType::Float8_e4m3fnuz) { \
if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \
static at::cuda::tunable::ScaledGemmTunableOp< \
at::Float8_e4m3fnuz, at::Float8_e4m3fnuz, scalar_t, \
BLASOP_A, BLASOP_B> scaledgemm{}; \
scaledgemm(&params); \
} \
else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \
static at::cuda::tunable::ScaledGemmTunableOp< \
at::Float8_e4m3fnuz, at::Float8_e5m2fnuz, scalar_t, \
BLASOP_A, BLASOP_B> scaledgemm{}; \
scaledgemm(&params); \
} \
} \
else if (mat1.scalar_type() == ScalarType::Float8_e5m2fnuz) { \
if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \
static at::cuda::tunable::ScaledGemmTunableOp< \
at::Float8_e5m2fnuz, at::Float8_e4m3fnuz, scalar_t, \
BLASOP_A, BLASOP_B> scaledgemm{}; \
scaledgemm(&params); \
} \
else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \
static at::cuda::tunable::ScaledGemmTunableOp< \
at::Float8_e5m2fnuz, at::Float8_e5m2fnuz, scalar_t, \
BLASOP_A, BLASOP_B> scaledgemm{}; \
scaledgemm(&params); \
} \
} \
else if (mat1.scalar_type() == ScalarType::Float8_e4m3fn) { \
if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \
static at::cuda::tunable::ScaledGemmTunableOp< \
at::Float8_e4m3fn, at::Float8_e4m3fn, scalar_t, \
BLASOP_A, BLASOP_B> scaledgemm{}; \
scaledgemm(&params); \
} \
else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \
static at::cuda::tunable::ScaledGemmTunableOp< \
at::Float8_e4m3fn, at::Float8_e5m2, scalar_t, \
BLASOP_A, BLASOP_B> scaledgemm{}; \
scaledgemm(&params); \
} \
} \
else if (mat1.scalar_type() == ScalarType::Float8_e5m2) { \
if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \
static at::cuda::tunable::ScaledGemmTunableOp< \
at::Float8_e5m2, at::Float8_e4m3fn, scalar_t, \
BLASOP_A, BLASOP_B> scaledgemm{}; \
scaledgemm(&params); \
} \
else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \
static at::cuda::tunable::ScaledGemmTunableOp< \
at::Float8_e5m2, at::Float8_e5m2, scalar_t, \
BLASOP_A, BLASOP_B> scaledgemm{}; \
scaledgemm(&params); \
} \
}
AT_DISPATCH_V2(out_dtype_, "_tunable_scaled_gemm", AT_WRAP([&] {
bool transa_ = ((args.transa != 'n') && (args.transa != 'N'));
bool transb_ = ((args.transb != 'n') && (args.transb != 'N'));
at::cuda::tunable::ScaledGemmParams<scalar_t> params;
params.transa = args.transa;
params.transb = args.transb;
params.m = args.m;
params.n = args.n;
params.k = args.k;
params.a = args.mata->data_ptr();
params.a_scale_ptr = args.scale_mata_ptr;
params.a_scale_dtype = args.scale_mata_dtype.value();
params.lda = args.lda;
params.a_dtype = args.mata->scalar_type();
params.a_scale_dtype = args.scale_mata_dtype.value();
params.a_scaling_type = args.scaling_mata_type.value();
params.b = args.matb->data_ptr();
params.b_scale_ptr = args.scale_matb_ptr;
params.b_scale_dtype = args.scale_matb_dtype.value();
params.ldb = args.ldb;
params.b_dtype = args.matb->scalar_type();
params.b_scale_dtype = args.scale_matb_dtype.value();
params.b_scaling_type = args.scaling_matb_type.value();
params.bias_ptr = bias ? bias->data_ptr(): nullptr;
params.bias_dtype = bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_;
params.c = args.result->data_ptr();
params.c_scale_ptr = args.scale_result_ptr;
params.ldc = args.result_ld;
params.c_dtype = out_dtype_;
params.use_fast_accum = use_fast_accum;
if (transa_ && transb_) {
TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::T)
}
else if (transa_ && !transb_) {
TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::N)
}
else if (!transa_ && transb_) {
TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::T)
}
else if (!transa_ && !transb_) {
TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::N)
}
else {
TORCH_CHECK(false, "unreachable");
}
}),
kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_FLOATING_TYPES));
#undef TUNABLE_DISPATCH
}
else
#endif
{
at::cuda::blas::scaled_gemm(
args.transa,
args.transb,
args.m,
args.n,
args.k,
args.mata->data_ptr(),
args.scale_mata_ptr,
args.lda,
args.mata->scalar_type(),
args.scale_mata_dtype.value(),
args.scaling_mata_type.value(),
args.matb->data_ptr(),
args.scale_matb_ptr,
args.ldb,
args.matb->scalar_type(),
args.scale_matb_dtype.value(),
args.scaling_matb_type.value(),
bias ? bias->data_ptr(): nullptr,
bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_,
args.result->data_ptr(),
args.scale_result_ptr,
args.result_ld,
out_dtype_,
use_fast_accum);
}
return out;
}
Tensor&
_scaled_tensorwise_tensorwise(
const Tensor& mat_a, const Tensor& mat_b,
const Tensor& scale_a, const Tensor& scale_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
bool use_fast_accum,
Tensor& out) {
// Restrictions:
// A, B are FP8, scales are fp32
//
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
TORCH_CHECK_VALUE(scale_a.numel() == 1 && scale_a.scalar_type() == kFloat, "scale_a must have 1 Float element")
TORCH_CHECK_VALUE(scale_b.numel() == 1 && scale_b.scalar_type() == kFloat, "scale_b must have 1 Float element")
auto scaling_choice_a = ScalingType::TensorWise;
auto scaling_choice_b = ScalingType::TensorWise;
_cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
return out;
}
Tensor&
_scaled_rowwise_rowwise(
const Tensor& mat_a, const Tensor& mat_b,
const Tensor& scale_a, const Tensor& scale_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
bool use_fast_accum,
Tensor& out) {
// Restrictions:
// A, B are FP8, scales are fp32, shape M/N for A/B
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
TORCH_CHECK_VALUE(scale_a.size(0) == mat_a.size(0) && scale_a.size(1) == 1, "scale_a must have shape [", mat_a.size(0), ", 1], got [", scale_a.sizes(), "]");
TORCH_CHECK_VALUE(scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat, "scale_a must have ", mat_a.size(0), " Float elements, got ", scale_a.numel())
TORCH_CHECK_VALUE(scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat, "scale_b must have ", mat_b.size(1), " Float elements, got ", scale_b.numel())
TORCH_CHECK_VALUE(scale_a.stride(1) == 1, "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1));
TORCH_CHECK_VALUE(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1));
auto scaling_choice_a = ScalingType::RowWise;
auto scaling_choice_b = ScalingType::RowWise;
//
// NVIDIA's cuBLAS only started supporting row-wise scaling in version 12.9,
// and only for compute capability 9.0+. In other cases we use CUTLASS.
#ifndef USE_ROCM
// We are doing row-wise scaling
auto dprops = at::cuda::getCurrentDeviceProperties();
if (((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
// cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
|| (dprops->major == 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
at::cuda::detail::f8f8bf16_rowwise(
mat_a,
mat_b,
scale_a,
scale_b,
bias,
use_fast_accum,
out);
return out;
}
#else
// For ROCm, match behavior of f8f8bf16_rowwise type checking, for unit test purposes.
//Tensor b = mat_b;
if (_scaled_mm_is_fnuz()) {
TORCH_CHECK_VALUE(mat_b.dtype() == at::kFloat8_e4m3fnuz, "expected mat_b.dtype() to be at::kFloat8_e4m3fnuz, but got ", mat_b.dtype());
}
else {
TORCH_CHECK_VALUE(mat_b.dtype() == at::kFloat8_e4m3fn, "expected mat_b.dtype() to be at::kFloat8_e4m3fn, but got ", mat_b.dtype());
}
// Until more than bf16 is supported.
TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16,
"hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type());
#endif
_cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
return out;
}
Tensor&
_scaled_block1x128_block1x128(
const Tensor& mat_a, const Tensor& mat_b,
const Tensor& scale_a, const Tensor& scale_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
const bool use_fast_accum,
Tensor& out) {
// Restrictions:
// A, B are FP8, scales are fp32, shape K//128
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,
"scale_a must have shape ", mat_a.sizes()[0], " x ", mat_a.sizes()[1] / 128, " Float elements, got ", scale_a.sizes())
TORCH_CHECK_VALUE(scale_b.sizes()[0] == ceil_div<int64_t>(mat_b.sizes()[0], 128) && scale_b.sizes()[1] == mat_b.sizes()[1] && scale_b.scalar_type() == kFloat,
"scale_b must have shape ", ceil_div<int64_t>(mat_b.sizes()[0], 128), " x ", mat_b.sizes()[1], " Float elements, got ", scale_b.sizes())
TORCH_CHECK(scale_a.stride(0) == 1, "expected scale_a.stride(0) to be 1, but got ", scale_a.stride(0));
TORCH_CHECK(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1));
TORCH_CHECK(scale_b.stride(0) == scale_b.size(1),
"expected scale_b.stride(0) to be ", scale_b.size(1), ", but got ", scale_b.size(1));
auto scaling_choice_a = ScalingType::BlockWise1x128;
auto scaling_choice_b = ScalingType::BlockWise1x128;
_cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
return out;
}
Tensor&
_scaled_block128x128_block1x128(
const Tensor& mat_a, const Tensor& mat_b,
const Tensor& scale_a, const Tensor& scale_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
const bool use_fast_accum,
Tensor& out) {
// Restrictions:
// A, B are FP8, scales are fp32, shape K//128
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
TORCH_CHECK_VALUE(scale_a.sizes()[0] == ceil_div<int64_t>(mat_a.sizes()[0], 128) && scale_a.sizes()[1] == ceil_div<int64_t>(mat_a.sizes()[1], 128) && scale_a.scalar_type() == kFloat,
"scale_a must have shape ", ceil_div<int64_t>(mat_a.sizes()[0], 128), " x ", ceil_div<int64_t>(mat_a.sizes()[1], 128), " Float elements, got ", scale_a.sizes())
TORCH_CHECK_VALUE(scale_b.sizes()[0] == ceil_div<int64_t>(mat_b.sizes()[0], 128) && scale_b.sizes()[1] == mat_b.sizes()[1] && scale_b.scalar_type() == kFloat,
"scale_b must have shape ", ceil_div<int64_t>(mat_b.sizes()[0], 128), " x ", mat_b.sizes()[1], " Float elements, got ", scale_b.sizes())
TORCH_CHECK_VALUE(scale_a.stride(1) == 1, "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1));
TORCH_CHECK_VALUE(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1));
TORCH_CHECK_VALUE(scale_b.stride(0) == scale_b.size(1),
"expected scale_b.stride(0) to be ", scale_b.size(1), ", but got ", scale_b.stride(0));
auto scaling_choice_a = ScalingType::BlockWise128x128;
auto scaling_choice_b = ScalingType::BlockWise1x128;
_cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
return out;
}
Tensor&
_scaled_block1x128_block128x128(
const Tensor& mat_a, const Tensor& mat_b,
const Tensor& scale_a, const Tensor& scale_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
const bool use_fast_accum,
Tensor& out) {
// Restrictions:
// A, B are FP8, scales are fp32, A: shape K//128, B: K//128, N//128
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,
"scale_a must have shape ", mat_a.sizes()[0], " x ", mat_a.sizes()[1] / 128, " Float elements, got ", scale_a.sizes())
TORCH_CHECK_VALUE(scale_b.sizes()[0] == mat_b.sizes()[0] / 128 && scale_b.sizes()[1] == mat_b.sizes()[1] / 128 && scale_b.scalar_type() == kFloat,
"scale_b must have shape ", mat_b.sizes()[0] / 128, " x ", mat_b.sizes()[1] / 128, " Float elements, got ", scale_b.sizes())
TORCH_CHECK_VALUE(scale_a.stride(0) == 1, "expected scale_a.stride(0) to be 1, but got ", scale_a.stride(0));
TORCH_CHECK_VALUE(scale_b.stride(0) == 1, "expected scale_b.stride(0) to be 1, but got ", scale_b.stride(0));
TORCH_CHECK_VALUE(scale_b.stride(1) == scale_b.size(0),
"expected scale_b.stride(1) to be ", scale_b.size(0), ", but got ", scale_b.stride(1));
auto scaling_choice_a = ScalingType::BlockWise1x128;
auto scaling_choice_b = ScalingType::BlockWise128x128;
_cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
return out;
}
Tensor&
_scaled_mxfp8_mxfp8(
const Tensor& mat_a, const Tensor& mat_b,
const Tensor& scale_a, const SwizzleType swizzle_a,
const Tensor& scale_b, const SwizzleType swizzle_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
Tensor& out) {
// Restrictions:
// A, B are FP8, scales are e8m0, A: shape K//32, B: K, N//32
// Scales must be swizzled
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
auto scale_a_elems = round_up<int64_t>(mat_a.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_a.size(1), 32), 4);
auto scale_b_elems = round_up<int64_t>(mat_b.size(1), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_b.size(0), 32), 4);
TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(),
"For Blockwise scaling scale_a should have ", scale_a_elems, " elements, got: ", scale_a.numel());
TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(),
"For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel());
TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4, "scale_a must be swizzled to SWIZZLE_32_4_4 format");
TORCH_CHECK_VALUE(swizzle_b == SwizzleType::SWIZZLE_32_4_4, "scale_b must be swizzled to SWIZZLE_32_4_4 format");
TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(),
"For Blockwise scaling both scales should be contiguous");
TORCH_CHECK_VALUE(out.scalar_type() == out_dtype, "expected out.scalar_type() to be ", out_dtype, ", but got ", out.scalar_type());
auto scaling_choice_a = ScalingType::BlockWise1x32;
auto scaling_choice_b = ScalingType::BlockWise1x32;
#ifdef USE_ROCM
#if ROCM_VERSION >= 70000
TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
TORCH_CHECK_VALUE(mat_a.size(0) % 32 == 0 && mat_a.size(1) % 32 == 0 &&
mat_b.size(0) % 32 == 0 && mat_b.size(1) % 32 == 0,
"Matrix dimensions must be multiples of 32 for block-wise scaling");
TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16 ||
out.scalar_type() == ScalarType::Half,
"Block-wise scaling only supports BFloat16 or Half output types");
#else
TORCH_CHECK_NOT_IMPLEMENTED(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later");
#endif
#endif
return _cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
}
Tensor&
_scaled_nvfp4_nvfp4(
const Tensor& mat_a, const Tensor& mat_b,
const Tensor& scale_a, const SwizzleType swizzle_a,
const Tensor& scale_b, const SwizzleType swizzle_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
const bool single_scale,
Tensor& out) {
#ifdef USE_ROCM
TORCH_CHECK_NOT_IMPLEMENTED(false, "NVFP4 scaling not supported on ROCM");
#endif
TORCH_CHECK_VALUE(single_scale, "Only single-scaled NVFP4 currently supported");
// Restrictions:
// A, B are FP4, scales are e4m3, A: shape K//16, B: K, N//16
// Scales must be swizzled
TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
// Note: fp4x2 format, need to double the K dimension for checking purposes.
auto scale_a_elems = round_up<int64_t>(mat_a.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_a.size(1) * 2, 16), 4);
auto scale_b_elems = round_up<int64_t>(mat_b.size(1), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_b.size(0) * 2, 16), 4);
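// Worked example (illustrative only, not asserted by this change): for mat_a of shape
// (256, 64) in fp4x2 the logical K is 64 * 2 = 128, so the expected scale_a element count
// is round_up(256, 128) * round_up(ceil_div(128, 16), 4) = 256 * 8 = 2048.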
TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(),
"For Blockwise scaling scale_a should have ", scale_a_elems, " elements, got: ", scale_a.numel());
TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(),
"For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel());
TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4, "scale_a must be swizzled to SWIZZLE_32_4_4 format");
TORCH_CHECK_VALUE(swizzle_b == SwizzleType::SWIZZLE_32_4_4, "scale_b must be swizzled to SWIZZLE_32_4_4 format");
TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(),
"For Blockwise scaling both scales should be contiguous");
auto scaling_choice_a = ScalingType::BlockWise1x16;
auto scaling_choice_b = ScalingType::BlockWise1x16;
return _cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
}
// V2: Computes matrix multiply + bias while applying scaling to input and output matrices
// Scales are only applicable when the matrices are of Float8 type, and are assumed to be 1.0 by default.
// If the output matrix type is a 16- or 32-bit type, scale_result is not applied.
// Known limitations:
// - Only works if mat1 is row-major and mat2 is column-major
// - Only works if matrices sizes are divisible by 32
// - If 1-dimensional tensors are used then scale_a should have size mat1.size(0)
// and scale_b should have size mat2.size(1)
// Arguments:
// - `mat1`: the first operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2`
// - `mat2`: the second operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2`
// - `scale_a`: a tensor with the inverse scale of `mat1`, whose shape/strides/dtype depend on the scaling scheme
// - `scale_recipe_a`: An integer corresponding to an enum describing the scaling scheme used for `scale_a`
// - `swizzle_a`: An integer corresponding to a `SwizzleType` enum describing the swizzling scheme for `scale_a`
// - `scale_b`: a tensor with the inverse scale of `mat2`, whose shape/strides/dtype depend on the scaling scheme
// - `scale_recipe_b`: An integer corresponding to an enum describing the scaling scheme used for `scale_b`
// - `swizzle_b`: An integer corresponding to a `SwizzleType` enum describing the swizzling scheme for `scale_b`
// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16`
// - `out_dtype`: the output dtype, can either be a float8 or a higher precision floating point type
// - `use_fast_accum`: if true, enables fast float8 accumulation. Backends may ignore this option if not applicable.
// - `out`: a reference to the output tensor
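// Illustrative call sketch (not part of this change; the shapes, the `make_fp8` helper and
// the integer recipe/swizzle encodings are assumptions, with 0 taken to mean tensor-wise
// scaling and no swizzle):
//   Tensor a = make_fp8({M, K});                                  // row-major e4m3 operand
//   Tensor b = make_fp8({K, N}).t();                              // column-major e4m3 operand
//   Tensor sa = at::ones({}, a.options().dtype(at::kFloat));      // tensor-wise inverse scale
//   Tensor sb = at::ones({}, b.options().dtype(at::kFloat));
//   Tensor out = at::empty({M, N}, a.options().dtype(at::kBFloat16));
//   _scaled_mm_cuda_v2_out(a, b,
//                          {sa}, /*scale_recipe_a=*/{0}, /*swizzle_a=*/{0},
//                          {sb}, /*scale_recipe_b=*/{0}, /*swizzle_b=*/{0},
//                          /*bias=*/std::nullopt, at::kBFloat16,
//                          /*contraction_dim=*/{}, /*use_fast_accum=*/false, out);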
Tensor&
_scaled_mm_cuda_v2_out(
const Tensor& mat_a, const Tensor& mat_b,
ArrayRef<Tensor> scale_a,
IntArrayRef scale_recipe_a,
IntArrayRef swizzle_a,
ArrayRef<Tensor> scale_b,
IntArrayRef scale_recipe_b,
IntArrayRef swizzle_b,
const std::optional<Tensor>& bias,
const std::optional<c10::ScalarType> out_dtype,
IntArrayRef contraction_dim,
bool use_fast_accum,
Tensor& out) {
// Check sizes
bool allowed_device = _scaled_mm_allowed_device();
TORCH_CHECK_NOT_IMPLEMENTED(allowed_device,
"torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+");
TORCH_CHECK_VALUE(mat_a.dim() == 2, "mat_a must be a matrix");
TORCH_CHECK_VALUE(mat_b.dim() == 2, "mat_b must be a matrix");
// If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm kernels
// do not support this case).
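// For example (illustrative only): (0, 16) @ (16, 32) just resizes out to (0, 32), while
// (8, 0) @ (0, 32) additionally zero-fills out, since K == 0 means no accumulation happens.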
if (mat_a.size(0) == 0 || mat_a.size(1) == 0 || mat_b.size(1) == 0) {
// `out` was created with `at::empty`. In the case where we are multiplying
// MxK by KxN and K is the zero dim, we need to initialize here to properly
// return a tensor of zeros.
at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
if (mat_a.size(1) == 0) {
out.zero_();
}
return out;
}
// Check if the input matrix sizes can be multiplied
// - if optional contraction dims are provided, use those
// -- mostly for sub-byte formats (e.g. nvfp4x2) where a cheap .t() is not available
//    (see the worked example after the check below).
if (contraction_dim.size() > 0) {
TORCH_CHECK_VALUE(contraction_dim.size() == 2, "contraction_dim must have exactly 2 elements");
auto mat_a_dim = contraction_dim[0];
auto mat_b_dim = contraction_dim[1];
TORCH_CHECK_VALUE(
mat_a.size(mat_a_dim) == mat_b.size(mat_b_dim), "mat_a and mat_b shapes cannot be multiplied (",
mat_a.size(0), "x", mat_a.size(1), " and ", mat_b.size(0), "x", mat_b.size(1), ") ",
"with contraction dims mat_a: ", mat_a_dim, ", mat_b: ", mat_b_dim);
} else {
TORCH_CHECK_VALUE(
mat_a.size(1) == mat_b.size(0), "mat_a and mat_b shapes cannot be multiplied (",
mat_a.size(0), "x", mat_a.size(1), " and ", mat_b.size(0), "x", mat_b.size(1), ")");
}
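// Worked example (illustrative only): with mat_a of shape (M, K) and mat_b supplied as
// (N, K) because a cheap transpose is unavailable, contraction_dim = {1, 1} declares that
// mat_a dim 1 and mat_b dim 1 are the contracted K dimension.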
TORCH_CHECK_VALUE(!bias || bias->numel() == mat_b.sizes()[1], "Bias must be size ", mat_b.sizes()[1],
" but got ", bias->numel());
TORCH_CHECK_VALUE(
mat_a.sizes()[1] % 16 == 0,
"Expected trailing dimension of mat1 to be divisible by 16 ",
"but got mat1 shape: (",
mat_a.sizes()[0],
"x",
mat_a.sizes()[1],
").");
TORCH_CHECK_VALUE(mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x",
mat_b.sizes()[1], ") must be divisible by 16");
// TODO(slayton): Existing checks, not sure if they should really be here.
TORCH_CHECK_VALUE(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type");
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) || mat_a.scalar_type() == ScalarType::Float4_e2m1fn_x2,
"Expected mat_a to be Float8 or Float4_x2 matrix got ", mat_a.scalar_type());
TORCH_CHECK_VALUE(isFloat8Type(mat_b.scalar_type()) || mat_b.scalar_type() == ScalarType::Float4_e2m1fn_x2,
"Expected mat_b to be Float8 or Float4_x2 matrix got ", mat_b.scalar_type());
#ifndef USE_ROCM
// Type restrictions imposed by CuBLASLt as of CUDA-12.1
TORCH_CHECK_VALUE(mat_a.scalar_type() != ScalarType::Float8_e5m2 || mat_b.scalar_type() != ScalarType::Float8_e5m2,
"Multiplication of two Float8_e5m2 matrices is not supported");
#endif
if (use_fast_accum) {
TORCH_CHECK_VALUE(mat_a.scalar_type() != ScalarType::Float4_e2m1fn_x2 && mat_b.scalar_type() != ScalarType::Float4_e2m1fn_x2, "`use_fast_accum` is not supported when `mat_a` or `mat_b` tensors have the `Float4_e2m1fn_x2` dtype.");
}
#ifdef USE_ROCM
if (mat_a.scalar_type() == ScalarType::Float4_e2m1fn_x2 || mat_b.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
TORCH_CHECK_NOT_IMPLEMENTED(ROCM_VERSION >= 70000,
"Float4_e2m1fn_x2 is only supported for ROCm 7.0 and above");
}
if (mat_a.scalar_type() == ScalarType::Float8_e5m2 || mat_b.scalar_type() == ScalarType::Float8_e5m2) {
TORCH_CHECK_NOT_IMPLEMENTED(ROCM_VERSION >= 60500,
"Float8_e5m2 is only supported for ROCm 6.5 and above");
}
if (mat_a.scalar_type() == ScalarType::Float8_e4m3fn || mat_b.scalar_type() == ScalarType::Float8_e4m3fn) {
TORCH_CHECK_NOT_IMPLEMENTED(ROCM_VERSION >= 60500,
"Float8_e4m3fn is only supported for ROCm 6.5 and above");
}
#endif
if (bias) {
TORCH_CHECK_VALUE(out.scalar_type() != kFloat,
"Bias is not supported when out_dtype is set to Float32");
TORCH_CHECK_VALUE(bias->scalar_type() == ScalarType::BFloat16 ||
bias->scalar_type() == ScalarType::Half,
"Bias must be BFloat16 or Half, but got ", bias->scalar_type());
TORCH_CHECK_VALUE((out.scalar_type() != kFloat &&
out.scalar_type() != ScalarType::BFloat16) ||
bias->scalar_type() == ScalarType::BFloat16,
"Bias must be BFloat16 to compute ", out.scalar_type(),
" output, but got ", bias->scalar_type());
TORCH_CHECK_VALUE(out.scalar_type() != ScalarType::Half ||
bias->scalar_type() == ScalarType::Half,
"Bias must be Float16 to compute ", out.scalar_type(),
" output, but got ", bias->scalar_type());
}
{
auto bias_ = bias.value_or(Tensor());
// NOLINTNEXTLINE(*c-array*)
TensorArg targs[]{{out, "out", 0}, {mat_a, "mat_a", 1}, {mat_b, "mat_b", 2},
{bias_, "bias", 3}, {scale_a[0], "scale_a", 4}, {scale_b[0], "scale_b", 5}};
checkAllSameGPU(__func__, targs);
}
auto out_dtype_ = out_dtype.value_or(at::ScalarType::BFloat16);
// Conversion of implicitly-defined enums to explicit
auto scale_recipe_a_enum = convert_int_to_enum<ScalingType>(scale_recipe_a);
auto swizzle_a_enum = convert_int_to_enum<SwizzleType>(swizzle_a);
auto scale_recipe_b_enum = convert_int_to_enum<ScalingType>(scale_recipe_b);
auto swizzle_b_enum = convert_int_to_enum<SwizzleType>(swizzle_b);
// At this point we can work out which implementation to dispatch to, trying to do
// as few steps as possible.
// NOTE: support is deliberately sparse; all allowed combinations are enumerated explicitly
// via a list of (name, acceptance_check, concrete_impl) tuples.
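// A table entry is conceptually {name, accept_fn, ScaledGemmImplementation}; a hedged
// illustration (the real table and its acceptance helpers are defined elsewhere in this
// file, and the helper name below is assumed):
//   {"tensorwise_tensorwise", check_tensorwise_recipe, ScaledGemmImplementation::TENSORWISE_TENSORWISE}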
bool found_impl = false;
ScaledGemmImplementation gemm_impl = ScaledGemmImplementation::NONE;
for (const auto& fn_entry : scale_kernel_dispatch) {
const auto [name, accept_fn, scaled_gemm_impl] = fn_entry;
bool ok = accept_fn(mat_a.scalar_type(),
scale_recipe_a_enum,
scale_a,
mat_b.scalar_type(),
scale_recipe_b_enum,
scale_b);
if (ok) {
gemm_impl = scaled_gemm_impl;
found_impl = true;
break;
}
}
TORCH_CHECK_VALUE(
found_impl,
"Invalid scaling configuration.\n"
"- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n"
"- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (", mat_a.size(0), ", 1) and scale_b should be (1, ", mat_b.size(1), "), and both should be contiguous.\n"
"- For BlockWise 1x128 scaling, a and b should be float8, scales should be float, scale_a should be (", mat_a.size(0), ", ", ceil_div<int64_t>(mat_a.size(1), 128), ") and scale_b should be (", ceil_div<int64_t>(mat_b.size(0), 128), ", ", mat_b.size(1), "), and both should be outer-dim-major.\n"
"- For BlockWise 128x128 scaling, a and b should be float8, scales should be float, scale_a should be (", ceil_div<int64_t>(mat_a.size(0), 128), ", ", ceil_div<int64_t>(mat_a.size(1), 128), ") and scale_b should be (", ceil_div<int64_t>(mat_b.size(0), 128), ", ", ceil_div<int64_t>(mat_b.size(1), 128), "), and both should be near-inner-dim-major (with 16-byte aligned strides).\n"
"- For Blockwise 1x32 scaling, a and b should be float8, scales should be float8_e8m0fnu, scale_a should have ", round_up<int64_t>(mat_a.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_a.size(1), 32), 4), " elements and scale_b should have ", round_up<int64_t>(mat_b.size(1), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_b.size(0), 32), 4), " elements, and both should be contiguous.\n"
"- For Blockwise 1x16 scaling, a and b should be float4 (packed 2x), scales should be float8_e4m3fn, scale_a should have ", round_up<int64_t>(mat_a.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_a.size(1) * 2, 16), 4), " elements and scale_b should have ", round_up<int64_t>(mat_b.size(1), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_b.size(0) * 2, 16), 4), " elements, and both should be contiguous.\n"
"Got mat_a.dtype()=", mat_a.scalar_type(), ", scale_a[0].dtype()=", scale_a[0].scalar_type(), ", scale_a[0].size()=", scale_a[0].sizes(), ", scale_a[0].stride()=", scale_a[0].strides(), ", ",
"mat_b.dtype()=", mat_b.scalar_type(), ", scale_b[0].dtype()=", scale_b[0].scalar_type(), ", scale_b[0].size()=", scale_b[0].sizes(), " and scale_b[0].stride()=", scale_b[0].strides()
);
at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
auto bias_ = bias.value_or(Tensor());
// dispatch to appropriate lower-level calls for error checking & execution
if (gemm_impl == ScaledGemmImplementation::TENSORWISE_TENSORWISE) {
return _scaled_tensorwise_tensorwise(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out);
} else if (gemm_impl == ScaledGemmImplementation::ROWWISE_ROWWISE) {
return _scaled_rowwise_rowwise(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out);
} else if (gemm_impl == ScaledGemmImplementation::BLOCK_128x128_1x128) {
return _scaled_block128x128_block1x128(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out);
} else if (gemm_impl == ScaledGemmImplementation::BLOCK_1x128_128x128) {
return _scaled_block1x128_block128x128(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out);
} else if (gemm_impl == ScaledGemmImplementation::BLOCK_1x128_1x128) {
return _scaled_block1x128_block1x128(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out);
} else if (gemm_impl == ScaledGemmImplementation::MXFP8_MXFP8) {
return _scaled_mxfp8_mxfp8(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
} else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4) {
TORCH_CHECK_NOT_IMPLEMENTED(false, "Only single-scale NVFP4 currently supported");
} else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE) {
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, true /* single_scale */, out);
} else {
TORCH_CHECK_VALUE(false, "Invalid state - found an implementation, but not really");
}
}
Tensor
_scaled_mm_cuda_v2(
const Tensor& mat_a, const Tensor& mat_b,
ArrayRef<Tensor> scale_a,
IntArrayRef scale_recipe_a,
IntArrayRef swizzle_a,
ArrayRef<Tensor> scale_b,
IntArrayRef scale_recipe_b,
IntArrayRef swizzle_b,
const std::optional<Tensor>& bias,
const std::optional<c10::ScalarType> out_dtype,
IntArrayRef contraction_dim,
bool use_fast_accum) {
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
return _scaled_mm_cuda_v2_out(
mat_a, mat_b,
scale_a, scale_recipe_a, swizzle_a,
scale_b, scale_recipe_b, swizzle_b,
bias,
out_dtype,
contraction_dim,
use_fast_accum,
out);
}
Tensor
_scaled_grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b,


@@ -59,7 +59,7 @@ constexpr uint64_t getDefaultMaxThreadsPerBlock() {
#ifdef USE_ROCM
#define SKIP_SORTED_INDICES 32
template <typename scalar_t, int SZ>
__global__ void indexing_backward_kernel_many_indices(
__global__ void indexing_backward_kernel(
const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
int64_t numel, int64_t stride, int64_t stride_before, int64_t outer_dim, bool accumulate) {
using opmath_t = at::opmath_type<scalar_t>;
@@ -254,8 +254,7 @@ __global__ void indexing_backward_kernel_stride_1(
}
}
}
#endif
#else
template <typename scalar_t, int SZ>
__global__ void indexing_backward_kernel(
const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
@@ -334,7 +333,6 @@ __global__ void indexing_backward_kernel(
}
}
#ifndef USE_ROCM
template <typename scalar_t>
__global__ void indexing_backward_kernel_stride_1(
const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
@@ -782,43 +780,11 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
kBool,
kBFloat16);
} else {
#ifdef USE_ROCM
if (num_indices >= 200000)
AT_DISPATCH_V2(
expandedValue.scalar_type(),
"indexing_backward_many_indices",
AT_WRAP([&] {
indexing_backward_kernel_many_indices<scalar_t, UNROLL><<<new_grid, block, smem_dups_size, stream>>>(
sorted_indices.const_data_ptr<int64_t>(),
orig_indices.const_data_ptr<int64_t>(),
expandedValue.const_data_ptr<scalar_t>(),
src_.mutable_data_ptr<scalar_t>(),
num_indices,
sliceSize,
strideBefore,
nElemBefore,
accumulate);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}),
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
// AT_EXPAND(AT_FLOAT8_TYPES),
// TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
// should not be supported here, then reenable AT_FLOAT8_DTYPES
kFloat8_e4m3fn,
kFloat8_e5m2,
kFloat8_e4m3fnuz,
kFloat8_e5m2fnuz,
kComplexHalf,
kHalf,
kBool,
kBFloat16);
else
#endif
AT_DISPATCH_V2(
expandedValue.scalar_type(),
"indexing_backward",
AT_WRAP([&] {
indexing_backward_kernel<scalar_t, UNROLL><<<grid, block, 0, stream>>>(
indexing_backward_kernel<scalar_t, UNROLL><<<KERNEL_GRID, block, KERNEL_SMEM, stream>>>(
sorted_indices.const_data_ptr<int64_t>(),
orig_indices.const_data_ptr<int64_t>(),
expandedValue.const_data_ptr<scalar_t>(),

Some files were not shown because too many files have changed in this diff.