adding documentation

2025-11-18 17:45:09 +08:00 · 2025-11-07 16:31:53 -08:00
822 changed files with 9261 additions and 27479 deletions
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -260,8 +260,8 @@ case "$tag" in
    HALIDE=yes
    TRITON=yes
    ;;
-  pytorch-linux-jammy-cuda12.8-py3.12-pallas)
-    CUDA_VERSION=12.8.1
+  pytorch-linux-jammy-cuda13.0-py3.12-pallas)
+    CUDA_VERSION=13.0.0
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=11
    PALLAS=yes
--- a/.ci/lumen_cli/cli/lib/common/cli_helper.py
+++ b/.ci/lumen_cli/cli/lib/common/cli_helper.py
@ -8,11 +8,9 @@ from abc import ABC, abstractmethod


 try:
-    from collections.abc import Callable  # Python 3.11+
-    from typing import Any, Required, TypedDict
+    from typing import Any, Callable, Required, TypedDict  # Python 3.11+
 except ImportError:
-    from collections.abc import Callable
-    from typing import Any, TypedDict
+    from typing import Any, Callable, TypedDict

    from typing_extensions import Required  # Fallback for Python <3.11

--- a/.ci/magma-rocm/README.md
+++ b/.ci/magma-rocm/README.md
@ -30,6 +30,7 @@ into a tarball, with the following structure:
 More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the ROCm version.
 Outputted binaries should be in the `output` folder.

+
 ## Pushing

 Packages can be uploaded to an S3 bucket using:
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -168,16 +168,14 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/compiler/latest/env/vars.sh
  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/umf/latest/env/vars.sh
-  # shellcheck disable=SC1091
  source /opt/intel/oneapi/ccl/latest/env/vars.sh
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/mpi/latest/env/vars.sh
-  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/pti/latest/env/vars.sh
  # Enable XCCL build
  export USE_XCCL=1
  export USE_MPI=0
+  # XPU kineto feature dependencies are not fully ready; disable the kineto build as a temporary workaround
+  export USE_KINETO=0
  export TORCH_XPU_ARCH_LIST=pvc
 fi

--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -96,6 +96,7 @@ function pip_build_and_install() {
    python3 -m pip wheel \
      --no-build-isolation \
      --no-deps \
+      --no-use-pep517 \
      -w "${wheel_dir}" \
      "${build_target}"
  fi
@ -307,28 +308,6 @@ function install_torchao() {
  pip_build_and_install "git+https://github.com/pytorch/ao.git@${commit}" dist/ao
 }

-function install_flash_attn_cute() {
-  echo "Installing FlashAttention CuTe from GitHub..."
-  # Grab latest main til we have a pinned commit
-  local flash_attn_commit
-  flash_attn_commit=$(git ls-remote https://github.com/Dao-AILab/flash-attention.git HEAD | cut -f1)
-
-  # Clone the repo to a temporary directory
-  rm -rf flash-attention-build
-  git clone --depth 1 --recursive https://github.com/Dao-AILab/flash-attention.git flash-attention-build
-
-  pushd flash-attention-build
-  git checkout "${flash_attn_commit}"
-
-  # Install only the 'cute' sub-directory
-  pip_install -e flash_attn/cute/
-  popd
-
-  # remove the local repo
-  rm -rf flash-attention-build
-  echo "FlashAttention CuTe installation complete."
-}
-
 function print_sccache_stats() {
  echo 'PyTorch Build Statistics'
  sccache --show-stats
--- a/.ci/pytorch/smoke_test/smoke_test.py
+++ b/.ci/pytorch/smoke_test/smoke_test.py
@ -353,17 +353,6 @@ def test_linalg(device="cpu") -> None:
            torch.linalg.svd(A)


-def test_sdpa(device="cpu", dtype=torch.float16) -> None:
-    """Regression test for https://github.com/pytorch/pytorch/issues/167602
-    Without nvrtc_builtins on CuDNN-9.13 on CUDA-13 fails with ` No valid execution plans built.`
-    """
-    print(f"Testing SDPA on {device} using type {dtype}")
-    k, q, v = torch.rand(3, 1, 16, 77, 64, dtype=dtype, device=device).unbind(0)
-    attn = torch.rand(1, 1, 77, 77, dtype=dtype, device=device)
-    rc = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn)
-    assert rc.isnan().any().item() is False
-
-
 def smoke_test_compile(device: str = "cpu") -> None:
    supported_dtypes = [torch.float16, torch.float32, torch.float64]

@ -500,12 +489,10 @@ def main() -> None:
    smoke_test_conv2d()
    test_linalg()
    test_numpy()
-    test_sdpa()

    if is_cuda_system:
        test_linalg("cuda")
        test_cuda_gds_errors_captured()
-        test_sdpa("cuda")

    if options.package == "all":
        smoke_test_modules()
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -208,8 +208,6 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  source /opt/intel/oneapi/ccl/latest/env/vars.sh
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/mpi/latest/env/vars.sh
-  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/pti/latest/env/vars.sh
  # Check XPU status before testing
  timeout 30 xpu-smi discovery || true
 fi
@ -339,23 +337,13 @@ test_python() {

 test_python_smoke() {
  # Smoke tests for H100/B200
-  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune inductor/test_cutedsl_grouped_mm $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
 }

 test_python_smoke_b200() {
-  # Targeted smoke tests for B200 including FlashAttention CuTe coverage
-  install_flash_attn_cute
-  time python test/run_test.py \
-    --include \
-      test_matmul_cuda \
-      test_scaled_matmul_cuda \
-      inductor/test_fp8 \
-      nn/attention/test_fa4 \
-      nn/attention/test_open_registry \
-      inductor/test_flex_flash \
-    $PYTHON_TEST_EXTRA_OPTION \
-    --upload-artifacts-while-running
+  # Targeted smoke tests for B200 - staged approach to avoid too many failures
+  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
 }

@ -389,13 +377,6 @@ test_lazy_tensor_meta_reference_disabled() {
  export -n TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE
 }

-test_dynamo_core() {
-  time python test/run_test.py \
-    --include-dynamo-core-tests \
-    --verbose \
-    --upload-artifacts-while-running
-  assert_git_not_dirty
-}

 test_dynamo_wrapped_shard() {
  if [[ -z "$NUM_TEST_SHARDS" ]]; then
@ -1687,22 +1668,6 @@ test_operator_microbenchmark() {
  done
 }

-test_attention_microbenchmark() {
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  mkdir -p "$TEST_REPORTS_DIR"
-  TEST_DIR=$(pwd)
-
-  # Install attention-gym dependency
-  echo "Installing attention-gym..."
-  python -m pip install git+https://github.com/meta-pytorch/attention-gym.git@main
-  pip show triton
-
-  cd "${TEST_DIR}"/benchmarks/transformer
-
-  $TASKSET python score_mod.py --config configs/config_basic.yaml \
-    --output-json-for-dashboard "${TEST_REPORTS_DIR}/attention_microbenchmark.json"
-}
-
 if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
  (cd test && python -c "import torch; print(torch.__config__.show())")
  (cd test && python -c "import torch; print(torch.__config__.parallel_info())")
@ -1760,8 +1725,6 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then
  fi
 elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then
  test_operator_microbenchmark
-elif [[ "${TEST_CONFIG}" == *attention_microbenchmark* ]]; then
-  test_attention_microbenchmark
 elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
 elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
@ -1821,8 +1784,6 @@ elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
  test_inductor_shard "${SHARD_NUMBER}"
 elif [[ "${TEST_CONFIG}" == *einops* ]]; then
  test_einops
-elif [[ "${TEST_CONFIG}" == *dynamo_core* ]]; then
-  test_dynamo_core
 elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
  install_torchvision
  test_dynamo_wrapped_shard "${SHARD_NUMBER}"
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@ -63,7 +63,7 @@ self-hosted-runner:
    - linux.rocm.gpu.gfx942.1
    - linux.rocm.gpu.gfx942.2
    - linux.rocm.gpu.gfx942.4
-    - linux.rocm.gfx942.docker-cache
+    - rocm-docker
    # Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
    - macos-m1-stable
    - macos-m1-14
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-ee1a1350eb37804b94334768f328144f058f14e9
+ad5816f0eee1c873df1b7d371c69f1f811a89387
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@ -1 +1 @@
-2d82dc5caa336d179d9b46ac4a0fb8c43d84c5cc
+ca2212438fdd8ce29b66999ed70ed54b0f9372d1
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-94631807d22c09723dd006f7be5beb649d5f88d0
+c8b09f5f77d6bf6fb7ed7a9aa83e5d8156b3a5e9
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -165,16 +165,3 @@
 - torch/_inductor/kernel/mm.py
 - test/inductor/test_max_autotune.py
 - third_party/fbgemm
-
-"ciflow/mps":
- aten/src/ATen/mps/**
- aten/src/ATen/native/mps/**
- torch/_inductor/codegen/mps.py
- test/test_mps.py
- test/inductor/test_mps_basic.py
-
-"ciflow/h100-symm-mem":
- torch/csrc/distributed/c10d/symm_mem/**
- torch/distributed/_symmetric_memory/**
- test/distributed/**/*mem*
- test/distributed/**/*mem*/**
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -7,7 +7,6 @@ ciflow_push_tags:
 - ciflow/binaries
 - ciflow/binaries_libtorch
 - ciflow/binaries_wheel
- ciflow/dynamo
 - ciflow/h100
 - ciflow/h100-cutlass-backend
 - ciflow/h100-distributed
--- a/.github/scripts/delete_old_branches.py
+++ b/.github/scripts/delete_old_branches.py
@ -1,11 +1,10 @@
 # Delete old branches
 import os
 import re
-from collections.abc import Callable
 from datetime import datetime
 from functools import lru_cache
 from pathlib import Path
-from typing import Any
+from typing import Any, Callable

 from github_utils import gh_fetch_json_dict, gh_graphql
 from gitutils import GitRepo
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@ -8,11 +8,10 @@ import re
 import subprocess
 import sys
 import warnings
-from collections.abc import Callable
 from enum import Enum
 from functools import cache
 from logging import info
-from typing import Any, Optional
+from typing import Any, Callable, Optional
 from urllib.request import Request, urlopen

 import yaml
--- a/.github/scripts/generate_pytorch_version.py
+++ b/.github/scripts/generate_pytorch_version.py
@ -50,7 +50,7 @@ def get_tag() -> str:

 def get_base_version() -> str:
    root = get_pytorch_root()
-    dirty_version = Path(root / "version.txt").read_text().strip()
+    dirty_version = open(root / "version.txt").read().strip()
    # Strips trailing a0 from version.txt, not too sure why it's there in the
    # first place
    return re.sub(LEGACY_BASE_VERSION_SUFFIX_PATTERN, "", dirty_version)
--- a/.github/scripts/get_workflow_job_id.py
+++ b/.github/scripts/get_workflow_job_id.py
@ -11,8 +11,7 @@ import sys
 import time
 import urllib
 import urllib.parse
-from collections.abc import Callable
-from typing import Any, Optional
+from typing import Any, Callable, Optional
 from urllib.request import Request, urlopen


--- a/.github/scripts/github_utils.py
+++ b/.github/scripts/github_utils.py
@ -3,9 +3,8 @@
 import json
 import os
 import warnings
-from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Any, cast, Optional, Union
+from typing import Any, Callable, cast, Optional, Union
 from urllib.error import HTTPError
 from urllib.parse import quote
 from urllib.request import Request, urlopen
--- a/.github/scripts/gitutils.py
+++ b/.github/scripts/gitutils.py
@ -4,10 +4,10 @@ import os
 import re
 import tempfile
 from collections import defaultdict
-from collections.abc import Callable, Iterator
+from collections.abc import Iterator
 from datetime import datetime
 from functools import wraps
-from typing import Any, cast, Optional, TypeVar, Union
+from typing import Any, Callable, cast, Optional, TypeVar, Union


 T = TypeVar("T")
--- a/.github/scripts/lintrunner.sh
+++ b/.github/scripts/lintrunner.sh
@ -34,9 +34,6 @@ python3 torch/utils/data/datapipes/gen_pyi.py
 # Also check generated pyi files
 find torch -name '*.pyi' -exec git add --force -- "{}" +

-# Print current environment
-python3 -m pip freeze
-
 RC=0
 # Run lintrunner on all files
 if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -17,12 +17,12 @@ import re
 import time
 import urllib.parse
 from collections import defaultdict
-from collections.abc import Callable, Iterable
+from collections.abc import Iterable
 from dataclasses import dataclass
 from functools import cache
 from pathlib import Path
 from re import Pattern
-from typing import Any, cast, NamedTuple, Optional
+from typing import Any, Callable, cast, NamedTuple, Optional
 from warnings import warn

 import yaml
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@ -326,7 +326,7 @@ jobs:
          SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
          SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
          SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
-          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
+          DOCKER_IMAGE: ${{ inputs.docker-image }}
          XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
          XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
--- a/.github/workflows/attention_op_microbenchmark.yml
+++ b/.github/workflows/attention_op_microbenchmark.yml
@ -1,73 +0,0 @@
-name: attention_op_microbenchmark
-
-on:
-  push:
-    tags:
-      - ciflow/op-benchmark/*
-  workflow_dispatch:
-  schedule:
-    # Run at 06:00 UTC everyday
-    - cron: 0 7 * * *
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  attn-microbenchmark-build:
-    if: github.repository_owner == 'pytorch'
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      cuda-arch-list: '8.0 9.0'
-      test-matrix: |
-        { include: [
-          { config: "attention_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
-          { config: "attention_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" },
-        ]}
-    secrets: inherit
-
-  attn-microbenchmark-test:
-    name: attn-microbenchmark-test
-    uses: ./.github/workflows/_linux-test.yml
-    needs: attn-microbenchmark-build
-    with:
-      timeout-minutes: 500
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
-      docker-image: ${{ needs.attn-microbenchmark-build.outputs.docker-image }}
-      test-matrix: ${{ needs.attn-microbenchmark-build.outputs.test-matrix }}
-    secrets: inherit
-
-  # B200 runner
-  opmicrobenchmark-build-b200:
-    if: github.repository_owner == 'pytorch'
-    name: opmicrobenchmark-build-b200
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      cuda-arch-list: '10.0'
-      test-matrix: |
-        { include: [
-          { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
-        ]}
-    secrets: inherit
-
-  opmicrobenchmark-test-b200:
-    name: opmicrobenchmark-test-b200
-    uses: ./.github/workflows/_linux-test.yml
-    needs: opmicrobenchmark-build-b200
-    with:
-      timeout-minutes: 500
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
-      docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
-      test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
-      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-    secrets: inherit
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -67,7 +67,7 @@ jobs:
          pytorch-linux-jammy-py3.10-gcc11,
          pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
          pytorch-linux-jammy-py3.12-halide,
-          pytorch-linux-jammy-cuda12.8-py3.12-pallas,
+          pytorch-linux-jammy-cuda13.0-py3.12-pallas,
          pytorch-linux-jammy-xpu-n-1-py3,
          pytorch-linux-noble-xpu-n-py3,
          pytorch-linux-noble-xpu-n-py3-inductor-benchmarks,
@ -119,22 +119,6 @@ jobs:
        with:
          docker-image: ${{ steps.build-docker-image.outputs.docker-image }}

-      - name: Generate output
-        if: contains(matrix.docker-image-name, 'rocm')
-        id: generate_output
-        run: |
-          docker_image_name="${{ matrix.docker-image-name }}"
-          docker_image_tag="${{ steps.build-docker-image.outputs.docker-image }}"
-          echo "${docker_image_name}=${docker_image_tag}" >> docker-builds-output-${docker_image_name}.txt
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4.4.0
-        if: contains(matrix.docker-image-name, 'rocm')
-        with:
-          name: docker-builds-artifacts-${{ matrix.docker-image-name }}
-          retention-days: 14
-          path: ./docker-builds-output-${{ matrix.docker-image-name }}.txt
-
      - uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0
        name: Push to https://ghcr.io/
        id: push-to-ghcr-io
--- a/.github/workflows/docker-cache-mi300.yml
+++ b/.github/workflows/docker-cache-mi300.yml
@ -0,0 +1,55 @@
+name: docker-cache-mi300
+
+on:
+  # run every 6 hours
+  schedule:
+    - cron: 0 0,6,12,18 * * *
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  docker-cache:
+    if: github.repository_owner == 'pytorch'
+    runs-on: rocm-docker
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          no-sudo: true
+
+      - name: configure aws credentials
+        id: aws_creds
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+          aws-region: us-east-1
+          role-duration-seconds: 18000
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        continue-on-error: false
+        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
+
+      - name: Calculate docker image
+        id: calculate-docker-image
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
+          push: false
+
+      - name: Pull docker image
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        with:
+          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
+
+      - name: Tar and upload to S3 bucket
+        run: |
+          sudo docker save -o ~/docker-data/pytorch/pytorch_docker_image.tar ${{ steps.calculate-docker-image.outputs.docker-image }}
+          sudo rclone copy -P --s3-upload-concurrency 64 --s3-chunk-size 200M --s3-upload-cutoff 300M ~/docker-data/pytorch/pytorch_docker_image.tar oci:pytorchbucket0002/pytorch_docker_image --progress
--- a/.github/workflows/docker-cache-rocm.yml
+++ b/.github/workflows/docker-cache-rocm.yml
@ -1,105 +0,0 @@
-name: docker-cache-rocm
-
-on:
-  workflow_run:
-    workflows: [docker-builds]
-    branches: [main, release]
-    types:
-      - completed
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-  actions: read
-
-jobs:
-  download-docker-builds-artifacts:
-    if: github.repository_owner == 'pytorch'
-    name: download-docker-builds-artifacts
-    runs-on: ubuntu-latest
-    outputs:
-      pytorch-linux-jammy-rocm-n-py3: ${{ steps.process-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}
-      pytorch-linux-noble-rocm-n-py3: ${{ steps.process-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}
-      pytorch-linux-jammy-rocm-n-py3-benchmarks: ${{ steps.process-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}
-    steps:
-      - name: Download artifacts
-        uses: actions/download-artifact@v4.1.7
-        with:
-          run-id: ${{ github.event.workflow_run.id }}
-          path: ./docker-builds-artifacts
-          merge-multiple: true
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Process artifacts
-        id: process-artifacts
-        run: |
-          ls -R ./docker-builds-artifacts
-          cat ./docker-builds-artifacts/*txt >> "${GITHUB_OUTPUT}"
-          cat "${GITHUB_OUTPUT}"
-
-  docker-cache:
-    if: github.repository_owner == 'pytorch'
-    needs: download-docker-builds-artifacts
-    strategy:
-      fail-fast: false
-      matrix:
-        runner: [linux.rocm.gfx942.docker-cache]
-        docker-image: [
-          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}",
-          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}",
-          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}"
-        ]
-    runs-on: "${{ matrix.runner }}"
-    steps:
-      - name: debug
-        run: |
-          JSON_STRINGIFIED="${{ toJSON(needs.download-docker-builds-artifacts.outputs) }}"
-          echo "Outputs of download-docker-builds-artifacts job: ${JSON_STRINGIFIED}"
-
-      - name: configure aws credentials
-        id: aws_creds
-        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
-        with:
-          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-          aws-region: us-east-1
-          role-duration-seconds: 18000
-
-      - name: Login to Amazon ECR
-        id: login-ecr
-        continue-on-error: false
-        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
-
-      - name: Generate ghrc.io tag
-        id: ghcr-io-tag
-        run: |
-            ecr_image="${{ matrix.docker-image }}"
-            ghcr_image="ghcr.io/pytorch/ci-image:${ecr_image##*:}"
-            echo "ghcr_image=${ghcr_image}" >> "$GITHUB_OUTPUT"
-
-      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
-        with:
-          docker-image: ${{ steps.ghcr-io-tag.outputs.ghcr_image }}
-
-      - name: Save as tarball
-        run: |
-          docker_image_tag=${{ matrix.docker-image }}
-          docker_image_tag="${docker_image_tag#*:}" # Remove everything before and including first ":"
-          docker_image_tag="${docker_image_tag%-*}" # Remove everything after and including last "-"
-          ref_name=${{ github.event.workflow_run.head_branch }}
-          if [[ $ref_name =~ "release/" ]]; then
-            ref_suffix="release"
-          elif [[ $ref_name == "main" ]]; then
-            ref_suffix="main"
-          else
-            echo "Unexpected branch in ref_name: ${ref_name}" && exit 1
-          fi
-          docker tag ${{ steps.ghcr-io-tag.outputs.ghcr_image }} ${{ matrix.docker-image }}
-          # mv is atomic operation, so we use intermediate tar.tmp file to prevent read-write contention
-          docker save -o ~/pytorch-data/docker/${docker_image_tag}.tar.tmp ${{ matrix.docker-image }}
-          mv ~/pytorch-data/docker/${docker_image_tag}.tar.tmp ~/pytorch-data/docker/${docker_image_tag}_${ref_suffix}.tar
--- a/.github/workflows/dynamo-unittest.yml
+++ b/.github/workflows/dynamo-unittest.yml
@ -1,70 +0,0 @@
-# Workflow: Dynamo Unit Test
-# runs unit tests for dynamo.
-name: dynamo-unittest
-
-on:
-  push:
-    tags:
-      - ciflow/dynamo/*
-  workflow_call:
-  schedule:
-    - cron: 29 8 * * * # about 1:29am PDT
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  get-label-type:
-    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-      opt_out_experiments: lf
-
-  dynamo-build:
-    name: dynamo-build
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    strategy:
-      matrix:
-        python-version: ['3.11', '3.12']
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py${{ matrix.python-version }}-clang12
-      docker-image-name: ci-image:pytorch-linux-jammy-py${{ matrix.python-version }}-clang12
-      test-matrix: |
-        { include: [
-          { config: "dynamo_core", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-          { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-          { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-          { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-        ]}
-    secrets: inherit
-
-  dynamo-test:
-    name: dynamo-test
-    uses: ./.github/workflows/_linux-test.yml
-    needs: [get-label-type, dynamo-build]
-    strategy:
-      matrix:
-        python-version: ['3.11', '3.12']
-    with:
-      build-environment: linux-jammy-py${{ matrix.python-version }}-clang12
-      docker-image: ci-image:pytorch-linux-jammy-py${{ matrix.python-version }}-clang12
-      test-matrix: |
-        { include: [
-          { config: "dynamo_core", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-          { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-          { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-          { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-        ]}
-    secrets: inherit
--- a/.github/workflows/h100-distributed.yml
+++ b/.github/workflows/h100-distributed.yml
@ -37,6 +37,7 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runner: "linux.c7i.12xlarge"
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '9.0'
--- a/.github/workflows/inductor-rocm-mi200.yml
+++ b/.github/workflows/inductor-rocm-mi200.yml
@ -1,4 +1,4 @@
-name: inductor-rocm-mi200
+name: inductor-rocm

 on:
  schedule:
--- a/.github/workflows/inductor-unittest.yml
+++ b/.github/workflows/inductor-unittest.yml
@ -86,14 +86,14 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      build-environment: linux-jammy-cuda12.8-py3.12-gcc11
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-py3.12-pallas
+      build-environment: linux-jammy-py3.12-gcc11
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-py3.12-pallas
      cuda-arch-list: '8.9'
      runner: linux.8xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      test-matrix: |
        { include: [
-          { config: "inductor-pallas", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" },
+          { config: "inductor-pallas", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
        ]}
    secrets: inherit

--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@ -5,11 +5,9 @@ on:
    - cron: 0 0 * * *
  push:
    tags:
-      # NOTE: Doc build pipelines should only get triggered on:
-      # Major or minor release candidates builds
-      - v[0-9]+.[0-9]+.0+-rc[0-9]+
-      # Final RC for major, minor and patch releases
-      - v[0-9]+.[0-9]+.[0-9]+
+      # NOTE: Doc build pipelines should only get triggered on release candidate builds
+      # Release candidate tags look like: v1.11.0-rc1
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
      - ciflow/nightly/*
  workflow_dispatch:

--- a/.github/workflows/rocm-mi200.yml
+++ b/.github/workflows/rocm-mi200.yml
@ -1,4 +1,4 @@
-name: rocm-mi200
+name: rocm

 on:
  push:
--- a/.github/workflows/test-b200.yml
+++ b/.github/workflows/test-b200.yml
@ -5,9 +5,7 @@
 # Flow:
 # 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200
 # 2. Runs smoke tests on linux.dgx.b200 runner
-# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke_b200() function
-#    - Includes matmul, scaled_matmul, FP8, and FlashAttention CuTe tests
-#    - FlashAttention CuTe DSL is installed as part of test execution
+# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function
 #
 # Triggered by:
 # - Pull requests modifying this workflow file
--- a/.github/workflows/test-h100.yml
+++ b/.github/workflows/test-h100.yml
@ -41,6 +41,7 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '9.0'
--- a/.github/workflows/trunk-rocm-mi300.yml
+++ b/.github/workflows/trunk-rocm-mi300.yml
@ -1,83 +0,0 @@
-name: trunk-rocm-mi300
-
-on:
-  push:
-    branches:
-      - main
-      - release/*
-  workflow_dispatch:
-  schedule:
-    - cron: 29 8 * * *  # about 1:29am PDT
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  llm-td:
-    if: github.repository_owner == 'pytorch'
-    name: before-test
-    uses: ./.github/workflows/llm_td_retrieval.yml
-    permissions:
-      id-token: write
-      contents: read
-
-  target-determination:
-    name: before-test
-    uses: ./.github/workflows/target_determination.yml
-    needs: llm-td
-    permissions:
-      id-token: write
-      contents: read
-
-  get-label-type:
-    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-
-  linux-jammy-rocm-py3_10-build:
-    name: linux-jammy-rocm-py3.10
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10
-      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
-      sync-tag: rocm-build
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
-          { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
-          { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
-          { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
-          { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
-          { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
-          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" },
-          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" },
-          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" },
-        ]}
-    secrets: inherit
-
-  linux-jammy-rocm-py3_10-test:
-    permissions:
-      id-token: write
-      contents: read
-    name: linux-jammy-rocm-py3.10
-    uses: ./.github/workflows/_rocm-test.yml
-    needs:
-      - linux-jammy-rocm-py3_10-build
-      - target-determination
-    with:
-      build-environment: linux-jammy-rocm-py3.10
-      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
-    secrets: inherit
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@ -5,7 +5,6 @@ on:
    workflows:
      - pull
      - trunk
-      - trunk-rocm-mi300
      - periodic
      - periodic-rocm-mi200
      - periodic-rocm-mi300
--- a/.gitignore
+++ b/.gitignore
@ -127,7 +127,6 @@ torch/test/
 torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
 torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
 torch/version.py
-torch/_inductor/kernel/vendored_templates/*
 minifier_launcher.py
 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d*
 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d*
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -186,8 +186,6 @@ include_patterns = [
    'aten/src/ATen/native/nested/cuda/*.h',
    'aten/src/ATen/native/nested/*.cpp',
    'aten/src/ATen/native/nested/*.h',
-    'aten/src/ATen/xpu/**/*.h',
-    'aten/src/ATen/xpu/**/*.cpp',
    'c10/**/*.cpp',
    'c10/**/*.h',
    'torch/*.h',
--- a/.spin/cmds.py
+++ b/.spin/cmds.py
@ -1,330 +0,0 @@
-import hashlib
-import subprocess
-import sys
-from pathlib import Path
-
-import click
-import spin
-
-
-def file_digest(file, algorithm: str):
-    try:
-        return hashlib.file_digest(file, algorithm)
-    except AttributeError:
-        pass  # Fallback to manual implementation below
-    hash = hashlib.new(algorithm)
-    while chunk := file.read(8192):
-        hash.update(chunk)
-    return hash
-
-
-def _hash_file(file):
-    with open(file, "rb") as f:
-        hash = file_digest(f, "sha256")
-    return hash.hexdigest()
-
-
-def _hash_files(files):
-    hashes = {file: _hash_file(file) for file in files}
-    return hashes
-
-
-def _read_hashes(hash_file: Path):
-    if not hash_file.exists():
-        return {}
-    with hash_file.open("r") as f:
-        lines = f.readlines()
-    hashes = {}
-    for line in lines:
-        hash = line[:64]
-        file = line[66:].strip()
-        hashes[file] = hash
-    return hashes
-
-
-def _updated_hashes(hash_file, files_to_hash):
-    old_hashes = _read_hashes(hash_file)
-    new_hashes = _hash_files(files_to_hash)
-    if new_hashes != old_hashes:
-        return new_hashes
-    return None
-
-
-@click.command()
-def regenerate_version():
-    """Regenerate version.py."""
-    cmd = [
-        sys.executable,
-        "-m",
-        "tools.generate_torch_version",
-        "--is-debug=false",
-    ]
-    spin.util.run(cmd)
-
-
-TYPE_STUBS = [
-    (
-        "Pytorch type stubs",
-        Path(".lintbin/.pytorch-type-stubs.sha256"),
-        [
-            "aten/src/ATen/native/native_functions.yaml",
-            "aten/src/ATen/native/tags.yaml",
-            "tools/autograd/deprecated.yaml",
-        ],
-        [
-            sys.executable,
-            "-m",
-            "tools.pyi.gen_pyi",
-            "--native-functions-path",
-            "aten/src/ATen/native/native_functions.yaml",
-            "--tags-path",
-            "aten/src/ATen/native/tags.yaml",
-            "--deprecated-functions-path",
-            "tools/autograd/deprecated.yaml",
-        ],
-    ),
-    (
-        "Datapipes type stubs",
-        None,
-        [],
-        [
-            sys.executable,
-            "torch/utils/data/datapipes/gen_pyi.py",
-        ],
-    ),
-]
-
-
-@click.command()
-def regenerate_type_stubs():
-    """Regenerate type stubs."""
-    for name, hash_file, files_to_hash, cmd in TYPE_STUBS:
-        if hash_file:
-            if hashes := _updated_hashes(hash_file, files_to_hash):
-                click.echo(
-                    f"Changes detected in type stub files for {name}. Regenerating..."
-                )
-                spin.util.run(cmd)
-                hash_file.parent.mkdir(parents=True, exist_ok=True)
-                with hash_file.open("w") as f:
-                    for file, hash in hashes.items():
-                        f.write(f"{hash}  {file}\n")
-                click.echo("Type stubs and hashes updated.")
-            else:
-                click.echo(f"No changes detected in type stub files for {name}.")
-        else:
-            click.echo(f"No hash file for {name}. Regenerating...")
-            spin.util.run(cmd)
-            click.echo("Type stubs regenerated.")
-
-
-@click.command()
-def regenerate_clangtidy_files():
-    """Regenerate clang-tidy files."""
-    cmd = [
-        sys.executable,
-        "-m",
-        "tools.linter.clang_tidy.generate_build_files",
-    ]
-    spin.util.run(cmd)
-
-
-#: These linters are expected to need less than 3s cpu time total
-VERY_FAST_LINTERS = {
-    "ATEN_CPU_GPU_AGNOSTIC",
-    "BAZEL_LINTER",
-    "C10_NODISCARD",
-    "C10_UNUSED",
-    "CALL_ONCE",
-    "CMAKE_MINIMUM_REQUIRED",
-    "CONTEXT_DECORATOR",
-    "COPYRIGHT",
-    "CUBINCLUDE",
-    "DEPLOY_DETECTION",
-    "ERROR_PRONE_ISINSTANCE",
-    "EXEC",
-    "HEADER_ONLY_LINTER",
-    "IMPORT_LINTER",
-    "INCLUDE",
-    "LINTRUNNER_VERSION",
-    "MERGE_CONFLICTLESS_CSV",
-    "META_NO_CREATE_UNBACKED",
-    "NEWLINE",
-    "NOQA",
-    "NO_WORKFLOWS_ON_FORK",
-    "ONCE_FLAG",
-    "PYBIND11_INCLUDE",
-    "PYBIND11_SPECIALIZATION",
-    "PYPIDEP",
-    "PYPROJECT",
-    "RAWCUDA",
-    "RAWCUDADEVICE",
-    "ROOT_LOGGING",
-    "TABS",
-    "TESTOWNERS",
-    "TYPEIGNORE",
-    "TYPENOSKIP",
-    "WORKFLOWSYNC",
-}
-
-
-#: These linters are expected to take a few seconds, but less than 10s cpu time total
-FAST_LINTERS = {
-    "CMAKE",
-    "DOCSTRING_LINTER",
-    "GHA",
-    "NATIVEFUNCTIONS",
-    "RUFF",
-    "SET_LINTER",
-    "SHELLCHECK",
-    "SPACES",
-}
-
-
-#: These linters are expected to take more than 10s cpu time total;
-#: some need more than 1 hour.
-SLOW_LINTERS = {
-    "ACTIONLINT",
-    "CLANGFORMAT",
-    "CLANGTIDY",
-    "CODESPELL",
-    "FLAKE8",
-    "GB_REGISTRY",
-    "PYFMT",
-    "PYREFLY",
-    "TEST_DEVICE_BIAS",
-    "TEST_HAS_MAIN",
-}
-
-
-ALL_LINTERS = VERY_FAST_LINTERS | FAST_LINTERS | SLOW_LINTERS
-
-
-LINTRUNNER_CACHE_INFO = (
-    Path(".lintbin/.lintrunner.sha256"),
-    [
-        "requirements.txt",
-        "pyproject.toml",
-        ".lintrunner.toml",
-    ],
-)
-
-
-LINTRUNNER_BASE_CMD = [
-    "uvx",
-    "--python",
-    "3.10",
-    "lintrunner@0.12.7",
-]
-
-
-@click.command()
-def setup_lint():
-    """Set up lintrunner with current CI version."""
-    cmd = LINTRUNNER_BASE_CMD + ["init"]
-    subprocess.run(cmd, check=True, capture_output=True, text=True)
-
-
-def _check_linters():
-    cmd = LINTRUNNER_BASE_CMD + ["list"]
-    ret = spin.util.run(cmd, output=False, stderr=subprocess.PIPE)
-    linters = {l.strip() for l in ret.stdout.decode().strip().split("\n")[1:]}
-    unknown_linters = linters - ALL_LINTERS
-    missing_linters = ALL_LINTERS - linters
-    if unknown_linters:
-        click.secho(
-            f"Unknown linters found; please add them to the correct category "
-            f"in .spin/cmds.py: {', '.join(unknown_linters)}",
-            fg="yellow",
-        )
-    if missing_linters:
-        click.secho(
-            f"Missing linters found; please update the corresponding category "
-            f"in .spin/cmds.py: {', '.join(missing_linters)}",
-            fg="yellow",
-        )
-    return unknown_linters, missing_linters
-
-
-@spin.util.extend_command(
-    setup_lint,
-    doc=f"""
-        If configuration has changed, update lintrunner.
-
-        Compares the stored old hashes of configuration files with new ones and
-        performs setup via setup-lint if the hashes have changed.
-        Hashes are stored in {LINTRUNNER_CACHE_INFO[0]}; the following files are
-        considered: {", ".join(LINTRUNNER_CACHE_INFO[1])}.
-        """,
-)
-@click.pass_context
-def lazy_setup_lint(ctx, parent_callback, **kwargs):
-    if hashes := _updated_hashes(*LINTRUNNER_CACHE_INFO):
-        click.echo(
-            "Changes detected in lint configuration files. Setting up linting tools..."
-        )
-        parent_callback(**kwargs)
-        hash_file = LINTRUNNER_CACHE_INFO[0]
-        hash_file.parent.mkdir(parents=True, exist_ok=True)
-        with hash_file.open("w") as f:
-            for file, hash in hashes.items():
-                f.write(f"{hash}  {file}\n")
-        click.echo("Linting tools set up and hashes updated.")
-    else:
-        click.echo("No changes detected in lint configuration files. Skipping setup.")
-    click.echo("Regenerating version...")
-    ctx.invoke(regenerate_version)
-    click.echo("Regenerating type stubs...")
-    ctx.invoke(regenerate_type_stubs)
-    click.echo("Done.")
-    _check_linters()
-
-
-@click.command()
-@click.option("-a", "--apply-patches", is_flag=True)
-@click.pass_context
-def lint(ctx, apply_patches, **kwargs):
-    """Lint all files."""
-    ctx.invoke(lazy_setup_lint)
-    all_files_linters = VERY_FAST_LINTERS | FAST_LINTERS
-    changed_files_linters = SLOW_LINTERS
-    cmd = LINTRUNNER_BASE_CMD
-    if apply_patches:
-        cmd += ["--apply-patches"]
-    all_files_cmd = cmd + [
-        "--take",
-        ",".join(all_files_linters),
-        "--all-files",
-    ]
-    spin.util.run(all_files_cmd)
-    changed_files_cmd = cmd + [
-        "--take",
-        ",".join(changed_files_linters),
-    ]
-    spin.util.run(changed_files_cmd)
-
-
-@click.command()
-@click.pass_context
-def fixlint(ctx, **kwargs):
-    """Autofix all files."""
-    ctx.invoke(lint, apply_patches=True)
-
-
-@click.command()
-@click.option("-a", "--apply-patches", is_flag=True)
-@click.pass_context
-def quicklint(ctx, apply_patches, **kwargs):
-    """Lint changed files."""
-    ctx.invoke(lazy_setup_lint)
-    cmd = LINTRUNNER_BASE_CMD
-    if apply_patches:
-        cmd += ["--apply-patches"]
-    spin.util.run(cmd)
-
-
-@click.command()
-@click.pass_context
-def quickfix(ctx, **kwargs):
-    """Autofix changed files."""
-    ctx.invoke(quicklint, apply_patches=True)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -736,44 +736,6 @@ if(NOT DEFINED USE_BLAS)
  set(USE_BLAS ON)
 endif()

-# Prioritized Text Linker Optimization
-if(USE_PRIORITIZED_TEXT_FOR_LD)
-
-  set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
-  set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
-
-  execute_process(
-    COMMAND ${Python_EXECUTABLE}
-            ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py
-            --filein "${LINKER_SCRIPT_FILE_IN}"
-            --fout  "${LINKER_SCRIPT_FILE_OUT}"
-    RESULT_VARIABLE _gen_result
-    OUTPUT_VARIABLE _gen_output
-    ERROR_VARIABLE  _gen_error
-  )
-
-  if(NOT _gen_result EQUAL 0)
-    message(FATAL_ERROR
-      "Failed to generate linker script:\n${_gen_output}\n${_gen_error}")
-  endif()
-
-  append_cxx_flag_if_supported("-ffunction-sections" CMAKE_CXX_FLAGS)
-  append_cxx_flag_if_supported("-fdata-sections" CMAKE_CXX_FLAGS)
-  append_c_flag_if_supported("-ffunction-sections" CMAKE_C_FLAGS)
-  append_c_flag_if_supported("-fdata-sections" CMAKE_C_FLAGS)
-
-  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}")
-  set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}")
-
-else()
-  if(LINUX AND CPU_AARCH64)
-    message(WARNING [[
-    It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
-    To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
-    ]])
-  endif()
-endif()
-
 # Build libtorch mobile library, which contains ATen/TH ops and native support
 # for TorchScript model, but doesn't contain not-yet-unified caffe2 ops;
 if(INTERN_BUILD_MOBILE)
@ -1440,6 +1402,9 @@ if(BUILD_JNI)
  add_subdirectory(android/pytorch_android)
 endif()

+include(cmake/Summary.cmake)
+caffe2_print_configuration_summary()
+
 # Parse custom debug info
 if(DEFINED USE_CUSTOM_DEBINFO)
  string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}")
@ -1479,5 +1444,56 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
          DESTINATION "${CMAKE_INSTALL_BINDIR}")
 endif()

-include(cmake/Summary.cmake)
-caffe2_print_configuration_summary()
+if(USE_PRIORITIZED_TEXT_FOR_LD)
+  add_compile_options(
+    $<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
+    $<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
+  )
+  set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
+  set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
+
+  add_custom_command(
+    OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
+    COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
+    DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
+    COMMENT "Generating prioritized text linker files"
+    VERBATIM
+  )
+
+  add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
+
+  if(BUILD_PYTHON)
+    set(LINKER_OPT_TARGETS torch_python)
+  endif()
+
+  if(NOT BUILD_LIBTORCHLESS)
+    list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
+    if(USE_CUDA)
+      list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
+    endif()
+    if(USE_XPU)
+      list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
+    endif()
+    if(USE_ROCM)
+      list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
+    endif()
+  endif()
+
+  foreach(tgt IN LISTS LINKER_OPT_TARGETS)
+    if(TARGET ${tgt})
+      add_dependencies("${tgt}" generate_linker_script)
+      target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
+      set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
+    else()
+       message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
+    endif()
+  endforeach()
+
+else()
+  if(LINUX AND CPU_AARCH64)
+    message(WARNING [[
+    It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
+    To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
+    ]])
+  endif()
+endif()
--- a/2
+++ b/2
@ -37,7 +37,7 @@ Copyright (c) 2024 Tri Dao.
 All rights reserved.

 All contributions by Arm:
-Copyright (c) 2021, 2023-2025 Arm Limited and/or its affiliates
+Copyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates

 All contributions from Caffe:
 Copyright(c) 2013, 2014, 2015, the respective contributors
--- a/SECURITY.md
+++ b/SECURITY.md
@ -18,8 +18,6 @@ Please report security issues using https://github.com/pytorch/pytorch/security/

 All reports submitted through the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create an [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework.

-**Note on crashes and out of bounds access**: PyTorch is a computational framework that performs operations on behalf of the caller. Like many low-level libraries, PyTorch generally does not validate all inputs to every function—the responsibility for providing valid arguments lies with the calling code. While crashes and out of bounds memory access should be reported as bugs, they are generally not considered security vulnerabilities in PyTorch's threat model.
-
 Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported:

 https://www.facebook.com/whitehat
--- a/aten/src/ATen/DeviceAccelerator.h
+++ b/aten/src/ATen/DeviceAccelerator.h
@ -94,11 +94,6 @@ TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
  at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
 }

-TORCH_API inline std::pair<size_t, size_t> getMemoryInfo(
-    c10::DeviceIndex device_index) {
-  const auto device_type = getAccelerator(true).value();
-  return at::getDeviceAllocator(device_type)->getMemoryInfo(device_index);
-}
 } // namespace at::accelerator

 namespace at {
--- a/aten/src/ATen/core/CachingHostAllocator.h
+++ b/aten/src/ATen/core/CachingHostAllocator.h
@ -226,8 +226,8 @@ template <
    typename B = HostBlock<S>>
 struct CachingHostAllocatorImpl {
  virtual ~CachingHostAllocatorImpl() {
-    if (active_) {
-      active_ = false;
+    active_ = false;
+    if (pinned_use_background_threads()) {
      getBackgroundThreadPool()->waitWorkComplete();
    }
  }
@ -260,7 +260,6 @@ struct CachingHostAllocatorImpl {
    if (pinned_use_background_threads()) {
      // Launch the background thread and process events in a loop.
      static bool background_thread_flag [[maybe_unused]] = [this] {
-        active_ = true;
        getBackgroundThreadPool()->run([&]() {
          while (active_) {
            process_events();
@ -684,9 +683,9 @@ struct CachingHostAllocatorImpl {
  alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
  std::deque<std::pair<E, B*>> events_; // event queue paired with block

-  // Indicates whether the event-processing thread pool is active.
+  // Indicates whether the object is active.
  // Set to false in the destructor to signal background threads to stop.
-  std::atomic<bool> active_{false};
+  std::atomic<bool> active_{true};
 protected:
  alignas(hardware_destructive_interference_size) HostStatsStaged stats_;
 };
--- a/aten/src/ATen/core/TensorAccessor.h
+++ b/aten/src/ATen/core/TensorAccessor.h
@ -1,6 +1,5 @@
 #pragma once

-#include <torch/headeronly/core/TensorAccessor.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/ArrayRef.h>
 #include <c10/util/Deprecated.h>
@ -12,37 +11,252 @@

 namespace at {

-using torch::headeronly::DefaultPtrTraits;
+// The PtrTraits argument to the TensorAccessor/GenericPackedTensorAccessor
+// is used to enable the __restrict__ keyword/modifier for the data
+// passed to cuda.
+template <typename T>
+struct DefaultPtrTraits {
+  typedef T* PtrType;
+};
+
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  using torch::headeronly::RestrictPtrTraits;
+template <typename T>
+struct RestrictPtrTraits {
+  typedef T* __restrict__ PtrType;
+};
 #endif

+// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors.
+// For CUDA tensors they are used in device code (only). This means that we restrict ourselves
+// to functions and types available there (e.g. IntArrayRef isn't).
+
+// The PtrTraits argument is only relevant to cuda to support `__restrict__` pointers.
 template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
-using TensorAccessorBase = torch::headeronly::detail::TensorAccessorBase<c10::IntArrayRef, T, N, PtrTraits, index_t>;
+class TensorAccessorBase {
+public:
+  typedef typename PtrTraits<T>::PtrType PtrType;

+  C10_HOST_DEVICE TensorAccessorBase(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : data_(data_), sizes_(sizes_), strides_(strides_) {}
+  C10_HOST IntArrayRef sizes() const {
+    return IntArrayRef(sizes_,N);
+  }
+  C10_HOST IntArrayRef strides() const {
+    return IntArrayRef(strides_,N);
+  }
+  C10_HOST_DEVICE index_t stride(index_t i) const {
+    return strides_[i];
+  }
+  C10_HOST_DEVICE index_t size(index_t i) const {
+    return sizes_[i];
+  }
+  C10_HOST_DEVICE PtrType data() {
+    return data_;
+  }
+  C10_HOST_DEVICE const PtrType data() const {
+    return data_;
+  }
+protected:
+  PtrType data_;
+  const index_t* sizes_;
+  const index_t* strides_;
+};
+
+// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using
+// `Tensor.accessor<T, N>()`.
+// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and only
+// indexing on the device uses `TensorAccessor`s.
 template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
-using TensorAccessor = torch::headeronly::detail::TensorAccessor<c10::IntArrayRef, T, N, PtrTraits, index_t>;
+class TensorAccessor : public TensorAccessorBase<T,N,PtrTraits,index_t> {
+public:
+  typedef typename PtrTraits<T>::PtrType PtrType;

-namespace detail {
+  C10_HOST_DEVICE TensorAccessor(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : TensorAccessorBase<T, N, PtrTraits, index_t>(data_,sizes_,strides_) {}

-template <size_t N, typename index_t>
-struct IndexBoundsCheck {
-    IndexBoundsCheck(index_t i) {
-      TORCH_CHECK_INDEX(
+  C10_HOST_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
+    return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1);
+  }
+
+  C10_HOST_DEVICE const TensorAccessor<T, N-1, PtrTraits, index_t> operator[](index_t i) const {
+    return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1);
+  }
+};
+
+template<typename T, template <typename U> class PtrTraits, typename index_t>
+class TensorAccessor<T,1,PtrTraits,index_t> : public TensorAccessorBase<T,1,PtrTraits,index_t> {
+public:
+  typedef typename PtrTraits<T>::PtrType PtrType;
+
+  C10_HOST_DEVICE TensorAccessor(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : TensorAccessorBase<T, 1, PtrTraits, index_t>(data_,sizes_,strides_) {}
+  C10_HOST_DEVICE T & operator[](index_t i) {
+    // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
+    return this->data_[this->strides_[0]*i];
+  }
+  C10_HOST_DEVICE const T & operator[](index_t i) const {
+    return this->data_[this->strides_[0]*i];
+  }
+};
+
+
+// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used for CUDA `Tensor`s on the host
+// and as kernel arguments.
+// In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host)
+// in order to transfer them to the device when calling kernels.
+// On the device, indexing of multidimensional tensors yields `TensorAccessor`s.
+// Use RestrictPtrTraits as PtrTraits if you want the tensor's data pointer to be marked as __restrict__.
+// Instantiation from data, sizes, strides is only needed on the host and std::copy isn't available
+// on the device, so those functions are host only.
+template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+class GenericPackedTensorAccessorBase {
+public:
+  typedef typename PtrTraits<T>::PtrType PtrType;
+  C10_HOST GenericPackedTensorAccessorBase(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : data_(data_) {
+    std::copy(sizes_, sizes_ + N, std::begin(this->sizes_));
+    std::copy(strides_, strides_ + N, std::begin(this->strides_));
+  }
+
+  // if index_t is not int64_t, we want to have an int64_t constructor
+  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
+  C10_HOST GenericPackedTensorAccessorBase(
+      PtrType data_,
+      const source_index_t* sizes_,
+      const source_index_t* strides_)
+      : data_(data_) {
+    for (const auto i : c10::irange(N)) {
+      this->sizes_[i] = sizes_[i];
+      this->strides_[i] = strides_[i];
+    }
+  }
+
+  C10_HOST_DEVICE index_t stride(index_t i) const {
+    return strides_[i];
+  }
+  C10_HOST_DEVICE index_t size(index_t i) const {
+    return sizes_[i];
+  }
+  C10_HOST_DEVICE PtrType data() {
+    return data_;
+  }
+  C10_HOST_DEVICE const PtrType data() const {
+    return data_;
+  }
+protected:
+  PtrType data_;
+  // NOLINTNEXTLINE(*c-arrays*)
+  index_t sizes_[N];
+  // NOLINTNEXTLINE(*c-arrays*)
+  index_t strides_[N];
+  C10_HOST void bounds_check_(index_t i) const {
+    TORCH_CHECK_INDEX(
        0 <= i && i < index_t{N},
        "Index ",
        i,
        " is not within bounds of a tensor of dimension ",
        N);
-    }
+  }
 };
-}  // namespace detail

 template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
-using GenericPackedTensorAccessorBase = torch::headeronly::detail::GenericPackedTensorAccessorBase<detail::IndexBoundsCheck<N, index_t>, T, N, PtrTraits, index_t>;
+class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase<T,N,PtrTraits,index_t> {
+public:
+  typedef typename PtrTraits<T>::PtrType PtrType;
+
+  C10_HOST GenericPackedTensorAccessor(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}
+
+  // if index_t is not int64_t, we want to have an int64_t constructor
+  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
+  C10_HOST GenericPackedTensorAccessor(
+      PtrType data_,
+      const source_index_t* sizes_,
+      const source_index_t* strides_)
+      : GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}
+
+  C10_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
+    index_t* new_sizes = this->sizes_ + 1;
+    index_t* new_strides = this->strides_ + 1;
+    return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i, new_sizes, new_strides);
+  }
+
+  C10_DEVICE const TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) const {
+    const index_t* new_sizes = this->sizes_ + 1;
+    const index_t* new_strides = this->strides_ + 1;
+    return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i, new_sizes, new_strides);
+  }
+
+  /// Returns a PackedTensorAccessor of the same dimension after transposing the
+  /// two dimensions given. Does not actually move elements; transposition is
+  /// made by permuting the size/stride arrays. If the dimensions are not valid,
+  /// asserts.
+  C10_HOST GenericPackedTensorAccessor<T, N, PtrTraits, index_t> transpose(
+      index_t dim1,
+      index_t dim2) const {
+    this->bounds_check_(dim1);
+    this->bounds_check_(dim2);
+    GenericPackedTensorAccessor<T, N, PtrTraits, index_t> result(
+        this->data_, this->sizes_, this->strides_);
+    std::swap(result.strides_[dim1], result.strides_[dim2]);
+    std::swap(result.sizes_[dim1], result.sizes_[dim2]);
+    return result;
+  }
+};
+
+template<typename T, template <typename U> class PtrTraits, typename index_t>
+class GenericPackedTensorAccessor<T,1,PtrTraits,index_t> : public GenericPackedTensorAccessorBase<T,1,PtrTraits,index_t> {
+public:
+  typedef typename PtrTraits<T>::PtrType PtrType;
+  C10_HOST GenericPackedTensorAccessor(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}
+
+  // if index_t is not int64_t, we want to have an int64_t constructor
+  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
+  C10_HOST GenericPackedTensorAccessor(
+      PtrType data_,
+      const source_index_t* sizes_,
+      const source_index_t* strides_)
+      : GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}
+
+  C10_DEVICE T & operator[](index_t i) {
+    return this->data_[this->strides_[0] * i];
+  }
+  C10_DEVICE const T& operator[](index_t i) const {
+    return this->data_[this->strides_[0]*i];
+  }
+
+  // Same as in the general N-dimensional case, but note that in the
+  // 1-dimensional case the returned PackedTensorAccessor will always be an
+  // identical copy of the original
+  C10_HOST GenericPackedTensorAccessor<T, 1, PtrTraits, index_t> transpose(
+      index_t dim1,
+      index_t dim2) const {
+    this->bounds_check_(dim1);
+    this->bounds_check_(dim2);
+    return GenericPackedTensorAccessor<T, 1, PtrTraits, index_t>(
+        this->data_, this->sizes_, this->strides_);
+  }
+};

-template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
-using GenericPackedTensorAccessor = torch::headeronly::detail::GenericPackedTensorAccessor<TensorAccessor<T, N-1, PtrTraits, index_t>, detail::IndexBoundsCheck<N, index_t>, T, N, PtrTraits, index_t>;

 // Can't put this directly into the macro function args because of commas
 #define AT_X GenericPackedTensorAccessor<T, N, PtrTraits, index_t>
--- a/aten/src/ATen/core/TensorBase.h
+++ b/aten/src/ATen/core/TensorBase.h
@ -245,9 +245,6 @@ class TORCH_API TensorBase {
  size_t weak_use_count() const noexcept {
    return impl_.weak_use_count();
  }
-  bool is_uniquely_owned() const noexcept {
-    return impl_.is_uniquely_owned();
-  }

  std::string toString() const;

--- a/aten/src/ATen/core/ivalue.h
+++ b/aten/src/ATen/core/ivalue.h
@ -18,8 +18,6 @@
 #include <unordered_set>
 #include <utility>

-C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default")
-
 namespace torch {
 class TORCH_API CustomClassHolder : public c10::intrusive_ptr_target {};
 namespace jit {
@ -1632,6 +1630,4 @@ struct TORCH_API WeakOrStrongTypePtr {

 } // namespace c10

-C10_DIAGNOSTIC_POP()
-
 #include <ATen/core/ivalue_inl.h> // IWYU pragma: keep
--- a/aten/src/ATen/core/ivalue_inl.h
+++ b/aten/src/ATen/core/ivalue_inl.h
@ -29,8 +29,6 @@
 #include <c10/util/intrusive_ptr.h>
 #include <c10/util/irange.h>

-C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default")
-
 namespace torch {
 namespace jit {
 struct Function;
@ -2569,5 +2567,3 @@ TypePtr IValue::type() const {
 }

 } // namespace c10
-
-C10_DIAGNOSTIC_POP()
--- a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h
@ -223,62 +223,6 @@ CONVERT_FROM_BF16_TEMPLATE(double)
 CONVERT_FROM_BF16_TEMPLATE(float16_t)
 #endif

-#ifdef __ARM_FEATURE_BF16
-
-// clang-[17, 20] crashes when autovectorizing static cast to bf16
-// Below is a workaround to have some vectorization
-// Works decently well for smaller int types
-template <typename from_type>
-inline void convertToBf16Impl(
-    const from_type* __restrict src,
-    c10::BFloat16* __restrict dst,
-    uint64_t n) {
-  bfloat16_t* dstPtr = reinterpret_cast<bfloat16_t*>(dst);
-  uint64_t loopBound = n - (n % 16);
-  uint64_t i = 0;
-  for (; i < loopBound; i += 16) {
-    float32x4_t a, b, c, d;
-    a[0] = static_cast<float>(src[i]);
-    a[1] = static_cast<float>(src[i + 1]);
-    a[2] = static_cast<float>(src[i + 2]);
-    a[3] = static_cast<float>(src[i + 3]);
-    b[0] = static_cast<float>(src[i + 4]);
-    b[1] = static_cast<float>(src[i + 5]);
-    b[2] = static_cast<float>(src[i + 6]);
-    b[3] = static_cast<float>(src[i + 7]);
-    c[0] = static_cast<float>(src[i + 8]);
-    c[1] = static_cast<float>(src[i + 9]);
-    c[2] = static_cast<float>(src[i + 10]);
-    c[3] = static_cast<float>(src[i + 11]);
-    d[0] = static_cast<float>(src[i + 12]);
-    d[1] = static_cast<float>(src[i + 13]);
-    d[2] = static_cast<float>(src[i + 14]);
-    d[3] = static_cast<float>(src[i + 15]);
-
-    vst1q_bf16(dstPtr + i, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(a), b));
-    vst1q_bf16(dstPtr + i + 8, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(c), d));
-  }
-
-#pragma clang loop vectorize(disable) interleave(disable) unroll(disable)
-  for (; i < n; i++) {
-    float a = static_cast<float>(src[i]);
-    dstPtr[i] = vcvth_bf16_f32(a);
-  }
-}
-
-#define CONVERT_TO_BF16_TEMPLATE(from_type)                                  \
-  template <>                                                                \
-  inline void convert(const from_type* src, c10::BFloat16* dst, int64_t n) { \
-    return convertToBf16Impl<from_type>(src, dst, n);                        \
-  }
-
-CONVERT_TO_BF16_TEMPLATE(uint8_t)
-CONVERT_TO_BF16_TEMPLATE(int8_t)
-CONVERT_TO_BF16_TEMPLATE(int16_t)
-CONVERT_TO_BF16_TEMPLATE(int32_t)
-
-#endif
-
 inline void convertBoolToBfloat16Impl(
    const bool* __restrict src,
    c10::BFloat16* __restrict dst,
--- a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h
@ -11,8 +11,6 @@
 #include <sleef.h>
 #endif

-C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default")
-
 // Sleef offers vectorized versions of some transcedentals
 // such as sin, cos, tan etc..
 // However for now opting for STL, since we are not building
@ -652,5 +650,3 @@ inline Vectorized<float> Vectorized<float>::erf() const {

 } // namespace CPU_CAPABILITY
 } // namespace at::vec
-
-C10_DIAGNOSTIC_POP()
--- a/aten/src/ATen/cuda/CUDAContextLight.h
+++ b/aten/src/ATen/cuda/CUDAContextLight.h
@ -3,7 +3,6 @@

 #include <cstdint>
 #include <map>
-#include <shared_mutex>

 #include <cuda_runtime_api.h>
 #include <cusparse.h>
@ -89,13 +88,8 @@ TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
 TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();

 TORCH_CUDA_CPP_API void clearCublasWorkspaces();
-struct WorkspaceMapWithMutex {
-  std::map<std::tuple<void*, void*>, at::DataPtr> map;
-  std::shared_mutex mutex;
-};
-
-TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
-TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
+TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace();
+TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace();
 TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize();
 TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize();
 TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace();
--- a/aten/src/ATen/cuda/CUDAGraph.cpp
+++ b/aten/src/ATen/cuda/CUDAGraph.cpp
@ -1,7 +1,6 @@
 #include <ATen/cuda/CUDAGeneratorImpl.h>
 #include <ATen/cuda/CUDAGraph.h>
 #include <ATen/cuda/Exceptions.h>
-#include <ATen/cuda/MemPool.h>
 #include <ATen/Functions.h>
 #include <c10/cuda/CUDAFunctions.h>

@ -14,7 +13,7 @@ static bool _cuda_graphs_debug = false;
 MempoolId_t graph_pool_handle() {
  // Sets just the second value, to distinguish it from MempoolId_ts created from
  // cudaStreamGetCaptureInfo id_s in capture_begin.
-  return at::cuda::MemPool::graph_pool_handle();
+  return c10::cuda::MemPool::graph_pool_handle();
 }

 /**
@ -91,7 +90,7 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt
  } else {
    // User did not ask us to share a mempool. Create graph pool handle using is_user_created=false.
    // Sets just the first value, to distinguish it from MempoolId_ts created by graph_pool_handle().
-    mempool_id_ = at::cuda::MemPool::graph_pool_handle(false);
+    mempool_id_ = c10::cuda::MemPool::graph_pool_handle(false);
    TORCH_INTERNAL_ASSERT(mempool_id_.first > 0);
  }

@ -175,24 +174,17 @@ void CUDAGraph::instantiate() {
    // Trailing NULL, NULL, 0 arguments were recommended by Cuda driver people,
    // who prefer not to report error message through these arguments moving forward
    // (they prefer return value, or errors on api calls internal to the capture)
-    // ROCM appears to fail with HIP error: invalid argument
-#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && !defined(USE_ROCM)
-    AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, cudaGraphInstantiateFlagUseNodePriority));
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000)
+    AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, 0));
 #else
    AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0));
 #endif
 //Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory.
 //It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch.
  } else {
-#if !defined(USE_ROCM)
-    AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
-                                                graph_,
-                                                cudaGraphInstantiateFlagAutoFreeOnLaunch | cudaGraphInstantiateFlagUseNodePriority));
-#else
    AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
                                                graph_,
                                                cudaGraphInstantiateFlagAutoFreeOnLaunch));
-#endif
  }
  has_graph_exec_ = true;
 }
--- a/aten/src/ATen/cuda/CublasHandlePool.cpp
+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp
@ -99,7 +99,7 @@ void destroyCublasHandle(cublasHandle_t handle) {
 //   - Comments of @soumith copied from cuDNN handle pool implementation
 #ifdef NO_CUDNN_DESTROY_HANDLE
 #else
-  cublasDestroy(handle);
+    cublasDestroy(handle);
 #endif
 }

@ -107,27 +107,19 @@ using CuBlasPoolType = DeviceThreadHandlePool<cublasHandle_t, createCublasHandle

 } // namespace

-WorkspaceMapWithMutex& cublas_handle_stream_to_workspace() {
-  static auto& instance = *new WorkspaceMapWithMutex;
+std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace() {
+  static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
  return instance;
 }

-WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace() {
-  static auto& instance = *new WorkspaceMapWithMutex;
+std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace() {
+  static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
  return instance;
 }

 void clearCublasWorkspaces() {
-  {
-    auto& workspace = cublas_handle_stream_to_workspace();
-    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
-    workspace.map.clear();
-  }
-  {
-    auto& workspace = cublaslt_handle_stream_to_workspace();
-    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
-    workspace.map.clear();
-  }
+  cublas_handle_stream_to_workspace().clear();
+  cublaslt_handle_stream_to_workspace().clear();
 }

 size_t parseChosenWorkspaceSize() {
@ -241,38 +233,6 @@ at::DataPtr getNewCUDABlasLtWorkspace() {
  return c10::cuda::CUDACachingAllocator::get()->allocate(getCUDABlasLtWorkspaceSize());
 }

-void setWorkspaceForHandle(cublasHandle_t handle, c10::cuda::CUDAStream stream) {
-  cudaStream_t _stream = stream;
-  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
-
-  auto& workspace = cublas_handle_stream_to_workspace();
-
-  size_t workspace_size = getChosenWorkspaceSize();
-
-  // Fast path: check if workspace already exists
-  {
-    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it = workspace.map.find(key);
-    if (workspace_it != workspace.map.end()) {
-      TORCH_CUDABLAS_CHECK(cublasSetWorkspace(
-          handle, workspace_it->second.get(), workspace_size));
-      return;
-    }
-  }
-
-  // Slow path: allocate workspace outside the lock
-  auto new_workspace = getNewWorkspace();
-
-  // Insert with lock (double-check in case another thread inserted while we
-  // were allocating)
-  {
-    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it = workspace.map.try_emplace(key, std::move(new_workspace)).first;
-    TORCH_CUDABLAS_CHECK(
-        cublasSetWorkspace(handle, workspace_it->second.get(), workspace_size));
-  }
-}
-
 void* getCUDABlasLtWorkspace() {
 #ifndef USE_ROCM
  static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true;
@ -281,10 +241,8 @@ void* getCUDABlasLtWorkspace() {
    auto stream = c10::cuda::getCurrentCUDAStream();
    cudaStream_t _stream = stream;
    auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
-    auto& workspace = at::cuda::cublas_handle_stream_to_workspace();
-    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it = workspace.map.find(key);
-    TORCH_INTERNAL_ASSERT(workspace_it != workspace.map.end());
+    auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key);
+    TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end());
    return workspace_it->second.mutable_get();
  }
 #endif
@ -292,29 +250,11 @@ void* getCUDABlasLtWorkspace() {
  auto stream = c10::cuda::getCurrentCUDAStream();
  cudaStream_t _stream = stream;
  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
-
-  auto& workspace = cublaslt_handle_stream_to_workspace();
-
-  // Fast path: check if workspace already exists
-  {
-    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it = workspace.map.find(key);
-    if (workspace_it != workspace.map.end()) {
-      return workspace_it->second.mutable_get();
-    }
-  }
-
-  // Slow path: allocate workspace outside the lock
-  auto new_workspace = getNewCUDABlasLtWorkspace();
-
-  // Insert with lock (double-check in case another thread inserted while we
-  // were allocating)
-  {
-    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it =
-          workspace.map.try_emplace(key, std::move(new_workspace)).first;
-    return workspace_it->second.mutable_get();
+  auto workspace_it = cublaslt_handle_stream_to_workspace().find(key);
+  if (workspace_it == cublaslt_handle_stream_to_workspace().end()) {
+    workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()});
  }
+  return workspace_it->second.mutable_get();
 }

 cublasHandle_t getCurrentCUDABlasHandle() {
@ -358,8 +298,13 @@ cublasHandle_t getCurrentCUDABlasHandle() {
  // will allocate memory dynamically (even if they're cheap) outside
  // PyTorch's CUDA caching allocator. It's possible that CCA used up
  // all the memory and cublas's cudaMallocAsync will return OOM
-  setWorkspaceForHandle(handle, stream);
-
+  cudaStream_t _stream = stream;
+  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
+  auto workspace_it = cublas_handle_stream_to_workspace().find(key);
+  if (workspace_it == cublas_handle_stream_to_workspace().end()) {
+    workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()});
+  }
+  TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize()));
 #if !defined(USE_ROCM)
  // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup
  // FP32 data type calculations based on the value of the allow_tf32 flag.
--- a/aten/src/ATen/cuda/MemPool.cpp
+++ b/aten/src/ATen/cuda/MemPool.cpp
@ -1,69 +0,0 @@
-#include <ATen/core/CachingHostAllocator.h>
-#include <ATen/cuda/MemPool.h>
-
-namespace at::cuda {
-
-// uid_ is incremented when a user creates a MemPool,
-// for example: using graph_pool_handle() or c10::cuda::MemPool().
-//
-// uuid_ is incremented when CUDAGraph creates a MemPool
-// as a result of a user not providing a pool.
-//
-// MempoolId_t of {0, 0} is used to denote when no MemPool has been
-// passed to a function, either by user or CUDAGraphs. For example,
-// default value of MempoolId_t for capture_begin function is {0, 0}.
-// That's why uid_ and uuid_ start at 1.
-std::atomic<CaptureId_t> MemPool::uid_{1};
-std::atomic<CaptureId_t> MemPool::uuid_{1};
-
-MemPool::MemPool(
-    CUDACachingAllocator::CUDAAllocator* allocator,
-    bool is_user_created,
-    bool use_on_oom)
-    : allocator_(allocator), is_user_created_(is_user_created) {
-  if (is_user_created_) {
-    id_ = {0, uid_++};
-  } else {
-    id_ = {uuid_++, 0};
-  }
-  device_ = c10::cuda::current_device();
-  CUDACachingAllocator::createOrIncrefPool(device_, id_, allocator);
-  if (use_on_oom) {
-    CUDACachingAllocator::setUseOnOOM(device_, id_);
-  }
-}
-
-MemPool::~MemPool() {
-  // TORCH_INTERNAL_ASSERT(use_count() == 1);
-  // We used to assert that TORCH_INTERNAL_ASSERT(use_count() == 1);
-  // However, this assertion is not true if a memory pool is shared
-  // with a cuda graph. That CUDAGraph will increase the use count
-  // until it is reset.
-  CUDACachingAllocator::releasePool(device_, id_);
-  c10::cuda::CUDACachingAllocator::emptyCache(id_);
-}
-
-MempoolId_t MemPool::id() {
-  return id_;
-}
-
-CUDACachingAllocator::CUDAAllocator* MemPool::allocator() {
-  return allocator_;
-}
-
-int MemPool::use_count() {
-  return CUDACachingAllocator::getPoolUseCount(device_, id_);
-}
-
-c10::DeviceIndex MemPool::device() {
-  return device_;
-}
-
-MempoolId_t MemPool::graph_pool_handle(bool is_user_created) {
-  if (is_user_created) {
-    return {0, uid_++};
-  }
-  return {uuid_++, 0};
-}
-
-} // namespace at::cuda
--- a/aten/src/ATen/cuda/MemPool.h
+++ b/aten/src/ATen/cuda/MemPool.h
@ -1,44 +0,0 @@
-#pragma once
-
-#include <c10/core/Allocator.h>
-#include <c10/cuda/CUDACachingAllocator.h>
-
-namespace at::cuda {
-
-// Keep BC only
-using c10::CaptureId_t;
-using c10::MempoolId_t;
-
-// MemPool represents a pool of memory in a caching allocator. Currently,
-// it's just the ID of the pool object maintained in the CUDACachingAllocator.
-//
-// An allocator pointer can be passed to the MemPool to define how the
-// allocations should be done in the pool. For example: using a different
-// system allocator such as ncclMemAlloc.
-struct TORCH_CUDA_CPP_API MemPool {
-  MemPool(
-      c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator = nullptr,
-      bool is_user_created = true,
-      bool use_on_oom = false);
-  MemPool(const MemPool&) = delete;
-  MemPool(MemPool&&) = default;
-  MemPool& operator=(const MemPool&) = delete;
-  MemPool& operator=(MemPool&&) = default;
-  ~MemPool();
-
-  MempoolId_t id();
-  c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator();
-  int use_count();
-  c10::DeviceIndex device();
-  static MempoolId_t graph_pool_handle(bool is_user_created = true);
-
- private:
-  static std::atomic<CaptureId_t> uid_;
-  static std::atomic<CaptureId_t> uuid_;
-  c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator_;
-  bool is_user_created_;
-  MempoolId_t id_;
-  c10::DeviceIndex device_;
-};
-
-} // namespace at::cuda
--- a/aten/src/ATen/cuda/NumericLimits.cuh
+++ b/aten/src/ATen/cuda/NumericLimits.cuh
@ -55,6 +55,14 @@ struct numeric_limits<int8_t> {
  static inline __host__ __device__ int8_t upper_bound() { return INT8_MAX; }
 };

+template <>
+struct numeric_limits<uint16_t> {
+  static inline __host__ __device__ uint16_t lowest() { return 0; }
+  static inline __host__ __device__ uint16_t max() { return UINT16_MAX; }
+  static inline __host__ __device__ uint16_t lower_bound() { return 0; }
+  static inline __host__ __device__ uint16_t upper_bound() { return UINT16_MAX; }
+};
+
 template <>
 struct numeric_limits<int16_t> {
  static inline __host__ __device__ int16_t lowest() { return INT16_MIN; }
@ -63,6 +71,14 @@ struct numeric_limits<int16_t> {
  static inline __host__ __device__ int16_t upper_bound() { return INT16_MAX; }
 };

+template <>
+struct numeric_limits<uint32_t> {
+  static inline __host__ __device__ uint32_t lowest() { return 0; }
+  static inline __host__ __device__ uint32_t max() { return UINT32_MAX; }
+  static inline __host__ __device__ uint32_t lower_bound() { return 0; }
+  static inline __host__ __device__ uint32_t upper_bound() { return UINT32_MAX; }
+};
+
 template <>
 struct numeric_limits<int32_t> {
  static inline __host__ __device__ int32_t lowest() { return INT32_MIN; }
@ -71,6 +87,21 @@ struct numeric_limits<int32_t> {
  static inline __host__ __device__ int32_t upper_bound() { return INT32_MAX; }
 };

+template <>
+struct numeric_limits<uint64_t> {
+#ifdef _MSC_VER
+  static inline __host__ __device__ uint64_t lowest() { return 0; }
+  static inline __host__ __device__ uint64_t max() { return _UI64_MAX; }
+  static inline __host__ __device__ uint64_t lower_bound() { return 0; }
+  static inline __host__ __device__ uint64_t upper_bound() { return _UI64_MAX; }
+#else
+  static inline __host__ __device__ uint64_t lowest() { return 0; }
+  static inline __host__ __device__ uint64_t max() { return UINT64_MAX; }
+  static inline __host__ __device__ uint64_t lower_bound() { return 0; }
+  static inline __host__ __device__ uint64_t upper_bound() { return UINT64_MAX; }
+#endif
+};
+
 template <>
 struct numeric_limits<int64_t> {
 #ifdef _MSC_VER
--- a/aten/src/ATen/functorch/BatchedTensorImpl.h
+++ b/aten/src/ATen/functorch/BatchedTensorImpl.h
@ -157,8 +157,6 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({
  DispatchKey::Negative,
  DispatchKey::Conjugate,
  DispatchKey::XLA,
-  DispatchKey::XPU,
-  DispatchKey::HPU,
  DispatchKey::CUDA,
  DispatchKey::CPU,
  DispatchKey::PrivateUse1,
--- a/aten/src/ATen/mps/MPSAllocator.mm
+++ b/aten/src/ATen/mps/MPSAllocator.mm
@ -440,7 +440,7 @@ bool MPSHeapAllocatorImpl::release_cached_buffers() {
  // we need to release the lock temporarily as synchronizing may cause deadlock with completion handlers.
  m_mutex.unlock();
  auto stream = getDefaultMPSStream();
-  dispatch_sync_with_rethrow(stream->queue(), ^() {
+  dispatch_sync(stream->queue(), ^() {
    stream->synchronize(SyncType::COMMIT_AND_WAIT);
  });
  m_mutex.lock();
--- a/aten/src/ATen/mps/MPSStream.h
+++ b/aten/src/ATen/mps/MPSStream.h
@ -110,9 +110,6 @@ class TORCH_API MPSStream {
    return _stream;
  }

-  MTLBuffer_t getErrorBuffer();
-  void checkLastError();
-
 private:
  Stream _stream;
  MTLCommandQueue_t _commandQueue = nil;
@ -124,8 +121,6 @@ class TORCH_API MPSStream {
  dispatch_queue_t _serialQueue = nullptr;
  // CommitAndContinue is enabled by default
  bool _enableCommitAndContinue = true;
-  // Buffer that contains last raised error
-  MTLBuffer_t _errorBuffer = nil;

  // use synchronize() to access any of these commit functions outside MPSStream
  void commit();
@ -160,7 +155,4 @@ class TORCH_API MPSStreamImpl {
  MPSStreamImpl();
 };

-#ifdef __OBJC__
-void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)());
-#endif
 } // namespace at::mps
--- a/aten/src/ATen/mps/MPSStream.mm
+++ b/aten/src/ATen/mps/MPSStream.mm
@ -3,13 +3,13 @@
 #include <ATen/mps/MPSAllocatorInterface.h>
 #include <ATen/mps/MPSProfiler.h>
 #include <ATen/mps/MPSStream.h>
-#include <c10/metal/error.h>

@interface MPSGraphExecutionDescriptor ()
@property(readwrite, atomic) BOOL enableCommitAndContinue;
@end

 namespace at::mps {
+
 //-----------------------------------------------------------------
 //  MPSStream
 //-----------------------------------------------------------------
@ -30,10 +30,6 @@ MPSStream::MPSStream(Stream stream) : _stream(stream) {
  // Choose level which optimizes for GPU
  _compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0;
  _executionDescriptor.compilationDescriptor = _compilationDescriptor;
-
-  _errorBuffer = [MPSDevice::getInstance()->device() newBufferWithLength:sizeof(c10::metal::ErrorMessages)
-                                                                 options:MTLResourceStorageModeShared];
-  std::memset([_errorBuffer contents], 0, 1024);
 }

 MPSStream::~MPSStream() {
@ -42,8 +38,6 @@ MPSStream::~MPSStream() {
  [_executionDescriptor release];
  [_compilationDescriptor release];
  _executionDescriptor = nil;
-  [_errorBuffer release];
-  _errorBuffer = nil;
  _compilationDescriptor = nil;

  assert(_commandBuffer == nil);
@ -110,7 +104,6 @@ void MPSStream::commitAndWait() {
    [_prevCommandBuffer waitUntilCompleted];
    [_prevCommandBuffer release];
    _prevCommandBuffer = nil;
-    checkLastError();
  }

  if (_commandBuffer) {
@ -118,7 +111,6 @@ void MPSStream::commitAndWait() {
    [_commandBuffer waitUntilCompleted];
    [_commandBuffer release];
    _commandBuffer = nil;
-    checkLastError();
  }
 }

@ -161,7 +153,7 @@ void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t
  if (length == 0) {
    return;
  }
-  dispatch_sync_with_rethrow(_serialQueue, ^() {
+  dispatch_sync(_serialQueue, ^() {
    @autoreleasepool {
      endKernelCoalescing();
      id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];
@ -191,7 +183,7 @@ void MPSStream::copy(id<MTLBuffer> srcBuffer,
                     size_t dstOffset,
                     uint64_t profileId,
                     SyncType syncType) {
-  dispatch_sync_with_rethrow(_serialQueue, ^() {
+  dispatch_sync(_serialQueue, ^() {
    @autoreleasepool {
      endKernelCoalescing();
      id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];
@ -244,7 +236,7 @@ void MPSStream::executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDicti
  auto& profiler = getMPSProfiler();
  const bool isGraphProfilingEnabled = profiler.isOperationProfilingEnabled();

-  dispatch_sync_with_rethrow(_serialQueue, ^() {
+  dispatch_sync(_serialQueue, ^() {
    endKernelCoalescing();
    if (isGraphProfilingEnabled) {
      // this function call is only relevant for interval-based Signposts
@ -274,24 +266,6 @@ void MPSStream::executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDicti
  });
 }

-id<MTLBuffer> MPSStream::getErrorBuffer() {
-  return _errorBuffer;
-}
-
-void MPSStream::checkLastError() {
-  auto msgs = reinterpret_cast<c10::metal::ErrorMessages*>([_errorBuffer contents]);
-  const auto& msg = msgs->msg[0];
-  if (!msgs) {
-    return;
-  }
-  unsigned int count = 0;
-  std::swap(count, msgs->count);
-  if (!count) {
-    return;
-  }
-  throw c10::AcceleratorError({msg.func, msg.file, msg.line}, 1, msg.message);
-}
-
 //-----------------------------------------------------------------
 //  MPSStreamImpl
 //-----------------------------------------------------------------
@ -315,19 +289,4 @@ MPSStream* getDefaultMPSStream() {
  return MPSStreamImpl::getInstance();
 }

-// Helper methods
-void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) {
-  __block std::optional<std::exception_ptr> block_exception;
-  dispatch_sync(queue, ^() {
-    try {
-      block();
-    } catch (...) {
-      block_exception = std::current_exception();
-    }
-  });
-  if (block_exception) {
-    std::rethrow_exception(*block_exception);
-  }
-}
-
 } // namespace at::mps
--- a/aten/src/ATen/native/PackedSequence.cpp
+++ b/aten/src/ATen/native/PackedSequence.cpp
@ -142,7 +142,6 @@ Tensor _pack_padded_sequence_backward_symint(const Tensor& grad, c10::SymIntArra
 std::tuple<Tensor, Tensor> _pad_packed_sequence(const Tensor& data, const Tensor& _batch_sizes, bool batch_first, const Scalar& padding_value, int64_t total_length) {
  auto batch_sizes_t = _batch_sizes.contiguous();
  checkLongTensor(batch_sizes_t);
-  TORCH_CHECK(batch_sizes_t.numel() > 0, "batch_sizes can not be empty");

  int64_t * batch_sizes = batch_sizes_t.data_ptr<int64_t>();
  int64_t max_batch_size = batch_sizes[0];
--- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp
+++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
@ -1087,8 +1087,7 @@ TORCH_IMPL_FUNC(index_copy_out)
    result.copy_(self);

  // See Note [Enabling Deterministic Operations]
-  if ((result.is_cuda() || result.is_xpu()) &&
-      globalContext().deterministicAlgorithms()) {
+  if (result.is_cuda() && globalContext().deterministicAlgorithms()) {
    torch::List<std::optional<Tensor>> indices;
    indices.resize(dim + 1);
    indices.set(dim, index);
--- a/aten/src/ATen/native/TensorCompare.cpp
+++ b/aten/src/ATen/native/TensorCompare.cpp
@ -23,7 +23,6 @@
 #include <ATen/ops/_aminmax_native.h>
 #include <ATen/ops/_assert_async_native.h>
 #include <ATen/ops/_assert_scalar_native.h>
-#include <ATen/ops/_async_error_native.h>
 #include <ATen/ops/_functional_assert_async_native.h>
 #include <ATen/ops/_functional_assert_scalar_native.h>
 #include <ATen/ops/_make_per_tensor_quantized_tensor.h>
@ -480,14 +479,6 @@ Tensor isfinite(const Tensor& self) {
  });
 }

-void _async_error(std::string_view msg) {
-  TORCH_CHECK(0, msg);
-}
-
-void _async_error_meta(std::string_view msg) {
-  // Do NOT error, it's an async error!
-}
-
 void _assert_async_cpu(const Tensor& self) {
  TORCH_CHECK(
      native::is_nonzero(self),
--- a/aten/src/ATen/native/TransposeType.h
+++ b/aten/src/ATen/native/TransposeType.h
@ -1,8 +1,6 @@
 #pragma once
 #include <c10/util/Exception.h>

-C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default")
-
 namespace at::native {

 // Used as an interface between the different BLAS-like libraries
@ -23,5 +21,3 @@ static inline char to_blas(TransposeType trans) {
 }

 }  // namespace at::native
-
-C10_DIAGNOSTIC_POP()
--- a/aten/src/ATen/native/UnaryOps.cpp
+++ b/aten/src/ATen/native/UnaryOps.cpp
@ -904,11 +904,19 @@ Tensor mvlgamma(const Tensor& self, int64_t p) {
  return args.lgamma_().sum(-1).add_(p2_sub_p * std::log(c10::pi<double>) * QUARTER);
 }

-// since mvlgamma_ has different signature from its
-// out and functional variant, we explicitly
-// define it (instead of using structured kernel).
 Tensor& mvlgamma_(Tensor& self, int64_t p) {
-  return at::mvlgamma_out(self, self, p);
+  mvlgamma_check(self, p);
+  Tensor args = native::arange(
+      -p *HALF  + HALF,
+      HALF,
+      HALF,
+      optTypeMetaToScalarType(self.options().dtype_opt()),
+      self.options().layout_opt(),
+      self.options().device_opt(),
+      self.options().pinned_memory_opt());
+  args = args.add(self.unsqueeze(-1));
+  const auto p2_sub_p = static_cast<double>(p * (p - 1));
+  return self.copy_(args.lgamma_().sum(-1).add_(p2_sub_p * std::log(c10::pi<double>) * QUARTER));
 }

 Tensor& mvlgamma_out(const Tensor& self, int64_t p, Tensor& result) {
--- a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp
@ -5,6 +5,7 @@
 #include <ATen/native/ReduceOpsUtils.h>

 #include <ATen/Dispatch.h>
+#include <ATen/Dispatch_v2.h>
 #include <ATen/Parallel.h>
 #include <ATen/TensorIterator.h>
 #include <ATen/OpMathType.h>
@ -78,12 +79,12 @@ void min_all_kernel_impl(Tensor& result, const Tensor& input) {
    reduce_all_impl<int64_t>(result, input, upper_bound<int64_t>(),
      [=](int64_t a, int64_t b) -> int64_t { return min_impl(a, b); });
  } else {
-    AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "min_all", [&] {
+    AT_DISPATCH_V2(input.scalar_type(), "min_all", AT_WRAP([&] {
      using Vec = Vectorized<opmath_type<scalar_t>>;
      reduce_all_impl_vec<scalar_t>(result, input, upper_bound<scalar_t>(),
        [=] (scalar_t a , scalar_t b) -> scalar_t { return min_impl(a, b); },
        [=](Vec a, Vec b) -> Vec { return minimum(a, b); });
-    });
+    }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
  }
 }

@ -103,12 +104,12 @@ void max_all_kernel_impl(Tensor& result, const Tensor& input) {
    reduce_all_impl<int64_t>(result, input, lower_bound<int64_t>(),
      [=](int64_t a, int64_t b) -> int64_t { return max_impl(a, b); });
  } else {
-    AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_all", [&] {
+    AT_DISPATCH_V2(input.scalar_type(), "max_all", AT_WRAP([&] {
      using Vec = Vectorized<opmath_type<scalar_t>>;
      reduce_all_impl_vec<scalar_t>(result, input, lower_bound<scalar_t>(),
        [=] (scalar_t a , scalar_t b) -> scalar_t { return max_impl(a, b); },
        [=](Vec a, Vec b) -> Vec { return maximum(a, b); });
-    });
+    }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
  }
 }

@ -199,7 +200,7 @@ void aminmax_allreduce_kernel(
      }
    );
  } else {
-    AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "aminmax_cpu", [&] {
+    AT_DISPATCH_V2(input.scalar_type(), "aminmax_cpu", AT_WRAP([&] {
      using Vec = Vectorized<opmath_type<scalar_t>>;
      using scalar_t_pair = std::pair<scalar_t, scalar_t>;
      reduce_all_impl_vec_two_outputs<scalar_t>(
@ -214,7 +215,7 @@ void aminmax_allreduce_kernel(
        [=](Vec a, Vec b) -> Vec { return minimum(a, b); },
        [=](Vec a, Vec b) -> Vec { return maximum(a, b); }
      );
-    });
+    }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf);
  }
 }

--- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@ -3,6 +3,7 @@

 #include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
+#include <ATen/Dispatch_v2.h>
 #include <ATen/OpMathType.h>
 #include <ATen/cpu/vec/vec.h>
 #include <ATen/cpu/vec/functional.h>
@ -347,34 +348,35 @@ struct MinValuesOps: public at::native::MinOps<scalar_t> {
 };

 void min_values_kernel_impl(TensorIterator& iter) {
-  if (iter.dtype() == kLong) {
-    // This case is special because of Vectorized<int64_t> does not
-    // handle upper_bound<int64_t>().
-    // See: https://github.com/pytorch/pytorch/issues/43254
-    using scalar_t = int64_t;
-    binary_kernel_reduce(
-      iter,
-      MinValuesOps<scalar_t>{},
-      std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
+  // kLong and kUInt64 are special-cased (non-vectorized reduce) because
+  // Vectorized<int64_t> does not handle upper_bound<int64_t>().
+  // See: https://github.com/pytorch/pytorch/issues/43254
+  if (iter.dtype() == kLong || iter.dtype() == kUInt64) {
+    AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
+      binary_kernel_reduce(
+        iter,
+        MinValuesOps<scalar_t>{},
+        std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
+    }), kLong, kUInt64);
    return;
  }
-  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "min_values_cpu", [&iter] {
+  AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
    binary_kernel_reduce_vec(
      iter,
      [](scalar_t a, scalar_t b) -> scalar_t { return min_impl(a, b); },
      [](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return minimum(a, b); },
      static_cast<double>(upper_bound<scalar_t>()));
-  });
+  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
 }

 void max_values_kernel_impl(TensorIterator& iter) {
-  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cpu", [&iter] {
+  AT_DISPATCH_V2(iter.dtype(), "max_values_cpu", AT_WRAP([&iter] {
    binary_kernel_reduce_vec(
      iter,
      [](scalar_t a, scalar_t b) -> scalar_t { return max_impl(a, b); },
      [](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return maximum(a, b); },
      lower_bound<scalar_t>());
-  });
+  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
 }

 void argmax_kernel_impl(TensorIterator &iter) {
--- a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp
+++ b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp
@ -11,6 +11,7 @@
 #include <vector>

 #include <ATen/Dispatch.h>
+#include <ATen/Dispatch_v2.h>
 #include <ATen/Parallel.h>
 #include <ATen/NumericUtils.h>
 #include <ATen/TensorIterator.h>
@ -106,7 +107,7 @@ void min_kernel_impl(
    bool keepdim) {
  int64_t self_dim_size = ensure_nonempty_size(self, dim);

-  AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "min_cpu", [&] {
+  AT_DISPATCH_V2(self.scalar_type(), "min_cpu", AT_WRAP([&] {
    compare_base_kernel<scalar_t>(result, indice, self, dim, keepdim, [&] (
      scalar_t* result_data, int64_t* indice_data,
      const scalar_t* self_data, auto self_dim_stride) {
@ -128,7 +129,7 @@ void min_kernel_impl(
        *indice_data = index;
      }
    );
-  });
+  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool);
 }

 void max_kernel_impl(
@ -139,7 +140,7 @@ void max_kernel_impl(
    bool keepdim) {
  int64_t self_dim_size = ensure_nonempty_size(self, dim);

-  AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "max_cpu", [&] {
+  AT_DISPATCH_V2(self.scalar_type(), "max_cpu", AT_WRAP([&] {
    compare_base_kernel<scalar_t>(result, indice, self, dim, keepdim, [&] (
      scalar_t* result_data, int64_t* indice_data,
      const scalar_t* self_data, auto self_dim_stride) {
@ -161,7 +162,7 @@ void max_kernel_impl(
        *indice_data = index;
      }
    );
-  });
+  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool);
 }

 void aminmax_kernel(
@ -186,7 +187,7 @@ void aminmax_kernel(
    return;
  }

-  AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, self.scalar_type(), "aminmax_cpu", [&] {
+  AT_DISPATCH_V2(self.scalar_type(), "aminmax_cpu", AT_WRAP([&] {
    compare_base_kernel<scalar_t, scalar_t>(min_result, max_result, self, wrap_dim, keepdim, [&] (
      scalar_t* min_result_data, scalar_t* max_result_data,
      const scalar_t* self_data, auto self_dim_stride) {
@ -209,7 +210,7 @@ void aminmax_kernel(
        *max_result_data = max_number;
      }
    );
-  });
+  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half);
 }

 void where_kernel_impl(TensorIterator &iter) {
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -296,7 +296,7 @@ template <typename scalar_t, typename res_scalar_t = scalar_t>
 bool launchGemmAndBiasCublasLt(
    // args contains result which is modified
    cublasCommonArgs& args,
-    const std::optional<Tensor>& self,
+    const Tensor& self,
    const Scalar& alpha,
    Activation activation = Activation::None
 ) {
@ -304,8 +304,12 @@ bool launchGemmAndBiasCublasLt(
  // or when it can be squeezed to 1D.
  // self_ptr == nullptr implies ignore bias epilogue
  // and use standard gemm-like API.
-  const auto* self_ptr = self.has_value() ? self.value().const_data_ptr<scalar_t>() : static_cast<const scalar_t*>(nullptr);
-
+  const auto* self_ptr = [&]() -> auto {
+    if (self.dim() == 1 || self.squeeze().dim() == 1) {
+      return self.const_data_ptr<scalar_t>();
+    }
+    return static_cast<const scalar_t*>(nullptr);
+  }();

  const auto tuning_ctx = at::cuda::tunable::getTuningContext();
  if (tuning_ctx->IsTunableOpEnabled()) {
@ -388,30 +392,35 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
  bool disable_addmm_cuda_lt = persistent_disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
  #ifdef USE_ROCM
  // Conditioned on the device index, which is not persistent
-  disable_addmm_cuda_lt = disable_addmm_cuda_lt || isGloballyDisabledAddmmCudaLt(self.device());
+  disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt;
  #endif
  // Condition on the input
-  disable_addmm_cuda_lt = disable_addmm_cuda_lt || !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation);
+  disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation) || disable_addmm_cuda_lt;
+  // }

  at::ScalarType scalar_type = mat1.scalar_type();
  bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;

-  #ifdef USE_ROCM
-  disable_addmm_cuda_lt = disable_addmm_cuda_lt || is_float_output_with_half_input;
-  #endif
-
-  bool use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
-  // for float output with half input cublasLT with bias produces wrong results
-  use_bias_ptr_lt &= !is_float_output_with_half_input;
-
  // Handle result/self shapes
  if (!result.is_same(self)) {
    at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]});

-      // We do not copy bias only when we need the bias ptr
+    // We use bias ptr in the Lt path only when bias is 1D
+    const auto use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
+    const auto self_maybe_expanded = [&]() -> c10::MaybeOwned<Tensor> {
+      if (!use_bias_ptr_lt) {
+        // Expand self here, even before the
+        // beta != 0.0 check, to make sure that
+        // test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_*
+        // keeps passing.
+        return expand_size(self, result.sizes(), "addmm");
+      }
+      return c10::MaybeOwned<Tensor>::borrowed(self);
+    }();
+    // We do not copy bias only when we need the bias ptr
    if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) {
      // NOTE: self should broadcast over result
-      at::native::copy_(result, *expand_size(self, result.sizes(), "addmm"));
+      at::native::copy_(result, *self_maybe_expanded);
    }
  }

@ -459,7 +468,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
        scalar_type,
        "addmm_cuda_lt",
        [&] {
-          lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
+          lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, self, alpha, activation);
        }
      );
      #endif
@ -471,7 +480,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
        scalar_type,
        "addmm_cuda_lt",
        [&] {
-          lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
+          lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, self, alpha, activation);
        }
      );
    } // end is_float_output_with_half_input
@ -927,7 +936,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) {
  return _int_mm_out_cuda(self, mat2, result);
 }

-static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
+static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
  // ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm
  TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor");
  TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor");
@ -951,7 +960,7 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
    (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
    "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");

-  if (self_baddbmm.has_value()) {
+  if (!is_bmm && self_baddbmm.has_value()) {
    const auto& self = self_baddbmm.value();
    TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor");
    TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output");
@ -959,12 +968,15 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
 }

 Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {
-  Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
+  IntArrayRef batch1_sizes = batch1.sizes();
+  IntArrayRef batch2_sizes = batch2.sizes();
+
+  Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype));
  return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out);
 }

 Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) {
-  baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype);
+  baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true);
  Scalar beta(0.0);
  Scalar alpha(1.0);
  {
@ -976,16 +988,14 @@ Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at
 }

 Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
-  TORCH_CHECK(self.scalar_type() == out_dtype || self.scalar_type() == batch1.dtype(),
-  "self dtype must match either out_dtype or batch1 dtype");
-  Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
-  return _baddbmm_out_dtype_cuda(self, batch1, batch2, out_dtype, beta, alpha, out);
+  // Copy self (clone, then convert to out_dtype); the copy is passed below as both self and out.
+  Tensor out = self.clone().to(self.options().dtype(out_dtype));
+
+  return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out);
 }

 Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
-  baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, out);
-  // We need to copy the tensor
-  out.copy_(self);
+  baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self);
  {
    NoNamesGuard guard;
    baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha);
@ -1020,27 +1030,24 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca
 }

 Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
-  TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
-  TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
-  Tensor result = at::empty({mat1.size(0), mat2.size(1)}, self.options().dtype(out_dtype));
+  Tensor result = at::empty(self.sizes(), self.options().dtype(out_dtype));
  return _addmm_dtype_out_cuda(self, mat1, mat2, out_dtype, beta, alpha, result);
 }

 Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
-// repeat dimensionality checks for direct calls to `out` overload
+  TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type());
+  TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
  TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
  TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
  TORCH_CHECK(
      mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
      mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
-  TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
-  TORCH_CHECK(out_dtype == mat1.scalar_type() ||
-  (out_dtype == at::ScalarType::Float && (mat1.scalar_type() == at::ScalarType::Half || mat1.scalar_type() == at::ScalarType::BFloat16)),
-  "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");

  TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
-  TORCH_CHECK(out_dtype == self.scalar_type() || self.scalar_type() == mat1.scalar_type(),
-    "self dtype must match either out_dtype or mat1 dtype");
+  TORCH_CHECK(out_dtype == self.scalar_type() ||
+    (out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)),
+    "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
+  TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");

  addmm_out_cuda_impl(out, self, mat1, mat2, beta, alpha);

--- a/aten/src/ATen/native/cuda/CompositeRandomAccessor.h
+++ b/aten/src/ATen/native/cuda/CompositeRandomAccessor.h
@ -1,7 +1,6 @@
 #pragma once

 #include <ATen/native/CompositeRandomAccessorCommon.h>
-#include <thrust/swap.h>
 #include <thrust/tuple.h>

 namespace at { namespace native {
--- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu
+++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu
@ -75,52 +75,30 @@ static inline bool can_use_int32_nhwc(
  return true;
 }

-static inline bool can_use_int32_nchw(
-    int64_t nbatch, int64_t channels,
-    int64_t height, int64_t width,
-    int64_t pooled_height, int64_t pooled_width) {
-  int64_t hw = height * width;
-  return can_use_int32_nhwc(
-      nbatch, channels, height, width,
-      pooled_height, pooled_width,
-      channels * hw,  // in_stride_n
-      hw, // in_stride_c
-      width, // in_stride_h
-      1 // in_stride_w
-  );
-}
-
 // kernels borrowed from Caffe
-template <typename scalar_t, typename index_t>
-__global__ void max_pool_forward_nchw(
-    const index_t nthreads,
-    const scalar_t* bottom_data,
-    const int64_t channels,
-    const int64_t height,
-    const int64_t width,
-    const int pooled_height,
-    const int pooled_width,
-    const int kernel_h, const int kernel_w,
-    const int stride_h, const int stride_w,
-    const int pad_h, const int pad_w,
-    const int dilation_h, const int dilation_w,
-    scalar_t* top_data,
+template <typename scalar_t>
+__global__ void max_pool_forward_nchw(const int nthreads, const scalar_t* bottom_data,
+    const int64_t channels, const int64_t height,
+    const int64_t width, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, const int pad_h, const int pad_w,
+    const int dilation_h, const int dilation_w, scalar_t* top_data,
    int64_t* top_mask) {
-  CUDA_KERNEL_LOOP_TYPE(index, nthreads, index_t) {
-    index_t pw = index % pooled_width;
-    index_t ph = (index / pooled_width) % pooled_height;
-    index_t c = (index / pooled_width / pooled_height) % channels;
-    index_t n = index / pooled_width / pooled_height / channels;
-    index_t hstart = ph * stride_h - pad_h;
-    index_t wstart = pw * stride_w - pad_w;
-    index_t hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height);
-    index_t wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width);
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+    int hstart = ph * stride_h - pad_h;
+    int wstart = pw * stride_w - pad_w;
+    int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height);
+    int wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width);
    while(hstart < 0)
      hstart += dilation_h;
    while(wstart < 0)
      wstart += dilation_w;
    scalar_t maxval = at::numeric_limits<scalar_t>::lower_bound(); // -Infinity
-    index_t maxidx = hstart * width + wstart;
+    int maxidx = hstart * width + wstart;
    const scalar_t* btm_data = bottom_data + (n * channels + c) * height * width;
    for (int h = hstart; h < hend; h += dilation_h) {
      for (int w = wstart; w < wend; w += dilation_w) {
@ -273,39 +251,32 @@ __global__ void max_pool_forward_nhwc(

 static constexpr int BLOCK_THREADS = 256;

-template <typename scalar_t, typename accscalar_t, typename index_t>
+template <typename scalar_t, typename accscalar_t>
 #if defined (USE_ROCM)
 C10_LAUNCH_BOUNDS_2(BLOCK_THREADS, 4)
 #else
 C10_LAUNCH_BOUNDS_2(BLOCK_THREADS, 8)
 #endif
-__global__ void max_pool_backward_nchw(
-    const scalar_t* top_diff,
-    const int64_t* top_mask,
-    const index_t num,
-    const index_t channels,
-    const index_t height,
-    const index_t width,
-    const index_t pooled_height,
-    const index_t pooled_width,
-    const int kernel_h, const int kernel_w,
-    const int stride_h, const int stride_w,
-    const int pad_h, const int pad_w,
+__global__ void max_pool_backward_nchw(const scalar_t* top_diff,
+    const int64_t* top_mask, const int num, const int64_t channels,
+    const int64_t height, const int64_t width, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w,
    scalar_t* bottom_diff) {
-  CUDA_KERNEL_LOOP_TYPE(index, height*width, index_t) {
-    index_t h = index / width;
-    index_t w = index - h * width;
-    index_t phstart = p_start(h, pad_h, kernel_h, dilation_h, stride_h);
-    index_t phend = p_end(h, pad_h, pooled_height, stride_h);
-    index_t pwstart = p_start(w, pad_w, kernel_w, dilation_w, stride_w);
-    index_t pwend = p_end(w, pad_w, pooled_width, stride_w);
-    for (index_t n = blockIdx.y; n < num; n += gridDim.y) {
-      for (index_t c = blockIdx.z; c < channels; c += gridDim.z) {
+  CUDA_KERNEL_LOOP(index, height*width) {
+    int h = index / width;
+    int w = index - h * width;
+    int phstart = p_start(h, pad_h, kernel_h, dilation_h, stride_h);
+    int phend = p_end(h, pad_h, pooled_height, stride_h);
+    int pwstart = p_start(w, pad_w, kernel_w, dilation_w, stride_w);
+    int pwend = p_end(w, pad_w, pooled_width, stride_w);
+    for (int n = blockIdx.y; n < num; n += gridDim.y) {
+      for (int c = blockIdx.z; c < channels; c+= gridDim.z) {
        accscalar_t gradient = accscalar_t(0);
-        index_t offset = (n * channels + c) * pooled_height * pooled_width;
-        for (index_t ph = phstart; ph < phend; ++ph) {
-          for (index_t pw = pwstart; pw < pwend; ++pw) {
+        int offset = (n * channels + c) * pooled_height * pooled_width;
+        for (int ph = phstart; ph < phend; ++ph) {
+          for (int pw = pwstart; pw < pwend; ++pw) {
            if (top_mask[ph * pooled_width + pw + offset] == h * width + w) {
              gradient += static_cast<accscalar_t>(top_diff[ph * pooled_width + pw + offset]);
            }
@ -498,6 +469,8 @@ const Tensor& indices) {
  const int64_t in_stride_h = input.stride(-2);
  const int64_t in_stride_w = input.stride(-1);

+  const int count = safe_downcast<int, int64_t>(output.numel());
+
  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
    "max_pool2d_with_indices_out_cuda_frame",
    [&] {
@ -580,42 +553,14 @@ const Tensor& indices) {
          break;
        }
        case MemoryFormat::Contiguous: {
-          const int threads = std::min(
-              at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock,
-              BLOCK_THREADS);
-          const int64_t nthreads = output.numel();
-          bool use_int32 = can_use_int32_nchw(
-              nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth);
-          const int maxGridX = at::cuda::getCurrentDeviceProperties()->maxGridSize[0];
-          const int blocks = static_cast<int>(std::min<int64_t>(
-              ceil_div(nthreads, static_cast<int64_t>(threads)),
-              static_cast<int64_t>(maxGridX)));
-          auto stream = at::cuda::getCurrentCUDAStream();
-          if (use_int32) {
-            max_pool_forward_nchw<scalar_t, int32_t>
-                <<<blocks, threads, 0, stream>>>(
-                    static_cast<int32_t>(nthreads),
-                    input_data,
-                    static_cast<int32_t>(nInputPlane),
-                    static_cast<int32_t>(inputHeight),
-                    static_cast<int32_t>(inputWidth),
-                    static_cast<int32_t>(outputHeight),
-                    static_cast<int32_t>(outputWidth),
-                    kH, kW, dH, dW, padH, padW, dilationH, dilationW,
-                    output_data, indices_data);
-          } else {
-            max_pool_forward_nchw<scalar_t, int64_t>
-                <<<blocks, threads, 0, stream>>>(
-                    nthreads,
-                    input_data,
-                    nInputPlane,
-                    inputHeight,
-                    inputWidth,
-                    outputHeight,
-                    outputWidth,
-                    kH, kW, dH, dW, padH, padW, dilationH, dilationW,
-                    output_data, indices_data);
-          }
+          const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock,
+                                            BLOCK_THREADS);
+          max_pool_forward_nchw<scalar_t>
+              <<<ceil_div(count, num_threads), num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+              count, input_data,
+                  nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth,
+                  kH, kW, dH, dW, padH, padW, dilationH, dilationW,
+                  output_data, indices_data);
          C10_CUDA_KERNEL_LAUNCH_CHECK();
          break;
        }
@ -688,6 +633,8 @@ const Tensor& gradInput) {

  gradInput.zero_();

+  int64_t count = input.numel();
+
  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
    "max_pool2d_with_indices_out_cuda_frame",
    [&] {
@ -745,45 +692,25 @@ const Tensor& gradInput) {
          break;
        }
        case MemoryFormat::Contiguous: {
-          const int threads = std::min(
-              at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock,
-              BLOCK_THREADS);
-          const int imgcount = inputWidth * inputHeight;
-          const int maxGridX = at::cuda::getCurrentDeviceProperties()->maxGridSize[0];
-          const int maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
-          const int maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2];
-          const int blocks_x = std::min(ceil_div(imgcount, threads), maxGridX);
-          dim3 grid(blocks_x, static_cast<unsigned>(std::min<int64_t>(nbatch, maxGridY)), static_cast<unsigned>(std::min<int64_t>(nInputPlane, maxGridZ)));
-          bool use_int32 = can_use_int32_nchw(
-              nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth);
-          auto stream = at::cuda::getCurrentCUDAStream();
-          if (use_int32) {
-            max_pool_backward_nchw<scalar_t, accscalar_t, int32_t>
-                <<<grid, threads, 0, stream>>>(
-                    gradOutput_data,
-                    indices_data,
-                    static_cast<int32_t>(nbatch),
-                    static_cast<int32_t>(nInputPlane),
-                    static_cast<int32_t>(inputHeight),
-                    static_cast<int32_t>(inputWidth),
-                    static_cast<int32_t>(outputHeight),
-                    static_cast<int32_t>(outputWidth),
-                    kH, kW, dH, dW, padH, padW, dilationH, dilationW,
-                    gradInput_data);
-          } else {
-            max_pool_backward_nchw<scalar_t, accscalar_t, int64_t>
-                <<<grid, threads, 0, stream>>>(
-                    gradOutput_data,
-                    indices_data,
-                    nbatch,
-                    nInputPlane,
-                    inputHeight,
-                    inputWidth,
-                    outputHeight,
-                    outputWidth,
-                    kH, kW, dH, dW, padH, padW, dilationH, dilationW,
-                    gradInput_data);
-          }
+          int imgcount = inputWidth * inputHeight;
+          dim3 grid;
+          const int blocks = (imgcount + BLOCK_THREADS - 1) / BLOCK_THREADS;
+          grid.x = blocks;
+          grid.y = nbatch;
+          uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
+          if (maxGridY < grid.y) grid.y = maxGridY;
+          grid.z = nInputPlane;
+          uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2];
+          if (maxGridZ < grid.z) grid.z = maxGridZ;
+
+          max_pool_backward_nchw<scalar_t, accscalar_t>
+          <<<grid, BLOCK_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
+                  gradOutput_data,
+                  indices_data,
+                  nbatch,
+                  nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth,
+                  kH, kW, dH, dW, padH, padW, dilationH, dilationW,
+                  gradInput_data);
          C10_CUDA_KERNEL_LAUNCH_CHECK();
          break;
        }
--- a/aten/src/ATen/native/cuda/EmbeddingBag.cu
+++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu
@ -78,18 +78,9 @@ __global__ void EmbeddingBag_updateOutputKernel_max(
      scalar_t weightFeatMax = 0;
      int64_t bag_size_ = 0;
      int64_t maxWord = -1;
-
-      // Separate validation loop reduces register pressure in the main loop below.
-      // No early exit (break) on invalid input as benchmarking shows it degrades performance.
-      bool has_invalid_index = false;
-      for (int64_t emb = begin; emb < end; emb++) {
-        index_t input_idx = input[emb];
-        has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows);
-      }
-      CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)");
-
      for (int64_t emb = begin; emb < end; emb++) {
        bool pad = (input[emb] == padding_idx);
+        CUDA_KERNEL_ASSERT(input[emb] < numRows);
        const int64_t weightRow = input[emb] * weight_stride0;
        scalar_t weightValue = weightFeat[weightRow];
        if (bag_size_ == 0 || weightValue > weightFeatMax) {
@ -138,19 +129,10 @@ __global__ void EmbeddingBag_updateOutputKernel_sum_mean(
      CUDA_KERNEL_ASSERT(end >= begin);
      accscalar_t weightFeatSum = 0;
      int64_t bag_size_ = 0;
-
-      // Separate validation loop reduces register pressure in the main loop below.
-      // No early exit (break) on invalid input as benchmarking shows it degrades performance.
-      bool has_invalid_index = false;
-      for (int64_t emb = begin; emb < end; emb++) {
-        index_t input_idx = input[emb];
-        has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows);
-      }
-      CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)");
-
      for (int64_t emb = begin; emb < end; emb++) {
        index_t input_idx = input[emb];
        bool pad = (input_idx == padding_idx);
+        CUDA_KERNEL_ASSERT(0 <= input_idx && input_idx < numRows);
        const int64_t weightRow = input_idx * weight_stride0;
        scalar_t weightValue = weightFeat[weightRow];
        weightValue = pad ? static_cast<scalar_t>(0) : weightValue;
--- a/aten/src/ATen/native/cuda/GroupedBlas.cpp
+++ b/aten/src/ATen/native/cuda/GroupedBlas.cpp
@ -78,9 +78,9 @@ _mx8_mx8_bf16_grouped_mm_fbgemm(
        const Tensor& mat_a,
        const Tensor& mat_b,
        const Tensor& scale_a,
-        const SwizzleType swizzle_a,
+        const SwizzleType& swizzle_a,
        const Tensor& scale_b,
-        const SwizzleType swizzle_b,
+        const SwizzleType& swizzle_b,
        const std::optional<at::Tensor>& offs,
        Tensor& out) {
    const bool a_is_2d = mat_a.dim() == 2;
@ -669,12 +669,9 @@ std::optional<c10::ScalarType> out_dtype) {
  // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used.
  // the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm
  bool use_fast_path = false;
-  // On non CK system(w/ ROCm), make sure use_fast_path is false
-#if defined(USE_ROCM_CK_GEMM)
  if (at::detail::getCUDAHooks().isGPUArch({"gfx942", "gfx950"})) {
    use_fast_path = true;
  }
-#endif //USE_ROCM_CK_GEMM
 #endif
  const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
  Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
@ -683,11 +680,7 @@ std::optional<c10::ScalarType> out_dtype) {
 #ifndef USE_ROCM
    at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out);
 #else
-#if defined(USE_ROCM_CK_GEMM)
    at::hip::detail::group_gemm_ck(mat_a, mat_b, offs, bias, out);
-#else
-    TORCH_WARN("ROCm: Group Gemm through CK not selected.");
-#endif //USE_ROCM_CK_GEMM
 #endif
  } else {
    _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
--- a/aten/src/ATen/native/cuda/KernelUtils.cuh
+++ b/aten/src/ATen/native/cuda/KernelUtils.cuh
@ -5,11 +5,69 @@
 #include <cuda_bf16.h>
 #endif

+// ROCm 6.3 is planned to have these functions, but until then here they are.
 #if defined(USE_ROCM)
 #include <device_functions.h>
 #include <hip/hip_fp16.h>
 #include <hip/hip_bf16.h>
-#define ATOMICADD unsafeAtomicAdd
+
+__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) {
+#if (defined(__gfx942__)) && \
+  __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16)
+  typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
+  static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw));
+  union {
+    __hip_bfloat162_raw bf162_raw;
+    vec_short2 vs2;
+  } u{static_cast<__hip_bfloat162_raw>(value)};
+  u.vs2 = __builtin_amdgcn_flat_atomic_fadd_v2bf16((vec_short2*)address, u.vs2);
+  return static_cast<__hip_bfloat162>(u.bf162_raw);
+#else
+  static_assert(sizeof(unsigned int) == sizeof(__hip_bfloat162_raw));
+  union u_hold {
+    __hip_bfloat162_raw h2r;
+    unsigned int u32;
+  };
+  u_hold old_val, new_val;
+  old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  do {
+    new_val.h2r = __hadd2(old_val.h2r, value);
+  } while (!__hip_atomic_compare_exchange_strong(
+        (unsigned int*)address, &old_val.u32, new_val.u32,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT));
+  return old_val.h2r;
+#endif
+}
+
+__device__ inline __half2 preview_unsafeAtomicAdd(__half2* address, __half2 value) {
+#if (defined(__gfx942__)) && \
+  __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2f16)
+  // The api expects an ext_vector_type of half
+  typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162;
+  static_assert(sizeof(vec_fp162) == sizeof(__half2_raw));
+  union {
+    __half2_raw h2r;
+    vec_fp162 fp16;
+  } u {static_cast<__half2_raw>(value)};
+  u.fp16 = __builtin_amdgcn_flat_atomic_fadd_v2f16((vec_fp162*)address, u.fp16);
+  return static_cast<__half2>(u.h2r);
+#else
+  static_assert(sizeof(__half2_raw) == sizeof(unsigned int));
+  union u_hold {
+    __half2_raw h2r;
+    unsigned int u32;
+  };
+  u_hold old_val, new_val;
+  old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  do {
+    new_val.h2r = __hadd2(old_val.h2r, value);
+  } while (!__hip_atomic_compare_exchange_strong(
+        (unsigned int*)address, &old_val.u32, new_val.u32,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT));
+  return old_val.h2r;
+#endif
+}
+#define ATOMICADD preview_unsafeAtomicAdd
 #define NATIVE_ZERO_BF16 __float2bfloat16(0.0f)
 #else
 #define ATOMICADD atomicAdd
--- a/aten/src/ATen/native/cuda/LogAddExpKernel.cu
+++ b/aten/src/ATen/native/cuda/LogAddExpKernel.cu
@ -2,250 +2,18 @@
 #include <ATen/Dispatch.h>
 #include <ATen/native/DispatchStub.h>
 #include <ATen/native/cuda/Loops.cuh>
-#include <ATen/native/cuda/JitLoops.cuh>
-#include <ATen/native/cuda/jit_utils.h>
-#include <ATen/native/cuda/ScanUtils.cuh>
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/BinaryOps.h>
 #include <ATen/OpMathType.h>
 #include <c10/util/MathConstants.h>
-#include <c10/util/complex.h>
-
-#include <cmath>
-#include <limits>

 // NOTE: CUDA on Windows requires that the enclosing function
 // of a __device__ lambda not have internal linkage.

 namespace at::native {

-// custom min and max to be used in logaddexp for  complex arguments
-template <typename scalar_t, bool min>
-__host__ __device__ c10::complex<scalar_t> _logaddexp_minmax(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
-  scalar_t xr = std::real(x);
-  scalar_t yr = std::real(y);
-  if (::isnan(yr) || (::isnan(std::imag(y)))) {
-    return y;
-  } else if (::isnan(xr) || (::isnan(std::imag(x)))) {
-    return x;
-  } else if (min) { // min
-    return (xr < yr) ? x : y;
-  } else { // max
-    return (xr >= yr) ? x : y;
-  }
-}
-
-template <typename scalar_t>
-__host__ __device__ scalar_t _log_add_exp_helper(const scalar_t& x, const scalar_t& y) {
-  // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
-  // Using the original expression: `at::_isnan(y) ? y : std::min(x, y)` causes an error in ROCM
-  const auto isnan_x = at::_isnan(x);
-  const auto isnan_y = at::_isnan(y);
-  scalar_t min = isnan_y ? y : (isnan_x ? x : std::min(x, y));
-  scalar_t max = isnan_y ? y : (isnan_x ? x : std::max(x, y));
-  if (min != max || ::isfinite(min)) {
-    // nan will be propagated here
-    return ::log1p(std::exp(min - max)) + max;
-  } else {
-    // special case to correctly handle infinite cases
-    return x;
-  }
-}
-
-template <typename scalar_t>
-__host__ __device__ c10::complex<scalar_t> _fast_build_exp(const c10::complex<scalar_t>& x) {
-  // complex exponential function, but implemented manually to get fast compilation time
-  // this function only handles the case where the x is finite (not inf nor nan)
-  const auto xreal = std::real(x);
-  const auto ximag = std::imag(x);
-  const auto exp_x_abs = std::exp(xreal);
-  auto exp_x_real = exp_x_abs * std::cos(ximag);
-  auto exp_x_imag = exp_x_abs * std::sin(ximag);
-  return {exp_x_real, exp_x_imag};
-}
-
-template <typename scalar_t>
-__host__ __device__ c10::complex<scalar_t> _fast_build_exp_inf(const c10::complex<scalar_t>& x) {
-  // complex exponential function, but implemented manually to get fast compilation time
-  // this function only handles the case where the real part of x is infinite
-  const auto ximag = std::imag(x);
-  constexpr auto exp_x_abs = std::numeric_limits<scalar_t>::infinity();
-  if (!::isfinite(ximag)) {  // add this to make consitent with std::exp(x+yi)
-    return {exp_x_abs, std::numeric_limits<scalar_t>::quiet_NaN()};
-  }
-  const auto sin = std::sin(ximag);
-  const auto cos = std::cos(ximag);
-  // special case if the angle is exactly the multiple of pi/2
-  auto exp_x_real = (cos == 0) ? (scalar_t)0.0 : exp_x_abs * cos;
-  auto exp_x_imag = (sin == 0) ? (scalar_t)0.0 : exp_x_abs * sin;
-  return {exp_x_real, exp_x_imag};
-}
-
-template <typename scalar_t>
-__host__ __device__ c10::complex<scalar_t> _log_add_exp_helper(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
-  c10::complex<scalar_t> min = _logaddexp_minmax<scalar_t, /*min=*/true>(x, y);
-  c10::complex<scalar_t> max = _logaddexp_minmax<scalar_t, /*min=*/false>(x, y);
-  scalar_t min_real = std::real(min);
-  scalar_t max_real = std::real(max);
-
-  if (::isnan(min_real) || ::isnan(std::imag(min))) {
-    // handling the "infectious" NaNs
-    return {std::numeric_limits<scalar_t>::quiet_NaN(), std::numeric_limits<scalar_t>::quiet_NaN()};
-  }
-  else if ((!::isfinite(min_real)) && (min_real == max_real)) {
-    if (min_real < 0) {
-      // handle the -inf case, the imaginary part here does not really matter as the exp(value)
-      // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined.
-      // It does not matter if we're taking the exp of this value
-      return min;
-    } else {
-      // handle the +inf case, we don't need the special precision for log1p for small values
-      // and to avoid producing nan in case of real(max) == real(min) == +inf
-      const auto exp_min = _fast_build_exp_inf(min);
-      const auto exp_max = _fast_build_exp_inf(max);
-      return ::log1p(exp_min + exp_max - 1);  // log1p(x - 1) builds faster than log
-    }
-  } else {
-    const auto minmax = min - max;
-    c10::complex<scalar_t> exp_minmax;
-    if (!::isfinite(minmax.real())) {
-        exp_minmax = minmax.real() < 0 ? c10::complex<scalar_t>{0.0, 0.0} : _fast_build_exp_inf(minmax);
-    } else {
-        exp_minmax = _fast_build_exp(minmax);
-    }
-    return ::log1p(exp_minmax) + max;
-  }
-}
-
-// Complex logaddexp jiterator string
-const auto logaddexp_complex_string = jiterator_stringify(
-    template<typename T>
-    std::complex<T> log1p(const std::complex<T>& z)
-    {
-      using complex_t = std::complex<T>;
-      T x = z.real();
-      T y = z.imag();
-      T zabs = abs(z);
-      T theta = atan2(y, x + T(1));
-      if (zabs < 0.5) {
-          T r = x * (T(2) + x) + y * y;
-          if (r == 0) { // handle underflow
-              return complex_t(x, theta);
-          }
-          return complex_t(T(0.5) * std::log1p(r), theta);
-      } else {
-          T z0 = std::hypot(x + 1, y);
-          return complex_t(log(z0), theta);
-      }
-    }
-
-    // separated _logaddexp_minmax into 2 different functions for jiterator_string
-    template <typename T>
-    std::complex<T> logaddexp_min(const std::complex<T>& x, const std::complex<T>& y) {
-        T xr = x.real();
-        T yr = y.real();
-        if (isnan(yr) || isnan(y.imag())) {
-            return y;
-        } else if (isnan(xr) || isnan(x.imag())) {
-            return x;
-        } else {
-            return (xr < yr) ? x : y;
-        }
-    }
-
-    template <typename T>
-    std::complex<T> logaddexp_max(const std::complex<T>& x, const std::complex<T>& y) {
-        T xr = x.real();
-        T yr = y.real();
-        if (isnan(yr) || isnan(y.imag())) {
-            return y;
-        } else if (isnan(xr) || isnan(x.imag())) {
-            return x;
-        } else {
-            return (xr >= yr) ? x : y;
-        }
-    }
-
-    template <typename T>
-    std::complex<T> fast_build_exp(const std::complex<T>& x) {
-        const auto xreal = x.real();
-        const auto ximag = x.imag();
-        const auto exp_x_abs = exp(xreal);
-        auto exp_x_real = exp_x_abs * cos(ximag);
-        auto exp_x_imag = exp_x_abs * sin(ximag);
-        return std::complex<T>(exp_x_real, exp_x_imag);
-    }
-
-    template <typename T>
-    std::complex<T> fast_build_exp_inf(const std::complex<T>& x) {
-        using complex_t = std::complex<T>;
-        const auto ximag = x.imag();
-        const T exp_x_abs = INFINITY;
-        if (!isfinite(ximag)) {
-            return complex_t(exp_x_abs, NAN);
-        }
-        const auto sin_val = sin(ximag);
-        const auto cos_val = cos(ximag);
-        auto exp_x_real = (cos_val == T(0)) ? T(0) : exp_x_abs * cos_val;
-        auto exp_x_imag = (sin_val == T(0)) ? T(0) : exp_x_abs * sin_val;
-        return complex_t(exp_x_real, exp_x_imag);
-    }
-
-    template <typename complex_t>
-    complex_t logaddexp_complex(complex_t x, complex_t y) {
-        using T = typename complex_t::value_type;
-        complex_t min_val = logaddexp_min(x, y);
-        complex_t max_val = logaddexp_max(x, y);
-        T min_real = min_val.real();
-        T max_real = max_val.real();
-
-        if (isnan(min_real) || isnan(min_val.imag())) {
-            return complex_t(NAN, NAN);
-        }
-        else if ((!isfinite(min_real)) && (min_real == max_real)) {
-            if (min_real < T(0)) {
-                return min_val;
-            } else {
-                const auto exp_min = fast_build_exp_inf<T>(min_val);
-                const auto exp_max = fast_build_exp_inf<T>(max_val);
-                return log1p(exp_min + exp_max - complex_t(1, 0));
-            }
-        } else {
-            const auto minmax = min_val - max_val;
-            complex_t exp_minmax;
-            if (!isfinite(minmax.real())) {
-                exp_minmax = (minmax.real() < T(0)) ? complex_t(0, 0) : fast_build_exp_inf<T>(minmax);
-            } else {
-                exp_minmax = fast_build_exp<T>(minmax);
-            }
-            return log1p(exp_minmax) + max_val;
-        }
-    }
-);
-
-constexpr char logaddexp_complex_name[] = "logaddexp_complex";
 void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
-  if (at::isComplexType(iter.dtype())) {
-#if AT_USE_JITERATOR()
-    AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
-      jitted_gpu_kernel<
-          /*name=*/logaddexp_complex_name,
-          /*return_dtype=*/scalar_t,
-          /*common_dtype=*/scalar_t,
-          /*arity=*/2>(iter, logaddexp_complex_string);
-    });
-#else
-    AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
-      using opmath_t = at::opmath_type<scalar_t>;
-      gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a_, scalar_t b_) -> scalar_t {
-        const auto a = static_cast<opmath_t>(a_);
-        const auto b = static_cast<opmath_t>(b_);
-        return static_cast<scalar_t>(_log_add_exp_helper(a, b));
-      });
-    });
-#endif
-  } else {
-    AT_DISPATCH_FLOATING_TYPES_AND2(
+  AT_DISPATCH_FLOATING_TYPES_AND2(
      ScalarType::BFloat16, ScalarType::Half,
      iter.dtype(), "logaddexp_cuda",
      [&]() {
@ -261,7 +29,6 @@ void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
          }
        });
      });
-  }
 }

 void logaddexp2_kernel_cuda(TensorIteratorBase& iter) {
--- a/aten/src/ATen/native/cuda/ReduceAMinMaxKernel.cu
+++ b/aten/src/ATen/native/cuda/ReduceAMinMaxKernel.cu
@ -1,5 +1,6 @@
 #define TORCH_ASSERT_NO_OPERATORS
 #include <ATen/Dispatch.h>
+#include <ATen/Dispatch_v2.h>
 #include <ATen/NumericUtils.h>
 #include <ATen/native/DispatchStub.h>
 #include <ATen/native/ReduceAllOps.h>
@ -28,22 +29,22 @@ void _min_max_values_kernel_cuda_impl(TensorIterator& iter) {
 }

 void aminmax_allreduce_launch_kernel(TensorIterator& iter) {
-  AT_DISPATCH_ALL_TYPES_AND3(
-      kBFloat16, kHalf, kBool, iter.input_dtype(), "aminmax_all_cuda", [&] {
+  AT_DISPATCH_V2(
+      iter.input_dtype(), "aminmax_all_cuda", AT_WRAP([&] {
        _min_max_values_kernel_cuda_impl<scalar_t>(iter);
-      });
+      }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
 }

 void aminmax_launch_kernel(TensorIterator& iter) {
-  AT_DISPATCH_ALL_TYPES_AND3(
-      kBFloat16, kHalf, kBool, iter.input_dtype(), "aminmax_cuda", [&]() {
+  AT_DISPATCH_V2(
+      iter.input_dtype(), "aminmax_cuda", AT_WRAP([&]() {
        gpu_reduce_kernel<scalar_t, scalar_t>(
            iter,
            MinMaxOps<scalar_t, scalar_t, int32_t>{},
            thrust::pair<scalar_t, scalar_t>(
                at::numeric_limits<scalar_t>::upper_bound(),
                at::numeric_limits<scalar_t>::lower_bound()));
-      });
+      }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
 }

 } // namespace at::native
--- a/aten/src/ATen/native/cuda/ReduceMaxValuesKernel.cu
+++ b/aten/src/ATen/native/cuda/ReduceMaxValuesKernel.cu
@ -1,5 +1,6 @@
 #define TORCH_ASSERT_NO_OPERATORS
 #include <ATen/Dispatch.h>
+#include <ATen/Dispatch_v2.h>
 #include <ATen/NumericUtils.h>
 #include <ATen/native/DispatchStub.h>
 #include <ATen/native/ReduceAllOps.h>
@ -33,27 +34,27 @@ void max_values_kernel_cuda_impl(TensorIterator& iter) {
 }

 void max_values_kernel_cuda(TensorIterator& iter) {
-  AT_DISPATCH_ALL_TYPES_AND3(
-      kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cuda", [&]() {
+  AT_DISPATCH_V2(
+      iter.dtype(), "max_values_cuda", AT_WRAP([&]() {
        max_values_kernel_cuda_impl<scalar_t>(iter);
-      });
+      }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
 }

 void max_launch_kernel(TensorIterator& iter) {
-  AT_DISPATCH_ALL_TYPES_AND3(
-      kBFloat16, kHalf, kBool, iter.input_dtype(), "max_cuda", [&]() {
+  AT_DISPATCH_V2(
+      iter.input_dtype(), "max_cuda", AT_WRAP([&]() {
        gpu_reduce_kernel<scalar_t, scalar_t>(
            iter,
            MaxOps<scalar_t>{},
            thrust::pair<scalar_t, int64_t>(
                at::numeric_limits<scalar_t>::lower_bound(), 0));
-      });
+      }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
 }

 void max_all_launch_kernel(TensorIterator &iter) {
-  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "max_all_cuda", [&] {
+  AT_DISPATCH_V2(iter.input_dtype(), "max_all_cuda", AT_WRAP([&] {
    max_values_kernel_cuda_impl<scalar_t>(iter);
-  });
+  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
 }

 REGISTER_DISPATCH(max_values_stub, &max_values_kernel_cuda)
--- a/aten/src/ATen/native/cuda/ReduceMinValuesKernel.cu
+++ b/aten/src/ATen/native/cuda/ReduceMinValuesKernel.cu
@ -12,6 +12,7 @@
 #include <ATen/NumericUtils.h>

 #include <ATen/Dispatch.h>
+#include <ATen/Dispatch_v2.h>
 #include <ATen/NumericUtils.h>
 #include <ATen/cuda/NumericLimits.cuh>

@ -33,24 +34,24 @@ void min_values_kernel_cuda_impl(TensorIterator& iter) {
 }

 void min_values_kernel_cuda(TensorIterator& iter) {
-  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "min_values_cuda", [&]() {
+  AT_DISPATCH_V2(iter.dtype(), "min_values_cuda", AT_WRAP([&]() {
    min_values_kernel_cuda_impl<scalar_t>(iter);
-  });
+  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
 }

 void min_launch_kernel(TensorIterator &iter) {
-  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "min_cuda", [&]() {
+  AT_DISPATCH_V2(iter.input_dtype(), "min_cuda", AT_WRAP([&]() {
    gpu_reduce_kernel<scalar_t, scalar_t>(
      iter,
      MinOps<scalar_t>{},
      thrust::pair<scalar_t, int64_t>(at::numeric_limits<scalar_t>::upper_bound(), 0));
-  });
+  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
 }

 void min_all_launch_kernel(TensorIterator &iter) {
-  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "min_all_cuda", [&] {
+  AT_DISPATCH_V2(iter.input_dtype(), "min_all_cuda", AT_WRAP([&] {
    min_values_kernel_cuda_impl<scalar_t>(iter);
-  });
+  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
 }

 REGISTER_DISPATCH(min_values_stub, &min_values_kernel_cuda)
--- a/aten/src/ATen/native/cuda/ScaledBlas.cpp
+++ b/aten/src/ATen/native/cuda/ScaledBlas.cpp
@ -740,12 +740,7 @@ _scaled_rowwise_rowwise(
  TORCH_CHECK_VALUE(scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat, "scale_a must have ", mat_a.size(0), " Float elements, got ", scale_a.numel())
  TORCH_CHECK_VALUE(scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat, "scale_b must have ", mat_b.size(1), " Float elements, got ", scale_b.numel())

-  // if we have a scale of shape [256, 1] (say), then stride can be [1, 0] - handle this case
-  TORCH_CHECK_VALUE(
-      scale_a.stride(1) == 1 ||
-      scale_a.size(1) == 1,
-      "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1)
-  );
+  TORCH_CHECK_VALUE(scale_a.stride(1) == 1, "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1));
  TORCH_CHECK_VALUE(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1));

  auto scaling_choice_a = ScalingType::RowWise;
@ -1101,19 +1096,6 @@ _scaled_mxfp8_mxfp8(
  return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
 }

-void
-_check_mxfp4_support() {
-#ifndef USE_ROCM
-  auto dprops = at::cuda::getCurrentDeviceProperties();
-  // Only on B200 GPUs
-  TORCH_CHECK_NOT_IMPLEMENTED(
-    // B200 = 10.0, B300 = 10.3
-    dprops->major == 10,
-    "MXFP4 scaling only supported in CUDA for B200/B300"
-  );
-#endif
-}
-

 Tensor&
 _scaled_mxfp4_mxfp4(
@ -1126,7 +1108,6 @@ _scaled_mxfp4_mxfp4(
 #if defined(_WIN32) || (!defined(USE_ROCM) && !defined(USE_FBGEMM_GENAI))
  TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM and CUDA+FBGEMM_GENAI only");
 #else
-  _check_mxfp4_support();
  // Restrictions:
  // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
  TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ",
--- a/aten/src/ATen/native/cuda/ScanUtils.cuh
+++ b/aten/src/ATen/native/cuda/ScanUtils.cuh
@ -267,15 +267,15 @@ void scan_dim_with_indices(const TensorBase& self, const TensorBase& values, con
 * outer dimensions, which contains several "inner rows").
 * Each thread processes a single inner row at a time.
 */
-template<typename scalar_t, typename index_t, class BinaryOp>
+template<typename scalar_t, class BinaryOp>
 __global__ void tensor_kernel_scan_outer_dim(scalar_t *tgt_, const scalar_t *src_,
                                              const uint32_t num_orows, const uint32_t num_irows, const uint32_t row_size,
                                              const scalar_t init, BinaryOp binary_op)
 {
  for (uint32_t orow = blockIdx.x; orow < num_orows; orow += gridDim.x) {
    for (uint32_t irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) {
-      const scalar_t *src = src_ + static_cast<index_t>(orow) * row_size * num_irows + irow;
-      scalar_t *tgt = tgt_ + (index_t) orow * row_size * num_irows + irow;
+      const scalar_t *src = src_ + orow * row_size * num_irows + irow;
+      scalar_t *tgt = tgt_ + orow * row_size * num_irows + irow;
      scalar_t acc = init;

      for (uint32_t col = 0; col < row_size; ++col) {
@ -409,15 +409,10 @@ __host__ void scan_outer_dim(const TensorBase& self, const TensorBase& result,
  check_fits_in_unsigned(num_irows, "num_irows");
  check_fits_in_unsigned(num_orows, "num_orows");
  check_fits_in_unsigned(row_size, "row_size");
-  if (static_cast<size_t>(num_irows) * num_orows * row_size <= UINT_MAX) {
-  tensor_kernel_scan_outer_dim<scalar_t, uint32_t><<<grid, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+
+  tensor_kernel_scan_outer_dim<scalar_t><<<grid, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
    result.mutable_data_ptr<scalar_t>(), self.const_data_ptr<scalar_t>(),
    num_orows, num_irows, row_size, init, binary_op);
-  } else  {
-  tensor_kernel_scan_outer_dim<scalar_t, size_t><<<grid, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
-    result.mutable_data_ptr<scalar_t>(), self.const_data_ptr<scalar_t>(),
-    num_orows, num_irows, row_size, init, binary_op);
-  }
  C10_CUDA_KERNEL_LAUNCH_CHECK();
 }

--- a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp
@ -337,6 +337,10 @@ Tensor _convolution_out(
  TORCH_CHECK(
      3 == ndim || 4 == ndim || 5 == ndim,
      "convolution only supports 3D, 4D, 5D tensor");
+  // get computation format for Conv/TransposedConv
+  bool is_channels_last_suggested =
+      use_channels_last_for_conv(input_r, weight_r);
+
  Tensor input = input_r, weight = weight_r;
  // PyTorch does not support ChannelsLast1D case,
  // thus we need the transformation here
@ -344,8 +348,13 @@ Tensor _convolution_out(
    input = view4d(input_r);
    weight = view4d(weight_r);
  }
-  // get computation format for Conv/TransposedConv
-  bool is_channels_last_suggested = use_channels_last_for_conv(input, weight);
+  // ensure the input/weight/bias/output are contiguous in the desired format
+  at::MemoryFormat mfmt = is_channels_last_suggested
+      ? get_cl_tag_by_ndim(input.ndimension())
+      : at::MemoryFormat::Contiguous;
+  auto bias = bias_r.defined() ? bias_r.contiguous() : bias_r;
+  input = input.contiguous(mfmt);
+  weight = weight.contiguous(mfmt);

  auto k = weight.ndimension();
  if (k == input.ndimension() + 1) {
@ -379,14 +388,6 @@ Tensor _convolution_out(
        expand_param_if_needed(output_padding_, "output_padding", dim);
    params.groups = groups_;
  }
-
-  // ensure the input/weight/bias/output are congituous in desired format
-  at::MemoryFormat mfmt = is_channels_last_suggested
-      ? get_cl_tag_by_ndim(input.ndimension())
-      : at::MemoryFormat::Contiguous;
-  auto bias = bias_r.defined() ? bias_r.contiguous() : bias_r;
-  input = input.contiguous(mfmt);
-  weight = weight.contiguous(mfmt);
  check_shape_forward(input, weight, bias, params, true);

  Tensor output;
@ -513,9 +514,18 @@ Tensor convolution_overrideable(
      at::borrow_from_optional_tensor(bias_r_opt);
  const Tensor& bias_r = *bias_r_maybe_owned;

+  auto k = weight_r.ndimension();
+  at::MemoryFormat backend_memory_format = at::MemoryFormat::Contiguous;
+  if (xpu_conv_use_channels_last(input_r, weight_r)) {
+    backend_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d
+                                     : at::MemoryFormat::ChannelsLast;
+  }
+  Tensor input_c = input_r.contiguous(backend_memory_format);
+  Tensor weight_c = weight_r.contiguous(backend_memory_format);
+
  return _convolution(
-      input_r,
-      weight_r,
+      input_c,
+      weight_c,
      bias_r,
      stride_,
      padding_,
--- a/aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp
@ -1,342 +0,0 @@
-#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
-#include <ATen/BlasBackend.h>
-#include <ATen/WrapDimUtilsMulti.h>
-#include <ATen/ceil_div.h>
-#include <ATen/native/Resize.h>
-#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
-#include <ATen/native/xpu/Blas.h>
-#include <torch/library.h>
-
-#ifndef AT_PER_OPERATOR_HEADERS
-#include <ATen/Functions.h>
-#include <ATen/NativeFunctions.h>
-#else
-#include <ATen/ops/_addmm_activation_native.h>
-#include <ATen/ops/_efficientzerotensor.h>
-#include <ATen/ops/_scaled_mm_native.h>
-#include <ATen/ops/_unsafe_view_native.h>
-#include <ATen/ops/abs.h>
-#include <ATen/ops/addmm_native.h>
-#include <ATen/ops/addmv_native.h>
-#include <ATen/ops/baddbmm_native.h>
-#include <ATen/ops/bmm_native.h>
-#include <ATen/ops/copy_native.h>
-#include <ATen/ops/dot_native.h>
-#include <ATen/ops/empty.h>
-#include <ATen/ops/empty_strided.h>
-#include <ATen/ops/gelu.h>
-#include <ATen/ops/max.h>
-#include <ATen/ops/mm_native.h>
-#include <ATen/ops/mul.h>
-#include <ATen/ops/ones.h>
-#include <ATen/ops/relu.h>
-#include <ATen/ops/scalar_tensor_native.h>
-#include <ATen/ops/vdot_native.h>
-#endif
-
-namespace at::native {
-
-using at::blas::ScalingType;
-using at::blas::SwizzleType;
-
-namespace {
-/*
- * Scaling Type Determination:
- * ---------------------------
- * Conditions and corresponding Scaling Types:
- *
- * - If scale tensor is `Float8_e8m0fnu` or `Float8_e4m3fn`:
- *   - Returns BlockWise (with additional size checks).
- *
- * - Else if scale.numel() == 1:
- *   - Returns TensorWise.
- *
- * - Else if scale.dim() == 2 && scale.size(0) == outer_dim && scale.size(1) ==
- * 1:
- *   - Returns RowWise.
- *
- * - Otherwise:
- *   - Returns Error.
- */
-
-bool is_tensorwise_scaling(const at::Tensor& t, const at::Tensor& scale) {
-  return at::isFloat8Type(t.scalar_type()) &&
-      scale.scalar_type() == at::kFloat && scale.numel() == 1;
-}
-
-bool is_rowwise_scaling(const at::Tensor& t, const at::Tensor& scale) {
-  return (
-      at::isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat &&
-      scale.dim() == 2 && scale.size(0) == t.size(0) && scale.size(1) == 1 &&
-      scale.is_contiguous());
-}
-
-bool is_desired_scaling(
-    const at::Tensor& t,
-    const at::Tensor& scale,
-    ScalingType desired_scaling) {
-  auto result = desired_scaling == ScalingType::TensorWise
-      ? is_tensorwise_scaling(t, scale)
-      : is_rowwise_scaling(t, scale);
-  return result;
-}
-
-std::pair<ScalingType, ScalingType> get_joint_scaling(
-    std::initializer_list<std::pair<ScalingType, ScalingType>> options,
-    const at::Tensor& a,
-    const at::Tensor& b,
-    const at::Tensor& scale_a,
-    const at::Tensor& scale_b) {
-  for (auto [lhs, rhs] : options) {
-    if (is_desired_scaling(a, scale_a, lhs) &&
-        is_desired_scaling(b.t(), scale_b.t(), rhs)) {
-      return {lhs, rhs};
-    }
-  }
-  TORCH_CHECK(
-      false,
-      "Invalid scaling configuration.\n"
-      "- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n"
-      "- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (",
-      a.size(0),
-      ", 1) and scale_b should be (1, ",
-      b.size(1),
-      "), and both should be contiguous.\n"
-      "Got a.dtype()=",
-      a.scalar_type(),
-      ", scale_a.dtype()=",
-      scale_a.scalar_type(),
-      ", scale_a.size()=",
-      scale_a.sizes(),
-      ", scale_a.stride()=",
-      scale_a.strides(),
-      ", ",
-      "b.dtype()=",
-      b.scalar_type(),
-      ", scale_b.dtype()=",
-      scale_b.scalar_type(),
-      ", scale_b.size()=",
-      scale_b.sizes(),
-      " and scale_b.stride()=",
-      scale_b.strides());
-}
-
-Tensor& _scaled_gemm(
-    const Tensor& mat1,
-    const Tensor& mat2,
-    const Tensor& scale_a,
-    const Tensor& scale_b,
-    const ScalingType scaling_choice_a,
-    const ScalingType scaling_choice_b,
-    const std::optional<Tensor>& bias,
-    const bool use_fast_accum,
-    Tensor& out,
-    const std::optional<Tensor>& alpha = std::nullopt) {
-  // TODO: scale_result and alpha is not defined or used!
-  std::optional<Tensor> scaled_result = std::nullopt;
-  at::native::onednn::scaled_matmul(
-      mat1,
-      mat2,
-      out,
-      scale_a,
-      scale_b,
-      scaling_choice_a,
-      scaling_choice_b,
-      bias,
-      scaled_result,
-      use_fast_accum);
-
-  return out;
-}
-
-} // namespace
-
-// Computes matrix multiply + bias while applying scaling to input and output
-// matrices Scales are only applicable when matrices are of Float8 type and
-// assumed to be equal to 1.0 by default. If output matrix type is 16 or 32-bit
-// type, scale_result is not applied. Known limitations:
-//  - Only works if mat1 is row-major and mat2 is column-major
-//  - Only works if matrices sizes are divisible by 32
-//  - If 1-dimensional tensors are used then scale_a should be size =
-//  mat1.size(0)
-//    and scale_b should have size = to mat2.size(1)
-//  Arguments:
-//    - `mat1`: the first operand of the matrix multiply, can be type
-//    `torch.float8_e4m3fn` or `torch.float8_e5m2`
-//    - `mat2`: the second operand of the matrix multiply, can be type
-//    `torch.float8_e4m3fn` or `torch.float8_e5m2`
-//    - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16`
-//    - `out_dtype`: the output dtype, can either be a float8 or a higher
-//    precision floating point type
-//    - `scale_a`: a tensor with the inverse scale of `mat1`, whose
-//    shape/strides/dtype depend on the scaling scheme
-//    - `scale_b`: a tensor with the inverse scale of `mat2`, whose
-//    shape/strides/dtype depend on the scaling scheme
-//    - `scale_result`: a scalar tensor with the scale of the output, only
-//    utilized if the output is a float8 type
-//    - `use_fast_accum`: Not applicable for XPU. For now, it should always be
-//    false.
-//    - `out`: a reference to the output tensor
-
-Tensor& _scaled_mm_out_xpu(
-    const Tensor& mat1,
-    const Tensor& mat2,
-    const Tensor& scale_a,
-    const Tensor& scale_b,
-    const std::optional<at::Tensor>& bias,
-    const std::optional<at::Tensor>& scale_result,
-    std::optional<c10::ScalarType> out_dtype,
-    bool use_fast_accum,
-    Tensor& out) {
-  // Note: fast_accum is not supported in XPU for now.
-  TORCH_CHECK(!use_fast_accum, "fast_accum is not supported in XPU for now.");
-
-  TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix");
-  TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix");
-
-  TORCH_CHECK(
-      mat1.sizes()[1] == mat2.sizes()[0],
-      "mat1 and mat2 shapes cannot be multiplied (",
-      mat1.sizes()[0],
-      "x",
-      mat1.sizes()[1],
-      " and ",
-      mat2.sizes()[0],
-      "x",
-      mat2.sizes()[1],
-      ")");
-
-  // Check what type of scaling we are doing based on inputs. This list is
-  // sorted by decreasing priority.
-
-  // List of supported datatypes for XPU with oneDNN:
-  // https://uxlfoundation.github.io/oneDNN/dev_guide_matmul.html#data-types
-  auto [scaling_choice_a, scaling_choice_b] = get_joint_scaling(
-      {
-          std::make_pair(ScalingType::TensorWise, ScalingType::TensorWise),
-          std::make_pair(ScalingType::RowWise, ScalingType::RowWise),
-      },
-      mat1,
-      mat2,
-      scale_a,
-      scale_b);
-  TORCH_CHECK(
-      !scale_result ||
-          (scale_result->numel() == 1 && scale_result->scalar_type() == kFloat),
-      "scale_result must be a float scalar");
-  TORCH_CHECK(
-      !bias || bias->numel() == mat2.sizes()[1],
-      "Bias must be size ",
-      mat2.sizes()[1],
-      " but got ",
-      bias->numel());
-  TORCH_CHECK(
-      mat1.sizes()[1] % 16 == 0,
-      "Expected trailing dimension of mat1 to be divisible by 16 ",
-      "but got mat1 shape: (",
-      mat1.sizes()[0],
-      "x",
-      mat1.sizes()[1],
-      ").");
-  TORCH_CHECK(
-      mat2.sizes()[0] % 16 == 0 && mat2.sizes()[1] % 16 == 0,
-      "mat2 shape (",
-      mat2.sizes()[0],
-      "x",
-      mat2.sizes()[1],
-      ") must be divisible by 16");
-  // Check types
-  TORCH_CHECK(
-      !out_dtype || *out_dtype == out.scalar_type(),
-      "out_dtype must match output matrix type");
-  TORCH_CHECK(
-      at::isFloat8Type(mat1.scalar_type()),
-      "Expected mat1 to be Float8 matrix got ",
-      mat1.scalar_type());
-  TORCH_CHECK(
-      at::isFloat8Type(mat2.scalar_type()),
-      "Expected mat2 to be Float8 matrix got ",
-      mat2.scalar_type());
-  // TODO: oneDNN Currently only supports e4m3 with group scales on BMG. Not
-  // support 2D scales, only 1D. Needs to add more checks there.
-
-  if (bias) {
-    TORCH_CHECK(
-        bias->scalar_type() == kFloat ||
-            bias->scalar_type() == c10::ScalarType::BFloat16 ||
-            bias->scalar_type() == c10::ScalarType::Half,
-        "Bias must be Float32 or BFloat16 or Half, but got ",
-        bias->scalar_type());
-  }
-
-  {
-    auto bias_ = bias.value_or(Tensor());
-    auto scale_result_ = scale_result.value_or(Tensor());
-
-    // NOLINTNEXTLINE(*c-array*)
-    TensorArg targs[]{
-        {out, "out", 0},
-        {mat1, "mat1", 1},
-        {mat2, "mat2", 2},
-        {bias_, "bias", 3},
-        {scale_a, "scale_a", 4},
-        {scale_b, "scale_b", 5},
-        {scale_result_, "scale_result", 6}};
-    checkAllSameGPU(__func__, targs);
-  }
-
-  // Validation checks have passed lets resize the output to actual size
-  IntArrayRef mat1_sizes = mat1.sizes();
-  IntArrayRef mat2_sizes = mat2.sizes();
-  at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]});
-
-  // If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm
-  // kernels do not support this case).
-  if (mat1_sizes[0] == 0 || mat1_sizes[1] == 0 || mat2_sizes[1] == 0) {
-    // `out` was created with `at::empty`. In the case where we are multiplying
-    // MxK by KxN and K is the zero dim, we need to initialize here to properly
-    // return a tensor of zeros.
-    if (mat1_sizes[1] == 0) {
-      out.zero_();
-    }
-
-    return out;
-  }
-
-  // TODO: Scale_result is not supported by now!!
-  return _scaled_gemm(
-      mat1,
-      mat2,
-      scale_a,
-      scale_b,
-      scaling_choice_a,
-      scaling_choice_b,
-      bias,
-      use_fast_accum,
-      out);
-}
-
-Tensor _scaled_mm_xpu(
-    const Tensor& mat_a,
-    const Tensor& mat_b,
-    const Tensor& scale_a,
-    const Tensor& scale_b,
-    const std::optional<at::Tensor>& bias,
-    const std::optional<at::Tensor>& scale_result,
-    std::optional<c10::ScalarType> out_dtype,
-    bool use_fast_accum) {
-  const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
-  Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
-  return _scaled_mm_out_xpu(
-      mat_a,
-      mat_b,
-      scale_a,
-      scale_b,
-      bias,
-      scale_result,
-      out_dtype,
-      use_fast_accum,
-      out);
-}
-
-} // namespace at::native
--- a/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp
@ -1,4 +1,3 @@
-#include <ATen/BlasBackend.h>
 #include <ATen/Tensor.h>
 #include <ATen/core/Tensor.h>
 #include <c10/core/ScalarType.h>
@ -9,6 +8,7 @@
 #include <oneapi/dnnl/dnnl.hpp>

 namespace at::native::onednn {
+
 at::Tensor broadcast_bias2D(
    at::Tensor& dst,
    at::Tensor& bias,
@ -328,236 +328,4 @@ void quantized_matmul(
    result.copy_(dst);
 }

-// Describes how to configure oneDNN scales for a given role/ScalingType
-struct ScaleSpec {
-  // specifies the way scale values will be applied to an ARG tensor.
-  int mask;
-  // specifies how scales are grouped along dimensions where
-  // multiple scale factors are used.
-  dnnl::memory::dims groups;
-  // specifies data type for scale factors.
-  dnnl::memory::data_type dtype;
-
-  // Helper to compute expected number of elements for scale tensors
-  // arg_type: "src" for SRC (groups pattern {1, X}),
-  // "wei" for WEIGHTS (groups pattern {X, 1})
-  int64_t expected_numel(
-      int64_t outer_dim,
-      int64_t inner_dim,
-      const std::string& arg_type) const {
-    if (groups == dnnl::memory::dims{1, 1})
-      return 1; // tensorwise scaling
-
-    TORCH_CHECK(
-        arg_type == "src" || arg_type == "wei",
-        "Expected arg_type to be 'src' or 'wei', but got '",
-        arg_type,
-        "'");
-
-    // For rowwise: SRC groups={1, K}, WEI groups={K, 1}
-    TORCH_INTERNAL_ASSERT(
-        (groups == dnnl::memory::dims{1, inner_dim} ||
-         groups == dnnl::memory::dims{inner_dim, 1}),
-        "The groups must be either {1, inner_dim} or {inner_dim, 1}. But got ",
-        groups,
-        ".");
-    return outer_dim;
-  }
-
-  // Normalize an incoming scale tensor to contiguous storage and appropriate
-  // dtype/view
-  at::Tensor normalize(const at::Tensor& scale) const {
-    TORCH_INTERNAL_ASSERT(
-        dtype == dnnl::memory::data_type::f32,
-        "tensor scale currently must be f32, but got scale dtype: ",
-        scale.scalar_type());
-    return scale.to(at::kFloat).contiguous();
-  }
-};
-
-// This function defines how to set scales mask and groups according to:
-// https://github.com/uxlfoundation/oneDNN/blob/main/tests/benchdnn/doc/knobs_attr.md#--attr-scales
-// The returned value will be used in
-// `set_scales(arg, mask, groups, data_type)`.
-inline ScaleSpec make_scale_spec(
-    at::blas::ScalingType scaling_type,
-    int64_t M,
-    int64_t K,
-    int64_t N,
-    const std::string& arg_type) {
-  TORCH_CHECK(
-      arg_type == "src" || arg_type == "wei",
-      "Expected arg_type to be 'src' or 'wei', but got '",
-      arg_type,
-      "'");
-  TORCH_INTERNAL_ASSERT(
-      (scaling_type == at::blas::ScalingType::TensorWise ||
-       scaling_type == at::blas::ScalingType::RowWise),
-      "Currently only support scaling_type for TensorWise or RowWise");
-  int64_t dim = K; // Currently only K is used for grouping
-  bool is_src = (arg_type == "src");
-  if (scaling_type == at::blas::ScalingType::TensorWise) {
-    // Scale tensorwise. The same as `--attr-scales=common`.
-    // mask=0 : scale whole tensor
-    // groups={1, 1}: indicates that there is only one group for scaling
-    return {0, {1, 1}, dnnl::memory::data_type::f32};
-  } else {
-    // (scaling_type == at::blas::ScalingType::RowWise)
-    // Scale RowWise. The same as `--attr-scales=per_dim_01`.
-    // mask={(1 << 0) | (1 << 1)}: Scale on both dim0 and dim1
-    // SRC: groups={1, K}, WEIGHTS: groups={K, 1}
-    return {
-        (1 << 0) | (1 << 1),
-        is_src ? dnnl::memory::dims{1, dim} : dnnl::memory::dims{dim, 1},
-        dnnl::memory::data_type::f32};
-  }
-}
-
-sycl::event scaled_matmul(
-    const Tensor& mat1,
-    const Tensor& mat2,
-    Tensor& result,
-    const Tensor& scale_a,
-    const Tensor& scale_b,
-    at::blas::ScalingType scaling_choice_a,
-    at::blas::ScalingType scaling_choice_b,
-    const std::optional<at::Tensor>& bias,
-    const std::optional<at::Tensor>& scale_result,
-    bool use_fast_accum) {
-  auto& engine = GpuEngineManager::Instance().get_engine();
-  auto& stream = GpuStreamManager::Instance().get_stream();
-
-  // This function will do steps with following steps
-  // 1. create memory descriptor
-  // 2. call write_to_dnnl_memory() to actually write memory
-  // 3. execute
-
-  const int64_t M = mat1.size(0);
-  const int64_t K = mat1.size(1);
-  const int64_t N = mat2.size(1);
-
-  // 1.1 Create memory descriptor
-  dnnl::memory::desc src_md = get_onednn_md(mat1);
-  dnnl::memory::desc weights_md = get_onednn_md(mat2);
-  dnnl::memory::desc dst_md = get_onednn_md(result);
-
-  // scale_a and scale_b has already be checked in `is_desired_scaling()` call.
-  // So we could directly get their memory desc and set later.
-  dnnl::memory::desc scale_a_md = get_onednn_md(scale_a);
-  dnnl::memory::desc scale_b_md = get_onednn_md(scale_b);
-
-  dnnl::memory::desc bias_md;
-  bool with_bias = bias.has_value();
-  at::Tensor possible_reshaped_bias = bias.value_or(at::Tensor());
-  if (with_bias) {
-    if (possible_reshaped_bias.dim() == 1) {
-      possible_reshaped_bias =
-          possible_reshaped_bias.reshape({1, possible_reshaped_bias.size(0)});
-      bias_md = get_onednn_md(possible_reshaped_bias);
-    } else {
-      bias_md = get_onednn_md(possible_reshaped_bias);
-    }
-  }
-
-  // 1.2 Create primitive descriptor and set scales mask
-  const ScaleSpec src_spec = make_scale_spec(scaling_choice_a, M, K, N, "src");
-  const ScaleSpec wei_spec = make_scale_spec(scaling_choice_b, M, K, N, "wei");
-
-  dnnl::primitive_attr op_attr = dnnl::primitive_attr();
-
-#if ONEDNN_SUPPORT_DETERMINISTIC
-  if (at::globalContext().deterministicAlgorithms() ||
-      at::globalContext().deterministicMkldnn())
-    op_attr.set_deterministic(true);
-#endif
-
-  std::vector<int64_t> default_groups;
-  op_attr.set_scales(
-      DNNL_ARG_SRC, src_spec.mask, src_spec.groups, src_spec.dtype);
-  op_attr.set_scales(
-      DNNL_ARG_WEIGHTS, wei_spec.mask, wei_spec.groups, wei_spec.dtype);
-  // scale_result tensor currently only supports scalar(TensorWise Scaling).
-  bool with_dst_scale = scale_result && scale_result->defined();
-  if (with_dst_scale) {
-    op_attr.set_scales(DNNL_ARG_DST, 0, {1}, dnnl::memory::data_type::f32);
-  }
-
-  op_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
-
-  // 1.3 Create the matmul primitive descriptor
-  dnnl::matmul::primitive_desc matmul_pd = with_bias
-      ? dnnl::matmul::primitive_desc(
-            engine, src_md, weights_md, bias_md, dst_md, op_attr)
-      : dnnl::matmul::primitive_desc(
-            engine, src_md, weights_md, dst_md, op_attr);
-
-  // 1.4 (Possible) Additional Checks
-  // TODO: In case there are memory desc does not align with the actual tensor,
-  // we might need to reorder weights similar to CPU's reorder_if_differ_in()
-  // call. For example, weights not the same as matmul_pd.weights_desc(),
-
-  // 2. Prepare memory
-
-  // Create memory
-  auto src_usr_m = make_onednn_memory(src_md, engine, mat1.data_ptr());
-  auto weights_usr_m = make_onednn_memory(weights_md, engine, mat2.data_ptr());
-  auto dst_usr_m = make_onednn_memory(dst_md, engine, result.data_ptr());
-  dnnl::memory b_usr_m;
-  if (with_bias) {
-    b_usr_m =
-        make_onednn_memory(bias_md, engine, possible_reshaped_bias.data_ptr());
-  }
-
-  // Prepare runtime scale memories (flat 1-D views) using the specs
-  auto make_scale_mem_from_spec = [&](const ScaleSpec& spec,
-                                      int64_t expected_numel,
-                                      const at::Tensor& scale_tensor) {
-    at::Tensor prepared = spec.normalize(scale_tensor);
-    TORCH_CHECK(
-        prepared.numel() == expected_numel,
-        "Scale buffer length mismatch. Expected ",
-        expected_numel,
-        ", got ",
-        prepared.numel());
-    dnnl::memory::desc scale_md(
-        {prepared.numel()}, spec.dtype, dnnl::memory::format_tag::x);
-    return make_onednn_memory(scale_md, engine, prepared.data_ptr());
-  };
-
-  auto scratchpad =
-      make_onednn_memory(matmul_pd.scratchpad_desc(), engine, nullptr);
-
-  // 3. Setup Args for exec
-  std::unordered_map<int, dnnl::memory> args;
-  args.insert({DNNL_ARG_SRC, src_usr_m});
-  args.insert({DNNL_ARG_WEIGHTS, weights_usr_m});
-  args.insert({DNNL_ARG_DST, dst_usr_m});
-  args.insert({DNNL_ARG_SCRATCHPAD, scratchpad});
-  if (with_bias) {
-    args.insert({DNNL_ARG_BIAS, b_usr_m});
-  }
-
-  // Attach runtime scales using specs
-  auto src_sc_mem = make_scale_mem_from_spec(
-      src_spec, src_spec.expected_numel(M, K, "src"), scale_a);
-  auto wei_sc_mem = make_scale_mem_from_spec(
-      wei_spec, wei_spec.expected_numel(N, K, "wei"), scale_b);
-  args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_sc_mem});
-  args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, wei_sc_mem});
-  if (with_dst_scale) {
-    // Bind single f32 scalar as DST scale
-    at::Tensor dst_scale_f32 = scale_result->to(at::kFloat).contiguous();
-    dnnl::memory::desc dst_sc_md(
-        {1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
-    auto dst_sc_mem =
-        make_onednn_memory(dst_sc_md, engine, dst_scale_f32.data_ptr());
-    args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_sc_mem});
-  }
-
-  dnnl::matmul matmul_p = dnnl::matmul(matmul_pd);
-  sycl::event matmul_fwd_event =
-      dnnl::sycl_interop::execute(matmul_p, stream, args);
-  return matmul_fwd_event;
-}
-
 } // namespace at::native::onednn
--- a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp
@ -78,10 +78,6 @@ dnnl::memory::data_type get_onednn_dtype(
      return dnnl::memory::data_type::f32;
    case at::ScalarType::BFloat16:
      return dnnl::memory::data_type::bf16;
-    case at::ScalarType::Float8_e4m3fn:
-      return dnnl::memory::data_type::f8_e4m3;
-    case at::ScalarType::Float8_e5m2:
-      return dnnl::memory::data_type::f8_e5m2;
    default:
      if (!allow_undef) {
        TORCH_CHECK(
--- a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h
@ -1,7 +1,6 @@
 #pragma once

 #include <ATen/ATen.h>
-#include <ATen/BlasBackend.h>
 #include <ATen/native/mkldnn/xpu/detail/Attr.h>
 #include <ATen/native/mkldnn/xpu/detail/Utils.h>
 #include <ATen/native/mkldnn/xpu/detail/oneDNNContext.h>
@ -203,16 +202,4 @@ void sdpa_backward(
    Tensor& grad_query,
    Tensor& grad_key,
    Tensor& grad_value);
-
-sycl::event scaled_matmul(
-    const Tensor& mat1,
-    const Tensor& mat2,
-    Tensor& result,
-    const Tensor& scale_a,
-    const Tensor& scale_b,
-    at::blas::ScalingType scaling_choice_a,
-    at::blas::ScalingType scaling_choice_b,
-    const std::optional<at::Tensor>& bias,
-    const std::optional<at::Tensor>& scale_result,
-    bool use_fast_accum);
 } // namespace at::native::onednn
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@ -40,6 +40,8 @@ using namespace at::mps;

 namespace at::native::mps {

+void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()); // dispatch_sync variant: a C++ exception thrown inside `block` is rethrown to the caller
+
 struct MPSScalar {
  id<MTLBuffer> getMTLBuffer() const {
    return __builtin_bit_cast(id<MTLBuffer>, buffer.get());
@ -82,7 +84,6 @@ NSArray<NSNumber*>* getTensorAxes(const TensorBase& t);
 NSArray<NSNumber*>* getTensorAxes(const IntArrayRef& sizes, at::OptionalIntArrayRef dim);
 std::string getMPSShapeString(MPSShape* shape);
 std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true, bool exclude_shape = false);
-std::string to_hex_key(float);
 std::string getArrayRefString(const IntArrayRef s);
 // use has_storage() on the returned tensor to determine if src actually is a view
 Tensor gatherViewTensor(const Tensor& src, Tensor& dst);
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@ -53,6 +53,21 @@
@end

 namespace at::native::mps {
+
+void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) { // runs `block` via dispatch_sync on `queue`, then rethrows any exception it raised
+  __block std::optional<std::exception_ptr> block_exception; // __block: assigned from inside the dispatched block below
+  dispatch_sync(queue, ^() {
+    try {
+      block();
+    } catch (...) { // catch everything so nothing propagates out of the dispatched block
+      block_exception = std::current_exception(); // stash it for the rethrow after dispatch_sync returns
+    }
+  });
+  if (block_exception) { // engaged only when `block` threw
+    std::rethrow_exception(*block_exception);
+  }
+}
+
 /**
 * Computes distance from lowest to highest element offset in given tensor.
 */
@ -301,10 +316,6 @@ std::string getArrayRefString(const IntArrayRef s) {
  return fmt::to_string(fmt::join(s, ","));
 }

-std::string to_hex_key(float f) {
-  return fmt::format("{:a}", f);
-}
-
 std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype, bool exclude_shape) {
  fmt::basic_memory_buffer<char, 100> buffer;
  auto buf_iterator = std::back_inserter(buffer);
--- a/aten/src/ATen/native/mps/kernels/Indexing.metal
+++ b/aten/src/ATen/native/mps/kernels/Indexing.metal
@ -1,5 +1,4 @@
 #include <c10/metal/atomic.h>
-#include <c10/metal/error.h>
 #include <c10/metal/indexing.h>
 #include <metal_stdlib>

@ -32,24 +31,10 @@ OffsetT index_apply_indices(
    constant IndexAB* indices,
    constant int64_t* sizes,
    constant int64_t* strides,
-    uint num_indices,
-    thread bool& error,
-    device ErrorMessages* error_buf) {
+    uint num_indices) {
  OffsetT rc = offs.x;
  for (uint i = 0; i < num_indices; i++) {
    auto idx = indices[i].indexArray[offs.y];
-    if (idx < -sizes[i] || idx >= sizes[i]) {
-      TORCH_REPORT_ERROR(
-          error_buf,
-          "index ",
-          idx,
-          " is out of bounds for dimension ",
-          i,
-          " with size ",
-          sizes[i]);
-      error = true;
-      break;
-    }
    if (idx < 0) {
      idx += sizes[i];
    }
@ -70,7 +55,6 @@ kernel void index_select(
    constant int64_t* index_sizes,
    constant int64_t* index_strides,
    constant uint4& ndim_nindices_numel,
-    device ErrorMessages* error_buffer,
    uint thread_index [[thread_position_in_grid]]) {
  const auto ndim = ndim_nindices_numel.x;
  const auto num_indices = ndim_nindices_numel.y;
@ -81,19 +65,8 @@ kernel void index_select(
      indices_strides,
      ndim,
      thread_index);
-  bool error = false;
  auto input_offs = index_apply_indices<OffsetT>(
-      offs.yz,
-      indices,
-      index_sizes,
-      index_strides,
-      num_indices,
-      error,
-      error_buffer);
-  if (error) {
-    output[offs.x / sizeof(T)] = 0;
-    return;
-  }
+      offs.yz, indices, index_sizes, index_strides, num_indices);
  output[offs.x / sizeof(T)] = input[input_offs / sizeof(T)];
 }

@ -109,9 +82,7 @@ inline void index_put_impl(
    constant int64_t* index_sizes,
    constant int64_t* index_strides,
    constant uint4& ndim_nindices_numel,
-    device ErrorMessages* error_buffer,
    uint thread_index) {
-  bool error = false;
  const auto ndim = ndim_nindices_numel.x;
  const auto num_indices = ndim_nindices_numel.y;
  const auto offs = index_get_offsets(
@ -122,16 +93,7 @@ inline void index_put_impl(
      ndim,
      thread_index);
  auto output_offs = index_apply_indices<OffsetT>(
-      offs.xz,
-      indices,
-      index_sizes,
-      index_strides,
-      num_indices,
-      error,
-      error_buffer);
-  if (error) {
-    return;
-  }
+      offs.xz, indices, index_sizes, index_strides, num_indices);
  output[output_offs / sizeof(T)] = input[offs.y / sizeof(T)];
 }

@ -147,7 +109,6 @@ kernel void index_put(
    constant int64_t* index_sizes,
    constant int64_t* index_strides,
    constant uint4& ndim_nindices_numel,
-    device ErrorMessages* error_buffer,
    uint thread_index [[thread_position_in_grid]]) {
  index_put_impl(
      output,
@ -160,7 +121,6 @@ kernel void index_put(
      index_sizes,
      index_strides,
      ndim_nindices_numel,
-      error_buffer,
      thread_index);
 }

@ -176,7 +136,6 @@ kernel void index_put_serial(
    constant int64_t* index_sizes,
    constant int64_t* index_strides,
    constant uint4& ndim_nindices_numel,
-    device ErrorMessages* error_buffer,
    uint thread_index [[thread_position_in_grid]]) {
  (void)thread_index; // Suppress unused vairable varning
  for (uint idx = 0; idx < ndim_nindices_numel.z; ++idx) {
@ -191,7 +150,6 @@ kernel void index_put_serial(
        index_sizes,
        index_strides,
        ndim_nindices_numel,
-        error_buffer,
        idx);
  }
 }
@ -208,7 +166,6 @@ kernel void index_put_accumulate(
    constant int64_t* index_sizes,
    constant int64_t* index_strides,
    constant uint4& ndim_nindices_numel,
-    device ErrorMessages* error_buffer,
    uint thread_index [[thread_position_in_grid]]) {
  const auto ndim = ndim_nindices_numel.x;
  const auto num_indices = ndim_nindices_numel.y;
@ -219,18 +176,8 @@ kernel void index_put_accumulate(
      indices_strides,
      ndim,
      thread_index);
-  bool error = false;
  auto output_offs = index_apply_indices<OffsetT>(
-      offs.xz,
-      indices,
-      index_sizes,
-      index_strides,
-      num_indices,
-      error,
-      error_buffer);
-  if (error) {
-    return;
-  }
+      offs.xz, indices, index_sizes, index_strides, num_indices);
  AtomicType<T>::atomic_add(
      reinterpret_cast<device AtomicType_t<T>*>(output),
      output_offs / sizeof(T),
@ -250,7 +197,6 @@ kernel void index_put_accumulate(
          constant int64_t* index_sizes,                            \
          constant int64_t* index_strides,                          \
          constant uint4& ndim_nindices_numel,                      \
-          device ErrorMessages* error_buffer,                       \
          uint thread_index [[thread_position_in_grid]])

 #define REGISTER_INDEX_OP_ALL_DTYPES(OP_NAME) \
--- a/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal
+++ b/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal
@ -40,7 +40,7 @@ inline c10::metal::opmath_t<T> matmul_inner(
    threadgroup_barrier(mem_flags::mem_threadgroup);

    for (uint k = 0; k < TILE_DIM; k++) {
-      sum += c10::metal::mul(A_tile[tid.y][k], B_tile[k][tid.x]);
+      sum += A_tile[tid.y][k] * B_tile[k][tid.x];
    }

    threadgroup_barrier(mem_flags::mem_threadgroup);
@ -96,9 +96,7 @@ kernel void addmm(
    auto bias =
        biasData[thread_id.y * strides[3].x + thread_id.x * strides[3].y];
    outputData[thread_id.y * strides[2].x + thread_id.x * strides[2].y] =
-        static_cast<T>(
-            c10::metal::mul(alpha_beta[0], sum) +
-            c10::metal::mul(alpha_beta[1], bias));
+        static_cast<T>(alpha_beta[0] * sum + alpha_beta[1] * bias);
  }
 }

@ -834,10 +832,6 @@ INSTANTIATE_MM_OPS(float);
 INSTANTIATE_MM_OPS(half);
 INSTANTIATE_MM_OPS(bfloat);

-// Complex MM
-INSTANTIATE_MM_OPS(float2);
-INSTANTIATE_MM_OPS(half2);
-
 // Integral MM
 INSTANTIATE_MM_OPS(long);
 INSTANTIATE_MM_OPS(int);
--- a/aten/src/ATen/native/mps/operations/EmbeddingBag.mm
+++ b/aten/src/ATen/native/mps/operations/EmbeddingBag.mm
@ -220,7 +220,7 @@ Tensor _embedding_bag_dense_backward_mps(const Tensor& output_grad,
  auto num_threads = (params.mode == EmbeddingBagMode::MAX) ? output_grad.numel() : num_indices * params.feature_size;
  MPSStream* stream = getCurrentMPSStream();

-  dispatch_sync_with_rethrow(stream->queue(), ^() {
+  mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
    @autoreleasepool {
      id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
      auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_backward_{}_{}",
@ -273,7 +273,7 @@ Tensor _embedding_bag_per_sample_weights_backward_mps(const Tensor& output_grad,
  auto num_threads = num_indices * feature_size;
  MPSStream* stream = getCurrentMPSStream();

-  dispatch_sync_with_rethrow(stream->queue(), ^() {
+  mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
    @autoreleasepool {
      id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
      auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_per_sample_weights_backward_{}_{}",
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@ -179,8 +179,7 @@ static void dispatch_index_kernel(TensorIteratorBase& iter,
                   iter.strides(2),
                   index_size,
                   index_stride,
-                   ndim_nindiees,
-                   mpsStream->getErrorBuffer());
+                   ndim_nindiees);
    mtl_dispatch1DJob(computeEncoder, indexSelectPSO, serial ? 1 : iter.numel());
  });
 }
@ -300,7 +299,7 @@ static Tensor& nonzero_out_native_mps(const Tensor& self, Tensor& out_) {
  MPSStream* stream = getCurrentMPSStream();
  using CachedGraph = MPSUnaryCachedGraph;

-  dispatch_sync_with_rethrow(stream->queue(), ^() {
+  dispatch_sync(stream->queue(), ^() {
    stream->synchronize(SyncType::COMMIT_AND_WAIT);
  });
  int64_t total_nonzero = at::count_nonzero(self).item<int64_t>();
@ -385,7 +384,7 @@ Tensor& nonzero_out_mps(const Tensor& self, Tensor& out_) {
  MPSStream* stream = getCurrentMPSStream();
  using CachedGraph = MPSUnaryCachedGraph;

-  dispatch_sync_with_rethrow(stream->queue(), ^() {
+  dispatch_sync(stream->queue(), ^() {
    stream->synchronize(SyncType::COMMIT_AND_WAIT);
  });
  int64_t total_nonzero = at::count_nonzero(self).item<int64_t>();
--- a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm
+++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm
@ -121,7 +121,7 @@ Tensor& do_metal_addmm(const Tensor& self,
                       const Scalar& alpha,
                       const Scalar& beta,
                       const Tensor& bias) {
-  if (beta.isFloatingPoint() && alpha.isFloatingPoint() && beta.toDouble() == 0 && alpha.toDouble() == 1) {
+  if (beta.toDouble() == 0 && alpha.toDouble() == 1) {
    return do_metal_mm(self, other, output);
  }
  auto stream = getCurrentMPSStream();
@ -147,15 +147,13 @@ Tensor& do_metal_addmm(const Tensor& self,
        std::array<int64_t, 2> i64;
        std::array<int32_t, 2> i32;
        std::array<float, 2> f32;
-        std::array<c10::complex<float>, 2> c64;
-      } alpha_beta{};
+      } alpha_beta;
      if (output.scalar_type() == kLong) {
        alpha_beta.i64 = {alpha.toLong(), beta.toLong()};
      } else if (c10::isIntegralType(output.scalar_type(), true)) {
        alpha_beta.i32 = {alpha.toInt(), beta.toInt()};
-      } else if (c10::isComplexType(output.scalar_type())) {
-        alpha_beta.c64 = {alpha.toComplexFloat(), beta.toComplexFloat()};
      } else {
+        TORCH_INTERNAL_ASSERT(c10::isFloatingType(output.scalar_type()));
        alpha_beta.f32 = {alpha.toFloat(), beta.toFloat()};
      }
      constexpr uint32_t TILE_DIM = 16; // fastest performance from tests on multiple macs
@ -192,16 +190,10 @@ std::tuple<MPSGraphTensor*, MPSGraphTensor*, MPSGraphTensor*> do_mm(MPSGraph* gr
 bool use_metal_mm(const Tensor& self, const Tensor& other, const Tensor& output) {
  static bool always_use_metal = c10::utils::has_env("PYTORCH_MPS_PREFER_METAL");
  constexpr auto max_stride_size = 32768;
-  constexpr auto max_complex_inner_size = 2048;
  static bool is_macos_14_4_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS);
  if (always_use_metal || c10::isIntegralType(self.scalar_type(), true)) {
    return true;
  }
-  // multiplicationWithPrimaryTensor: returns incorrect results if inner size exceeds 2048
-  // See https://github.com/pytorch/pytorch/issues/167727#issuecomment-3529308548
-  if (c10::isComplexType(self.scalar_type()) && self.size(1) > max_complex_inner_size) {
-    return true;
-  }
  return !is_macos_14_4_or_newer &&
      (self.stride(0) > max_stride_size || self.stride(1) > max_stride_size || self.size(0) > max_stride_size ||
       self.size(1) > max_stride_size || other.stride(0) > max_stride_size || other.stride(1) > max_stride_size ||
--- a/aten/src/ATen/native/mps/operations/Normalization.mm
+++ b/aten/src/ATen/native/mps/operations/Normalization.mm
@ -923,7 +923,7 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_mps(const Tensor& input,
  MPSStream* stream = getCurrentMPSStream();
  TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "Not implemented for long on MPS");
  @autoreleasepool {
-    dispatch_sync_with_rethrow(stream->queue(), ^() {
+    mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
      // which kernel variant to use based on the normalized axis N size
      const int N_READS = 4;
      auto metalType = mps::scalarToMetalTypeString(input);
--- a/aten/src/ATen/native/mps/operations/Repeat.mm
+++ b/aten/src/ATen/native/mps/operations/Repeat.mm
@ -91,30 +91,25 @@ static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
 #include <ATen/native/mps/Repeat_metallib.h>
 #endif

-Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output_size) {
-  TORCH_CHECK(repeat.dim() == 1, "repeat_interleave only accept 1D vector as repeat");
+template <typename index_t>
+void computeRepeatIndices(const index_t* repeat_ptr,
+                          const int64_t* cumsum_ptr,
+                          index_t* result_ptr,
+                          int64_t size,
+                          int64_t result_size) {
+  id<MTLBuffer> repeatBuffer = reinterpret_cast<id<MTLBuffer>>(repeat_ptr);
+  id<MTLBuffer> cumsumBuffer = reinterpret_cast<id<MTLBuffer>>(cumsum_ptr);
+  id<MTLBuffer> resultBuffer = reinterpret_cast<id<MTLBuffer>>(result_ptr);
+  TORCH_CHECK(repeatBuffer && cumsumBuffer && resultBuffer);
+
  std::string scalar_type;
-  if (repeat.scalar_type() == kInt) {
+  if constexpr (std::is_same_v<index_t, int32_t>) {
    scalar_type = "int32_t";
-  } else if (repeat.scalar_type() == kLong) {
+  } else if constexpr (std::is_same_v<index_t, int64_t>) {
    scalar_type = "int64_t";
  } else {
-    TORCH_CHECK(false, "repeats has to be Long or Int tensor");
+    TORCH_CHECK(false, "repeat_interleave: unsupported indexing data type");
  }
-  if (repeat.size(0) == 0) {
-    return at::empty_like(repeat, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-  }
-  Tensor repeat_ = repeat.contiguous();
-  Tensor cumsum = repeat.cumsum(0);
-  int64_t total = 0;
-  if (output_size.has_value()) {
-    total = output_size.value();
-  } else {
-    total = cumsum[-1].item<int64_t>();
-    TORCH_CHECK((repeat >= 0).all().item<uint8_t>(), "repeats can not be negative");
-  }
-
-  auto result = at::empty({total}, repeat.options());

  MPSStream* mpsStream = getCurrentMPSStream();
  dispatch_sync(mpsStream->queue(), ^() {
@ -126,13 +121,20 @@ Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output
      getMPSProfiler().beginProfileKernel(pipelineState, "repeat_interleave:" + scalar_type, false);

      [computeEncoder setComputePipelineState:pipelineState];
-      mps::mtl_setArgs(computeEncoder, repeat_, cumsum, result, repeat.size(0));
-      mps::mtl_dispatch1DJob(computeEncoder, pipelineState, repeat.size(0));
+      mps::mtl_setArgs(computeEncoder, repeatBuffer, cumsumBuffer, resultBuffer, size);
+      mps::mtl_dispatch1DJob(computeEncoder, pipelineState, size);

      getMPSProfiler().endProfileKernel(pipelineState);
    }
  });
-  return result;
+}
+
+Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output_size) { // forwards to repeat_interleave_common, instantiated with computeRepeatIndices
+  Tensor output; // assigned inside the dispatch lambda, returned below
+  AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() { // index_t is not declared here; presumably provided by the dispatch macro from repeat's scalar type
+    output = repeat_interleave_common<index_t, computeRepeatIndices<index_t>>(repeat, output_size);
+  });
+  return output;
 }

 } // namespace at::native
--- a/aten/src/ATen/native/mps/operations/TensorCompare.mm
+++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm
@ -5,7 +5,6 @@
 #include <ATen/native/Resize.h>
 #include <ATen/native/TensorCompare.h>
 #include <ATen/native/mps/OperationUtils.h>
-#include <algorithm>

 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@ -90,21 +89,13 @@ static void check_min_max_dims(const OptionalTensorRef clamp_opt, const Tensor&
    auto clamp_shape = clamp_opt->sizes();
    auto input_shape = input_t.sizes();

-    if (num_clamp_dims > num_input_dims) {
-      auto leading_dims = num_clamp_dims - num_input_dims;
-      for (int64_t i = 0; i < leading_dims; ++i) {
-        TORCH_CHECK(clamp_shape[i] == 1,
-                    op_name + ": clamp tensor leading shape must be 1 to broadcast with input tensor");
-      }
-    }
+    TORCH_CHECK(num_clamp_dims <= num_input_dims,
+                op_name + ": clamp tensor number of dims must not be greater than that of input tensor")

-    auto clamp_idx = num_clamp_dims - 1;
-    auto input_idx = num_input_dims - 1;
-    auto common_dims = std::min(num_clamp_dims, num_input_dims);
-    for (int64_t i = 0; i < common_dims; ++i)
+    for (int i = 0; i < num_clamp_dims; i++)
      // One of the indices is allowed to be 1; will be handled by broadcast
-      TORCH_CHECK(clamp_shape[clamp_idx - i] == input_shape[input_idx - i] || clamp_shape[clamp_idx - i] == 1 ||
-                      input_shape[input_idx - i] == 1,
+      TORCH_CHECK(clamp_shape[num_clamp_dims - 1 - i] == input_shape[num_input_dims - 1 - i] ||
+                      clamp_shape[num_clamp_dims - 1 - i] == 1 || input_shape[num_input_dims - 1 - i] == 1,
                  op_name + ": clamp tensor trailing shape must match input tensor")
  }
 }
@ -145,6 +136,9 @@ static void clamp_tensor_out_mps(const Tensor& input_t,

  auto result_type = output_t.scalar_type();

+  IntArrayRef new_min_shape;
+  IntArrayRef new_max_shape;
+
  auto num_min_dims = min_opt->dim();
  auto num_max_dims = max_opt->dim();
  auto num_input_dims = input_t.dim();
@ -152,32 +146,24 @@ static void clamp_tensor_out_mps(const Tensor& input_t,
  std::vector<int64_t> new_min_arr(num_input_dims);
  std::vector<int64_t> new_max_arr(num_input_dims);

+  if (has_min && num_min_dims < num_input_dims) {
+    fill_new_shape(num_input_dims, num_min_dims, new_min_arr.data(), min_opt->sizes());
+    new_min_shape = IntArrayRef(new_min_arr);
+  }
+
+  if (has_max && num_max_dims < num_input_dims) {
+    fill_new_shape(num_input_dims, num_max_dims, new_max_arr.data(), max_opt->sizes());
+    new_max_shape = IntArrayRef(new_max_arr);
+  }
+
  Tensor min_opt_tensor;
  Tensor max_opt_tensor;

-  auto reshape_clamp_tensor = [&](const OptionalTensorRef clamp_tensor_ref,
-                                  int64_t num_clamp_dims,
-                                  std::vector<int64_t>& new_shape_storage) -> Tensor {
-    IntArrayRef clamp_shape = clamp_tensor_ref->sizes();
-    bool requires_view = false;
-
-    if (num_clamp_dims > num_input_dims) {
-      clamp_shape = clamp_shape.slice(num_clamp_dims - num_input_dims);
-      requires_view = true;
-    } else if (num_clamp_dims < num_input_dims) {
-      fill_new_shape(num_input_dims, num_clamp_dims, new_shape_storage.data(), clamp_shape);
-      clamp_shape = IntArrayRef(new_shape_storage);
-      requires_view = true;
-    }
-
-    return requires_view ? (*clamp_tensor_ref).view(clamp_shape) : *clamp_tensor_ref;
-  };
-
  if (has_min) {
-    min_opt_tensor = reshape_clamp_tensor(min_opt, num_min_dims, new_min_arr);
+    min_opt_tensor = (num_min_dims < num_input_dims) ? (*min_opt).view(new_min_shape) : *min_opt;
  }
  if (has_max) {
-    max_opt_tensor = reshape_clamp_tensor(max_opt, num_max_dims, new_max_arr);
+    max_opt_tensor = (num_max_dims < num_input_dims) ? (*max_opt).view(new_max_shape) : *max_opt;
  }

  @autoreleasepool {
@ -258,8 +244,8 @@ static void clamp_scalar_out_mps(const Tensor& input_t,

  @autoreleasepool {
    // the optional min/max refs could affect how we build the cached graph
-    std::string key = op_name + (has_min ? ("_min:" + to_hex_key(min_scalar)) : "") +
-        (has_max ? ("_max:" + to_hex_key(max_scalar)) : "") + "_scalar:" + getTensorsStringKey({input_t});
+    std::string key = op_name + (has_min ? ("_min:" + std::to_string(min_scalar)) : "") +
+        (has_max ? ("_max:" + std::to_string(max_scalar)) : "") + "_scalar:" + getTensorsStringKey({input_t});
    auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
      if (has_min)
        newCachedGraph->minTensor = [mpsGraph constantWithScalar:min_scalar
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -192,11 +192,6 @@
    CompositeExplicitAutograd: _assert_tensor_metadata
    Meta: _assert_tensor_metadata_meta_symint

- func: _async_error(str msg) -> ()
-  dispatch:
-    CompositeExplicitAutograd: _async_error
-    Meta: _async_error_meta
-
 - func: _print(str s) -> ()
  dispatch:
    CompositeExplicitAutograd: _print
@ -4225,7 +4220,7 @@
    MTIA: mm_out_mtia
    MPS: mm_out_mps
    XPU: mm_out_xpu
-    SparseCPU, SparseCUDA, SparseMPS: _sparse_mm_out
+    SparseCPU, SparseCUDA: _sparse_mm_out
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm_out

 - func: mm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor
@ -4297,7 +4292,6 @@
  dispatch:
    SparseCPU: sparse_sparse_matmul_cpu
    SparseCUDA: sparse_sparse_matmul_cuda
-    SparseMPS: sparse_sparse_matmul_mps
  autogen: _sparse_sparse_matmul.out

 - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
@ -4389,7 +4383,7 @@
  variants: function, method
  dispatch:
    CompositeExplicitAutograd: mv
-    SparseCPU, SparseCUDA, SparseMPS: mv_sparse
+    SparseCPU, SparseCUDA: mv_sparse

 - func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
@ -7518,7 +7512,7 @@
 - func: _sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor
  variants: method
  dispatch:
-    SparseCPU, SparseCUDA, SparseMPS: sparse_mask_projection
+    SparseCPU, SparseCUDA: sparse_mask_projection
  autogen: _sparse_mask_projection.out

 - func: _to_cpu(Tensor[] tensors) -> Tensor[]
@ -9838,7 +9832,7 @@
  structured_delegate: erfinv.out
  variants: method, function
  dispatch:
-    SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse
+    SparseCPU, SparseCUDA: erfinv_sparse
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr
  tags: pointwise

@ -9847,7 +9841,7 @@
  structured_delegate: erfinv.out
  variants: method
  dispatch:
-    SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_
+    SparseCPU, SparseCUDA: erfinv_sparse_
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_
  tags: pointwise

@ -9857,7 +9851,7 @@
  structured_inherits: TensorIteratorBase
  dispatch:
    CPU, CUDA, MPS: erfinv_out
-    SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_out
+    SparseCPU, SparseCUDA: erfinv_sparse_out
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_out
  tags: pointwise

--- a/aten/src/ATen/native/sparse/cuda/SoftMax.cu
+++ b/aten/src/ATen/native/sparse/cuda/SoftMax.cu
@ -30,12 +30,10 @@

 #include <thrust/binary_search.h>
 #include <thrust/device_ptr.h>
-#include <thrust/distance.h>
-#include <thrust/iterator/constant_iterator.h>
-#include <thrust/scan.h>
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
 #include <thrust/system/cuda/execution_policy.h>
+#include <thrust/iterator/constant_iterator.h>

 #include <cuda_runtime_api.h>
 #include <cusparse.h>
@ -49,7 +47,6 @@
 #include <c10/macros/Macros.h>
 #include <thrust/copy.h>
 #include <thrust/device_ptr.h>
-#include <thrust/distance.h>
 #include <thrust/for_each.h>
 #include <thrust/functional.h>
 #include <thrust/gather.h>
--- a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm
+++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm
@ -10,10 +10,6 @@
 #include <ATen/NativeFunctions.h>
 #else
 #include <ATen/ops/_coalesce_native.h>
-#include <ATen/ops/repeat_interleave_native.h>
-#include <ATen/ops/cumsum.h>
-#include <ATen/ops/_sparse_sparse_matmul_native.h>
-#include <ATen/ops/_sparse_coo_tensor_unsafe.h>
 #include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
 #include <ATen/ops/cat.h>
 #include <ATen/ops/add_native.h>
@ -445,33 +441,6 @@ static SparseTensor& mul_out_dense_sparse_mps(
  return out;
 }

-static std::tuple<Tensor, Tensor, int64_t> mps_intersect_binary_search(
-    const Tensor& A_keys,
-    const Tensor& B_keys,
-    int64_t lenA,
-    int64_t lenB,
-    bool boolean_flag) {
-
-  auto stream = getCurrentMPSStream();
-  auto outA_idx = at::empty({lenA}, A_keys.options().dtype(at::kLong));
-  auto outB_idx = at::empty({lenA}, A_keys.options().dtype(at::kLong));
-  auto counter = at::zeros({1}, A_keys.options().dtype(at::kInt));
-
-  dispatch_sync_with_rethrow(stream->queue(), ^() {
-    @autoreleasepool {
-      auto pso = lib.getPipelineStateForFunc("intersect_binary_search");
-      auto enc = stream->commandEncoder();
-      [enc setComputePipelineState:pso];
-      mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter,
-                  static_cast<uint32_t>(lenB), boolean_flag);
-      mtl_dispatch1DJob(enc, pso, static_cast<uint32_t>(lenA));
-    }
-  });
-
-  const auto match_count = static_cast<int64_t>(counter.item<int32_t>());
-  return std::make_tuple(std::move(outA_idx), std::move(outB_idx), match_count);
-}
-

 SparseTensor& mul_out_sparse_mps(const Tensor& t_, const Tensor& src_, SparseTensor& r_) {
  TORCH_CHECK(r_.is_mps(), "mul: expected 'out' to be MPS, but got ", r_.device());
@ -550,10 +519,22 @@ SparseTensor& mul_out_sparse_mps(const Tensor& t_, const Tensor& src_, SparseTen
  auto A_keys = A_is_lhs ? lhs_keys : rhs_keys;
  auto B_keys = A_is_lhs ? rhs_keys : lhs_keys;

-  auto [outA_idx, outB_idx, M_int64] = mps_intersect_binary_search(
-      A_keys, B_keys, lenA, lenB, A_is_lhs);
+  auto outA_idx = at::empty({lenA}, at::device(device).dtype(kLong));
+  auto outB_idx = at::empty({lenA}, at::device(device).dtype(kLong));
+  auto counter = at::zeros({1}, at::device(device).dtype(kInt));

-  const auto M = static_cast<uint32_t>(M_int64); // number of structural matches
+  dispatch_sync_with_rethrow(stream->queue(), ^() {
+    @autoreleasepool {
+      auto pso = lib.getPipelineStateForFunc("intersect_binary_search");
+      auto enc = stream->commandEncoder();
+      [enc setComputePipelineState:pso];
+      mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter,
+                  static_cast<uint32_t>(lenB), A_is_lhs);
+      mtl_dispatch1DJob(enc, pso, static_cast<uint32_t>(lenA));
+    }
+  });
+
+  const uint32_t M = counter.item<int32_t>(); // number of structural matches

  r_.resize_as_(lhs);

@ -777,14 +758,6 @@ SparseTensor& add_out_sparse_mps(const SparseTensor& self,

 using OptTensor = std::optional<Tensor>;

-static Tensor create_sparse_output_values(
-    const Tensor& template_values,
-    int64_t output_nnz,
-    ScalarType dtype) {
-  auto out_val_sizes = template_values.sizes().vec();
-  out_val_sizes[0] = output_nnz;
-  return at::zeros(out_val_sizes, template_values.options().dtype(dtype));
-}

 static void sparse_mask_apply_out_mps_kernel(
    Tensor& result,
@ -806,9 +779,9 @@ static void sparse_mask_apply_out_mps_kernel(
  auto src  = src_in.coalesce();
  auto mask = coalesce_mask ? mask_in.coalesce() : mask_in;

-  const auto src_nnz = src._nnz();
-  const auto mask_nnz = mask._nnz();
-  const auto sd = src.sparse_dim();
+  const int64_t src_nnz = src._nnz();
+  const int64_t mask_nnz = mask._nnz();
+  const int64_t sd = src.sparse_dim();
  result.sparse_resize_(mask.sizes(), mask.sparse_dim(), mask.dense_dim());

  auto commonDtype = at::result_type(src, mask);
@ -837,27 +810,53 @@ static void sparse_mask_apply_out_mps_kernel(
    return;
  }

-  auto mask_indices = mask._indices().contiguous();
-  auto src_values = src._values().to(commonDtype).contiguous();
-  auto out_values = create_sparse_output_values(src_values, mask_nnz, commonDtype);
-
  if (src_nnz == 0) {
-    alias_into_sparse(result, mask_indices, out_values);
+    auto out_indices = mask._indices().contiguous();
+    auto src_values  = src._values().to(commonDtype);
+    auto out_val_sizes = src_values.sizes().vec();
+    out_val_sizes[0] = mask_nnz;
+    auto out_values = at::zeros(out_val_sizes, src_values.options());
+    alias_into_sparse(result, out_indices, out_values);
    result._coalesced_(mask.is_coalesced());
    return;
  }

-  auto mask_keys = flatten_indices(mask._indices().contiguous(), mask.sizes().slice(0, sd)).contiguous();
-  auto src_keys  = flatten_indices(src._indices().contiguous(), src.sizes().slice(0, sd)).contiguous();
+  auto mask_indices = mask._indices().contiguous();
+  auto src_indices = src._indices().contiguous();
+  auto src_values = src._values().to(commonDtype).contiguous();

-  const auto A_is_src = (src_nnz <= mask_nnz);
-  const auto lenA = A_is_src ? src_nnz  : mask_nnz;
-  const auto lenB = A_is_src ? mask_nnz : src_nnz;
+  auto mask_keys = flatten_indices(mask_indices, mask.sizes().slice(0, sd)).contiguous();
+  auto src_keys  = flatten_indices(src_indices,  src.sizes().slice(0, sd)).contiguous();
+
+  const bool A_is_src = (src_nnz <= mask_nnz);
+  const int64_t lenA = A_is_src ? src_nnz  : mask_nnz;
+  const int64_t lenB = A_is_src ? mask_nnz : src_nnz;
  auto A_keys = A_is_src ? src_keys  : mask_keys;
  auto B_keys = A_is_src ? mask_keys : src_keys;

-  auto [outA_idx, outB_idx, M] = mps_intersect_binary_search(
-      A_keys, B_keys, lenA, lenB, A_is_src);
+  const auto device = result.device();
+  auto stream = getCurrentMPSStream();
+
+  auto outA_idx = at::empty({lenA}, at::device(device).dtype(at::kLong));
+  auto outB_idx = at::empty({lenA}, at::device(device).dtype(at::kLong));
+  auto counter = at::zeros({1}, at::device(device).dtype(at::kInt));
+
+  dispatch_sync_with_rethrow(stream->queue(), ^() {
+    @autoreleasepool {
+      auto pso = lib.getPipelineStateForFunc("intersect_binary_search");
+      auto enc = stream->commandEncoder();
+      [enc setComputePipelineState:pso];
+      mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter,
+                  static_cast<uint32_t>(lenB), A_is_src);
+      mtl_dispatch1DJob(enc, pso, static_cast<uint32_t>(lenA));
+    }
+  });
+
+  const int64_t M = static_cast<int64_t>(counter.item<int32_t>());
+
+  auto out_val_sizes = src_values.sizes().vec();
+  out_val_sizes[0] = mask_nnz;
+  auto out_values = at::zeros(out_val_sizes, src_values.options());

  if (M > 0) {
    auto src_match = outA_idx.narrow(0, 0, M);
@ -875,70 +874,6 @@ static void sparse_mask_apply_out_mps_kernel(
  result._coalesced_(mask.is_coalesced());
 }

-static void sparse_mask_projection_out_mps_kernel(
-    Tensor& result,
-    const Tensor& lhs,
-    const Tensor& rhs,
-    const OptTensor& /*x_hash_opt*/,
-    bool accumulate_matches) {
-
-  TORCH_CHECK(lhs.is_sparse() && rhs.is_sparse(), "sparse_mask_projection: expected sparse COO");
-  TORCH_CHECK(lhs.is_mps() && rhs.is_mps(), "sparse_mask_projection: expected MPS tensors");
-  TORCH_CHECK(lhs.sparse_dim() == rhs.sparse_dim(), "sparse_dim mismatch");
-
-  auto lhs_c = lhs.coalesce();
-  auto rhs_c = rhs.coalesce();
-
-  const auto sd = lhs_c.sparse_dim();
-  const auto lhs_nnz = lhs_c._nnz();
-  const auto rhs_nnz = rhs_c._nnz();
-
-  auto commonDtype = at::result_type(lhs_c, rhs_c);
-  TORCH_CHECK(canCast(commonDtype, result.scalar_type()),
-              "Can't convert ", commonDtype, " to output ", result.scalar_type());
-
-  result.sparse_resize_(lhs.sizes(), lhs.sparse_dim(), lhs.dense_dim());
-
-  auto lhs_indices = lhs_c._indices().contiguous();
-  auto rhs_values  = rhs_c._values().to(commonDtype).contiguous();
-  auto out_values = create_sparse_output_values(rhs_values, lhs_nnz, commonDtype);
-
-  if (lhs_nnz > 0 && rhs_nnz > 0) {
-    auto lhs_keys = flatten_indices(lhs_indices, lhs_c.sizes().slice(0, sd)).contiguous();
-    auto rhs_keys = flatten_indices(rhs_c._indices().contiguous(), rhs_c.sizes().slice(0, sd)).contiguous();
-
-    const auto A_is_lhs = (lhs_nnz <= rhs_nnz);
-    const auto lenA = A_is_lhs ? lhs_nnz : rhs_nnz;
-    const auto lenB = A_is_lhs ? rhs_nnz : lhs_nnz;
-    auto A_keys = A_is_lhs ? lhs_keys : rhs_keys;
-    auto B_keys = A_is_lhs ? rhs_keys : lhs_keys;
-
-    auto [outA_idx, outB_idx, M] = mps_intersect_binary_search(
-        A_keys, B_keys, lenA, lenB, A_is_lhs);
-
-    if (M > 0) {
-      auto idx_in_A = outA_idx.narrow(0, 0, M);
-      auto idx_in_B = outB_idx.narrow(0, 0, M);
-      auto idx_in_lhs = A_is_lhs ? idx_in_A : idx_in_B;
-      auto idx_in_rhs = A_is_lhs ? idx_in_B : idx_in_A;
-
-      const auto view_cols = rhs_values.numel() / std::max<int64_t>(rhs_nnz, 1);
-      auto rhs_rows = rhs_values.index_select(0, idx_in_rhs).contiguous();
-      auto rhs_rows_2d = rhs_rows.view({M, view_cols});
-      auto out_2d = out_values.view({lhs_nnz, view_cols});
-
-      if (accumulate_matches) {
-        out_2d.index_add_(0, idx_in_lhs, rhs_rows_2d);
-      } else {
-        out_2d.index_copy_(0, idx_in_lhs, rhs_rows_2d);
-      }
-    }
-  }
-
-  alias_into_sparse(result, lhs._indices(), out_values);
-  result._coalesced_(lhs.is_coalesced());
-}
-
 static void sparse_mask_intersection_out_mps_kernel(
    Tensor& result,
    const Tensor& lhs,
@ -953,115 +888,5 @@ static void sparse_mask_intersection_out_mps_kernel(
      /*coalesce_mask=*/false);
 }

-Tensor sparse_sparse_matmul_mps(const Tensor& mat1_, const Tensor& mat2_) {
-  TORCH_CHECK(mat1_.is_sparse() && mat2_.is_sparse(),
-              "sparse_sparse_matmul_mps: both inputs must be sparse COO tensors");
-  TORCH_CHECK(mat1_.is_mps() && mat2_.is_mps(),
-              "sparse_sparse_matmul_mps: both inputs must be on MPS device");
-  TORCH_CHECK(mat1_.dim() == 2 && mat2_.dim() == 2,
-              "sparse_sparse_matmul_mps: both inputs must be 2D matrices");
-  TORCH_CHECK(mat1_.dense_dim() == 0 && mat2_.dense_dim() == 0,
-              "sparse_sparse_matmul_mps: only scalar values supported (dense_dim == 0)");
-  TORCH_CHECK(mat1_.size(1) == mat2_.size(0),
-              "mat1 and mat2 shapes cannot be multiplied (", mat1_.size(0), "x", mat1_.size(1), " and ", mat2_.size(0), "x", mat2_.size(1), ")");
-  TORCH_CHECK(mat1_.scalar_type() == mat2_.scalar_type(),
-              "sparse_sparse_matmul_mps: mat1 dtype ", mat1_.scalar_type(),
-              " does not match mat2 dtype ", mat2_.scalar_type());
-
-  const auto device = mat1_.device();
-
-  auto A = mat1_.coalesce();
-  auto B = mat2_.coalesce();
-
-  const auto I = A.size(0);
-  const auto K = A.size(1);
-  const auto N = B.size(1);
-
-  const auto nnzA = A._nnz();
-  const auto nnzB = B._nnz();
-
-  // Early empty result, return an empty, coalesced tensor
-  if (I == 0 || N == 0 || K == 0 || nnzA == 0 || nnzB == 0) {
-    auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong));
-    auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type()));
-    auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options());
-    out._coalesced_(true);
-    return out;
-  }
-
-  const auto computeDtype = at::result_type(mat1_, mat2_);
-
-  auto A_idx = A._indices().contiguous();
-  auto A_val = A._values().to(computeDtype).contiguous();
-  auto A_i = A_idx.select(0, 0).contiguous();
-  auto A_k = A_idx.select(0, 1).contiguous();
-
-  auto B_idx = B._indices().contiguous();
-  auto B_val = B._values().to(computeDtype).contiguous();
-  auto B_k = B_idx.select(0, 0).contiguous();
-  auto B_j = B_idx.select(0, 1).contiguous();
-
-  // csr-style row pointers for B by k (the shared dimension)
-  Tensor row_ptr_B;
-  {
-    auto batch_ptr = at::tensor({0LL, nnzB}, at::device(device).dtype(at::kLong));
-    row_ptr_B = at::empty({K + 1}, at::device(device).dtype(at::kLong));
-    build_row_ptr_per_batch_mps(B_k, batch_ptr, /*B=*/1, /*I=*/K, row_ptr_B);
-  }
-
-  auto row_ptr_B_lo = row_ptr_B.narrow(0, 0, K);
-  auto row_ptr_B_hi = row_ptr_B.narrow(0, 1, K);
-  auto deg_B = row_ptr_B_hi.sub(row_ptr_B_lo);
-
-  auto counts = deg_B.index_select(0, A_k);
-
-  const int64_t P = counts.sum().item<int64_t>();
-  if (P == 0) {
-    auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong));
-    auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type()));
-    auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options());
-    out._coalesced_(true);
-    return out;
-  }
-
-  auto group_ids = repeat_interleave_mps(counts);
-
-  // exclusive cumsum of counts
-  auto offsets = cumsum(counts, /*dim=*/0).sub(counts);
-  auto offsets_gather = offsets.index_select(0, group_ids);
-  auto within = at::arange(P, at::device(device).dtype(at::kLong)).sub(offsets_gather);
-
-  // Map each output element to its source B row and position
-  auto k_per_out = A_k.index_select(0, group_ids);
-  auto start_in_B = row_ptr_B.index_select(0, k_per_out);
-  auto seg_index = start_in_B.add(within);
-
-  // Assemble candidate coo pairs and values
-  auto i_out = A_i.index_select(0, group_ids).contiguous();
-  auto j_out = B_j.index_select(0, seg_index).contiguous();
-  auto vA_out = A_val.index_select(0, group_ids).contiguous();
-  auto vB_out = B_val.index_select(0, seg_index).contiguous();
-  auto v_out = vA_out.mul(vB_out);
-
-  // build (2, P) indices
-  auto out_indices = at::empty({2, P}, at::device(device).dtype(at::kLong)).contiguous();
-  out_indices.select(0, 0).copy_(i_out);
-  out_indices.select(0, 1).copy_(j_out);
-
-  auto result = _sparse_coo_tensor_unsafe(
-      out_indices, v_out, {I, N}, mat1_.options().dtype(computeDtype));
-
-  result = result.coalesce();
-
-  if (result.scalar_type() != mat1_.scalar_type()) {
-    auto cast_vals = result._values().to(mat1_.scalar_type());
-    auto out = _sparse_coo_tensor_unsafe(result._indices(), cast_vals, {I, N}, mat1_.options());
-    out._coalesced_(true);
-    return out;
-  }
-  return result;
-}
-
 REGISTER_MPS_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_mps_kernel);
-REGISTER_MPS_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_mps_kernel);
 } // namespace at::native
--- a/aten/src/ATen/test/CMakeLists.txt
+++ b/aten/src/ATen/test/CMakeLists.txt
@ -61,7 +61,6 @@ list(APPEND ATen_CUDA_TEST_SRCS
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_math_test.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_test.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cub_test.cu
-  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cublas_handle_pool_test.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_device_test.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_distributions_test.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_dlconvertor_test.cpp
--- a/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp
+++ b/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp
@ -1,77 +0,0 @@
-#include <gtest/gtest.h>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDACachingAllocator.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#include <atomic>
-#include <thread>
-#include <vector>
-
-// Test concurrent access to getCurrentCUDABlasHandle and getCUDABlasLtWorkspace
-// to verify that the data race fix is working correctly
-
-TEST(CUDABlasHandlePoolTest, ConcurrentGetAndClearWorkspaces) {
-  if (!at::cuda::is_available()) {
-    return;
-  }
-
-  constexpr int num_accessor_threads = 15;
-  constexpr int num_clear_threads = 5;
-  constexpr int iterations_per_thread = 50;
-
-  std::atomic<bool> stop{false};
-  std::atomic<int> error_count{0};
-  std::vector<std::thread> threads;
-  threads.reserve(num_accessor_threads + num_clear_threads);
-
-  // Launch accessor threads
-  for (int i = 0; i < num_accessor_threads; ++i) {
-    threads.emplace_back([&stop, &error_count]() {
-      try {
-        at::cuda::CUDAGuard device_guard(0);
-
-        while (!stop.load(std::memory_order_relaxed)) {
-          const auto handle = at::cuda::getCurrentCUDABlasHandle();
-          const auto workspace = at::cuda::getCUDABlasLtWorkspace();
-
-          if (handle == nullptr || workspace == nullptr) {
-            error_count++;
-          }
-        }
-      } catch (const std::exception& e) {
-        error_count++;
-      }
-    });
-  }
-
-  // Launch threads that clear workspaces
-  for (int i = 0; i < num_clear_threads; ++i) {
-    threads.emplace_back([&error_count]() {
-      try {
-        for (int j = 0; j < iterations_per_thread; ++j) {
-          at::cuda::clearCublasWorkspaces();
-          std::this_thread::yield();
-        }
-      } catch (const std::exception& e) {
-        error_count++;
-      }
-    });
-  }
-
-  // Let them run for a bit
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));
-  stop.store(true, std::memory_order_relaxed);
-
-  for (auto& thread : threads) {
-    thread.join();
-  }
-
-  EXPECT_EQ(error_count.load(), 0);
-}
-
-int main(int argc, char* argv[]) {
-  ::testing::InitGoogleTest(&argc, argv);
-  c10::cuda::CUDACachingAllocator::init(1);
-  return RUN_ALL_TESTS();
-}
--- a/Show More
+++ b/Show More