Merge branch 'main' into hipify_without_caffe_attempt2

fix build after last merge
2025-11-13 21:59:56 +08:00 · 2025-11-12 17:52:15 +00:00 · 2025-11-12 17:45:13 +00:00 · 2025-11-12 17:43:51 +00:00 · 2025-11-12 17:41:19 +00:00 · 2025-11-12 17:23:28 +00:00
601 changed files with 18809 additions and 16316 deletions
--- a/.ci/docker/almalinux/build.sh
+++ b/.ci/docker/almalinux/build.sh
@ -36,11 +36,7 @@ case ${DOCKER_TAG_PREFIX} in
    ;;
  rocm*)
    BASE_TARGET=rocm
-    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-    # add gfx950, gfx115x conditionally starting in ROCm 7.0
-    if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
-        PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
-    fi
+    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
    EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
    ;;
  *)
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -260,6 +260,12 @@ case "$tag" in
    HALIDE=yes
    TRITON=yes
    ;;
+  pytorch-linux-jammy-cuda12.8-py3.12-pallas)
+    CUDA_VERSION=12.8.1
+    ANACONDA_PYTHON_VERSION=3.12
+    GCC_VERSION=11
+    PALLAS=yes
+    ;;
  pytorch-linux-jammy-py3.12-triton-cpu)
    CUDA_VERSION=12.6
    ANACONDA_PYTHON_VERSION=3.12
@ -381,6 +387,7 @@ docker build \
       --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
       --build-arg "EXECUTORCH=${EXECUTORCH}" \
       --build-arg "HALIDE=${HALIDE}" \
+       --build-arg "PALLAS=${PALLAS}" \
       --build-arg "XPU_VERSION=${XPU_VERSION}" \
       --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
       --build-arg "ACL=${ACL:-}" \
--- a/.ci/docker/ci_commit_pins/jax.txt
+++ b/.ci/docker/ci_commit_pins/jax.txt
@ -0,0 +1 @@
+0.8.0
--- a/.ci/docker/common/install_jax.sh
+++ b/.ci/docker/common/install_jax.sh
@ -0,0 +1,40 @@
+#!/bin/bash
+
+set -ex
+
+source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
+
+# Get the pinned JAX version (same for all CUDA versions)
+JAX_VERSION=$(get_pinned_commit /ci_commit_pins/jax)
+
+function install_jax_12() {
+  echo "Installing JAX ${JAX_VERSION} with CUDA 12 support"
+  pip_install "jax[cuda12]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+
+  # Verify installation
+  python -c "import jax"  # check for errors
+  echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 12"
+}
+
+function install_jax_13() {
+  echo "Installing JAX ${JAX_VERSION} with CUDA 13 support"
+  pip_install "jax[cuda13]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+
+  # Verify installation
+  python -c "import jax"  # check for errors
+  echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 13"
+}
+
+# idiomatic parameter and option handling in sh
+while test $# -gt 0
+do
+    case "$1" in
+    12.4|12.6|12.6.*|12.8|12.8.*|12.9|12.9.*) install_jax_12;
+        ;;
+    13.0|13.0.*) install_jax_13;
+        ;;
+    *) echo "bad argument $1"; exit 1
+        ;;
+    esac
+    shift
+done
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -49,11 +49,7 @@ case ${DOCKER_TAG_PREFIX} in
        fi
        BASE_TARGET=rocm
        GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-        # add gfx950, gfx115x conditionally starting in ROCm 7.0
-        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
-        fi
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
        ;;
    *)
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -87,11 +87,7 @@ case ${image} in
        MANY_LINUX_VERSION="2_28"
        DEVTOOLSET_VERSION="11"
        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-        # add gfx950, gfx115x conditionally starting in ROCm 7.0
-        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
-        fi
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
        ;;
    manylinux2_28-builder:xpu)
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -143,6 +143,15 @@ COPY ci_commit_pins/halide.txt halide.txt
 RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
 RUN rm install_halide.sh common_utils.sh halide.txt

+ARG PALLAS
+ARG CUDA_VERSION
+# Install JAX with CUDA support (for Pallas)
+COPY ./common/install_jax.sh install_jax.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./ci_commit_pins/jax.txt /ci_commit_pins/jax.txt
+RUN if [ -n "${PALLAS}" ]; then bash ./install_jax.sh ${CUDA_VERSION}; fi
+RUN rm -f install_jax.sh common_utils.sh /ci_commit_pins/jax.txt
+
 ARG ONNX
 # Install ONNX dependencies
 COPY ./common/install_onnx.sh ./common/common_utils.sh ./
--- a/.ci/lumen_cli/cli/lib/common/cli_helper.py
+++ b/.ci/lumen_cli/cli/lib/common/cli_helper.py
@ -8,9 +8,11 @@ from abc import ABC, abstractmethod


 try:
-    from typing import Any, Callable, Required, TypedDict  # Python 3.11+
+    from collections.abc import Callable  # Python 3.11+
+    from typing import Any, Required, TypedDict
 except ImportError:
-    from typing import Any, Callable, TypedDict
+    from collections.abc import Callable
+    from typing import Any, TypedDict

    from typing_extensions import Required  # Fallback for Python <3.11

--- a/.ci/magma-rocm/README.md
+++ b/.ci/magma-rocm/README.md
@ -30,7 +30,6 @@ into a tarball, with the following structure:
 More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the ROCm version.
 Outputted binaries should be in the `output` folder.

-
 ## Pushing

 Packages can be uploaded to an S3 bucket using:
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -168,14 +168,16 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/compiler/latest/env/vars.sh
  # shellcheck disable=SC1091
+  source /opt/intel/oneapi/umf/latest/env/vars.sh
+  # shellcheck disable=SC1091
  source /opt/intel/oneapi/ccl/latest/env/vars.sh
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/mpi/latest/env/vars.sh
+  # shellcheck disable=SC1091
+  source /opt/intel/oneapi/pti/latest/env/vars.sh
  # Enable XCCL build
  export USE_XCCL=1
  export USE_MPI=0
-  # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
-  export USE_KINETO=0
  export TORCH_XPU_ARCH_LIST=pvc
 fi

--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -96,7 +96,6 @@ function pip_build_and_install() {
    python3 -m pip wheel \
      --no-build-isolation \
      --no-deps \
-      --no-use-pep517 \
      -w "${wheel_dir}" \
      "${build_target}"
  fi
@ -308,6 +307,28 @@ function install_torchao() {
  pip_build_and_install "git+https://github.com/pytorch/ao.git@${commit}" dist/ao
 }

+function install_flash_attn_cute() {
+  echo "Installing FlashAttention CuTe from GitHub..."
+  # Grab latest main til we have a pinned commit
+  local flash_attn_commit
+  flash_attn_commit=$(git ls-remote https://github.com/Dao-AILab/flash-attention.git HEAD | cut -f1)
+
+  # Clone the repo to a temporary directory
+  rm -rf flash-attention-build
+  git clone --depth 1 --recursive https://github.com/Dao-AILab/flash-attention.git flash-attention-build
+
+  pushd flash-attention-build
+  git checkout "${flash_attn_commit}"
+
+  # Install only the 'cute' sub-directory
+  pip_install -e flash_attn/cute/
+  popd
+
+  # remove the local repo
+  rm -rf flash-attention-build
+  echo "FlashAttention CuTe installation complete."
+}
+
 function print_sccache_stats() {
  echo 'PyTorch Build Statistics'
  sccache --show-stats
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -208,6 +208,8 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  source /opt/intel/oneapi/ccl/latest/env/vars.sh
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/mpi/latest/env/vars.sh
+  # shellcheck disable=SC1091
+  source /opt/intel/oneapi/pti/latest/env/vars.sh
  # Check XPU status before testing
  timeout 30 xpu-smi discovery || true
 fi
@ -337,13 +339,23 @@ test_python() {

 test_python_smoke() {
  # Smoke tests for H100/B200
-  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune inductor/test_cutedsl_grouped_mm $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
 }

 test_python_smoke_b200() {
-  # Targeted smoke tests for B200 - staged approach to avoid too many failures
-  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  # Targeted smoke tests for B200 including FlashAttention CuTe coverage
+  install_flash_attn_cute
+  time python test/run_test.py \
+    --include \
+      test_matmul_cuda \
+      test_scaled_matmul_cuda \
+      inductor/test_fp8 \
+      nn/attention/test_fa4 \
+      nn/attention/test_open_registry \
+      inductor/test_flex_flash \
+    $PYTHON_TEST_EXTRA_OPTION \
+    --upload-artifacts-while-running
  assert_git_not_dirty
 }

@ -824,6 +836,11 @@ test_inductor_halide() {
  assert_git_not_dirty
 }

+test_inductor_pallas() {
+  python test/run_test.py --include inductor/test_pallas.py --verbose
+  assert_git_not_dirty
+}
+
 test_inductor_triton_cpu() {
  python test/run_test.py --include inductor/test_triton_cpu_backend.py inductor/test_torchinductor_strided_blocks.py --verbose
  assert_git_not_dirty
@ -1724,6 +1741,8 @@ elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
 elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
  test_inductor_halide
+elif [[ "${TEST_CONFIG}" == *inductor-pallas* ]]; then
+  test_inductor_pallas
 elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
  test_inductor_triton_cpu
 elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@ -63,7 +63,7 @@ self-hosted-runner:
    - linux.rocm.gpu.gfx942.1
    - linux.rocm.gpu.gfx942.2
    - linux.rocm.gpu.gfx942.4
-    - rocm-docker
+    - linux.rocm.gfx942.docker-cache
    # Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
    - macos-m1-stable
    - macos-m1-14
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-ad5816f0eee1c873df1b7d371c69f1f811a89387
+07b6cbde121417a70e4dc871adb6d27030e0ce3f
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@ -1 +1 @@
-ca2212438fdd8ce29b66999ed70ed54b0f9372d1
+acccf86477759b2d3500f1ae1be065f7b1e409ec
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-c8b09f5f77d6bf6fb7ed7a9aa83e5d8156b3a5e9
+e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -165,3 +165,16 @@
 - torch/_inductor/kernel/mm.py
 - test/inductor/test_max_autotune.py
 - third_party/fbgemm
+
+"ciflow/mps":
+- aten/src/ATen/mps/**
+- aten/src/ATen/native/mps/**
+- torch/_inductor/codegen/mps.py
+- test/test_mps.py
+- test/inductor/test_mps_basic.py
+
+"ciflow/h100-symm-mem":
+- torch/csrc/distributed/c10d/symm_mem/**
+- torch/distributed/_symmetric_memory/**
+- test/distributed/**/*mem*
+- test/distributed/**/*mem*/**
--- a/.github/nitpicks.yml
+++ b/.github/nitpicks.yml
@ -10,3 +10,4 @@
  pathFilter:
    - 'torch/csrc/inductor/aoti_torch/c/*'
    - 'torch/csrc/inductor/aoti_torch/generated/*'
+    - 'torch/csrc/stable/c/*'
--- a/.github/scripts/delete_old_branches.py
+++ b/.github/scripts/delete_old_branches.py
@ -1,10 +1,11 @@
 # Delete old branches
 import os
 import re
+from collections.abc import Callable
 from datetime import datetime
 from functools import lru_cache
 from pathlib import Path
-from typing import Any, Callable
+from typing import Any

 from github_utils import gh_fetch_json_dict, gh_graphql
 from gitutils import GitRepo
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@ -8,10 +8,11 @@ import re
 import subprocess
 import sys
 import warnings
+from collections.abc import Callable
 from enum import Enum
 from functools import cache
 from logging import info
-from typing import Any, Callable, Optional
+from typing import Any, Optional
 from urllib.request import Request, urlopen

 import yaml
--- a/.github/scripts/get_workflow_job_id.py
+++ b/.github/scripts/get_workflow_job_id.py
@ -11,7 +11,8 @@ import sys
 import time
 import urllib
 import urllib.parse
-from typing import Any, Callable, Optional
+from collections.abc import Callable
+from typing import Any, Optional
 from urllib.request import Request, urlopen


--- a/.github/scripts/github_utils.py
+++ b/.github/scripts/github_utils.py
@ -3,8 +3,9 @@
 import json
 import os
 import warnings
+from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Any, Callable, cast, Optional, Union
+from typing import Any, cast, Optional, Union
 from urllib.error import HTTPError
 from urllib.parse import quote
 from urllib.request import Request, urlopen
--- a/.github/scripts/gitutils.py
+++ b/.github/scripts/gitutils.py
@ -4,10 +4,10 @@ import os
 import re
 import tempfile
 from collections import defaultdict
-from collections.abc import Iterator
+from collections.abc import Callable, Iterator
 from datetime import datetime
 from functools import wraps
-from typing import Any, Callable, cast, Optional, TypeVar, Union
+from typing import Any, cast, Optional, TypeVar, Union


 T = TypeVar("T")
--- a/.github/scripts/lintrunner.sh
+++ b/.github/scripts/lintrunner.sh
@ -34,6 +34,9 @@ python3 torch/utils/data/datapipes/gen_pyi.py
 # Also check generated pyi files
 find torch -name '*.pyi' -exec git add --force -- "{}" +

+# Print current environment
+python3 -m pip freeze
+
 RC=0
 # Run lintrunner on all files
 if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -17,12 +17,12 @@ import re
 import time
 import urllib.parse
 from collections import defaultdict
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from functools import cache
 from pathlib import Path
 from re import Pattern
-from typing import Any, Callable, cast, NamedTuple, Optional
+from typing import Any, cast, NamedTuple, Optional
 from warnings import warn

 import yaml
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -67,6 +67,7 @@ jobs:
          pytorch-linux-jammy-py3.10-gcc11,
          pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
          pytorch-linux-jammy-py3.12-halide,
+          pytorch-linux-jammy-cuda12.8-py3.12-pallas,
          pytorch-linux-jammy-xpu-n-1-py3,
          pytorch-linux-noble-xpu-n-py3,
          pytorch-linux-noble-xpu-n-py3-inductor-benchmarks,
@ -118,6 +119,22 @@ jobs:
        with:
          docker-image: ${{ steps.build-docker-image.outputs.docker-image }}

+      - name: Generate output
+        if: contains(matrix.docker-image-name, 'rocm')
+        id: generate_output
+        run: |
+          docker_image_name="${{ matrix.docker-image-name }}"
+          docker_image_tag="${{ steps.build-docker-image.outputs.docker-image }}"
+          echo "${docker_image_name}=${docker_image_tag}" >> docker-builds-output-${docker_image_name}.txt
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4.4.0
+        if: contains(matrix.docker-image-name, 'rocm')
+        with:
+          name: docker-builds-artifacts-${{ matrix.docker-image-name }}
+          retention-days: 14
+          path: ./docker-builds-output-${{ matrix.docker-image-name }}.txt
+
      - uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0
        name: Push to https://ghcr.io/
        id: push-to-ghcr-io
--- a/.github/workflows/docker-cache-mi300.yml
+++ b/.github/workflows/docker-cache-mi300.yml
@ -1,55 +0,0 @@
-name: docker-cache-mi300
-
-on:
-  # run every 6 hours
-  schedule:
-    - cron: 0 0,6,12,18 * * *
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  docker-cache:
-    if: github.repository_owner == 'pytorch'
-    runs-on: rocm-docker
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-        with:
-          no-sudo: true
-
-      - name: configure aws credentials
-        id: aws_creds
-        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
-        with:
-          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-          aws-region: us-east-1
-          role-duration-seconds: 18000
-
-      - name: Login to Amazon ECR
-        id: login-ecr
-        continue-on-error: false
-        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
-
-      - name: Calculate docker image
-        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-          docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
-          push: false
-
-      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
-        with:
-          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
-
-      - name: Tar and upload to S3 bucket
-        run: |
-          sudo docker save -o ~/docker-data/pytorch/pytorch_docker_image.tar ${{ steps.calculate-docker-image.outputs.docker-image }}
-          sudo rclone copy -P --s3-upload-concurrency 64 --s3-chunk-size 200M --s3-upload-cutoff 300M ~/docker-data/pytorch/pytorch_docker_image.tar oci:pytorchbucket0002/pytorch_docker_image --progress
--- a/.github/workflows/docker-cache-rocm.yml
+++ b/.github/workflows/docker-cache-rocm.yml
@ -0,0 +1,105 @@
+name: docker-cache-rocm
+
+on:
+  workflow_run:
+    workflows: [docker-builds]
+    branches: [main, release]
+    types:
+      - completed
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: read
+  actions: read
+
+jobs:
+  download-docker-builds-artifacts:
+    if: github.repository_owner == 'pytorch'
+    name: download-docker-builds-artifacts
+    runs-on: ubuntu-latest
+    outputs:
+      pytorch-linux-jammy-rocm-n-py3: ${{ steps.process-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}
+      pytorch-linux-noble-rocm-n-py3: ${{ steps.process-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}
+      pytorch-linux-jammy-rocm-n-py3-benchmarks: ${{ steps.process-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}
+    steps:
+      - name: Download artifacts
+        uses: actions/download-artifact@v4.1.7
+        with:
+          run-id: ${{ github.event.workflow_run.id }}
+          path: ./docker-builds-artifacts
+          merge-multiple: true
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Process artifacts
+        id: process-artifacts
+        run: |
+          ls -R ./docker-builds-artifacts
+          cat ./docker-builds-artifacts/*txt >> "${GITHUB_OUTPUT}"
+          cat "${GITHUB_OUTPUT}"
+
+  docker-cache:
+    if: github.repository_owner == 'pytorch'
+    needs: download-docker-builds-artifacts
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux.rocm.gfx942.docker-cache]
+        docker-image: [
+          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}",
+          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}",
+          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}"
+        ]
+    runs-on: "${{ matrix.runner }}"
+    steps:
+      - name: debug
+        run: |
+          JSON_STRINGIFIED="${{ toJSON(needs.download-docker-builds-artifacts.outputs) }}"
+          echo "Outputs of download-docker-builds-artifacts job: ${JSON_STRINGIFIED}"
+
+      - name: configure aws credentials
+        id: aws_creds
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+          aws-region: us-east-1
+          role-duration-seconds: 18000
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        continue-on-error: false
+        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
+
+      - name: Generate ghrc.io tag
+        id: ghcr-io-tag
+        run: |
+            ecr_image="${{ matrix.docker-image }}"
+            ghcr_image="ghcr.io/pytorch/ci-image:${ecr_image##*:}"
+            echo "ghcr_image=${ghcr_image}" >> "$GITHUB_OUTPUT"
+
+      - name: Pull docker image
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        with:
+          docker-image: ${{ steps.ghcr-io-tag.outputs.ghcr_image }}
+
+      - name: Save as tarball
+        run: |
+          docker_image_tag=${{ matrix.docker-image }}
+          docker_image_tag="${docker_image_tag#*:}" # Remove everything before and including first ":"
+          docker_image_tag="${docker_image_tag%-*}" # Remove everything after and including last "-"
+          ref_name=${{ github.event.workflow_run.head_branch }}
+          if [[ $ref_name =~ "release/" ]]; then
+            ref_suffix="release"
+          elif [[ $ref_name == "main" ]]; then
+            ref_suffix="main"
+          else
+            echo "Unexpected branch in ref_name: ${ref_name}" && exit 1
+          fi
+          docker tag ${{ steps.ghcr-io-tag.outputs.ghcr_image }} ${{ matrix.docker-image }}
+          # mv is atomic operation, so we use intermediate tar.tmp file to prevent read-write contention
+          docker save -o ~/pytorch-data/docker/${docker_image_tag}.tar.tmp ${{ matrix.docker-image }}
+          mv ~/pytorch-data/docker/${docker_image_tag}.tar.tmp ~/pytorch-data/docker/${docker_image_tag}_${ref_suffix}.tar
--- a/.github/workflows/h100-distributed.yml
+++ b/.github/workflows/h100-distributed.yml
@ -37,7 +37,6 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: "linux.c7i.12xlarge"
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '9.0'
--- a/.github/workflows/inductor-rocm-mi200.yml
+++ b/.github/workflows/inductor-rocm-mi200.yml
@ -1,4 +1,4 @@
-name: inductor-rocm
+name: inductor-rocm-mi200

 on:
  schedule:
--- a/.github/workflows/inductor-unittest.yml
+++ b/.github/workflows/inductor-unittest.yml
@ -81,6 +81,32 @@ jobs:
      test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }}
    secrets: inherit

+  inductor-pallas-build:
+    name: inductor-pallas-build
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      build-environment: linux-jammy-cuda12.8-py3.12-gcc11
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-py3.12-pallas
+      cuda-arch-list: '8.9'
+      runner: linux.8xlarge.memory
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      test-matrix: |
+        { include: [
+          { config: "inductor-pallas", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
+  inductor-pallas-test:
+    name: inductor-pallas-test
+    uses: ./.github/workflows/_linux-test.yml
+    needs: inductor-pallas-build
+    with:
+      build-environment: linux-jammy-py3.12-gcc11
+      docker-image: ${{ needs.inductor-pallas-build.outputs.docker-image }}
+      test-matrix: ${{ needs.inductor-pallas-build.outputs.test-matrix }}
+    secrets: inherit
+
  inductor-triton-cpu-build:
    name: inductor-triton-cpu-build
    uses: ./.github/workflows/_linux-build.yml
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@ -5,9 +5,11 @@ on:
    - cron: 0 0 * * *
  push:
    tags:
-      # NOTE: Doc build pipelines should only get triggered on release candidate builds
-      # Release candidate tags look like: v1.11.0-rc1
-      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+      # NOTE: Doc build pipelines should only get triggered on:
+      # Major or minor release candidates builds
+      - v[0-9]+.[0-9]+.0+-rc[0-9]+
+      # Final RC for major, minor and patch releases
+      - v[0-9]+.[0-9]+.[0-9]+
      - ciflow/nightly/*
  workflow_dispatch:

--- a/.github/workflows/rocm-mi200.yml
+++ b/.github/workflows/rocm-mi200.yml
@ -1,4 +1,4 @@
-name: rocm
+name: rocm-mi200

 on:
  push:
--- a/.github/workflows/test-b200.yml
+++ b/.github/workflows/test-b200.yml
@ -5,7 +5,9 @@
 # Flow:
 # 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200
 # 2. Runs smoke tests on linux.dgx.b200 runner
-# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function
+# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke_b200() function
+#    - Includes matmul, scaled_matmul, FP8, and FlashAttention CuTe tests
+#    - FlashAttention CuTe DSL is installed as part of test execution
 #
 # Triggered by:
 # - Pull requests modifying this workflow file
--- a/.github/workflows/test-h100.yml
+++ b/.github/workflows/test-h100.yml
@ -41,7 +41,6 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '9.0'
--- a/.github/workflows/trunk-rocm-mi300.yml
+++ b/.github/workflows/trunk-rocm-mi300.yml
@ -0,0 +1,83 @@
+name: trunk-rocm-mi300
+
+on:
+  push:
+    branches:
+      - main
+      - release/*
+  workflow_dispatch:
+  schedule:
+    - cron: 29 8 * * *  # about 1:29am PDT
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  llm-td:
+    if: github.repository_owner == 'pytorch'
+    name: before-test
+    uses: ./.github/workflows/llm_td_retrieval.yml
+    permissions:
+      id-token: write
+      contents: read
+
+  target-determination:
+    name: before-test
+    uses: ./.github/workflows/target_determination.yml
+    needs: llm-td
+    permissions:
+      id-token: write
+      contents: read
+
+  get-label-type:
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+
+  linux-jammy-rocm-py3_10-build:
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
+      sync-tag: rocm-build
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
+          { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
+          { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
+          { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
+          { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
+          { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1.b" },
+          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" },
+          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" },
+          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4.b" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-rocm-py3_10-test:
+    permissions:
+      id-token: write
+      contents: read
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_rocm-test.yml
+    needs:
+      - linux-jammy-rocm-py3_10-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
+    secrets: inherit
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@ -5,6 +5,7 @@ on:
    workflows:
      - pull
      - trunk
+      - trunk-rocm-mi300
      - periodic
      - periodic-rocm-mi200
      - periodic-rocm-mi300
--- a/.gitignore
+++ b/.gitignore
@ -127,6 +127,7 @@ torch/test/
 torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
 torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
 torch/version.py
+torch/_inductor/kernel/vendored_templates/*
 minifier_launcher.py
 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d*
 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d*
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -186,6 +186,8 @@ include_patterns = [
    'aten/src/ATen/native/nested/cuda/*.h',
    'aten/src/ATen/native/nested/*.cpp',
    'aten/src/ATen/native/nested/*.h',
+    'aten/src/ATen/xpu/**/*.h',
+    'aten/src/ATen/xpu/**/*.cpp',
    'c10/**/*.cpp',
    'c10/**/*.h',
    'torch/*.h',
@ -1402,7 +1404,7 @@ init_command = [
    '--dry-run={{DRYRUN}}',
    'usort==1.0.8.post1',
    'isort==6.0.1',
-    'ruff==0.13.1',  # sync with RUFF
+    'ruff==0.14.4',  # sync with RUFF
 ]
 is_formatter = true

@ -1537,7 +1539,7 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
-    'ruff==0.13.1',  # sync with PYFMT
+    'ruff==0.14.4',  # sync with PYFMT
 ]
 is_formatter = true

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -736,6 +736,44 @@ if(NOT DEFINED USE_BLAS)
  set(USE_BLAS ON)
 endif()

+# Prioritized Text Linker Optimization
+if(USE_PRIORITIZED_TEXT_FOR_LD)
+
+  set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
+  set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
+
+  execute_process(
+    COMMAND ${Python_EXECUTABLE}
+            ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py
+            --filein "${LINKER_SCRIPT_FILE_IN}"
+            --fout  "${LINKER_SCRIPT_FILE_OUT}"
+    RESULT_VARIABLE _gen_result
+    OUTPUT_VARIABLE _gen_output
+    ERROR_VARIABLE  _gen_error
+  )
+
+  if(NOT _gen_result EQUAL 0)
+    message(FATAL_ERROR
+      "Failed to generate linker script:\n${_gen_output}\n${_gen_error}")
+  endif()
+
+  append_cxx_flag_if_supported("-ffunction-sections" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-fdata-sections" CMAKE_CXX_FLAGS)
+  append_c_flag_if_supported("-ffunction-sections" CMAKE_C_FLAGS)
+  append_c_flag_if_supported("-fdata-sections" CMAKE_C_FLAGS)
+
+  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}")
+  set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}")
+
+else()
+  if(LINUX AND CPU_AARCH64)
+    message(WARNING [[
+    It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
+    To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
+    ]])
+  endif()
+endif()
+
 # Build libtorch mobile library, which contains ATen/TH ops and native support
 # for TorchScript model, but doesn't contain not-yet-unified caffe2 ops;
 if(INTERN_BUILD_MOBILE)
@ -1402,9 +1440,6 @@ if(BUILD_JNI)
  add_subdirectory(android/pytorch_android)
 endif()

-include(cmake/Summary.cmake)
-caffe2_print_configuration_summary()
-
 # Parse custom debug info
 if(DEFINED USE_CUSTOM_DEBINFO)
  string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}")
@ -1444,56 +1479,5 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
          DESTINATION "${CMAKE_INSTALL_BINDIR}")
 endif()

-if(USE_PRIORITIZED_TEXT_FOR_LD)
-  add_compile_options(
-    $<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
-    $<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
-  )
-  set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
-  set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
-
-  add_custom_command(
-    OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
-    COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
-    DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
-    COMMENT "Generating prioritized text linker files"
-    VERBATIM
-  )
-
-  add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
-
-  if(BUILD_PYTHON)
-    set(LINKER_OPT_TARGETS torch_python)
-  endif()
-
-  if(NOT BUILD_LIBTORCHLESS)
-    list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
-    if(USE_CUDA)
-      list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
-    endif()
-    if(USE_XPU)
-      list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
-    endif()
-    if(USE_ROCM)
-      list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
-    endif()
-  endif()
-
-  foreach(tgt IN LISTS LINKER_OPT_TARGETS)
-    if(TARGET ${tgt})
-      add_dependencies("${tgt}" generate_linker_script)
-      target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
-      set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
-    else()
-       message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
-    endif()
-  endforeach()
-
-else()
-  if(LINUX AND CPU_AARCH64)
-    message(WARNING [[
-    It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
-    To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
-    ]])
-  endif()
-endif()
+include(cmake/Summary.cmake)
+caffe2_print_configuration_summary()
--- a/2
+++ b/2
@ -37,7 +37,7 @@ Copyright (c) 2024 Tri Dao.
 All rights reserved.

 All contributions by Arm:
-Copyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates
+Copyright (c) 2021, 2023-2025 Arm Limited and/or its affiliates

 All contributions from Caffe:
 Copyright(c) 2013, 2014, 2015, the respective contributors
--- a/SECURITY.md
+++ b/SECURITY.md
@ -18,6 +18,8 @@ Please report security issues using https://github.com/pytorch/pytorch/security/

 All reports submitted through the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create an [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework.

+**Note on crashes and out of bounds access**: PyTorch is a computational framework that performs operations on behalf of the caller. Like many low-level libraries, PyTorch generally does not validate all inputs to every function—the responsibility for providing valid arguments lies with the calling code. While crashes and out of bounds memory access should be reported as bugs, they are generally not considered security vulnerabilities in PyTorch's threat model.
+
 Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported:

 https://www.facebook.com/whitehat
--- a/aten/src/ATen/core/CachingHostAllocator.h
+++ b/aten/src/ATen/core/CachingHostAllocator.h
@ -226,8 +226,8 @@ template <
    typename B = HostBlock<S>>
 struct CachingHostAllocatorImpl {
  virtual ~CachingHostAllocatorImpl() {
-    active_ = false;
-    if (pinned_use_background_threads()) {
+    if (active_) {
+      active_ = false;
      getBackgroundThreadPool()->waitWorkComplete();
    }
  }
@ -260,6 +260,7 @@ struct CachingHostAllocatorImpl {
    if (pinned_use_background_threads()) {
      // Launch the background thread and process events in a loop.
      static bool background_thread_flag [[maybe_unused]] = [this] {
+        active_ = true;
        getBackgroundThreadPool()->run([&]() {
          while (active_) {
            process_events();
@ -683,9 +684,9 @@ struct CachingHostAllocatorImpl {
  alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
  std::deque<std::pair<E, B*>> events_; // event queue paired with block

-  // Indicates whether the object is active.
+  // Indicates whether the event-processing thread pool is active.
  // Set to false in the destructor to signal background threads to stop.
-  std::atomic<bool> active_{true};
+  std::atomic<bool> active_{false};
 protected:
  alignas(hardware_destructive_interference_size) HostStatsStaged stats_;
 };
--- a/aten/src/ATen/cuda/CUDAGraph.cpp
+++ b/aten/src/ATen/cuda/CUDAGraph.cpp
@ -175,7 +175,7 @@ void CUDAGraph::instantiate() {
    // who prefer not to report error message through these arguments moving forward
    // (they prefer return value, or errors on api calls internal to the capture)
 #if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000)
-    AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, cudaGraphInstantiateFlagUseNodePriority));
+    AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, 0));
 #else
    AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0));
 #endif
@ -184,7 +184,7 @@ void CUDAGraph::instantiate() {
  } else {
    AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
                                                graph_,
-                                                cudaGraphInstantiateFlagAutoFreeOnLaunch | cudaGraphInstantiateFlagUseNodePriority));
+                                                cudaGraphInstantiateFlagAutoFreeOnLaunch));
  }
  has_graph_exec_ = true;
 }
--- a/aten/src/ATen/cuda/NumericLimits.cuh
+++ b/aten/src/ATen/cuda/NumericLimits.cuh
@ -55,14 +55,6 @@ struct numeric_limits<int8_t> {
  static inline __host__ __device__ int8_t upper_bound() { return INT8_MAX; }
 };

-template <>
-struct numeric_limits<uint16_t> {
-  static inline __host__ __device__ uint16_t lowest() { return 0; }
-  static inline __host__ __device__ uint16_t max() { return UINT16_MAX; }
-  static inline __host__ __device__ uint16_t lower_bound() { return 0; }
-  static inline __host__ __device__ uint16_t upper_bound() { return UINT16_MAX; }
-};
-
 template <>
 struct numeric_limits<int16_t> {
  static inline __host__ __device__ int16_t lowest() { return INT16_MIN; }
@ -71,14 +63,6 @@ struct numeric_limits<int16_t> {
  static inline __host__ __device__ int16_t upper_bound() { return INT16_MAX; }
 };

-template <>
-struct numeric_limits<uint32_t> {
-  static inline __host__ __device__ uint32_t lowest() { return 0; }
-  static inline __host__ __device__ uint32_t max() { return UINT32_MAX; }
-  static inline __host__ __device__ uint32_t lower_bound() { return 0; }
-  static inline __host__ __device__ uint32_t upper_bound() { return UINT32_MAX; }
-};
-
 template <>
 struct numeric_limits<int32_t> {
  static inline __host__ __device__ int32_t lowest() { return INT32_MIN; }
@ -87,21 +71,6 @@ struct numeric_limits<int32_t> {
  static inline __host__ __device__ int32_t upper_bound() { return INT32_MAX; }
 };

-template <>
-struct numeric_limits<uint64_t> {
-#ifdef _MSC_VER
-  static inline __host__ __device__ uint64_t lowest() { return 0; }
-  static inline __host__ __device__ uint64_t max() { return _UI64_MAX; }
-  static inline __host__ __device__ uint64_t lower_bound() { return 0; }
-  static inline __host__ __device__ uint64_t upper_bound() { return _UI64_MAX; }
-#else
-  static inline __host__ __device__ uint64_t lowest() { return 0; }
-  static inline __host__ __device__ uint64_t max() { return UINT64_MAX; }
-  static inline __host__ __device__ uint64_t lower_bound() { return 0; }
-  static inline __host__ __device__ uint64_t upper_bound() { return UINT64_MAX; }
-#endif
-};
-
 template <>
 struct numeric_limits<int64_t> {
 #ifdef _MSC_VER
--- a/aten/src/ATen/cuda/cub.cuh
+++ b/aten/src/ATen/cuda/cub.cuh
@ -213,7 +213,7 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
      scan_op,
      num_items,
      at::cuda::getCurrentCUDAStream());
-  C10_HIP_KERNEL_LAUNCH_CHECK();
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
 #else
  // non synchronizing cub call
  // even though cub is supposed to support tensors with int_max elements, in reality it doesn't,
@ -471,7 +471,7 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
      init_value,
      num_items,
      at::cuda::getCurrentCUDAStream());
-  C10_HIP_KERNEL_LAUNCH_CHECK();
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
 #else
  // non synchronizing cub call
  // even though cub is supposed to support tensors with int_max elements, in reality it doesn't,
--- a/aten/src/ATen/functorch/BatchedTensorImpl.h
+++ b/aten/src/ATen/functorch/BatchedTensorImpl.h
@ -157,6 +157,8 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({
  DispatchKey::Negative,
  DispatchKey::Conjugate,
  DispatchKey::XLA,
+  DispatchKey::XPU,
+  DispatchKey::HPU,
  DispatchKey::CUDA,
  DispatchKey::CPU,
  DispatchKey::PrivateUse1,
--- a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h
+++ b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h
@ -1,239 +0,0 @@
-#pragma once
-
-#include <c10/hip/HIPCachingAllocator.h>
-
-// Use of c10::hip namespace here makes hipification easier, because
-// I don't have to also fix namespaces.  Sorry!
-namespace c10::hip {
-
-// Takes a valid HIPAllocator (of any sort) and turns it into
-// an allocator pretending to be a CUDA allocator.  See
-// Note [Masquerading as CUDA]
-class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllocator {
-  HIPCachingAllocator::HIPAllocator* allocator_;
-public:
-  explicit HIPAllocatorMasqueradingAsCUDA(HIPCachingAllocator::HIPAllocator* allocator)
-    : allocator_(allocator) {}
-
-  virtual ~HIPAllocatorMasqueradingAsCUDA() = default;
-
-  // From c10::Allocator
-
-  DataPtr allocate(size_t size) override {
-    DataPtr r = allocator_->allocate(size);
-    r.unsafe_set_device(Device(c10::DeviceType::CUDA, r.device().index()));
-    return r;
-  }
-
-  bool is_simple_data_ptr(const DataPtr& data_ptr) const override {
-    return allocator_->is_simple_data_ptr(data_ptr);
-  }
-
-  DeleterFnPtr raw_deleter() const override {
-    return allocator_->raw_deleter();
-  }
-
-  void copy_data(void* dest, const void* src, std::size_t count) const final {
-    allocator_->copy_data(dest, src, count);
-  }
-
-  // From DeviceAllocator
-
-  bool initialized() override {
-    return allocator_->initialized();
-  }
-
-  void emptyCache(MempoolId_t mempool_id = {0, 0}) override {
-    allocator_->emptyCache(mempool_id);
-  }
-
-  void recordStream(const DataPtr& ptr, c10::Stream stream) override {
-    HIPStream hip_stream = HIPStream(stream);
-    recordStream(ptr, hip_stream);
-  }
-
-  CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) override {
-    return allocator_->getDeviceStats(device);
-  }
-
-  void resetAccumulatedStats(c10::DeviceIndex device) override {
-    allocator_->resetAccumulatedStats(device);
-  }
-
-  void resetPeakStats(c10::DeviceIndex device) override {
-    allocator_->resetPeakStats(device);
-  }
-
-  // From CUDAAllocator
-
-  void* raw_alloc(size_t nbytes) override {
-    return allocator_->raw_alloc(nbytes);
-  }
-
-  void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) override {
-    return allocator_->raw_alloc_with_stream(nbytes, stream);
-  }
-
-  void raw_delete(void* ptr) override {
-    allocator_->raw_delete(ptr);
-  }
-
-  void init(int device_count) override {
-    allocator_->init(device_count);
-  }
-
-  double getMemoryFraction(c10::DeviceIndex device) override {
-    return allocator_->getMemoryFraction(device);
-  }
-
-  void setMemoryFraction(double fraction, c10::DeviceIndex device) override {
-    allocator_->setMemoryFraction(fraction, device);
-  }
-
-  std::vector<HIPCachingAllocator::StreamSegmentSize> getExpandableSegmentSizes(c10::DeviceIndex device) override {
-    return allocator_->getExpandableSegmentSizes(device);
-  }
-
-  void enable(bool value) override {
-    allocator_->enable(value);
-  }
-
-  bool isEnabled() const override {
-    return allocator_->isEnabled();
-  }
-
-  void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) override {
-    allocator_->cacheInfo(device, largestBlock);
-  }
-
-  void* getBaseAllocation(void* ptr, size_t* size) override {
-    return allocator_->getBaseAllocation(ptr, size);
-  }
-
-  void recordStream(const DataPtr& ptr, HIPStream stream) override {
-    allocator_->recordStream(ptr, stream);
-  }
-
-  HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) override {
-    return allocator_->snapshot(mempool_id);
-  }
-
-  void beginAllocateToPool(
-      c10::DeviceIndex device,
-      MempoolId_t mempool_id,
-      std::function<bool(hipStream_t)> filter) override {
-    allocator_->beginAllocateToPool(device, mempool_id, filter);
-  }
-
-  void endAllocateToPool(
-      c10::DeviceIndex device,
-      MempoolId_t mempool_id) override {
-    allocator_->endAllocateToPool(device, mempool_id);
-  }
-
-  void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) override {
-    allocator_->releasePool(device, mempool_id);
-  }
-
-  int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) override {
-    return allocator_->getPoolUseCount(device, mempool_id);
-  }
-
-  void createOrIncrefPool(
-      c10::DeviceIndex device,
-      MempoolId_t mempool_id,
-      HIPAllocator* allocator = nullptr) override {
-    allocator_->createOrIncrefPool(device, mempool_id, allocator);
-  }
-
-  void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) override {
-    allocator_->setUseOnOOM(device, mempool_id);
-  }
-
-  bool checkPoolLiveAllocations(
-      c10::DeviceIndex device,
-      MempoolId_t mempool_id,
-      const std::unordered_set<void*>& expected_live_allocations) override {
-    return allocator_->checkPoolLiveAllocations(device, mempool_id, expected_live_allocations);
-  }
-
-  HIPCachingAllocator::ShareableHandle shareIpcHandle(void* ptr) override {
-    return allocator_->shareIpcHandle(ptr);
-  }
-
-  std::shared_ptr<void> getIpcDevPtr(std::string handle) override {
-    return allocator_->getIpcDevPtr(handle);
-  }
-
-  bool isHistoryEnabled() override {
-    return allocator_->isHistoryEnabled();
-  }
-
-  void recordHistory(
-      bool enabled,
-      HIPCachingAllocator::CreateContextFn context_recorder,
-      size_t alloc_trace_max_entries,
-      HIPCachingAllocator::RecordContext when,
-      bool clearHistory) override {
-    allocator_->recordHistory(enabled, context_recorder, alloc_trace_max_entries, when, clearHistory);
-  }
-
-  void recordAnnotation(
-      const std::vector<std::pair<std::string, std::string>>& md) override {
-    allocator_->recordAnnotation(md);
-  }
-
-  void pushCompileContext(std::string& md) override {
-    allocator_->pushCompileContext(md);
-  }
-
-  void popCompileContext() override {
-    allocator_->popCompileContext();
-  }
-
-  void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) override {
-    allocator_->attachOutOfMemoryObserver(observer);
-  }
-
-  void attachAllocatorTraceTracker(HIPCachingAllocator::AllocatorTraceTracker tracker) override {
-    allocator_->attachAllocatorTraceTracker(tracker);
-  }
-
-  void enablePeerAccess(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) override {
-    allocator_->enablePeerAccess(dev, dev_to_access);
-  }
-
-  hipError_t memcpyAsync(
-      void* dst,
-      int dstDevice,
-      const void* src,
-      int srcDevice,
-      size_t count,
-      hipStream_t stream,
-      bool p2p_enabled) override {
-    return allocator_->memcpyAsync(dst, dstDevice, src, srcDevice, count, stream, p2p_enabled);
-  }
-
-  std::shared_ptr<HIPCachingAllocator::AllocatorState> getCheckpointState(
-      c10::DeviceIndex device,
-      MempoolId_t id) override {
-    return allocator_->getCheckpointState(device, id);
-  }
-
-  HIPCachingAllocator::CheckpointDelta setCheckpointPoolState(
-      c10::DeviceIndex device,
-      std::shared_ptr<HIPCachingAllocator::AllocatorState> pps) override {
-    auto cpd = allocator_->setCheckpointPoolState(device, pps);
-    for (auto& ptr : cpd.dataptrs_allocd) {
-      ptr.unsafe_set_device(Device(c10::DeviceType::CUDA, ptr.device().index()));
-    }
-    return cpd;
-  }
-
-  std::string name() override {
-    return allocator_->name();
-  }
-
-};
-
-} // namespace c10::hip
--- a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp
+++ b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp
@ -1,18 +0,0 @@
-#include <c10/hip/HIPCachingAllocator.h>
-#include <ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h>
-#include <ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h>
-
-namespace c10 { namespace hip {
-namespace HIPCachingAllocatorMasqueradingAsCUDA {
-
-HIPCachingAllocator::HIPAllocator* get() {
-  static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get());
-  return &allocator;
-}
-
-void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsCUDA stream) {
-  HIPCachingAllocator::recordStream(ptr, stream.hip_stream());
-}
-
-} // namespace HIPCachingAllocatorMasqueradingAsCUDA
-}} // namespace c10::hip
--- a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h
+++ b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h
@ -1,194 +0,0 @@
-#pragma once
-
-#include <c10/hip/HIPCachingAllocator.h>
-#include <ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h>
-#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
-
-namespace c10 {
-// forward declaration
-class DataPtr;
-namespace hip {
-namespace HIPCachingAllocatorMasqueradingAsCUDA {
-
-C10_HIP_API HIPCachingAllocator::HIPAllocator* get();
-C10_HIP_API void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsCUDA stream);
-
-inline void* raw_alloc(size_t nbytes) {
-  return get()->raw_alloc(nbytes);
-}
-
-inline void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) {
-  return get()->raw_alloc_with_stream(nbytes, stream);
-}
-
-inline void raw_delete(void* ptr) {
-  return get()->raw_delete(ptr);
-}
-
-inline void init(int device_count) {
-  return get()->init(device_count);
-}
-
-inline double getMemoryFraction(c10::DeviceIndex device) {
-  return get()->getMemoryFraction(device);
-}
-
-inline void setMemoryFraction(double fraction, c10::DeviceIndex device) {
-  return get()->setMemoryFraction(fraction, device);
-}
-
-inline void emptyCache(MempoolId_t mempool_id = {0, 0}) {
-  return get()->emptyCache(mempool_id);
-}
-
-inline void enable(bool value) {
-  return get()->enable(value);
-}
-
-inline bool isEnabled() {
-  return get()->isEnabled();
-}
-
-inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) {
-  return get()->cacheInfo(device, largestBlock);
-}
-
-inline void* getBaseAllocation(void* ptr, size_t* size) {
-  return get()->getBaseAllocation(ptr, size);
-}
-
-inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
-    c10::DeviceIndex device) {
-  return get()->getDeviceStats(device);
-}
-
-inline void resetAccumulatedStats(c10::DeviceIndex device) {
-  return get()->resetAccumulatedStats(device);
-}
-
-inline void resetPeakStats(c10::DeviceIndex device) {
-  return get()->resetPeakStats(device);
-}
-
-inline HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) {
-  return get()->snapshot(mempool_id);
-}
-
-inline std::shared_ptr<HIPCachingAllocator::AllocatorState> getCheckpointState(
-    c10::DeviceIndex device,
-    MempoolId_t id) {
-  return get()->getCheckpointState(device, id);
-}
-
-inline HIPCachingAllocator::CheckpointDelta setCheckpointPoolState(
-    c10::DeviceIndex device,
-    std::shared_ptr<HIPCachingAllocator::AllocatorState> pps) {
-  return get()->setCheckpointPoolState(device, std::move(pps));
-}
-
-inline void beginAllocateToPool(
-    c10::DeviceIndex device,
-    MempoolId_t mempool_id,
-    std::function<bool(hipStream_t)> filter) {
-  get()->beginAllocateToPool(device, mempool_id, std::move(filter));
-}
-
-inline void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) {
-  get()->endAllocateToPool(device, mempool_id);
-}
-
-inline void recordHistory(
-    bool enabled,
-    HIPCachingAllocator::CreateContextFn context_recorder,
-    size_t alloc_trace_max_entries,
-    HIPCachingAllocator::RecordContext when,
-    bool clearHistory) {
-  return get()->recordHistory(
-      enabled, context_recorder, alloc_trace_max_entries, when, clearHistory);
-}
-
-inline void recordAnnotation(
-    const std::vector<std::pair<std::string, std::string>>& md) {
-  return get()->recordAnnotation(md);
-}
-
-inline void pushCompileContext(std::string& md) {
-  return get()->pushCompileContext(md);
-}
-
-inline void popCompileContext() {
-  return get()->popCompileContext();
-}
-
-inline bool isHistoryEnabled() {
-  return get()->isHistoryEnabled();
-}
-
-inline bool checkPoolLiveAllocations(
-    c10::DeviceIndex device,
-    MempoolId_t mempool_id,
-    const std::unordered_set<void*>& expected_live_allocations) {
-  return get()->checkPoolLiveAllocations(
-      device, mempool_id, expected_live_allocations);
-}
-
-inline void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) {
-  return get()->attachOutOfMemoryObserver(std::move(observer));
-}
-
-inline void attachAllocatorTraceTracker(HIPCachingAllocator::AllocatorTraceTracker tracker) {
-  return get()->attachAllocatorTraceTracker(std::move(tracker));
-}
-
-inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) {
-  return get()->releasePool(device, mempool_id);
-}
-
-inline void createOrIncrefPool(
-    c10::DeviceIndex device,
-    MempoolId_t mempool_id,
-    HIPCachingAllocator::HIPAllocator* allocator_ptr = nullptr) {
-  get()->createOrIncrefPool(device, mempool_id, allocator_ptr);
-}
-
-inline void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) {
-  get()->setUseOnOOM(device, mempool_id);
-}
-
-inline int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) {
-  return get()->getPoolUseCount(device, mempool_id);
-}
-
-inline std::shared_ptr<void> getIpcDevPtr(std::string handle) {
-  return get()->getIpcDevPtr(std::move(handle));
-}
-
-inline HIPCachingAllocator::ShareableHandle shareIpcHandle(void* ptr) {
-  return get()->shareIpcHandle(ptr);
-}
-
-inline std::string name() {
-  return get()->name();
-}
-
-inline hipError_t memcpyAsync(
-    void* dst,
-    int dstDevice,
-    const void* src,
-    int srcDevice,
-    size_t count,
-    hipStream_t stream,
-    bool p2p_enabled) {
-  return get()->memcpyAsync(
-      dst, dstDevice, src, srcDevice, count, stream, p2p_enabled);
-}
-
-inline void enablePeerAccess(
-    c10::DeviceIndex dev,
-    c10::DeviceIndex dev_to_access) {
-  return get()->enablePeerAccess(dev, dev_to_access);
-}
-
-} // namespace HIPCachingAllocatorMasqueradingAsCUDA
-} // namespace hip
-} // namespace c10
--- a/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.cpp
+++ b/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.cpp
@ -1,14 +0,0 @@
-#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
-
-// THIS IS A MASSIVE HACK.  This will BREAK you Caffe2 CUDA code if you
-// load ATen_hip, even if you don't ever actually use ATen_hip at runtime.
-//
-// If you ever link ATen_hip statically into the full library along
-// with ATen_cuda (libomnibus), the loading order of this versus the regular
-// ATen_cuda will be nondeterministic, and you'll nondeterministically get
-// one or the other.  (This will be obvious because all of your code
-// will fail.)
-//
-// This hack can be removed once PyTorch is out-of-place HIPified, and
-// doesn't pretend CUDA is HIP.
-C10_REGISTER_GUARD_IMPL(CUDA, at::cuda::HIPGuardImplMasqueradingAsCUDA)
--- a/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h
+++ b/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h
@ -1,383 +0,0 @@
-#pragma once
-
-#include <ATen/hip/HIPConfig.h>
-
-// The includes of HIPGuard.h
-#include <c10/hip/impl/HIPGuardImpl.h>
-#include <c10/hip/HIPMacros.h>
-#include <c10/core/DeviceType.h>
-#include <c10/core/impl/InlineDeviceGuard.h>
-#include <c10/core/impl/InlineStreamGuard.h>
-#include <c10/util/Exception.h>
-
-#include <c10/hip/impl/HIPGuardImpl.h>
-
-#include <ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h>
-#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
-
-// Use of c10::hip namespace here makes hipification easier, because
-// I don't have to also fix namespaces.  Sorry!
-namespace c10 { namespace hip {
-
-// Note [Masquerading as CUDA]
-// ~~~~~~~~~~~~~~~~~~~~~~~~~~~
-// c10_hip is very easy to understand: it is HIPified from c10_cuda,
-// and anywhere you said CUDA, the source code now says HIP.  HIPified
-// PyTorch is much harder to understand: it is HIPified from regular
-// PyTorch, yes, but NO source-to-source translation from CUDA to
-// HIP occurs; instead, anywhere we see "CUDA", it actually means "HIP".
-// For example, when you use HIPified PyTorch, you say x.cuda() to
-// move a tensor onto ROCm device.  We call this situation "HIP
-// masquerading as CUDA".
-//
-// This leads to a very awkward situation when we want to call c10_hip
-// code from PyTorch, since c10_hip is expecting things to be called
-// HIP, but PyTorch is calling them CUDA (masquerading as HIP).  To
-// fix this impedance mismatch, we have MasqueradingAsCUDA variants
-// for all c10_hip classes.  These translate between the "HIP" and "CUDA
-// masquerading as HIP" worlds.  For example,
-// HIPGuardImplMasqueradingAsCUDA (this file) provides something like a
-// HIPGuardImpl, but it reports its DeviceType as CUDA (e.g., type()
-// returns CUDA, getDevice() reports the current HIP device as a CUDA
-// device.)
-//
-// We should be able to delete all of these classes entirely once
-// we switch PyTorch to calling a HIP a HIP.
-//
-// When you add a new MasqueradingAsCUDA class/function, you need to
-// also update the rewrite rules in torch/utils/hipify/cuda_to_hip_mappings.py
-//
-//
-//
-// By the way, note that the cpp file associated with this also
-// *overwrites* the entry in the DeviceGuardImpl registry for CUDA with
-// this HIP implementation.
-
-struct HIPGuardImplMasqueradingAsCUDA final : public c10::impl::DeviceGuardImplInterface {
-  static constexpr c10::DeviceType static_type = c10::DeviceType::CUDA;
-  HIPGuardImplMasqueradingAsCUDA() {}
-  HIPGuardImplMasqueradingAsCUDA(c10::DeviceType t) {
-    TORCH_INTERNAL_ASSERT(t == c10::DeviceType::CUDA);
-  }
-  c10::DeviceType type() const override {
-    return c10::DeviceType::CUDA;
-  }
-  Device exchangeDevice(Device d) const override {
-    TORCH_INTERNAL_ASSERT(d.is_cuda());
-    Device old_device = getDevice();
-    if (old_device.index() != d.index()) {
-      C10_HIP_CHECK(hipSetDevice(d.index()));
-    }
-    return old_device;
-  }
-  Device getDevice() const override {
-    int device;
-    C10_HIP_CHECK(hipGetDevice(&device));
-    return Device(c10::DeviceType::CUDA, device);
-  }
-  void setDevice(Device d) const override {
-    TORCH_INTERNAL_ASSERT(d.is_cuda());
-    C10_HIP_CHECK(hipSetDevice(d.index()));
-  }
-  void uncheckedSetDevice(Device d) const noexcept override {
-    C10_HIP_CHECK_WARN(hipSetDevice(d.index()));
-  }
-  Stream getStream(Device d) const override {
-    return getCurrentHIPStreamMasqueradingAsCUDA(d.index()).unwrap();
-  }
-  Stream getDefaultStream(Device d) const override {
-    return getDefaultHIPStreamMasqueradingAsCUDA(d.index());
-  }
-  Stream getNewStream(Device d, int priority = 0) const override {
-    return getStreamFromPoolMasqueradingAsCUDA(priority, d.index());
-  }
-  Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false) const override {
-    return getStreamFromPoolMasqueradingAsCUDA(isHighPriority, d.index());
-  }
-  Stream exchangeStream(Stream s) const override {
-    HIPStreamMasqueradingAsCUDA cs(s);
-    auto old_stream = getCurrentHIPStreamMasqueradingAsCUDA(s.device().index());
-    setCurrentHIPStreamMasqueradingAsCUDA(cs);
-    return old_stream.unwrap();
-  }
-  DeviceIndex deviceCount() const noexcept override {
-    int deviceCnt;
-    hipError_t _err;
-    _err = hipGetDeviceCount(&deviceCnt);
-    if(_err != hipErrorNoDevice && _err != hipSuccess)
-        C10_HIP_CHECK(_err);
-    return deviceCnt;
-  }
-
-  // Event-related functions
-  // Note: hipEventCreateWithFlags should be called on the same device as
-  //  the recording stream's device.
-  void createEvent(
-    hipEvent_t* hip_event,
-    const EventFlag flag) const {
-    // Maps PyTorch's Event::Flag to HIP flag
-    auto hip_flag = hipEventDefault;
-    switch (flag) {
-      case EventFlag::PYTORCH_DEFAULT:
-        hip_flag = hipEventDisableTiming;
-        break;
-      case EventFlag::BACKEND_DEFAULT:
-        hip_flag = hipEventDefault;
-        break;
-      default:
-        TORCH_CHECK(false, "HIP event received unknown flag");
-    }
-
-    C10_HIP_CHECK(hipEventCreateWithFlags(hip_event, hip_flag));
-  }
-
-  void destroyEvent(
-    void* event,
-    const DeviceIndex device_index) const noexcept override {
-    if (!event) return;
-    auto hip_event = static_cast<hipEvent_t>(event);
-    int orig_device;
-    C10_HIP_CHECK_WARN(hipGetDevice(&orig_device));
-    C10_HIP_CHECK_WARN(hipSetDevice(device_index));
-    C10_HIP_CHECK_WARN(hipEventDestroy(hip_event));
-    C10_HIP_CHECK_WARN(hipSetDevice(orig_device));
-  }
-
-  void record(void** event,
-    const Stream& stream,
-    const DeviceIndex device_index,
-    const EventFlag flag) const override {
-    TORCH_CHECK(device_index == -1 || device_index == stream.device_index(),
-      "Event device index ",
-      device_index,
-      " does not match recording stream's device index ",
-      stream.device_index(),
-      ".");
-
-    hipEvent_t hip_event = static_cast<hipEvent_t>(*event);
-    HIPStreamMasqueradingAsCUDA hip_stream{stream};
-
-    // Moves to stream's device to record
-    const auto orig_device = getDevice();
-    setDevice(stream.device());
-
-    // Creates the event (lazily)
-    if (!hip_event) createEvent(&hip_event, flag);
-    C10_HIP_CHECK(hipEventRecord(hip_event, hip_stream));
-    // Makes the void* point to the (possibly just allocated) HIP event
-    *event = hip_event;
-
-    // Resets device
-    setDevice(orig_device);
-  }
-
-  void block(
-    void* event,
-    const Stream& stream) const override {
-    if (!event) return;
-    hipEvent_t hip_event = static_cast<hipEvent_t>(event);
-    HIPStreamMasqueradingAsCUDA hip_stream{stream};
-    const auto orig_device = getDevice();
-    setDevice(stream.device());
-    C10_HIP_CHECK(hipStreamWaitEvent(
-      hip_stream,
-      hip_event,
-      /*flags (must be zero)=*/ 0));
-    setDevice(orig_device);
-  }
-
-  bool queryEvent(void* event) const override {
-    if (!event) return true;
-    hipEvent_t hip_event = static_cast<hipEvent_t>(event);
-    const hipError_t err = hipEventQuery(hip_event);
-    if (err != hipErrorNotReady) C10_HIP_CHECK(err);
-    else {
-      // ignore and clear the error if not ready
-      (void)hipGetLastError();
-    }
-    return (err == hipSuccess);
-  }
-
-  // Stream-related functions
-  bool queryStream(const Stream& stream) const override {
-    HIPStreamMasqueradingAsCUDA hip_stream{stream};
-    return hip_stream.query();
-  }
-
-  void synchronizeStream(const Stream& stream) const override {
-    HIPStreamMasqueradingAsCUDA hip_stream{stream};
-    hip_stream.synchronize();
-  }
-
-  void synchronizeEvent(void* event) const override {
-    if (!event)
-      return;
-    hipEvent_t hip_event = static_cast<hipEvent_t>(event);
-    C10_HIP_CHECK(hipEventSynchronize(hip_event));
-  }
-
-  // Note: synchronizeDevice can be safely called from any device
-  void synchronizeDevice(const c10::DeviceIndex device_index) const override {
-    int orig_device{-1};
-    C10_HIP_CHECK(hipGetDevice(&orig_device));
-    C10_HIP_CHECK(hipSetDevice(device_index));
-    C10_HIP_CHECK(hipDeviceSynchronize());
-    C10_HIP_CHECK(hipSetDevice(orig_device));
-  }
-
-  void recordDataPtrOnStream(
-    const c10::DataPtr& data_ptr,
-    const Stream& stream) const override {
-    HIPStreamMasqueradingAsCUDA hip_stream{stream};
-    HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA(data_ptr, hip_stream);
-  }
-
-  double elapsedTime(void* event1, void* event2, const DeviceIndex device_index)
-      const override {
-    TORCH_CHECK(
-        event1 && event2,
-        "Both events must be recorded before calculating elapsed time.");
-    int orig_device;
-    C10_HIP_CHECK(hipGetDevice(&orig_device));
-    C10_HIP_CHECK(hipSetDevice(device_index));
-    hipEvent_t hip_event1 = static_cast<hipEvent_t>(event1);
-    hipEvent_t hip_event2 = static_cast<hipEvent_t>(event2);
-    float time_ms = 0;
-    // raise hipErrorNotReady if either event is recorded but not yet completed
-    C10_HIP_CHECK(hipEventElapsedTime(&time_ms, hip_event1, hip_event2));
-    C10_HIP_CHECK(hipSetDevice(orig_device));
-    return static_cast<double>(time_ms);
-  }
-};
-
-// All of the guards which have HIPGuardImpl burned in need to also have
-// variants using HIPGuardImplMasqueradingAsCUDA.
-
-/// This code is all a direct copy from c10/cuda/HIPGuardMasqueradingAsCUDA.h, but with
-/// the correct InlineDeviceGuard burned in.  Sorry about the
-/// copy-pasting.
-
-struct HIPGuardMasqueradingAsCUDA {
-  explicit HIPGuardMasqueradingAsCUDA() = delete;
-  explicit HIPGuardMasqueradingAsCUDA(DeviceIndex device_index) : guard_(device_index) {}
-  explicit HIPGuardMasqueradingAsCUDA(Device device) : guard_(device) {}
-
-  HIPGuardMasqueradingAsCUDA(const HIPGuardMasqueradingAsCUDA&) = delete;
-  HIPGuardMasqueradingAsCUDA& operator=(const HIPGuardMasqueradingAsCUDA&) = delete;
-  HIPGuardMasqueradingAsCUDA(HIPGuardMasqueradingAsCUDA&& other) = delete;
-  HIPGuardMasqueradingAsCUDA& operator=(HIPGuardMasqueradingAsCUDA&& other) = delete;
-
-  void set_device(Device device) { guard_.set_device(device); }
-  void reset_device(Device device) { guard_.reset_device(device); }
-  void set_index(DeviceIndex device_index) { guard_.set_index(device_index); }
-  Device original_device() const { return guard_.original_device(); }
-  Device current_device() const { return guard_.current_device(); }
-
- private:
-  c10::impl::InlineDeviceGuard<HIPGuardImplMasqueradingAsCUDA> guard_;
-};
-
-struct OptionalHIPGuardMasqueradingAsCUDA {
-  explicit OptionalHIPGuardMasqueradingAsCUDA() : guard_() {}
-  explicit OptionalHIPGuardMasqueradingAsCUDA(std::optional<Device> device_opt) : guard_(device_opt) {}
-  explicit OptionalHIPGuardMasqueradingAsCUDA(std::optional<DeviceIndex> device_index_opt) : guard_(device_index_opt) {}
-
-  OptionalHIPGuardMasqueradingAsCUDA(const OptionalHIPGuardMasqueradingAsCUDA&) = delete;
-  OptionalHIPGuardMasqueradingAsCUDA& operator=(const OptionalHIPGuardMasqueradingAsCUDA&) = delete;
-  OptionalHIPGuardMasqueradingAsCUDA(OptionalHIPGuardMasqueradingAsCUDA&& other) = delete;
-  OptionalHIPGuardMasqueradingAsCUDA& operator=(OptionalHIPGuardMasqueradingAsCUDA&& other) = delete;
-
-  void set_device(Device device) { guard_.set_device(device); }
-  void reset_device(Device device) { guard_.reset_device(device); }
-  void set_index(DeviceIndex device_index) { guard_.set_index(device_index); }
-  std::optional<Device> original_device() const { return guard_.original_device(); }
-  std::optional<Device> current_device() const { return guard_.current_device(); }
-  void reset() { guard_.reset(); }
-
-private:
-  c10::impl::InlineOptionalDeviceGuard<HIPGuardImplMasqueradingAsCUDA> guard_;
-};
-
-struct HIPStreamGuardMasqueradingAsCUDA {
-  explicit HIPStreamGuardMasqueradingAsCUDA() = delete;
-  explicit HIPStreamGuardMasqueradingAsCUDA(Stream stream) : guard_(stream) {}
-  HIPStreamGuardMasqueradingAsCUDA(const HIPStreamGuardMasqueradingAsCUDA&) = delete;
-  HIPStreamGuardMasqueradingAsCUDA& operator=(const HIPStreamGuardMasqueradingAsCUDA&) = delete;
-  HIPStreamGuardMasqueradingAsCUDA(HIPStreamGuardMasqueradingAsCUDA&& other) = delete;
-  HIPStreamGuardMasqueradingAsCUDA& operator=(HIPStreamGuardMasqueradingAsCUDA&& other) = delete;
-
-  void reset_stream(Stream stream) { guard_.reset_stream(stream); }
-
-  HIPStreamMasqueradingAsCUDA original_stream() const {
-    return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, guard_.original_stream());
-  }
-  HIPStreamMasqueradingAsCUDA current_stream() const {
-    return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, guard_.current_stream());
-  }
-
-  Device current_device() const { return guard_.current_device(); }
-  Device original_device() const { return guard_.original_device(); }
-
-private:
-  c10::impl::InlineStreamGuard<HIPGuardImplMasqueradingAsCUDA> guard_;
-};
-
-struct OptionalHIPStreamGuardMasqueradingAsCUDA {
-  explicit OptionalHIPStreamGuardMasqueradingAsCUDA() : guard_() {}
-  explicit OptionalHIPStreamGuardMasqueradingAsCUDA(Stream stream) : guard_(stream) {}
-  explicit OptionalHIPStreamGuardMasqueradingAsCUDA(std::optional<Stream> stream_opt) : guard_(stream_opt) {}
-
-  OptionalHIPStreamGuardMasqueradingAsCUDA(const OptionalHIPStreamGuardMasqueradingAsCUDA&) = delete;
-  OptionalHIPStreamGuardMasqueradingAsCUDA& operator=(const OptionalHIPStreamGuardMasqueradingAsCUDA&) = delete;
-  OptionalHIPStreamGuardMasqueradingAsCUDA(OptionalHIPStreamGuardMasqueradingAsCUDA&& other) = delete;
-  OptionalHIPStreamGuardMasqueradingAsCUDA& operator=(OptionalHIPStreamGuardMasqueradingAsCUDA&& other) = delete;
-
-  void reset_stream(Stream stream) { guard_.reset_stream(stream); }
-
-  std::optional<HIPStreamMasqueradingAsCUDA> original_stream() const {
-    auto r = guard_.original_stream();
-    if (r.has_value()) {
-      return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, r.value());
-    } else {
-      return std::nullopt;
-    }
-  }
-
-  std::optional<HIPStreamMasqueradingAsCUDA> current_stream() const {
-    auto r = guard_.current_stream();
-    if (r.has_value()) {
-      return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, r.value());
-    } else {
-      return std::nullopt;
-    }
-  }
-
-  void reset() { guard_.reset(); }
-
-private:
-  c10::impl::InlineOptionalStreamGuard<HIPGuardImplMasqueradingAsCUDA> guard_;
-};
-
-struct HIPMultiStreamGuardMasqueradingAsCUDA {
-  explicit HIPMultiStreamGuardMasqueradingAsCUDA(ArrayRef<HIPStreamMasqueradingAsCUDA> streams)
-    : guard_(unwrapStreams(streams)) {}
-
-  HIPMultiStreamGuardMasqueradingAsCUDA(const HIPMultiStreamGuardMasqueradingAsCUDA&) = delete;
-  HIPMultiStreamGuardMasqueradingAsCUDA& operator=(const HIPMultiStreamGuardMasqueradingAsCUDA&) = delete;
-  HIPMultiStreamGuardMasqueradingAsCUDA(HIPMultiStreamGuardMasqueradingAsCUDA&& other) = delete;
-  HIPMultiStreamGuardMasqueradingAsCUDA& operator=(HIPMultiStreamGuardMasqueradingAsCUDA&& other) = delete;
-
-private:
-  c10::impl::InlineMultiStreamGuard<HIPGuardImplMasqueradingAsCUDA> guard_;
-
-  static std::vector<Stream> unwrapStreams(ArrayRef<HIPStreamMasqueradingAsCUDA> hipStreams) {
-    std::vector<Stream> streams;
-    streams.reserve(hipStreams.size());
-    for (const HIPStreamMasqueradingAsCUDA& hipStream : hipStreams) {
-      streams.push_back(hipStream);
-    }
-    return streams;
-  }
-};
-
-}} // namespace c10::hip
--- a/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h
+++ b/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h
@ -1,135 +0,0 @@
-#pragma once
-
-#include <c10/hip/HIPStream.h>
-
-// Use of c10::hip namespace here makes hipification easier, because
-// I don't have to also fix namespaces.  Sorry!
-namespace c10 { namespace hip {
-
-// See Note [Masquerading as CUDA] for motivation
-
-class HIPStreamMasqueradingAsCUDA {
-public:
-
-  enum Unchecked { UNCHECKED };
-
-  explicit HIPStreamMasqueradingAsCUDA(Stream stream)
-    : HIPStreamMasqueradingAsCUDA(UNCHECKED, stream) {
-    // We did the coercion unchecked; check that it was right.
-    TORCH_CHECK(stream.device().is_cuda() /* !!! */);
-  }
-
-  explicit HIPStreamMasqueradingAsCUDA(Unchecked, Stream stream)
-    // Unsafely coerce the "CUDA" stream into a HIP stream
-    : stream_(
-        HIPStream(
-          Stream(
-            Stream::UNSAFE,
-            Device(c10::DeviceType::HIP, stream.device_index()),
-            stream.id())
-        )
-      ) {}
-
-  // New constructor, just for this.  Does NOT coerce.
-  explicit HIPStreamMasqueradingAsCUDA(HIPStream stream) : stream_(stream) {}
-
-  bool operator==(const HIPStreamMasqueradingAsCUDA& other) const noexcept {
-    return stream_ == other.stream_;
-  }
-
-  bool operator!=(const HIPStreamMasqueradingAsCUDA& other) const noexcept {
-    return stream_ != other.stream_;
-  }
-
-  operator hipStream_t() const { return stream_.stream(); }
-
-  operator Stream() const {
-    // Unsafely coerce HIP stream into a "CUDA" stream
-    return Stream(Stream::UNSAFE, device(), id());
-  }
-
-  DeviceIndex device_index() const { return stream_.device_index(); }
-
-  // Unsafely coerce HIP device into CUDA device
-  c10::DeviceType device_type() const { return c10::DeviceType::CUDA; }
-
-  Device device() const {
-    // Unsafely coerce HIP device into CUDA device
-    return Device(c10::DeviceType::CUDA, stream_.device_index());
-  }
-
-  StreamId id() const        { return stream_.id(); }
-  bool query() const         { return stream_.query(); }
-  void synchronize() const   { stream_.synchronize(); }
-  int priority() const       { return stream_.priority(); }
-  hipStream_t stream() const { return stream_.stream(); }
-
-  Stream unwrap() const {
-    // Unsafely coerce HIP stream into "CUDA" stream
-    return Stream(Stream::UNSAFE, device(), id());
-  }
-
-  c10::StreamData3 pack3() const noexcept {
-    // Unsafely coerce HIP stream into "CUDA" stream before packing
-    return unwrap().pack3();
-  }
-
-  static HIPStreamMasqueradingAsCUDA unpack3(StreamId stream_id,
-                                             DeviceIndex device_index,
-                                             c10::DeviceType device_type) {
-    // NB: constructor manages CUDA->HIP translation for us
-    return HIPStreamMasqueradingAsCUDA(Stream::unpack3(
-        stream_id, device_index, device_type));
-  }
-
-  static std::tuple<int, int> priority_range() { return HIPStream::priority_range(); }
-
-  // New method, gets the underlying HIPStream
-  HIPStream hip_stream() const { return stream_; }
-
-private:
-  HIPStream stream_;
-};
-
-HIPStreamMasqueradingAsCUDA
-inline getStreamFromPoolMasqueradingAsCUDA(const bool isHighPriority = false, DeviceIndex device = -1) {
-  return HIPStreamMasqueradingAsCUDA(getStreamFromPool(isHighPriority, device));
-}
-
-HIPStreamMasqueradingAsCUDA
-inline getStreamFromPoolMasqueradingAsCUDA(const int priority, DeviceIndex device = -1) {
-  return HIPStreamMasqueradingAsCUDA(getStreamFromPool(priority, device));
-}
-
-HIPStreamMasqueradingAsCUDA
-inline getStreamFromExternalMasqueradingAsCUDA(hipStream_t ext_stream, DeviceIndex device) {
-  return HIPStreamMasqueradingAsCUDA(getStreamFromExternal(ext_stream, device));
-}
-
-inline HIPStreamMasqueradingAsCUDA getDefaultHIPStreamMasqueradingAsCUDA(DeviceIndex device_index = -1) {
-  return HIPStreamMasqueradingAsCUDA(getDefaultHIPStream(device_index));
-}
-
-inline HIPStreamMasqueradingAsCUDA getCurrentHIPStreamMasqueradingAsCUDA(DeviceIndex device_index = -1) {
-  return HIPStreamMasqueradingAsCUDA(getCurrentHIPStream(device_index));
-}
-
-inline void setCurrentHIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA stream) {
-  setCurrentHIPStream(stream.hip_stream());
-}
-
-inline std::ostream& operator<<(std::ostream& stream, const HIPStreamMasqueradingAsCUDA& s) {
-  stream << s.hip_stream() << " (masquerading as CUDA)";
-  return stream;
-}
-
-}} // namespace c10::hip
-
-namespace std {
-  template <>
-  struct hash<c10::hip::HIPStreamMasqueradingAsCUDA> {
-    size_t operator()(c10::hip::HIPStreamMasqueradingAsCUDA s) const noexcept {
-      return std::hash<c10::Stream>{}(s.unwrap());
-    }
-  };
-} // namespace std
--- a/aten/src/ATen/miopen/Handle.cpp
+++ b/aten/src/ATen/miopen/Handle.cpp
@ -39,7 +39,7 @@ using MIOpenPoolType = at::cuda::DeviceThreadHandlePool<

 miopenHandle_t getMiopenHandle() {
  c10::DeviceIndex device = 0;
-  AT_CUDA_CHECK(c10::hip::GetDevice(&device));
+  AT_CUDA_CHECK(at::cuda::GetDevice(&device));

  // Thread local PoolWindows are lazily-initialized
  // to avoid initialization issues that caused hangs on Windows.
@ -51,7 +51,7 @@ miopenHandle_t getMiopenHandle() {
      pool->newPoolWindow());

  auto handle = myPoolWindow->reserve(device);
-  MIOPEN_CHECK(miopenSetStream(handle, c10::hip::getCurrentHIPStream()));
+  MIOPEN_CHECK(miopenSetStream(handle, at::cuda::getCurrentCUDAStream()));
  return handle;
 }

--- a/aten/src/ATen/mps/MPSAllocator.mm
+++ b/aten/src/ATen/mps/MPSAllocator.mm
@ -440,7 +440,7 @@ bool MPSHeapAllocatorImpl::release_cached_buffers() {
  // we need to release the lock temporarily as synchronizing may cause deadlock with completion handlers.
  m_mutex.unlock();
  auto stream = getDefaultMPSStream();
-  dispatch_sync(stream->queue(), ^() {
+  dispatch_sync_with_rethrow(stream->queue(), ^() {
    stream->synchronize(SyncType::COMMIT_AND_WAIT);
  });
  m_mutex.lock();
--- a/aten/src/ATen/mps/MPSStream.h
+++ b/aten/src/ATen/mps/MPSStream.h
@ -110,6 +110,9 @@ class TORCH_API MPSStream {
    return _stream;
  }

+  MTLBuffer_t getErrorBuffer();
+  void checkLastError();
+
 private:
  Stream _stream;
  MTLCommandQueue_t _commandQueue = nil;
@ -121,6 +124,8 @@ class TORCH_API MPSStream {
  dispatch_queue_t _serialQueue = nullptr;
  // CommitAndContinue is enabled by default
  bool _enableCommitAndContinue = true;
+  // Buffer that contains last raised error
+  MTLBuffer_t _errorBuffer = nil;

  // use synchronize() to access any of these commit functions outside MPSStream
  void commit();
@ -155,4 +160,7 @@ class TORCH_API MPSStreamImpl {
  MPSStreamImpl();
 };

+#ifdef __OBJC__
+void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)());
+#endif
 } // namespace at::mps
--- a/aten/src/ATen/mps/MPSStream.mm
+++ b/aten/src/ATen/mps/MPSStream.mm
@ -3,13 +3,13 @@
 #include <ATen/mps/MPSAllocatorInterface.h>
 #include <ATen/mps/MPSProfiler.h>
 #include <ATen/mps/MPSStream.h>
+#include <c10/metal/error.h>

@interface MPSGraphExecutionDescriptor ()
@property(readwrite, atomic) BOOL enableCommitAndContinue;
@end

 namespace at::mps {
-
 //-----------------------------------------------------------------
 //  MPSStream
 //-----------------------------------------------------------------
@ -30,6 +30,10 @@ MPSStream::MPSStream(Stream stream) : _stream(stream) {
  // Choose level which optimizes for GPU
  _compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0;
  _executionDescriptor.compilationDescriptor = _compilationDescriptor;
+
+  _errorBuffer = [MPSDevice::getInstance()->device() newBufferWithLength:sizeof(c10::metal::ErrorMessages)
+                                                                 options:MTLResourceStorageModeShared];
+  std::memset([_errorBuffer contents], 0, 1024);
 }

 MPSStream::~MPSStream() {
@ -38,6 +42,8 @@ MPSStream::~MPSStream() {
  [_executionDescriptor release];
  [_compilationDescriptor release];
  _executionDescriptor = nil;
+  [_errorBuffer release];
+  _errorBuffer = nil;
  _compilationDescriptor = nil;

  assert(_commandBuffer == nil);
@ -104,6 +110,7 @@ void MPSStream::commitAndWait() {
    [_prevCommandBuffer waitUntilCompleted];
    [_prevCommandBuffer release];
    _prevCommandBuffer = nil;
+    checkLastError();
  }

  if (_commandBuffer) {
@ -111,6 +118,7 @@ void MPSStream::commitAndWait() {
    [_commandBuffer waitUntilCompleted];
    [_commandBuffer release];
    _commandBuffer = nil;
+    checkLastError();
  }
 }

@ -153,7 +161,7 @@ void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t
  if (length == 0) {
    return;
  }
-  dispatch_sync(_serialQueue, ^() {
+  dispatch_sync_with_rethrow(_serialQueue, ^() {
    @autoreleasepool {
      endKernelCoalescing();
      id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];
@ -183,7 +191,7 @@ void MPSStream::copy(id<MTLBuffer> srcBuffer,
                     size_t dstOffset,
                     uint64_t profileId,
                     SyncType syncType) {
-  dispatch_sync(_serialQueue, ^() {
+  dispatch_sync_with_rethrow(_serialQueue, ^() {
    @autoreleasepool {
      endKernelCoalescing();
      id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];
@ -236,7 +244,7 @@ void MPSStream::executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDicti
  auto& profiler = getMPSProfiler();
  const bool isGraphProfilingEnabled = profiler.isOperationProfilingEnabled();

-  dispatch_sync(_serialQueue, ^() {
+  dispatch_sync_with_rethrow(_serialQueue, ^() {
    endKernelCoalescing();
    if (isGraphProfilingEnabled) {
      // this function call is only relevant for interval-based Signposts
@ -266,6 +274,24 @@ void MPSStream::executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDicti
  });
 }

+id<MTLBuffer> MPSStream::getErrorBuffer() {
+  return _errorBuffer;
+}
+
+void MPSStream::checkLastError() {
+  auto msgs = reinterpret_cast<c10::metal::ErrorMessages*>([_errorBuffer contents]);
+  const auto& msg = msgs->msg[0];
+  if (!msgs) {
+    return;
+  }
+  unsigned int count = 0;
+  std::swap(count, msgs->count);
+  if (!count) {
+    return;
+  }
+  throw c10::AcceleratorError({msg.func, msg.file, msg.line}, 1, msg.message);
+}
+
 //-----------------------------------------------------------------
 //  MPSStreamImpl
 //-----------------------------------------------------------------
@ -289,4 +315,19 @@ MPSStream* getDefaultMPSStream() {
  return MPSStreamImpl::getInstance();
 }

+// Helper methods
+void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) {
+  __block std::optional<std::exception_ptr> block_exception;
+  dispatch_sync(queue, ^() {
+    try {
+      block();
+    } catch (...) {
+      block_exception = std::current_exception();
+    }
+  });
+  if (block_exception) {
+    std::rethrow_exception(*block_exception);
+  }
+}
+
 } // namespace at::mps
--- a/aten/src/ATen/native/Distributions.h
+++ b/aten/src/ATen/native/Distributions.h
@ -5,9 +5,13 @@
 #include <c10/macros/Macros.h>
 #include <c10/util/MathConstants.h>

-// ROCM hcc doesn't work well with using std:: in kernel functions
+// ROCm hip compiler doesn't work well with using std:: in kernel functions
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
 #if defined(__CUDA_ARCH__)
 #include <c10/cuda/CUDAMathCompat.h>
+#elif defined(__HIPCC__)
+#include <c10/hip/HIPMathCompat.h>
+#endif
 #define compat_exp c10::cuda::compat::exp
 #define compat_ceil c10::cuda::compat::ceil
 #define compat_floor c10::cuda::compat::floor
@ -17,17 +21,6 @@
 #define compat_tan c10::cuda::compat::tan
 #define compat_abs c10::cuda::compat::abs
 #define compat_log1p c10::cuda::compat::log1p
-#elif defined(__HIPCC__)
-#include <c10/hip/HIPMathCompat.h>
-#define compat_exp c10::hip::compat::exp
-#define compat_ceil c10::hip::compat::ceil
-#define compat_floor c10::hip::compat::floor
-#define compat_log c10::hip::compat::log
-#define compat_pow c10::hip::compat::pow
-#define compat_sqrt c10::hip::compat::sqrt
-#define compat_tan c10::hip::compat::tan
-#define compat_abs c10::hip::compat::abs
-#define compat_log1p c10::hip::compat::log1p
 #else
 #define compat_exp std::exp
 #define compat_ceil std::ceil
--- a/aten/src/ATen/native/PackedSequence.cpp
+++ b/aten/src/ATen/native/PackedSequence.cpp
@ -142,6 +142,7 @@ Tensor _pack_padded_sequence_backward_symint(const Tensor& grad, c10::SymIntArra
 std::tuple<Tensor, Tensor> _pad_packed_sequence(const Tensor& data, const Tensor& _batch_sizes, bool batch_first, const Scalar& padding_value, int64_t total_length) {
  auto batch_sizes_t = _batch_sizes.contiguous();
  checkLongTensor(batch_sizes_t);
+  TORCH_CHECK(batch_sizes_t.numel() > 0, "batch_sizes can not be empty");

  int64_t * batch_sizes = batch_sizes_t.data_ptr<int64_t>();
  int64_t max_batch_size = batch_sizes[0];
--- a/aten/src/ATen/native/SharedReduceOps.h
+++ b/aten/src/ATen/native/SharedReduceOps.h
@ -52,13 +52,14 @@ inline C10_DEVICE scalar_t min_propagate_nan(scalar_t a, scalar_t b) {
 #define MIN(X, Y) min_impl(X,Y)
 #endif

-// ROCM hcc doesn't work well with using std:: in kernel functions
+// ROCm hip compiler doesn't work well with using std:: in kernel functions
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
 #if defined(__CUDA_ARCH__)
 #include <c10/cuda/CUDAMathCompat.h>
-#define compat_pow c10::cuda::compat::pow
 #elif defined(__HIPCC__)
 #include <c10/hip/HIPMathCompat.h>
-#define compat_pow c10::hip::compat::pow
+#endif
+#define compat_pow c10::cuda::compat::pow
 #else
 #define compat_pow std::pow
 #endif
--- a/aten/src/ATen/native/TensorCompare.cpp
+++ b/aten/src/ATen/native/TensorCompare.cpp
@ -23,6 +23,7 @@
 #include <ATen/ops/_aminmax_native.h>
 #include <ATen/ops/_assert_async_native.h>
 #include <ATen/ops/_assert_scalar_native.h>
+#include <ATen/ops/_async_error_native.h>
 #include <ATen/ops/_functional_assert_async_native.h>
 #include <ATen/ops/_functional_assert_scalar_native.h>
 #include <ATen/ops/_make_per_tensor_quantized_tensor.h>
@ -479,6 +480,14 @@ Tensor isfinite(const Tensor& self) {
  });
 }

+void _async_error(std::string_view msg) {
+  TORCH_CHECK(0, msg);
+}
+
+void _async_error_meta(std::string_view msg) {
+  // Do NOT error, it's an async error!
+}
+
 void _assert_async_cpu(const Tensor& self) {
  TORCH_CHECK(
      native::is_nonzero(self),
--- a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp
@ -5,7 +5,6 @@
 #include <ATen/native/ReduceOpsUtils.h>

 #include <ATen/Dispatch.h>
-#include <ATen/Dispatch_v2.h>
 #include <ATen/Parallel.h>
 #include <ATen/TensorIterator.h>
 #include <ATen/OpMathType.h>
@ -79,12 +78,12 @@ void min_all_kernel_impl(Tensor& result, const Tensor& input) {
    reduce_all_impl<int64_t>(result, input, upper_bound<int64_t>(),
      [=](int64_t a, int64_t b) -> int64_t { return min_impl(a, b); });
  } else {
-    AT_DISPATCH_V2(input.scalar_type(), "min_all", AT_WRAP([&] {
+    AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "min_all", [&] {
      using Vec = Vectorized<opmath_type<scalar_t>>;
      reduce_all_impl_vec<scalar_t>(result, input, upper_bound<scalar_t>(),
        [=] (scalar_t a , scalar_t b) -> scalar_t { return min_impl(a, b); },
        [=](Vec a, Vec b) -> Vec { return minimum(a, b); });
-    }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
+    });
  }
 }

@ -104,12 +103,12 @@ void max_all_kernel_impl(Tensor& result, const Tensor& input) {
    reduce_all_impl<int64_t>(result, input, lower_bound<int64_t>(),
      [=](int64_t a, int64_t b) -> int64_t { return max_impl(a, b); });
  } else {
-    AT_DISPATCH_V2(input.scalar_type(), "max_all", AT_WRAP([&] {
+    AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_all", [&] {
      using Vec = Vectorized<opmath_type<scalar_t>>;
      reduce_all_impl_vec<scalar_t>(result, input, lower_bound<scalar_t>(),
        [=] (scalar_t a , scalar_t b) -> scalar_t { return max_impl(a, b); },
        [=](Vec a, Vec b) -> Vec { return maximum(a, b); });
-    }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
+    });
  }
 }

@ -200,7 +199,7 @@ void aminmax_allreduce_kernel(
      }
    );
  } else {
-    AT_DISPATCH_V2(input.scalar_type(), "aminmax_cpu", AT_WRAP([&] {
+    AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "aminmax_cpu", [&] {
      using Vec = Vectorized<opmath_type<scalar_t>>;
      using scalar_t_pair = std::pair<scalar_t, scalar_t>;
      reduce_all_impl_vec_two_outputs<scalar_t>(
@ -215,7 +214,7 @@ void aminmax_allreduce_kernel(
        [=](Vec a, Vec b) -> Vec { return minimum(a, b); },
        [=](Vec a, Vec b) -> Vec { return maximum(a, b); }
      );
-    }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf);
+    });
  }
 }

--- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@ -3,7 +3,6 @@

 #include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
-#include <ATen/Dispatch_v2.h>
 #include <ATen/OpMathType.h>
 #include <ATen/cpu/vec/vec.h>
 #include <ATen/cpu/vec/functional.h>
@ -348,35 +347,34 @@ struct MinValuesOps: public at::native::MinOps<scalar_t> {
 };

 void min_values_kernel_impl(TensorIterator& iter) {
-  // This case is special because of Vectorized<int64_t> does not
-  // handle upper_bound<int64_t>().
-  // See: https://github.com/pytorch/pytorch/issues/43254
-  if (iter.dtype() == kLong || iter.dtype() == kUInt64) {
-    AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
-      binary_kernel_reduce(
-        iter,
-        MinValuesOps<scalar_t>{},
-        std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
-    }), kLong, kUInt64);
+  if (iter.dtype() == kLong) {
+    // This case is special because of Vectorized<int64_t> does not
+    // handle upper_bound<int64_t>().
+    // See: https://github.com/pytorch/pytorch/issues/43254
+    using scalar_t = int64_t;
+    binary_kernel_reduce(
+      iter,
+      MinValuesOps<scalar_t>{},
+      std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
    return;
  }
-  AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
+  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "min_values_cpu", [&iter] {
    binary_kernel_reduce_vec(
      iter,
      [](scalar_t a, scalar_t b) -> scalar_t { return min_impl(a, b); },
      [](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return minimum(a, b); },
      static_cast<double>(upper_bound<scalar_t>()));
-  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
+  });
 }

 void max_values_kernel_impl(TensorIterator& iter) {
-  AT_DISPATCH_V2(iter.dtype(), "max_values_cpu", AT_WRAP([&iter] {
+  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cpu", [&iter] {
    binary_kernel_reduce_vec(
      iter,
      [](scalar_t a, scalar_t b) -> scalar_t { return max_impl(a, b); },
      [](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return maximum(a, b); },
      lower_bound<scalar_t>());
-  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
+  });
 }

 void argmax_kernel_impl(TensorIterator &iter) {
--- a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp
+++ b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp
@ -11,7 +11,6 @@
 #include <vector>

 #include <ATen/Dispatch.h>
-#include <ATen/Dispatch_v2.h>
 #include <ATen/Parallel.h>
 #include <ATen/NumericUtils.h>
 #include <ATen/TensorIterator.h>
@ -107,7 +106,7 @@ void min_kernel_impl(
    bool keepdim) {
  int64_t self_dim_size = ensure_nonempty_size(self, dim);

-  AT_DISPATCH_V2(self.scalar_type(), "min_cpu", AT_WRAP([&] {
+  AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "min_cpu", [&] {
    compare_base_kernel<scalar_t>(result, indice, self, dim, keepdim, [&] (
      scalar_t* result_data, int64_t* indice_data,
      const scalar_t* self_data, auto self_dim_stride) {
@ -129,7 +128,7 @@ void min_kernel_impl(
        *indice_data = index;
      }
    );
-  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool);
+  });
 }

 void max_kernel_impl(
@ -140,7 +139,7 @@ void max_kernel_impl(
    bool keepdim) {
  int64_t self_dim_size = ensure_nonempty_size(self, dim);

-  AT_DISPATCH_V2(self.scalar_type(), "max_cpu", AT_WRAP([&] {
+  AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "max_cpu", [&] {
    compare_base_kernel<scalar_t>(result, indice, self, dim, keepdim, [&] (
      scalar_t* result_data, int64_t* indice_data,
      const scalar_t* self_data, auto self_dim_stride) {
@ -162,7 +161,7 @@ void max_kernel_impl(
        *indice_data = index;
      }
    );
-  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool);
+  });
 }

 void aminmax_kernel(
@ -187,7 +186,7 @@ void aminmax_kernel(
    return;
  }

-  AT_DISPATCH_V2(self.scalar_type(), "aminmax_cpu", AT_WRAP([&] {
+  AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, self.scalar_type(), "aminmax_cpu", [&] {
    compare_base_kernel<scalar_t, scalar_t>(min_result, max_result, self, wrap_dim, keepdim, [&] (
      scalar_t* min_result_data, scalar_t* max_result_data,
      const scalar_t* self_data, auto self_dim_stride) {
@ -210,7 +209,7 @@ void aminmax_kernel(
        *max_result_data = max_number;
      }
    );
-  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half);
+  });
 }

 void where_kernel_impl(TensorIterator &iter) {
--- a/aten/src/ATen/native/cuda/GroupedBlas.cpp
+++ b/aten/src/ATen/native/cuda/GroupedBlas.cpp
@ -669,9 +669,12 @@ std::optional<c10::ScalarType> out_dtype) {
  // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used.
  // the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm
  bool use_fast_path = false;
+  // On non CK system(w/ ROCm), make sure use_fast_path is false
+#if defined(USE_ROCM_CK_GEMM)
  if (at::detail::getCUDAHooks().isGPUArch({"gfx942", "gfx950"})) {
    use_fast_path = true;
  }
+#endif //USE_ROCM_CK_GEMM
 #endif
  const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
  Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
@ -680,7 +683,11 @@ std::optional<c10::ScalarType> out_dtype) {
 #ifndef USE_ROCM
    at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out);
 #else
+#if defined(USE_ROCM_CK_GEMM)
    at::hip::detail::group_gemm_ck(mat_a, mat_b, offs, bias, out);
+#else
+    TORCH_WARN("ROCm: Group Gemm through CK not selected.");
+#endif //USE_ROCM_CK_GEMM
 #endif
  } else {
    _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
--- a/aten/src/ATen/native/cuda/ReduceAMinMaxKernel.cu
+++ b/aten/src/ATen/native/cuda/ReduceAMinMaxKernel.cu
@ -1,6 +1,5 @@
 #define TORCH_ASSERT_NO_OPERATORS
 #include <ATen/Dispatch.h>
-#include <ATen/Dispatch_v2.h>
 #include <ATen/NumericUtils.h>
 #include <ATen/native/DispatchStub.h>
 #include <ATen/native/ReduceAllOps.h>
@ -29,22 +28,22 @@ void _min_max_values_kernel_cuda_impl(TensorIterator& iter) {
 }

 void aminmax_allreduce_launch_kernel(TensorIterator& iter) {
-  AT_DISPATCH_V2(
-      iter.input_dtype(), "aminmax_all_cuda", AT_WRAP([&] {
+  AT_DISPATCH_ALL_TYPES_AND3(
+      kBFloat16, kHalf, kBool, iter.input_dtype(), "aminmax_all_cuda", [&] {
        _min_max_values_kernel_cuda_impl<scalar_t>(iter);
-      }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
+      });
 }

 void aminmax_launch_kernel(TensorIterator& iter) {
-  AT_DISPATCH_V2(
-      iter.input_dtype(), "aminmax_cuda", AT_WRAP([&]() {
+  AT_DISPATCH_ALL_TYPES_AND3(
+      kBFloat16, kHalf, kBool, iter.input_dtype(), "aminmax_cuda", [&]() {
        gpu_reduce_kernel<scalar_t, scalar_t>(
            iter,
            MinMaxOps<scalar_t, scalar_t, int32_t>{},
            thrust::pair<scalar_t, scalar_t>(
                at::numeric_limits<scalar_t>::upper_bound(),
                at::numeric_limits<scalar_t>::lower_bound()));
-      }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
+      });
 }

 } // namespace at::native
--- a/aten/src/ATen/native/cuda/ReduceMaxValuesKernel.cu
+++ b/aten/src/ATen/native/cuda/ReduceMaxValuesKernel.cu
@ -1,6 +1,5 @@
 #define TORCH_ASSERT_NO_OPERATORS
 #include <ATen/Dispatch.h>
-#include <ATen/Dispatch_v2.h>
 #include <ATen/NumericUtils.h>
 #include <ATen/native/DispatchStub.h>
 #include <ATen/native/ReduceAllOps.h>
@ -34,27 +33,27 @@ void max_values_kernel_cuda_impl(TensorIterator& iter) {
 }

 void max_values_kernel_cuda(TensorIterator& iter) {
-  AT_DISPATCH_V2(
-      iter.dtype(), "max_values_cuda", AT_WRAP([&]() {
+  AT_DISPATCH_ALL_TYPES_AND3(
+      kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cuda", [&]() {
        max_values_kernel_cuda_impl<scalar_t>(iter);
-      }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
+      });
 }

 void max_launch_kernel(TensorIterator& iter) {
-  AT_DISPATCH_V2(
-      iter.input_dtype(), "max_cuda", AT_WRAP([&]() {
+  AT_DISPATCH_ALL_TYPES_AND3(
+      kBFloat16, kHalf, kBool, iter.input_dtype(), "max_cuda", [&]() {
        gpu_reduce_kernel<scalar_t, scalar_t>(
            iter,
            MaxOps<scalar_t>{},
            thrust::pair<scalar_t, int64_t>(
                at::numeric_limits<scalar_t>::lower_bound(), 0));
-      }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
+      });
 }

 void max_all_launch_kernel(TensorIterator &iter) {
-  AT_DISPATCH_V2(iter.input_dtype(), "max_all_cuda", AT_WRAP([&] {
+  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "max_all_cuda", [&] {
    max_values_kernel_cuda_impl<scalar_t>(iter);
-  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
+  });
 }

 REGISTER_DISPATCH(max_values_stub, &max_values_kernel_cuda)
--- a/aten/src/ATen/native/cuda/ReduceMinValuesKernel.cu
+++ b/aten/src/ATen/native/cuda/ReduceMinValuesKernel.cu
@ -12,7 +12,6 @@
 #include <ATen/NumericUtils.h>

 #include <ATen/Dispatch.h>
-#include <ATen/Dispatch_v2.h>
 #include <ATen/NumericUtils.h>
 #include <ATen/cuda/NumericLimits.cuh>

@ -34,24 +33,24 @@ void min_values_kernel_cuda_impl(TensorIterator& iter) {
 }

 void min_values_kernel_cuda(TensorIterator& iter) {
-  AT_DISPATCH_V2(iter.dtype(), "min_values_cuda", AT_WRAP([&]() {
+  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "min_values_cuda", [&]() {
    min_values_kernel_cuda_impl<scalar_t>(iter);
-  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
+  });
 }

 void min_launch_kernel(TensorIterator &iter) {
-  AT_DISPATCH_V2(iter.input_dtype(), "min_cuda", AT_WRAP([&]() {
+  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "min_cuda", [&]() {
    gpu_reduce_kernel<scalar_t, scalar_t>(
      iter,
      MinOps<scalar_t>{},
      thrust::pair<scalar_t, int64_t>(at::numeric_limits<scalar_t>::upper_bound(), 0));
-  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
+  });
 }

 void min_all_launch_kernel(TensorIterator &iter) {
-  AT_DISPATCH_V2(iter.input_dtype(), "min_all_cuda", AT_WRAP([&] {
+  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "min_all_cuda", [&] {
    min_values_kernel_cuda_impl<scalar_t>(iter);
-  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
+  });
 }

 REGISTER_DISPATCH(min_values_stub, &min_values_kernel_cuda)
--- a/aten/src/ATen/native/hip/bgemm_kernels/bgemm_kernel_template.h
+++ b/aten/src/ATen/native/hip/bgemm_kernels/bgemm_kernel_template.h
@ -157,7 +157,7 @@ void bgemm_kernel_impl(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
          "wrong! device_gemm with the specified compilation parameters does "
          "not support this GEMM problem");
  }
-  auto stream = at::cuda::getCurrentHIPStream().stream();
+  auto stream = at::cuda::getCurrentCUDAStream().stream();
  invoker.Run(argument, StreamConfig{stream, false});
 }

--- a/aten/src/ATen/native/hip/ck_gemm_template.h
+++ b/aten/src/ATen/native/hip/ck_gemm_template.h
@ -11,7 +11,6 @@
 #include <numeric>

 #include <ATen/ATen.h>
-#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
 #include <ATen/native/hip/ck_gemm.h>
 #include <ATen/native/hip/ck_types.h>

@ -233,7 +232,7 @@ void gemm_impl(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
 }


- auto stream = at::cuda::getCurrentHIPStream().stream();
+ auto stream = at::cuda::getCurrentCUDAStream().stream();
 invoker.Run(argument, StreamConfig{stream, false});
 }

@ -391,7 +390,7 @@ void gemm_impl_wmma(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
 }


- auto stream = at::cuda::getCurrentHIPStream().stream();
+ auto stream = at::cuda::getCurrentCUDAStream().stream();
 #if 1
 invoker.Run(argument, StreamConfig{stream, false});
 #else
--- a/aten/src/ATen/native/hip/ck_group_gemm.hip
+++ b/aten/src/ATen/native/hip/ck_group_gemm.hip
@ -385,7 +385,7 @@ void launch_grouped_bgemm_ck_impl_dispatch(
    gemm_instance.SetWorkSpacePointer(&argument, ws_buf);

    auto invoker = gemm_instance.MakeInvoker();
-    hipStream_t stream = c10::hip::getCurrentHIPStream();
+    hipStream_t stream = c10::cuda::getCurrentCUDAStream();
    invoker.Run(argument, {stream});
    hipFree(gemm_arg_buf);
    hipFree(ws_buf);
--- a/aten/src/ATen/native/miopen/Conv_miopen.cpp
+++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp
@ -278,14 +278,14 @@ BenchmarkCache<size_t> bwd_filter_wssizes;

 struct Workspace {
  Workspace(size_t size) : size(size), data(NULL) {
-    data = c10::hip::HIPCachingAllocator::raw_alloc(size);
+    data = c10::cuda::CUDACachingAllocator::raw_alloc(size);
  }
  Workspace(const Workspace&) = delete;
  Workspace(Workspace&&) = default;
  Workspace& operator=(Workspace&&) = default;
  ~Workspace() {
    if (data) {
-      c10::hip::HIPCachingAllocator::raw_delete(data);
+      c10::cuda::CUDACachingAllocator::raw_delete(data);
    }
  }

@ -587,7 +587,7 @@ void findAlgorithm(const ConvolutionArgs& args, bool benchmark, algo_t* algo) {
  wsscache.insert(args.params, perfResults.memory);

  if (at::native::_cudnn_get_conv_benchmark_empty_cache()) {
-      c10::hip::HIPCachingAllocator::emptyCache();
+      c10::cuda::CUDACachingAllocator::emptyCache();
  }

 }
--- a/aten/src/ATen/native/miopen/RNN_miopen.cpp
+++ b/aten/src/ATen/native/miopen/RNN_miopen.cpp
@ -76,14 +76,14 @@ namespace {

    struct DropoutState {
        DropoutState(size_t size) : size(size), data(NULL) {
-            data = c10::hip::HIPCachingAllocator::raw_alloc(size);
+            data = c10::cuda::CUDACachingAllocator::raw_alloc(size);
        }
        DropoutState(const DropoutState&) = delete;
        DropoutState(DropoutState&&) = default;
        DropoutState& operator=(DropoutState&&) = default;
        ~DropoutState() {
            if (data) {
-                c10::hip::HIPCachingAllocator::raw_delete(data);
+                c10::cuda::CUDACachingAllocator::raw_delete(data);
            }
        }

--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@ -40,8 +40,6 @@ using namespace at::mps;

 namespace at::native::mps {

-void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)());
-
 struct MPSScalar {
  id<MTLBuffer> getMTLBuffer() const {
    return __builtin_bit_cast(id<MTLBuffer>, buffer.get());
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@ -53,21 +53,6 @@
@end

 namespace at::native::mps {
-
-void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) {
-  __block std::optional<std::exception_ptr> block_exception;
-  dispatch_sync(queue, ^() {
-    try {
-      block();
-    } catch (...) {
-      block_exception = std::current_exception();
-    }
-  });
-  if (block_exception) {
-    std::rethrow_exception(*block_exception);
-  }
-}
-
 /**
 * Computes distance from lowest to highest element offset in given tensor.
 */
--- a/aten/src/ATen/native/mps/kernels/Indexing.metal
+++ b/aten/src/ATen/native/mps/kernels/Indexing.metal
@ -1,4 +1,5 @@
 #include <c10/metal/atomic.h>
+#include <c10/metal/error.h>
 #include <c10/metal/indexing.h>
 #include <metal_stdlib>

@ -31,10 +32,24 @@ OffsetT index_apply_indices(
    constant IndexAB* indices,
    constant int64_t* sizes,
    constant int64_t* strides,
-    uint num_indices) {
+    uint num_indices,
+    thread bool& error,
+    device ErrorMessages* error_buf) {
  OffsetT rc = offs.x;
  for (uint i = 0; i < num_indices; i++) {
    auto idx = indices[i].indexArray[offs.y];
+    if (idx < -sizes[i] || idx >= sizes[i]) {
+      TORCH_REPORT_ERROR(
+          error_buf,
+          "index ",
+          idx,
+          " is out of bounds for dimension ",
+          i,
+          " with size ",
+          sizes[i]);
+      error = true;
+      break;
+    }
    if (idx < 0) {
      idx += sizes[i];
    }
@ -55,6 +70,7 @@ kernel void index_select(
    constant int64_t* index_sizes,
    constant int64_t* index_strides,
    constant uint4& ndim_nindices_numel,
+    device ErrorMessages* error_buffer,
    uint thread_index [[thread_position_in_grid]]) {
  const auto ndim = ndim_nindices_numel.x;
  const auto num_indices = ndim_nindices_numel.y;
@ -65,8 +81,19 @@ kernel void index_select(
      indices_strides,
      ndim,
      thread_index);
+  bool error = false;
  auto input_offs = index_apply_indices<OffsetT>(
-      offs.yz, indices, index_sizes, index_strides, num_indices);
+      offs.yz,
+      indices,
+      index_sizes,
+      index_strides,
+      num_indices,
+      error,
+      error_buffer);
+  if (error) {
+    output[offs.x / sizeof(T)] = 0;
+    return;
+  }
  output[offs.x / sizeof(T)] = input[input_offs / sizeof(T)];
 }

@ -82,7 +109,9 @@ inline void index_put_impl(
    constant int64_t* index_sizes,
    constant int64_t* index_strides,
    constant uint4& ndim_nindices_numel,
+    device ErrorMessages* error_buffer,
    uint thread_index) {
+  bool error = false;
  const auto ndim = ndim_nindices_numel.x;
  const auto num_indices = ndim_nindices_numel.y;
  const auto offs = index_get_offsets(
@ -93,7 +122,16 @@ inline void index_put_impl(
      ndim,
      thread_index);
  auto output_offs = index_apply_indices<OffsetT>(
-      offs.xz, indices, index_sizes, index_strides, num_indices);
+      offs.xz,
+      indices,
+      index_sizes,
+      index_strides,
+      num_indices,
+      error,
+      error_buffer);
+  if (error) {
+    return;
+  }
  output[output_offs / sizeof(T)] = input[offs.y / sizeof(T)];
 }

@ -109,6 +147,7 @@ kernel void index_put(
    constant int64_t* index_sizes,
    constant int64_t* index_strides,
    constant uint4& ndim_nindices_numel,
+    device ErrorMessages* error_buffer,
    uint thread_index [[thread_position_in_grid]]) {
  index_put_impl(
      output,
@ -121,6 +160,7 @@ kernel void index_put(
      index_sizes,
      index_strides,
      ndim_nindices_numel,
+      error_buffer,
      thread_index);
 }

@ -136,6 +176,7 @@ kernel void index_put_serial(
    constant int64_t* index_sizes,
    constant int64_t* index_strides,
    constant uint4& ndim_nindices_numel,
+    device ErrorMessages* error_buffer,
    uint thread_index [[thread_position_in_grid]]) {
  (void)thread_index; // Suppress unused vairable varning
  for (uint idx = 0; idx < ndim_nindices_numel.z; ++idx) {
@ -150,6 +191,7 @@ kernel void index_put_serial(
        index_sizes,
        index_strides,
        ndim_nindices_numel,
+        error_buffer,
        idx);
  }
 }
@ -166,6 +208,7 @@ kernel void index_put_accumulate(
    constant int64_t* index_sizes,
    constant int64_t* index_strides,
    constant uint4& ndim_nindices_numel,
+    device ErrorMessages* error_buffer,
    uint thread_index [[thread_position_in_grid]]) {
  const auto ndim = ndim_nindices_numel.x;
  const auto num_indices = ndim_nindices_numel.y;
@ -176,8 +219,18 @@ kernel void index_put_accumulate(
      indices_strides,
      ndim,
      thread_index);
+  bool error = false;
  auto output_offs = index_apply_indices<OffsetT>(
-      offs.xz, indices, index_sizes, index_strides, num_indices);
+      offs.xz,
+      indices,
+      index_sizes,
+      index_strides,
+      num_indices,
+      error,
+      error_buffer);
+  if (error) {
+    return;
+  }
  AtomicType<T>::atomic_add(
      reinterpret_cast<device AtomicType_t<T>*>(output),
      output_offs / sizeof(T),
@ -197,6 +250,7 @@ kernel void index_put_accumulate(
          constant int64_t* index_sizes,                            \
          constant int64_t* index_strides,                          \
          constant uint4& ndim_nindices_numel,                      \
+          device ErrorMessages* error_buffer,                       \
          uint thread_index [[thread_position_in_grid]])

 #define REGISTER_INDEX_OP_ALL_DTYPES(OP_NAME) \
--- a/aten/src/ATen/native/mps/operations/Blas.mm
+++ b/aten/src/ATen/native/mps/operations/Blas.mm
@ -141,6 +141,9 @@ static Tensor& addmv_out_mps_impl(const Tensor& self,
  };

  MPSStream* stream = at::mps::getCurrentMPSStream();
+  if (result.numel() == 0) {
+    return result;
+  }
  Tensor matMulVec = at::mm(mat, vec.unsqueeze(1)).squeeze(1);

  @autoreleasepool {
--- a/aten/src/ATen/native/mps/operations/EmbeddingBag.mm
+++ b/aten/src/ATen/native/mps/operations/EmbeddingBag.mm
@ -220,7 +220,7 @@ Tensor _embedding_bag_dense_backward_mps(const Tensor& output_grad,
  auto num_threads = (params.mode == EmbeddingBagMode::MAX) ? output_grad.numel() : num_indices * params.feature_size;
  MPSStream* stream = getCurrentMPSStream();

-  mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
+  dispatch_sync_with_rethrow(stream->queue(), ^() {
    @autoreleasepool {
      id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
      auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_backward_{}_{}",
@ -273,7 +273,7 @@ Tensor _embedding_bag_per_sample_weights_backward_mps(const Tensor& output_grad,
  auto num_threads = num_indices * feature_size;
  MPSStream* stream = getCurrentMPSStream();

-  mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
+  dispatch_sync_with_rethrow(stream->queue(), ^() {
    @autoreleasepool {
      id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
      auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_per_sample_weights_backward_{}_{}",
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@ -179,7 +179,8 @@ static void dispatch_index_kernel(TensorIteratorBase& iter,
                   iter.strides(2),
                   index_size,
                   index_stride,
-                   ndim_nindiees);
+                   ndim_nindiees,
+                   mpsStream->getErrorBuffer());
    mtl_dispatch1DJob(computeEncoder, indexSelectPSO, serial ? 1 : iter.numel());
  });
 }
@ -299,7 +300,7 @@ static Tensor& nonzero_out_native_mps(const Tensor& self, Tensor& out_) {
  MPSStream* stream = getCurrentMPSStream();
  using CachedGraph = MPSUnaryCachedGraph;

-  dispatch_sync(stream->queue(), ^() {
+  dispatch_sync_with_rethrow(stream->queue(), ^() {
    stream->synchronize(SyncType::COMMIT_AND_WAIT);
  });
  int64_t total_nonzero = at::count_nonzero(self).item<int64_t>();
@ -384,7 +385,7 @@ Tensor& nonzero_out_mps(const Tensor& self, Tensor& out_) {
  MPSStream* stream = getCurrentMPSStream();
  using CachedGraph = MPSUnaryCachedGraph;

-  dispatch_sync(stream->queue(), ^() {
+  dispatch_sync_with_rethrow(stream->queue(), ^() {
    stream->synchronize(SyncType::COMMIT_AND_WAIT);
  });
  int64_t total_nonzero = at::count_nonzero(self).item<int64_t>();
--- a/aten/src/ATen/native/mps/operations/Normalization.mm
+++ b/aten/src/ATen/native/mps/operations/Normalization.mm
@ -923,7 +923,7 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_mps(const Tensor& input,
  MPSStream* stream = getCurrentMPSStream();
  TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "Not implemented for long on MPS");
  @autoreleasepool {
-    mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
+    dispatch_sync_with_rethrow(stream->queue(), ^() {
      // which kernel variant to use based on the normalized axis N size
      const int N_READS = 4;
      auto metalType = mps::scalarToMetalTypeString(input);
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -192,6 +192,11 @@
    CompositeExplicitAutograd: _assert_tensor_metadata
    Meta: _assert_tensor_metadata_meta_symint

+- func: _async_error(str msg) -> ()
+  dispatch:
+    CompositeExplicitAutograd: _async_error
+    Meta: _async_error_meta
+
 - func: _print(str s) -> ()
  dispatch:
    CompositeExplicitAutograd: _print
@ -2803,7 +2808,7 @@
 - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck   # TensorIterator
  dispatch:
-    CPU, CUDA, MPS: floor_divide_out
+    CPU, CUDA, MPS, MTIA: floor_divide_out
    SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim

 - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
@ -4292,6 +4297,7 @@
  dispatch:
    SparseCPU: sparse_sparse_matmul_cpu
    SparseCUDA: sparse_sparse_matmul_cuda
+    SparseMPS: sparse_sparse_matmul_mps
  autogen: _sparse_sparse_matmul.out

 - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
@ -9832,7 +9838,7 @@
  structured_delegate: erfinv.out
  variants: method, function
  dispatch:
-    SparseCPU, SparseCUDA: erfinv_sparse
+    SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr
  tags: pointwise

@ -9841,7 +9847,7 @@
  structured_delegate: erfinv.out
  variants: method
  dispatch:
-    SparseCPU, SparseCUDA: erfinv_sparse_
+    SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_
  tags: pointwise

@ -9851,7 +9857,7 @@
  structured_inherits: TensorIteratorBase
  dispatch:
    CPU, CUDA, MPS: erfinv_out
-    SparseCPU, SparseCUDA: erfinv_sparse_out
+    SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_out
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_out
  tags: pointwise

--- a/aten/src/ATen/native/sparse/cuda/SoftMax.cu
+++ b/aten/src/ATen/native/sparse/cuda/SoftMax.cu
@ -47,6 +47,7 @@
 #include <c10/macros/Macros.h>
 #include <thrust/copy.h>
 #include <thrust/device_ptr.h>
+#include <thrust/distance.h>
 #include <thrust/for_each.h>
 #include <thrust/functional.h>
 #include <thrust/gather.h>
@ -59,8 +60,6 @@
 #include <thrust/transform.h>
 #include <thrust/unique.h>

-#include <c10/cuda/CUDAMathCompat.h>
-
 namespace at::native {
 namespace {

--- a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm
+++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm
@ -10,6 +10,10 @@
 #include <ATen/NativeFunctions.h>
 #else
 #include <ATen/ops/_coalesce_native.h>
+#include <ATen/ops/repeat_interleave_native.h>
+#include <ATen/ops/cumsum.h>
+#include <ATen/ops/_sparse_sparse_matmul_native.h>
+#include <ATen/ops/_sparse_coo_tensor_unsafe.h>
 #include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
 #include <ATen/ops/cat.h>
 #include <ATen/ops/add_native.h>
@ -888,5 +892,114 @@ static void sparse_mask_intersection_out_mps_kernel(
      /*coalesce_mask=*/false);
 }

+Tensor sparse_sparse_matmul_mps(const Tensor& mat1_, const Tensor& mat2_) {
+  TORCH_CHECK(mat1_.is_sparse() && mat2_.is_sparse(),
+              "sparse_sparse_matmul_mps: both inputs must be sparse COO tensors");
+  TORCH_CHECK(mat1_.is_mps() && mat2_.is_mps(),
+              "sparse_sparse_matmul_mps: both inputs must be on MPS device");
+  TORCH_CHECK(mat1_.dim() == 2 && mat2_.dim() == 2,
+              "sparse_sparse_matmul_mps: both inputs must be 2D matrices");
+  TORCH_CHECK(mat1_.dense_dim() == 0 && mat2_.dense_dim() == 0,
+              "sparse_sparse_matmul_mps: only scalar values supported (dense_dim == 0)");
+  TORCH_CHECK(mat1_.size(1) == mat2_.size(0),
+              "mat1 and mat2 shapes cannot be multiplied (", mat1_.size(0), "x", mat1_.size(1), " and ", mat2_.size(0), "x", mat2_.size(1), ")");
+  TORCH_CHECK(mat1_.scalar_type() == mat2_.scalar_type(),
+              "sparse_sparse_matmul_mps: mat1 dtype ", mat1_.scalar_type(),
+              " does not match mat2 dtype ", mat2_.scalar_type());
+
+  const auto device = mat1_.device();
+
+  auto A = mat1_.coalesce();
+  auto B = mat2_.coalesce();
+
+  const auto I = A.size(0);
+  const auto K = A.size(1);
+  const auto N = B.size(1);
+
+  const auto nnzA = A._nnz();
+  const auto nnzB = B._nnz();
+
+  // Early empty result, return an empty, coalesced tensor
+  if (I == 0 || N == 0 || K == 0 || nnzA == 0 || nnzB == 0) {
+    auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong));
+    auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type()));
+    auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options());
+    out._coalesced_(true);
+    return out;
+  }
+
+  const auto computeDtype = at::result_type(mat1_, mat2_);
+
+  auto A_idx = A._indices().contiguous();
+  auto A_val = A._values().to(computeDtype).contiguous();
+  auto A_i = A_idx.select(0, 0).contiguous();
+  auto A_k = A_idx.select(0, 1).contiguous();
+
+  auto B_idx = B._indices().contiguous();
+  auto B_val = B._values().to(computeDtype).contiguous();
+  auto B_k = B_idx.select(0, 0).contiguous();
+  auto B_j = B_idx.select(0, 1).contiguous();
+
+  // csr-style row pointers for B by k (the shared dimension)
+  Tensor row_ptr_B;
+  {
+    auto batch_ptr = at::tensor({0LL, nnzB}, at::device(device).dtype(at::kLong));
+    row_ptr_B = at::empty({K + 1}, at::device(device).dtype(at::kLong));
+    build_row_ptr_per_batch_mps(B_k, batch_ptr, /*B=*/1, /*I=*/K, row_ptr_B);
+  }
+
+  auto row_ptr_B_lo = row_ptr_B.narrow(0, 0, K);
+  auto row_ptr_B_hi = row_ptr_B.narrow(0, 1, K);
+  auto deg_B = row_ptr_B_hi.sub(row_ptr_B_lo);
+
+  auto counts = deg_B.index_select(0, A_k);
+
+  const int64_t P = counts.sum().item<int64_t>();
+  if (P == 0) {
+    auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong));
+    auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type()));
+    auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options());
+    out._coalesced_(true);
+    return out;
+  }
+
+  auto group_ids = repeat_interleave_mps(counts);
+
+  // exclusive cumsum of counts
+  auto offsets = cumsum(counts, /*dim=*/0).sub(counts);
+  auto offsets_gather = offsets.index_select(0, group_ids);
+  auto within = at::arange(P, at::device(device).dtype(at::kLong)).sub(offsets_gather);
+
+  // Map each output element to its source B row and position
+  auto k_per_out = A_k.index_select(0, group_ids);
+  auto start_in_B = row_ptr_B.index_select(0, k_per_out);
+  auto seg_index = start_in_B.add(within);
+
+  // Assemble candidate coo pairs and values
+  auto i_out = A_i.index_select(0, group_ids).contiguous();
+  auto j_out = B_j.index_select(0, seg_index).contiguous();
+  auto vA_out = A_val.index_select(0, group_ids).contiguous();
+  auto vB_out = B_val.index_select(0, seg_index).contiguous();
+  auto v_out = vA_out.mul(vB_out);
+
+  // build (2, P) indices
+  auto out_indices = at::empty({2, P}, at::device(device).dtype(at::kLong)).contiguous();
+  out_indices.select(0, 0).copy_(i_out);
+  out_indices.select(0, 1).copy_(j_out);
+
+  auto result = _sparse_coo_tensor_unsafe(
+      out_indices, v_out, {I, N}, mat1_.options().dtype(computeDtype));
+
+  result = result.coalesce();
+
+  if (result.scalar_type() != mat1_.scalar_type()) {
+    auto cast_vals = result._values().to(mat1_.scalar_type());
+    auto out = _sparse_coo_tensor_unsafe(result._indices(), cast_vals, {I, N}, mat1_.options());
+    out._coalesced_(true);
+    return out;
+  }
+  return result;
+}
+
 REGISTER_MPS_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_mps_kernel);
 } // namespace at::native
--- a/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip
+++ b/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip
@ -37,7 +37,6 @@
 #ifdef USE_FLASH_ATTENTION
 #include <ATen/core/Tensor.h>
 #include <ATen/hip/HIPContext.h>
-#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
 #include <ATen/hip/HIPGraphsUtils.cuh>

 #ifndef AT_PER_OPERATOR_HEADERS
@ -162,7 +161,7 @@ mha_fwd_aot(const at::Tensor &q,         // batch_size x seqlen_q x num_heads x
            std::optional<int64_t> window_size_right,
            const bool return_softmax,
            const std::optional<at::Generator>& gen_) {
-  auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA().stream();
+  auto stream = at::cuda::getCurrentCUDAStream().stream();
  check_gpu_arch(stream);

  auto q_dtype = q.dtype();
@ -348,8 +347,8 @@ mha_varlen_fwd_aot(const at::Tensor &q,  // total_q x num_heads x head_size, tot
  TORCH_CHECK(!paged_KV, "[ROCm] mha_varlen_fwd: block_table_ must be nullopt");
  TORCH_CHECK(!alibi_slopes_.has_value(), "[ROCm] mha_varlen_fwd: alibi_slopes_ must be nullopt");

-  at::hip::HIPGuardMasqueradingAsCUDA device_guard{(char)q.get_device()};
-  auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA().stream();
+  at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+  auto stream = at::cuda::getCurrentCUDAStream().stream();
  check_gpu_arch(stream);

  auto q_dtype = q.dtype();
@ -560,8 +559,8 @@ mha_bwd_aot(const at::Tensor &dout,  // batch_size x seqlen_q x num_heads, x hea
        const at::Tensor& philox_offset) {
  // Otherwise the kernel will be launched from cuda:0 device
  // Cast to char to avoid compiler warning about narrowing
-  at::hip::HIPGuardMasqueradingAsCUDA device_guard{(char)q.get_device()};
-  auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA().stream();
+  at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+  auto stream = at::cuda::getCurrentCUDAStream().stream();
  check_gpu_arch(stream);

  bool is_dropout = p_dropout > 0.0;
@ -793,8 +792,8 @@ mha_varlen_bwd_aot(const at::Tensor &dout,  // total_q x num_heads, x head_size

  // Otherwise the kernel will be launched from cuda:0 device
  // Cast to char to avoid compiler warning about narrowing
-  at::hip::HIPGuardMasqueradingAsCUDA device_guard{(char)q.get_device()};
-  auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA().stream();
+  at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+  auto stream = at::cuda::getCurrentCUDAStream().stream();
  check_gpu_arch(stream);

  bool is_dropout = p_dropout > 0.0;
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip
+++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip
@ -261,7 +261,7 @@ mha_bwd_ck(const at::Tensor &dout,                   // batch_size x seqlen_q x
    if (is_causal) { window_size_right = 0; }

    bool is_dropout = p_dropout > 0.0;
-    auto stream = at::cuda::getCurrentHIPStream().stream();
+    auto stream = at::cuda::getCurrentCUDAStream().stream();

    auto q_dtype = q.dtype();
    TORCH_CHECK(q_dtype == at::kHalf || q_dtype == at::kBFloat16,
@ -365,7 +365,7 @@ mha_bwd_ck(const at::Tensor &dout,                   // batch_size x seqlen_q x
    }

    // Cast to char to avoid compiler warning about narrowing
-    at::hip::HIPGuardMasqueradingAsCUDA device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{(char)q.get_device()};

    auto opts = q.options();
    auto softmax_d = at::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat));
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip
+++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip
@ -261,7 +261,7 @@ mha_fwd_ck(const at::Tensor &q,                      // batch_size x seqlen_q x

    // Otherwise the kernel will be launched from cuda:0 device
    // Cast to char to avoid compiler warning about narrowing
-    at::hip::HIPGuardMasqueradingAsCUDA device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{(char)q.get_device()};

    auto opts = q.options();
    bool has_lse = true;
@ -299,7 +299,7 @@ mha_fwd_ck(const at::Tensor &q,                      // batch_size x seqlen_q x


        hipLaunchKernelGGL(
-            flash::ParsePhiloxCudaState, dim3(1), dim3(64), 0, at::hip::getCurrentHIPStreamMasqueradingAsCUDA(), philox_args, rng_state_ptr);
+            flash::ParsePhiloxCudaState, dim3(1), dim3(64), 0, at::cuda::getCurrentCUDAStream(), philox_args, rng_state_ptr);
        seed_t = at::scalar_tensor(at::Scalar(static_cast<uint64_t>(rng_state_ptr[0])), at::dtype(at::kLong));
        offset_t = at::scalar_tensor(at::Scalar(static_cast<uint64_t>(rng_state_ptr[1])), at::dtype(at::kLong));
    }
@ -317,7 +317,7 @@ mha_fwd_ck(const at::Tensor &q,                      // batch_size x seqlen_q x

    if (seqlen_k > 0) {
        auto drop_seed_offset = std::make_pair(rng_state_ptr, rng_state_ptr + 1);
-        auto stream = at::cuda::getCurrentHIPStream().stream();
+        auto stream = at::cuda::getCurrentCUDAStream().stream();
        ck_tile::stream_config stream_config{stream};

        auto traits =
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip
+++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip
@ -255,7 +255,7 @@ mha_varlen_bwd_ck(const at::Tensor &dout,                   // total_q x num_hea
    if (is_causal) { window_size_right = 0; }

    bool is_dropout = p_dropout > 0.0;
-    auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA().stream();
+    auto stream = at::cuda::getCurrentCUDAStream().stream();

    auto q_dtype = q.dtype();
    TORCH_CHECK(q_dtype == at::kHalf || q_dtype == at::kBFloat16,
@ -366,7 +366,7 @@ mha_varlen_bwd_ck(const at::Tensor &dout,                   // total_q x num_hea
    }

    // Cast to char to avoid compiler warning about narrowing
-    at::hip::HIPGuardMasqueradingAsCUDA device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{(char)q.get_device()};

    auto opts = q.options();
    auto softmax_d = at::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat));
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip
+++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip
@ -273,7 +273,7 @@ mha_varlen_fwd_ck(const at::Tensor &q,                   // total_q x num_heads

    // Otherwise the kernel will be launched from cuda:0 device
    // Cast to char to avoid compiler warning about narrowing
-    at::hip::HIPGuardMasqueradingAsCUDA device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{(char)q.get_device()};

    auto opts = q.options();
    bool has_lse = true;
@ -307,7 +307,7 @@ mha_varlen_fwd_ck(const at::Tensor &q,                   // total_q x num_heads
        std::lock_guard<std::mutex> lock(gen->mutex_);
        auto philox_args = gen->philox_cuda_state(counter_offset);
        hipLaunchKernelGGL(
-            flash::ParsePhiloxCudaState, dim3(1), dim3(64), 0, at::hip::getCurrentHIPStreamMasqueradingAsCUDA(), philox_args, rng_state_ptr);
+            flash::ParsePhiloxCudaState, dim3(1), dim3(64), 0, at::cuda::getCurrentCUDAStream(), philox_args, rng_state_ptr);
    }

    // remove const from attn_bias_
@ -320,7 +320,7 @@ mha_varlen_fwd_ck(const at::Tensor &q,                   // total_q x num_heads

    if (max_seqlen_k > 0) {
        auto drop_seed_offset = std::make_pair(rng_state_ptr, rng_state_ptr + 1);
-        auto stream = at::cuda::getCurrentHIPStream().stream();
+        auto stream = at::cuda::getCurrentCUDAStream().stream();
        ck_tile::stream_config stream_config{stream};

        auto traits =
--- a/aten/src/ATen/native/transformers/hip/flash_attn/flash_common_hip.hpp
+++ b/aten/src/ATen/native/transformers/hip/flash_attn/flash_common_hip.hpp
@ -7,7 +7,6 @@
 #include <ATen/TensorIndexing.h>
 #include <ATen/core/Tensor.h>
 #include <ATen/hip/HIPContext.h>
-#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
 #include <ATen/hip/HIPGraphsUtils.cuh>

 #ifndef AT_PER_OPERATOR_HEADERS
--- a/aten/src/ATen/xpu/XPUEvent.h
+++ b/aten/src/ATen/xpu/XPUEvent.h
@ -1,191 +1,3 @@
 #pragma once
 #include <ATen/xpu/XPUContext.h>
-
-#include <optional>
-
-namespace at::xpu {
-
-/*
- * XPUEvent are movable not copyable wrappers around SYCL event. XPUEvent are
- * constructed lazily when first recorded. It has a device, and this device is
- * acquired from the first recording stream. Later streams that record the event
- * must match the same device.
- *
- * Currently, XPUEvent does NOT support to export an inter-process event from
- * another process via inter-process communication(IPC). So it means that
- * inter-process communication for event handles between different processes is
- * not available. This could impact some applications that rely on cross-process
- * synchronization and communication.
- */
-struct TORCH_XPU_API XPUEvent {
-  // Constructors
-  XPUEvent(bool enable_timing = false) noexcept
-      : enable_timing_{enable_timing} {}
-
-  ~XPUEvent() {
-    if (isCreated()) {
-      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-      if (C10_UNLIKELY(interp)) {
-        (*interp)->trace_gpu_event_deletion(
-            at::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
-      }
-    }
-  }
-
-  XPUEvent(const XPUEvent&) = delete;
-  XPUEvent& operator=(const XPUEvent&) = delete;
-
-  XPUEvent(XPUEvent&& other) = default;
-  XPUEvent& operator=(XPUEvent&& other) = default;
-
-  operator sycl::event&() const {
-    return event();
-  }
-
-  std::optional<at::Device> device() const {
-    if (isCreated()) {
-      return at::Device(at::kXPU, device_index_);
-    } else {
-      return std::nullopt;
-    }
-  }
-
-  inline bool isCreated() const {
-    return (event_.get() != nullptr);
-  }
-
-  DeviceIndex device_index() const {
-    return device_index_;
-  }
-
-  sycl::event& event() const {
-    return *event_;
-  }
-
-  bool query() const {
-    using namespace sycl::info;
-    if (!isCreated()) {
-      return true;
-    }
-
-    return event().get_info<event::command_execution_status>() ==
-        event_command_status::complete;
-  }
-
-  void record() {
-    record(getCurrentXPUStream());
-  }
-
-  void recordOnce(const XPUStream& stream) {
-    if (!isCreated()) {
-      record(stream);
-    }
-  }
-
-  void record(const XPUStream& stream) {
-    if (!isCreated()) {
-      device_index_ = stream.device_index();
-      assignEvent(stream.queue());
-      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-      if (C10_UNLIKELY(interp)) {
-        (*interp)->trace_gpu_event_creation(
-            at::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
-      }
-    } else {
-      TORCH_CHECK(
-          device_index_ == stream.device_index(),
-          "Event device ",
-          device_index_,
-          " does not match recording stream's device ",
-          stream.device_index(),
-          ".");
-      reassignEvent(stream.queue());
-    }
-    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-    if (C10_UNLIKELY(interp)) {
-      (*interp)->trace_gpu_event_record(
-          at::kXPU,
-          reinterpret_cast<uintptr_t>(event_.get()),
-          reinterpret_cast<uintptr_t>(&stream.queue()));
-    }
-  }
-
-  void block(const XPUStream& stream) {
-    if (isCreated()) {
-      std::vector<sycl::event> event_list{event()};
-      // Make this stream wait until event_ is completed.
-      stream.queue().ext_oneapi_submit_barrier(event_list);
-      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-      if (C10_UNLIKELY(interp)) {
-        (*interp)->trace_gpu_event_wait(
-            at::kXPU,
-            reinterpret_cast<uintptr_t>(event_.get()),
-            reinterpret_cast<uintptr_t>(&stream.queue()));
-      }
-    }
-  }
-
-  double elapsed_time(const XPUEvent& other) const {
-    TORCH_CHECK(
-        isCreated() && other.isCreated(),
-        "Both events must be recorded before calculating elapsed time.");
-    TORCH_CHECK(
-        query() && other.query(),
-        "Both events must be completed before calculating elapsed time.");
-    TORCH_CHECK(
-        enable_timing_ && other.enable_timing_,
-        "Both events must be created with argument 'enable_timing=True'.");
-
-#if SYCL_COMPILER_VERSION < 20250000
-    TORCH_CHECK_NOT_IMPLEMENTED(
-        false,
-        "elapsed_time of XPUEvent requires PyTorch to be built with SYCL compiler version 2025.0.0 or newer.");
-#endif
-
-    using namespace sycl::info::event_profiling;
-    // Block until both of the recorded events are completed.
-    uint64_t end_time_ns = other.event().get_profiling_info<command_end>();
-    uint64_t start_time_ns = event().get_profiling_info<command_end>();
-    // Return the eplased time in milliseconds.
-    return 1e-6 *
-        (static_cast<double>(end_time_ns) - static_cast<double>(start_time_ns));
-  }
-
-  void synchronize() const {
-    if (isCreated()) {
-      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-      if (C10_UNLIKELY(interp)) {
-        (*interp)->trace_gpu_event_synchronization(
-            at::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
-      }
-      event().wait_and_throw();
-    }
-  }
-
- private:
-  void assignEvent(sycl::queue& queue) {
-#if SYCL_COMPILER_VERSION >= 20250000
-    if (enable_timing_) {
-      event_ = std::make_unique<sycl::event>(
-          sycl::ext::oneapi::experimental::submit_profiling_tag(queue));
-    } else {
-      event_ = std::make_unique<sycl::event>(queue.ext_oneapi_submit_barrier());
-    }
-#else
-    event_ = std::make_unique<sycl::event>(queue.ext_oneapi_submit_barrier());
-#endif
-  }
-
-  void reassignEvent(sycl::queue& queue) {
-    event_.reset();
-    assignEvent(queue);
-  }
-
-  bool enable_timing_ = false;
-  DeviceIndex device_index_ = -1;
-  // Only need to track the last event, as events in an in-order queue are
-  // executed sequentially.
-  std::unique_ptr<sycl::event> event_;
-};
-
-} // namespace at::xpu
+#include <c10/xpu/XPUEvent.h>
--- a/benchmarks/dynamo/check_accuracy.py
+++ b/benchmarks/dynamo/check_accuracy.py
@ -50,6 +50,7 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
                "mobilenet_v2",
                "pytorch_CycleGAN_and_pix2pix",
                "pytorch_stargan",
+                "repvgg_a2",
                "resnet152",
                "resnet18",
                "resnet50",
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_timm_training.csv
@ -10,7 +10,7 @@ beit_base_patch16_224,pass,7



-convnextv2_nano.fcmae_ft_in22k_in1k,pass,7
+convnextv2_nano.fcmae_ft_in22k_in1k,fail_accuracy,7



@ -66,7 +66,7 @@ visformer_small,pass,7



-vit_base_patch14_dinov2.lvd142m,pass,7
+vit_base_patch14_dinov2.lvd142m,fail_accuracy,7



--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_timm_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_timm_training.csv
@ -50,7 +50,7 @@ nfnet_l0,pass,7



-repvgg_a2,fail_accuracy,7
+repvgg_a2,pass,7



--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@ -952,7 +952,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs):
        first_fields.append(kwargs["tag"])
    headers = first_headers + ["speedup", "abs_latency"]
    row = first_fields + [float(speedup), median[1] * 1000]
-    msg = f"{speedup:.3f}x"
+    msg = f"{median[0] * 1000} ms, {median[1] * 1000} ms, {speedup:.3f}x"
    if args.baseline:
        headers.extend(
            [
@ -1010,7 +1010,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs):
    # Hypothetically you can use this from other places, but it's currently
    # inaccessible, and when this assert fails you need to update the
    # event_name here to account for the other cases you are using this
-    assert args.quantization is not None
+    assert any([args.quantization, args.optimus])
    output_signpost(
        dict(zip(headers, row)),
        args,
@ -2288,11 +2288,9 @@ class BenchmarkRunner:
                    )
                ):
                    is_same = False
-            except Exception as e:
+            except Exception:
                # Sometimes torch.allclose may throw RuntimeError
-                exception_string = str(e)
-                accuracy_status = f"fail_exception: {exception_string}"
-                return record_status(accuracy_status, dynamo_start_stats=start_stats)
+                is_same = False

            if not is_same:
                accuracy_status = "eager_two_runs_differ"
@ -2409,11 +2407,9 @@ class BenchmarkRunner:
                    force_max_multiplier=force_max_multiplier,
                ):
                    is_same = False
-            except Exception as e:
+            except Exception:
                # Sometimes torch.allclose may throw RuntimeError
-                exception_string = str(e)
-                accuracy_status = f"fail_exception: {exception_string}"
-                return record_status(accuracy_status, dynamo_start_stats=start_stats)
+                is_same = False

            if not is_same:
                if self.args.skip_accuracy_check:
@ -2587,6 +2583,9 @@ class BenchmarkRunner:
                **experiment_kwargs,
            )

+            # reset dynamo
+            torch._dynamo.reset()
+
            if self.args.export_aot_inductor:
                optimized_model_iter_fn = optimize_ctx
            else:
@ -2950,7 +2949,7 @@ class BenchmarkRunner:
            status = self.check_tolerance(name, model, example_inputs, optimize_ctx)
            print(status)
        elif self.args.performance:
-            if self.args.backend == "torchao":
+            if self.args.backend in ["torchao", "optimus"]:
                status = self.run_performance_test_non_alternate(
                    name, model, example_inputs, optimize_ctx, experiment, tag
                )
@ -3526,6 +3525,12 @@ def parse_args(args=None):
        action="store_true",
        help="Measure speedup with TorchInductor",
    )
+    group.add_argument(
+        "--optimus",
+        choices=["vertical_opt", "horizontal_opt", "all"],
+        default=None,
+        help="Measure speedup of Optimus with TorchInductor baseline",
+    )
    group.add_argument(
        "--quantization",
        choices=[
@ -3783,6 +3788,9 @@ def run(runner, args, original_dir=None):
    if args.inductor:
        assert args.backend is None
        args.backend = "inductor"
+    if args.optimus:
+        assert args.backend is None
+        args.backend = "optimus"
    if args.quantization:
        assert args.backend is None
        args.backend = "torchao"
@ -4067,10 +4075,22 @@ def run(runner, args, original_dir=None):

            runner.model_iter_fn = model_iter_fn_and_mark_step
            optimize_ctx = torchao_optimize_ctx(args.quantization)
+        elif args.backend == "optimus":
+            from .optimus import get_baseline_ctx, get_optimus_optimize_ctx
+
+            baseline_ctx = get_baseline_ctx(
+                nopython=args.nopython, inductor_compile_mode=args.inductor_compile_mode
+            )
+            runner.model_iter_fn = baseline_ctx(runner.model_iter_fn)
+            optimize_ctx = get_optimus_optimize_ctx(
+                args.optimus, args.nopython, args.inductor_compile_mode
+            )
        else:
            optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython)
        experiment = (
-            speedup_experiment if args.backend != "torchao" else latency_experiment
+            speedup_experiment
+            if args.backend not in ["torchao", "optimus"]
+            else latency_experiment
        )
        if args.accuracy:
            output_filename = f"accuracy_{args.backend}.csv"
@ -4091,7 +4111,12 @@ def run(runner, args, original_dir=None):
    if args.only in runner.disable_cudagraph_models:
        args.disable_cudagraphs = True

-    if args.inductor or args.backend == "inductor" or args.export_aot_inductor:
+    if (
+        args.inductor
+        or args.backend == "inductor"
+        or args.export_aot_inductor
+        or args.backend == "optimus"
+    ):
        inductor_config.triton.cudagraphs = not args.disable_cudagraphs
        inductor_config.triton.persistent_reductions = (
            not args.disable_persistent_reductions
--- a/benchmarks/dynamo/optimus.py
+++ b/benchmarks/dynamo/optimus.py
@ -0,0 +1,62 @@
+import functools
+
+import torch
+
+
+def get_baseline_ctx(nopython, inductor_compile_mode):
+    return functools.partial(
+        torch.compile,
+        backend="inductor",
+        fullgraph=nopython,
+        mode=inductor_compile_mode,
+    )
+
+
+def get_optimus_optimize_ctx(config, nopython, inductor_compile_mode):
+    if config == "vertical_opt":
+        optimus_inductor_config = {
+            "pre_grad_fusion_options": {
+                "normalization_pass": {},
+                "merge_splits_pass": {},
+                "split_cat_pass": {},
+                "unbind_stack_pass": {},
+                "unbind_cat_to_view_pass": {},
+            }
+        }
+    elif config == "horizontal_opt":
+        optimus_inductor_config = {
+            "pre_grad_fusion_options": {
+                "normalization_pass": {},
+                "batch_linear": {},
+                "batch_layernorm": {},
+            },
+        }
+    elif config == "all":
+        optimus_inductor_config = {
+            "pre_grad_fusion_options": {
+                "normalization_pass": {},
+                "batch_linear": {},
+                "batch_layernorm": {},
+                "merge_splits_pass": {},
+                "split_cat_pass": {},
+                "unbind_stack_pass": {},
+                "unbind_cat_to_view_pass": {},
+            },
+        }
+    else:
+        raise RuntimeError(f"Unknown optimus config: {config}")
+
+    def _inner(fn):
+        if "pre_grad_fusion_options" in optimus_inductor_config:
+            torch._inductor.config.pre_grad_fusion_options = optimus_inductor_config[
+                "pre_grad_fusion_options"
+            ]
+        if "post_grad_fusion_options" in optimus_inductor_config:
+            torch._inductor.config.post_grad_fusion_options = optimus_inductor_config[
+                "post_grad_fusion_options"
+            ]
+        return torch.compile(
+            fn, backend="inductor", fullgraph=nopython, mode=inductor_compile_mode
+        )
+
+    return _inner
--- a/benchmarks/dynamo/parse_logs.py
+++ b/benchmarks/dynamo/parse_logs.py
@ -2,6 +2,7 @@ import csv
 import os
 import re
 import sys
+from pathlib import Path


 # This script takes the logs produced by the benchmark scripts (e.g.,
@ -15,8 +16,7 @@ import sys
 # This script is not very well written, feel free to rewrite it as necessary

 assert len(sys.argv) == 2
-
-full_log = open(sys.argv[1]).read()
+full_log = Path(sys.argv[1]).read_text()

 # If the log contains a gist URL, extract it so we can include it in the CSV
 gist_url = ""
--- a/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv
+++ b/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv
@ -484,24 +484,106 @@ PyTorch,sum,sum_R256_V512_dim0_contiguousTrue_cpu,short,False,50.954394,0.000000
 PyTorch,sum,sum_R256_V512_dim0_contiguousFalse_cpu,short,False,57.957757,0.000000
 PyTorch,sum,sum_R256_V512_dim1_contiguousTrue_cpu,short,False,53.592068,0.000000
 PyTorch,sum,sum_R256_V512_dim1_contiguousFalse_cpu,short,False,51.339726,0.000000
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N16_cpu,short,False,7.040985,0.000000
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N64_cpu,short,False,7.168604,0.000000
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N128_cpu,short,False,7.434442,0.000000
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N16_cpu,short,False,7.078318,0.000000
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N64_cpu,short,False,7.426670,0.000000
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N128_cpu,short,False,7.679027,0.000000
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N16_cpu,short,False,7.281365,0.000000
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N64_cpu,short,False,7.682783,0.000000
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N128_cpu,short,False,8.381938,0.000000
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N16_cpu,short,False,7.039854,0.000000
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N64_cpu,short,False,7.399855,0.000000
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N128_cpu,short,False,7.715193,0.000000
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N16_cpu,short,False,7.255140,0.000000
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N64_cpu,short,False,7.753522,0.000000
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N128_cpu,short,False,8.364281,0.000000
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N16_cpu,short,False,7.476377,0.000000
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N64_cpu,short,False,8.458564,0.000000
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N128_cpu,short,False,9.391939,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,0.927,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.uint8,short,False,6.261,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int8,short,False,6.351,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int16,short,False,6.177,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int32,short,False,6.333,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int64,short,False,6.588,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float16,short,False,8.117,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bfloat16,short,False,9.358,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float32,short,False,7.844,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float64,short,False,8.097,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bool,short,False,6.159,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,0.926,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int8,short,False,6.192,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int16,short,False,6.276,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,6.461,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int64,short,False,6.524,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float16,short,False,8.136,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bfloat16,short,False,6.854,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float32,short,False,6.446,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float64,short,False,6.829,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bool,short,False,6.088,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.uint8,short,False,6.059,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int8,short,False,0.922,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int16,short,False,6.263,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int32,short,False,6.330,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int64,short,False,6.688,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float16,short,False,8.176,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bfloat16,short,False,6.959,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float32,short,False,6.430,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float64,short,False,6.818,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bool,short,False,6.350,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.uint8,short,False,6.221,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int8,short,False,6.193,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int16,short,False,0.922,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int32,short,False,6.263,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int64,short,False,6.525,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float16,short,False,7.960,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bfloat16,short,False,6.801,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float32,short,False,6.594,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float64,short,False,7.089,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bool,short,False,6.498,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,6.358,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int8,short,False,6.390,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int16,short,False,6.415,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,0.925,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int64,short,False,6.657,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float16,short,False,7.954,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bfloat16,short,False,6.930,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float32,short,False,6.737,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float64,short,False,6.948,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bool,short,False,6.757,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.uint8,short,False,6.402,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int8,short,False,6.550,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int16,short,False,6.518,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int32,short,False,6.766,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int64,short,False,0.929,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float16,short,False,8.557,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bfloat16,short,False,9.045,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float32,short,False,7.672,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float64,short,False,7.276,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bool,short,False,6.414,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.uint8,short,False,7.736,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int8,short,False,7.889,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int16,short,False,8.170,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int32,short,False,7.783,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int64,short,False,7.743,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float16,short,False,0.927,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bfloat16,short,False,7.018,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float32,short,False,8.428,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float64,short,False,6.767,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bool,short,False,6.479,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.uint8,short,False,7.827,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int8,short,False,6.450,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int16,short,False,6.320,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int32,short,False,6.385,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int64,short,False,8.119,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float16,short,False,8.063,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bfloat16,short,False,0.925,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float32,short,False,8.629,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float64,short,False,6.638,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bool,short,False,6.425,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.uint8,short,False,7.803,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int8,short,False,6.502,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int16,short,False,6.429,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int32,short,False,6.549,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int64,short,False,7.749,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float16,short,False,7.301,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bfloat16,short,False,7.682,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,0.930,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float64,short,False,6.738,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bool,short,False,6.798,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.uint8,short,False,6.506,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int8,short,False,6.494,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int16,short,False,6.668,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int32,short,False,6.696,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int64,short,False,7.115,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float16,short,False,7.910,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bfloat16,short,False,7.410,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float32,short,False,6.868,0.000000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float64,short,False,0.924,0.000000
 PyTorch,addcmul,addcmul_M1_N2_cpu_dtypetorch.float32,short,False,4.461410,0.000000
 PyTorch,addcmul,addcmul_M1_N2_cpu_dtypetorch.bfloat16,short,False,4.560082,0.000000
 PyTorch,addcmul,addcmul_M32_N64_cpu_dtypetorch.float32,short,False,5.141248,0.000000
--- a/benchmarks/operator_benchmark/pt/tensor_to_test.py
+++ b/benchmarks/operator_benchmark/pt/tensor_to_test.py
@ -4,74 +4,84 @@ import torch


 tensor_conversion_short_configs = op_bench.cross_product_configs(
-    M=(
-        8,
-        16,
-        32,
-    ),
-    N=(
-        16,
-        64,
-        128,
-    ),
+    M=[32],
+    N=[128],
    device=["cpu", "cuda"],
+    dtype_one=[
+        torch.bool,
+        torch.uint8,
+        torch.int8,
+        torch.int16,
+        torch.int32,
+        torch.int64,
+        torch.half,
+        torch.bfloat16,
+        torch.float,
+        torch.double,
+    ],
+    dtype_two=[
+        torch.bool,
+        torch.uint8,
+        torch.int8,
+        torch.int16,
+        torch.int32,
+        torch.int64,
+        torch.half,
+        torch.bfloat16,
+        torch.float,
+        torch.double,
+    ],
    tags=["short"],
 )

 tensor_conversion_long_configs = op_bench.cross_product_configs(
-    M=(
-        64,
-        128,
-        256,
-        512,
-    ),
-    N=(
-        256,
-        512,
-        1024,
-        2048,
-    ),
+    M=[1024],
+    N=[1024],
    device=["cpu", "cuda"],
+    dtype_one=[
+        torch.bool,
+        torch.uint8,
+        torch.int8,
+        torch.int16,
+        torch.int32,
+        torch.int64,
+        torch.half,
+        torch.bfloat16,
+        torch.float,
+        torch.double,
+    ],
+    dtype_two=[
+        torch.bool,
+        torch.uint8,
+        torch.int8,
+        torch.int16,
+        torch.int32,
+        torch.int64,
+        torch.half,
+        torch.bfloat16,
+        torch.float,
+        torch.double,
+    ],
    tags=["long"],
 )


-class FloatToHalfTensorConversionBenchmark(op_bench.TorchBenchmarkBase):
-    def init(self, M, N, device):
+class TensorConversionBenchmark(op_bench.TorchBenchmarkBase):
+    def init(self, M, N, dtype_one, dtype_two, device):
        self.inputs = {
            "input": torch.rand(
                M, N, device=device, requires_grad=False, dtype=torch.float
-            )
+            ).to(dtype=dtype_one)
        }
+        self.dtype_one = dtype_one
+        self.dtype_two = dtype_two

    def forward(self, input):
-        return input.to(torch.half)
+        return input.to(dtype=self.dtype_two)


-class HalfToFloatTensorConversionBenchmark(op_bench.TorchBenchmarkBase):
-    def init(self, M, N, device):
-        self.inputs = {
-            "input": torch.rand(
-                M, N, device=device, requires_grad=False, dtype=torch.half
-            )
-        }
-
-    def forward(self, input):
-        return input.to(torch.float)
-
-
-op_bench.generate_pt_test(
-    tensor_conversion_short_configs, FloatToHalfTensorConversionBenchmark
-)
-op_bench.generate_pt_test(
-    tensor_conversion_long_configs, FloatToHalfTensorConversionBenchmark
-)
-op_bench.generate_pt_test(
-    tensor_conversion_short_configs, HalfToFloatTensorConversionBenchmark
-)
-op_bench.generate_pt_test(
-    tensor_conversion_long_configs, HalfToFloatTensorConversionBenchmark
-)
+op_bench.generate_pt_test(tensor_conversion_short_configs, TensorConversionBenchmark)
+op_bench.generate_pt_test(tensor_conversion_long_configs, TensorConversionBenchmark)

 if __name__ == "__main__":
    op_bench.benchmark_runner.main()
--- a/benchmarks/operator_benchmark/x86_64_expected_ci_operator_benchmark_eager_float32_cpu.csv
+++ b/benchmarks/operator_benchmark/x86_64_expected_ci_operator_benchmark_eager_float32_cpu.csv
@ -349,24 +349,106 @@ PyTorch,sum,sum_R256_V512_dim0_contiguousTrue_cpu,short,FALSE,12.5841
 PyTorch,sum,sum_R256_V512_dim0_contiguousFALSE_cpu,short,FALSE,20.8765
 PyTorch,sum,sum_R256_V512_dim1_contiguousTrue_cpu,short,FALSE,15.4414
 PyTorch,sum,sum_R256_V512_dim1_contiguousFALSE_cpu,short,FALSE,15.3287
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N16_cpu,short,FALSE,5.0499
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N64_cpu,short,FALSE,5.3229
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N128_cpu,short,FALSE,5.4418
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N16_cpu,short,FALSE,5.0868
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N64_cpu,short,FALSE,5.4495
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N128_cpu,short,FALSE,5.5578
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N16_cpu,short,FALSE,5.2631
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N64_cpu,short,FALSE,5.5646
-PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N128_cpu,short,FALSE,5.7898
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N16_cpu,short,FALSE,5.0228
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N64_cpu,short,FALSE,5.3692
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N128_cpu,short,FALSE,5.4006
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N16_cpu,short,FALSE,5.1107
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N64_cpu,short,FALSE,5.4119
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N128_cpu,short,FALSE,5.5583
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N16_cpu,short,FALSE,5.3818
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N64_cpu,short,FALSE,5.5742
-PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N128_cpu,short,FALSE,6.8414
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,0.797
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.uint8,short,False,6.071
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int8,short,False,6.031
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int16,short,False,6.243
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int32,short,False,7.231
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int64,short,False,7.791
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float16,short,False,12.661
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bfloat16,short,False,11.225
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float32,short,False,9.772
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float64,short,False,9.872
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bool,short,False,6.033
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,0.781
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int8,short,False,6.060
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int16,short,False,6.180
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,7.258
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int64,short,False,7.758
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float16,short,False,10.504
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bfloat16,short,False,6.749
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float32,short,False,7.679
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float64,short,False,7.797
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bool,short,False,6.019
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.uint8,short,False,6.079
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int8,short,False,0.785
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int16,short,False,6.188
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int32,short,False,7.288
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int64,short,False,7.770
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float16,short,False,10.466
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bfloat16,short,False,6.676
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float32,short,False,7.736
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float64,short,False,7.780
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bool,short,False,6.130
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.uint8,short,False,6.221
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int8,short,False,6.101
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int16,short,False,0.791
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int32,short,False,6.254
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int64,short,False,7.733
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float16,short,False,10.562
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bfloat16,short,False,6.704
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float32,short,False,7.819
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float64,short,False,8.276
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bool,short,False,6.361
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,6.364
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int8,short,False,6.309
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int16,short,False,6.362
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,0.791
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int64,short,False,7.746
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float16,short,False,9.462
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bfloat16,short,False,6.678
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float32,short,False,7.827
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float64,short,False,8.200
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bool,short,False,6.925
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.uint8,short,False,6.947
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int8,short,False,6.962
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int16,short,False,6.906
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int32,short,False,7.664
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int64,short,False,0.782
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float16,short,False,10.528
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bfloat16,short,False,10.123
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float32,short,False,9.234
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float64,short,False,8.694
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bool,short,False,12.653
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.uint8,short,False,9.348
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int8,short,False,8.774
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int16,short,False,9.063
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int32,short,False,10.012
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int64,short,False,13.641
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float16,short,False,0.788
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bfloat16,short,False,13.757
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float32,short,False,7.170
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float64,short,False,12.511
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bool,short,False,6.516
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.uint8,short,False,8.539
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int8,short,False,6.483
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int16,short,False,6.468
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int32,short,False,7.752
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int64,short,False,9.868
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float16,short,False,10.556
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bfloat16,short,False,0.792
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float32,short,False,7.577
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float64,short,False,8.267
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bool,short,False,6.819
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.uint8,short,False,7.715
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int8,short,False,6.754
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int16,short,False,6.825
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int32,short,False,7.790
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int64,short,False,9.219
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float16,short,False,5.977
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bfloat16,short,False,7.069
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,0.794
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float64,short,False,8.301
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bool,short,False,7.401
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.uint8,short,False,7.843
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int8,short,False,7.117
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int16,short,False,7.170
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int32,short,False,8.000
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int64,short,False,9.284
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float16,short,False,7.179
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bfloat16,short,False,7.645
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float32,short,False,7.988
+PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float64,short,False,0.792
 PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.quint8",short,FALSE,9.4657
 PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.qint8",short,FALSE,9.4625
 PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.qint32",short,FALSE,9.4165
--- a/benchmarks/sparse/spmm.py
+++ b/benchmarks/sparse/spmm.py
@ -52,19 +52,18 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count):
        start.record()
        coo.matmul(mat)
        stop.record()
-
        times.append(start.elapsed_time(stop))

-        coo_mean_time = sum(times) / len(times)
+    coo_mean_time = sum(times) / len(times)

-        times = []
-        for _ in range(test_count):
-            start.record()
-            csr.matmul(mat)
-            stop.record()
-            times.append(start.elapsed_time(stop))
+    times = []
+    for _ in range(test_count):
+        start.record()
+        csr.matmul(mat)
+        stop.record()
+        times.append(start.elapsed_time(stop))

-            csr_mean_time = sum(times) / len(times)
+    csr_mean_time = sum(times) / len(times)

    return coo_mean_time, csr_mean_time

@ -84,10 +83,13 @@ if __name__ == "__main__":

    if args.outfile == "stdout":
        outfile = sys.stdout
+        need_close = False
    elif args.outfile == "stderr":
        outfile = sys.stderr
+        need_close = False
    else:
        outfile = open(args.outfile, "a")
+        need_close = True

    test_count = args.test_count
    m = args.m
@ -148,3 +150,5 @@ if __name__ == "__main__":
            time,
            file=outfile,
        )
+    if need_close:
+        outfile.close()
--- a/Show More
+++ b/Show More