update vllm commit hash

add a curve for customized compilation in the kernel benchmarking scripts (#166697)
It's nice to add a curve with customized compilation options so that we can compare side-by-side the perf improvement of new features. E.g. for mix-order-reduction, by running the following command ``` python benchmarks/dynamo/genai_layers/benchmark.py --tolerance=1e-2 --exit-on-accuracy-failure --visualize rmsnorm_backward --custom-compile-name="compiled-no-fusion" --custom-compile-options='{"triton.mix_order_reduction":false}' ``` I get the following output: ``` Geomean speedup for benchmark RMSNormBackward eager 11 data points compiled 11 data points, 15.82x speedup quack 11 data points, 15.45x speedup liger 11 data points, 14.06x speedup compiled-no-fusion 11 data points, 10.26x speedup ``` The output shows that the feature improves perf on average by `15.82 / 10.26 = 1.54x` across all the shapes tested. (I removed the shape (32768, 32768), whose rnumel is too large and not representative.) The new curve also shows up in the figure: <img width="3564" height="2368" alt="RMSNormBackward_bench" src="https://github.com/user-attachments/assets/1ffac2bc-e726-4f1e-806d-e9e5de711492" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/166697 Approved by: https://github.com/BoyuanFeng ghstack dependencies: #166053, #166382, #166461, #166585, #166675
2025-11-02 23:15:01 +08:00 · 2025-11-02 00:28:59 +00:00 · 2025-11-01 22:09:56 +00:00 · 2025-11-01 22:09:56 +00:00 · 2025-11-01 22:09:48 +00:00 · 2025-11-01 22:09:40 +00:00
598 changed files with 16379 additions and 6867 deletions
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -195,13 +195,16 @@ case "$tag" in
    NINJA_VERSION=1.9.0
    TRITON=yes
    ;;
-  pytorch-linux-jammy-xpu-n-py3)
+  pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    VISION=yes
    XPU_VERSION=2025.2
    NINJA_VERSION=1.9.0
    TRITON=yes
+    if [[ $tag =~ "benchmarks" ]]; then
+      INDUCTOR_BENCHMARKS=yes
+    fi
    ;;
  pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
    ANACONDA_PYTHON_VERSION=3.10
--- a/.ci/docker/common/install_acl.sh
+++ b/.ci/docker/common/install_acl.sh
@ -3,7 +3,7 @@

 set -eux

-ACL_VERSION=${ACL_VERSION:-"v25.02"}
+ACL_VERSION=${ACL_VERSION:-"v52.6.0"}
 ACL_INSTALL_DIR="/acl"

 # Clone ACL
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@ -40,11 +40,7 @@ EOF

    # Default url values
    rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
-    amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
-
-    # Add amdgpu repository
    UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
-    echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list

    # Add rocm repository
    wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@ -12,8 +12,8 @@ function do_install() {

    rocm_version_nodot=${rocm_version//./}

-    # https://github.com/icl-utk-edu/magma/pull/65
-    MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+    # post merge of https://github.com/icl-utk-edu/magma/pull/65
+    MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
    magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"

    rocm_dir="/opt/rocm"
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -97,7 +97,7 @@ case ${image} in
    manylinux2_28-builder:xpu)
        TARGET=xpu_final
        GPU_IMAGE=amd64/almalinux:8
-        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13"
        MANY_LINUX_VERSION="2_28"
        ;;
    *)
--- a/.ci/docker/ubuntu-xpu/Dockerfile
+++ b/.ci/docker/ubuntu-xpu/Dockerfile
@ -54,12 +54,15 @@ ENV OPENSSL_DIR /opt/openssl
 RUN rm install_openssl.sh

 ARG INDUCTOR_BENCHMARKS
+ARG ANACONDA_PYTHON_VERSION
+ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
 COPY ci_commit_pins/timm.txt timm.txt
+COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt

 # Install XPU Dependencies
 ARG XPU_VERSION
--- a/.ci/lumen_cli/pyproject.toml
+++ b/.ci/lumen_cli/pyproject.toml
@ -6,7 +6,7 @@ dependencies = [
    "GitPython==3.1.45",
    "docker==7.1.0",
    "pytest==7.3.2",
-    "uv==0.9.5"
+    "uv==0.9.6"
 ]

 [tool.setuptools]
--- a/.ci/magma-rocm/Makefile
+++ b/.ci/magma-rocm/Makefile
@ -1,7 +1,7 @@
 SHELL=/usr/bin/env bash

 DOCKER_CMD ?= docker
-DESIRED_ROCM ?= 7.0
+DESIRED_ROCM ?= 7.1
 DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
 PACKAGE_NAME = magma-rocm
 # inherit this from underlying docker image, do not pass this env var to docker
@ -16,6 +16,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	magma-rocm/build_magma.sh

 .PHONY: all
+all: magma-rocm71
 all: magma-rocm70
 all: magma-rocm64

@ -24,6 +25,11 @@ clean:
 	$(RM) -r magma-*
 	$(RM) -r output

+.PHONY: magma-rocm71
+magma-rocm71: DESIRED_ROCM := 7.1
+magma-rocm71:
+	$(DOCKER_RUN)
+
 .PHONY: magma-rocm70
 magma-rocm70: DESIRED_ROCM := 7.0
 magma-rocm70:
--- a/.ci/magma-rocm/build_magma.sh
+++ b/.ci/magma-rocm/build_magma.sh
@ -6,8 +6,8 @@ set -eou pipefail
 # The script expects DESIRED_CUDA and PACKAGE_NAME to be set
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

-# https://github.com/icl-utk-edu/magma/pull/65
-MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+# post merge of https://github.com/icl-utk-edu/magma/pull/65
+MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f

 # Folders for the build
 PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE

 # Fetch magma sources and verify checksum
 pushd ${PACKAGE_DIR}
-git clone https://github.com/jeffdaily/magma
+git clone https://github.com/icl-utk-edu/magma
 pushd magma
 git checkout ${MAGMA_VERSION}
 popd
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -426,7 +426,7 @@ fi
 if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; then
  # export test times so that potential sharded tests that'll branch off this build will use consistent data
  # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
-  python tools/stats/export_test_times.py
+  PYTHONPATH=. python tools/stats/export_test_times.py
 fi
 # don't do this for bazel or s390x or riscv64 as they don't use sccache
 if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -572,6 +572,8 @@ fi

 if [[ "${TEST_CONFIG}" == *cpu* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
+elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
+  DYNAMO_BENCHMARK_FLAGS+=(--device xpu)
 else
  DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
 fi
@ -665,6 +667,8 @@ test_perf_for_dashboard() {
    device=cuda_b200
  elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
    device=rocm
+  elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
+    device=xpu
  fi

  for mode in "${modes[@]}"; do
@ -1757,7 +1761,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
  else
    # Do this after checkout_install_torchbench to ensure we clobber any
    # nightlies that torchbench may pull in
-    if [[ "${TEST_CONFIG}" != *cpu* ]]; then
+    if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* ]]; then
      install_torchrec_and_fbgemm
    fi
    PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
--- a/.github/actions/diskspace-cleanup/action.yml
+++ b/.github/actions/diskspace-cleanup/action.yml
@ -27,7 +27,9 @@ runs:
            docker system prune -af
            diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
            if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
-                echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
+                diskspace_cutoff_int=$((diskspace_cutoff + 0))
+                difference=$((100 - diskspace_cutoff_int))
+                echo "Error: Available diskspace is less than $difference percent. Not enough diskspace."
                echo "$msg"
                exit 1
            else
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-69bbe7363897764f9e758d851cd0340147d27f94
+3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2
--- a/.github/ci_commit_pins/vllm.txt
+++ b/.github/ci_commit_pins/vllm.txt
@ -1 +1 @@
-e5192819208c4d68194844b7dfafbc00020d0dea
+685c99ee77b4818dcdd15b30fe0e0eff0d5d22ec
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -19,6 +19,7 @@ ciflow_push_tags:
 - ciflow/inductor-perf-test-nightly-rocm-mi300
 - ciflow/inductor-perf-test-nightly-rocm-mi355
 - ciflow/inductor-perf-test-nightly-x86-zen
+- ciflow/inductor-perf-test-nightly-xpu
 - ciflow/inductor-periodic
 - ciflow/inductor-rocm
 - ciflow/linux-aarch64
@ -26,6 +27,7 @@ ciflow_push_tags:
 - ciflow/nightly
 - ciflow/op-benchmark
 - ciflow/periodic
+- ciflow/periodic-rocm-mi200
 - ciflow/periodic-rocm-mi300
 - ciflow/pull
 - ciflow/quantization-periodic
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -11,11 +11,17 @@ architectures:
    * Latest XPU
 """

+import json
 import os
+import re
+from pathlib import Path
 from typing import Optional


-# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
+SCRIPT_DIR = Path(__file__).absolute().parent
+REPO_ROOT = SCRIPT_DIR.parent.parent
+
+
 CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
 CUDA_STABLE = "12.8"
 CUDA_ARCHES_FULL_VERSION = {
@ -31,8 +37,7 @@ CUDA_ARCHES_CUDNN_VERSION = {
    "13.0": "9",
 }

-# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
-ROCM_ARCHES = ["6.4", "7.0"]
+ROCM_ARCHES = ["7.0", "7.1"]

 XPU_ARCHES = ["xpu"]

@ -137,9 +142,48 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
 }


-def get_nccl_wheel_version(arch_version: str) -> str:
-    import re
+# Used by tools/nightly.py
+PYTORCH_NIGHTLY_PIP_INDEX_URL = "https://download.pytorch.org/whl/nightly"
+NIGHTLY_SOURCE_MATRIX = {
+    "cpu": dict(
+        name="cpu",
+        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cpu",
+        supported_platforms=["Linux", "macOS", "Windows"],
+        accelerator="cpu",
+    )
+}
+CUDA_NIGHTLY_SOURCE_MATRIX = {
+    f"cuda-{major}.{minor}": dict(
+        name=f"cuda-{major}.{minor}",
+        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cu{major}{minor}",
+        supported_platforms=["Linux", "Windows"],
+        accelerator="cuda",
+    )
+    for major, minor in (map(int, version.split(".")) for version in CUDA_ARCHES)
+}
+ROCM_NIGHTLY_SOURCE_MATRIX = {
+    f"rocm-{major}.{minor}": dict(
+        name=f"rocm-{major}.{minor}",
+        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/rocm{major}.{minor}",
+        supported_platforms=["Linux"],
+        accelerator="rocm",
+    )
+    for major, minor in (map(int, version.split(".")) for version in ROCM_ARCHES)
+}
+XPU_NIGHTLY_SOURCE_MATRIX = {
+    "xpu": dict(
+        name="xpu",
+        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/xpu",
+        supported_platforms=["Linux"],
+        accelerator="xpu",
+    )
+}
+NIGHTLY_SOURCE_MATRIX.update(CUDA_NIGHTLY_SOURCE_MATRIX)
+NIGHTLY_SOURCE_MATRIX.update(ROCM_NIGHTLY_SOURCE_MATRIX)
+NIGHTLY_SOURCE_MATRIX.update(XPU_NIGHTLY_SOURCE_MATRIX)

+
+def get_nccl_wheel_version(arch_version: str) -> str:
    requirements = map(
        str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])
    )
@ -147,17 +191,14 @@ def get_nccl_wheel_version(arch_version: str) -> str:


 def read_nccl_pin(arch_version: str) -> str:
-    from pathlib import Path
-
-    nccl_pin_path = os.path.join(
-        Path(__file__).absolute().parents[2],
-        ".ci",
-        "docker",
-        "ci_commit_pins",
-        f"nccl-cu{arch_version[:2]}.txt",
+    nccl_pin_path = (
+        REPO_ROOT
+        / ".ci"
+        / "docker"
+        / "ci_commit_pins"
+        / f"nccl-cu{arch_version[:2]}.txt"
    )
-    with open(nccl_pin_path) as f:
-        return f.read().strip()
+    return nccl_pin_path.read_text().strip()


 def validate_nccl_dep_consistency(arch_version: str) -> None:
@ -165,7 +206,8 @@ def validate_nccl_dep_consistency(arch_version: str) -> None:
    wheel_ver = get_nccl_wheel_version(arch_version)
    if not nccl_release_tag.startswith(f"v{wheel_ver}"):
        raise RuntimeError(
-            f"{arch_version} NCCL release tag version {nccl_release_tag} does not correspond to wheel version {wheel_ver}"
+            f"{arch_version} NCCL release tag version {nccl_release_tag} "
+            f"does not correspond to wheel version {wheel_ver}"
        )


@ -412,7 +454,14 @@ def generate_wheels_matrix(
    return ret


-validate_nccl_dep_consistency("13.0")
-validate_nccl_dep_consistency("12.9")
-validate_nccl_dep_consistency("12.8")
-validate_nccl_dep_consistency("12.6")
+arch_version = ""
+for arch_version in CUDA_ARCHES:
+    validate_nccl_dep_consistency(arch_version)
+del arch_version
+
+
+if __name__ == "__main__":
+    # Used by tools/nightly.py
+    (SCRIPT_DIR / "nightly_source_matrix.json").write_text(
+        json.dumps(NIGHTLY_SOURCE_MATRIX, indent=4) + "\n"
+    )
--- a/.github/workflows/_xpu-test.yml
+++ b/.github/workflows/_xpu-test.yml
@ -38,6 +38,10 @@ on:
        default: ""
        description: |
          List of tests to include (empty string implies default list)
+      dashboard-tag:
+        required: false
+        type: string
+        default: ""
      disable-monitor:
        description: |
          [Experimental] Disable utilization monitoring for tests.
@ -58,6 +62,11 @@ on:
        required: false
        type: number
        default: 1
+    secrets:
+      HUGGING_FACE_HUB_TOKEN:
+        required: false
+        description: |
+          HF Auth token to avoid rate limits when downloading models or datasets from hub
 permissions:
  id-token: write
  contents: read
@ -196,6 +205,8 @@ jobs:
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
          TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
+          DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
        run: |
          # Fetch aws credential from IMDs
@ -246,6 +257,8 @@ jobs:
            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
            -e TESTS_TO_INCLUDE \
            -e ZE_AFFINITY_MASK \
+            -e HUGGING_FACE_HUB_TOKEN \
+            -e DASHBOARD_TAG \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --ulimit stack=10485760:83886080 \
            --ulimit core=0 \
--- a/.github/workflows/build-almalinux-images.yml
+++ b/.github/workflows/build-almalinux-images.yml
@ -36,7 +36,7 @@ jobs:
    runs-on: linux.9xlarge.ephemeral
    strategy:
      matrix:
-        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"]
+        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm7.0", "rocm7.1", "cpu"]
    steps:
      - name: Build docker image
        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
--- a/.github/workflows/build-libtorch-images.yml
+++ b/.github/workflows/build-libtorch-images.yml
@ -52,8 +52,8 @@ jobs:
          { tag: "cuda12.9" },
          { tag: "cuda12.8" },
          { tag: "cuda12.6" },
-          { tag: "rocm6.4"  },
          { tag: "rocm7.0"  },
+          { tag: "rocm7.1"  },
          { tag: "cpu"      },
        ]
    steps:
--- a/.github/workflows/build-magma-rocm-linux.yml
+++ b/.github/workflows/build-magma-rocm-linux.yml
@ -34,7 +34,7 @@ jobs:
      id-token: write
    strategy:
      matrix:
-        rocm_version: ["70", "64"]
+        rocm_version: ["71", "70"]
    steps:
      - name: Checkout PyTorch
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
--- a/.github/workflows/build-manywheel-images.yml
+++ b/.github/workflows/build-manywheel-images.yml
@ -54,8 +54,8 @@ jobs:
          { name: "manylinuxaarch64-builder",       tag: "cuda12.9",          runner: "linux.arm64.2xlarge.ephemeral" },
          { name: "manylinuxaarch64-builder",       tag: "cuda12.8",          runner: "linux.arm64.2xlarge.ephemeral" },
          { name: "manylinuxaarch64-builder",       tag: "cuda12.6",          runner: "linux.arm64.2xlarge.ephemeral" },
-          { name: "manylinux2_28-builder",          tag: "rocm6.4",           runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28-builder",          tag: "rocm7.0",           runner: "linux.9xlarge.ephemeral" },
+          { name: "manylinux2_28-builder",          tag: "rocm7.1",           runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28-builder",          tag: "cpu",               runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28_aarch64-builder",  tag: "cpu-aarch64",       runner: "linux.arm64.2xlarge.ephemeral" },
          { name: "manylinux2_28-builder",          tag: "xpu",               runner: "linux.9xlarge.ephemeral" },
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@ -55,7 +55,7 @@ jobs:
        docker-image: ["pytorch/manylinux2_28-builder:cpu"]
        include:
          - device: "rocm"
-            rocm_version: "7.0"
+            rocm_version: "7.1"
            runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
          - device: "cuda"
            rocm_version: ""
@ -159,12 +159,7 @@ jobs:
            WITH_CLANG_LDD="--with-clang-ldd"
          fi

-          if [[ "${BUILD_DEVICE}" == xpu ]]; then
-            docker exec -t "${container_name}" bash -c "dnf install -y gcc-toolset-13-gcc-c++"
-            docker exec -t "${container_name}" bash -c "source /opt/rh/gcc-toolset-13/enable && ${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE"
-          else
-            docker exec -t "${container_name}" bash -c "${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE $WITH_CLANG_LDD"
-          fi
+          docker exec -t "${container_name}" bash -c "${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE $WITH_CLANG_LDD"

          if [[ ("${{ matrix.device }}" == "cuda" || "${{ matrix.device }}" == "xpu") ]]; then
            docker exec -t "${container_name}"  bash -c "auditwheel repair --plat ${PLATFORM} //artifacts/*.whl"
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -67,6 +67,7 @@ jobs:
          pytorch-linux-jammy-py3.12-halide,
          pytorch-linux-jammy-xpu-n-1-py3,
          pytorch-linux-jammy-xpu-n-py3,
+          pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,
          pytorch-linux-jammy-py3-clang18-asan,
          pytorch-linux-jammy-py3-clang12-onnx,
          pytorch-linux-jammy-linter,
--- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
@ -384,124 +384,6 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

-  libtorch-rocm6_4-shared-with-deps-release-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm6.4
-      GPU_ARCH_VERSION: "6.4"
-      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: libtorch-cxx11-builder
-      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
-      build_name: libtorch-rocm6_4-shared-with-deps-release
-      build_environment: linux-binary-libtorch
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  libtorch-rocm6_4-shared-with-deps-release-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - libtorch-rocm6_4-shared-with-deps-release-build
-      - get-label-type
-    runs-on: linux.rocm.gpu.mi250
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm6.4
-      GPU_ARCH_VERSION: "6.4"
-      GPU_ARCH_TYPE: rocm
-      SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: libtorch-cxx11-builder
-      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-    permissions:
-      id-token: write
-      contents: read
-    steps:
-      - name: Setup ROCm
-        uses: ./.github/actions/setup-rocm
-      - uses: actions/download-artifact@v4.1.7
-        name: Download Build Artifacts
-        with:
-          name: libtorch-rocm6_4-shared-with-deps-release
-          path: "${{ runner.temp }}/artifacts/"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: ROCm set GPU_FLAG
-        run: |
-          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
-      - name: configure aws credentials
-        id: aws_creds
-        if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-          aws-region: us-east-1
-          role-duration-seconds: 18000
-      - name: Calculate docker image
-        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-          docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
-          docker-image-name: libtorch-cxx11-builder
-          custom-tag-prefix: rocm6.4
-          docker-build-dir: .ci/docker
-          working-directory: pytorch
-      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
-        with:
-          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
-      - name: Test Pytorch binary
-        uses: ./pytorch/.github/actions/test-pytorch-binary
-        env:
-          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
-      - name: Teardown ROCm
-        uses: ./.github/actions/teardown-rocm
-  libtorch-rocm6_4-shared-with-deps-release-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: libtorch-rocm6_4-shared-with-deps-release-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm6.4
-      GPU_ARCH_VERSION: "6.4"
-      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: libtorch-cxx11-builder
-      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-      build_name: libtorch-rocm6_4-shared-with-deps-release
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
  libtorch-rocm7_0-shared-with-deps-release-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -619,3 +501,121 @@ jobs:
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml
+
+  libtorch-rocm7_1-shared-with-deps-release-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: rocm7.1
+      GPU_ARCH_VERSION: "7.1"
+      GPU_ARCH_TYPE: rocm
+      DOCKER_IMAGE: libtorch-cxx11-builder
+      DOCKER_IMAGE_TAG_PREFIX: rocm7.1
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
+      build_name: libtorch-rocm7_1-shared-with-deps-release
+      build_environment: linux-binary-libtorch
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  libtorch-rocm7_1-shared-with-deps-release-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - libtorch-rocm7_1-shared-with-deps-release-build
+      - get-label-type
+    runs-on: linux.rocm.gpu.mi250
+    timeout-minutes: 240
+    env:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: rocm7.1
+      GPU_ARCH_VERSION: "7.1"
+      GPU_ARCH_TYPE: rocm
+      SKIP_ALL_TESTS: 1
+      DOCKER_IMAGE: libtorch-cxx11-builder
+      DOCKER_IMAGE_TAG_PREFIX: rocm7.1
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - name: Setup ROCm
+        uses: ./.github/actions/setup-rocm
+      - uses: actions/download-artifact@v4.1.7
+        name: Download Build Artifacts
+        with:
+          name: libtorch-rocm7_1-shared-with-deps-release
+          path: "${{ runner.temp }}/artifacts/"
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: ROCm set GPU_FLAG
+        run: |
+          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
+      - name: configure aws credentials
+        id: aws_creds
+        if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+          aws-region: us-east-1
+          role-duration-seconds: 18000
+      - name: Calculate docker image
+        id: calculate-docker-image
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
+          docker-image-name: libtorch-cxx11-builder
+          custom-tag-prefix: rocm7.1
+          docker-build-dir: .ci/docker
+          working-directory: pytorch
+      - name: Pull Docker image
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        with:
+          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
+      - name: Test Pytorch binary
+        uses: ./pytorch/.github/actions/test-pytorch-binary
+        env:
+          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
+      - name: Teardown ROCm
+        uses: ./.github/actions/teardown-rocm
+  libtorch-rocm7_1-shared-with-deps-release-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: libtorch-rocm7_1-shared-with-deps-release-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: rocm7.1
+      GPU_ARCH_VERSION: "7.1"
+      GPU_ARCH_TYPE: rocm
+      DOCKER_IMAGE: libtorch-cxx11-builder
+      DOCKER_IMAGE_TAG_PREFIX: rocm7.1
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      build_name: libtorch-rocm7_1-shared-with-deps-release
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
--- a/.github/workflows/inductor-perf-test-nightly-xpu.yml
+++ b/.github/workflows/inductor-perf-test-nightly-xpu.yml
@ -0,0 +1,148 @@
+name: inductor-perf-nightly-xpu
+
+on:
+  push:
+    tags:
+      - ciflow/inductor-perf-test-nightly-xpu/*
+  schedule:
+    - cron: 30 17 * * *
+  workflow_dispatch:
+    inputs:
+      training:
+        description: Run training (on by default)?
+        required: false
+        type: boolean
+        default: true
+      inference:
+        description: Run inference (on by default)?
+        required: false
+        type: boolean
+        default: true
+      default:
+        description: Run inductor_default?
+        required: false
+        type: boolean
+        default: false
+      dynamic:
+        description: Run inductor_dynamic_shapes?
+        required: false
+        type: boolean
+        default: false
+      cppwrapper:
+        description: Run inductor_cpp_wrapper?
+        required: false
+        type: boolean
+        default: false
+      cudagraphs:
+        description: Run inductor_cudagraphs?
+        required: false
+        type: boolean
+        default: false
+      freezing_cudagraphs:
+        description: Run inductor_cudagraphs with freezing for inference?
+        required: false
+        type: boolean
+        default: false
+      aotinductor:
+        description: Run aot_inductor for inference?
+        required: false
+        type: boolean
+        default: false
+      maxautotune:
+        description: Run inductor_max_autotune?
+        required: false
+        type: boolean
+        default: false
+      benchmark_configs:
+        description: The list of configs used by the benchmark
+        required: false
+        type: string
+        default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+permissions: read-all
+
+jobs:
+  get-label-type:
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+      opt_out_experiments: lf
+
+  xpu-n-py3_10-inductor-benchmark-build:
+    name: xpu-n-py3.10-inductor-benchmark
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-xpu-n-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks
+      runner: linux.c7i.12xlarge
+      test-matrix: |
+        { include: [
+          { config: "inductor_huggingface_perf_xpu", shard: 1, num_shards: 5, runner: "linux.idc.xpu" },
+          { config: "inductor_huggingface_perf_xpu", shard: 2, num_shards: 5, runner: "linux.idc.xpu" },
+          { config: "inductor_huggingface_perf_xpu", shard: 3, num_shards: 5, runner: "linux.idc.xpu" },
+          { config: "inductor_huggingface_perf_xpu", shard: 4, num_shards: 5, runner: "linux.idc.xpu" },
+          { config: "inductor_huggingface_perf_xpu", shard: 5, num_shards: 5, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
+        ]}
+    secrets: inherit
+
+  xpu-n-py3_10-inductor-benchmark-test-nightly:
+    permissions:
+      id-token: write
+      contents: read
+    if: github.event_name != 'workflow_dispatch'
+    name: xpu-n-py3.10-inductor-benchmark
+    uses: ./.github/workflows/_xpu-test.yml
+    needs: xpu-n-py3_10-inductor-benchmark-build
+    with:
+      build-environment: linux-jammy-xpu-n-py3.10
+      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
+      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
+      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
+      timeout-minutes: 720
+      # Disable monitor in perf tests for more investigation
+      disable-monitor: true
+      monitor-log-interval: 10
+      monitor-data-collect-interval: 2
+    secrets: inherit
+
+  xpu-n-py3_10-inductor-benchmark-test:
+    permissions:
+      id-token: write
+      contents: read
+    if: github.event_name == 'workflow_dispatch'
+    name: xpu-n-py3.10-inductor-test
+    uses: ./.github/workflows/_xpu-test.yml
+    needs: xpu-n-py3_10-inductor-benchmark-build
+    with:
+      build-environment: linux-jammy-xpu-n-py3.10
+      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
+      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
+      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
+      timeout-minutes: 720
+      disable-monitor: false
+      monitor-log-interval: 15
+      monitor-data-collect-interval: 4
+    secrets: inherit
--- a/.github/workflows/periodic-rocm-mi200.yml
+++ b/.github/workflows/periodic-rocm-mi200.yml
@ -0,0 +1,84 @@
+name: periodic-rocm-mi200
+
+on:
+  schedule:
+    # We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
+    # Also run less frequently on weekends.
+    - cron: 45 0,8,16 * * 1-5
+    - cron: 45 4 * * 0,6
+    - cron: 45 4,12,20 * * 1-5
+    - cron: 45 12 * * 0,6
+    - cron: 29 8 * * *  # about 1:29am PDT, for mem leak check and rerun disabled tests
+  push:
+    tags:
+      - ciflow/periodic/*
+      - ciflow/periodic-rocm-mi200/*
+    branches:
+      - release/*
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  llm-td:
+    if: github.repository_owner == 'pytorch'
+    name: before-test
+    uses: ./.github/workflows/llm_td_retrieval.yml
+    permissions:
+      id-token: write
+      contents: read
+
+  target-determination:
+    name: before-test
+    uses: ./.github/workflows/target_determination.yml
+    needs: llm-td
+    permissions:
+      id-token: write
+      contents: read
+
+  get-label-type:
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch'
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+
+  linux-jammy-rocm-py3_10-build:
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
+      test-matrix: |
+        { include: [
+          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
+        ]}
+    secrets: inherit
+
+  linux-jammy-rocm-py3_10-test:
+    permissions:
+      id-token: write
+      contents: read
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_rocm-test.yml
+    needs:
+      - linux-jammy-rocm-py3_10-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
+    secrets: inherit
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@ -204,37 +204,6 @@ jobs:
      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
    secrets: inherit

-  linux-jammy-rocm-py3_10-build:
-    name: linux-jammy-rocm-py3.10
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10
-      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
-      test-matrix: |
-        { include: [
-          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
-          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
-          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
-        ]}
-    secrets: inherit
-
-  linux-jammy-rocm-py3_10-test:
-    permissions:
-      id-token: write
-      contents: read
-    name: linux-jammy-rocm-py3.10
-    uses: ./.github/workflows/_rocm-test.yml
-    needs:
-      - linux-jammy-rocm-py3_10-build
-      - target-determination
-    with:
-      build-environment: linux-jammy-rocm-py3.10
-      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
-    secrets: inherit
-
  linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build:
    name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck
    uses: ./.github/workflows/_linux-build.yml
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@ -6,6 +6,7 @@ on:
      - pull
      - trunk
      - periodic
+      - periodic-rocm-mi200
      - periodic-rocm-mi300
      - inductor
      - unstable
--- a/.gitignore
+++ b/.gitignore
@ -143,6 +143,7 @@ scripts/release_notes/*.json
 sccache-stats*.json
 lint.json
 merge_record.json
+.github/scripts/nightly_source_matrix.json

 # These files get copied over on invoking setup.py
 torchgen/packaged/*
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -374,7 +374,7 @@ cmake_dependent_option(
  "Build the lazy Torchscript backend, not compatible with mobile builds" ON
  "NOT INTERN_BUILD_MOBILE" OFF)
 cmake_dependent_option(BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF)
-cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
+cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder"
                       OFF "USE_CUDA" OFF)
 cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
                        "CPU_AARCH64" OFF)
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@ -825,6 +825,14 @@ void Context::setDisplayVmapFallbackWarnings(bool enabled) {
  display_vmap_fallback_warnings_ = enabled;
 }

+bool Context::warnOnAccumulateGradStreamMismatch() const {
+  return warn_on_accumulate_grad_stream_mismatch_;
+}
+
+void Context::setWarnOnAccumulateGradStreamMismatch(bool enabled) {
+  warn_on_accumulate_grad_stream_mismatch_ = enabled;
+}
+
 bool Context::isDefaultMobileCPUAllocatorSet() {
  return prev_allocator_ptr_ != nullptr;
 }
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@ -404,6 +404,9 @@ class TORCH_API Context {
  void setDisplayVmapFallbackWarnings(bool enabled);
  bool areVmapFallbackWarningsEnabled() const;

+  void setWarnOnAccumulateGradStreamMismatch(bool enabled);
+  bool warnOnAccumulateGradStreamMismatch() const;
+
  bool isDefaultMobileCPUAllocatorSet();
  void setDefaultMobileCPUAllocator();
  void unsetDefaultMobileCPUAllocator();
@ -494,6 +497,7 @@ class TORCH_API Context {
  bool release_original_weights = false;
 #endif
  bool display_vmap_fallback_warnings_ = false;
+  bool warn_on_accumulate_grad_stream_mismatch_ = true;
  std::atomic<at::QEngine> quantized_engine = at::QEngine::NoQEngine;
  bool enable_sparse_tensor_invariant_checks = false;
  bool allow_fp16_reduction_cpu = false;
--- a/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h
@ -19,6 +19,13 @@ inline namespace CPU_CAPABILITY {
 #error "Big endian is not supported."
 #endif

+// GCC does not properly optimize bf16 operators
+#if defined(__ARM_FEATURE_BF16) && (__clang_major__ >= 19)
+#define BF16_ARITHMETIC_SUPPORTED() 1
+#else
+#define BF16_ARITHMETIC_SUPPORTED() 0
+#endif
+
 // Unlike the float16_t family of types, bfloat16_t is not available
 // when we're not targeting bfloat16 hardware support on some
 // platforms (but not Mac, so we have to be careful not to shadow the
@ -352,18 +359,72 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
        other, &Vectorized<float>::name);                        \
  }

-  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)
  Vectorized frac() const;
-  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc)
  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt)
+
+#ifdef __ARM_FEATURE_BF16
+  // Flip sign bit
+  Vectorized<c10::BFloat16> neg() const {
+    return vreinterpretq_bf16_s16(vreinterpretq_s16_bf16(values) ^ (-32768));
+  }
+  // Fast reciprocal is fine because we are truncating results
+  Vectorized<c10::BFloat16> reciprocal() const {
+    auto x = vcvtq_low_f32_bf16(values);
+    auto y = vcvtq_high_f32_bf16(values);
+    x = vrecpeq_f32(x);
+    y = vrecpeq_f32(y);
+    return vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(x), y);
+  }
+  // Clearing the sign bit
+  Vectorized<c10::BFloat16> abs() const {
+    return vreinterpretq_bf16_u16(vreinterpretq_u16_bf16(values) & 0x7FFF);
+  }
+#else
+  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)
+  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal)
+#endif
+
+// These functions are optimized on clang-21+
+#if BF16_ARITHMETIC_SUPPORTED() && (__clang_major__ >= 21)
+  Vectorized<c10::BFloat16> operator==(
+      const Vectorized<c10::BFloat16>& other) const {
+    return values == other.values;
+  }
+
+  Vectorized<c10::BFloat16> operator!=(
+      const Vectorized<c10::BFloat16>& other) const {
+    return values != other.values;
+  }
+
+  Vectorized<c10::BFloat16> operator<(
+      const Vectorized<c10::BFloat16>& other) const {
+    return values < other.values;
+  }
+
+  Vectorized<c10::BFloat16> operator<=(
+      const Vectorized<c10::BFloat16>& other) const {
+    return values <= other.values;
+  }
+
+  Vectorized<c10::BFloat16> operator>(
+      const Vectorized<c10::BFloat16>& other) const {
+    return values > other.values;
+  }
+
+  Vectorized<c10::BFloat16> operator>=(
+      const Vectorized<c10::BFloat16>& other) const {
+    return values >= other.values;
+  }
+#else
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator==)
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator!=)
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<)
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<=)
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>)
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>=)
+#endif

 #undef DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD
 #undef DEFINE_BINARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD
@ -412,28 +473,52 @@ template <>
 Vectorized<c10::BFloat16> inline operator+(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b) {
+#if BF16_ARITHMETIC_SUPPORTED()
+  bfloat16x8_t x = a;
+  bfloat16x8_t y = b;
+  return x + y;
+#else
  return binary_operator_via_float(std::plus<Vectorized<float>>(), a, b);
+#endif
 }

 template <>
 Vectorized<c10::BFloat16> inline operator-(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b) {
+#if BF16_ARITHMETIC_SUPPORTED()
+  bfloat16x8_t x = a;
+  bfloat16x8_t y = b;
+  return x - y;
+#else
  return binary_operator_via_float(std::minus<Vectorized<float>>(), a, b);
+#endif
 }

 template <>
 Vectorized<c10::BFloat16> inline operator*(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b) {
+#if BF16_ARITHMETIC_SUPPORTED()
+  bfloat16x8_t x = a;
+  bfloat16x8_t y = b;
+  return x * y;
+#else
  return binary_operator_via_float(std::multiplies<Vectorized<float>>(), a, b);
+#endif
 }

 template <>
 Vectorized<c10::BFloat16> inline operator/(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b) {
+#if BF16_ARITHMETIC_SUPPORTED()
+  bfloat16x8_t x = a;
+  bfloat16x8_t y = b;
+  return x / y;
+#else
  return binary_operator_via_float(std::divides<Vectorized<float>>(), a, b);
+#endif
 }

 // frac. Implement this here so we can use subtraction
@ -544,12 +629,19 @@ Vectorized<c10::BFloat16> inline fmadd(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b,
    const Vectorized<c10::BFloat16>& c) {
+#if BF16_ARITHMETIC_SUPPORTED()
+  bfloat16x8_t x = a;
+  bfloat16x8_t y = b;
+  bfloat16x8_t z = c;
+  return x * y + z;
+#else
  // NOTE [BF16 FMA]: There isn't an FMA that accumulates into BF16!  Also,
  // vbfmlalbq_f32 and vbfmlaltq_f32 take the even and odd-numbered
  // elements, not the bottom and top half, so they don't seem
  // particularly useful here. Ideally we would include dot product in
  // the Vectorized interface...
  return a * b + c;
+#endif
 }

 template <>
@ -557,8 +649,15 @@ Vectorized<c10::BFloat16> inline fnmadd(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b,
    const Vectorized<c10::BFloat16>& c) {
+#if BF16_ARITHMETIC_SUPPORTED()
+  bfloat16x8_t x = a;
+  bfloat16x8_t y = b;
+  bfloat16x8_t z = c;
+  return (-x) * y + z;
+#else
  // See NOTE [BF16 FMA] above.
  return -a * b + c;
+#endif
 }

 template <>
@ -566,8 +665,15 @@ Vectorized<c10::BFloat16> inline fmsub(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b,
    const Vectorized<c10::BFloat16>& c) {
+#if BF16_ARITHMETIC_SUPPORTED()
+  bfloat16x8_t x = a;
+  bfloat16x8_t y = b;
+  bfloat16x8_t z = c;
+  return x * y - z;
+#else
  // See NOTE [BF16 FMA] above.
  return a * b - c;
+#endif
 }

 template <>
@ -575,8 +681,15 @@ Vectorized<c10::BFloat16> inline fnmsub(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b,
    const Vectorized<c10::BFloat16>& c) {
+#if BF16_ARITHMETIC_SUPPORTED()
+  bfloat16x8_t x = a;
+  bfloat16x8_t y = b;
+  bfloat16x8_t z = c;
+  return (-x) * y - z;
+#else
  // See NOTE [BF16 FMA] above.
  return -a * b - c;
+#endif
 }

 #endif // !defined(C10_MOBILE) && defined(__aarch64__)
--- a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h
@ -6,9 +6,9 @@ namespace at::vec {
 inline namespace CPU_CAPABILITY {
 #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))

-// Enable auto-vectorization for GCC-13+ and clang-17+
+// Enable auto-vectorization for clang-17+
 // GCC-12 has a bug: gcc.gnu.org/bugzilla/show_bug.cgi?id=117001
-#if __GNUC__ > 12 || (defined(__clang__) && (__clang_major__ >= 17))
+#if defined(__clang__) && (__clang_major__ >= 17)

 template <typename from_type, typename to_type>
 inline void convertImpl(
--- a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h
@ -309,7 +309,7 @@ class Vectorized<float> {
  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1)
  // Implementation copied from Arm Optimized Routine
  // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
-  Vectorized<float> exp_u20() const {
+  inline Vectorized<float> vexpq_f32_u20() const {
    // bail out to sleef if it's a special case:
    // i.e. there's an input s.t. |input| > 87.3....
    const float32x4_t special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);
@ -348,6 +348,9 @@ class Vectorized<float> {

    return vfmaq_f32(scale, poly, scale);
  }
+  Vectorized<float> exp_u20() const {
+    return vexpq_f32_u20();
+  }
  Vectorized<float> fexp_u20() const {
    return exp_u20();
  }
@ -634,7 +637,7 @@ inline Vectorized<float> Vectorized<float>::erf() const {
  // - exp(- x * x)
  auto pow_2 = (*this) * (*this);
  auto neg_pow_2 = pow_2 ^ neg_zero_vec;
-  auto tmp4 = neg_pow_2.exp();
+  auto tmp4 = neg_pow_2.vexpq_f32_u20();
  auto tmp5 = tmp4 ^ neg_zero_vec;
  // erf(x) = sign(x) * (1 - r * t * exp(- x * x))
  auto tmp6 = t * tmp5;
--- a/aten/src/ATen/cuda/CUDAGreenContext.cpp
+++ b/aten/src/ATen/cuda/CUDAGreenContext.cpp
@ -1,78 +1,90 @@
 #include <ATen/cuda/CUDAGreenContext.h>

-namespace at::cuda {
-  GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
-#if CUDA_HAS_GREEN_CONTEXT
-    int driver_version;
-    C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
-    TORCH_CHECK(
-        driver_version >= 12080, "cuda driver too old to use green context!");
-    CUcontext pctx = nullptr;
-    C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
-    if (C10_UNLIKELY(!pctx)) {
-      TORCH_WARN(
-          "Attempted to create a green context but"
-          " there was no primary context! Creating a primary context...");
-
-      cudaFree(0);
-    }
-
-    CUdevice device;
-    device_id_ = device_id;
-    C10_CUDA_DRIVER_CHECK(
-        c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
-
-    // Get device resources
-    CUdevResource device_resource;
-    C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
-        device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
-
-    // Split resources
-    std::vector<CUdevResource> result(1);
-    auto result_data = result.data();
-    unsigned int nb_groups = 1;
-    CUdevResource remaining;
-
-    C10_CUDA_DRIVER_CHECK(
-        c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
-            result_data,
-            &nb_groups,
-            &device_resource,
-            &remaining,
-            0, // default flags
-            num_sms));
-
-    TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
-
-    // Generate resource descriptor
-    CUdevResourceDesc desc;
-    C10_CUDA_DRIVER_CHECK(
-        c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
-            &desc, result_data, 1));
-
-    // Create green context
-    // CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
-    // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
-    C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
-        &green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
-
-    // Convert to regular context
-    C10_CUDA_DRIVER_CHECK(
-        c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
-    TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
+#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
+#include <c10/cuda/driver_api.h>
+#include <stdexcept>
+#include <vector>
+#define HAS_CUDA_GREEN_CONTEXT() 1
 #else
-    TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
+#define HAS_CUDA_GREEN_CONTEXT() 0
+// Suppress unused private field warnings as this class is not supposed to be called
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-private-field")
+#endif
+
+namespace at::cuda {
+
+GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
+#if HAS_CUDA_GREEN_CONTEXT()
+  int driver_version;
+  C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
+  TORCH_CHECK(
+      driver_version >= 12080, "cuda driver too old to use green context!");
+  CUcontext pctx = nullptr;
+  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
+  if (C10_UNLIKELY(!pctx)) {
+    TORCH_WARN(
+        "Attempted to create a green context but"
+        " there was no primary context! Creating a primary context...");
+
+    cudaFree(0);
+  }
+
+   CUdevice device;
+  device_id_ = device_id;
+  C10_CUDA_DRIVER_CHECK(
+      c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
+
+  // Get device resources
+  CUdevResource device_resource;
+  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
+      device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
+
+  // Split resources
+  std::vector<CUdevResource> result(1);
+  auto result_data = result.data();
+  unsigned int nb_groups = 1;
+  CUdevResource remaining;
+
+  C10_CUDA_DRIVER_CHECK(
+      c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
+          result_data,
+          &nb_groups,
+          &device_resource,
+          &remaining,
+          0, // default flags
+          num_sms));
+
+  TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
+
+  // Generate resource descriptor
+  CUdevResourceDesc desc;
+  C10_CUDA_DRIVER_CHECK(
+      c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
+          &desc, result_data, 1));
+
+  // Create green context
+  // CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
+  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
+      &green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
+
+  // Convert to regular context
+  C10_CUDA_DRIVER_CHECK(
+      c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
+  TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
+#else
+  TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
 #endif
  }

  std::unique_ptr<GreenContext> GreenContext::create(
      uint32_t num_sms,
      std::optional<uint32_t> device_id) {
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    if (!device_id.has_value()) {
      device_id = at::cuda::current_device();
    }
-    return std::make_unique<GreenContext>(device_id.value(), num_sms);
+    return std::unique_ptr<GreenContext>(new GreenContext(device_id.value(), num_sms));
 #else
    TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
 #endif
@ -80,7 +92,7 @@ namespace at::cuda {

  // Implement move operations
  GreenContext::GreenContext(GreenContext&& other) noexcept{
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    device_id_ = std::exchange(other.device_id_, -1);
    green_ctx_ = std::exchange(other.green_ctx_, nullptr);
    context_ = std::exchange(other.context_, nullptr);
@ -91,7 +103,7 @@ namespace at::cuda {
  }

  GreenContext& GreenContext::operator=(GreenContext&& other) noexcept{
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    if (this != &other) {
      // Clean up current resources
      if (green_ctx_) {
@ -120,7 +132,7 @@ namespace at::cuda {
  }

  GreenContext::~GreenContext() noexcept{
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    C10_CUDA_DRIVER_CHECK(
        c10::cuda::DriverAPI::get()->cuGreenCtxDestroy_(green_ctx_));
 #else
@ -128,25 +140,9 @@ namespace at::cuda {
 #endif
  }

-  // Get the underlying CUDA context
-  CUcontext GreenContext::getContext() const {
-#if CUDA_HAS_GREEN_CONTEXT
-    return context_;
-#else
-    TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
-#endif
-  }
-
-  // Get the underlying green context
-#if CUDA_HAS_GREEN_CONTEXT
-  CUgreenCtx GreenContext::getGreenContext() const {
-    return green_ctx_;
-  }
-#endif
-
  // Make this context current
  void GreenContext::setContext() {
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    auto current_stream = c10::cuda::getCurrentCUDAStream();
    parent_stream_ = current_stream.stream();

@ -175,7 +171,7 @@ namespace at::cuda {
  }

  void GreenContext::popContext() {
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    // see above note about stream being hardcoded to the default stream
    at::cuda::CUDAEvent ev;
    ev.record(c10::cuda::getCurrentCUDAStream());
--- a/aten/src/ATen/cuda/CUDAGreenContext.h
+++ b/aten/src/ATen/cuda/CUDAGreenContext.h
@ -1,53 +1,38 @@
 #pragma once
 #include <ATen/cuda/CUDAEvent.h>
-
-#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
-#include <c10/cuda/driver_api.h>
 #include <cuda.h>
-#include <memory>
-#include <stdexcept>
-#include <vector>
-#define CUDA_HAS_GREEN_CONTEXT 1
-#else
-#define CUDA_HAS_GREEN_CONTEXT 0
-#endif
+
+// Forward declare green context as opaque ptr
+typedef struct CUgreenCtx_st* CUgreenCtx;

 namespace at::cuda {

 class TORCH_CUDA_CPP_API GreenContext {
 public:
-  GreenContext(uint32_t device_id, uint32_t num_sms);
-
-  static std::unique_ptr<GreenContext> create(uint32_t num_sms, std::optional<uint32_t> device_id);
+  // Green context creation
+  static std::unique_ptr<GreenContext> create(
+      uint32_t num_sms,
+      std::optional<uint32_t> device_id);
+  ~GreenContext() noexcept;

  // Delete copy constructor and assignment
  GreenContext(const GreenContext&) = delete;
  GreenContext& operator=(const GreenContext&) = delete;

-  // Implement move operations
-  GreenContext(GreenContext&& other) noexcept;
-  GreenContext& operator=(GreenContext&& other) noexcept;
-  ~GreenContext() noexcept;
-
-  // Get the underlying CUDA context
-  CUcontext getContext() const;
-
-  // Get the underlying green context
-#if CUDA_HAS_GREEN_CONTEXT
-  CUgreenCtx getGreenContext() const;
-#endif
-
  // Make this context current
  void setContext();

  void popContext();

 private:
-#if CUDA_HAS_GREEN_CONTEXT
+  GreenContext(uint32_t device_id, uint32_t num_sms);
+  // Implement move operations
+  GreenContext(GreenContext&& other) noexcept;
+  GreenContext& operator=(GreenContext&& other) noexcept;
+
  int32_t device_id_ = -1;
  CUgreenCtx green_ctx_ = nullptr;
  CUcontext context_ = nullptr;
  cudaStream_t parent_stream_ = nullptr;
-#endif
 };
 } // namespace at::cuda
--- a/aten/src/ATen/cuda/CUDASparse.h
+++ b/aten/src/ATen/cuda/CUDASparse.h
@ -7,17 +7,6 @@
 #endif


-#if defined(USE_ROCM)
-// hipSparse const API added in v2.4.0
-#if HIPSPARSE_VERSION >= 200400
-#define AT_USE_HIPSPARSE_GENERIC_API() 1
-#else
-#define AT_USE_HIPSPARSE_GENERIC_API() 1
-#endif
-#else // USE_ROCM
-#define AT_USE_HIPSPARSE_GENERIC_API() 0
-#endif // USE_ROCM
-
 // cuSparse Generic API spsv function was added in CUDA 11.3.0
 #if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11500)
 #define AT_USE_CUSPARSE_GENERIC_SPSV() 1
--- a/aten/src/ATen/cuda/detail/BLASConstants.cu
+++ b/aten/src/ATen/cuda/detail/BLASConstants.cu
@ -2,8 +2,6 @@
 #include <ATen/Tensor.h>
 #include <ATen/cuda/Exceptions.h>

-#include <mutex>
-
 namespace at {
 namespace cuda {
 namespace detail {
@ -12,39 +10,36 @@ __device__ __constant__ float cublas_one_device;
 __device__ __constant__ float cublas_zero_device;

 float *get_cublas_device_one() {
-  static c10::once_flag init_flag;
-
-  c10::call_once(init_flag, []() {
+  // Device address of cublas_one_device, resolved by the initializer below.
+  static float *ptr = nullptr;
+  // The flag exists only so the lambda runs as a static initializer; it is
+  // never read, hence [[maybe_unused]]. `ptr` has static storage duration, so
+  // the lambda needs no capture (same form as get_user_alpha_ptr).
+  static bool init_flag [[maybe_unused]] = []() {
     const float one = 1.f;
     AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_one_device, &one, sizeof(float)));
-  });
+    AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_one_device));
+    return true;
+  }();
 
-  float *ptr;
-  AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_one_device));
   return ptr;
 }

 float *get_cublas_device_zero() {
-  static c10::once_flag init_flag;
-
-  c10::call_once(init_flag, []() {
+  // Device address of cublas_zero_device, resolved by the initializer below.
+  static float *ptr = nullptr;
+  // The flag exists only so the lambda runs as a static initializer; it is
+  // never read, hence [[maybe_unused]]. `ptr` has static storage duration, so
+  // the lambda needs no capture (same form as get_user_alpha_ptr).
+  static bool init_flag [[maybe_unused]] = []() {
     const float zero = 0.f;
     AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_zero_device, &zero, sizeof(float)));
-  });
+    AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_zero_device));
+    return true;
+  }();
 
-  float *ptr;
-  AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_zero_device));
   return ptr;
 }

 float *get_user_alpha_ptr() {
  static float *alpha_ptr;

-  static c10::once_flag init_flag;
-
-  c10::call_once(init_flag, []() {
+  static bool init_flag [[maybe_unused]] = []() {
    AT_CUDA_CHECK(cudaMalloc(&alpha_ptr, sizeof(float)));
-  });
+    return true;
+  }();

  return alpha_ptr;
 }
--- a/aten/src/ATen/detail/MTIAHooksInterface.h
+++ b/aten/src/ATen/detail/MTIAHooksInterface.h
@ -1,5 +1,6 @@
 #pragma once

+#include <c10/core/CachingDeviceAllocator.h>
 #include <c10/core/Device.h>
 #include <c10/util/Exception.h>

@ -151,6 +152,36 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
  }

  virtual bool isAvailable() const override;
+
+  /* MTIAGraph related APIs */
+  virtual int64_t mtiagraphCreate(bool keep_graph = false) const {
+    FAIL_MTIAHOOKS_FUNC(__func__);
+    return -1;
+  }
+
+  virtual void mtiagraphCaptureBegin(int64_t handle, MempoolId_t pool) const {
+    FAIL_MTIAHOOKS_FUNC(__func__);
+  }
+
+  virtual void mtiagraphCaptureEnd(int64_t handle) const {
+    FAIL_MTIAHOOKS_FUNC(__func__);
+  }
+
+  virtual void mtiagraphInstantiate(int64_t handle) const {
+    FAIL_MTIAHOOKS_FUNC(__func__);
+  }
+
+  virtual void mtiagraphReplay(int64_t handle) const {
+    FAIL_MTIAHOOKS_FUNC(__func__);
+  }
+
+  virtual void mtiagraphReset(int64_t handle) const {
+    FAIL_MTIAHOOKS_FUNC(__func__);
+  }
+
+  // Default stub: reports the hook as unavailable through FAIL_MTIAHOOKS_FUNC
+  // (macro defined outside this hunk). The trailing return is not meant to be
+  // reached; it gives every control path a value, mirroring the `return -1`
+  // in mtiagraphCreate.
+  virtual MempoolId_t mtiagraphPool(int64_t handle) const {
+    FAIL_MTIAHOOKS_FUNC(__func__);
+    return MempoolId_t{};
+  }
 };

 struct TORCH_API MTIAHooksArgs {};
--- a/aten/src/ATen/functorch/BatchRulesViews.cpp
+++ b/aten/src/ATen/functorch/BatchRulesViews.cpp
@ -534,20 +534,20 @@ Tensor trace_decomp(const Tensor& tensor) {
 std::tuple<Tensor, std::optional<int64_t>> tril_batch_rule(
    const Tensor& self,
    std::optional<int64_t> self_bdim,
-    int64_t diagonal = 0) {
+    c10::SymInt diagonal = 0) {
  TORCH_CHECK(self.dim() >= 2, "tril: The input tensor must have at least 2 dimensions.");
  auto self_ = moveBatchDimToFront(self, self_bdim);
-  auto result = at::tril(self_, diagonal);
+  auto result = at::tril_symint(self_, std::move(diagonal));
  return std::make_tuple(std::move(result), 0);
 }

 std::tuple<Tensor, std::optional<int64_t>> triu_batch_rule(
    const Tensor& self,
    std::optional<int64_t> self_bdim,
-    int64_t diagonal = 0) {
+    c10::SymInt diagonal = 0) {
  TORCH_CHECK(self.dim() >= 2, "triu: The input tensor must have at least 2 dimensions.");
  auto self_ = moveBatchDimToFront(self, self_bdim);
-  auto result = at::triu(self_, diagonal);
+  auto result = at::triu_symint(self_, std::move(diagonal));
  return std::make_tuple(std::move(result), 0);
 }

--- a/aten/src/ATen/mps/MPSDevice.mm
+++ b/aten/src/ATen/mps/MPSDevice.mm
@ -1,7 +1,5 @@
 //  Copyright © 2022 Apple Inc.

-#include <c10/util/CallOnce.h>
-
 #include <ATen/mps/IndexKernels.h>
 #include <ATen/mps/MPSAllocatorInterface.h>
 #include <ATen/mps/MPSDevice.h>
@ -10,9 +8,6 @@

 namespace at::mps {

-static std::unique_ptr<MPSDevice> mps_device;
-static c10::once_flag mpsdev_init;
-
 static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& device) {
  // MPS Advanced Indexing needs at least Metal 2.0 (support for Argument Buffers and function constants)
  // host_name attribute needs at least Metal 2.2 and ulong needs Metal 2.3 (supported on MacOS 11+
@ -21,8 +16,8 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
 }

 MPSDevice* MPSDevice::getInstance() {
-  c10::call_once(mpsdev_init, [] { mps_device = std::unique_ptr<MPSDevice>(new MPSDevice()); });
-  return mps_device.get();
+  static MPSDevice mps_device;
+  return &mps_device;
 }

 MPSDevice::~MPSDevice() {
--- a/aten/src/ATen/native/AveragePool2d.cpp
+++ b/aten/src/ATen/native/AveragePool2d.cpp
@ -25,18 +25,19 @@ TORCH_PRECOMPUTE_META_FUNC(avg_pool2d)
  // #20866, #22032: Guarantee this for the official C++ API?
  TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
    "avg_pool2d: kernel_size must either be a single int, or a tuple of two ints");
-  const int64_t kH = kernel_size[0];
-  const int64_t kW = kernel_size.size() == 1 ? kH : kernel_size[1];
+  const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
+  const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);

  TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2,
    "avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints");
-  const int64_t dH = stride.empty() ? kH : stride[0];
-  const int64_t dW = stride.empty() ? kW : stride.size() == 1 ? dH : stride[1];
+  const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
+  const int dW = stride.empty() ? kW :
+                 stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);

  TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
    "avg_pool2d: padding must either be a single int, or a tuple of two ints");
-  const int64_t padH = padding[0];
-  const int64_t padW = padding.size() == 1 ? padH : padding[1];
+  const int padH = safe_downcast<int, int64_t>(padding[0]);
+  const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);

  TORCH_CHECK(!divisor_override.has_value() || divisor_override.value() != 0,
    "divisor must be not zero");
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@ -410,8 +410,8 @@ struct ConvParams {
      return false;
    }
    static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
-    // broken on cuDNN 9.8
-    if (cudnn_version >= 90800) {
+    // broken on cuDNN 9.8 - 9.14
+    if (cudnn_version >= 90800 && cudnn_version < 91500) {
      if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
          (input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) &&
          weight.dim() == 5) {
--- a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp
@ -139,7 +139,7 @@ void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, dou
      }
    );
  } else {
-    AT_DISPATCH_ALL_TYPES(dtype, "smooth_l1_backward_cpu_out", [&] {
+    AT_DISPATCH_ALL_TYPES_AND(kHalf, dtype, "smooth_l1_backward_cpu_out", [&] {
    auto norm_val = norm.to<scalar_t>();
    scalar_t beta_val(beta);
    auto norm_val_vec = Vectorized<scalar_t>(norm_val);
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -170,10 +170,14 @@ static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const
  #if defined(CUDA_VERSION) || defined(USE_ROCM)
  const auto scalar_type = mat1.scalar_type();
  return (beta.toComplexDouble() == 1.0
-    // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
-    // is to use lt interface only when self is bias.
-    && self.dim() == 1 && self.sizes()[0] == mat2_sizes[1] && self.is_contiguous()
    && result.dim() == 2 && result.is_contiguous()
+    // Conditions for bias to be fusable
+    && (
+      self.is_contiguous() &&
+      // NOTE: fine to have 1-len dims to the left from the right-most one
+      (self.dim() == 1 || self.squeeze().dim() == 1) &&
+      self.sizes().back() == mat2_sizes[1]
+    )
    && ( // some dtype restrictions
      #ifndef USE_ROCM
      scalar_type == at::ScalarType::Double ||
--- a/aten/src/ATen/native/cuda/GroupedBlas.cpp
+++ b/aten/src/ATen/native/cuda/GroupedBlas.cpp
@ -213,9 +213,9 @@ _f4_f4_bf16_grouped_mm_fbgemm(
      const Tensor& mat_a,
      const Tensor& mat_b,
      const Tensor& scale_a,
-      const Tensor& global_scale_a,
+      const std::optional<Tensor>& global_scale_a,
      const Tensor& scale_b,
-      const Tensor& global_scale_b,
+      const std::optional<Tensor>& global_scale_b,
      const std::optional<Tensor>& offs,
      const std::optional<Tensor>& bias,
      Tensor& out) {
@ -225,14 +225,28 @@ _f4_f4_bf16_grouped_mm_fbgemm(
      "mat_a must be Float4_e2n1fn_2, got: ", mat_a.scalar_type());
  TORCH_CHECK_VALUE(mat_b.scalar_type() == at::kFloat4_e2m1fn_x2,
      "mat_b must be Float4_e2n1fn_2, got: ", mat_b.scalar_type());
-  TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e4m3fn,
-      "scale_a must be Float8_e4m3fn, got: ", scale_a.scalar_type());
-  TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e4m3fn,
-      "scale_b must be Float8_e4m3fn, got: ", scale_b.scalar_type());
-  TORCH_CHECK_VALUE(global_scale_a.scalar_type() == at::kFloat,
-      "global_scale_a must be Float, got: ", global_scale_a.scalar_type());
-  TORCH_CHECK_VALUE(global_scale_b.scalar_type() == at::kFloat,
-      "global_scale_b must be Float, got: ", global_scale_b.scalar_type());
+
+  std::optional<Tensor> combined_global_scale = std::nullopt;
+  if (global_scale_a.has_value() || global_scale_b.has_value()) {
+      // NVFP4
+      TORCH_CHECK_VALUE(global_scale_a.has_value() && global_scale_b.has_value(),
+          "For NVFP4 grouped gemm both of global_scale_{a,b} must have values")
+      TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e4m3fn,
+          "scale_a must be Float8_e4m3fn, got: ", scale_a.scalar_type());
+      TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e4m3fn,
+          "scale_b must be Float8_e4m3fn, got: ", scale_b.scalar_type());
+      TORCH_CHECK_VALUE(global_scale_a.value().scalar_type() == at::kFloat,
+          "global_scale_a must be Float, got: ", global_scale_a.value().scalar_type());
+      TORCH_CHECK_VALUE(global_scale_b.value().scalar_type() == at::kFloat,
+          "global_scale_b must be Float, got: ", global_scale_b.value().scalar_type());
+      combined_global_scale = global_scale_a.value().mul(global_scale_b.value());
+  } else {
+      // MXFP4
+      TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e8m0fnu,
+          "scale_a must be Float8_e8m0fnu, got: ", scale_a.scalar_type());
+      TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e8m0fnu,
+          "scale_b must be Float8_e8m0fnu, got: ", scale_b.scalar_type());
+  }

  auto o = fbgemm_gpu::f4f4bf16_grouped_mm(
      mat_a,
@ -241,7 +255,7 @@ _f4_f4_bf16_grouped_mm_fbgemm(
      scale_b,
      offs.value(),
      out,
-      global_scale_a.mul(global_scale_b)
+      combined_global_scale
  );
 #else
  TORCH_CHECK_NOT_IMPLEMENTED(false, "nvfp4 grouped gemm is not supported without USE_FBGEMM_GENAI, and only for CUDA")
@ -471,9 +485,10 @@ namespace {

 using acceptance_fn = std::function<bool(c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&, c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&)>;

-std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 3> scale_grouped_kernel_dispatch = {{
+std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 4> scale_grouped_kernel_dispatch = {{
  { "rowwise_rowwise", scaled_blas::check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE},
  { "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8},
+  { "mxfp4_mxfp4", scaled_blas::check_mxfp4_recipe, ScaledGemmImplementation::MXFP4_MXFP4},
  { "nvfp4_nvfp4", scaled_blas::check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4}}};

 } // anonymous namespace
@ -599,6 +614,21 @@ _scaled_grouped_mm_cuda_v2(
          offs.value(),
          out);
    }
+    case ScaledGemmImplementation::MXFP4_MXFP4: {
+      // scale shape checks
+      _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
+      _check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
+      return _f4_f4_bf16_grouped_mm_fbgemm(
+          mat_a,
+          mat_b,
+          scale_a[0], /* block-scale A */
+          std::nullopt, /* global-scale A */
+          scale_b[0], /* block-scale B */
+          std::nullopt, /* global-scale B */
+          offs.value(),
+          std::nullopt, /* bias */
+          out);
+    }
    case ScaledGemmImplementation::NVFP4_NVFP4: {
      // scale shape checks
      _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
--- a/aten/src/ATen/native/cuda/IndexKernelUtils.cu
+++ b/aten/src/ATen/native/cuda/IndexKernelUtils.cu
@ -13,7 +13,7 @@ __global__ void vectorized_gather_kernel(char * out, char * inp, index_t * idx,
    if (allow_neg_indices) {
        ind = (ind < 0) ? ind + ind_dim_size : ind;
    }
-    CUDA_KERNEL_ASSERT(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds");
+    CUDA_KERNEL_ASSERT_VERBOSE(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds", "Expected 0 <= index < ind_dim_size(%ld), but got index = %ld", ind_dim_size, ind);
    int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; // off is guaranteed to be within int32 limits
    if (off >= slice_size) return;
    auto vec = at::native::memory::ld_vec<Alignment>(inp + ind * inp_stride + off);
--- a/aten/src/ATen/native/cuda/ScaledBlas.cpp
+++ b/aten/src/ATen/native/cuda/ScaledBlas.cpp
@ -794,6 +794,24 @@ void _check_deepseek_scale_stride(const Tensor& scale, const Tensor& t, const Sc
  }
 }

+// Gate for DeepSeek-style (1x128, 128x128) scaling; compiled out on ROCm.
+void
+_check_deepseek_support() {
+#ifndef USE_ROCM
+  // Only on Hopper GPUs
+  auto dprops = at::cuda::getCurrentDeviceProperties();
+  TORCH_CHECK_NOT_IMPLEMENTED(
+    dprops->major == 9,
+    "DeepSeek style (1x128, 128x128) scaling only supported in CUDA for SM90");
+  // Only in cublasLt >= 12.9. The check passes when its condition is true, so
+  // it must state the supported case for both the headers and the runtime.
+  TORCH_CHECK_NOT_IMPLEMENTED(
+    CUBLAS_VERSION >= 120900 && cublasLtGetVersion() >= 120900,
+    "DeepSeek style (1x128, 128x128) scaling requires cublasLt >= 12.9"
+  );
+#endif
+}
+
 Tensor&
 _scaled_block1x128_block1x128(
          const Tensor& mat_a, const Tensor& mat_b,
@ -802,8 +820,12 @@ _scaled_block1x128_block1x128(
          const c10::ScalarType out_dtype,
          const bool use_fast_accum,
          Tensor& out) {
+#ifndef USE_ROCM
  // Restrictions:
  // A, B are FP8, scales are fp32, shape K//128
+  // CUDA: Only Hopper GPUs
+  _check_deepseek_support();
+
  TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
      mat_a.scalar_type(), mat_b.scalar_type());
  TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,
@ -821,6 +843,12 @@ _scaled_block1x128_block1x128(
  _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);

  return out;
+#else
+  TORCH_CHECK_NOT_IMPLEMENTED(
+    false,
+    "1x128 and 128x128 scaling not available with ROCm"
+  );
+#endif
 }

 Tensor&
@ -831,10 +859,12 @@ _scaled_block128x128_block1x128(
          const c10::ScalarType out_dtype,
          const bool use_fast_accum,
          Tensor& out) {
+#ifndef USE_ROCM
  // Restrictions:
  // A, B are FP8, scales are fp32, shape K//128
-  std::cout << "mat_b: " << mat_b.dim() << ", " << mat_b.sizes() << ", " << mat_b.strides() << std::endl;
-  std::cout << "scale_b: " << scale_b.dim() << ", " << scale_b.sizes() << ", " << scale_b.strides() << std::endl;
+  // CUDA: Only Hopper GPUs
+  _check_deepseek_support();
+
  TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
      mat_a.scalar_type(), mat_b.scalar_type());
  TORCH_CHECK_VALUE(scale_a.sizes()[0] == ceil_div<int64_t>(mat_a.sizes()[0], 128) && scale_a.sizes()[1] == ceil_div<int64_t>(mat_a.sizes()[1], 128) && scale_a.scalar_type() == kFloat,
@ -852,6 +882,12 @@ _scaled_block128x128_block1x128(
  _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);

  return out;
+#else
+  TORCH_CHECK_NOT_IMPLEMENTED(
+    false,
+    "1x128 and 128x128 scaling not available with ROCm"
+  );
+#endif
 }

 Tensor&
@ -862,8 +898,12 @@ _scaled_block1x128_block128x128(
          const c10::ScalarType out_dtype,
          const bool use_fast_accum,
          Tensor& out) {
+#ifndef USE_ROCM
  // Restrictions:
  // A, B are FP8, scales are fp32, A: shape K//128, B: K//128, N//128
+  // CUDA: Only Hopper GPUs
+  _check_deepseek_support();
+
  TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
      mat_a.scalar_type(), mat_b.scalar_type());
  TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,
@ -881,6 +921,12 @@ _scaled_block1x128_block128x128(
  _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);

  return out;
+#else
+  TORCH_CHECK_NOT_IMPLEMENTED(
+    false,
+    "1x128 and 128x128 scaling not available with ROCm"
+  );
+#endif
 }

 Tensor&
--- a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu
+++ b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu
@ -160,8 +160,8 @@ struct _cuda_scatter_gather_internal_kernel {
      auto offsets = offset_calc.get(i);

      int64_t idx_dim = *(index_t*)(index_ptr + offsets[2]);
-      CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
-        && "scatter gather kernel index out of bounds");
+      CUDA_KERNEL_ASSERT_VERBOSE(idx_dim >= 0 && idx_dim < index_size
+        && "scatter gather kernel index out of bounds", "Expected 0 <= idx_dim < index_size (%ld), but got idx_dim = %ld", index_size, idx_dim);

      f(
        (scalar_t*)(self_ptr + offsets[0]),
@ -406,9 +406,8 @@ struct _cuda_scatter_fill_internal_kernel {
      auto offsets = offset_calc.get(i);

      int64_t idx_dim = *(index_t*)(index_ptr + offsets[1]);
-      CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
-        && "index out of bounds"
-      );
+      CUDA_KERNEL_ASSERT_VERBOSE(idx_dim >= 0 && idx_dim < index_size
+        && "index out of bounds", "Expected 0 <= idx_dim < index_size (%ld), but got idx_dim = %ld", index_size, idx_dim);

      f(
        (scalar_t*)(self_ptr + offsets[0]),
--- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu
+++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
@ -141,7 +141,8 @@ WelfordDataLN cuWelfordOnlineSum(
  if constexpr (!rms_norm){
    U delta = val - curr_sum.mean;
    U new_count = curr_sum.count + 1.f;
-#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
+//Due to low CU count, we run into accuracy issues on gfx90a with `__builtin_amdgcn_rcpf`
+#if defined(USE_ROCM) && !defined(__gfx90a__) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
    U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count);
 #else
    U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster
@ -163,7 +164,8 @@ WelfordDataLN cuWelfordCombine(
    U count = dataA.count + dataB.count;
    U mean, sigma2;
    if (count > decltype(dataB.count){0}) {
-#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
+//Due to low CU count, we run into accuracy issues on gfx90a with `__builtin_amdgcn_rcpf`
+#if defined(USE_ROCM) && !defined(__gfx90a__) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
      auto coef = __builtin_amdgcn_rcpf(count);
 #else
      auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division
--- a/aten/src/ATen/native/mps/kernels/BinaryKernel.metal
+++ b/aten/src/ATen/native/mps/kernels/BinaryKernel.metal
@ -86,6 +86,28 @@ struct zeta_functor {
  }
 };

+struct logaddexp_functor {
+  template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
+  inline T operator()(const T a, const T b) {
+    return c10::metal::logaddexp(a, b);
+  }
+  template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
+  inline float operator()(const T a, const T b) {
+    return c10::metal::logaddexp(float(a), float(b));
+  }
+};
+
+struct logaddexp2_functor {
+  template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
+  inline T operator()(const T a, const T b) {
+    return c10::metal::logaddexp2(a, b);
+  }
+  template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
+  inline float operator()(const T a, const T b) {
+    return c10::metal::logaddexp2(float(a), float(b));
+  }
+};
+
 struct xlog1py_functor {
  template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
  inline T operator()(const T a, const T b) {
@ -377,6 +399,10 @@ REGISTER_FLOAT_BINARY_OP(fmin);
 REGISTER_FLOAT_BINARY_OP(nextafter);
 REGISTER_FLOAT_BINARY_OP(zeta);
 REGISTER_INT2FLOAT_BINARY_OP(zeta);
+REGISTER_FLOAT_BINARY_OP(logaddexp);
+REGISTER_INT2FLOAT_BINARY_OP(logaddexp);
+REGISTER_FLOAT_BINARY_OP(logaddexp2);
+REGISTER_INT2FLOAT_BINARY_OP(logaddexp2);
 REGISTER_FLOAT_BINARY_OP(xlog1py);
 REGISTER_INT2FLOAT_BINARY_OP(xlog1py);
 REGISTER_FLOAT_BINARY_OP(chebyshev_polynomial_t);
@ -463,6 +489,8 @@ REGISTER_BINARY_OP(add, float2, float2);
 REGISTER_BINARY_OP(add, half2, half2);
 REGISTER_BINARY_OP(sub, float2, float2);
 REGISTER_BINARY_OP(sub, half2, half2);
+REGISTER_BINARY_OP(logaddexp, float2, float2);
+REGISTER_BINARY_OP(logaddexp, half2, half2);
 REGISTER_BINARY_ALPHA_OP(add_alpha, float2, float2, float2);
 REGISTER_BINARY_ALPHA_OP(add_alpha, half2, half2, half2);
 REGISTER_BINARY_ALPHA_OP(sub_alpha, float2, float2, float2);
--- a/aten/src/ATen/native/mps/operations/BinaryKernel.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryKernel.mm
@ -89,6 +89,14 @@ static void zeta_mps_kernel(TensorIteratorBase& iter) {
  lib.exec_binary_kernel(iter, "zeta");
 }

+static void logaddexp_mps_kernel(TensorIteratorBase& iter) {
+  lib.exec_binary_kernel(iter, "logaddexp");
+}
+
+static void logaddexp2_mps_kernel(TensorIteratorBase& iter) {
+  lib.exec_binary_kernel(iter, "logaddexp2");
+}
+
 static void xlog1py_mps_kernel(TensorIteratorBase& iter) {
  TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "xlog1py_mps not implemented for non-floating types");
  lib.exec_binary_kernel(iter, "xlog1py");
@ -211,6 +219,8 @@ REGISTER_DISPATCH(fmin_stub, &fmin_mps_kernel)
 REGISTER_DISPATCH(copysign_stub, &copysign_mps_kernel)
 REGISTER_DISPATCH(nextafter_stub, &nextafter_mps_kernel)
 REGISTER_DISPATCH(zeta_stub, &zeta_mps_kernel)
+REGISTER_DISPATCH(logaddexp_stub, &logaddexp_mps_kernel);
+REGISTER_DISPATCH(logaddexp2_stub, &logaddexp2_mps_kernel);
 REGISTER_DISPATCH(xlog1py_stub, &xlog1py_mps_kernel)
 REGISTER_DISPATCH(chebyshev_polynomial_t_stub, &chebyshev_polynomial_t_mps_kernel)
 REGISTER_DISPATCH(chebyshev_polynomial_u_stub, &chebyshev_polynomial_u_mps_kernel)
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@ -17,8 +17,6 @@
 #include <ATen/ops/ge_native.h>
 #include <ATen/ops/gt_native.h>
 #include <ATen/ops/le_native.h>
-#include <ATen/ops/logaddexp2_native.h>
-#include <ATen/ops/logaddexp_native.h>
 #include <ATen/ops/logical_and_native.h>
 #include <ATen/ops/logical_or_native.h>
 #include <ATen/ops/logical_xor_native.h>
@ -277,30 +275,6 @@ TORCH_IMPL_FUNC(pow_Scalar_out_mps)(const Scalar& base, const Tensor& exp, const
  }
 }

-TORCH_IMPL_FUNC(logaddexp_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
-  mps::BinaryOpBlock logaddexp_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
-    MPSGraph* mpsGraph = cachedGraph->graph();
-    MPSGraphTensor* sumTensor =
-        [mpsGraph additionWithPrimaryTensor:[mpsGraph exponentWithTensor:primaryCastTensor name:nil]
-                            secondaryTensor:[mpsGraph exponentWithTensor:secondaryCastTensor name:nil]
-                                       name:nil];
-    return [mpsGraph logarithmWithTensor:sumTensor name:nil];
-  };
-  mps::binaryOpTensor(self, other, output, "logaddexp_out_mps", logaddexp_op_block);
-}
-
-TORCH_IMPL_FUNC(logaddexp2_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
-  mps::BinaryOpBlock logaddexp2_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
-    MPSGraph* mpsGraph = cachedGraph->graph();
-    MPSGraphTensor* sumTensor =
-        [mpsGraph additionWithPrimaryTensor:[mpsGraph exponentBase2WithTensor:primaryCastTensor name:nil]
-                            secondaryTensor:[mpsGraph exponentBase2WithTensor:secondaryCastTensor name:nil]
-                                       name:nil];
-    return [mpsGraph logarithmBase2WithTensor:sumTensor name:nil];
-  };
-  mps::binaryOpTensor(self, other, output, "logaddexp2_out_mps", logaddexp2_op_block);
-}
-
 TORCH_IMPL_FUNC(xlogy_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
  mps::BinaryOpBlock xlogy_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
    MPSGraph* mpsGraph = cachedGraph->graph();
--- a/aten/src/ATen/native/mps/operations/LossOps.mm
+++ b/aten/src/ATen/native/mps/operations/LossOps.mm
@ -370,7 +370,7 @@ static void nllnd_loss_backward_impl(Tensor& grad_input_arg,
                                                     onValue:-1.0f
                                                    offValue:0.0f
                                                        name:nil];
-      oneHotTensor = castMPSTensor(mpsGraph, oneHotTensor, inputTensor.dataType);
+      oneHotTensor = castMPSTensor(mpsGraph, oneHotTensor, [inputTensor dataType]);
      if (isWeightsArrayValid) {
        oneHotTensor = [mpsGraph multiplicationWithPrimaryTensor:oneHotTensor
                                                 secondaryTensor:weightTensor
@ -705,6 +705,7 @@ static void smooth_l1_loss_template(const Tensor& input,
  TORCH_CHECK(beta >= 0, "smooth_l1_loss does not support negative values for beta.");
  TORCH_CHECK(input.is_mps());
  TORCH_CHECK(target.is_mps());
+  TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "MPS doesn't know how to do square_i64");
  if ((input.numel() == 0) || (target.numel() == 0)) {
    reduction == Reduction::Mean ? output.fill_(std::numeric_limits<float>::quiet_NaN()) : output.zero_();
    return;
@ -771,7 +772,7 @@ static void smooth_l1_loss_backward_impl(const Tensor& grad_output,
      MPSGraphTensor* targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target);
      MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);

-      MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar:beta dataType:MPSDataTypeFloat32];
+      MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar:beta dataType:[inputTensor dataType]];
      // xn - yn
      MPSGraphTensor* diffTensor = [mpsGraph subtractionWithPrimaryTensor:inputTensor
                                                          secondaryTensor:targetTensor
@ -797,7 +798,8 @@ static void smooth_l1_loss_backward_impl(const Tensor& grad_output,
                                                                  name:@"lossTensor"];
      MPSGraphTensor* outputTensor = lossTensor;
      if (reduction == Reduction::Mean) {
-        MPSGraphTensor* numelTensor = [mpsGraph constantWithScalar:(double)input.numel() dataType:MPSDataTypeFloat32];
+        MPSGraphTensor* numelTensor = [mpsGraph constantWithScalar:(double)input.numel()
+                                                          dataType:[lossTensor dataType]];
        outputTensor = [mpsGraph divisionWithPrimaryTensor:lossTensor secondaryTensor:numelTensor name:nil];
      }
      MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:outputTensor
--- a/aten/src/ATen/native/mps/operations/Normalization.mm
+++ b/aten/src/ATen/native/mps/operations/Normalization.mm
@ -84,6 +84,9 @@ std::tuple<Tensor&, Tensor&, Tensor&> batch_norm_mps_out(const Tensor& self,
                                                         Tensor& output,
                                                         Tensor& save_mean,
                                                         Tensor& save_var) {
+  TORCH_CHECK_NOT_IMPLEMENTED(self.scalar_type() != kLong, "Long batch norm is not supported with MPS");
+  TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(self.scalar_type()),
+                              "Batch norm for complex is not supported for MPS");
  using namespace at::native::mps;
  struct CachedGraph : public MPSCachedGraph {
    CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
@ -918,6 +921,7 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_mps(const Tensor& input,
  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
  const int axis = input_ndim - normalized_ndim;
  MPSStream* stream = getCurrentMPSStream();
+  TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "Not implemented for long on MPS");
  @autoreleasepool {
    mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
      // which kernel variant to use based on the normalized axis N size
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@ -1028,15 +1028,18 @@ TORCH_IMPL_FUNC(prod_out_mps)
 }

 TORCH_IMPL_FUNC(amax_out_mps)(const Tensor& input_t, IntArrayRef dim, bool keepdim, const Tensor& output_t) {
+  TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "amax is not defined for complex types");
  reduction_out_mps(input_t, dim, keepdim, std::nullopt, output_t, MPSReductionType::AMAX, "amax_out_mps");
 }

 TORCH_IMPL_FUNC(amin_out_mps)(const Tensor& input_t, IntArrayRef dim, bool keepdim, const Tensor& output_t) {
+  TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "amin is not defined for complex types");
  reduction_out_mps(input_t, dim, keepdim, std::nullopt, output_t, MPSReductionType::AMIN, "amin_out_mps");
 }

 TORCH_IMPL_FUNC(aminmax_out_mps)
 (const Tensor& input_t, std::optional<int64_t> dim_opt, bool keepdim, const Tensor& min_t, const Tensor& max_t) {
+  TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "aminmax is not defined for complex types");
  reduction_out_mps(input_t,
                    dim_opt.has_value() ? OptionalIntArrayRef({*dim_opt}) : std::nullopt,
                    keepdim,
--- a/aten/src/ATen/native/mps/operations/Sort.mm
+++ b/aten/src/ATen/native/mps/operations/Sort.mm
@ -31,6 +31,7 @@ void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& v
    indices.copy_(values.toType(at::ScalarType::Long));
    return;
  }
+  TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(self.scalar_type()), "kthvalue is not implemented for complex types");
  // issue #154890, raising error to prevent crash within MPSGraph until
  // workaround is implemented.
  TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890");
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -3622,8 +3622,7 @@
  structured: True
  structured_inherits: TensorIteratorBase
  dispatch:
-    CPU, CUDA: logaddexp_out
-    MPS: logaddexp_out_mps
+    CPU, CUDA, MPS: logaddexp_out
  tags: pointwise

 - func: logaddexp(Tensor self, Tensor other) -> Tensor
@ -3635,8 +3634,7 @@
  structured: True
  structured_inherits: TensorIteratorBase
  dispatch:
-    CPU, CUDA: logaddexp2_out
-    MPS: logaddexp2_out_mps
+    CPU, CUDA, MPS: logaddexp2_out
  tags: pointwise

 - func: logaddexp2(Tensor self, Tensor other) -> Tensor
@ -8867,11 +8865,11 @@
  autogen: bitwise_right_shift.Scalar_Tensor_out
  tags: pointwise

- func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
+- func: tril_(Tensor(a!) self, SymInt diagonal=0) -> Tensor(a!)
  structured_delegate: tril.out
  variants: method

- func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
+- func: triu_(Tensor(a!) self, SymInt diagonal=0) -> Tensor(a!)
  structured_delegate: triu.out
  variants: method

@ -8995,25 +8993,25 @@
 - func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor
  variants: method, function

- func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+- func: triu.out(Tensor self, SymInt diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
  structured: True
  dispatch:
    CPU: triu_cpu
    CUDA: triu_cuda
    MPS: triu_mps_out

- func: triu(Tensor self, int diagonal=0) -> Tensor
+- func: triu(Tensor self, SymInt diagonal=0) -> Tensor
  structured_delegate: triu.out
  variants: method, function

- func: tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+- func: tril.out(Tensor self, SymInt diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
  structured: True
  dispatch:
    CPU: tril_cpu
    CUDA: tril_cuda
    MPS: tril_mps_out

- func: tril(Tensor self, int diagonal=0) -> Tensor
+- func: tril(Tensor self, SymInt diagonal=0) -> Tensor
  structured_delegate: tril.out
  variants: method, function

--- a/aten/src/ATen/native/sparse/SparseTensor.cpp
+++ b/aten/src/ATen/native/sparse/SparseTensor.cpp
@ -467,6 +467,28 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, IntArrayRe
      !options.has_layout() || options.layout() == kSparse,
      "expected sparse layout, but got layout ",
      options.layout());
+
+  if (indices.numel() > 0) {
+    Tensor min_indices =
+        std::get</* values */ 0>(indices.min(/* dim */ 1, /* keepdim */ false));
+    Tensor cpu_min_indices;
+    if (!indices.is_cpu()) {
+      cpu_min_indices = min_indices.to(at::DeviceType::CPU);
+    } else {
+      cpu_min_indices = min_indices;
+    }
+    auto cpu_min_indices_accessor = cpu_min_indices.accessor<int64_t, 1>();
+    for (const auto d : c10::irange(indices.size(0))) {
+      int64_t min_index_in_dim = cpu_min_indices_accessor[d];
+      TORCH_CHECK(
+          min_index_in_dim >= 0,
+          "found negative index ",
+          min_index_in_dim,
+          " for dim ",
+          d);
+    }
+  }
+
  return at::native::_sparse_coo_tensor_unsafe(
      indices,
      values,
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@ -1837,6 +1837,10 @@ class BenchmarkRunner:
    def skip_models_for_cuda(self):
        return set()

+    @property
+    def skip_models_for_xpu(self):
+        """Models to skip when the only requested device is "xpu"; none here."""
+        return set()
+
    @property
    def skip_models_for_cpu(self):
        return set()
@ -3927,6 +3931,8 @@ def run(runner, args, original_dir=None):
            runner.skip_models.update(runner.skip_models_for_cpu_aarch64)
    elif args.devices == ["cuda"]:
        runner.skip_models.update(runner.skip_models_for_cuda)
+    elif args.devices == ["xpu"]:
+        runner.skip_models.update(runner.skip_models_for_xpu)

    if not args.multiprocess:
        runner.skip_models.update(runner.skip_multiprocess_models)
--- a/benchmarks/dynamo/genai_layers/benchmark.py
+++ b/benchmarks/dynamo/genai_layers/benchmark.py
@ -56,6 +56,20 @@ def list_benchmarks():
    print(f"Available benchmarks: {list(BENCHMARK_REGISTRY.keys())}")


+def _run_benchmark(
+    benchmark_cls,
+    script_args,
+):
+    benchmark = benchmark_cls(script_args)
+    benchmark.benchmark()
+    benchmark.report_geomean_speedup()
+    if script_args.print_benchmark_result:
+        print(f"Benchmarking results {benchmark.name}:")
+        print(benchmark.profiling_results)
+    if script_args.visualize:
+        benchmark.visualize()
+
+
 def run_benchmark(
    benchmark_name: str,
    script_args,
@ -71,10 +85,7 @@ def run_benchmark(
    print("=" * 60)

    benchmark_class = BENCHMARK_REGISTRY[benchmark_name]
-    benchmark = benchmark_class(script_args)
-    benchmark.benchmark()
-    if script_args.visualize:
-        benchmark.visualize()
+    _run_benchmark(benchmark_class, script_args)

    return True

@ -87,10 +98,7 @@ def run_all_benchmarks(script_args):

    for name, cls in BENCHMARK_REGISTRY.items():
        print(f"\n{'=' * 20} {name.upper()} {'=' * 20}")
-        benchmark = cls(script_args)
-        benchmark.benchmark()
-        if script_args.visualize:
-            benchmark.visualize()
+        _run_benchmark(cls, script_args)
        print()


@ -149,8 +157,43 @@ Examples:
        help="Whether to exit with an error message for accuracy failure",
    )

+    parser.add_argument(
+        "--print-benchmark-result",
+        action="store_true",
+        help="Whether to print the raw benchmarking result. Easier to quickly check the benchmark results on a server without GUI",
+    )
+
+    parser.add_argument(
+        "--custom-compile-name",
+        type=str,
+        default=None,
+        help="Name for the curve with customized compilation options",
+    )
+
+    parser.add_argument(
+        "--custom-compile-options",
+        type=str,
+        default=None,
+        help="Json string for the custom compile options.",
+    )
+
    args = parser.parse_args()

+    if args.custom_compile_options:
+        import json
+
+        try:
+            args.custom_compile_options = json.loads(args.custom_compile_options)
+        except json.decoder.JSONDecodeError as e:
+            raise RuntimeError(
+                f"Invalid json string for --custom-compile-options: {args.custom_compile_options}"
+            ) from e
+
+        if not args.custom_compile_options:
+            raise RuntimeError("Found no options for --custom-compile-options")
+        if not args.custom_compile_name:
+            raise RuntimeError("Missing label name for the custom compilation")
+
    # Handle list option
    if args.list:
        list_benchmarks()
--- a/benchmarks/dynamo/genai_layers/kernels.py
+++ b/benchmarks/dynamo/genai_layers/kernels.py
@ -8,6 +8,15 @@ import torch
 import torch.nn.functional as F


+# more important shapes used by internal models
+extra_shapes_for_norm = (
+    (1152 * 500, 384),
+    (1152 * 500, 512),
+    (1152 * 1000, 384),
+    (1152 * 1000, 512),
+)
+
+
 class CrossEntropyForward(BenchmarkKernel):
    def __init__(self, script_args):
        super().__init__(script_args)
@ -346,7 +355,7 @@ class RMSNormForward(BenchmarkKernel):
            (32768, 65536),
            (16384, 131072),
            (8192, 262144),
-        )
+        ) + extra_shapes_for_norm

    def get_memory_bytes(self, args, kwargs) -> int:
        x, w = args
@ -438,8 +447,7 @@ class RMSNormBackward(BenchmarkKernel):
            (32768, 4096),
            (32768, 8192),
            (32768, 16384),
-            (32768, 32768),
-        )
+        ) + extra_shapes_for_norm

    def get_memory_bytes(self, args, kwargs) -> int:
        x, w, dy = args
@ -553,7 +561,7 @@ class LayerNormForward(BenchmarkKernel):
            (32768, 16384),
            (32768, 32768),
            (32768, 65536),
-        )
+        ) + extra_shapes_for_norm

    def get_memory_bytes(self, args, kwargs) -> int:
        x, w = args
@ -627,7 +635,7 @@ class LayerNormBackward(BenchmarkKernel):
            (32768, 16384),
            (32768, 32768),
            (32768, 65536),
-        )
+        ) + extra_shapes_for_norm

    def get_memory_bytes(self, args, kwargs) -> int:
        x, w, dy = args
--- a/benchmarks/dynamo/genai_layers/utils.py
+++ b/benchmarks/dynamo/genai_layers/utils.py
@ -6,6 +6,7 @@ from dataclasses import dataclass
 from typing import Any, Optional

 import matplotlib.pyplot as plt
+from scipy.stats import gmean

 import torch
 from torch._inductor.runtime.benchmarking import benchmarker
@ -107,6 +108,18 @@ class BenchmarkKernel:
        for backend in self.available_backends:
            args_ref, kwargs_ref = self.clone_inputs(args, kwargs)
            res[backend] = getattr(self, backend)(args_ref, kwargs_ref)()
+
+        if (
+            "compiled" in self.available_backends
+            and self.script_args.custom_compile_options
+        ):
+            torch._dynamo.reset()  # cause recompile
+            with torch._inductor.config.patch(self.script_args.custom_compile_options):
+                args_ref, kwargs_ref = self.clone_inputs(args, kwargs)
+                res[self.script_args.custom_compile_name] = self.compiled(
+                    args_ref, kwargs_ref
+                )()
+
        gold = res["eager"]

        tol = {}
@ -115,7 +128,7 @@ class BenchmarkKernel:
                "atol": self.script_args.tolerance,
                "rtol": self.script_args.tolerance,
            }
-        for backend in self.available_backends:
+        for backend in res:
            if backend == "eager":
                continue
            try:
@ -134,37 +147,83 @@ class BenchmarkKernel:
                    print("Exit right away since --exit-on-accuracy-failure is set")
                    sys.exit(1)

+    def benchmark_single_shape_for_backend(
+        self, backend, args, kwargs, setting, fn=None
+    ) -> bool:
+        if fn is None:
+            fn = getattr(self, backend)
+        args_ref, kwargs_ref = self.clone_inputs(args, kwargs)
+        try:
+            avg_time = benchmark_kernel_in_milliseconds(fn(args_ref, kwargs_ref))
+        except Exception as e:
+            print(
+                f"Failed to run {backend} backend on {self.name} kernel for {setting} due to {e}"
+            )
+            self.available_backends.remove(backend)  # noqa: B909
+            return False
+        mem_bytes = self.get_memory_bytes(args_ref, kwargs_ref)
+        perf = Performance(setting, avg_time, mem_bytes)
+        print(f"{self.name} kernel on {backend} backend. {perf}")
+        self.profiling_results[backend].append(perf)
+        return True
+
    def benchmark_single_shape(
        self, args, kwargs=None, should_check_accuracy=True, setting: str = ""
    ):
        for backend in self.available_backends:
-            args_ref, kwargs_ref = self.clone_inputs(args, kwargs)
-            try:
-                avg_time = benchmark_kernel_in_milliseconds(
-                    getattr(self, backend)(args_ref, kwargs_ref)
+            self.benchmark_single_shape_for_backend(backend, args, kwargs, setting)
+        if (
+            "compiled" in self.available_backends
+            and self.script_args.custom_compile_options
+        ):
+            torch._dynamo.reset()  # cause recompile
+            with torch._inductor.config.patch(self.script_args.custom_compile_options):
+                status = self.benchmark_single_shape_for_backend(
+                    self.script_args.custom_compile_name,
+                    args,
+                    kwargs,
+                    setting,
+                    fn=self.compiled,
                )
-            except Exception as e:
-                print(
-                    f"Failed to run {backend} backend on {self.name} kernel for {setting} due to {e}"
+            if not status:
+                self.script_args.custom_compile_options = (
+                    None  # once fail, don't run again
                )
-                self.available_backends.remove(backend)  # noqa: B909
-                continue
-            mem_bytes = self.get_memory_bytes(args_ref, kwargs_ref)
-            perf = Performance(setting, avg_time, mem_bytes)
-            print(f"{self.name} kernel on {backend} backend. {perf}")
-            self.profiling_results[backend].append(perf)

        if should_check_accuracy:
            self.check_accuracy(args, kwargs)

    def visualize(self) -> None:
+        device_name = torch.cuda.get_device_name(0)
        visualize_comparison(
            self.profiling_results,
-            title=f"{self.name}",
+            title=f"{self.name} ({device_name})",
            output_path=f"{self.name}_bench",
        )
        return

+    def report_geomean_speedup(self) -> None:
+        print(f"Geomean speedup for benchmark {self.name}")
+        eager_result = {
+            result.setting: result for result in self.profiling_results["eager"]
+        }
+        print(f"  eager {len(eager_result)} data points")
+        for backend, backend_result in self.profiling_results.items():
+            if backend == "eager":
+                continue
+            speeduplist = []
+            for result in backend_result:
+                eager_latency = eager_result[result.setting].latency
+                backend_latency = result.latency
+                speeduplist.append(
+                    eager_latency / backend_latency if backend_latency != 0 else 0.0
+                )
+
+            if len(speeduplist) > 0:
+                print(
+                    f"  {backend} {len(speeduplist)} data points, {gmean(speeduplist):.2f}x speedup"
+                )
+

 def get_backend_colors() -> dict[str, str]:
    """Get consistent color scheme for different backends."""
@ -252,5 +311,6 @@ def visualize_comparison(
        os.makedirs("pics", exist_ok=True)
        full_path = os.path.join("pics", output_path + ".png")
        plt.savefig(full_path, dpi=300, bbox_inches="tight", facecolor="white")
+        print(f"Chart saved to {full_path}")

    plt.close()
--- a/benchmarks/dynamo/timm_models.py
+++ b/benchmarks/dynamo/timm_models.py
@ -74,7 +74,8 @@ REQUIRE_HIGHER_TOLERANCE = {
 REQUIRE_HIGHER_TOLERANCE_AMP = {}

 REQUIRE_EVEN_HIGHER_TOLERANCE = {
-    "beit_base_patch16_224",
+    "deit_base_distilled_patch16_224",
+    "vit_base_patch16_siglip_256",
 }

 # These models need higher tolerance in MaxAutotune mode
@ -354,7 +355,9 @@ class TimmRunner(BenchmarkRunner):
        if is_training:
            from torch._inductor import config as inductor_config

-            if name in REQUIRE_EVEN_HIGHER_TOLERANCE or (
+            if name == "beit_base_patch16_224":
+                tolerance = 16 * 1e-2
+            elif name in REQUIRE_EVEN_HIGHER_TOLERANCE or (
                inductor_config.max_autotune
                and name in REQUIRE_EVEN_HIGHER_TOLERANCE_MAX_AUTOTUNE
            ):
--- a/benchmarks/dynamo/torchbench.py
+++ b/benchmarks/dynamo/torchbench.py
@ -124,6 +124,10 @@ class TorchBenchmarkRunner(BenchmarkRunner):
    def skip_models_for_cuda(self):
        return self._skip["device"]["cuda"]

+    @property
+    def skip_models_for_xpu(self):
+        """Return the ``device`` -> ``xpu`` entry of the loaded skip config."""
+        return self._skip["device"]["xpu"]
+
    @property
    def skip_models_for_freezing_cuda(self):
        return self._skip["freezing"]["cuda"]
--- a/benchmarks/dynamo/torchbench.yaml
+++ b/benchmarks/dynamo/torchbench.yaml
@ -217,6 +217,9 @@ skip:

    cuda: []

+    xpu:
+      - *DETECTRON2_MODELS
+
  test:
    training:
      - *DETECTRON2_MODELS
--- a/c10/cuda/CUDAStream.cpp
+++ b/c10/cuda/CUDAStream.cpp
@ -15,7 +15,6 @@ namespace c10::cuda {
 namespace {

 // Global stream state and constants
-c10::once_flag init_flag;
 DeviceIndex num_gpus = -1;
 constexpr int kStreamsPerPoolBits = 5;
 constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;
@ -226,7 +225,10 @@ void initDeviceStreamState(DeviceIndex device_index) {
 // Init front-end to ensure initialization only occurs once
 void initCUDAStreamsOnce() {
  // Inits default streams (once, globally)
-  c10::call_once(init_flag, initGlobalStreamState);
+  auto static init_flag [[maybe_unused]] = [] {
+    initGlobalStreamState();
+    return true;
+  }();

  if (current_streams) {
    return;
--- a/c10/metal/special_math.h
+++ b/c10/metal/special_math.h
@ -1,4 +1,4 @@
-// Implementation of specal math functions for Metal
+// Implementation of special math functions for Metal
 #pragma once
 #include <c10/metal/expm1f.h>
 #include <c10/metal/igamma.h>
@ -624,6 +624,64 @@ inline T spherical_bessel_j0(T x) {
  return static_cast<T>(::metal::sin(x) / x);
 }

+// Computes log(exp(a) + exp(b)) for scalar floating-point inputs. The
+// arithmetic is carried out in float and the result is cast back to T.
+template <typename T>
+inline ::metal::enable_if_t<is_scalar_floating_point_v<T>, T> logaddexp(
+    T a,
+    T b) {
+  float a0 = static_cast<float>(a);
+  float b0 = static_cast<float>(b);
+  // Equal infinities: (a0 - b0) below would be inf - inf = NaN, so return the
+  // shared value directly.
+  if (::metal::isinf(a0) && a0 == b0) {
+    return static_cast<T>(a0);
+  } else {
+    // log(exp(a) + exp(b)) = max(a, b) + log1p(exp(-|a - b|))
+    float m0 = ::metal::max(a0, b0);
+    return static_cast<T>(
+        m0 + ::c10::metal::log1p(::metal::exp(-::metal::abs(a0 - b0))));
+  }
+}
+
+// Complex log(exp(a) + exp(b)); `.x` is used as the real part and `.y` as the
+// imaginary part. The function is ported from mlx.
+template <typename T>
+inline ::metal::enable_if_t<is_complex_v<T>, T> logaddexp(T a, T b) {
+  // Any NaN component makes the whole result NaN.
+  if (::metal::isnan(a.x) || ::metal::isnan(a.y) || ::metal::isnan(b.x) ||
+      ::metal::isnan(b.y)) {
+    return T(NAN, NAN);
+  }
+
+  // Order the operands by real part. Both selections share one predicate so
+  // that a tie (a.x == b.x) still yields one of each operand; selecting with
+  // independent `>` and `<` tests would pick `b` twice and drop `a`.
+  const bool a_is_max = a.x > b.x;
+  T maxval = a_is_max ? a : b;
+  T minval = a_is_max ? b : a;
+  constexpr auto inf = ::metal::numeric_limits<T>::infinity().x;
+
+  // exp(-inf) contributes nothing and exp(+inf) dominates: the larger
+  // operand is the result.
+  if (minval.x == -inf || maxval.x == inf) {
+    return maxval;
+  }
+
+  // log(exp(max) + exp(min)) = max + log1p(exp(min - max)), with
+  // exp(min - max) expanded into its real and imaginary components.
+  float2 maxval_ = static_cast<float2>(maxval);
+  float2 minval_ = static_cast<float2>(minval);
+  float m = ::metal::exp(minval_.x - maxval_.x);
+  float2 dexp{
+      m * ::metal::cos(minval_.y - maxval_.y),
+      m * ::metal::sin(minval_.y - maxval_.y),
+  };
+  return static_cast<T>(maxval_ + ::c10::metal::log1p(dexp));
+}
+
+// Computes log2(2^a + 2^b). The arithmetic is carried out in float and the
+// result is cast back to T.
+template <typename T>
+inline T logaddexp2(T a, T b) {
+  // ln(2) and its reciprocal, used to convert a natural log into log2.
+  constexpr auto log_2 = float(0.693147180559945309417232121458176);
+  constexpr auto inv_log_2 = float(1) / log_2;
+  float a0 = static_cast<float>(a);
+  float b0 = static_cast<float>(b);
+  // Equal infinities: (a0 - b0) below would be inf - inf = NaN, so return the
+  // shared value directly.
+  if (::metal::isinf(a0) && a0 == b0) {
+    return static_cast<T>(a0);
+  } else {
+    // log2(2^a + 2^b) = max(a, b) + log1p(2^(-|a - b|)) / ln(2)
+    float m0 = ::metal::max(a0, b0);
+    return static_cast<T>(
+        m0 +
+        ::c10::metal::log1p(::metal::pow(float(2), -::metal::abs(a0 - b0))) *
+            inv_log_2);
+  }
+}
+
 template <typename T>
 inline float xlog1py(T x, T y) {
  if (::metal::isnan(y)) {
--- a/c10/metal/utils.h
+++ b/c10/metal/utils.h
@ -322,6 +322,24 @@ inline float log1p(float x) {
  return rc;
 }

+// Complex log1p: returns log(1 + z) for z = in.x + i * in.y as
+// {real part, imaginary part}. The function is ported from mlx.
+inline float2 log1p(float2 in) {
+  float x = in.x;
+  float y = in.y;
+  float zabs = ::metal::precise::sqrt(x * x + y * y);
+  // Imaginary part of the result: the argument of (1 + z).
+  float theta = ::metal::atan2(y, x + 1);
+  if (zabs < 0.5f) {
+    // r = |1 + z|^2 - 1 = x * (2 + x) + y^2, so the real part
+    // log|1 + z| equals 0.5 * log1p(r).
+    float r = x * (2 + x) + y * y;
+    if (r == 0) { // handle underflow
+      return {x, theta};
+    }
+    return {0.5f * log1p(r), theta};
+  } else {
+    // For larger |z|, take log|1 + z| directly.
+    auto z0 = ::metal::sqrt((x + 1) * (x + 1) + y * y);
+    return {::metal::log(z0), theta};
+  }
+}
+
 template <typename T1, typename T2 = T1>
 struct pair {
  T1 first;
--- a/c10/mobile/CPUProfilingAllocator.cpp
+++ b/c10/mobile/CPUProfilingAllocator.cpp
@ -34,7 +34,7 @@ struct MemEvent {
 bool overlaps(const MemBlock& a, const MemBlock& b) {
  // two blocks dont overlap if
  // |---a--------|--------------b--------|
-  // strat_a     end_a <= start_b       end_b
+  // start_a     end_a <= start_b       end_b
  return !(
      (a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset));
 }
--- a/c10/test/util/TypeList_test.cpp
+++ b/c10/test/util/TypeList_test.cpp
@ -239,7 +239,7 @@ struct Class2 {

 struct mapper_call_func {
  template <class T>
-  decltype(auto) operator()(T) {
+  auto operator()(T) {
    return T::type::func();
  }
 };
@ -254,7 +254,7 @@ TEST(TypeListTest, MapTypesToValues_members) {

 struct mapper_call_nonexistent_function {
  template <class T>
-  decltype(auto) operator()(T) {
+  auto operator()(T) {
    return T::type::this_doesnt_exist();
  }
 };
--- a/c10/util/Bitset.h
+++ b/c10/util/Bitset.h
@ -33,7 +33,7 @@ struct bitset final {
  constexpr bitset() noexcept = default;
  constexpr bitset(const bitset&) noexcept = default;
  constexpr bitset(bitset&&) noexcept = default;
-  // there is an issure for gcc 5.3.0 when define default function as constexpr
+  // there is an issue for gcc 5.3.0 when define default function as constexpr
  // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68754.
  bitset& operator=(const bitset&) noexcept = default;
  bitset& operator=(bitset&&) noexcept = default;
--- a/c10/util/C++17.h
+++ b/c10/util/C++17.h
@ -53,7 +53,7 @@ namespace guts {
 // member functions.
 namespace detail {
 template <class F, class Tuple, std::size_t... INDEX>
-C10_HOST_DEVICE constexpr decltype(auto) apply_impl(
+C10_HOST_DEVICE constexpr auto apply_impl(
    F&& f,
    Tuple&& t,
    std::index_sequence<INDEX...>) {
@ -62,7 +62,7 @@ C10_HOST_DEVICE constexpr decltype(auto) apply_impl(
 } // namespace detail

 template <class F, class Tuple>
-C10_HOST_DEVICE constexpr decltype(auto) apply(F&& f, Tuple&& t) {
+C10_HOST_DEVICE constexpr auto apply(F&& f, Tuple&& t) {
  return detail::apply_impl(
      std::forward<F>(f),
      std::forward<Tuple>(t),
--- a/c10/util/Exception.h
+++ b/c10/util/Exception.h
@ -469,7 +469,7 @@ C10_API std::string GetExceptionString(const std::exception& e);

 namespace c10::detail {
 template <typename... Args>
-decltype(auto) torchCheckMsgImpl(const char* /*msg*/, const Args&... args) {
+auto torchCheckMsgImpl(const char* /*msg*/, const Args&... args) {
  return ::c10::str(args...);
 }
 inline C10_API const char* torchCheckMsgImpl(const char* msg) {
--- a/c10/util/StringUtil.h
+++ b/c10/util/StringUtil.h
@ -135,7 +135,7 @@ struct _str_wrapper<> final {

 // Convert a list of string-like arguments into a single string.
 template <typename... Args>
-inline decltype(auto) str(const Args&... args) {
+inline auto str(const Args&... args) {
  return detail::_str_wrapper<
      typename detail::CanonicalizeStrTypes<Args>::type...>::call(args...);
 }
--- a/c10/util/TypeList.h
+++ b/c10/util/TypeList.h
@ -507,7 +507,7 @@ struct map_types_to_values<typelist<Types...>> final {
 } // namespace detail

 template <class TypeList, class Func>
-decltype(auto) map_types_to_values(Func&& func) {
+auto map_types_to_values(Func&& func) {
  return detail::map_types_to_values<TypeList>::call(std::forward<Func>(func));
 }

--- a/c10/xpu/XPUCachingAllocator.cpp
+++ b/c10/xpu/XPUCachingAllocator.cpp
@ -554,6 +554,17 @@ class DeviceCachingAllocator {
    }
  }

+  // Returns allowed_memory_maximum as a fraction of the device's global
+  // memory size, or 1.0 when set_fraction is false.
+  double getMemoryFraction() {
+    if (!set_fraction) {
+      return 1.0;
+    }
+
+    c10::xpu::DeviceProp device_prop;
+    c10::xpu::get_device_properties(&device_prop, device_index);
+    return static_cast<double>(allowed_memory_maximum) /
+        static_cast<double>(device_prop.global_mem_size);
+  }
+
  void setMemoryFraction(double fraction) {
    c10::xpu::DeviceProp device_prop;
    c10::xpu::get_device_properties(&device_prop, device_index);
@ -724,6 +735,11 @@ class XPUAllocator : public DeviceAllocator {
    device_allocators[device]->resetAccumulatedStats();
  }

+  // Validates `device`, then forwards to that device's allocator.
+  double getMemoryFraction(DeviceIndex device) {
+    assertValidDevice(device);
+    return device_allocators[device]->getMemoryFraction();
+  }
+
  void setMemoryFraction(double fraction, DeviceIndex device) {
    assertValidDevice(device);
    TORCH_CHECK_VALUE(
@ -777,6 +793,10 @@ void recordStream(const DataPtr& dataPtr, XPUStream stream) {
  return allocator.recordStream(dataPtr, stream);
 }

+// Namespace-level entry point; forwards to `allocator.getMemoryFraction`.
+double getMemoryFraction(DeviceIndex device) {
+  return allocator.getMemoryFraction(device);
+}
+
 void setMemoryFraction(double fraction, DeviceIndex device) {
  return allocator.setMemoryFraction(fraction, device);
 }
--- a/c10/xpu/XPUCachingAllocator.h
+++ b/c10/xpu/XPUCachingAllocator.h
@ -25,6 +25,8 @@ C10_XPU_API void raw_delete(void* ptr);

 C10_XPU_API void recordStream(const DataPtr& dataPtr, XPUStream stream);

+C10_XPU_API double getMemoryFraction(DeviceIndex device);
+
 C10_XPU_API void setMemoryFraction(double fraction, DeviceIndex device);

 } // namespace c10::xpu::XPUCachingAllocator
--- a/c10/xpu/XPUFunctions.cpp
+++ b/c10/xpu/XPUFunctions.cpp
@ -1,4 +1,3 @@
-#include <c10/util/CallOnce.h>
 #include <c10/util/Exception.h>
 #include <c10/xpu/XPUFunctions.h>

@ -33,7 +32,6 @@ namespace {
 *    one iGPU and enumerate all iGPUs on that platform.
 * 3. If neither dGPUs nor iGPUs are found, conclude that no GPUs are available.
 */
-c10::once_flag init_flag;
 thread_local DeviceIndex curDeviceIndex = 0;

 struct DevicePool {
@ -149,7 +147,10 @@ inline void initGlobalDevicePoolState() {
 }

 inline void initDevicePoolCallOnce() {
-  c10::call_once(init_flag, initGlobalDevicePoolState);
+  auto static init_flag [[maybe_unused]] = [] {
+    initGlobalDevicePoolState();
+    return true;
+  }();
 }

 void initDeviceProperties(DeviceProp* device_prop, DeviceIndex device) {
--- a/c10/xpu/XPUStream.cpp
+++ b/c10/xpu/XPUStream.cpp
@ -12,7 +12,6 @@ namespace c10::xpu {
 namespace {

 // Global stream state and constants
-c10::once_flag init_flag;
 DeviceIndex num_gpus = -1;
 constexpr int kStreamsPerPoolBits = 5;
 constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;
@ -163,7 +162,10 @@ void initDeviceStreamState(DeviceIndex device) {
 }

 void initXPUStreamsOnce() {
-  c10::call_once(init_flag, initGlobalStreamState);
+  auto static init_flag [[maybe_unused]] = [] {
+    initGlobalStreamState();
+    return true;
+  }();

  if (current_streams) {
    return;
--- a/caffe2/serialize/crc_alt.h
+++ b/caffe2/serialize/crc_alt.h
@ -38,7 +38,7 @@ uint32_t crc32_combine (uint32_t crcA, uint32_t crcB, size_t lengthB);

 /// compute CRC32 (bitwise algorithm)
 uint32_t crc32_bitwise (const void* data, size_t length, uint32_t previousCrc32 = 0);
-/// compute CRC32 (half-byte algoritm)
+/// compute CRC32 (half-byte algorithm)
 uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32 = 0);

 #ifdef CRC32_USE_LOOKUP_TABLE_BYTE
@ -96,7 +96,7 @@ uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previo
  #define __BIG_ENDIAN    4321
 #endif

-// define endianess and some integer data types
+// define endianness and some integer data types
 #if defined(_MSC_VER) || defined(__MINGW32__)
  // Windows always little endian
  #define __BYTE_ORDER __LITTLE_ENDIAN
@ -168,7 +168,7 @@ namespace
  /// zlib's CRC32 polynomial
  const uint32_t Polynomial = 0xEDB88320;

-  /// swap endianess
+  /// swap endianness
  static inline uint32_t swap(uint32_t x)
  {
  #if defined(__GNUC__) || defined(__clang__)
@ -229,7 +229,7 @@ uint32_t crc32_bitwise(const void* data, size_t length, uint32_t previousCrc32)
 }


-/// compute CRC32 (half-byte algoritm)
+/// compute CRC32 (half-byte algorithm)
 uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32)
 {
  uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF
@ -662,7 +662,7 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB)
  // - if you append length(B) zeros to A and call it A' (think of it as AAAA000)
  //   and   prepend length(A) zeros to B and call it B' (think of it as 0000BBB)
  //   then exists a C' = A' ^ B'
-  // - remember: if you XOR someting with zero, it remains unchanged: X ^ 0 = X
+  // - remember: if you XOR something with zero, it remains unchanged: X ^ 0 = X
  // - that means C' = A concat B so that crc(A concat B) = crc(C') = crc(A') ^ crc(B')
  // - the trick is to compute crc(A') based on crc(A)
  //                       and crc(B') based on crc(B)
--- a/caffe2/serialize/inline_container.h
+++ b/caffe2/serialize/inline_container.h
@ -76,7 +76,7 @@ typedef struct mz_zip_archive mz_zip_archive;
 // 2) Writing with 1-pass sequential access
 //      -> We must take care not to require updating values that have already
 //         been written. We place the variable-length index at the end and do
-//         not put any indicies into the header to fulfill this constraint.
+//         not put any index into the header to fulfill this constraint.

 // The model.json, which contains all the metadata information,
 // should be written as the last file. One reason is that the size of tensor
--- a/caffe2/serialize/inline_container_test.cc
+++ b/caffe2/serialize/inline_container_test.cc
@ -519,7 +519,7 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoadWithAllocator) {
  std::tie(data_ptr, size) = reader.getRecord("key1", &overrideAllocator);
  EXPECT_EQ(overrideAllocator.getAllocatedBytes(), kBytes1);
  EXPECT_EQ(baseAllocator.getAllocatedBytes(), allocBytes);
-  // allcoate with base allocator
+  // allocate with base allocator
  std::tie(data_ptr, size) = reader.getRecord("key1");
  EXPECT_EQ(overrideAllocator.getAllocatedBytes(), kBytes1);
  EXPECT_EQ(baseAllocator.getAllocatedBytes(), allocBytes + kBytes1);
--- a/docs/source/autograd.md
+++ b/docs/source/autograd.md
@ -423,8 +423,10 @@ Also see {ref}`saved-tensors-hooks-doc`.

 ```{eval-rst}
 .. autofunction:: torch.autograd.graph.get_gradient_edge
+```

-
+```{eval-rst}
+.. autofunction:: torch.autograd.graph.set_warn_on_accumulate_grad_stream_mismatch
 ```

 % This module needs to be documented. Adding here in the meantime
--- a/docs/source/notes/libtorch_stable_abi.md
+++ b/docs/source/notes/libtorch_stable_abi.md
@ -2,9 +2,9 @@

 ## Overview

-The LibTorch Stable ABI (Application Binary Interface) provides an interface for extending PyTorch functionality without being tightly coupled to specific PyTorch versions. This enables the development of custom operators and extensions that remain compatible across PyTorch releases.
+The LibTorch Stable ABI (Application Binary Interface) provides a limited interface for extending PyTorch functionality without being tightly coupled to specific PyTorch versions. This enables the development of custom operators and extensions that remain compatible across PyTorch releases. This limited set of APIs is not intended to replace existing LibTorch, but rather to provide a stable foundation for a majority of custom extension use cases. If there is any API you would like to see added to the stable ABI, please file a request through a [new issue on the PyTorch repo](https://github.com/pytorch/pytorch/issues).

-The stable ABI consists of three main components:
+The limited stable ABI consists of three main components:

 1. **Stable C headers** - Low-level C API implemented by libtorch (primarily `torch/csrc/inductor/aoti_torch/c/shim.h`)
 2. **Header-only C++ library** - Standalone utilities implemented in only headers such that there is no dependence on libtorch (`torch/headeronly/*`)
@ -14,8 +14,8 @@ We discuss each of these in detail

 ### `torch/headeronly`

-This is a set of inlined C++ headers are completely decoupled from libtorch. The headers consist of certain utilities that might be familiar to custom extension writers. For example, the
-`c10::ScalarType` enum lives here as `torch::headeronly::ScalarType`.
+The inlined C++ headers living in [`torch/headeronly`](https://github.com/pytorch/pytorch/tree/main/torch/headeronly) are completely decoupled from LibTorch. The headers consist of certain utilities that might be familiar to custom extension writers. For example, the
+`c10::ScalarType` enum lives here as `torch::headeronly::ScalarType`, as well as a libtorch-independent version of `TORCH_CHECK` that is `STD_TORCH_CHECK`. You can trust all APIs in the `torch::headeronly` namespace to not depend on `libtorch.so`. These APIs are also globally listed in [torch/header_only_apis.txt](https://github.com/pytorch/pytorch/blob/main/torch/header_only_apis.txt).

 ### `torch/csrc/stable`

@ -34,8 +34,14 @@ We are continuing to improve coverage in our `torch/csrc/stable` APIs. Please fi

 ### Stable C headers

-The stable C headers used by AOTInductor form the foundation of the stable ABI. However, this is **use at your own risk**. For example, users must handle the memory lifecycle of objects returned by certain APIs.
- Further, the stack-based APIs discussed below which allow the user to call the PyTorch dispatcher don't provide strong guarantees on forward and backward compatibility.
+The stable C headers started by AOTInductor form the foundation of the stable ABI. Presently, the available C headers include:
+
+- [torch/csrc/inductor/aoti_torch/c/shim.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/inductor/aoti_torch/c/shim.h): Includes C-style shim APIs for commonly used functionality regarding Tensors, dtypes, CUDA, and the like.
+- [torch/csrc/inductor/aoti_torch/generated/c_shim_aten.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/inductor/aoti_torch/generated/c_shim_aten.h): Includes C-style shim APIs for ATen ops from `native_functions.yaml` (e.g. `aoti_torch_aten_new_empty`).
+- [torch/csrc/inductor/aoti_torch/generated/c_shim_*.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/inductor/aoti_torch/generated): Includes C-style shim APIs for specific backend kernels dispatched from `native_functions.yaml` (e.g. `aoti_torch_cuda_pad`). These APIs should only be used for the specific backend they are named after (e.g. `aoti_torch_cuda_pad` should only be used within CUDA kernels), as they opt out of the dispatcher.
+- [torch/csrc/stable/c/shim.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/stable/c/shim.h): We are building out more ABIs to logically live in `torch/csrc/stable/c` instead of continuing the AOTI naming that no longer makes sense for our general use case.
+
+These headers are promised to be ABI stable across releases and adhere to a stronger backwards compatibility policy than LibTorch. Specifically, we promise not to modify them for at least 2 years after they are released. However, this is **use at your own risk**. For example, users must handle the memory lifecycle of objects returned by certain APIs. Further, the stack-based APIs discussed below which allow the user to call into the PyTorch dispatcher do not provide strong guarantees on forward and backward compatibility of the underlying op that is called.

 Unless absolutely necessary, we recommend the high-level C++ API in `torch/csrc/stable`
 which will handle all the rough edges of the C API for the user.
--- a/docs/source/xpu.md
+++ b/docs/source/xpu.md
@ -76,6 +76,7 @@
    :nosignatures:

     empty_cache
+     get_per_process_memory_fraction
     max_memory_allocated
     max_memory_reserved
     mem_get_info
--- a/setup.py
+++ b/setup.py
@ -1106,7 +1106,7 @@ class build_ext(setuptools.command.build_ext.build_ext):
                continue
            self.copy_file(source_lib, target_lib)
            # Delete old rpath and add @loader_lib to the rpath
-            # This should prevent delocate from attempting to package another instance
+            # This should prevent delocate from attempting to package another instance
            # of OpenMP library in torch wheel as well as loading two libomp.dylib into
            # the address space, as libraries are cached by their unresolved names
            install_name_tool_args = [
--- a/test/distributed/_composable/fsdp/test_fully_shard_autograd.py
+++ b/test/distributed/_composable/fsdp/test_fully_shard_autograd.py
@ -266,7 +266,7 @@ class TestFullyShardPostAccGradHookMultiThread(FSDPTestMultiThread):
        model(inp).sum().backward()
        param_names = {param_name for param_name, _ in model.named_parameters()}
        self.assertEqual(param_names, set(param_name_to_hook_count.keys()))
-        for param_name, count in param_name_to_hook_count.items():
+        for count in param_name_to_hook_count.values():
            self.assertEqual(count, 1)


--- a/test/distributed/_composable/fsdp/test_fully_shard_training.py
+++ b/test/distributed/_composable/fsdp/test_fully_shard_training.py
@ -827,7 +827,7 @@ class TestFullyShardShardPlacementFnMultiProcess(FSDPTest):

        torch.manual_seed(42 + self.rank)
        inp = torch.randint(0, model_args.vocab_size, (2, 16), device=device_type.type)
-        for iter_idx in range(5):
+        for _ in range(5):
            ref_loss = ref_model(inp).sum()
            loss = model(inp).sum()
            self.assertEqual(ref_loss, loss)
--- a/test/distributed/elastic/multiprocessing/api_test.py
+++ b/test/distributed/elastic/multiprocessing/api_test.py
@ -800,6 +800,7 @@ if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS or IS_CI):
                    stderr_redirects={0: stderr_redir},
                    ret_vals={0: queue},
                    queue_finished_reading_event=worker_finished_event_mock,
+                    numa_options=None,
                )
                self.assertEqual("hello_0", queue.get())
                if stdout_redir:
--- a/test/distributed/fsdp/test_distributed_checkpoint.py
+++ b/test/distributed/fsdp/test_distributed_checkpoint.py
@ -31,17 +31,17 @@ if TEST_WITH_DEV_DBG_ASAN:
    sys.exit(0)


-_DISTRIBUTED_STATE_DICT_IMPLS = (
+_DISTRIBUTED_STATE_DICT_IMPLS = {
    StateDictType.LOCAL_STATE_DICT,
    StateDictType.SHARDED_STATE_DICT,
-)
+}


 class TestDistributedCheckpoint(FSDPTest):
    @property
    def world_size(self):
-        if torch.cuda.is_available():
-            gpu_cnt = torch.cuda.device_count()
+        if torch.accelerator.is_available():
+            gpu_cnt = torch.accelerator.device_count()
            if gpu_cnt < 2:
                return gpu_cnt
        return 2
@ -93,7 +93,9 @@ class TestDistributedCheckpoint(FSDPTest):
        # TODO: add resharding test case.


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestDistributedCheckpoint, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(
+    TestDistributedCheckpoint, globals(), only_for=devices, allow_xpu=True
+)
 if __name__ == "__main__":
    run_tests()
--- a/test/distributed/fsdp/test_fsdp_apply.py
+++ b/test/distributed/fsdp/test_fsdp_apply.py
@ -36,8 +36,8 @@ device_type = torch.device(get_devtype())
 class TestApply(FSDPTest):
    @property
    def world_size(self):
-        if torch.cuda.is_available():
-            gpu_cnt = torch.cuda.device_count()
+        if torch.accelerator.is_available():
+            gpu_cnt = torch.accelerator.device_count()
            if gpu_cnt < 2:
                return gpu_cnt
        return 2
--- a/test/distributed/fsdp/test_fsdp_misc.py
+++ b/test/distributed/fsdp/test_fsdp_misc.py
@ -514,18 +514,17 @@ class TestFSDPMiscMultiProcess(FSDPTest):
    def test_fsdp_cpu_training(self):
        """Tests FSDP training on CPU."""
        gloo_pg = dist.new_group(backend="gloo")
-        for ss in [  # noqa: F841
+        for ss in [
            ShardingStrategy.NO_SHARD,
            ShardingStrategy.FULL_SHARD,
            ShardingStrategy.SHARD_GRAD_OP,
-            ShardingStrategy.HYBRID_SHARD,
-            ShardingStrategy._HYBRID_SHARD_ZERO2,
        ]:
            torch.manual_seed(42)
            model = MyModel()
            ref_model = DDP(deepcopy(model), process_group=gloo_pg)
            model = FSDP(
                model,
+                sharding_strategy=ss,
                auto_wrap_policy=always_wrap_policy,
                process_group=gloo_pg,
                device_id=torch.device("cpu"),
--- a/test/distributed/nn/jit/test_instantiator.py
+++ b/test/distributed/nn/jit/test_instantiator.py
@ -2,7 +2,6 @@
 # Owner(s): ["oncall: distributed"]

 import sys
-from pathlib import Path

 import torch
 import torch.distributed as dist
@ -45,53 +44,19 @@ class TestInstantiator(TestCase):
        self.assertEqual(return_type_str, "Tuple[Tensor, int, str]")

    def test_instantiate_scripted_remote_module_template(self):
-        dir_path = Path(instantiator.INSTANTIATED_TEMPLATE_DIR_PATH)
-
-        # Cleanup.
-        file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")
-        for file_path in file_paths:
-            file_path.unlink()
-
-        # Check before run.
-        file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")
-        num_files_before = len(list(file_paths))
-        self.assertEqual(num_files_before, 0)
-
        generated_module = instantiator.instantiate_scriptable_remote_module_template(
            MyModuleInterface
        )
        self.assertTrue(hasattr(generated_module, "_remote_forward"))
        self.assertTrue(hasattr(generated_module, "_generated_methods"))

-        # Check after run.
-        file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")
-        num_files_after = len(list(file_paths))
-        self.assertEqual(num_files_after, 1)
-
    def test_instantiate_non_scripted_remote_module_template(self):
-        dir_path = Path(instantiator.INSTANTIATED_TEMPLATE_DIR_PATH)
-
-        # Cleanup.
-        file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")
-        for file_path in file_paths:
-            file_path.unlink()
-
-        # Check before run.
-        file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")
-        num_files_before = len(list(file_paths))
-        self.assertEqual(num_files_before, 0)
-
        generated_module = (
            instantiator.instantiate_non_scriptable_remote_module_template()
        )
        self.assertTrue(hasattr(generated_module, "_remote_forward"))
        self.assertTrue(hasattr(generated_module, "_generated_methods"))

-        # Check after run.
-        file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")
-        num_files_after = len(list(file_paths))
-        self.assertEqual(num_files_after, 1)
-

 if __name__ == "__main__":
    run_tests()
--- a/test/distributed/tensor/debug/test_debug_mode.py
+++ b/test/distributed/tensor/debug/test_debug_mode.py
@ -64,6 +64,38 @@ class TestDTensorDebugMode(TestCase):
        self.assertTrue(isinstance(debug_mode.operators[2], _RedistributeCall))
        self.assertEqual(next(iter(debug_mode.operators[1])), torch.ops.aten.mm.default)

+        # check stringification
+        self.assertTrue(hasattr(debug_mode.operators[0], "args_str"))
+        self.assertFalse(hasattr(debug_mode.operators[0], "args"))
+
+        # check recording hook
+        def mm(x, y):
+            return (x @ y).sum()
+
+        eager_out = mm(x_dtensor, y_dtensor)
+
+        # check recording hook for compiled variant
+        with (
+            DebugMode() as debug_mode,
+            DebugMode.record_outputs(),
+            DebugMode.log_tensor_hashes(),
+        ):
+            compiled_out = torch.compile(mm, backend="aot_eager")(x_dtensor, y_dtensor)
+
+        # check numerical equivalence
+        self.assertTrue(torch.equal(eager_out, compiled_out))
+        sum_op = next(
+            iter(
+                op
+                for op in debug_mode.operators
+                if isinstance(op, _OpCall) and str(op.op) == "aten.sum.default"
+            )
+        )
+        self.assertTrue(torch.equal(sum_op.record["output"], eager_out.to_local()))
+        self.assertTrue(
+            "aten::sum(t: f32[1, 32])  # {'hash': " in debug_mode.debug_string()
+        )
+
    def test_debug_string_inside_context(self):
        mesh = DeviceMesh(self.device_type, list(range(self.world_size)))

@ -267,6 +299,7 @@ class TestDTensorDebugMode(TestCase):
            record_torchfunction=True,
            record_faketensor=True,
            record_tensor_attributes=["a1", "a2"],
+            store_original_args=True,
        ) as debug_mode:
            torch.matmul(y, x)

@ -279,6 +312,9 @@ class TestDTensorDebugMode(TestCase):
      aten::_unsafe_view(t: f32[64, 8], [8, 8, 8])""",
        )

+        self.assertTrue(hasattr(debug_mode.operators[0], "args"))
+        self.assertEqual(id(debug_mode.operators[0].args[0]), id(y))
+
    @parametrize("has_inner_mode", [True, False])
    @parametrize("has_outer_mode", [True, False])
    def test_nested_debug_mode(self, has_inner_mode, has_outer_mode):
--- a/test/distributed/tensor/test_attention.py
+++ b/test/distributed/tensor/test_attention.py
@ -20,18 +20,18 @@ from torch.distributed.tensor.experimental._attention import (
    _cp_options,
    _disable_context_parallel_dispatcher,
    _enable_context_parallel_dispatcher,
+    _HeadTailLoadBalancer,
    _is_causal_behavior,
+    _LoadBalancer,
+    _PerDocumentHeadTailLoadBalancer,
+    _PTRRLoadBalancer,
    _RotateMethod,
    context_parallel,
    context_parallel_unshard,
    set_rotate_method,
 )
-from torch.distributed.tensor.experimental._cp_custom_ops import flex_cp_allgather
-from torch.distributed.tensor.experimental._load_balancer import (
-    _HeadTailLoadBalancer,
-    _LoadBalancer,
-    _PerDocumentHeadTailLoadBalancer,
-    _PTRRLoadBalancer,
+from torch.distributed.tensor.experimental._context_parallel._cp_custom_ops import (
+    flex_cp_allgather,
 )
 from torch.distributed.tensor.parallel import parallelize_module
 from torch.nn.attention import sdpa_kernel, SDPBackend
@ -52,7 +52,9 @@ from torch.testing._internal.common_cuda import (
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_utils import run_tests, skipIfRocm
 from torch.testing._internal.distributed._tensor.common_dtensor import (
+    create_local_tensor_test_class,
    DTensorTestBase,
+    map_local_tensor_for_rank,
    with_comms,
 )

@ -800,11 +802,47 @@ class TestSharding(DTensorTestBase):
        chunks = freqs_cis.chunk(self.world_size * 2)
        self.assertEqual(
            freqs_cis_shard,
-            torch.cat(
-                [chunks[self.rank], chunks[self.world_size * 2 - self.rank - 1]], dim=0
+            map_local_tensor_for_rank(
+                chunks,
+                self.rank,
+                lambda chunks, rank: torch.cat(
+                    [chunks[rank], chunks[self.world_size * 2 - rank - 1]],
+                    dim=0,
+                ),
            ),
        )


+RingAttentionTestWithLocalTensor = create_local_tensor_test_class(
+    RingAttentionTest,
+    skipped_tests=[
+        # Need to make attention implementation local tensor friendly, e.g.
+        # rewrite "rank local" logic
+        "test_ring_attention_sdpa",
+    ],
+)
+
+CPFlexAttentionTestWithLocalTensor = create_local_tensor_test_class(
+    CPFlexAttentionTest,
+    skipped_tests=[
+        # Missing support for batched tensors
+        "test_cp_flex_attention_causal_mask",
+        "test_cp_flex_attention_document_mask",
+    ],
+)
+
+TestCPCustomOpsWithLocalTensor = create_local_tensor_test_class(
+    TestCPCustomOps,
+    skipped_tests=[
+        # Missing support for fake tensors
+        "test_flex_cp_custom_op",
+    ],
+)
+
+TestShardingWithLocalTensor = create_local_tensor_test_class(
+    TestSharding,
+)
+
+
 if __name__ == "__main__":
    run_tests()
--- a/test/distributed/tensor/test_convolution_ops.py
+++ b/test/distributed/tensor/test_convolution_ops.py
@ -16,6 +16,7 @@ from torch.distributed.tensor import (
 from torch.nn import functional as F
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
+    create_local_tensor_test_class,
    DTensorTestBase,
    skip_if_lt_x_gpu,
    with_comms,
@ -203,34 +204,42 @@ class DistConvolutionOpsTest(DTensorTestBase):
        self.assertTrue(b_dt.grad is not None)
        self.assertTrue(x_dt.grad is None)

+    def _run_single_arg_fwd(self, model, arg) -> tuple[torch.Tensor, torch.Tensor]:
+        """Given model and arg, runs the model forward both locally and distributed over the device mesh"""
+        device_mesh = self.build_device_mesh()
+        model_copy = copy.deepcopy(model).to(device=self.device_type)
+        dist_model = distribute_module(model, device_mesh, _conv_fn)
+        arg_dt = DTensor.from_local(arg, device_mesh, [Replicate()])
+        out_dt = dist_model(arg_dt.to(device=self.device_type))
+        out = model_copy(arg)
+        return (out_dt.full_tensor(), out)
+
    @with_comms
    def test_conv1d(self):
-        device_mesh = self.build_device_mesh()
        model = nn.Conv1d(64, 64, 3, padding=1)
-        model_gt = copy.deepcopy(model)
-        x = torch.randn(1, 64, 8)
-        x_dt = DTensor.from_local(x, device_mesh, [Replicate()])
-        model_dt = distribute_module(
-            model, device_mesh, _conv_fn, input_fn=None, output_fn=None
-        )
-        out_dt = model_dt(x_dt)
-        out = model_gt(x)
+        x = torch.randn(1, 64, 8, device=self.device_type)
+        out_dt, out = self._run_single_arg_fwd(model, x)
        self.assertEqual(out_dt.shape, out.shape)

    @with_comms
    def test_conv3d(self):
-        device_mesh = self.build_device_mesh()
        model = nn.Conv3d(64, 64, 3, padding=1)
-        model_gt = copy.deepcopy(model).to(device=self.device_type)
        x = torch.randn(1, 64, 8, 8, 8, device=self.device_type)
-        x_dt = DTensor.from_local(x, device_mesh, [Replicate()])
-        model_dt = distribute_module(
-            model, device_mesh, _conv_fn, input_fn=None, output_fn=None
-        )
-        out_dt = model_dt(x_dt)
-        out = model_gt(x)
+        out_dt, out = self._run_single_arg_fwd(model, x)
        self.assertEqual(out_dt.shape, out.shape)


+DistConvolutionOpsTestWithLocalTensor = create_local_tensor_test_class(
+    DistConvolutionOpsTest,
+    # Send / recv ops are not supported
+    skipped_tests=[
+        "test_conv1d",
+        "test_conv3d",
+        "test_conv_backward_none_grad_inp",
+        "test_depthwise_convolution",
+        "test_downsampling_convolution",
+    ],
+)
+
 if __name__ == "__main__":
    run_tests()
--- a/test/distributed/tensor/test_dtensor_export.py
+++ b/test/distributed/tensor/test_dtensor_export.py
@ -520,6 +520,21 @@ class DTensorExportTest(TestCase):
            2,
        )

+    def test_union_typed_annotation(self):
+        def fn(leaf: torch.Tensor | DTensor):
+            def nest_fn(leaf: torch.Tensor | DTensor):
+                # def nest_fn(leaf: Union[torch.Tensor, DTensor]):  # this works
+                if isinstance(leaf, DTensor):
+                    leaf = leaf.to_local()
+                return leaf
+
+            return nest_fn(leaf) + 1
+
+        z = torch.randn(16, 16)
+        gm = graph_capture_and_aot_export_joint_with_descriptors(fn, (z,))
+
+        self.assertEqual(fn(z), gm(z)[0])
+

 instantiate_parametrized_tests(DTensorExportTest)

--- a/Show More
+++ b/Show More