Compare commits

..

58 Commits

Author SHA1 Message Date
e105c47187 remove more lines 2025-09-15 15:51:32 -07:00
bb9d89d32b update 2025-09-15 15:15:47 -07:00
259bb6f535 update 2025-09-15 15:09:42 -07:00
9c85b9f05b fix instance logger 2025-09-15 15:07:24 -07:00
ff61b90cde update 2025-09-15 14:58:56 -07:00
d1a640b146 update 2025-09-15 14:42:20 -07:00
074af6f329 update 2025-09-15 14:41:05 -07:00
cb0624f175 update 2025-09-15 14:33:54 -07:00
da969718ad update 2025-09-15 14:30:34 -07:00
c94cca7b59 update 2025-09-15 14:21:44 -07:00
fbdb9ba538 update 2025-09-15 14:16:49 -07:00
607f616b3e update 2025-09-15 14:15:07 -07:00
eca1bf31b7 update 2025-09-15 14:12:54 -07:00
606c46aee0 update 2025-09-15 14:07:28 -07:00
d093a0afd4 No Distributed Log Spew 2025-09-15 13:57:09 -07:00
5dc4e78047 Fix excess refcounting in ObjLoaderFunc (#161528)
expectRef is preferred over expect because it doesn't copy a std::shared_ptr.

Differential Revision: [D81053710](https://our.internmc.facebook.com/intern/diff/D81053710/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161528
Approved by: https://github.com/Skylion007
2025-09-15 16:05:50 +00:00
c9e57d7e9f [CI] Move libtorch-cpu-shared-with-deps-release-build to python 3.10 (#162877)
Related to https://github.com/pytorch/pytorch/pull/162862

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162877
Approved by: https://github.com/malfet
2025-09-15 15:27:25 +00:00
70337a066f [easy] Handle Autotuners in get_triton_source_codes_for_gm (#161914)
Some Triton kernels are autotuners; in that case, grab the underlying function from the autotuner.
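A minimal sketch of that unwrapping, assuming Triton's `Autotuner` wrapper exposes the wrapped function via `.fn` (illustrative, not the actual Inductor helper):

```python
from triton.runtime.autotuner import Autotuner

# Hedged sketch: peel Autotuner layers until we reach the underlying JITFunction,
# whose source code can then be collected.
def unwrap_kernel(kernel):
    while isinstance(kernel, Autotuner):
        kernel = kernel.fn
    return kernel
```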

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161914
Approved by: https://github.com/oulgen
2025-09-15 15:19:04 +00:00
7d1bcd9aea [easy] Fix unsigned long issue in static cuda launcher (#162920)
Fixes https://github.com/pytorch/pytorch/issues/162430

It's a bit hard to come up with a unit test where the stream value exceeds a C++ long, so this just relies on the existing unit tests for now.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162920
Approved by: https://github.com/Skylion007, https://github.com/jansel
2025-09-15 15:00:32 +00:00
09cbf34e93 [BE] Preserve caller source location in the error message (#162808)
Summary:
Currently, C10_CUDA_CHECK only reports its own source location in the CUDAException, like below:
```
Exception raised from c10_cuda_check_implementation at fbcode/caffe2/c10/cuda/CUDAException.cpp:44
```
which is not terribly useful.

Looking at the original diff D39619861 that introduced c10_cuda_check_implementation, it seems the original macro reported the caller's source location correctly, but c10_cuda_check_implementation broke it.

This diff propagates the caller's source location to c10_cuda_check_implementation to fix the issue.

Test Plan:
CI

Observed the desired error message after the change:
```
CUDA error: an illegal memory access was encountered
Search for `cudaErrorIllegalAddress' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Device-side assertion tracking was not enabled by user.
Exception raised from operator() at fbcode/sigrid/predictor/aed/AedContainer.cpp:659 (most recent call first):
```

Note that the last line now reports the actual caller location.

Rollback Plan:

Reviewed By: Raymo111

Differential Revision: D81880552

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162808
Approved by: https://github.com/janeyx99
2025-09-15 13:29:43 +00:00
456fbeaa6d [xla hash update] update the pinned xla hash (#162947)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned xla hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162947
Approved by: https://github.com/pytorchbot
2025-09-15 11:42:02 +00:00
a8c80f3fa9 Update slow tests (#162946)
This PR is auto-generated weekly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/weekly.yml).
Update the list of slow tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162946
Approved by: https://github.com/pytorchbot
2025-09-15 11:31:37 +00:00
bf6b40da3e fix deterministic scatter_add path for multi-d tensors (#162866)
Previously, `select` didn't work correctly for tensors with more than 2 dimensions.
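A hedged repro-style sketch of the affected path (assumes a CUDA build; deterministic `scatter_add_` routes through `index_put`):

```python
import torch

torch.use_deterministic_algorithms(True)

# A 3-D case: previously the deterministic path mishandled dims beyond the second.
src = torch.ones(2, 3, 4, device="cuda")
index = torch.zeros(2, 3, 4, dtype=torch.long, device="cuda")
out = torch.zeros(2, 3, 4, device="cuda").scatter_add_(1, index, src)
print(out[:, 0])  # every source row accumulates into slot 0 along dim 1 -> all 3s
```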
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162866
Approved by: https://github.com/valentinandrei
2025-09-15 06:50:00 +00:00
814ba34fa6 [2/N] Port 5 _composable distributed test to Intel GPU (#159241)
For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU. This is the second PR for the _composable cases; the first is https://github.com/pytorch/pytorch/pull/159118.
We enable Intel GPU with the following methods while keeping the original code style as much as possible (see the sketch after this list):

- Use `torch.accelerator.current_accelerator()` to determine the accelerator backend
- Enable XPU for some test paths
- Skip test cases that Intel GPU does not support
- Add "cpu:gloo,xpu:xccl" as the distributed backend
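A minimal sketch of the device-agnostic pattern, assuming `torch.accelerator.current_accelerator()`; the backend strings are illustrative:

```python
import torch

acc = torch.accelerator.current_accelerator()
device_type = acc.type if acc is not None else "cpu"

# Pick a distributed backend string per device type (assumed mapping).
backend = {"cuda": "cpu:gloo,cuda:nccl", "xpu": "cpu:gloo,xpu:xccl"}.get(device_type, "gloo")
print(device_type, backend)
```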

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159241
Approved by: https://github.com/guangyey, https://github.com/d4l3k
2025-09-15 06:24:58 +00:00
06bb32d55e Skip empty tests, they don't make sense for numerics (#162932)
Signed-off-by: Edward Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162932
Approved by: https://github.com/dcci
2025-09-15 06:20:26 +00:00
b3ad8f4a9c [BUG] Fix nonzero_static crash on CUDA when the input is an empty tensor (#162578)
Fixes #162473
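A hedged repro sketch (assumes a CUDA build): `nonzero_static` on an empty input should now return an output padded with `fill_value` instead of crashing in the kernel-grid computation.

```python
import torch

x = torch.empty(0, device="cuda")
out = torch.nonzero_static(x, size=4, fill_value=-1)
print(out.shape, out)  # torch.Size([4, 1]), filled with -1
```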

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162578
Approved by: https://github.com/ngimel
2025-09-15 05:44:15 +00:00
755cf90672 Redirect all use of filesystem to c10/utils/FileSystem.h (#162914)
Signed-off-by: Edward Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162914
Approved by: https://github.com/Skylion007, https://github.com/dcci, https://github.com/cyyever
2025-09-15 04:30:41 +00:00
76e5df3866 [BE] Use fmt::format to define Conv key (#162925)
Also use `getArrayRefString` instead of having separate cases for 2D and 3D Conv
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162925
Approved by: https://github.com/Skylion007
ghstack dependencies: #162921
2025-09-15 02:44:12 +00:00
7fe1f5ea49 [BE] Delete [Ventura|Sonoma]Ops header (#162921)
These were a temporary solution to make PyTorch+MPS buildable on macOS 12, but they are no longer needed: as of 2.9+, MPS is only supported on macOS Sonoma and newer.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162921
Approved by: https://github.com/Skylion007, https://github.com/dcci
2025-09-15 02:44:12 +00:00
e156a07171 [Precompile] [RFC] Implement aot_compile_module (#162171)
This PR adds a new interface _aot_compile to `OptimizedModule`, so that the following is possible:

```
mod = SimpleLinearModule()
model = torch.compile(mod)
inputs = [
    ModelInput(
        args=(torch.randn(3, 3),),
        kwargs={},
        contexts=[torch.no_grad(), eval_mode(model)],
    ),
    ModelInput(
        args=(torch.randn(3, 3),), kwargs={}, contexts=[train_mode(model)]
    ),
]
assert isinstance(model, torch._dynamo.eval_frame.OptimizedModule)
model._aot_compile(
    inputs,
)
```

After this PR, you can AOT precompile NanoGPT and use it to train directly. I'll share my fork of the repo to make this work.

## ModelInput
The `ModelInput` API is a work in progress; for now it represents a set of inputs and contexts that instruct the compiler what to compile. Most commonly this is "compile an eval mode with no grad, and a training mode with grad", but it can also contain things like autocasting contexts, etc.

## Dispatch
Dispatching is super simple here: we just iterate through all the precompiled full graphs and check guards for each one until one passes (sketched below). I'm a bit worried that having this in Python code is going to be too expensive. The guard checks happen in C++ anyway, though, so the only Python-bottlenecked step is the for loop itself, so perhaps the overhead will not be high. I'll work on measuring this.
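A minimal sketch of that dispatch loop, with illustrative names rather than the actual implementation:

```python
def dispatch(precompiled_entries, args, kwargs):
    # Walk the precompiled full graphs; the guard check itself runs in C++,
    # so the Python-level cost is just this loop.
    for entry in precompiled_entries:
        if entry.guard_manager.check(args, kwargs):
            return entry.compiled_fn(*args, **kwargs)
    raise RuntimeError("no precompiled graph matched the given inputs")
```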

## TODOs

This PR does not support `mod.compile()`, only `torch.compile(mod)`. To support `mod.compile()`, we'll need to update torch.nn.Module accordingly; I can add that frontend later.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162171
Approved by: https://github.com/zhxchen17
2025-09-14 23:32:28 +00:00
ba5ca31676 [MPS] sparse mps any (#162885)
Add the SparseMPS dispatch key for the `any` op.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162885
Approved by: https://github.com/malfet, https://github.com/Skylion007
2025-09-14 18:57:53 +00:00
8e1db46493 [MPS] enable empty like and unsqueeze for SparseMPS (#162910)
Enable `empty_like` and `unsqueeze` for SparseMPS.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162910
Approved by: https://github.com/malfet, https://github.com/Skylion007
2025-09-14 17:47:06 +00:00
aff2438554 QoL: add pip to requirements-build.txt (#162896)
uv venvs don't come with pip by default, but setup.py, for example, assumes it is available.

Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162896
Approved by: https://github.com/Skylion007
2025-09-14 17:08:05 +00:00
3f8a2e62ea Fix rebind_unbacked in torch.fx.experimental.symbolic_shapes (#162788)
## Description
Fix float type handling in `torch.fx.experimental.symbolic_shapes`. [#162480](https://github.com/pytorch/pytorch/issues/162480)

## Issue
When using AOTInductor to compile YOLOv10, I encounter the bug `'float' object has no attribute 'node'`.
[Torch AOTInductor Ahead-Of-Time Compilation Fail](https://github.com/opendatalab/DocLayout-YOLO/issues/177)

The problem is due to missing float type handling.
https://github.com/pytorch/pytorch/blob/main/torch/fx/experimental/symbolic_shapes.py#L597
```
            if isinstance(u1, int):
                log.info(
                    "rebind_unbacked: discard %s %s %s -> %s",
                    n.target,
                    raw_u0,
                    path,
                    u1,
                )
                continue
```

## Solution
Change `if isinstance(u1, int)` to `if isinstance(u1, (int, float))`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162788
Approved by: https://github.com/ezyang
2025-09-14 17:07:14 +00:00
6d64bc3990 [data foundation][vizard] Prevent checking the device type of numpy object in Tensorboard logger (#162888)
Summary:
The check was introduced in D82262053.
- `scalar_value` could be a numpy object.
  - Move the `device.type` check into the `make_np` method so it only runs when the value is a `torch.Tensor` (see the sketch below).
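A hedged sketch of that guard (simplified; the real `make_np` lives in `torch.utils.tensorboard._convert_np`):

```python
import numpy as np
import torch

def make_np(value):
    # Only consult .device for actual tensors; numpy objects pass straight through.
    if isinstance(value, torch.Tensor):
        if value.device.type != "cpu":
            value = value.cpu()
        return value.detach().numpy()
    return np.asarray(value)
```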

Test Plan:
```
vizard launch -j 1x8 --launch=flow --config-path=pkg://vizard_projects.image_classification.configs --config-name=resnet50 ++flow.secure_group=ml_sensors ++flow.entitlement=ai_frameworks_pnb ++max_train_steps_per_epoch=10 ++max_epochs=5 ++log_every_n_steps=10 ++profiler=null ++max_eval_steps_per_epoch=10
```

Rollback Plan:

Differential Revision: D82383428

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162888
Approved by: https://github.com/xush6528
2025-09-14 08:09:08 +00:00
972140b7e9 [benchmark] Add HF LLM benchmarks (#156967)
Results in https://docs.google.com/spreadsheets/d/1xXOPg9JjEmPx0zc5QBNdyXQq8-K2_r4ybHaiS-q7pZ0/edit?gid=88695043#gid=88695043

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156967
Approved by: https://github.com/huydhn

Co-authored-by: Huy Do <huydhn@gmail.com>
2025-09-14 07:41:06 +00:00
84186c39ed [NVRTC] Enable compiling templated kernels (#162875)
Per the NVRTC doc (https://docs.nvidia.com/cuda/nvrtc/index.html#accessing-lowered-names), we can compile a templated kernel (e.g. `kernel<float>`) with the following steps.

NVRTC side
- (new) `nvrtcAddNameExpression` -> C++ template e.g. `f<float>`
- `nvrtcCompileProgram`
- (new) `nvrtcGetLoweredName` -> get the mangled name; copy it, since the string is freed once the NVRTC program is destroyed
- `nvrtcDestroyProgram`

CUDA side
- use mangled name instead of normal name -> profit
- `extern "C"` is not even needed

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162875
Approved by: https://github.com/msaroufim
2025-09-14 06:17:36 +00:00
74a35c6344 [Triton] [Inductor] Enable TMA store for TMA mm templates (#160480)
Summary:
Adds support for TMA store in all TMA matmul templates (notably persistent_tma, including addmm and scaled_mm). This works by requiring that a template be registered with `tma_store=True`; when that is set, it constructs indices/range_trees that hook into the existing codebase's TMA store support.

This also includes a couple of notable changes:
- Adds support in the TMA template for checking the output layout.
- Adds support for "hoisting" the tensor descriptor to the top of the kernel. This is currently only used by template code, but in principle it can be generalized to other implementations.
- Supports considering multiple indices as the "contiguous" index. This is handled by transposing the input data when the alignment is no longer consistent. In general, since the TMA support is derived from the index, it doesn't seem reasonable that the 1D index math forces a certain alignment depending on index ordering, as long as the layout matches.

Test Plan:
Tested with test_max_autotune.py unit tests.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160480
Approved by: https://github.com/NikhilAPatel
2025-09-14 04:56:49 +00:00
d2f6daf6a7 [audio hash update] update the pinned audio hash (#162892)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162892
Approved by: https://github.com/pytorchbot
2025-09-14 04:27:37 +00:00
e74b21d66a [vllm hash update] update the pinned vllm hash (#162891)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162891
Approved by: https://github.com/pytorchbot
2025-09-14 04:27:35 +00:00
f01bf0f64b Do not use // but use CleanDiv or FloorDiv instead (#162869)
Summary:
When rewriting sympy expressions in the compiler codebase, we want to generate
FloorDiv(a, b) or CleanDiv(a, b) directly and not a // b, since the latter becomes floor(a*pow(b, -1)).

For SymNodes we handle that conversion automatically in the SymNode op dispatch.
I will follow up with an issue to track all other usages of //.
This was blocking an internal model. A small sympy illustration follows.
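A small sympy illustration of the difference (assumes the helpers in `torch.utils._sympy.functions`):

```python
import sympy
from torch.utils._sympy.functions import FloorDiv

a, b = sympy.symbols("a b", integer=True, positive=True)

print(a // b)          # sympy rewrites // into floor(a/b), i.e. floor(a*b**-1)
print(FloorDiv(a, b))  # stays a single FloorDiv node the compiler can reason about
```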

Test Plan:
add test
run existing tests.
dakechen1993 testing on the model.

Rollback Plan:

Differential Revision: D82362241

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162869
Approved by: https://github.com/ezyang
2025-09-14 01:30:33 +00:00
886699bc5c Port shared_ptr optimization in std::shared_ptr to intrusive_ptr (#162784)
Summary:
Please see D21021645 for details about the optimization and why it's beneficial.

A similar change has been added to libstdc++ as well; see dbf8bd3c2f.

Rollback Plan:

Reviewed By: yfeldblum

Differential Revision: D81960754

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162784
Approved by: https://github.com/swolchok
2025-09-13 21:01:00 +00:00
72b5159782 [flatbuffer] Fix compile error due to discarded result (#162767)
Summary:
One of our builds fails because the return value of fread is discarded. An explicit cast to void fixes the build.

```log
In file included from fbcode/caffe2/torch/csrc/jit/mobile/import.cpp:15:
fbcode/caffe2/torch/csrc/jit/mobile/file_format.h:156:3: error: ignoring return value of function declared with 'warn_unused_result' attribute [-Werror,-Wunused-result]
  156 |   fread(data.get(), size, 1, f);
      |   ^~~~~ ~~~~~~~~~~~~~~~~~~~~~~
1 error generated.
...
BUILD FAILED
Failed to build 'fbcode//caffe2:libtorch (cfg:opt-linux-x86_64-clang19-no-san-opt-by-default#fef256f7ee896871)'
```

Test Plan:
No runtime behavior change. CI.

Rollback Plan:

Differential Revision: D82265002

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162767
Approved by: https://github.com/Skylion007
2025-09-13 20:24:43 +00:00
f37eaebed1 Add missing tags parameter to custom_op overload signatures (#162047)
It appears to be an omission in #149782.
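A hedged usage sketch (the op name is illustrative; the `tags` keyword itself comes from #149782, this PR only fixes the typing overloads):

```python
import torch
from torch import Tensor

@torch.library.custom_op("mylib::scale", mutates_args=(), tags=(torch.Tag.pt2_compliant_tag,))
def scale(x: Tensor, factor: float) -> Tensor:
    # With this PR, type checkers should accept the `tags` keyword above.
    return x * factor
```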
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162047
Approved by: https://github.com/zou3519, https://github.com/BoyuanFeng

Co-authored-by: Boyuan Feng <fby.1994@gmail.com>
2025-09-13 19:57:23 +00:00
5b9114bf19 Revert "[ROCm/Windows] Support aotriton for scaled_dot_product_attention on Windows. (#162330)"
This reverts commit 62843c14bbf694f5722fd6e1075da4792507fe42.

Reverted https://github.com/pytorch/pytorch/pull/162330 on behalf of https://github.com/atalman due to Sorry reverting looks like broke windows nightlies see https://github.com/pytorch/pytorch/issues/162881 ([comment](https://github.com/pytorch/pytorch/pull/162330#issuecomment-3288544921))
2025-09-13 15:43:50 +00:00
deb7ebe0a3 Revert "[Reland] Use std::string_view in torchgen (#158625)"
This reverts commit 972e409829343cc2062aeee0994a9c1c735d216a.

Reverted https://github.com/pytorch/pytorch/pull/158625 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it seems to break a couple of ExecuTorch tests for Vulkan backend ([comment](https://github.com/pytorch/pytorch/pull/158625#issuecomment-3287754275))
2025-09-13 07:52:50 +00:00
9c93dc8123 Revert "Return NoOpDeviceGuardImpl in replace of CudaDeviceGuard when device is not available, or cpu-only build (#160532)"
This reverts commit a956c4ab1cb13079203a8f07eb26218724f54dc8.

Reverted https://github.com/pytorch/pytorch/pull/160532 on behalf of https://github.com/huydhn due to Reverted internally ([comment](https://github.com/pytorch/pytorch/pull/160532#issuecomment-3287745165))
2025-09-13 07:42:12 +00:00
31040b6357 Revert "port some distributed tensor test files for Intel GPU (#161703)"
This reverts commit 179f10621b418427fc6e92f58ea2b0bbe4cc9c52.

Reverted https://github.com/pytorch/pytorch/pull/161703 on behalf of https://github.com/huydhn due to Sorry for reverting your change but these tests are failing internally ([comment](https://github.com/pytorch/pytorch/pull/161703#issuecomment-3287720713))
2025-09-13 07:22:14 +00:00
aa41d3e49c Claude loves making these files in top level, ignore them for sanity. (#162806)
Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162806
Approved by: https://github.com/albanD
2025-09-13 04:59:00 +00:00
f0fcf436c5 [audio hash update] update the pinned audio hash (#162864)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162864
Approved by: https://github.com/pytorchbot
2025-09-13 04:17:21 +00:00
5663910472 [vllm hash update] update the pinned vllm hash (#162751)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162751
Approved by: https://github.com/pytorchbot
2025-09-13 04:16:51 +00:00
da669d51bf fusion of large accumulated reads only at ir level (#161978)
This is to revert some of the changes in https://github.com/pytorch/pytorch/pull/158667

In particular, we now only disallow fusion of large accumulated reads at the IR level and not at the scheduler level, since users can supply their own custom fusion logic at the scheduler level.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161978
Approved by: https://github.com/yf225
2025-09-13 04:07:25 +00:00
783985e9fe kjt pytree registration (#161114)
Differential Revision: D80656182

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161114
Approved by: https://github.com/henryoier
2025-09-13 03:57:43 +00:00
49d30f9a23 Fix boxcox to return same result for same input in one batch (#162772)
Summary:
The SIMD path uses the SLEEF version of `pow`, which is slightly different from `std::pow`. The fix is to use the same vectorized code (with partial load and store) for the trailing data as well, to ensure consistency between results.

Rollback Plan:

Differential Revision: D82265247

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162772
Approved by: https://github.com/swolchok
2025-09-13 03:57:35 +00:00
66133b1ab7 Build vLLM aarch64 nightly wheels (#162664)
PyTorch has been publishing aarch64 nightly wheels for all CUDA versions since https://github.com/pytorch/pytorch/pull/162364.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162664
Approved by: https://github.com/atalman
2025-09-13 03:43:55 +00:00
543d50db2b Fix torch export with dict input nested in args (#162618)
Investigated together with @pyemma and @taotaohuang001

## Problem
When calling the exported module with a dict nested in the args tuple, it raises the following complaint:
```
Traceback (most recent call last):
  File "/home/chzhu/infinitrain/test_torch_export.py", line 32, in <module>
    print(exported_model({"a2": torch.randn(10), "a1": torch.randn(10)}))
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 848, in call_wrapped
    return self._wrapped_call(self, *args, **kwargs)
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 424, in __call__
    raise e
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 411, in __call__
    return super(self.cls, obj).__call__(*args, **kwargs)  # type: ignore[misc]
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1879, in _call_impl
    return inner()
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1806, in inner
    args_kwargs_result = hook(self, args, kwargs)  # type: ignore[misc]
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 929, in _fn
    return fn(*args, **kwargs)
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/export/_unlift.py", line 81, in _check_input_constraints_pre_hook
    flat_args_with_path = _check_inputs_match(args, kwargs, self._in_spec)
  File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/export/_unlift.py", line 64, in _check_inputs_match
    raise ValueError(  # noqa: B904
ValueError: Trying to flatten user inputs with exported input tree spec:
TreeSpec(tuple, None, [TreeSpec(tuple, None, [TreeSpec(dict, ['a1', 'a2'], [*,
      *])]),
  TreeSpec(dict, [], [])])
but actually got inputs with tree spec of:
TreeSpec(tuple, None, [TreeSpec(tuple, None, [TreeSpec(dict, ['a2', 'a1'], [*,
      *])]),
  TreeSpec(dict, [], [])]).
Please check that the inputs have the same number and type of args and kwargs as the ones you used when tracing.

```

## How to reproduce the issue
```python
import torch

# create a nn.Module with data_batch as input and output as output
class MyModel(torch.nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.linear = torch.nn.Linear(10, 1)

    def forward(self, data_batch):
        h1 = self.linear(data_batch["a1"])
        h2 = self.linear(data_batch["a2"])
        return h1 + h2

# torch export this module
model = MyModel()
example_args_forward = (
    {
        "a1": torch.randn(10),
        "a2": torch.randn(10),
    },
)
exported_model = torch.export.export(model, example_args_forward, strict=True)

# save the exported model
torch.export.save(exported_model, "exported_model.pt2")

# load the exported model
exported_model = torch.export.load("exported_model.pt2").module()

# run the exported model
print(exported_model({"a2": torch.randn(10), "a1": torch.randn(10)}))

```

## Root Cause
The input spec is encoded as a [TreeSpec](582d278983/torch/utils/_pytree.py (L1059)) in torch export, with (args, kwargs) at the top level. When we call the exported model, a pre-execution [hook](582d278983/torch/export/_unlift.py (L66)) checks that the input TreeSpec matches the received TreeSpec, and in a TreeSpec the dict key order is preserved. Something like

TreeSpec(dict, ['a2', 'a1'], [*,*])

To work around this, the input check reorders [kwargs](582d278983/torch/export/_unlift.py (L67)); that is why kwargs can be out of order. But a dict nested in the args is not reordered, so any reordering of its keys throws the error above.

## Solution
Update eq_spec to handle the dict case, where we only require that the key sets match, without ordering constraints. A sketch of the relaxed comparison is shown below.
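A hedged sketch of the relaxed comparison (illustrative only; attribute names follow `torch.utils._pytree.TreeSpec`):

```python
from torch.utils._pytree import TreeSpec

def specs_match(a: TreeSpec, b: TreeSpec) -> bool:
    # Dict specs: only require equal key sets, comparing children per key.
    if a.type is dict and b.type is dict:
        if set(a.context) != set(b.context):
            return False
        a_children = dict(zip(a.context, a.children_specs))
        b_children = dict(zip(b.context, b.children_specs))
        return all(specs_match(a_children[k], b_children[k]) for k in a_children)
    # Everything else: exact structural match.
    return (
        a.type == b.type
        and a.context == b.context
        and len(a.children_specs) == len(b.children_specs)
        and all(specs_match(x, y) for x, y in zip(a.children_specs, b.children_specs))
    )
```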
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162618
Approved by: https://github.com/angelayi
2025-09-13 03:24:30 +00:00
7dd5f7b125 Revert "python fastpath for DTensor detach(), confirm that aliasing DTensorSpec is ok (#160580)"
This reverts commit 4b2d297eec425475a82934a52e0edd96805524a1.

Reverted https://github.com/pytorch/pytorch/pull/160580 on behalf of https://github.com/bdhirsh due to this broke shampoo, yanking ([comment](https://github.com/pytorch/pytorch/pull/160580#issuecomment-3287372891))
2025-09-13 02:04:36 +00:00
a956c4ab1c Return NoOpDeviceGuardImpl in replace of CudaDeviceGuard when device is not available, or cpu-only build (#160532)
Summary:

To support exporting a CUDA model on a CPU-only machine under fake tensor mode, users commonly need to move sample inputs to the CUDA device with a .to("cuda:0") or .to("cuda") call.
This diff supports that.
I expect the following pattern to work:
```
with FakeTensorMode(allow_non_fake_inputs=True):
    cuda_module = module.to("cuda:0")
    cuda_sample_inputs = tuple([x.to("cuda:0") for x in sample_inputs])
    with torch.no_grad():
        ep = torch.export.export(cuda_module, cuda_sample_inputs)
```

Test Plan:
CI

Rollback Plan:

Differential Revision: D80181887

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160532
Approved by: https://github.com/henryoier, https://github.com/ezyang
2025-09-13 01:50:51 +00:00
143 changed files with 2742 additions and 1287 deletions

View File

@ -7,4 +7,4 @@ set -ex
SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh

View File

@ -21,6 +21,7 @@ self-hosted-runner:
- linux.arm64.2xlarge.ephemeral
- linux.arm64.m7g.4xlarge
- linux.arm64.m7g.4xlarge.ephemeral
- linux.arm64.r7g.12xlarge.memory
- linux.4xlarge.nvidia.gpu
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu

View File

@ -1 +1 @@
caba63f0fa29ef9e3d566699f32f11c07c8bda4e
87ff22e49ed0e92576c4935ccb8c143daac4a3cd

View File

@ -1 +1 @@
f510715882304796a96e33028b4f6de1b026c2c7
973c9d01da863cac9c51e8a5c0d390fc84b84fbc

View File

@ -1 +1 @@
6c5478ff7c3d50dd1e3047d72ec5909bea474073
c77852e117bdf056c8e9a087e51d6f65cf6ba53d

View File

@ -82,16 +82,10 @@ RUN if command -v apt-get >/dev/null; then \
apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim; \
else \
dnf install -y git curl wget sudo vim; \
dnf install -y git curl wget sudo; \
fi \
&& python3 --version && python3 -m pip --version
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version >/dev/null 2>&1; then \
@ -220,11 +214,16 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
echo "Installing sccache..."; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
SCCACHE_ARCHIVE="sccache-v0.8.1-aarch64-unknown-linux-musl"; \
else \
SCCACHE_ARCHIVE="sccache-v0.8.1-x86_64-unknown-linux-musl"; \
fi; \
curl -L -o sccache.tar.gz "https://github.com/mozilla/sccache/releases/download/v0.8.1/${SCCACHE_ARCHIVE}.tar.gz" \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& sudo mv "${SCCACHE_ARCHIVE}"/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz "${SCCACHE_ARCHIVE}" \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
@ -285,7 +284,7 @@ RUN if command -v apt-get >/dev/null; then \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
else \
dnf install -y git curl wget sudo vim; \
dnf install -y git curl wget sudo; \
fi \
&& python3 --version && python3 -m pip --version
@ -298,12 +297,6 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \

View File

@ -84,6 +84,9 @@ repackage_wheel() {
rm -rf $package
}
# Require to re-package the wheel
${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1
pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do
repackage_wheel $package

View File

@ -12,6 +12,9 @@ on:
paths:
- .github/workflows/build-vllm-wheel.yml
- .github/ci_commit_pins/vllm.txt
schedule:
# every morning at 01:30PM UTC, 9:30AM EST, 6:30AM PST
- cron: 30 13 * * *
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -24,21 +27,33 @@ jobs:
fail-fast: false
matrix:
python-version: [ '3.12' ]
# TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554
# TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129' ]
runner: [ 'linux.12xlarge.memory' ]
include:
- device: cu128
- platform: manylinux_2_28_x86_64
device: cu128
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8'
- device: cu129
runner: linux.12xlarge.memory
- platform: manylinux_2_28_x86_64
device: cu129
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
name: "Build ${{ matrix.device }} vLLM wheel"
runner: linux.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu128
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8'
runner: linux.arm64.r7g.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu129
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9'
runner: linux.arm64.r7g.12xlarge.memory
name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
runs-on: ${{ matrix.runner }}
timeout-minutes: 480
env:
PY_VERS: ${{ matrix.python-version }}
MANYLINUX_IMAGE: ${{ matrix.manylinux-image }}
PLATFORM: 'manylinux_2_28_x86_64'
PLATFORM: ${{ matrix.platform }}
BUILD_DEVICE: ${{ matrix.device }}
steps:
- name: Setup SSH (Click me for login details)
@ -136,7 +151,7 @@ jobs:
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }}
name: vllm-wheel-${{ matrix.device }}-${{ matrix.platform }}-${{ matrix.python-version }}
if-no-files-found: error
path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl
@ -146,15 +161,17 @@ jobs:
# Copied from build-triton-wheel workflow (mostly)
upload-wheel:
name: "Upload ${{ matrix.device }} vLLM wheel"
name: "Upload ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
needs:
- build-wheel
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129' ]
env:
PLATFORM: ${{ matrix.platform }}
BUILD_DEVICE: ${{ matrix.device }}
permissions:
id-token: write
@ -190,15 +207,15 @@ jobs:
run: |
set -eux
mkdir -p "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-"${PLATFORM}"-*/* "${RUNNER_TEMP}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
- name: Set DRY_RUN
if: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
shell: bash
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
- name: Set UPLOAD_CHANNEL
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}
shell: bash
run: |

View File

@ -39,7 +39,7 @@ jobs:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
cuda-arch-list: '8.0;8.6'
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -62,7 +62,7 @@ jobs:
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },

View File

@ -127,8 +127,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

View File

@ -140,8 +140,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

2
.gitignore vendored
View File

@ -389,3 +389,5 @@ android/pytorch_android_torchvision/.cxx
# Claude Code local configuration
CLAUDE.local.md
/test_*.py
/debug_*.py

View File

@ -874,7 +874,7 @@ cmake_dependent_option(
"Whether to build the flash_attention kernel for scaled dot product attention.\
Will be disabled if not supported by the platform"
ON
"USE_CUDA OR USE_ROCM"
"USE_CUDA OR USE_ROCM;NOT MSVC"
OFF)
cmake_dependent_option(
@ -909,7 +909,7 @@ cmake_dependent_option(
# USE_FLASH_ATTENTION -> USE_ROCM -> Dependencies.cmake -> aotriton.cmake
#
if(USE_ROCM)
if(USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)
if(UNIX AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
include(cmake/External/aotriton.cmake)
endif()
endif()

View File

@ -2174,7 +2174,7 @@ static void _scatter_via_index_put(
if (self.dim() == 1 || broadcast_index) {
Tensor squeezed = index;
if (broadcast_index && index.dim() > 1) {
for (const auto d : c10::irange(index.dim())) {
for (int64_t d = index.dim() - 1; d >= 0; --d) {
if (d == dim) {
continue;
}

View File

@ -317,6 +317,17 @@ void nonzero_static_cuda_out_impl(
out_temp =
Tensor(at::detail::empty_cuda({self.dim(), size}, out.options())).t();
}
// If input has zero elements, avoid kernel grid calculations (which can
// produce zero divisors) and just fill the output with fill_value.
if (self.numel() == 0) {
if (need_to_copy) {
out_temp.fill_(fill_value);
out.copy_(out_temp);
} else {
out.fill_(fill_value);
}
return;
}
int64_t* out_data_ptr = need_to_copy ? out_temp.mutable_data_ptr<int64_t>()
: out.mutable_data_ptr<int64_t>();

View File

@ -1,48 +0,0 @@
#pragma once
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))
typedef NS_ENUM(NSUInteger, MPSGraphFFTScalingMode) {
MPSGraphFFTScalingModeNone = 0L,
MPSGraphFFTScalingModeSize = 1L,
MPSGraphFFTScalingModeUnitary = 2L,
};
@interface FakeMPSGraphFFTDescriptor : NSObject<NSCopying>
@property(readwrite, nonatomic) BOOL inverse;
@property(readwrite, nonatomic) MPSGraphFFTScalingMode scalingMode;
@property(readwrite, nonatomic) BOOL roundToOddHermitean;
+ (nullable instancetype)descriptor;
@end
@compatibility_alias MPSGraphFFTDescriptor FakeMPSGraphFFTDescriptor;
@interface MPSGraph (SonomaOps)
- (MPSGraphTensor* _Nonnull)conjugateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)realPartOfTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)fastFourierTransformWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)realToHermiteanFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)HermiteanToRealFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
@end
// define BFloat16 enums for MacOS13
#define MPSDataTypeBFloat16 ((MPSDataType)(MPSDataTypeAlternateEncodingBit | MPSDataTypeFloat16))
// define Metal version
#define MTLLanguageVersion3_1 ((MTLLanguageVersion)((3 << 16) + 1))
#endif

View File

@ -1,196 +0,0 @@
#pragma once
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
// TODO: Remove me when moved to MacOS 13
#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))
@interface FakeMPSGraphConvolution3DOpDescriptor : NSObject<NSCopying>
@property(readwrite, nonatomic) NSUInteger strideInX;
@property(readwrite, nonatomic) NSUInteger strideInY;
@property(readwrite, nonatomic) NSUInteger strideInZ;
@property(readwrite, nonatomic) NSUInteger dilationRateInX;
@property(readwrite, nonatomic) NSUInteger dilationRateInY;
@property(readwrite, nonatomic) NSUInteger dilationRateInZ;
@property(readwrite, nonatomic) NSUInteger paddingLeft;
@property(readwrite, nonatomic) NSUInteger paddingRight;
@property(readwrite, nonatomic) NSUInteger paddingTop;
@property(readwrite, nonatomic) NSUInteger paddingBottom;
@property(readwrite, nonatomic) NSUInteger paddingFront;
@property(readwrite, nonatomic) NSUInteger paddingBack;
@property(readwrite, nonatomic) MPSGraphPaddingStyle paddingStyle;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout dataLayout;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout weightsLayout;
@property(readwrite, nonatomic) NSUInteger groups;
@end
@compatibility_alias MPSGraphConvolution3DOpDescriptor FakeMPSGraphConvolution3DOpDescriptor;
#endif
@interface MPSGraph (VenturaOps)
#if !defined(__MAC_13_0) && (!defined(MAC_OS_X_VERSION_13_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_0))
typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode) {
MPSGraphResizeNearestRoundingModeRoundPreferCeil = 0L,
MPSGraphResizeNearestRoundingModeRoundPreferFloor = 1L,
MPSGraphResizeNearestRoundingModeCeil = 2L,
MPSGraphResizeNearestRoundingModeFloor = 3L,
MPSGraphResizeNearestRoundingModeRoundToEven = 4L,
MPSGraphResizeNearestRoundingModeRoundToOdd = 5L,
};
// Define complex enums for MacOS 12
#define MPSDataTypeComplexBit 0x01000000
#define MPSDataTypeComplexFloat32 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 64))
#define MPSDataTypeComplexFloat16 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 32))
#endif
- (MPSGraphTensor* _Nonnull)convolution3DWithSourceTensor:(MPSGraphTensor* _Nonnull)source
weightsTensor:(MPSGraphTensor* _Nonnull)weights
descriptor:(MPSGraphConvolution3DOpDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)
convolution3DDataGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
weightsTensor:(MPSGraphTensor* _Nonnull)weights
outputShape:(MPSShape* _Nonnull)outputShape
forwardConvolutionDescriptor:
(MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)
convolution3DWeightsGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
sourceTensor:(MPSGraphTensor* _Nonnull)source
outputShape:(MPSShape* _Nonnull)outputShape
forwardConvolutionDescriptor:
(MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)cumulativeSumWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)inverseOfTensor:(MPSGraphTensor* _Nonnull)inputTensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
layout:(MPSGraphTensorNamedDataLayout)layout
normalizeCoordinates:(BOOL)normalizeCoordinates
relativeCoordinates:(BOOL)relativeCoordinates
alignCorners:(BOOL)alignCorners
paddingMode:(MPSGraphPaddingMode)paddingMode
samplingMode:(MPSGraphResizeMode)samplingMode
constantValue:(double)constantValue
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
layout:(MPSGraphTensorNamedDataLayout)layout
normalizeCoordinates:(BOOL)normalizeCoordinates
relativeCoordinates:(BOOL)relativeCoordinates
alignCorners:(BOOL)alignCorners
paddingMode:(MPSGraphPaddingMode)paddingMode
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
constantValue:(double)constantValue
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)truncateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
@end

View File

@ -9,8 +9,6 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>
#include <fmt/ranges.h>

View File

@ -8,8 +8,6 @@
#include <ATen/native/TensorIterator.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/operations/BinaryKernel.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <fmt/format.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,23 +1,12 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/ConvUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_mps_convolution_native.h>
#include <ATen/ops/_mps_convolution_transpose_native.h>
#include <ATen/ops/mps_convolution_backward_native.h>
#include <ATen/ops/mps_convolution_transpose_backward_native.h>
#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))
@implementation FakeMPSGraphConvolution3DOpDescriptor
- (nonnull id)copyWithZone:(nullable NSZone*)zone {
return self;
}
@end
#endif
#include <fmt/format.h>
namespace at::native {
@ -50,11 +39,9 @@ static void fill_conv3d_desc(MPSGraphConvolution3DOpDescriptor* descriptor_,
descriptor_.paddingFront = paddingDepth;
descriptor_.paddingBack = paddingDepth;
// PyTorch always uses NCDHW memory layout for 3D tensors
descriptor_.dataLayout = (MPSGraphTensorNamedDataLayout)7L; // MPSGraphTensorNamedDataLayoutNCDHW;
descriptor_.dataLayout = MPSGraphTensorNamedDataLayoutNCDHW;
// PyTorch always uses OIDHW memory layout for 3D weights
descriptor_.weightsLayout = (MPSGraphTensorNamedDataLayout)9L; // MPSGraphTensorNamedDataLayoutOIDHW;
descriptor_.weightsLayout = MPSGraphTensorNamedDataLayoutOIDHW;
descriptor_.groups = groups; // not yet tested in Xcode/C++
}
@ -186,18 +173,6 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
if (bias_defined)
bias_shape = bias_opt.value().sizes();
std::string mem_format_key;
switch (memory_format) {
case at::MemoryFormat::Contiguous:
mem_format_key = "Contiguous";
break;
case at::MemoryFormat::ChannelsLast:
mem_format_key = "ChannelsLast";
break;
default:
assert(0 && "Check should have been done earlier\n");
}
std::string bias_shape_key;
if (bias_defined) {
bias_shape_key = std::to_string(bias_shape[0]);
@ -205,20 +180,16 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
bias_shape_key = "nobias";
}
std::string key;
if (is3DConv) {
key = "mps_3d_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;
} else {
key = "mps_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;
}
std::string key = fmt::format("mps_{}convolution:{}:{}:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
is_channels_last,
mps::getTensorsStringKey({input_t, weight_t}),
bias_defined,
bias_shape_key);
MPSShape* inputShape = mps::getMPSShape(input_t, memory_format);
MPSShape* outputShape = mps::getMPSShape(output_t, memory_format);
@ -400,33 +371,15 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
@autoreleasepool {
MPSStream* stream = getCurrentMPSStream();
std::string mem_format_key;
switch (memory_format) {
case at::MemoryFormat::Contiguous:
mem_format_key = "Contiguous";
break;
case at::MemoryFormat::ChannelsLast:
mem_format_key = "ChannelsLast";
break;
default:
assert(0 && "Check should have been done earlier\n");
}
MPSShape* mps_input_shape = getMPSShape(input_size);
std::string key;
if (is3DConv) {
key = "mps_3d_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
":" + std::to_string(stride[2]) + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
getTensorsStringKey({grad_output_t, weight_t});
} else {
key = "mps_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
getTensorsStringKey({grad_output_t, weight_t});
}
std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
is_channels_last,
getTensorsStringKey({grad_output_t, weight_t}));
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
auto gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t);
auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
@ -551,19 +504,13 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
MPSStream* stream = getCurrentMPSStream();
MPSShape* mps_weight_shape = getMPSShape(weight_size);
std::string key;
if (is3DConv) {
key = "mps_3d_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" +
getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
} else {
key = "mps_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" +
getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
}
std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
getTensorsStringKey({grad_output_t, input_t, grad_weight_t}));
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSShape* inputShape = getMPSShape(input_t);
bool isDepthwiseConv =

View File

@ -2,7 +2,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_copy_from_and_resize_native.h>
#include <ATen/ops/_copy_from_native.h>

View File

@ -5,8 +5,6 @@
#include <ATen/native/DistributionTemplates.h>
#include <ATen/native/Distributions.h>
#include <ATen/native/TensorFactories.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,6 +1,4 @@
#include <ATen/native/SpectralOpsUtils.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
@ -12,20 +10,6 @@
#include <ATen/ops/_fft_r2c_native.h>
#endif
#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))
@implementation FakeMPSGraphFFTDescriptor
+ (nullable instancetype)descriptor {
// Redispatch the constructor to the actual implementation
id desc = NSClassFromString(@"MPSGraphFFTDescriptor");
return (FakeMPSGraphFFTDescriptor*)[desc descriptor];
}
- (nonnull id)copyWithZone:(nullable NSZone*)zone {
return self;
}
@end
#endif
namespace at::native {
namespace {
MPSGraphFFTScalingMode normalization_to_ScalingMode(int64_t normalization) {

View File

@ -2,7 +2,6 @@
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/GridSamplerUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/GridSampler.h>

View File

@ -17,7 +17,6 @@
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TensorAdvancedIndexing.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <c10/util/SmallVector.h>
#include <c10/util/irange.h>
#include <fmt/format.h>

View File

@ -6,9 +6,7 @@
#include <ATen/native/LinearAlgebra.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -4,7 +4,6 @@
#include <ATen/TensorUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <c10/util/irange.h>

View File

@ -4,7 +4,6 @@
#include <ATen/WrapDimUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -5,7 +5,6 @@
#include <ATen/native/SortingUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -2,8 +2,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/UnaryOps.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,7 +1,6 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/Resize.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,7 +1,6 @@
// Copyright © 2023 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/UpSample.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>

View File

@ -4,8 +4,6 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>

View File

@ -2517,7 +2517,7 @@
dispatch:
CompositeExplicitAutograd: empty_like
QuantizedCPU, QuantizedCUDA: empty_like_quantized
SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: empty_like_sparse_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: empty_like_nested
autogen: empty_like.out
@ -6492,7 +6492,7 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: unsqueeze
SparseCPU, SparseCUDA: unsqueeze_sparse
SparseCPU, SparseCUDA, SparseMPS: unsqueeze_sparse
QuantizedCPU, QuantizedCUDA: unsqueeze_quantized
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: unsqueeze_nested
tags: core
@ -10259,7 +10259,7 @@
structured_delegate: any.all_out
variants: method, function
dispatch:
SparseCPU, SparseCUDA: any_sparse
SparseCPU, SparseCUDA, SparseMPS: any_sparse
tags: core
- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

View File

@ -95,72 +95,6 @@
#endif
#endif
#if defined(USE_ROCM) && (defined(USE_FLASH_ATTENTION) || defined(USE_MEM_EFF_ATTENTION))
namespace pytorch_flash
{
std::tuple<
at::Tensor,
at::Tensor,
at::Tensor,
at::Tensor,
at::Tensor,
at::Tensor,
at::Tensor,
at::Tensor>
mha_fwd(
const at::Tensor& q, // batch_size x seqlen_q x num_heads x head_size
const at::Tensor& k, // batch_size x seqlen_k x num_heads_k x head_size
const at::Tensor& v, // batch_size x seqlen_k x num_heads_k x head_size
std::optional<at::Tensor>&
out_, // batch_size x seqlen_q x num_heads x head_size
std::optional<at::Tensor>&
alibi_slopes_, // num_heads or batch_size x num_heads
const float p_dropout,
const float softmax_scale,
bool is_causal,
std::optional<int64_t> window_size_left,
std::optional<int64_t> window_size_right,
const float softcap,
const bool return_softmax,
std::optional<at::Generator> gen_) {
#if defined(USE_ROCM_CK_SDPA)
if (at::globalContext().getROCmFAPreferredBackend() ==
at::ROCmFABackend::Ck) {
const int non_null_window_left = window_size_left.value_or(-1);
const int non_null_window_right = window_size_right.value_or(-1);
std::optional<at::Tensor> dummy_attn_bias = std::nullopt;
return mha_fwd_ck(
q,
k,
v,
out_,
p_dropout,
softmax_scale,
is_causal,
non_null_window_left,
non_null_window_right,
return_softmax,
gen_,
dummy_attn_bias); // Not used in flash attention
}
#endif
return mha_fwd_aot(
q,
k,
v,
out_,
alibi_slopes_,
p_dropout,
softmax_scale,
is_causal,
window_size_left,
window_size_right,
return_softmax,
gen_);
}
}
#endif
namespace at {
namespace cuda::philox {

View File

@ -270,7 +270,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor> mha_varle
#endif
TORCH_API
std::tuple<
inline std::tuple<
at::Tensor,
at::Tensor,
at::Tensor,
@ -294,7 +294,42 @@ mha_fwd(
std::optional<int64_t> window_size_right,
const float softcap,
const bool return_softmax,
std::optional<at::Generator> gen_);
std::optional<at::Generator> gen_) {
#if defined(USE_ROCM_CK_SDPA)
if (at::globalContext().getROCmFAPreferredBackend() ==
at::ROCmFABackend::Ck) {
const int non_null_window_left = window_size_left.value_or(-1);
const int non_null_window_right = window_size_right.value_or(-1);
std::optional<at::Tensor> dummy_attn_bias = std::nullopt;
return mha_fwd_ck(
q,
k,
v,
out_,
p_dropout,
softmax_scale,
is_causal,
non_null_window_left,
non_null_window_right,
return_softmax,
gen_,
dummy_attn_bias); // Not used in flash attention
}
#endif
return mha_fwd_aot(
q,
k,
v,
out_,
alibi_slopes_,
p_dropout,
softmax_scale,
is_causal,
window_size_left,
window_size_right,
return_softmax,
gen_);
}
inline std::tuple<
at::Tensor,

View File

@ -72,6 +72,12 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
"timm_vovnet",
"torchrec_dlrm",
"vgg16",
# LLM
"meta-llama/Llama-3.2-1B",
"google/gemma-2-2b",
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
}
)

View File

@ -55,6 +55,12 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
"timm_nfnet",
"torchrec_dlrm",
"vgg16",
# LLM
"meta-llama/Llama-3.2-1B",
"google/gemma-2-2b",
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
}
)

View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -167,3 +167,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,fail_accuracy,0
google/gemma-2-2b,fail_accuracy,0
google/gemma-3-4b-it,fail_accuracy,0
openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
google/gemma-2-2b,pass,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -733,7 +733,7 @@ def timed(
time_total = 0
# Dont collect outputs to correctly measure timing
for _ in range(times):
for i in range(times):
# If batch_size is 1, it too often collides with other non batch size
# dimensions resulting in errors.
if batch_size and batch_size > 1:
@ -1106,7 +1106,13 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
elif args.torchscript_jit_trace:
frozen_model_iter_fn = torchscript_jit_trace(model, example_inputs)
else:
frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)
if kwargs["hf_llm"]:
# If it's an llm, we want to optimize model.forward, and use
# the generate function
model.forward = torch._dynamo.run(model)
frozen_model_iter_fn = model_iter_fn
else:
frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)
for rep in trange(args.repeat, desc="running benchmark"):
inputs = (
@ -1120,7 +1126,10 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
maybe_mark_step(args)
# interleave the runs to handle frequency scaling and load changes
with maybe_mark_profile(p=p, mark="expected"):
with (
maybe_mark_profile(p=p, mark="expected"),
torch.compiler.set_stance("force_eager"),
):
timings[rep, 0], expected_output = timed(
model,
model_iter_fn,
@ -2233,11 +2242,12 @@ class BenchmarkRunner:
reset_rng_state()
model_copy = None
try:
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
with torch.compiler.set_stance("force_eager"):
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
except Exception as e:
accuracy_status = (
"eager_1st_run_OOM"
@ -2254,11 +2264,12 @@ class BenchmarkRunner:
reset_rng_state()
model_copy = None
try:
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_rerun_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
with torch.compiler.set_stance("force_eager"):
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_rerun_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
except Exception as e:
accuracy_status = (
"eager_2nd_run_OOM"
@ -2542,7 +2553,11 @@ class BenchmarkRunner:
)
baseline_timings = experiment(
model, example_inputs, mark="expected", **experiment_kwargs
self.model_iter_fn,
model,
example_inputs,
mark="expected",
**experiment_kwargs,
)
if self.args.export_aot_inductor:
@ -2610,7 +2625,11 @@ class BenchmarkRunner:
)
backend_timings = experiment(
model, example_inputs, mark="expected", **experiment_kwargs
self.model_iter_fn,
model,
example_inputs,
mark="expected",
**experiment_kwargs,
)
timings = np.stack((baseline_timings, backend_timings), axis=1)
result_summary = latency_experiment_summary(
@ -2629,9 +2648,17 @@ class BenchmarkRunner:
tag=None,
batch_size=None,
):
niters = 5
if getattr(self, "hf_llm", False):
# If we're benchmarking an llm, we want to use the generate function
self.model_iter_fn = self.generate
niters = 1
if self.args.xla:
with self.pick_grad(name, self.args.training):
return experiment(*self.maybe_cast(model, example_inputs))
return experiment(
self.model_iter_fn, *self.maybe_cast(model, example_inputs)
)
def warmup(fn, model, example_inputs, mode, niters=5):
gc.collect()
@ -2696,17 +2723,22 @@ class BenchmarkRunner:
with maybe_snapshot_memory(
self.args.snapshot_memory, f"eager_{self.args.only}"
):
eager_latency, eager_peak_mem, _ = warmup(
self.model_iter_fn, copy.deepcopy(model), example_inputs, "eager"
)
if self.args.use_warm_peak_memory:
_, eager_peak_mem, _ = warmup(
with torch.compiler.set_stance("force_eager"):
eager_latency, eager_peak_mem, _ = warmup(
self.model_iter_fn,
copy.deepcopy(model),
example_inputs,
"eager",
niters=1,
niters=niters,
)
if self.args.use_warm_peak_memory:
_, eager_peak_mem, _ = warmup(
self.model_iter_fn,
copy.deepcopy(model),
example_inputs,
"eager",
niters=1,
)
if (
self.args.export_aot_inductor
@ -2715,7 +2747,13 @@ class BenchmarkRunner:
):
optimized_model_iter_fn = optimize_ctx
else:
optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
if getattr(self, "hf_llm", False):
# If it's an llm, we want to optimize model.forward, and use
# the generate function
model = optimize_ctx(model)
optimized_model_iter_fn = self.model_iter_fn
else:
optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
with maybe_snapshot_memory(
self.args.snapshot_memory, f"compiled_{self.args.only}"
@ -2793,7 +2831,13 @@ class BenchmarkRunner:
f"{ok:3}/{total:3} +{frames_third_pass} frames {compilation_time:3.0f}s"
)
results.append(experiment(model, example_inputs, **experiment_kwargs))
experiment_kwargs["hf_llm"] = getattr(self, "hf_llm", False)
results.append(
experiment(
self.model_iter_fn, model, example_inputs, **experiment_kwargs
)
)
return " ".join(map(str, results))
def minify_model(
@ -4084,7 +4128,7 @@ def run(runner, args, original_dir=None):
# Overwrite 'translation_validation' config, if specified.
torch.fx.experimental._config.translation_validation = False
experiment = functools.partial(experiment, args, runner.model_iter_fn)
experiment = functools.partial(experiment, args)
if args.only and should_diff_branch(args):
import git

View File

@ -7,6 +7,7 @@ import os
import re
import subprocess
import sys
import types
import warnings
@ -128,6 +129,12 @@ with open(MODELS_FILENAME) as fh:
assert len(BATCH_SIZE_KNOWN_MODELS)
try:
from .huggingface_llm_models import HF_LLM_MODELS
except ImportError:
from huggingface_llm_models import HF_LLM_MODELS
def get_module_cls_by_model_name(model_cls_name):
_module_by_model_name = {
"Speech2Text2Decoder": "transformers.models.speech_to_text_2.modeling_speech_to_text_2",
@ -418,11 +425,8 @@ class HuggingfaceRunner(BenchmarkRunner):
use_eval_mode = self.args.use_eval_mode
dtype = torch.float32
reset_rng_state()
model_cls, config = self._get_model_cls_and_config(model_name)
model = self._download_model(model_name)
model = model.to(device, dtype=dtype)
if self.args.enable_activation_checkpointing:
model.gradient_checkpointing_enable()
# Get batch size
if model_name in BATCH_SIZE_KNOWN_MODELS:
batch_size_default = BATCH_SIZE_KNOWN_MODELS[model_name]
elif batch_size is None:
@ -440,14 +444,46 @@ class HuggingfaceRunner(BenchmarkRunner):
f"Running smaller batch size={batch_size} for {model_name}, orig batch_size={batch_size_default}" # noqa: G004
)
example_inputs = generate_inputs_for_model(
model_cls, model, model_name, batch_size, device, include_loss_args=True
)
# Get model and example inputs
if model_name in HF_LLM_MODELS:
benchmark_cls = HF_LLM_MODELS[model_name]
model, example_inputs = benchmark_cls.get_model_and_inputs(
model_name, device
)
# So we can check for correct gradients without eliminating the dropout computation
for attr in dir(config):
if "drop" in attr and isinstance(getattr(config, attr), float):
setattr(config, attr, 1e-30)
# Set this flag so that when we test for speedup, we use
# model.generate instead of using model.forward
self.hf_llm = True
def generate(self, _, example_inputs, collect_outputs=True):
return model.generate(**example_inputs)
self.generate = types.MethodType(generate, self)
else:
self.hf_llm = False
model_cls, config = self._get_model_cls_and_config(model_name)
model = self._download_model(model_name)
model = model.to(device, dtype=dtype)
example_inputs = generate_inputs_for_model(
model_cls, model, model_name, batch_size, device, include_loss_args=True
)
# So we can check for correct gradients without eliminating the dropout computation
for attr in dir(config):
if "drop" in attr and isinstance(getattr(config, attr), float):
setattr(config, attr, 1e-30)
# Turning off kv cache for torchbench models. This is not the right
# thing to do, but the pt2 dashboard is outdated. Real transformers
# benchmarks will be added soon using a different infra.
if hasattr(model, "config") and hasattr(model.config, "use_cache"):
model.config.use_cache = False
if self.args.enable_activation_checkpointing:
model.gradient_checkpointing_enable()
if (
is_training
@ -460,12 +496,6 @@ class HuggingfaceRunner(BenchmarkRunner):
else:
model.eval()
# Turning off kv cache for torchbench models. This is not the right
# thing to do, but the pt2 dashboard is outdated. Real transformers
# benchmarks will be added soon using a different infra.
if hasattr(model, "config") and hasattr(model.config, "use_cache"):
model.config.use_cache = False
self.validate_model(model, example_inputs)
return device, model_name, model, example_inputs, batch_size
@ -530,7 +560,8 @@ class HuggingfaceRunner(BenchmarkRunner):
def forward_pass(self, mod, inputs, collect_outputs=True):
with self.autocast(**self.autocast_arg):
return mod(**inputs)
res = mod(**inputs)
return res.logits if self.hf_llm else res
def forward_and_backward_pass(self, mod, inputs, collect_outputs=True):
cloned_inputs = clone_inputs(inputs)

View File

@ -9,9 +9,16 @@ skip:
# Fails with even batch size = 1
- GPTJForCausalLM
- GPTJForQuestionAnswering
# Model too big
- google/gemma-3-4b-it
device:
cpu: []
cpu:
- meta-llama/Llama-3.2-1B
- google/gemma-2-2b
- google/gemma-3-4b-it
- openai/whisper-tiny
- Qwen/Qwen3-0.6B
control_flow:
- AllenaiLongformerBase
@ -67,6 +74,11 @@ batch_size:
XGLMForCausalLM: 4
XLNetLMHeadModel: 2
YituTechConvBert: 2
meta-llama/Llama-3.2-1B: 8
google/gemma-2-2b: 8
google/gemma-3-4b-it: 8
openai/whisper-tiny: 8
Qwen/Qwen3-0.6B: 8
tolerance:

View File

@ -0,0 +1,102 @@
import subprocess
import sys
import torch
def pip_install(package):
subprocess.check_call([sys.executable, "-m", "pip", "install", package])
try:
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
WhisperForConditionalGeneration,
WhisperProcessor,
)
except ModuleNotFoundError:
print("Installing HuggingFace Transformers...")
pip_install("git+https://github.com/huggingface/transformers.git#egg=transformers")
finally:
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
WhisperForConditionalGeneration,
WhisperProcessor,
)
class Benchmark:
@staticmethod
def get_model_and_inputs(model_name, device):
raise NotImplementedError("get_model_and_inputs() not implemented")
class WhisperBenchmark(Benchmark):
SAMPLE_RATE = 16000
DURATION = 30.0 # seconds
@staticmethod
def get_model_and_inputs(model_name, device):
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
model.config.forced_decoder_ids = None
model.generation_config.do_sample = False
model.generation_config.temperature = 0.0
num_samples = int(WhisperBenchmark.DURATION * WhisperBenchmark.SAMPLE_RATE)
audio = torch.randn(num_samples) * 0.1
inputs = dict(
processor(
audio, sampling_rate=WhisperBenchmark.SAMPLE_RATE, return_tensors="pt"
)
)
inputs["input_features"] = inputs["input_features"].to(device)
decoder_start_token = model.config.decoder_start_token_id
inputs["decoder_input_ids"] = torch.tensor(
[[decoder_start_token]], device=device
)
return model, inputs
class TextGenerationBenchmark(Benchmark):
INPUT_LENGTH = 1000
OUTPUT_LENGTH = 2000
@staticmethod
def get_model_and_inputs(model_name, device):
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device)
model.eval()
model.generation_config.do_sample = False
model.generation_config.use_cache = True
model.generation_config.cache_implementation = "static"
model.generation_config.max_new_tokens = TextGenerationBenchmark.OUTPUT_LENGTH
model.generation_config.pad_token_id = tokenizer.eos_token_id
model.generation_config.temperature = 0.0
vocab_size = tokenizer.vocab_size
input_ids = torch.randint(
low=0,
high=vocab_size,
size=(1, TextGenerationBenchmark.INPUT_LENGTH),
device=device,
dtype=torch.long,
)
example_inputs = {"input_ids": input_ids}
return model, example_inputs
HF_LLM_MODELS: dict[str, Benchmark] = {
"meta-llama/Llama-3.2-1B": TextGenerationBenchmark,
"google/gemma-2-2b": TextGenerationBenchmark,
"google/gemma-3-4b-it": TextGenerationBenchmark,
"openai/whisper-tiny": WhisperBenchmark,
"Qwen/Qwen3-0.6B": TextGenerationBenchmark,
}

View File

@ -46,3 +46,8 @@ TrOCRForCausalLM,64
XGLMForCausalLM,32
XLNetLMHeadModel,16
YituTechConvBert,32
meta-llama/Llama-3.2-1B,8
google/gemma-2-2b,8
google/gemma-3-4b-it,8
openai/whisper-tiny,8
Qwen/Qwen3-0.6B,8

View File

@ -3269,7 +3269,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
is_le<sizeof(autograd_meta_), 16, FieldNameEnum::autograd_meta_>();
is_le<sizeof(extra_meta_), 16, FieldNameEnum::extra_meta_>();
are_equal<sizeof(version_counter_), 8, FieldNameEnum::version_counter_>();
are_equal<sizeof(pyobj_slot_), 8, FieldNameEnum::pyobj_slot_>();
are_equal<sizeof(pyobj_slot_), 16, FieldNameEnum::pyobj_slot_>();
are_equal<sizeof(sizes_and_strides_), 88, FieldNameEnum::sizes_and_strides_>();
are_equal<sizeof(storage_offset_), 8, FieldNameEnum::storage_offset_>();
are_equal<sizeof(numel_), 8, FieldNameEnum::numel_>();

View File

@ -13,10 +13,11 @@ struct C10_API PyInterpreterHooksInterface {
// Get the PyInterpreter instance
// Stub implementation throws error when Python is not available
// We return nullptr rather than throwing an error since there are bits of c10
// that expect an empty PyObjectSlot when python is not available.
virtual PyInterpreter* getPyInterpreter() const {
return nullptr;
TORCH_CHECK(
false,
"PyTorch was compiled without Python support. "
"Cannot access Python interpreter from C++.");
}
};
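A minimal sketch of the caller pattern the nullptr return above is meant to support. StubHooks is a hypothetical stand-in for the hooks interface, not the actual c10 class; the point is only that a Python-free build takes the "no interpreter" branch instead of hitting a hard failure:
#include <cstdio>
struct PyInterpreter;  // opaque in this sketch
// Hypothetical stand-in: a build without Python reports "no interpreter".
struct StubHooks {
  PyInterpreter* getPyInterpreter() const { return nullptr; }
};
int main() {
  StubHooks hooks;
  if (PyInterpreter* interp = hooks.getPyInterpreter()) {
    std::printf("Python available: %p\n", static_cast<void*>(interp));
  } else {
    // Callers that merely need an empty PyObjectSlot take this branch
    // rather than raising an error.
    std::printf("built without Python; continuing with empty slot\n");
  }
  return 0;
}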

View File

@ -2,7 +2,7 @@
namespace c10::impl {
PyObjectSlot::PyObjectSlot() : pyobj_(nullptr) {}
PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
PyObjectSlot::~PyObjectSlot() {
maybe_destroy_pyobj();
@ -10,9 +10,9 @@ PyObjectSlot::~PyObjectSlot() {
void PyObjectSlot::maybe_destroy_pyobj() {
if (owns_pyobj()) {
TORCH_INTERNAL_ASSERT(getGlobalPyInterpreter() != nullptr);
TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
(*getGlobalPyInterpreter())
(*pyobj_interpreter_.load(std::memory_order_acquire))
->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
// NB: this destructor can only be entered when there are no
// references to this C++ object (obviously), NOR any references
@ -25,7 +25,7 @@ void PyObjectSlot::maybe_destroy_pyobj() {
}
PyInterpreter* PyObjectSlot::pyobj_interpreter() {
return getGlobalPyInterpreter();
return pyobj_interpreter_.load(std::memory_order_acquire);
}
PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
@ -35,7 +35,7 @@ PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
}
PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
auto interpreter = getGlobalPyInterpreter();
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
if (interpreter) {
return *interpreter;
}

View File

@ -6,17 +6,10 @@
#include <c10/util/python_stub.h>
#include <optional>
#include <atomic>
namespace c10::impl {
// Function pointer type for getting the global interpreter
using GetPyInterpreterFn = PyInterpreter* (*)();
// Global function pointer (set by csrc initialization)
C10_API extern GetPyInterpreterFn g_get_pyinterpreter_fn;
// Helper function to get the global interpreter
C10_API PyInterpreter* getGlobalPyInterpreter();
struct C10_API PyObjectSlot {
public:
PyObjectSlot();
@ -33,6 +26,8 @@ struct C10_API PyObjectSlot {
// NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after
// PyObject if necessary!
void init_pyobj(PyObject* pyobj) {
pyobj_interpreter_.store(
getGlobalPyInterpreter(), std::memory_order_relaxed);
pyobj_ = pyobj;
}
@ -60,15 +55,18 @@ struct C10_API PyObjectSlot {
// @todo alban: I'm not too sure what's going on here, we can probably delete
// it but it's worthwhile making sure
std::optional<PyObject*> check_pyobj() const {
impl::PyInterpreter* interpreter = getGlobalPyInterpreter();
if (interpreter == nullptr || pyobj_ == nullptr) {
std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
impl::PyInterpreter* interpreter =
pyobj_interpreter_.load(std::memory_order_acquire);
if (interpreter == nullptr) {
return std::nullopt;
}
if (c10::impl::HermeticPyObjectTLS::get_state()) {
if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
return std::nullopt;
} else {
return _unchecked_untagged_pyobj();
}
return _unchecked_untagged_pyobj();
}
PyInterpreter& load_pyobj_interpreter() const;
@ -78,6 +76,30 @@ struct C10_API PyObjectSlot {
void set_owns_pyobj(bool b);
private:
// This field contains the interpreter tag for this object. See
// Note [Python interpreter tag] for general context
//
// Note [Memory ordering on Python interpreter tag]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// What memory_order do we need when accessing this atomic? We don't
// need a single total modification order (as provided by
// memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
// transition from -1 to some positive integer and never changes afterwards.
// Because there is only one modification, it trivially already has a total
// modification order (e.g., we don't need fences or locked instructions on
// x86)
//
// In fact, one could make a reasonable argument that relaxed reads are OK,
// due to the presence of external locking (GIL) to ensure that interactions
// with other data structures are still correctly synchronized, so that
// we fall in the "Single-Location Data Structures" case as described in
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
// However, on x86, it doesn't matter if I use acquire or relaxed on the load
// as I get the same assembly in both cases. So I just use the more
// conservative acquire (which will impede compiler optimizations but I don't
// care)
std::atomic<PyInterpreter*> pyobj_interpreter_;
// This field contains a reference to a PyObject representing this Tensor.
// If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
// PyObject for it and set this field. This field does not have to be
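A standalone sketch of the write-once atomic-pointer pattern the memory-ordering note above describes. The real slot stores with memory_order_relaxed under GIL protection; this isolated sketch uses a release store so the acquire load is sufficient on its own. Names here are illustrative:
#include <atomic>
#include <cassert>
struct Interpreter { int id; };
// Write-once tag: starts null, is set at most once, never changes afterwards.
std::atomic<Interpreter*> g_tag{nullptr};
void set_tag(Interpreter* interp) {
  // Release so a reader whose acquire load sees the pointer also sees the
  // fully-constructed Interpreter it points to.
  g_tag.store(interp, std::memory_order_release);
}
Interpreter* read_tag() {
  // Acquire (rather than seq_cst) is enough because there is only a single
  // modification, as the note above argues.
  return g_tag.load(std::memory_order_acquire);
}
int main() {
  static Interpreter interp{1};
  set_tag(&interp);
  assert(read_tag() == &interp);
  return 0;
}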

View File

@ -10,9 +10,9 @@ namespace c10::cuda {
void c10_cuda_check_implementation(
const int32_t err,
const char* /*filename*/,
const char* /*function_name*/,
const int /*line_number*/,
const char* filename,
const char* function_name,
const uint32_t line_number,
const bool include_device_assertions) {
const auto cuda_error = static_cast<cudaError_t>(err);
const auto cuda_kernel_failure = include_device_assertions
@ -41,7 +41,7 @@ void c10_cuda_check_implementation(
}
#endif
throw c10::AcceleratorError(
{__func__, __FILE__, int32_t(__LINE__)}, err, check_message);
{function_name, filename, line_number}, err, check_message);
}
} // namespace c10::cuda
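A minimal sketch of the pattern being applied here (not the actual C10_CUDA_CHECK macro): the macro captures __FILE__/__func__/__LINE__ where it is expanded and forwards them, so the reported location is the caller's rather than the checker's:
#include <cstdint>
#include <stdexcept>
#include <string>
// The implementation receives the caller's location; using __FILE__/__LINE__
// here instead would always point at this function, which is the problem the
// change above fixes for c10_cuda_check_implementation.
void check_impl(int err, const char* file, const char* func, uint32_t line) {
  if (err != 0) {
    throw std::runtime_error("error " + std::to_string(err) + " at " +
                             file + ":" + std::to_string(line) + " in " + func);
  }
}
// Expands wherever CHECK(expr) is written, so the captured location is the
// caller's source line.
#define CHECK(expr) check_impl((expr), __FILE__, __func__, static_cast<uint32_t>(__LINE__))
int main() {
  CHECK(0);    // passes
  // CHECK(1); // would throw with this file and line in the message
  return 0;
}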

View File

@ -91,7 +91,7 @@ C10_CUDA_API void c10_cuda_check_implementation(
const int32_t err,
const char* filename,
const char* function_name,
const int line_number,
const uint32_t line_number,
const bool include_device_assertions);
} // namespace c10::cuda

c10/util/FileSystem.h Normal file
View File

@ -0,0 +1,22 @@
// Shim header for filesystem for compilers that are too old to have it not
// in the experimental namespace
#if __has_include(<filesystem>)
#include <filesystem>
#elif __has_include(<experimental/filesystem>)
#include <experimental/filesystem>
#else
#error "Neither <filesystem> nor <experimental/filesystem> is available."
#endif
namespace c10 {
#if __has_include(<filesystem>)
// NOLINTNEXTLINE(misc-unused-alias-decls)
namespace filesystem = std::filesystem;
#elif __has_include(<experimental/filesystem>)
// NOLINTNEXTLINE(misc-unused-alias-decls)
namespace filesystem = std::experimental::filesystem;
#endif
} // namespace c10
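A small usage sketch for the new shim header: callers spell the namespace as c10::filesystem and get either std::filesystem or std::experimental::filesystem depending on the toolchain. The path shown is illustrative only:
#include <c10/util/FileSystem.h>
#include <iostream>
int main() {
  c10::filesystem::path p{"/tmp/example/model.pt"};
  std::cout << "parent: " << p.parent_path() << '\n'
            << "name:   " << p.filename() << '\n'
            << "exists: " << c10::filesystem::exists(p) << '\n';
  return 0;
}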

View File

@ -283,23 +283,55 @@ class intrusive_ptr final {
}
void reset_() noexcept {
if (target_ != NullType::singleton() &&
detail::atomic_refcount_decrement(target_->refcount_) == 0) {
// See comment above about weakcount. As long as refcount>0,
// weakcount is one larger than the actual number of weak references.
// So we need to decrement it here.
bool should_delete =
target_->weakcount_.load(std::memory_order_acquire) == 1;
if (!should_delete) {
// justification for const_cast: release_resources is basically a
// destructor and a destructor always mutates the object, even for const
// objects. NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
const_cast<std::remove_const_t<TTarget>*>(target_)->release_resources();
should_delete =
detail::atomic_weakcount_decrement(target_->weakcount_) == 0;
if (target_ != NullType::singleton()) {
#if defined(__linux__) && (defined(__aarch64__) || defined(__x86_64__))
if constexpr (
std::atomic<uint64_t>::is_always_lock_free &&
std::atomic<uint32_t>::is_always_lock_free &&
sizeof(std::atomic<uint64_t>) == 8 &&
sizeof(std::atomic<uint32_t>) == 4) {
auto both_counts_ =
reinterpret_cast<std::atomic<uint64_t>*>(&target_->refcount_);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
(reinterpret_cast<std::uintptr_t>(both_counts_) %
sizeof(std::atomic<uint64_t>)) == 0 &&
(reinterpret_cast<std::uintptr_t>(&target_->weakcount_) -
reinterpret_cast<std::uintptr_t>(both_counts_)) ==
sizeof(std::atomic<uint32_t>));
// 0x100000001ULL is a 64-bit number combination of both the refcount_
// and weakcount_ being 1.
constexpr uint64_t unique_ref_ = 0x100000001ULL;
if (both_counts_->load(std::memory_order_acquire) == unique_ref_) {
// Both counts are 1, so there are no weak references and
// we are releasing the last strong reference. No other
// threads can observe the effects of this target_ deletion
// call (e.g. calling use_count()) without a data race.
target_->refcount_.store(0, std::memory_order_relaxed);
delete target_;
return;
}
}
if (should_delete) {
delete target_;
#endif
if (detail::atomic_refcount_decrement(target_->refcount_) == 0) {
// See comment above about weakcount. As long as refcount>0,
// weakcount is one larger than the actual number of weak references.
// So we need to decrement it here.
bool should_delete =
target_->weakcount_.load(std::memory_order_acquire) == 1;
if (!should_delete) {
// justification for const_cast: release_resources is basically a
// destructor and a destructor always mutates the object, even for
// const objects.
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
const_cast<std::remove_const_t<TTarget>*>(target_)
->release_resources();
should_delete =
detail::atomic_weakcount_decrement(target_->weakcount_) == 0;
}
if (should_delete) {
delete target_;
}
}
}
}
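A self-contained illustration of the fast path above: two adjacent 32-bit counters are read back as one 64-bit word, so "refcount == 1 && weakcount == 1" becomes a single load compared against 0x100000001ULL. The struct and the little-endian/layout assumptions here are illustrative; the real code gates this on platform and asserts the layout at runtime:
#include <atomic>
#include <cstdint>
#include <cstdio>
// Two adjacent 32-bit counters, aligned so they can also be read as one
// 64-bit word (mirroring the layout the real code asserts).
struct alignas(8) Counts {
  std::atomic<uint32_t> refcount_{1};
  std::atomic<uint32_t> weakcount_{1};  // one above the real weak count while refcount_ > 0
};
bool is_uniquely_owned(Counts* c) {
  static_assert(sizeof(Counts) == 8, "counters must be packed");
  static_assert(std::atomic<uint64_t>::is_always_lock_free, "need a lock-free 64-bit atomic");
  // Little-endian: low word = refcount_, high word = weakcount_, so both
  // being 1 reads back as 0x100000001. The reinterpret_cast mirrors the
  // platform-gated trick in the diff above.
  auto* both = reinterpret_cast<std::atomic<uint64_t>*>(c);
  return both->load(std::memory_order_acquire) == 0x100000001ULL;
}
int main() {
  Counts c;
  std::printf("unique: %d\n", is_uniquely_owned(&c));  // 1
  c.weakcount_.fetch_add(1);                           // simulate a weak reference
  std::printf("unique: %d\n", is_uniquely_owned(&c));  // 0
  return 0;
}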

View File

@ -73,6 +73,19 @@ void box_cox_zero_lambda(
}
}
template <typename T>
at::vec::Vectorized<T> box_cox_nonzero_lambda_impl(
at::vec::Vectorized<T> data,
at::vec::Vectorized<T> lambda1,
at::vec::Vectorized<T> lambda2,
at::vec::Vectorized<T> k_eps) {
auto sum = data + lambda2;
auto max = at::vec::max(sum, k_eps);
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
auto pow = max.pow(lambda1);
return at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
}
template <typename T>
void box_cox_nonzero_lambda(
int64_t D,
@ -88,21 +101,18 @@ void box_cox_nonzero_lambda(
auto k_eps_vec = Vec(k_eps);
for(; j + VLEN < D; j += VLEN) {
auto data = Vec::loadu(data_ptr + j);
auto lambda2 = Vec::loadu(lambda2_ptr + j);
auto sum = data + lambda2;
auto max = at::vec::max(sum, k_eps_vec);
auto lambda1 = Vec::loadu(lambda1_ptr + j);
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
auto pow = max.pow(lambda1);
auto res = at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
auto lambda2 = Vec::loadu(lambda2_ptr + j);
auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
res.store(out + j);
}
for ( ;j < D; ++j) {
auto sum = data_ptr[j] + lambda2_ptr[j];
auto max = std::max(sum, k_eps);
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1_ptr[j]);
auto pow = std::pow(max, lambda1_ptr[j]);
out[j] = pow * lambda_over_1 - lambda_over_1;
if (j < D) {
auto remaining = D - j;
auto data = Vec::loadu(data_ptr + j, remaining);
auto lambda1 = Vec::loadu(lambda1_ptr + j, remaining);
auto lambda2 = Vec::loadu(lambda2_ptr + j, remaining);
auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
res.store(out + j, remaining);
}
}
#else
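The cleanup change above replaces the scalar remainder loop with a count-limited vector load/store, so the same vectorized kernel also covers the tail. A plain C++ sketch of that pattern, with padded buffer copies standing in for at::vec's count-limited loadu/store; widths and the kernel body are illustrative:
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdio>
#include <vector>
constexpr std::size_t VLEN = 8;  // stand-in for the SIMD register width
// The "vector kernel": one fixed-width chunk of the elementwise computation.
void kernel_chunk(const float* in, float* out) {
  for (std::size_t k = 0; k < VLEN; ++k) {
    out[k] = in[k] * 2.0f + 1.0f;
  }
}
// Same shape as the diff: full-width chunks in the main loop, then a single
// partial chunk for the remainder instead of a separate scalar cleanup loop.
void apply(const float* in, float* out, std::size_t n) {
  std::size_t j = 0;
  for (; j + VLEN <= n; j += VLEN) {
    kernel_chunk(in + j, out + j);
  }
  if (j < n) {
    const std::size_t remaining = n - j;
    std::array<float, VLEN> buf_in{}, buf_out{};
    std::copy(in + j, in + n, buf_in.begin());                         // partial "loadu"
    kernel_chunk(buf_in.data(), buf_out.data());
    std::copy(buf_out.begin(), buf_out.begin() + remaining, out + j);  // partial "store"
  }
}
int main() {
  std::vector<float> in(13, 1.0f), out(13, 0.0f);
  apply(in.data(), out.data(), in.size());
  std::printf("%g %g\n", out.front(), out.back());  // 3 3
  return 0;
}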

View File

@ -45,88 +45,13 @@ if(NOT __AOTRITON_INCLUDED)
)
set(__AOTRITON_BASE_URL "https://github.com/ROCm/aotriton/releases/download/") # @lint-ignore
set(__AOTRITON_Z "gz")
# Set the default __AOTRITON_LIB path
set(__AOTRITON_LIB "${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so")
if(WIN32)
set(__AOTRITON_LIB "${__AOTRITON_INSTALL_DIR}/lib/aotriton_v2.lib")
endif()
function(aotriton_build_windows_dependencies dlfcn-win32_external xz_external dlfcn-win32_DIR liblzma_DIR)
# Windows-specific dependencies - build these first
if(NOT noimage)
message(FATAL_ERROR "noimage must be ON for Windows builds")
endif()
# Build dlfcn-win32
set(__DLFCN_WIN32_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/dlfcn-win32")
set(__DLFCN_WIN32_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/dlfcn-win32-install")
ExternalProject_Add(${dlfcn-win32_external}
GIT_REPOSITORY https://github.com/dlfcn-win32/dlfcn-win32.git
GIT_TAG v1.4.2
PREFIX ${__DLFCN_WIN32_PREFIX}
INSTALL_DIR ${__DLFCN_WIN32_INSTALL_DIR}
CMAKE_ARGS
-DCMAKE_INSTALL_PREFIX=${__DLFCN_WIN32_INSTALL_DIR}
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_C_COMPILER=cl
-DCMAKE_CXX_COMPILER=cl
-DBUILD_SHARED_LIBS=ON
-DBUILD_TESTS=OFF
BUILD_BYPRODUCTS
"${__DLFCN_WIN32_INSTALL_DIR}/lib/dl.lib"
"${__DLFCN_WIN32_INSTALL_DIR}/bin/dl.dll"
)
ExternalProject_Add_Step(${dlfcn-win32_external} copy_to_aotriton
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${__DLFCN_WIN32_INSTALL_DIR}/bin/dl.dll"
"${__AOTRITON_INSTALL_DIR}/lib/"
DEPENDEES install
)
set(${dlfcn-win32_DIR} "${__DLFCN_WIN32_INSTALL_DIR}/share/dlfcn-win32" CACHE PATH "Path to dlfcn-win32 CMake config" FORCE)
# Build xz/liblzma
set(__XZ_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/xz")
set(__XZ_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/xz-install")
ExternalProject_Add(${xz_external}
GIT_REPOSITORY https://github.com/tukaani-project/xz.git
GIT_TAG v5.8.1
PREFIX ${__XZ_PREFIX}
INSTALL_DIR ${__XZ_INSTALL_DIR}
CMAKE_ARGS
-DCMAKE_INSTALL_PREFIX=${__XZ_INSTALL_DIR}
-DCMAKE_BUILD_TYPE=Release
-DBUILD_SHARED_LIBS=ON
-DENABLE_NLS=OFF
-DXZ_TOOL_LZMAINFO=OFF
-DXZ_TOOL_XZ=OFF
-DXZ_TOOL_XZDEC=OFF
-DXZ_TOOL_LZMADEC=OFF
BUILD_BYPRODUCTS
"${__XZ_INSTALL_DIR}/lib/lzma.lib"
"${__XZ_INSTALL_DIR}/bin/liblzma.dll"
)
ExternalProject_Add_Step(${xz_external} copy_to_aotriton
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${__XZ_INSTALL_DIR}/bin/liblzma.dll"
"${__AOTRITON_INSTALL_DIR}/lib/"
DEPENDEES install
)
set(${liblzma_DIR} "${__XZ_INSTALL_DIR}/lib/cmake/liblzma" CACHE PATH "Path to xz/liblzma CMake config" FORCE)
endfunction()
function(aotriton_build_from_source noimage project)
if(noimage)
SET(RECURSIVE "OFF")
else()
SET(RECURSIVE "ON")
endif()
if(WIN32)
message(STATUS "Building AOTriton Windows dependencies")
aotriton_build_windows_dependencies(dlfcn-win32_external xz_external dlfcn-win32_DIR liblzma_DIR)
endif()
message(STATUS "PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}")
ExternalProject_Add(${project}
GIT_REPOSITORY https://github.com/ROCm/aotriton.git
GIT_SUBMODULES_RECURSE ${RECURSIVE}
@ -140,19 +65,12 @@ if(NOT __AOTRITON_INCLUDED)
-DAOTRITON_GPU_BUILD_TIMEOUT=0
-DAOTRITON_NO_PYTHON=ON
-DAOTRITON_NOIMAGE_MODE=${noimage}
-DHIP_PLATFORM=amd
$<$<BOOL:${WIN32}>:-Ddlfcn-win32_DIR=${dlfcn-win32_DIR}>
$<$<BOOL:${WIN32}>:-Dliblzma_DIR=${liblzma_DIR}>
BUILD_BYPRODUCTS
"${__AOTRITON_LIB}"
BUILD_BYPRODUCTS "${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so"
USES_TERMINAL_DOWNLOAD TRUE
USES_TERMINAL_CONFIGURE TRUE
USES_TERMINAL_BUILD TRUE
USES_TERMINAL_INSTALL TRUE
)
if(WIN32)
add_dependencies(${project} dlfcn-win32_external xz_external)
endif()
endfunction()
set(__AOTRITON_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR})
@ -177,7 +95,7 @@ if(NOT __AOTRITON_INCLUDED)
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory
"${CMAKE_CURRENT_BINARY_DIR}/aotriton_runtime"
"${__AOTRITON_INSTALL_DIR}"
BUILD_BYPRODUCTS "${__AOTRITON_LIB}"
BUILD_BYPRODUCTS "${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so"
)
message(STATUS "Using AOTriton Runtime from pre-compiled binary ${__AOTRITON_URL}.\
Set env variables AOTRITON_INSTALL_FROM_SOURCE=1 to build from source.")
@ -193,35 +111,14 @@ if(NOT __AOTRITON_INCLUDED)
string(CONCAT __AOTRITON_URL
"${__AOTRITON_BASE_URL}"
"${__AOTRITON_VER}/${__AOTRITON_FILE}")
# Set up directories
set(__AOTRITON_DOWNLOAD_DIR ${CMAKE_CURRENT_BINARY_DIR}/aotriton_download-${image})
set(__AOTRITON_EXTRACT_DIR ${CMAKE_CURRENT_BINARY_DIR}/aotriton_image-${image})
set(__AOTRITON_INSTALL_SOURCE_DIR ${__AOTRITON_EXTRACT_DIR})
set(__DOWNLOAD_NO_EXTRACT "")
set(__BUILD_COMMANDS "")
# On Windows, we need custom tar extraction with UTF-8 support
if(WIN32)
set(__DOWNLOAD_NO_EXTRACT "DOWNLOAD_NO_EXTRACT;TRUE")
set(__BUILD_COMMANDS
COMMAND ${CMAKE_COMMAND} -E make_directory "${__AOTRITON_EXTRACT_DIR}"
COMMAND tar --options hdrcharset=UTF-8 -xf "${__AOTRITON_DOWNLOAD_DIR}/${__AOTRITON_FILE}" -C "${__AOTRITON_EXTRACT_DIR}"
)
set(__AOTRITON_INSTALL_SOURCE_DIR ${__AOTRITON_EXTRACT_DIR}/aotriton)
endif()
ExternalProject_Add(${project}
URL "${__AOTRITON_URL}"
URL_HASH SHA256=${__AOTRITON_SHA256}
DOWNLOAD_DIR ${__AOTRITON_DOWNLOAD_DIR}
${__DOWNLOAD_NO_EXTRACT}
SOURCE_DIR ${__AOTRITON_EXTRACT_DIR}
SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/aotriton_image-${image}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
${__BUILD_COMMANDS}
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory
"${__AOTRITON_INSTALL_SOURCE_DIR}"
"${CMAKE_CURRENT_BINARY_DIR}/aotriton_image-${image}"
"${__AOTRITON_INSTALL_DIR}"
BUILD_BYPRODUCTS
"${__AOTRITON_INSTALL_DIR}/lib/aotriton.images/${image}/__signature__"
@ -267,7 +164,7 @@ if(NOT __AOTRITON_INCLUDED)
endforeach()
endforeach()
endif()
target_link_libraries(__caffe2_aotriton INTERFACE ${__AOTRITON_LIB})
target_link_libraries(__caffe2_aotriton INTERFACE ${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so)
target_include_directories(__caffe2_aotriton INTERFACE ${__AOTRITON_INSTALL_DIR}/include)
set(AOTRITON_FOUND TRUE)
endif() # __AOTRITON_INCLUDED

demo_no_spew.py Normal file
View File

@ -0,0 +1,38 @@
#!/usr/bin/env python3
"""
Demo: No log spew with distributed logging patch.
Run with: torchrun --nproc_per_node=2 demo_no_spew.py
"""
import os
import warnings
import logging
import torch
import torch.distributed as dist
# Initialize distributed
if 'RANK' in os.environ:
dist.init_process_group('gloo')
rank = dist.get_rank()
world_size = dist.get_world_size()
else:
rank = 0
world_size = 1
print(f"=== Process {rank}/{world_size} ===")
# Test warnings
warnings.warn("This warning should only appear ONCE (from rank 0)")
# Test logging
logging.warning("This logging should only appear ONCE (from rank 0)")
# Test the original cpp_extension case
logging.getLogger('torch.utils.cpp_extension').setLevel(logging.DEBUG)
from torch.utils.cpp_extension import _get_cuda_arch_flags
_get_cuda_arch_flags()
print(f"Process {rank} completed")
if 'RANK' in os.environ:
dist.destroy_process_group()

View File

@ -1187,7 +1187,8 @@ int64_t _Tensor_ndim(mpy::handle h) {
mpy::handle handle_from_tensor(Arena& A, TensorRef t) {
// fast case: tensor is live in python
std::optional<PyObject*> mb_obj =
t->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
t->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/false);
if (mb_obj.has_value() &&
!t->unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj()) {
return *mb_obj;

View File

@ -8,3 +8,4 @@ pyyaml
requests
six # dependency chain: NNPACK -> PeachPy -> six
typing-extensions>=4.10.0
pip # not technically needed, but this makes setup.py invocation work

View File

@ -4,6 +4,7 @@
#include <fmt/format.h>
#include <c10/util/Enumerate.h>
#include <torch/custom_class.h>
#include <torch/nativert/detail/ITree.h>
namespace torch::nativert::detail {
@ -1147,4 +1148,200 @@ TEST(ITreeTest, ToAtenType) {
c10::TypeKind::AnyType);
}
TEST(ITreeTest, KeyedJaggedTensorUnflatten) {
// Test KeyedJaggedTensor pytree node registration
// KeyedJaggedTensor has 6 tensor fields: _values, _weights, _lengths,
// _offsets, _stride_per_key_per_rank, _inverse_indices
auto jsonSpec = R"(
[
1,
{
"type": "torchrec.sparse.jagged_tensor.KeyedJaggedTensor",
"context": "[\"key1\", \"key2\"]",
"children_spec": [
{
"type": null,
"context": null,
"children_spec": []
},
{
"type": null,
"context": null,
"children_spec": []
},
{
"type": null,
"context": null,
"children_spec": []
},
{
"type": null,
"context": null,
"children_spec": []
},
{
"type": null,
"context": null,
"children_spec": []
},
{
"type": null,
"context": null,
"children_spec": []
}
]
}
]
)";
auto [graph, valuePtrs] = makeValues(6);
const auto spec = itreeSpecLoads(jsonSpec, valuePtrs);
// Create mock tensor values for the 6 fields
std::vector<c10::IValue> flats = {
c10::IValue(1), // _values
c10::IValue(2), // _weights
c10::IValue(3), // _lengths
c10::IValue(4), // _offsets
c10::IValue(5), // _stride_per_key_per_rank
c10::IValue(6), // _inverse_indices tensor part
};
// Test unflatten - this will create a generic tuple since we don't have
// the actual KeyedJaggedTensor constructor available in tests
auto itree = itreeUnflatten(flats, spec);
EXPECT_TRUE(itree.isTuple());
EXPECT_EQ(itree.toTupleRef().elements().size(), 6);
// Verify the values match what we put in
for (size_t i = 0; i < 6; i++) {
EXPECT_EQ(itree.toTupleRef().elements()[i], flats[i]);
}
// Verify spec has correct number of children and structure
EXPECT_EQ(spec.children().size(), 6);
EXPECT_EQ(spec.numIValues(), 6);
EXPECT_FALSE(spec.isIValue());
EXPECT_EQ(
spec.uniformName(), "torchrec.sparse.jagged_tensor.KeyedJaggedTensor");
}
TEST(ITreeTest, KeyedJaggedTensorNodeRegistration) {
// Test that KeyedJaggedTensor pytree node is properly registered
// Verify the KeyedJaggedTensor node is in the registry by attempting
// to load a spec that references it
auto jsonSpec = R"(
[
1,
{
"type": "torchrec.sparse.jagged_tensor.KeyedJaggedTensor",
"context": "[\"key1\", \"key2\"]",
"children_spec": [
{
"type": null,
"context": null,
"children_spec": []
},
{
"type": null,
"context": null,
"children_spec": []
},
{
"type": null,
"context": null,
"children_spec": []
},
{
"type": null,
"context": null,
"children_spec": []
},
{
"type": null,
"context": null,
"children_spec": []
},
{
"type": null,
"context": null,
"children_spec": []
}
]
}
]
)";
auto [graph, valuePtrs] = makeValues(6);
// This should not throw - if KeyedJaggedTensor wasn't registered,
// we'd get an exception about "Unknown pytree node type"
EXPECT_NO_THROW({
const auto spec = itreeSpecLoads(jsonSpec, valuePtrs);
// Verify the spec loaded correctly
EXPECT_FALSE(spec.isIValue());
EXPECT_EQ(
spec.uniformName(), "torchrec.sparse.jagged_tensor.KeyedJaggedTensor");
EXPECT_EQ(spec.children().size(), 6);
EXPECT_EQ(spec.numIValues(), 6);
// Verify context is parsed correctly
EXPECT_FALSE(spec.context().is_null());
EXPECT_TRUE(spec.context().is_array());
EXPECT_EQ(spec.context().size(), 2);
});
}
TEST(ITreeTest, JaggedTensorNodeRegistration) {
// Test that JaggedTensor pytree node is also properly registered
auto jsonSpec = R"(
[
1,
{
"type": "torchrec.sparse.jagged_tensor.JaggedTensor",
"context": "null",
"children_spec": [
{
"type": null,
"context": null,
"children_spec": []
},
{
"type": null,
"context": null,
"children_spec": []
},
{
"type": null,
"context": null,
"children_spec": []
},
{
"type": null,
"context": null,
"children_spec": []
}
]
}
]
)";
auto [graph, valuePtrs] = makeValues(4);
// This should not throw - if JaggedTensor wasn't registered,
// we'd get an exception about "Unknown pytree node type"
EXPECT_NO_THROW({
const auto spec = itreeSpecLoads(jsonSpec, valuePtrs);
// Verify the spec loaded correctly
EXPECT_FALSE(spec.isIValue());
EXPECT_EQ(spec.uniformName(), "torchrec.sparse.jagged_tensor.JaggedTensor");
EXPECT_EQ(spec.children().size(), 4);
EXPECT_EQ(spec.numIValues(), 4);
});
}
} // namespace torch::nativert::detail

View File

@ -10,10 +10,13 @@ import torch
import torch.nn as nn
from torch.distributed._composable import checkpoint
from torch.testing._internal.common_cuda import TEST_CUDA
from torch.testing._internal.common_utils import run_tests, TestCase
from torch.testing._internal.common_utils import run_tests, TEST_XPU, TestCase
from torch.utils.checkpoint import CheckpointError
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
class MemoryDelta(ContextDecorator):
def __init__(self, device: torch.device):
self.device: torch.device = device
@ -22,16 +25,16 @@ class MemoryDelta(ContextDecorator):
def __enter__(self):
self.active_memory_enter = (
torch.cuda.memory_stats()["active_bytes.all.current"]
if self.device.type == "cuda"
torch.accelerator.memory_stats()["active_bytes.all.current"]
if self.device.type == "cuda" or self.device.type == "xpu"
else 0
)
return self
def __exit__(self, *exc):
self.active_memory_exit = (
torch.cuda.memory_stats()["active_bytes.all.current"]
if self.device.type == "cuda"
torch.accelerator.memory_stats()["active_bytes.all.current"]
if self.device.type == "cuda" or self.device.type == "xpu"
else 0
)
@ -126,7 +129,7 @@ class TestCheckpoint(TestCase):
loss2 = net2(x2).sum()
loss2.backward()
if x.is_cuda:
if x.is_cuda or x.is_xpu:
self.assertTrue(mem2.delta() < mem1.delta())
for p1, p2 in zip(net1.parameters(), net2.parameters()):
@ -137,10 +140,10 @@ class TestCheckpoint(TestCase):
net = ToyModel()
self._test_tensor_only(net, x)
@unittest.skipIf(not TEST_CUDA, "no cuda")
@unittest.skipIf(not TEST_CUDA and not TEST_XPU, "no cuda/xpu")
def test_tensor_only_gpu(self):
x = torch.randn(20, 100, device="cuda:0")
net = ToyModel().to("cuda:0")
x = torch.randn(20, 100, device=f"{device_type}:0")
net = ToyModel().to(f"{device_type}:0")
self._test_tensor_only(net, x)
def test_random_cpu(self):

View File

@ -47,6 +47,8 @@ from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
run_tests,
TEST_XPU,
xfailIf,
)
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
@ -58,6 +60,9 @@ from torch.testing._internal.distributed._tensor.common_dtensor import (
from torch.testing._internal.distributed.checkpoint_utils import with_temp_dir
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
class SimpleModel(nn.Module):
def __init__(self):
super().__init__()
@ -73,7 +78,7 @@ class SimpleModel(nn.Module):
return x
def get_input(self):
return torch.rand(4, 5, device="cuda")
return torch.rand(4, 5, device=device_type)
class SimpleModelUneven(nn.Module):
@ -94,7 +99,7 @@ class SimpleModelUneven(nn.Module):
return x
def get_input(self):
return torch.rand(4, 5, device="cuda")
return torch.rand(4, 5, device=device_type)
class TestFullyShard2DTraining(FSDPTest):
@ -105,13 +110,15 @@ class TestFullyShard2DTraining(FSDPTest):
@property
def world_size(self) -> int:
return min(4, torch.cuda.device_count())
return min(4, torch.accelerator.device_count())
def init_global_mesh(self) -> DeviceMesh:
# Prefer to test with >=4 GPUs, but for 2 GPUs, use 2-way TP
dp_size = 2 if self.world_size > 2 else 1
return init_device_mesh(
"cuda", (dp_size, self.world_size // dp_size), mesh_dim_names=("dp", "tp")
device_type,
(dp_size, self.world_size // dp_size),
mesh_dim_names=("dp", "tp"),
)
@skip_if_lt_x_gpu(2)
@ -138,7 +145,7 @@ class TestFullyShard2DTraining(FSDPTest):
torch.manual_seed(42)
model = MLPStack(mlp_dim)
ref_model = copy.deepcopy(model).cuda()
ref_model = copy.deepcopy(model).to(device_type)
replicate(ref_model, device_ids=[self.rank], process_group=dp_pg)
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2, foreach=False)
model.parallelize(
@ -150,9 +157,8 @@ class TestFullyShard2DTraining(FSDPTest):
optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=False)
torch.manual_seed(42 + dp_pg.rank() + 1)
device = torch.device("cuda")
for iter_idx in range(10):
inp = torch.randn((8, mlp_dim), device=device)
inp = torch.randn((8, mlp_dim), device=device_type)
losses: list[torch.Tensor] = []
for _model, _optim in ((ref_model, ref_optim), (model, optim)):
_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
@ -162,6 +168,7 @@ class TestFullyShard2DTraining(FSDPTest):
self.assertEqual(losses[0], losses[1])
@skip_if_lt_x_gpu(2)
@xfailIf(TEST_XPU) # https://github.com/intel/torch-xpu-ops/issues/1881
def test_train_parity_2d_transformer(self):
self.run_subtests(
{"use_shard_placement_fn": [False, True]},
@ -172,12 +179,12 @@ class TestFullyShard2DTraining(FSDPTest):
torch.manual_seed(42)
model_args = ModelArgs(n_layers=3, dropout_p=0.0)
model = Transformer(model_args)
ref_model = copy.deepcopy(model).cuda()
ref_model = copy.deepcopy(model).to(device_type)
ref_optim = torch.optim.AdamW(ref_model.parameters(), lr=1e-2)
dp_size, tp_size = self.world_size // 2, 2
global_mesh = init_device_mesh(
"cuda", (dp_size, tp_size), mesh_dim_names=("dp", "tp")
device_type, (dp_size, tp_size), mesh_dim_names=("dp", "tp")
)
model = Transformer.parallelize(model, global_mesh["tp"], use_seq_parallel=True)
@ -205,7 +212,7 @@ class TestFullyShard2DTraining(FSDPTest):
self.assertEqual(full_param, ref_param)
torch.manual_seed(42 + global_mesh.get_local_rank("dp"))
inp = torch.randint(0, model_args.vocab_size, (2, 16), device="cuda")
inp = torch.randint(0, model_args.vocab_size, (2, 16), device=device_type)
for iter_idx in range(5):
ref_loss = ref_model(inp).sum()
loss = model(inp).sum()
@ -242,15 +249,16 @@ class TestFullyShard2DTraining(FSDPTest):
self.assertEqual(full_param, ref_param)
@skip_if_lt_x_gpu(2)
@xfailIf(TEST_XPU) # https://github.com/pytorch/pytorch/issues/156782
def test_tp_with_fsdp_offloading(self):
global_mesh = init_device_mesh(
"cuda", (1, self.world_size), mesh_dim_names=("dp", "tp")
device_type, (1, self.world_size), mesh_dim_names=("dp", "tp")
)
dp_mesh, tp_mesh = global_mesh["dp"], global_mesh["tp"]
torch.manual_seed(42)
mlp_dim = 16
model = MLPStack(mlp_dim)
ref_model = copy.deepcopy(model).cuda()
ref_model = copy.deepcopy(model).to(device_type)
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2, foreach=False)
# Parallelize with N-way TP and 1-way FSDP
model.parallelize(
@ -268,7 +276,7 @@ class TestFullyShard2DTraining(FSDPTest):
# NOTE: We still see the FSDP all-gather/reduce-scatter c10d ops
# called, but they will just be no-ops without issuing any kernels.
# We prefer to keep the no-op check at the c10d level, not in FSDP.
inp = torch.randn((4, mlp_dim), device="cuda") # same on all ranks
inp = torch.randn((4, mlp_dim), device=device_type) # same on all ranks
for _ in range(10):
ref_optim.zero_grad()
optim.zero_grad()
@ -297,6 +305,7 @@ class TestFullyShard2DTraining(FSDPTest):
ref_optim.step()
@skip_if_lt_x_gpu(2)
@xfailIf(TEST_XPU) # https://github.com/intel/torch-xpu-ops/issues/1881
@with_temp_dir
def test_train_parity_2d_transformer_checkpoint_resume(self):
"""
@ -352,7 +361,7 @@ class TestFullyShard2DTraining(FSDPTest):
)
torch.manual_seed(42 + global_mesh["dp"].get_local_rank() + 1)
inp = torch.randint(0, model_args.vocab_size, (3, 16), device="cuda")
inp = torch.randint(0, model_args.vocab_size, (3, 16), device=device_type)
loss_no_cp1 = train_step(model_no_cp, optim_no_cp, inp)
loss_no_cp2 = train_step(model_no_cp, optim_no_cp, inp)
@ -410,14 +419,14 @@ class TestFullyShard2DStateDict(DTensorTestBase):
@property
def backend(self):
# need to specify gloo backend for testing cpu offload
return "cpu:gloo,cuda:nccl"
return "cpu:gloo,xpu:xccl" if TEST_XPU else "cpu:gloo,cuda:nccl"
@with_comms
@skip_if_lt_x_gpu(4)
def test_fully_shard_tp_2d_set_full_state_dict(self):
dummy_model = SimpleModel().cuda()
dummy_model = SimpleModel().to(device_type)
mesh_2d = init_device_mesh(
"cuda",
device_type,
(2, self.world_size // 2),
mesh_dim_names=("dp", "tp"),
)
@ -561,7 +570,7 @@ class TestNew2dParallelTraining(DTensorTestBase):
self.device_type, (2, self.world_size // 2), mesh_dim_names=("dp", "tp")
)
model = FSDP(
SimpleModel().cuda(),
SimpleModel().to(device_type),
device_mesh=mesh_2d["dp"],
)
fsdp_state = _get_module_fsdp_state(model)
@ -573,7 +582,7 @@ class TestNew2dParallelTraining(DTensorTestBase):
recompute_activation=False,
) -> None:
torch.manual_seed(0)
model = SimpleModel().cuda(self.rank)
model = SimpleModel().to(f"{device_type}:{self.rank}")
model = FSDP(model, use_orig_params=use_orig_params)
optim = torch.optim.Adam(model.parameters(), lr=0.01)
@ -587,7 +596,9 @@ class TestNew2dParallelTraining(DTensorTestBase):
"net1": ColwiseParallel(),
"net2": RowwiseParallel(),
}
model_2d = parallelize_module(SimpleModel().cuda(), tp_mesh, parallelize_plan)
model_2d = parallelize_module(
SimpleModel().to(device_type), tp_mesh, parallelize_plan
)
model_2d = FSDP(
model_2d,
device_mesh=dp_mesh,
@ -615,7 +626,7 @@ class TestNew2dParallelTraining(DTensorTestBase):
# Ensure all input across TP ranks are same.
# TODO: add a get_group_rank() to DeviceMesh.
torch.manual_seed(i + dist.get_rank(dp_mesh.get_group(mesh_dim=0)))
input = torch.rand(4, 5).cuda(self.rank)
input = torch.rand(4, 5).to(f"{device_type}:{self.rank}")
output = model(input)
output_2d = model_2d(input)
self.assertEqual(output, output_2d)
@ -652,7 +663,7 @@ class TestNew2dParallelStateDict(DTensorTestBase):
@property
def backend(self):
# need to specify gloo backend for testing cpu offload
return "cpu:gloo,cuda:nccl"
return "cpu:gloo,xpu:xccl" if TEST_XPU else "cpu:gloo,cuda:nccl"
@with_comms
@skip_if_lt_x_gpu(4)
@ -669,7 +680,7 @@ class TestNew2dParallelStateDict(DTensorTestBase):
"net3": ColwiseParallel(),
}
model_2d = parallelize_module(
SimpleModel().cuda(),
SimpleModel().to(device_type),
mesh_2d["tp"],
parallelize_plan=parallelize_plan,
)
@ -679,8 +690,10 @@ class TestNew2dParallelStateDict(DTensorTestBase):
isinstance(model_2d_fsdp_state._fsdp_extension, DTensorExtensions)
)
mesh_1d = init_device_mesh("cuda", (self.world_size,))
model_1d = FSDP(SimpleModel().cuda(), device_mesh=mesh_1d, use_orig_params=True)
mesh_1d = init_device_mesh(device_type, (self.world_size,))
model_1d = FSDP(
SimpleModel().to(device_type), device_mesh=mesh_1d, use_orig_params=True
)
model_1d_fsdp_state = _get_module_fsdp_state(model_1d)
self.assertEqual(model_1d_fsdp_state._fsdp_extension, None)
@ -692,7 +705,7 @@ class TestNew2dParallelStateDict(DTensorTestBase):
# Create a model without wrapper
torch.manual_seed(0)
no_wrap_model = simple_model().cuda(self.rank)
no_wrap_model = simple_model().to(f"{device_type}:{self.rank}")
no_wrap_state_dict = no_wrap_model.state_dict()
# Create a model and sharded it with 2D FSDP + TP
@ -706,7 +719,9 @@ class TestNew2dParallelStateDict(DTensorTestBase):
"net1": ColwiseParallel(),
"net2": RowwiseParallel(),
}
model_2d = parallelize_module(simple_model().cuda(), tp_mesh, parallelize_plan)
model_2d = parallelize_module(
simple_model().to(device_type), tp_mesh, parallelize_plan
)
model_2d = FSDP(model_2d, device_mesh=dp_mesh, use_orig_params=True)
FSDP.set_state_dict_type(
@ -754,7 +769,9 @@ class TestNew2dParallelStateDict(DTensorTestBase):
"net1": ColwiseParallel(),
"net2": RowwiseParallel(),
}
model_2d = parallelize_module(simple_model().cuda(), tp_mesh, parallelize_plan)
model_2d = parallelize_module(
simple_model().to(device_type), tp_mesh, parallelize_plan
)
model_2d = FSDP(model_2d, device_mesh=dp_mesh, use_orig_params=True)
optim_2d = torch.optim.Adam(model_2d.parameters(), lr=0.01)
@ -768,7 +785,7 @@ class TestNew2dParallelStateDict(DTensorTestBase):
ref_state_dict = deepcopy(model_2d.state_dict())
# Update the parameters so model.state_dict() will be different from ref_dtensor_sd.
model_2d(model_2d.get_input().cuda(self.rank)).sum().backward()
model_2d(model_2d.get_input().to(f"{device_type}:{self.rank}")).sum().backward()
optim_2d.step()
# Load ref_state_dict back.
@ -799,9 +816,11 @@ class TestNew2dParallelStateDict(DTensorTestBase):
# Create a model without wrapper
torch.manual_seed(0)
no_wrap_model = simple_model().cuda(self.rank)
no_wrap_model = simple_model().to(f"{device_type}:{self.rank}")
no_wrap_optim = torch.optim.Adam(no_wrap_model.parameters(), lr=0.01)
no_wrap_model(no_wrap_model.get_input().cuda(self.rank)).sum().backward()
no_wrap_model(
no_wrap_model.get_input().to(f"{device_type}:{self.rank}")
).sum().backward()
no_wrap_optim.step()
no_wrap_osd = get_optimizer_state_dict(no_wrap_model, optimizers=no_wrap_optim)
@ -815,7 +834,7 @@ class TestNew2dParallelStateDict(DTensorTestBase):
"net2": RowwiseParallel(),
}
model_2d = parallelize_module(
simple_model().cuda(), mesh_2d["tp"], parallelize_plan
simple_model().to(device_type), mesh_2d["tp"], parallelize_plan
)
model_2d = FSDP(model_2d, device_mesh=mesh_2d["dp"], use_orig_params=True)
FSDP.set_state_dict_type(
@ -823,7 +842,7 @@ class TestNew2dParallelStateDict(DTensorTestBase):
StateDictType.SHARDED_STATE_DICT,
)
optim_2d = torch.optim.Adam(model_2d.parameters(), lr=0.01)
model_2d(model_2d.get_input().cuda(self.rank)).sum().backward()
model_2d(model_2d.get_input().to(f"{device_type}:{self.rank}")).sum().backward()
optim_2d.step()
optim_2d_osd = get_optimizer_state_dict(model_2d, optimizers=optim_2d)
ref_optim_2d_osd = deepcopy(optim_2d_osd)
@ -842,7 +861,7 @@ class TestNew2dParallelStateDict(DTensorTestBase):
# compare with no_wrap state.
if isinstance(dist_state, DTensor):
dist_state = (
dist_state.cuda()
dist_state.to(device_type)
.redistribute(placements=(Replicate(), Replicate()))
.to_local()
)
@ -850,7 +869,7 @@ class TestNew2dParallelStateDict(DTensorTestBase):
self.assertTrue(torch.allclose(state, dist_state))
# Update the parameters 2d optim states will be different from ref_optim_state_dict.
model_2d(model_2d.get_input().cuda(self.rank)).sum().backward()
model_2d(model_2d.get_input().to(f"{device_type}:{self.rank}")).sum().backward()
optim_2d.step()
set_optimizer_state_dict(
@ -892,8 +911,8 @@ class TestNew2dParallelStateDict(DTensorTestBase):
5) dcp.load the state dict from storage
6) load the state dict into the 2D model
"""
dummy_model = SimpleModel().cuda()
mesh_1d = init_device_mesh("cuda", (self.world_size,))
dummy_model = SimpleModel().to(device_type)
mesh_1d = init_device_mesh(device_type, (self.world_size,))
model = FSDP(dummy_model, device_mesh=mesh_1d)
optim = torch.optim.Adam(model.parameters(), lr=0.01)
model(model.get_input()).sum().backward()
@ -911,9 +930,9 @@ class TestNew2dParallelStateDict(DTensorTestBase):
dcp.save(state_dict, checkpoint_id=self.temp_dir)
# initialize 2d model
dummy_model = SimpleModel().cuda()
dummy_model = SimpleModel().to(device_type)
mesh_2d = init_device_mesh(
"cuda",
device_type,
(2, self.world_size // 2),
mesh_dim_names=("dp", "tp"),
)

View File
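The FSDP 2D hunks above all apply one mechanical change: hard-coded "cuda" strings and .cuda() calls become a device_type string derived from the active accelerator. A minimal sketch of that pattern, assuming a PyTorch build that ships torch.accelerator; the layer sizes are placeholders, and the commented-out mesh call still needs an initialized process group.

import torch
from torch.distributed.device_mesh import init_device_mesh

# Resolve the accelerator once at module scope; fall back to CPU when none is present.
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"

model = torch.nn.Linear(8, 8).to(device_type)   # instead of SimpleModel().cuda()
inp = torch.randn(2, 8, device=device_type)     # instead of device="cuda"

# Mesh construction only swaps the device string; it is commented out here because
# it requires an initialized process group:
# mesh_2d = init_device_mesh(device_type, (2, 2), mesh_dim_names=("dp", "tp"))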

@ -30,7 +30,7 @@ from torch.distributed.tensor.parallel import (
from torch.testing._internal.common_cuda import TEST_MULTIGPU
from torch.testing._internal.common_distributed import (
MultiProcessTestCase,
requires_nccl,
requires_accelerator_dist_backend,
skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import (
@ -38,6 +38,7 @@ from torch.testing._internal.common_utils import (
parametrize,
run_tests,
skip_but_pass_in_sandcastle_if,
TEST_XPU,
)
from torch.testing._internal.distributed.checkpoint_utils import with_temp_dir
@ -46,6 +47,10 @@ if TYPE_CHECKING:
from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
backend = torch.distributed.get_default_backend_for_device(device_type)
# MLP Layer
class MLPModule(torch.nn.Module):
def __init__(self, d_hid: int):
@ -79,7 +84,7 @@ class ComposabilityTest(MultiProcessTestCase):
@classmethod
def backend_str(cls) -> str:
# Testing with NCCL backend
return "nccl"
return backend
def setUp(self):
super().setUp()
@ -100,9 +105,11 @@ class ComposabilityTest(MultiProcessTestCase):
def device(self):
return self.rank
@requires_nccl()
@requires_accelerator_dist_backend(["nccl", "xccl"])
@skip_if_lt_x_gpu(4)
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "Test requires 4+ GPUs")
@skip_but_pass_in_sandcastle_if(
not TEST_MULTIGPU and not TEST_XPU, "Test requires 4+ GPUs"
)
def test_pp_and_dcp(self):
"""
Test that pipeline parallelism and distributed checkpointing can be used together and
@ -143,11 +150,11 @@ class ComposabilityTest(MultiProcessTestCase):
x = layer(x)
return x
device = torch.device("cuda", self.device)
torch.cuda.set_device(self.device)
device = torch.device(device_type, self.device)
torch.accelerator.set_device_index(self.device)
store = torch.distributed.FileStore(self.file_name, self.world_size)
torch.distributed.init_process_group(
backend="nccl",
backend=backend,
store=store,
rank=self.rank,
world_size=self.world_size,
@ -192,9 +199,11 @@ class ComposabilityTest(MultiProcessTestCase):
_dcp_test(self)
@requires_nccl()
@requires_accelerator_dist_backend(["nccl", "xccl"])
@skip_if_lt_x_gpu(8)
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "Test requires 8+ GPUs")
@skip_but_pass_in_sandcastle_if(
not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs"
)
@parametrize(
"ScheduleClass",
[
@ -213,11 +222,11 @@ class ComposabilityTest(MultiProcessTestCase):
],
)
def test_3d_with_tp_dp_pp(self, ScheduleClass, MixedPrecisionParam):
_device_raii = torch.device("cuda", self.device)
torch.cuda.set_device(self.device)
_device_raii = torch.device(device_type, self.device)
torch.accelerator.set_device_index(self.device)
store = torch.distributed.FileStore(self.file_name, self.world_size)
torch.distributed.init_process_group(
backend="nccl",
backend=backend,
store=store,
rank=self.rank,
world_size=self.world_size,
@ -228,7 +237,7 @@ class ComposabilityTest(MultiProcessTestCase):
num_microbatches = 8
dp_size = self.world_size // (tp_size * pp_size)
device_mesh = init_device_mesh(
"cuda",
device_type,
mesh_shape=(dp_size, pp_size, tp_size),
mesh_dim_names=("dp", "pp", "tp"),
)

View File
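The composability file above also stops hard-coding backend="nccl" and instead derives the backend from the device. A rough sketch of that selection, assuming torch.distributed.get_default_backend_for_device is available (it is the helper the hunk introduces); file_name, rank, and world_size come from the test harness and are left commented out here.

import torch
import torch.distributed as dist

device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
backend = dist.get_default_backend_for_device(device_type)  # e.g. nccl for cuda, xccl for xpu, gloo for cpu

# The group is then initialized exactly as before, only with the derived backend:
# store = dist.FileStore(file_name, world_size)
# dist.init_process_group(backend=backend, store=store, rank=rank, world_size=world_size)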

@ -1,6 +1,7 @@
# Owner(s): ["oncall: distributed"]
import os
import unittest
from copy import deepcopy
import torch
@ -14,7 +15,11 @@ from torch.testing._internal.common_distributed import (
MultiProcessTestCase,
skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.common_utils import run_tests, TEST_XPU
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
device_module = torch.get_device_module(device_type)
class Net(nn.Module):
@ -154,6 +159,7 @@ class ReplicateTest(MultiProcessTestCase):
self._compare_module(model, replicate_model)
@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_XPU, "XPU does not support gloo backend")
def test_replicate_move_args_kwargs_to_device(self):
class MyNet(nn.Module):
def __init__(self) -> None:
@ -166,24 +172,25 @@ class ReplicateTest(MultiProcessTestCase):
return self.a(inp)
self._init_pg()
torch.cuda.set_device(self.rank)
model = MyNet().cuda()
replicate(model, device_id=torch.cuda.current_device())
torch.accelerator.set_device_index(self.rank)
model = MyNet().to(device_type)
replicate(model, device_id=torch.accelerator.current_device_index())
# CPU input ensures replicate can move arg and kwargs to device.
a, b = torch.randn(2, 2), torch.randn(2, 2)
model(a, kwarg=b).sum().backward()
@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_XPU, "XPU does not support gloo backend")
def test_replicate_ignore_module(self):
self._init_pg()
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
# Seed ensures diff input and thus different local grads across ranks.
torch.manual_seed(self.rank)
torch.cuda.manual_seed(self.rank)
model = Net().cuda()
device_module.manual_seed(self.rank)
model = Net().to(device_type)
replicate(model, ignored_modules=[model.fc1])
# CPU input ensures that replicate can move input to GPU as DDP does.
inp = torch.randn(5, 2, device="cuda") * (self.rank + 1)
inp = torch.randn(5, 2, device=device_type) * (self.rank + 1)
out = model(inp) * 10
out.sum().backward()
# FC1 grads should not be synchronized, FC2 and 3 should be.
@ -221,10 +228,11 @@ class ReplicateTest(MultiProcessTestCase):
self._compare_module(model, replicate_model)
@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_XPU, "XPU does not support gloo backend")
def test_replicate_device_id(self):
self._init_pg()
model = Net()
model_cuda = deepcopy(model).cuda()
model_cuda = deepcopy(model).to(device_type)
model_cuda2 = deepcopy(model_cuda)
replicate(model, device_id=torch.device("cpu"))
# DDP instance is attached in first pre forward
@ -233,13 +241,15 @@ class ReplicateTest(MultiProcessTestCase):
# Should be None for CPU training
self.assertEqual(None, replicate_ddp_weakref.device_ids)
replicate(model_cuda, device_id=torch.device(torch.cuda.current_device()))
replicate(
model_cuda, device_id=torch.device(torch.accelerator.current_device_index())
)
# DDP instance is attached in first pre forward
model_cuda(torch.randn(2, 2))
replicate_ddp_weakref = replicate.state(model_cuda)._ddp_weakref()
self.assertEqual([0], replicate_ddp_weakref.device_ids)
# Pass in int as device_id
replicate(model_cuda2, device_id=int(torch.cuda.current_device()))
replicate(model_cuda2, device_id=int(torch.accelerator.current_device_index()))
# DDP instance is attached in first pre forward
model_cuda2(torch.randn(2, 2))
replicate_ddp_weakref = replicate.state(model_cuda2)._ddp_weakref()
@ -256,6 +266,7 @@ class ReplicateTest(MultiProcessTestCase):
class ReplicateFullyShardInit(ReplicateTest):
@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_XPU, "XPU does not support gloo backend")
def test_replicate_fully_shard_init(self):
class ToyModel(nn.Module):
def __init__(self, dim: int):
@ -273,14 +284,14 @@ class ReplicateFullyShardInit(ReplicateTest):
return y
self._init_pg()
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
dim = 3
bz = 2
model = ToyModel(dim).cuda()
model = ToyModel(dim).to(device_type)
for linear in model.linears:
fully_shard(linear)
fully_shard(model.linears)
replicate(model, device_id=torch.cuda.current_device())
replicate(model, device_id=torch.accelerator.current_device_index())
for linear in model.linears:
self.assertTrue(isinstance(linear.weight, DTensor))
inp = torch.rand(bz, dim)

View File
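The replicate tests above swap the torch.cuda device helpers for their torch.accelerator counterparts. A hedged sketch of the replacement, assuming the composable replicate API is importable from torch.distributed._composable, an accelerator is present, and a default process group is already initialized; rank is a placeholder for self.rank.

import torch
from torch.distributed._composable import replicate

rank = 0  # placeholder for self.rank
torch.accelerator.set_device_index(rank)                     # was torch.cuda.set_device(rank)
device_type = torch.accelerator.current_accelerator().type   # assumes an accelerator is present

model = torch.nn.Linear(2, 2).to(device_type)                # was .cuda()
replicate(model, device_id=torch.accelerator.current_device_index())
model(torch.randn(2, 2)).sum().backward()                    # DDP hooks attach on the first forward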

@ -98,6 +98,8 @@ class ReplicateTest(MultiProcessInductorTestCase):
self.create_pg(device)
torch._dynamo.config.optimize_ddp = "python_reducer"
torch.manual_seed(123)
if device_type == "xpu":
torch.use_deterministic_algorithms(True, warn_only=True)
model = Net(checkpoint=checkpoint).to(device)
input = torch.randn([1, DIM], device=device)

View File
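The XPU branch added above opts into deterministic algorithms in warn-only mode. A tiny sketch of what that toggle does; both query helpers are standard torch APIs.

import torch

# Request deterministic kernels, but only warn (instead of raising) when an op
# has no deterministic implementation.
torch.use_deterministic_algorithms(True, warn_only=True)
print(torch.are_deterministic_algorithms_enabled())             # True
print(torch.is_deterministic_algorithms_warn_only_enabled())    # True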

@ -6,7 +6,7 @@ import torch.distributed._functional_collectives as funcol
import torch.nn as nn
from torch.distributed.tensor import DeviceMesh, DTensor, Shard
from torch.distributed.tensor.debug import CommDebugMode
from torch.testing._internal.common_distributed import requires_accelerator_dist_backend
from torch.testing._internal.common_distributed import requires_nccl
from torch.testing._internal.common_utils import run_tests, TestCase
from torch.testing._internal.distributed._tensor.common_dtensor import MLPModule
from torch.testing._internal.distributed.fake_pg import FakeStore
@ -14,7 +14,6 @@ from torch.testing._internal.distributed.fake_pg import FakeStore
c10d_functional = torch.ops.c10d_functional
c10d_ops = torch.ops.c10d
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
class TestCommMode(TestCase):
@ -29,7 +28,7 @@ class TestCommMode(TestCase):
dist.init_process_group(
backend="fake", rank=1, world_size=self.world_size, store=store
)
self.device_type = device_type
self.device_type = "cuda" if torch.cuda.is_available() else "cpu"
self.world_pg = dist.distributed_c10d._get_default_group()
def checksAssert(self, comm_mode, key, expected_value, expected_total_value):
@ -112,12 +111,12 @@ class TestCommMode(TestCase):
self.assertEqual(comm_counts[c10d_functional.all_gather_into_tensor], 1)
self.assertEqual(comm_counts[c10d_functional.reduce_scatter_tensor], 0)
@requires_accelerator_dist_backend(["nccl", "xccl"])
@requires_nccl()
def test_comm_mode_with_c10d(self):
if not torch.accelerator.is_available():
if not torch.cuda.is_available():
return
inp = torch.rand(2, 8, 16).to(device_type)
inp = torch.rand(2, 8, 16).cuda()
all_gather_out = inp.new_empty(self.world_size * 2, 8, 16)
comm_mode = CommDebugMode()

View File
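CommDebugMode, which the file above exercises, counts functional collectives issued under it. A small usage sketch mirroring the test setup with a fake process group; the world size and tensor shape are illustrative.

import torch
import torch.distributed as dist
import torch.distributed._functional_collectives as funcol
from torch.distributed.tensor.debug import CommDebugMode
from torch.testing._internal.distributed.fake_pg import FakeStore

world_size = 2
dist.init_process_group(backend="fake", rank=0, world_size=world_size, store=FakeStore())

comm_mode = CommDebugMode()
with comm_mode:
    out = funcol.all_reduce(torch.ones(4), "sum", dist.group.WORLD)

# Each functional collective shows up in the counts dictionary.
print(comm_mode.get_comm_counts()[torch.ops.c10d_functional.all_reduce])  # 1
dist.destroy_process_group()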

@ -616,11 +616,11 @@ class DTensorMeshTest(DTensorTestBase):
@with_comms
def test_dtensor_device_mesh_device_conversion(self):
# construct a gpu device mesh
# construct a cuda device mesh
mesh = self.build_device_mesh()
# construct from a cpu local tensor with gpu device mesh
# should automatically convert the dist tensor to gpu
# construct from a cpu local tensor with cuda device mesh
# should automatically convert the dist tensor to cuda
placements = [Shard(0)]
local_tensor = torch.randn(3, 3)
dist_tensor = DTensor.from_local(local_tensor, mesh, placements)
@ -669,7 +669,7 @@ class DTensorMeshTest(DTensorTestBase):
@with_comms
def test_dtensor_2d_mesh(self):
mesh_tensor = torch.arange(self.world_size).reshape(2, 4)
# construct a gpu device mesh
# construct a cuda device mesh
mesh = DeviceMesh(self.device_type, mesh_tensor)
# construct a dist tensor on 2d device mesh and test if works
@ -691,7 +691,7 @@ class DTensorMeshTest(DTensorTestBase):
@with_comms
def test_device_mesh_nd(self):
# construct a gpu device mesh
# construct a cuda device mesh
mesh_tensor = torch.arange(self.world_size).reshape(2, 2, 2)
mesh = DeviceMesh(self.device_type, mesh_tensor)
# construct a dist tensor on 3d device mesh and test if works
@ -953,8 +953,8 @@ class TestDTensorPlacementTypes(DTensorTestBase):
# Keep everything deterministic.
torch.manual_seed(0)
tensor = torch.rand(size)
if self.device_type != "cpu":
return tensor.to(self.device_type)
if self.device_type == "cuda":
return tensor.cuda()
else:
return tensor

View File
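The comments restored above describe DTensor.from_local moving a CPU local tensor onto the mesh's device. A single-rank sketch of that behavior, assuming a CUDA build with NCCL; the FileStore wiring mirrors how MultiProcessTestCase sets up its store and is purely illustrative.

import tempfile
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Shard

# Single-rank group purely for illustration; the real tests run multi-process.
store = dist.FileStore(tempfile.NamedTemporaryFile(delete=False).name, 1)
dist.init_process_group("nccl", store=store, rank=0, world_size=1)
mesh = init_device_mesh("cuda", (1,))

local = torch.randn(3, 3)                         # created on CPU
dt = DTensor.from_local(local, mesh, [Shard(0)])
print(dt.device)                                  # cuda:0, the local shard was moved onto the mesh's device
dist.destroy_process_group()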

@ -39,7 +39,6 @@ from torch.distributed.tensor.parallel import (
RowwiseParallel,
)
from torch.distributed.tensor.placement_types import _StridedShard
from torch.testing._internal.common_device_type import skipXPUIf
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_fsdp import get_devtype
from torch.testing._internal.common_utils import (
@ -48,6 +47,8 @@ from torch.testing._internal.common_utils import (
run_tests,
skipIfHpu,
skipIfTorchDynamo,
TEST_CUDA,
TEST_HPU,
)
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
@ -94,8 +95,6 @@ aot_eager_graph = aot_autograd(
partition_fn=min_cut_rematerialization_partition,
)
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
def _apply_sharding(mod: nn.Module, shard_dim: int, device_mesh: DeviceMesh):
"""
@ -142,7 +141,7 @@ class TestDTensorCompile(torch._dynamo.test_case.TestCase):
@property
def device_type(self) -> str:
return device_type
return "cuda" if TEST_CUDA else "hpu" if TEST_HPU else "cpu"
@property
def world_size(self) -> int:
@ -161,9 +160,9 @@ class TestDTensorCompile(torch._dynamo.test_case.TestCase):
res = fn(x)
res.to_local().sum().backward()
@unittest.skipIf(not torch.accelerator.is_available(), "accelerator not available")
@unittest.skipIf(not TEST_CUDA, "CUDA not available")
def test_dtensor_basic_export(self):
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
mesh = DeviceMesh("cuda", torch.arange(self.world_size))
param = torch.randn(4, 4)
param_x = DTensor.from_local(param, mesh, [Shard(0)], run_check=False)
@ -189,10 +188,10 @@ class TestDTensorCompile(torch._dynamo.test_case.TestCase):
)
self.assertExpectedInline(
str(ep.graph_module.code).strip(),
f"""\
"""\
def forward(self, b_buffer, x):
_assert_tensor_metadata_default = torch.ops.aten._assert_tensor_metadata.default(x, dtype = torch.float64, device = device(type='cpu'), layout = torch.strided); _assert_tensor_metadata_default = None
to = torch.ops.aten.to.dtype_layout(x, dtype = torch.float64, layout = torch.strided, device = device(type='{self.device_type}')); x = None
to = torch.ops.aten.to.dtype_layout(x, dtype = torch.float64, layout = torch.strided, device = device(type='cuda')); x = None
view_as = torch.ops.aten.view_as.default(to, to); to = None
dtensor___init__0 = self.dtensor___init__0
dtensor_const_func_spec0 = self.dtensor_const_func_spec0
@ -207,10 +206,10 @@ def forward(self, b_buffer, x):
# add is performed in _propagate_tensor_meta_non_cached, hence add_1 instead of add
self.assertExpectedInline(
str(ep.run_decompositions({}).graph_module.code).strip(),
f"""\
"""\
def forward(self, b_parametrizations_buffer_original0, x):
_assert_tensor_metadata = torch.ops.aten._assert_tensor_metadata.default(x, None, None, torch.float64, device = device(type='cpu'), layout = torch.strided); _assert_tensor_metadata = None
_to_copy = torch.ops.aten._to_copy.default(x, dtype = torch.float64, layout = torch.strided, device = device(type='{self.device_type}', index=0)); x = None
_to_copy = torch.ops.aten._to_copy.default(x, dtype = torch.float64, layout = torch.strided, device = device(type='cuda', index=0)); x = None
view = torch.ops.aten.view.default(_to_copy, [4, 4]); _to_copy = None
add_1 = torch.ops.aten.add.Tensor(b_parametrizations_buffer_original0, view); b_parametrizations_buffer_original0 = view = None
view_1 = torch.ops.aten.view.default(add_1, [4, 4]); add_1 = None
@ -340,7 +339,6 @@ def forward(self, b_parametrizations_buffer_original0, x):
self.assertEqual(res, ref)
@skipIfHpu
@skipXPUIf(True, "https://github.com/intel/torch-xpu-ops/issues/1981")
def test_dtensor_dynamic_loss_parallel_log_softmax(self):
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
@ -716,13 +714,13 @@ def forward(self, b_parametrizations_buffer_original0, x):
out = layer_norm.permute(0, 2, 1)
return out
x = torch.randn(4, 2, 4, requires_grad=True, device=self.device_type)
x = torch.randn(4, 2, 4, requires_grad=True, device="cuda")
x_dt = DTensor.from_local(x, mesh, [Shard(1)], run_check=False)
y = torch.randn(4, requires_grad=True, device=self.device_type)
y = torch.randn(4, requires_grad=True, device="cuda")
y_dt = DTensor.from_local(y, mesh, [Replicate()], run_check=False)
z = torch.randn(4, requires_grad=True, device=self.device_type)
z = torch.randn(4, requires_grad=True, device="cuda")
z_dt = DTensor.from_local(z, mesh, [Replicate()], run_check=False)
opt_fn = torch.compile(fn, backend="inductor", fullgraph=True)
@ -820,7 +818,7 @@ def forward(self, b_parametrizations_buffer_original0, x):
# pass in tensor as inputs/outputs, create DTensor and run redistribute
# (allgather collective) inside the fn
def fn(x_dt):
if x_dt.device_mesh.device_type == f"{self.device_type}":
if x_dt.device_mesh.device_type == "cuda":
return x_dt + 1
else:
return x_dt + 2
@ -949,7 +947,7 @@ def forward(self, primals_1):
model = FakeTransformer().to(self.device_type)
tp_mesh = init_device_mesh(self.device_type, (2,), mesh_dim_names=("tp",))
tp_mesh = init_device_mesh("cuda", (2,), mesh_dim_names=("tp",))
# apply sequence parallel
parallel_plan = {

View File

@ -126,10 +126,6 @@ dtensor_fails = {
xfail("cummin"),
xfail("diagonal_scatter"),
xfail("dist"),
xfail("empty"),
xfail("empty_strided"),
xfail("empty_like"),
xfail("empty_permuted"),
xfail("expand_copy"),
xfail("exponential"),
xfail("equal"),
@ -482,6 +478,11 @@ dtensor_fails = {
skip("_segment_reduce", "offsets"),
# TODO: fix the following ops
skip("squeeze"),
# These must be skipped as their contents are nondeterministic
skip("empty"),
skip("empty_strided"),
skip("empty_like"),
skip("empty_permuted"),
}

View File
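The xfail-to-skip move above is motivated by the fact that torch.empty* return uninitialized memory, so an expected-failure check on their values would flap. A short demonstration:

import torch

# torch.empty returns uninitialized storage, so element-wise comparisons are
# not reproducible across runs; that is why these ops are skipped, not xfailed.
a = torch.empty(4)
b = torch.empty(4)
print(torch.equal(a, b))   # usually False, but nothing is guaranteed either way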

@ -19,6 +19,8 @@ from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
run_tests,
TEST_CUDA,
TEST_HPU,
)
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
@ -517,7 +519,7 @@ class RedistributeTest(DTensorTestBase):
local_out_dt = out_dt.to_local()
local_expected_dt = expected_dt.to_local()
self.assertEqual(out_dt.to_local(), expected_dt.to_local())
if torch.accelerator.is_available():
if TEST_HPU or TEST_CUDA:
self.assertEqual(
comm_mode.get_comm_counts()[
torch.ops._dtensor.shard_dim_alltoall

View File
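The gating change above trades a generic accelerator probe for explicit TEST_CUDA / TEST_HPU flags. For reference, the generic probe looks like this (a small sketch using only public torch.accelerator calls):

import torch

if torch.accelerator.is_available():
    dev = torch.accelerator.current_accelerator()
    print(dev.type, torch.accelerator.device_count())
else:
    print("no accelerator detected; falling back to cpu")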

@ -295,8 +295,8 @@ class DistTensorOpsTest(DTensorTestBase):
self.assertEqual(dist_tensor.dtype, torch.float32)
self.assertEqual(zeros_like_dt.dtype, torch.bfloat16)
@skip_if_lt_x_gpu(4)
@with_comms
@skip_if_lt_x_gpu(4)
def test_stack(self):
mesh_2d = DeviceMesh(
self.device_type, torch.arange(self.world_size).reshape(2, 2)

View File

@ -2,6 +2,7 @@
import os
import pickle
from contextlib import contextmanager
import torch
import torch._dynamo.testing
@ -9,6 +10,7 @@ import torch._inductor.config
import torch._inductor.test_case
import torch.onnx.operators
import torch.utils.cpp_extension
from torch._dynamo.aot_compile import ModelInput
from torch._dynamo.exc import PackageError, Unsupported
from torch._dynamo.package import DynamoCache
from torch._dynamo.precompile_context import PrecompileContext
@ -226,6 +228,85 @@ from user code:
actual = compiled_fn(*inputs)
self.assertEqual(expected, actual)
def test_aot_compile_module(self):
mod = SimpleLinearModule()
model = torch.compile(
mod,
fullgraph=True,
backend="inductor",
options={
"guard_filter_fn": torch.compiler.skip_guard_on_globals_unsafe,
},
)
@contextmanager
def train_mode(model):
"""
Context manager that sets the model to training mode before entering the context.
"""
model.train()
yield
@contextmanager
def eval_mode(model):
"""
Context manager that sets the model to evaluation mode before entering the context.
"""
model.eval()
yield
inputs = [
ModelInput(
args=(torch.randn(3, 3),),
kwargs={},
contexts=[torch.no_grad(), eval_mode(model)],
),
ModelInput(
args=(torch.randn(3, 3),), kwargs={}, contexts=[train_mode(model)]
),
]
assert isinstance(model, torch._dynamo.eval_frame.OptimizedModule)
model._aot_compile(
inputs,
)
with torch.compiler.set_stance("fail_on_recompile"):
model.eval()
inputs = (torch.randn(3, 3),)
expected = mod(*inputs)
actual = model(*inputs)
self.assertEqual(expected, actual)
# Shouldn't recompile
model.train()
expected.sum().backward()
model._save_aot_compiled_module(self.path())
torch._dynamo.reset()
model = torch.compile(
mod,
fullgraph=True,
backend="inductor",
options={
"guard_filter_fn": torch.compiler.skip_guard_on_globals_unsafe,
},
)
assert isinstance(model, torch._dynamo.eval_frame.OptimizedModule)
with open(self.path(), "rb") as f:
data = f.read()
model._load_aot_compiled_module(data)
with torch.compiler.set_stance("fail_on_recompile"):
model.eval()
inputs = (torch.randn(3, 3),)
expected = mod(*inputs)
actual = model(*inputs)
self.assertEqual(expected, actual)
# Shouldn't recompile
model.train()
expected.sum().backward()
if __name__ == "__main__":
from torch._dynamo.test_case import run_tests

View File

@ -16605,6 +16605,37 @@ def forward(self, x):
wrapper = Wrapper(pyt_model, example_inputs)
wrapper.forward()
def test_export_with_dict_input_nested_in_args(self):
"""Test export with dictionary input nested in args."""
class MyModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.linear = torch.nn.Linear(10, 1)
def forward(self, data_batch):
h1 = self.linear(data_batch["a1"])
h2 = self.linear(data_batch["a2"])
return h1 + h2
# Create model and example inputs
model = MyModel()
a1 = torch.randn(10)
a2 = torch.randn(10)
original_input = {"a1": a1, "a2": a2}
example_args_forward = (original_input,)
# Export the model
exported_model = export(model, example_args_forward)
# Run both models and compare results
reordered_input = {"a2": a2, "a1": a1}
original_output = exported_model.module()(reordered_input)
loaded_output = model(original_input)
# Verify outputs are close (allowing for floating point differences)
torch.testing.assert_close(original_output, loaded_output)
def test_strict_export_with_shared_parameters(self):
"""Test that parameter names are preserved when there are shared parameters with the same name."""

View File
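The new export test above checks that a dict nested inside the positional args round-trips through torch.export even when the caller reorders its keys. A condensed sketch of the same flow; DictInputModel is a hypothetical stand-in for the MyModel defined in the test.

import torch
from torch.export import export

class DictInputModel(torch.nn.Module):   # hypothetical stand-in for MyModel above
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(10, 1)

    def forward(self, data_batch):
        return self.linear(data_batch["a1"]) + self.linear(data_batch["a2"])

m = DictInputModel()
a1, a2 = torch.randn(10), torch.randn(10)
ep = export(m, ({"a1": a1, "a2": a2},))     # the dict rides inside the args tuple
out = ep.module()({"a2": a2, "a1": a1})     # key order at call time may differ from export time
torch.testing.assert_close(out, m({"a1": a1, "a2": a2}))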

@ -129,11 +129,13 @@ class TestMaxAutotune(TestCase):
@parametrize("a_transposed", (False, True))
@parametrize("b_transposed", (False, True))
@parametrize("dynamic", (False, True))
@parametrize("tma_store", (False, True))
def test_max_autotune_regular_mm_persistent_tma(
self,
a_transposed: bool,
b_transposed: bool,
dynamic: bool,
tma_store: bool,
):
def mm(a, b):
# TMA requires 16-byte alignment: here we repeat the dims
@ -165,12 +167,35 @@ class TestMaxAutotune(TestCase):
{
"max_autotune": True,
"triton.enable_persistent_tma_matmul": "1",
"triton.enable_template_tma_store": tma_store,
"test_configs.autotune_choice_name_regex": "mm_persistent_tma",
}
):
c_actual = torch.compile(mm, dynamic=dynamic)(a, b)
c_actual, code = run_and_get_code(torch.compile(mm, dynamic=dynamic), a, b)
c_expected = mm(a, b)
if has_triton_stable_tma_api():
make_desc_api = "triton.language.make_tensor_descriptor"
read_api = "tl.load_tensor_descriptor"
if tma_store:
# Note: The tma_descriptor0 name is generated by the kernel. If the
# code generation process changes, this name could change.
write_api = "tma_descriptor0.store"
else:
write_api = "tl.store"
else:
make_desc_api = (
"triton.language.extra.cuda.experimental_device_tensormap_create2d"
)
read_api = "tl._experimental_descriptor_load"
# TMA store is not supported with the experimental API
write_api = "tl.store"
# Verify that we are using a TMA implementation
FileCheck().check("triton_tem_fused_mm").check(make_desc_api).check(
read_api
).check(write_api).run(code[0])
torch.testing.assert_close(c_actual, c_expected, atol=1e-2, rtol=1e-2)
@unittest.skipIf(
@ -264,6 +289,42 @@ class TestMaxAutotune(TestCase):
# given the config flags above, we should have no choices left.
self.assertIn("NoValidChoicesError", str(context.exception))
@unittest.skipIf(
not has_triton_tma_device(), "Need device-side TMA support in Triton"
)
@parametrize("dynamic", (False, True))
def test_max_autotune_regular_mm_persistent_tma_illegal_output_alignment(
self, dynamic
):
def mm(a, b, out):
torch.mm(a, b, out=out)
return out
M, N, K = 21, 31, 32
a = torch.empty_strided((M, K), (K, 1), dtype=torch.float16, device=GPU_TYPE)
a[:] = torch.randn((M, K), dtype=torch.float16)
b = torch.empty_strided((K, N), (1, K), dtype=torch.float16, device=GPU_TYPE)
b[:] = torch.randn((K, N), dtype=torch.float16)
# allocate an output with a stride not divisible by 16, so it can't satisfy TMA alignment checks.
out = torch.empty_strided((M, N), (N, 1), dtype=torch.float16, device=GPU_TYPE)
with (
self.assertRaises(BackendCompilerFailed) as context,
config.patch(
{
"max_autotune": True,
"triton.enable_persistent_tma_matmul": "1",
"triton.enable_template_tma_store": True,
"test_configs.autotune_choice_name_regex": "mm_persistent_tma",
}
),
):
torch.compile(mm, dynamic=dynamic)(a, b, out)
# Lowering to the persistent+TMA Triton template should be skipped
# since the output strides cannot satisfy the TMA 16-byte alignment requirement
self.assertIn("NoValidChoicesError", str(context.exception))
@unittest.skipIf(
not has_triton_tma_device(), "Need device-side TMA support in Triton"
)
@ -317,11 +378,13 @@ class TestMaxAutotune(TestCase):
@parametrize("a_transposed", (False, True))
@parametrize("b_transposed", (False, True))
@parametrize("dynamic", (False, True))
@parametrize("tma_store", (False, True))
def test_max_autotune_addmm_persistent_tma(
self,
a_transposed: bool,
b_transposed: bool,
dynamic: bool,
tma_store: bool,
):
def addmm(x, a, b):
# TMA requires 16-byte alignment: here we repeat the dims
@ -355,12 +418,37 @@ class TestMaxAutotune(TestCase):
{
"max_autotune": True,
"triton.enable_persistent_tma_matmul": "1",
"triton.enable_template_tma_store": tma_store,
"test_configs.autotune_choice_name_regex": "mm_persistent_tma",
}
):
c_actual = torch.compile(addmm, dynamic=dynamic)(x, a, b)
c_actual, code = run_and_get_code(
torch.compile(addmm, dynamic=dynamic), x, a, b
)
c_expected = addmm(x, a, b)
if has_triton_stable_tma_api():
make_desc_api = "triton.language.make_tensor_descriptor"
read_api = "tl.load_tensor_descriptor"
if tma_store:
# Note: The tma_descriptor0 name is generated by the kernel. If the
# code generation process changes, this name could change.
write_api = "tma_descriptor0.store"
else:
write_api = "tl.store"
else:
make_desc_api = (
"triton.language.extra.cuda.experimental_device_tensormap_create2d"
)
read_api = "tl._experimental_descriptor_load"
# TMA store is not supported with the experimental API
write_api = "tl.store"
# Verify that we are using a TMA implementation
FileCheck().check("triton_tem_fused_addmm").check(make_desc_api).check(
read_api
).check(write_api).run(code[0])
torch.testing.assert_close(c_actual, c_expected, atol=1e-2, rtol=1e-2)
@unittest.skipIf(
@ -1508,7 +1596,7 @@ class TestMaxAutotune(TestCase):
# Make sure all args of generate_and_load_args are passed to make_key_args (Except generate_with_caching)
# update this function each time new arg added to generate_and_load and make sure arg is added to make_key
self.assertEqual(generate_and_load_args - 1, make_key_args)
self.assertEqual(generate_and_load_args, 17)
self.assertEqual(generate_and_load_args, 18)
@fresh_cache()
@config.patch(
@ -1594,7 +1682,7 @@ class TestMaxAutotune(TestCase):
"[[22,30],[30,1],torch.float32,device(type='cuda',index=0),0]"],
'num_stages':1,'num_warps':2,'prefix_args':0,'suffix_args':0,'call_sizes':[10,30],
'layout':"[[10,30],[30,1],torch.float32,device(type='cuda',index=0),0]",
'num_consumer_groups':0,'num_buffers_warp_spec':0,'epilogue_fn_hash':'identity',
'num_consumer_groups':0,'num_buffers_warp_spec':0,'epilogue_fn_hash':'identity','tma_store':False,
'kwargs':{'EVEN_K':False,'ALLOW_TF32':True,'USE_FAST_ACCUM':False,'ACC_TYPE':'tl.float32',
'BLOCK_M':16,'BLOCK_N':32,'BLOCK_K':16,'GROUP_M':8},'hint_override':None}"""
@ -1634,7 +1722,7 @@ class TestMaxAutotune(TestCase):
"[[s27,s94],[s94,1],torch.float32,device(type='cuda',index=0),0]"],
'num_stages':1,'num_warps':2,'prefix_args':0,'suffix_args':0,'call_sizes':[s77,s94],
'layout':"[[s77,s94],[s94,1],torch.float32,device(type='cuda',index=0),0]",'num_consumer_groups':0,
'num_buffers_warp_spec':0,'epilogue_fn_hash':'identity','kwargs':{'EVEN_K':False,'ALLOW_TF32':True,
'num_buffers_warp_spec':0,'epilogue_fn_hash':'identity','tma_store':False,'kwargs':{'EVEN_K':False,'ALLOW_TF32':True,
'USE_FAST_ACCUM':False,'ACC_TYPE':'tl.float32','BLOCK_M':16,'BLOCK_N':32,'BLOCK_K':16,'GROUP_M':8},'hint_override':None}"""
expected = expected.replace("cuda", GPU_TYPE)
self.assertExpectedInline(

View File
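The max-autotune hunks above add a tma_store knob to the persistent TMA matmul template and assert on the generated Triton source. A hedged sketch of how a test drives it; the config keys are taken from the diff, and the example assumes a CUDA device plus a Triton build with device-side TMA support.

import torch
from torch._inductor import config
from torch._inductor.utils import run_and_get_code

def mm(a, b):
    return torch.mm(a, b)

a = torch.randn(256, 256, dtype=torch.float16, device="cuda")
b = torch.randn(256, 256, dtype=torch.float16, device="cuda")

with config.patch({
    "max_autotune": True,
    "triton.enable_persistent_tma_matmul": "1",
    "triton.enable_template_tma_store": True,   # the new flag exercised above
    "test_configs.autotune_choice_name_regex": "mm_persistent_tma",
}):
    c, code = run_and_get_code(torch.compile(mm), a, b)

torch.testing.assert_close(c, mm(a, b), atol=1e-2, rtol=1e-2)
print("make_tensor_descriptor" in code[0])      # True when the stable TMA API is in use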

@ -353,6 +353,33 @@ class TestOperatorReorderForPeakMemory(TestCase):
y = torch.rand(N, N, dtype=torch.float32, device=GPU_TYPE)
z = torch.rand(N, N, dtype=torch.float32, device=GPU_TYPE)
from torch._inductor.choices import InductorChoices
from torch._inductor.scheduler import BaseSchedulerNode, Scheduler
class CustomInductorChoices(InductorChoices):
@staticmethod
def can_fuse(
scheduler: Scheduler,
node1: BaseSchedulerNode,
node2: BaseSchedulerNode,
shared_data_score: int,
) -> bool:
can_fuse_default = InductorChoices.can_fuse(
scheduler, node1, node2, shared_data_score
)
if (not can_fuse_default) or (
not config.realize_acc_reads_size_threshold
):
return can_fuse_default
all_reads = (node1.read_writes.reads | node2.read_writes.reads) - (
node1.read_writes.writes | node2.read_writes.writes
)
size_of_reads = [scheduler.dep_size_hint(dep) for dep in all_reads]
return sum(size_of_reads) < config.realize_acc_reads_size_threshold
torch._inductor.virtualized.V.set_choices_handler(CustomInductorChoices())
# CASE 1: no restriction on the amount of accumulation
with config.patch({"realize_acc_reads_size_threshold": float("inf")}):
f_compiled = torch.compile(f)

View File

@ -1,244 +1,249 @@
{
"EndToEndLSTM (__main__.RNNTest)": 194.9510040283203,
"MultiheadAttention (__main__.ModulesTest)": 140.13499959309897,
"test__adaptive_avg_pool2d (__main__.CPUReproTests)": 89.57710986667209,
"test_after_aot_cpu_runtime_error (__main__.MinifierIsolateTests)": 64.31833351982965,
"test_after_aot_gpu_runtime_error (__main__.MinifierIsolateTests)": 66.09833272298177,
"test_aot_autograd_exhaustive_nn_functional_max_pool2d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 64.02314267839704,
"test_aot_autograd_symbolic_exhaustive_linalg_svd_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 72.13800048828125,
"test_aot_autograd_symbolic_exhaustive_masked_norm_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 63.19166692097982,
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool1d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 153.9259999593099,
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool2d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 214.78533426920572,
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool3d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 158.7769978841146,
"test_aot_autograd_symbolic_exhaustive_nn_functional_unfold_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 60.201476414998375,
"test_aot_autograd_symbolic_exhaustive_svd_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 75.8566665649414,
"test_aot_autograd_symbolic_module_exhaustive_nn_TransformerDecoderLayer_cpu_float32 (__main__.TestEagerFusionModuleInfoCPU)": 158.88999938964844,
"test_avg_pool3d_backward2_cpu (__main__.CpuTests)": 600.0303955078125,
"test_avg_pool3d_backward2_cuda (__main__.GPUTests)": 143.89337348937988,
"test_avg_pool3d_backward2_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 494.34210883246527,
"test_avg_pool3d_backward2_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 504.5401102701823,
"test_avg_pool3d_backward2_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 135.9231694539388,
"test_backward_nn_functional_multi_head_attention_forward_cpu_float32 (__main__.TestCompositeComplianceCPU)": 71.03799947102864,
"test_backward_nn_functional_multi_head_attention_forward_cuda_float32 (__main__.TestCompositeComplianceCUDA)": 73.23316764831543,
"test_basic_cpu (__main__.EfficientConvBNEvalCpuTests)": 214.73055691189236,
"test_basic_cuda (__main__.EfficientConvBNEvalGpuTests)": 150.5653305053711,
"test_cat_2k_args (__main__.TestTEFuserDynamic)": 121.138150700114,
"test_cat_2k_args (__main__.TestTEFuserStatic)": 117.27021219874874,
"test_checkpointing_without_reentrant_input_requires_grad_False (__main__.TestAutogradWithCompiledAutograd)": 332.1435546875,
"test_checkpointing_without_reentrant_input_requires_grad_True (__main__.TestAutogradWithCompiledAutograd)": 413.1364440917969,
"test_collect_callgrind (__main__.TestBenchmarkUtils)": 322.539549085829,
"test_comprehensive_diff_cuda_complex128 (__main__.TestDecompCUDA)": 109.46066538492839,
"test_comprehensive_diff_cuda_complex64 (__main__.TestDecompCUDA)": 110.44916661580403,
"test_comprehensive_diff_cuda_float32 (__main__.TestDecompCUDA)": 77.25650024414062,
"test_comprehensive_diff_cuda_float64 (__main__.TestDecompCUDA)": 75.41433461507161,
"test_comprehensive_grid_sampler_2d_cpu_bfloat16 (__main__.TestDecompCPU)": 111.43533325195312,
"test_comprehensive_grid_sampler_2d_cpu_float16 (__main__.TestDecompCPU)": 113.98733520507812,
"test_comprehensive_grid_sampler_2d_cpu_float32 (__main__.TestDecompCPU)": 485.4573465983073,
"test_comprehensive_grid_sampler_2d_cpu_float64 (__main__.TestDecompCPU)": 464.56699625651044,
"test_comprehensive_grid_sampler_2d_cuda_bfloat16 (__main__.TestDecompCUDA)": 265.6348292032878,
"test_comprehensive_grid_sampler_2d_cuda_float16 (__main__.TestDecompCUDA)": 314.0461654663086,
"test_comprehensive_grid_sampler_2d_cuda_float32 (__main__.TestDecompCUDA)": 1546.3898315429688,
"test_comprehensive_grid_sampler_2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 69.4828332265218,
"test_comprehensive_grid_sampler_2d_cuda_float64 (__main__.TestDecompCUDA)": 1384.938496907552,
"test_comprehensive_grid_sampler_2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 73.32633463541667,
"test_comprehensive_linalg_lu_solve_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 78.70183436075847,
"test_comprehensive_linalg_lu_solve_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 76.88016764322917,
"test_comprehensive_linalg_pinv_singular_cuda_complex128 (__main__.TestDecompCUDA)": 60.60533459981283,
"test_comprehensive_linalg_solve_triangular_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 83.5096664428711,
"test_comprehensive_linalg_solve_triangular_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 78.69066619873047,
"test_comprehensive_linalg_svd_cuda_complex128 (__main__.TestDecompCUDA)": 92.91299947102864,
"test_comprehensive_linalg_svd_cuda_complex64 (__main__.TestDecompCUDA)": 73.34999974568684,
"test_comprehensive_linalg_vector_norm_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 70.28683344523112,
"test_comprehensive_linalg_vector_norm_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 69.44366518656413,
"test_comprehensive_logspace_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 77.09783299763997,
"test_comprehensive_logspace_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 70.4760004679362,
"test_comprehensive_masked_norm_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 142.64183044433594,
"test_comprehensive_masked_norm_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 137.7250010172526,
"test_comprehensive_masked_norm_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 138.17566553751627,
"test_comprehensive_nn_functional_conv_transpose3d_cuda_complex64 (__main__.TestDecompCUDA)": 69.95266660054524,
"test_comprehensive_nn_functional_gaussian_nll_loss_cpu_float32 (__main__.TestDecompCPU)": 60.835333506266274,
"test_comprehensive_nn_functional_gaussian_nll_loss_cpu_float64 (__main__.TestDecompCPU)": 66.94753379821778,
"test_comprehensive_nn_functional_gaussian_nll_loss_cuda_float32 (__main__.TestDecompCUDA)": 138.8831672668457,
"test_comprehensive_nn_functional_gaussian_nll_loss_cuda_float64 (__main__.TestDecompCUDA)": 157.37983194986978,
"test_comprehensive_nn_functional_grid_sample_cpu_float32 (__main__.TestDecompCPU)": 148.48499552408853,
"test_comprehensive_nn_functional_grid_sample_cpu_float64 (__main__.TestDecompCPU)": 142.54666646321616,
"test_comprehensive_nn_functional_grid_sample_cuda_bfloat16 (__main__.TestDecompCUDA)": 66.76000086466472,
"test_comprehensive_nn_functional_grid_sample_cuda_float16 (__main__.TestDecompCUDA)": 70.30716641743977,
"test_comprehensive_nn_functional_grid_sample_cuda_float32 (__main__.TestDecompCUDA)": 340.98316701253253,
"test_comprehensive_nn_functional_grid_sample_cuda_float64 (__main__.TestDecompCUDA)": 314.614995320638,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float32 (__main__.TestDecompCUDA)": 88.2018330891927,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 85.09549967447917,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float64 (__main__.TestDecompCUDA)": 88.72550201416016,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 85.59499867757161,
"test_comprehensive_nn_functional_interpolate_trilinear_cpu_float32 (__main__.TestDecompCPU)": 61.82139994303385,
"test_comprehensive_nn_functional_interpolate_trilinear_cuda_float32 (__main__.TestDecompCUDA)": 141.1143341064453,
"test_comprehensive_nn_functional_interpolate_trilinear_cuda_float64 (__main__.TestDecompCUDA)": 142.72383499145508,
"test_comprehensive_nn_functional_max_pool2d_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 1356.413838704427,
"test_comprehensive_nn_functional_max_pool2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 1347.1215209960938,
"test_comprehensive_nn_functional_max_pool2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 1366.5043131510417,
"test_comprehensive_nn_functional_max_pool3d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 593.5763346354166,
"test_comprehensive_nn_functional_max_pool3d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 549.9474945068359,
"test_comprehensive_nn_functional_max_unpool2d_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 74.53666687011719,
"test_comprehensive_nn_functional_max_unpool2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 75.8316650390625,
"test_comprehensive_nn_functional_max_unpool2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 74.80666669209798,
"test_comprehensive_nn_functional_unfold_cuda_complex128 (__main__.TestDecompCUDA)": 67.3658332824707,
"test_comprehensive_ormqr_cpu_complex64 (__main__.TestDecompCPU)": 67.6716677347819,
"test_comprehensive_ormqr_cuda_complex128 (__main__.TestDecompCUDA)": 120.74283218383789,
"test_comprehensive_ormqr_cuda_complex64 (__main__.TestDecompCUDA)": 117.90700022379558,
"test_comprehensive_ormqr_cuda_float32 (__main__.TestDecompCUDA)": 74.16149965922038,
"test_comprehensive_ormqr_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 74.09249877929688,
"test_comprehensive_ormqr_cuda_float64 (__main__.TestDecompCUDA)": 68.72949981689453,
"test_comprehensive_svd_cuda_complex128 (__main__.TestDecompCUDA)": 76.05216598510742,
"test_comprehensive_svd_cuda_complex64 (__main__.TestDecompCUDA)": 79.25549952189128,
"test_constructor_autograd_SparseBSC_cuda (__main__.TestSparseAnyCUDA)": 124.02233123779297,
"test_constructor_autograd_SparseBSR_cuda (__main__.TestSparseAnyCUDA)": 130.15816497802734,
"test_constructor_autograd_SparseCSC_cuda (__main__.TestSparseAnyCUDA)": 114.52783139546712,
"test_constructor_autograd_SparseCSR_cuda (__main__.TestSparseAnyCUDA)": 94.13066546122234,
"test_conv1d_basic (__main__.TestXNNPACKConv1dTransformPass)": 243.25878143310547,
"test_conv1d_with_relu_fc (__main__.TestXNNPACKConv1dTransformPass)": 560.9872216118706,
"test_conv2d_binary_broadcast_shapes_cpu (__main__.TestPatternMatcherGenericCPU)": 85.30400085449219,
"test_conv2d_binary_dynamic_shapes_cpu (__main__.TestDynamicPatternMatcherGenericCPU)": 60.0622667948405,
"test_conv2d_unary_dynamic_shapes_cpu (__main__.TestDynamicPatternMatcherGenericCPU)": 60.94093297322591,
"test_conv3d_binary_broadcast_shapes_cpu (__main__.TestPatternMatcherGenericCPU)": 164.94733174641928,
"test_conv3d_binary_dynamic_shapes_cpu (__main__.TestDynamicPatternMatcherGenericCPU)": 67.41599782307942,
"test_conv_bn_fuse_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 80.62599987453885,
"test_conv_unary_fusion_nnc (__main__.TestMkldnnFusion)": 77.90822347005208,
"test_correctness_AdamW_use_closure_True_cuda_float32 (__main__.CompiledOptimizerParityTestsCUDA)": 88.02899932861328,
"test_correctness_Adam_use_closure_True_cuda_float32 (__main__.CompiledOptimizerParityTestsCUDA)": 83.99416732788086,
"test_count_nonzero_all (__main__.TestBool)": 625.3162163628472,
"test_custom_module_lstm (__main__.TestQuantizedOps)": 691.5127597384983,
"test_dispatch_symbolic_meta_outplace_all_strides_nn_functional_gaussian_nll_loss_cuda_float32 (__main__.TestMetaCUDA)": 86.18333435058594,
"test_eager_sequence_nr_dynamic_shapes (__main__.DynamicShapesAotAutogradFallbackTests)": 146.76594623766448,
"test_eig_check_magma_cuda_float32 (__main__.TestLinalgCUDA)": 341.765677134196,
"test_fail_arithmetic_ops.py (__main__.TestTyping)": 68.25488874647353,
"test_fail_random.py (__main__.TestTyping)": 69.70459224559643,
"test_fn_fwgrad_bwgrad_cumprod_cuda_complex128 (__main__.TestFwdGradientsCUDA)": 99.30016708374023,
"test_fn_gradgrad_cumprod_cuda_complex128 (__main__.TestBwdGradientsCUDA)": 90.32933298746745,
"test_fuse_large_params_cpu (__main__.CpuTests)": 100.9027509689331,
"test_fuse_large_params_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 156.06466674804688,
"test_fuse_large_params_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 154.44311014811197,
"test_fuse_large_params_dynamic_shapes_cuda (__main__.DynamicShapesCodegenGPUTests)": 140.33400217692056,
"test_fuse_large_params_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 108.87950007120769,
"test_grad_nn_Transformer_cpu_float64 (__main__.TestModuleCPU)": 78.21525671543219,
"test_grad_nn_Transformer_cuda_float64 (__main__.TestModuleCUDA)": 95.37383270263672,
"test_gradgrad_nn_LSTM_eval_mode_cuda_float64 (__main__.TestModuleCUDA)": 124.23833465576172,
"test_gradgrad_nn_LSTM_train_mode_cuda_float64 (__main__.TestModuleCUDA)": 130.07466634114584,
"test_gradgrad_nn_TransformerDecoderLayer_cuda_float64 (__main__.TestModuleCUDA)": 228.14850107828775,
"test_gradgrad_nn_TransformerEncoder_eval_mode_cuda_float64 (__main__.TestModuleCUDA)": 141.07866414388022,
"test_gradgrad_nn_TransformerEncoder_train_mode_cuda_float64 (__main__.TestModuleCUDA)": 155.69166564941406,
"test_gradgrad_nn_Transformer_cuda_float64 (__main__.TestModuleCUDA)": 638.5084838867188,
"test_group_norm (__main__.TestQuantizedOps)": 235.64022382100424,
"test_indirect_device_assert (__main__.TritonCodeGenTests)": 328.87933349609375,
"test_inductor_dynamic_shapes_broadcasting_dynamic_shapes (__main__.DynamicShapesReproTests)": 116.18105255930047,
"test_inductor_no_recursionerror_on_for_loops_dynamic_shapes (__main__.DynamicShapesReproTests)": 70.07888836330838,
"test_inplace_gradgrad_cumprod_cuda_complex128 (__main__.TestBwdGradientsCUDA)": 89.06283315022786,
"test_inputs_overlapping_with_mutation_stress_dynamic_shapes (__main__.DynamicShapesAotAutogradFallbackTests)": 131.60088857014975,
"test_jit_cuda_archflags (__main__.TestCppExtensionJIT)": 118.61966451009114,
"test_linalg_solve_triangular_large_cuda_complex128 (__main__.TestLinalgCUDA)": 131.74433390299478,
"test_linalg_solve_triangular_large_cuda_complex64 (__main__.TestLinalgCUDA)": 101.52466583251953,
"test_linear (__main__.TestStaticQuantizedModule)": 219.97832912868924,
"test_linear_binary_cpp_wrapper (__main__.TestCppWrapper)": 111.1229985555013,
"test_linear_binary_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 139.29833475748697,
"test_linear_relu (__main__.TestStaticQuantizedModule)": 222.60332700941296,
"test_lobpcg_ortho_cuda_float64 (__main__.TestLinalgCUDA)": 137.30917072296143,
"test_longformer_chunk_dynamic_shapes (__main__.DynamicShapesReproTests)": 106.62766689724393,
"test_low_memory_max_pool_dilation_1_dim_3_cpu_halide (__main__.HalideCpuTests)": 585.4219970703125,
"test_low_memory_max_pool_dilation_2_dim_3_cpu_halide (__main__.HalideCpuTests)": 504.6419982910156,
"test_lstm_cpu (__main__.TestMkldnnCPU)": 69.61133321126302,
"test_many_overlapping_inputs_does_not_explode_guards_dynamic_shapes (__main__.DynamicShapesReproTests)": 127.47244517008464,
"test_max_pool2d_with_indices_backward4_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 63.23977788289388,
"test_max_pool2d_with_indices_backward4_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 63.10499954223633,
"test_nan_assert_float16 (__main__.ProcessGroupNCCLGroupTest)": 105.55233224232991,
"test_pattern_matcher_multi_user_cpu (__main__.CpuTritonTests)": 148.99966939290366,
"test_proper_exit (__main__.TestDataLoader)": 195.07049942016602,
"test_proper_exit (__main__.TestDataLoaderPersistentWorkers)": 238.3838322957357,
"test_qat_conv2d_unary (__main__.TestQuantizePT2EX86Inductor)": 180.44411044650607,
"test_qat_conv_bn_fusion_no_conv_bias (__main__.TestQuantizePT2EQAT_ConvBn1d)": 64.31058961917192,
"test_qat_conv_bn_fusion_no_conv_bias (__main__.TestQuantizePT2EQAT_ConvBn2d)": 62.13955030441284,
"test_qat_mobilenet_v2 (__main__.TestQuantizePT2EQATModels)": 141.32811228434244,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 92.34100087483723,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 84.88599904378255,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True (__main__.TestPatternMatcher)": 77.63999938964844,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 91.23133341471355,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 88.41600036621094,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False (__main__.TestPatternMatcher)": 75.7643305460612,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 85.55433400472005,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 86.17699940999348,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True (__main__.TestPatternMatcher)": 76.47133382161458,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 98.72666676839192,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 102.08499908447266,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False (__main__.TestPatternMatcher)": 79.43900044759114,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 87.4413324991862,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 88.52833302815755,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 91.18200174967448,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 91.71099853515625,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False (__main__.TestPatternMatcher)": 75.84733327229817,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 89.47599792480469,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 89.17300160725911,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 96.56466674804688,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 92.08200073242188,
"test_qrnncell (__main__.TestDynamicQuantizedOps)": 200.46322377522787,
"test_quick_core_backward__unsafe_masked_index_cpu_float64 (__main__.TestDecompCPU)": 637.5349934895834,
"test_quick_core_backward__unsafe_masked_index_cuda_float64 (__main__.TestDecompCUDA)": 1213.9888509114583,
"test_quick_core_backward__unsafe_masked_index_put_accumulate_cpu_float64 (__main__.TestDecompCPU)": 759.4036661783854,
"test_quick_core_backward__unsafe_masked_index_put_accumulate_cuda_float64 (__main__.TestDecompCUDA)": 1672.4736735026042,
"test_quick_core_backward_nn_functional_max_unpool3d_grad_cpu_float64 (__main__.TestDecompCPU)": 76.77566528320312,
"test_quick_core_backward_nn_functional_max_unpool3d_grad_cuda_float64 (__main__.TestDecompCUDA)": 292.51483662923175,
"test_quick_core_backward_roll_cpu_float64 (__main__.TestDecompCPU)": 129.11066691080728,
"test_quick_core_backward_roll_cuda_float64 (__main__.TestDecompCUDA)": 260.64366658528644,
"test_quick_core_backward_select_scatter_cpu_float64 (__main__.TestDecompCPU)": 73.24966684977214,
"test_quick_core_backward_select_scatter_cuda_float64 (__main__.TestDecompCUDA)": 157.60366821289062,
"test_quick_core_backward_split_cuda_float64 (__main__.TestDecompCUDA)": 78.70783360799153,
"test_quick_core_backward_split_with_sizes_copy_cpu_float64 (__main__.TestDecompCPU)": 89.36199951171875,
"test_quick_core_backward_split_with_sizes_copy_cuda_float64 (__main__.TestDecompCUDA)": 193.34283447265625,
"test_quick_core_backward_std_cpu_float64 (__main__.TestDecompCPU)": 64.08739941914877,
"test_quick_core_backward_std_cuda_float64 (__main__.TestDecompCUDA)": 126.64083353678386,
"test_register_spills_cuda (__main__.BenchmarkFusionCudaTest)": 106.82166735331218,
"test_replicatepad_64bit_indexing_cuda_float16 (__main__.TestNNDeviceTypeCUDA)": 64.22033437093098,
"test_rosenbrock_sparse_with_lrsched_False_SGD_cuda_float64 (__main__.TestOptimRenewedCUDA)": 65.57016626993816,
"test_rosenbrock_sparse_with_lrsched_True_SGD_cuda_float64 (__main__.TestOptimRenewedCUDA)": 76.09683354695638,
"test_runtime_checks_large_cpu (__main__.AOTInductorTestABICompatibleCpu)": 71.15816752115886,
"test_runtime_checks_large_cpu_with_stack_allocation (__main__.AOTInductorTestABICompatibleCpuWithStackAllocation)": 74.32677883572049,
"test_runtime_checks_large_cuda (__main__.AOTInductorTestABICompatibleGpu)": 157.43183390299478,
"test_save_load_large_string_attribute (__main__.TestSaveLoad)": 131.13233439127603,
"test_sdpa_kernel_ctx_manager2_dynamic_shapes (__main__.DynamicShapesCtxManagerTests)": 160.5550011528863,
"test_shuffler_iterdatapipe (__main__.IntegrationTestDataLoaderDataPipe)": 117.62710995144315,
"test_slow_tasks (__main__.TestFunctionalAutogradBenchmark)": 114.96744452582465,
"test_std (__main__.TestQuantizedOps)": 275.08810419506494,
"test_svd_lowrank_cuda_complex128 (__main__.TestLinalgCUDA)": 150.82900087038675,
"test_terminate_handler_on_crash (__main__.TestTorch)": 110.43555479579501,
"test_terminate_signal (__main__.ForkTest)": 130.07055732442274,
"test_terminate_signal (__main__.ParallelForkServerShouldWorkTest)": 129.6981106830968,
"test_terminate_signal (__main__.SpawnTest)": 133.48411263359918,
"test_torchvision_smoke (__main__.TestTensorBoardPytorchGraph)": 90.4521090189616,
"test_train_parity_multi_group (__main__.TestFullyShard1DTrainingCore)": 164.04612350463867,
"test_triton_bsr_scatter_mm_blocksize_64_cuda_bfloat16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 77.9958324432373,
"test_triton_bsr_scatter_mm_blocksize_64_cuda_float16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 78.84283447265625,
"test_triton_bsr_scatter_mm_blocksize_64_cuda_float32 (__main__.TestSparseCompressedTritonKernelsCUDA)": 79.08466720581055,
"test_triton_bsr_softmax_cuda_bfloat16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 127.43616739908855,
"test_triton_bsr_softmax_cuda_float16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 129.390500386556,
"test_triton_bsr_softmax_cuda_float32 (__main__.TestSparseCompressedTritonKernelsCUDA)": 104.55349795023601,
"test_unary_ops (__main__.TestTEFuserDynamic)": 84.59466772609287,
"test_unary_ops (__main__.TestTEFuserStatic)": 87.30733429061041,
"test_variant_consistency_jit_nn_functional_max_pool2d_cpu_float32 (__main__.TestJitCPU)": 82.17999776204427,
"test_variant_consistency_jit_nn_functional_max_pool2d_cuda_float32 (__main__.TestJitCUDA)": 79.73050053914388,
"test_views1_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 87.70950190226237,
"test_vmapjvpvjp_linalg_lstsq_grad_oriented_cpu_float32 (__main__.TestOperatorsCPU)": 96.42566680908203,
"test_vmapjvpvjp_linalg_lstsq_grad_oriented_cuda_float32 (__main__.TestOperatorsCUDA)": 78.90966542561848,
"test_vmapjvpvjp_linalg_lu_solve_cpu_float32 (__main__.TestOperatorsCPU)": 62.53285598754883,
"test_vmapjvpvjp_linalg_lu_solve_cuda_float32 (__main__.TestOperatorsCUDA)": 91.11416816711426,
"test_vmapjvpvjp_linalg_multi_dot_cuda_float32 (__main__.TestOperatorsCUDA)": 86.59666760762532,
"test_vmapjvpvjp_linalg_svd_cuda_float32 (__main__.TestOperatorsCUDA)": 93.32300059000652,
"test_vmapjvpvjp_max_pool2d_with_indices_backward_cpu_float32 (__main__.TestOperatorsCPU)": 100.57566833496094,
"test_vmapjvpvjp_max_pool2d_with_indices_backward_cuda_float32 (__main__.TestOperatorsCUDA)": 116.00733248392741,
"test_vmapjvpvjp_nn_functional_conv2d_cpu_float32 (__main__.TestOperatorsCPU)": 62.26690483093262,
"test_vmapjvpvjp_nn_functional_max_pool2d_cpu_float32 (__main__.TestOperatorsCPU)": 87.44200134277344,
"test_vmapjvpvjp_nn_functional_max_pool2d_cuda_float32 (__main__.TestOperatorsCUDA)": 133.6548334757487,
"test_vmapjvpvjp_svd_cuda_float32 (__main__.TestOperatorsCUDA)": 114.57983334859212,
"test_vmapjvpvjp_unbind_cpu_float32 (__main__.TestOperatorsCPU)": 69.25033442179362,
"test_vmapjvpvjp_unbind_cuda_float32 (__main__.TestOperatorsCUDA)": 124.68766911824544,
"test_vmapvjpvjp_linalg_lstsq_cuda_float32 (__main__.TestOperatorsCUDA)": 76.81024932861328,
"test_vmapvjpvjp_meshgrid_list_of_tensors_cuda_float32 (__main__.TestOperatorsCUDA)": 140.70899963378906,
"test_vmapvjpvjp_meshgrid_variadic_tensors_cuda_float32 (__main__.TestOperatorsCUDA)": 118.22750091552734,
"test_vmapvjpvjp_nn_functional_bilinear_cuda_float32 (__main__.TestOperatorsCUDA)": 181.27366256713867
"EndToEndLSTM (__main__.RNNTest)": 197.77900187174478,
"MultiheadAttention (__main__.ModulesTest)": 137.42000325520834,
"test_AllenaiLongformerBase_repro_cpu_halide (__main__.HalideCpuTests)": 214.1816660563151,
"test__adaptive_avg_pool2d (__main__.CPUReproTests)": 91.37688869900174,
"test_adaptive_max_pool2d1_cpu_halide (__main__.HalideCpuTests)": 116.57933298746745,
"test_after_aot_cpu_runtime_error (__main__.MinifierIsolateTests)": 66.92922253078885,
"test_after_aot_gpu_runtime_error (__main__.MinifierIsolateTests)": 65.68500010172527,
"test_alexnet_prefix_cpu_halide (__main__.HalideCpuTests)": 177.91966756184897,
"test_aot_autograd_exhaustive_nn_functional_max_pool2d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 87.69499969482422,
"test_aot_autograd_symbolic_exhaustive_linalg_svd_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 74.02233378092448,
"test_aot_autograd_symbolic_exhaustive_masked_norm_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 64.45699946085612,
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool1d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 136.27599589029947,
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool2d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 259.30466715494794,
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool3d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 135.36400095621744,
"test_aot_autograd_symbolic_exhaustive_nn_functional_unfold_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 61.07166544596354,
"test_aot_autograd_symbolic_exhaustive_ormqr_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 64.8491905757359,
"test_aot_autograd_symbolic_exhaustive_svd_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 90.34733327229817,
"test_aot_autograd_symbolic_module_exhaustive_nn_TransformerDecoderLayer_cpu_float32 (__main__.TestEagerFusionModuleInfoCPU)": 140.09266916910806,
"test_associative_scan_partial_grad_combine_mode_generic_compile_mode_compile_dynamic_shape_reverse_False_cpu (__main__.AssociativeScanTests)": 65.17999935150146,
"test_associative_scan_partial_grad_combine_mode_generic_compile_mode_compile_dynamic_shape_reverse_True_cpu (__main__.AssociativeScanTests)": 73.75112533569336,
"test_avg_pool3d_backward2_cpu (__main__.CpuTests)": 646.9324035644531,
"test_avg_pool3d_backward2_cuda (__main__.GPUTests)": 142.86450004577637,
"test_avg_pool3d_backward2_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 493.49299791124133,
"test_avg_pool3d_backward2_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 498.72944810655383,
"test_avg_pool3d_backward2_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 133.2033322652181,
"test_avg_pool3d_backward_cpu_halide (__main__.HalideCpuTests)": 61.788333892822266,
"test_backward_nn_functional_multi_head_attention_forward_cpu_float32 (__main__.TestCompositeComplianceCPU)": 69.57333119710286,
"test_backward_nn_functional_multi_head_attention_forward_cuda_float32 (__main__.TestCompositeComplianceCUDA)": 81.06516774495442,
"test_basic_cpu (__main__.EfficientConvBNEvalCpuTests)": 215.5933346218533,
"test_basic_cuda (__main__.EfficientConvBNEvalGpuTests)": 135.41816584269205,
"test_checkpointing_without_reentrant_input_requires_grad_False (__main__.TestAutogradWithCompiledAutograd)": 338.17533026801215,
"test_checkpointing_without_reentrant_input_requires_grad_True (__main__.TestAutogradWithCompiledAutograd)": 423.4767761230469,
"test_collect_callgrind (__main__.TestBenchmarkUtils)": 325.6485578748915,
"test_comprehensive_diff_cuda_complex128 (__main__.TestDecompCUDA)": 111.10633341471355,
"test_comprehensive_diff_cuda_complex64 (__main__.TestDecompCUDA)": 104.33766555786133,
"test_comprehensive_diff_cuda_float32 (__main__.TestDecompCUDA)": 69.72683334350586,
"test_comprehensive_diff_cuda_float64 (__main__.TestDecompCUDA)": 71.48199971516927,
"test_comprehensive_grid_sampler_2d_cpu_bfloat16 (__main__.TestDecompCPU)": 96.58033243815105,
"test_comprehensive_grid_sampler_2d_cpu_float16 (__main__.TestDecompCPU)": 96.65433247884114,
"test_comprehensive_grid_sampler_2d_cpu_float32 (__main__.TestDecompCPU)": 464.92467244466144,
"test_comprehensive_grid_sampler_2d_cpu_float64 (__main__.TestDecompCPU)": 460.3839925130208,
"test_comprehensive_grid_sampler_2d_cuda_bfloat16 (__main__.TestDecompCUDA)": 263.58483632405597,
"test_comprehensive_grid_sampler_2d_cuda_float16 (__main__.TestDecompCUDA)": 298.0318349202474,
"test_comprehensive_grid_sampler_2d_cuda_float32 (__main__.TestDecompCUDA)": 1310.3350016276042,
"test_comprehensive_grid_sampler_2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 66.3976656595866,
"test_comprehensive_grid_sampler_2d_cuda_float64 (__main__.TestDecompCUDA)": 1316.084981282552,
"test_comprehensive_grid_sampler_2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 69.58183288574219,
"test_comprehensive_linalg_lu_solve_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 78.05749893188477,
"test_comprehensive_linalg_lu_solve_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 72.31333287556966,
"test_comprehensive_linalg_solve_triangular_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 74.53133392333984,
"test_comprehensive_linalg_solve_triangular_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 82.40500005086263,
"test_comprehensive_linalg_svd_cuda_complex128 (__main__.TestDecompCUDA)": 69.91749890645345,
"test_comprehensive_linalg_svd_cuda_complex64 (__main__.TestDecompCUDA)": 70.98916562398274,
"test_comprehensive_masked_norm_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 126.90333302815755,
"test_comprehensive_masked_norm_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 112.40283330281575,
"test_comprehensive_masked_norm_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 114.09550094604492,
"test_comprehensive_nn_functional_conv_transpose3d_cuda_complex128 (__main__.TestDecompCUDA)": 63.223000049591064,
"test_comprehensive_nn_functional_conv_transpose3d_cuda_complex64 (__main__.TestDecompCUDA)": 67.44083213806152,
"test_comprehensive_nn_functional_gaussian_nll_loss_cpu_float32 (__main__.TestDecompCPU)": 62.70066706339518,
"test_comprehensive_nn_functional_gaussian_nll_loss_cpu_float64 (__main__.TestDecompCPU)": 60.468666076660156,
"test_comprehensive_nn_functional_gaussian_nll_loss_cuda_float32 (__main__.TestDecompCUDA)": 116.34999974568684,
"test_comprehensive_nn_functional_gaussian_nll_loss_cuda_float64 (__main__.TestDecompCUDA)": 116.57566579182942,
"test_comprehensive_nn_functional_grid_sample_cpu_float32 (__main__.TestDecompCPU)": 115.4306640625,
"test_comprehensive_nn_functional_grid_sample_cpu_float64 (__main__.TestDecompCPU)": 114.67599741617839,
"test_comprehensive_nn_functional_grid_sample_cuda_bfloat16 (__main__.TestDecompCUDA)": 78.96566772460938,
"test_comprehensive_nn_functional_grid_sample_cuda_float16 (__main__.TestDecompCUDA)": 60.72616704305013,
"test_comprehensive_nn_functional_grid_sample_cuda_float32 (__main__.TestDecompCUDA)": 270.3598327636719,
"test_comprehensive_nn_functional_grid_sample_cuda_float64 (__main__.TestDecompCUDA)": 260.6623306274414,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float32 (__main__.TestDecompCUDA)": 88.48316701253255,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 78.13166681925456,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float64 (__main__.TestDecompCUDA)": 83.55450057983398,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 80.67749913533528,
"test_comprehensive_nn_functional_interpolate_trilinear_cuda_float32 (__main__.TestDecompCUDA)": 136.17766698201498,
"test_comprehensive_nn_functional_interpolate_trilinear_cuda_float64 (__main__.TestDecompCUDA)": 157.4010009765625,
"test_comprehensive_nn_functional_max_pool2d_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 1222.983662923177,
"test_comprehensive_nn_functional_max_pool2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 1228.281494140625,
"test_comprehensive_nn_functional_max_pool2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 1216.2643432617188,
"test_comprehensive_nn_functional_max_pool3d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 503.51465861002606,
"test_comprehensive_nn_functional_max_pool3d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 523.0736694335938,
"test_comprehensive_nn_functional_max_unpool2d_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 68.91749954223633,
"test_comprehensive_nn_functional_max_unpool2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 61.947166442871094,
"test_comprehensive_nn_functional_max_unpool2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 63.17983309427897,
"test_comprehensive_nn_functional_unfold_cuda_complex128 (__main__.TestDecompCUDA)": 77.92383321126302,
"test_comprehensive_nn_functional_unfold_cuda_complex64 (__main__.TestDecompCUDA)": 69.46137571334839,
"test_comprehensive_ormqr_cpu_complex64 (__main__.TestDecompCPU)": 62.2076670328776,
"test_comprehensive_ormqr_cuda_complex128 (__main__.TestDecompCUDA)": 139.3495012919108,
"test_comprehensive_ormqr_cuda_complex64 (__main__.TestDecompCUDA)": 124.99983469645183,
"test_comprehensive_ormqr_cuda_float32 (__main__.TestDecompCUDA)": 73.96983273824056,
"test_comprehensive_ormqr_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 73.27383422851562,
"test_comprehensive_ormqr_cuda_float64 (__main__.TestDecompCUDA)": 80.94216791788737,
"test_comprehensive_svd_cuda_complex128 (__main__.TestDecompCUDA)": 73.65583419799805,
"test_comprehensive_svd_cuda_complex64 (__main__.TestDecompCUDA)": 74.30566660563152,
"test_constructor_autograd_SparseBSC_cuda (__main__.TestSparseAnyCUDA)": 112.75583267211914,
"test_constructor_autograd_SparseBSR_cuda (__main__.TestSparseAnyCUDA)": 106.72283299763997,
"test_constructor_autograd_SparseCSC_cuda (__main__.TestSparseAnyCUDA)": 102.85349909464519,
"test_constructor_autograd_SparseCSR_cuda (__main__.TestSparseAnyCUDA)": 73.14683278401692,
"test_conv1d_basic (__main__.TestXNNPACKConv1dTransformPass)": 137.8197758992513,
"test_conv1d_with_relu_fc (__main__.TestXNNPACKConv1dTransformPass)": 437.60955386691626,
"test_conv2d_binary_broadcast_shapes_cpu (__main__.TestPatternMatcherGenericCPU)": 75.4076665242513,
"test_conv2d_binary_dynamic_shapes_cpu (__main__.TestDynamicPatternMatcherGenericCPU)": 62.40233357747396,
"test_conv3d_binary_broadcast_shapes_cpu (__main__.TestPatternMatcherGenericCPU)": 149.36666870117188,
"test_conv3d_binary_dynamic_shapes_cpu (__main__.TestDynamicPatternMatcherGenericCPU)": 72.90299987792969,
"test_conv_bn_fuse_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 81.56499862670898,
"test_conv_unary_fusion_nnc (__main__.TestMkldnnFusion)": 75.13744566175673,
"test_correctness_AdamW_use_closure_True_cuda_float32 (__main__.CompiledOptimizerParityTestsCUDA)": 82.20433298746745,
"test_correctness_Adam_use_closure_True_cuda_float32 (__main__.CompiledOptimizerParityTestsCUDA)": 76.78600056966145,
"test_count_nonzero_all (__main__.TestBool)": 655.6186726888021,
"test_cpu_gpu_parity_nn_Transformer_cuda_float64 (__main__.TestModuleCUDA)": 80.43400009940652,
"test_custom_module_lstm (__main__.TestQuantizedOps)": 798.5362040201823,
"test_ddp_uneven_inputs (__main__.TestDistBackendWithSpawn)": 360.75275349617004,
"test_diff_hyperparams_sharding_strategy_str_no_shard (__main__.TestFSDPUseOrigParamsMultipleParamGroups)": 60.4433339436849,
"test_dispatch_symbolic_meta_outplace_all_strides_nn_functional_gaussian_nll_loss_cuda_float32 (__main__.TestMetaCUDA)": 85.3961664835612,
"test_dtensor_op_db_nn_functional_gaussian_nll_loss_cpu_float32 (__main__.TestDTensorOpsCPU)": 93.10799916585286,
"test_eig_check_magma_cuda_float32 (__main__.TestLinalgCUDA)": 215.1919957002004,
"test_error_detection_and_propagation (__main__.NcclErrorHandlingTest)": 67.04866790771484,
"test_fail_arithmetic_ops.py (__main__.TestTyping)": 64.6271112230089,
"test_fail_creation_ops.py (__main__.TestTyping)": 71.04431086573108,
"test_fn_fwgrad_bwgrad_cumprod_cuda_complex128 (__main__.TestFwdGradientsCUDA)": 88.46849950154622,
"test_fn_gradgrad_cumprod_cuda_complex128 (__main__.TestBwdGradientsCUDA)": 107.12216822306316,
"test_fuse_large_params_cpu (__main__.CpuTests)": 80.30040054321289,
"test_fuse_large_params_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 162.87633260091147,
"test_fuse_large_params_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 160.84833441840277,
"test_fuse_large_params_dynamic_shapes_cuda (__main__.DynamicShapesCodegenGPUTests)": 153.62799580891928,
"test_fuse_large_params_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 120.26516850789388,
"test_grad_nn_Transformer_cpu_float64 (__main__.TestModuleCPU)": 62.87366739908854,
"test_grad_nn_Transformer_cuda_float64 (__main__.TestModuleCUDA)": 104.12133407592773,
"test_gradgrad_nn_LSTM_eval_mode_cuda_float64 (__main__.TestModuleCUDA)": 117.95999908447266,
"test_gradgrad_nn_LSTM_train_mode_cuda_float64 (__main__.TestModuleCUDA)": 113.97000122070312,
"test_gradgrad_nn_TransformerDecoderLayer_cuda_float64 (__main__.TestModuleCUDA)": 248.1183293660482,
"test_gradgrad_nn_TransformerEncoder_eval_mode_cuda_float64 (__main__.TestModuleCUDA)": 180.4351666768392,
"test_gradgrad_nn_TransformerEncoder_train_mode_cuda_float64 (__main__.TestModuleCUDA)": 160.81400299072266,
"test_gradgrad_nn_Transformer_cuda_float64 (__main__.TestModuleCUDA)": 694.055165608724,
"test_grid_sampler_2d_cpu_halide (__main__.HalideCpuTests)": 194.28900146484375,
"test_group_norm (__main__.TestQuantizedOps)": 207.3484410179986,
"test_indirect_device_assert (__main__.TritonCodeGenTests)": 329.52866617838544,
"test_inductor_no_recursionerror_on_for_loops_dynamic_shapes (__main__.DynamicShapesReproTests)": 67.15944459703233,
"test_inplace_gradgrad_cumprod_cuda_complex128 (__main__.TestBwdGradientsCUDA)": 84.40099970499675,
"test_inputs_overlapping_with_mutation_stress_dynamic_shapes (__main__.DynamicShapesAotAutogradFallbackTests)": 132.7371097140842,
"test_jit_cuda_archflags (__main__.TestCppExtensionJIT)": 118.91166687011719,
"test_linalg_solve_triangular_large_cuda_complex128 (__main__.TestLinalgCUDA)": 130.4806671142578,
"test_linalg_solve_triangular_large_cuda_complex64 (__main__.TestLinalgCUDA)": 101.25733184814453,
"test_linear (__main__.TestStaticQuantizedModule)": 131.34678183661566,
"test_linear_binary_cpp_wrapper (__main__.TestCppWrapper)": 124.32133229573567,
"test_linear_binary_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 126.89633433024089,
"test_linear_relu (__main__.TestStaticQuantizedModule)": 128.11266708374023,
"test_lobpcg_ortho_cuda_float64 (__main__.TestLinalgCUDA)": 75.69916741053264,
"test_longformer_chunk_dynamic_shapes (__main__.DynamicShapesReproTests)": 106.60366736518012,
"test_lstm_cpu (__main__.TestMkldnnCPU)": 66.15800094604492,
"test_many_overlapping_inputs_does_not_explode_guards_dynamic_shapes (__main__.DynamicShapesReproTests)": 130.17633226182727,
"test_max_autotune_addmm_max_autotune_gemm_backends_CK_x_shape2 (__main__.TestCKBackend)": 60.61724901199341,
"test_max_autotune_addmm_search_space_EXHAUSTIVE_dynamic_True (__main__.TestMaxAutotuneSubproc)": 82.76533508300781,
"test_max_autotune_precompile_matmul_max_autotune_gemm_backends_CKTILE_autotune_in_subproc_False_use_aoti_False (__main__.TestCKBackend)": 84.80249977111816,
"test_max_autotune_precompile_matmul_max_autotune_gemm_backends_CKTILE_autotune_in_subproc_True_use_aoti_False (__main__.TestCKBackend)": 82.48874931409955,
"test_max_pool2d2_cpu_halide (__main__.HalideCpuTests)": 421.6166585286458,
"test_max_pool2d3_cpu_halide (__main__.HalideCpuTests)": 133.6796671549479,
"test_max_pool2d5_cpu_halide (__main__.HalideCpuTests)": 357.6593322753906,
"test_max_pool2d_with_indices_backward4_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 63.8608890109592,
"test_max_pool2d_with_indices_backward4_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 64.60900031195746,
"test_proper_exit (__main__.TestDataLoader)": 223.7907740275065,
"test_proper_exit (__main__.TestDataLoaderPersistentWorkers)": 213.6155548095703,
"test_qat_conv2d_unary (__main__.TestQuantizePT2EX86Inductor)": 168.48199971516928,
"test_qat_conv_bn_fusion_no_conv_bias (__main__.TestQuantizePT2EQAT_ConvBn1d)": 68.48926869834342,
"test_qat_conv_bn_fusion_no_conv_bias (__main__.TestQuantizePT2EQAT_ConvBn2d)": 68.39782928838963,
"test_qat_mobilenet_v2 (__main__.TestQuantizePT2EQATModels)": 99.70321994357639,
"test_qat_resnet18 (__main__.TestQuantizePT2EQATModels)": 61.103378822063576,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 99.00533294677734,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 100.10599772135417,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True (__main__.TestPatternMatcher)": 75.0443344116211,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 91.9883321126302,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 100.07866668701172,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False (__main__.TestPatternMatcher)": 68.79566701253255,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 90.1106669108073,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 88.92966969807942,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True (__main__.TestPatternMatcher)": 75.10766855875652,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 103.41666666666667,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 96.1106669108073,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False (__main__.TestPatternMatcher)": 77.91766866048177,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 92.16766611735027,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 92.9856669108073,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 93.22266642252605,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 95.57533264160156,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False (__main__.TestPatternMatcher)": 70.04799906412761,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 90.56433359781902,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 92.017333984375,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 94.46166737874348,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 95.06233215332031,
"test_qrnncell (__main__.TestDynamicQuantizedOps)": 204.8830050362481,
"test_quick_core_backward__unsafe_masked_index_cpu_float64 (__main__.TestDecompCPU)": 584.1243489583334,
"test_quick_core_backward__unsafe_masked_index_cuda_float64 (__main__.TestDecompCUDA)": 1194.274678548177,
"test_quick_core_backward__unsafe_masked_index_put_accumulate_cpu_float64 (__main__.TestDecompCPU)": 842.1573282877604,
"test_quick_core_backward__unsafe_masked_index_put_accumulate_cuda_float64 (__main__.TestDecompCUDA)": 1500.2438354492188,
"test_quick_core_backward_nn_functional_max_unpool3d_grad_cpu_float64 (__main__.TestDecompCPU)": 80.01266479492188,
"test_quick_core_backward_nn_functional_max_unpool3d_grad_cuda_float64 (__main__.TestDecompCUDA)": 304.8406728108724,
"test_quick_core_backward_roll_cpu_float64 (__main__.TestDecompCPU)": 123.26833089192708,
"test_quick_core_backward_roll_cuda_float64 (__main__.TestDecompCUDA)": 289.4941685994466,
"test_quick_core_backward_select_scatter_cpu_float64 (__main__.TestDecompCPU)": 78.4913330078125,
"test_quick_core_backward_select_scatter_cuda_float64 (__main__.TestDecompCUDA)": 160.19433085123697,
"test_quick_core_backward_split_cuda_float64 (__main__.TestDecompCUDA)": 76.93316650390625,
"test_quick_core_backward_split_with_sizes_copy_cpu_float64 (__main__.TestDecompCPU)": 95.25599924723308,
"test_quick_core_backward_split_with_sizes_copy_cuda_float64 (__main__.TestDecompCUDA)": 190.9510014851888,
"test_quick_core_backward_std_cuda_float64 (__main__.TestDecompCUDA)": 115.96716562906902,
"test_register_spills_cuda (__main__.BenchmarkFusionCudaTest)": 85.82816696166992,
"test_replicatepad_64bit_indexing_cuda_float16 (__main__.TestNNDeviceTypeCUDA)": 64.81233215332031,
"test_runtime_checks_large_cpu (__main__.AOTInductorTestABICompatibleCpu)": 73.0594991048177,
"test_runtime_checks_large_cpu_with_stack_allocation (__main__.AOTInductorTestABICompatibleCpuWithStackAllocation)": 78.28866704305013,
"test_runtime_checks_large_cuda (__main__.AOTInductorTestABICompatibleGpu)": 203.66749827067056,
"test_save_load_large_string_attribute (__main__.TestSaveLoad)": 118.92166392008464,
"test_sdpa_kernel_ctx_manager2_dynamic_shapes (__main__.DynamicShapesCtxManagerTests)": 161.21966722276477,
"test_shuffler_iterdatapipe (__main__.IntegrationTestDataLoaderDataPipe)": 119.33677842881944,
"test_slow_tasks (__main__.TestFunctionalAutogradBenchmark)": 122.50711229112413,
"test_sort_stable_cpu (__main__.CpuTritonTests)": 77.22933451334636,
"test_split_cumsum_cpu (__main__.CpuTritonTests)": 89.92000071207683,
"test_std (__main__.TestQuantizedOps)": 118.49511219395532,
"test_svd_lowrank_cuda_complex128 (__main__.TestLinalgCUDA)": 149.61699732144675,
"test_tensor_split (__main__.TestVmapOperators)": 83.01314294423376,
"test_terminate_handler_on_crash (__main__.TestTorch)": 111.18021970325046,
"test_terminate_signal (__main__.ForkTest)": 131.81088901807865,
"test_terminate_signal (__main__.ParallelForkServerShouldWorkTest)": 131.90911058253712,
"test_terminate_signal (__main__.SpawnTest)": 135.51344219843546,
"test_triton_bsr_scatter_mm_blocksize_64_cuda_bfloat16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 71.71866671244304,
"test_triton_bsr_scatter_mm_blocksize_64_cuda_float16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 69.4015007019043,
"test_triton_bsr_scatter_mm_blocksize_64_cuda_float32 (__main__.TestSparseCompressedTritonKernelsCUDA)": 75.85683250427246,
"test_triton_bsr_softmax_cuda_bfloat16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 144.25,
"test_triton_bsr_softmax_cuda_float16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 142.70416514078775,
"test_triton_bsr_softmax_cuda_float32 (__main__.TestSparseCompressedTritonKernelsCUDA)": 105.90866597493489,
"test_unary_ops (__main__.TestTEFuserDynamic)": 83.01277730200026,
"test_unary_ops (__main__.TestTEFuserStatic)": 84.06699878639645,
"test_upsample_bicubic2d_cpu_halide (__main__.HalideCpuTests)": 97.28433227539062,
"test_variant_consistency_jit_nn_functional_max_pool2d_cpu_float32 (__main__.TestJitCPU)": 96.625,
"test_variant_consistency_jit_nn_functional_max_pool2d_cuda_float32 (__main__.TestJitCUDA)": 78.01066716512044,
"test_views1_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 82.23649978637695,
"test_vmapjvpvjp_linalg_lstsq_grad_oriented_cpu_float32 (__main__.TestOperatorsCPU)": 100.44966379801433,
"test_vmapjvpvjp_linalg_lstsq_grad_oriented_cuda_float32 (__main__.TestOperatorsCUDA)": 78.67900085449219,
"test_vmapjvpvjp_linalg_lu_solve_cpu_float32 (__main__.TestOperatorsCPU)": 75.2140007019043,
"test_vmapjvpvjp_linalg_lu_solve_cuda_float32 (__main__.TestOperatorsCUDA)": 100.80166753133138,
"test_vmapjvpvjp_linalg_multi_dot_cuda_float32 (__main__.TestOperatorsCUDA)": 96.56916745503743,
"test_vmapjvpvjp_linalg_svd_cuda_float32 (__main__.TestOperatorsCUDA)": 99.54433314005534,
"test_vmapjvpvjp_max_pool2d_with_indices_backward_cpu_float32 (__main__.TestOperatorsCPU)": 69.86966705322266,
"test_vmapjvpvjp_max_pool2d_with_indices_backward_cuda_float32 (__main__.TestOperatorsCUDA)": 103.45650100708008,
"test_vmapjvpvjp_nn_functional_conv2d_cpu_float32 (__main__.TestOperatorsCPU)": 69.28766759236653,
"test_vmapjvpvjp_nn_functional_max_pool2d_cpu_float32 (__main__.TestOperatorsCPU)": 70.02966690063477,
"test_vmapjvpvjp_nn_functional_max_pool2d_cuda_float32 (__main__.TestOperatorsCUDA)": 100.93566703796387,
"test_vmapjvpvjp_svd_cuda_float32 (__main__.TestOperatorsCUDA)": 94.60433260599773,
"test_vmapjvpvjp_unbind_cuda_float32 (__main__.TestOperatorsCUDA)": 98.65516599019368,
"test_vmapvjpvjp_meshgrid_list_of_tensors_cuda_float32 (__main__.TestOperatorsCUDA)": 105.35816828409831,
"test_vmapvjpvjp_meshgrid_variadic_tensors_cuda_float32 (__main__.TestOperatorsCUDA)": 74.68983332316081,
"test_vmapvjpvjp_nn_functional_bilinear_cuda_float32 (__main__.TestOperatorsCUDA)": 152.76449966430664
}

View File

@@ -7160,6 +7160,67 @@ class TestCompileKernel(TestCase):
        expected = torch.full((n,), test_value, device="cuda", dtype=torch.float16)
        torch.testing.assert_close(output, expected, rtol=1e-3, atol=1e-3)

    @unittest.skipIf(not TEST_CUDA, "No CUDA")
    def test_compile_kernel_template(self):
        kernel_source = """
        template<typename T>
        __global__ void add_tensors(const T* a, const T* b, T* c, int n) {
            int i = threadIdx.x + blockIdx.x * blockDim.x;
            if (i < n)
                c[i] = a[i] + b[i];
        }
        """

        # Compile the kernel
        from torch.cuda import _compile_kernel

        add_kernel_float = _compile_kernel(kernel_source, "add_tensors<float>")

        # Prepare data
        N = 1024
        a = torch.rand(N, device="cuda")
        b = torch.rand(N, device="cuda")
        c = torch.empty_like(a)

        # Calculate grid and block dimensions
        threads_per_block = 256
        blocks_per_grid = (N + threads_per_block - 1) // threads_per_block

        # Launch kernel
        add_kernel_float(
            grid=(blocks_per_grid, 1, 1),
            block=(threads_per_block, 1, 1),
            args=[a, b, c, N],
        )

        # Verify results
        expected = a + b
        self.assertEqual(c, expected)

        # do again with different dtype
        add_kernel_int = _compile_kernel(kernel_source, "add_tensors<int>")

        # Prepare data
        N = 1024
        a = torch.randint(-1000, 1000, size=(N,), dtype=torch.int, device="cuda")
        b = torch.randint(-1000, 1000, size=(N,), dtype=torch.int, device="cuda")
        c = torch.empty_like(a)

        # Calculate grid and block dimensions
        threads_per_block = 256
        blocks_per_grid = (N + threads_per_block - 1) // threads_per_block

        # Launch kernel
        add_kernel_int(
            grid=(blocks_per_grid, 1, 1),
            block=(threads_per_block, 1, 1),
            args=[a, b, c, N],
        )

        # Verify results
        expected = a + b
        self.assertEqual(c, expected)
@unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
class TestCudaDeviceParametrized(TestCase):

View File

@@ -3685,6 +3685,21 @@ def forward(self, arg0_1: "i64[2][1]cpu", arg1_1: "Sym(u2)", arg2_1: "Sym(u3)",
        out = torch.compile(f)(idx, x)
        self.assertEqual(out, f(idx, x))

    def test_trunc_int_div_true(self):
        @torch.compile(backend="inductor", dynamic=True, fullgraph=True)
        def f(x, s13, s57, s77):
            torch._check(s13 >= 0)
            torch._check(s57 >= 0)
            torch._check(s77 >= 0)
            if int(s13 * ((s57 // s13) + (s77 // s13)) / s13) >= 1:
                return x * 2
            else:
                return x * 100

        # ensure we compile this with no errors.
        x = torch.rand(10)
        f(x, 4, 4096, 3920)
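        # With the arguments above (s13=4, s57=4096, s77=3920): 4096 // 4 = 1024 and
        # 3920 // 4 = 980, so int(4 * (1024 + 980) / 4) = int(2004.0) = 2004 >= 1 and
        # the x * 2 branch is taken; the test only asserts that compilation succeeds.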
instantiate_parametrized_tests(TestUnbacked)

View File

@@ -940,6 +940,8 @@ def generate_tensor_like_override_tests(cls):
return None
elif arg_type == "ScalarType":
return torch.float32
elif arg_type == "c10::string_view":
return ""
elif arg_type in ("std::string_view", "::std::string_view"):
return ""
elif arg_type == "SymInt":

View File

@@ -383,13 +383,14 @@ class TestScatterGather(TestCase):
@dtypes(torch.float32)
def test_scatter_add_broadcasted_index_deterministic(self, device, dtype):
for d in (0, 1):
inp = torch.randn(3, 4, device=device, dtype=dtype)
inp = torch.randn(3, 4, 5, device=device, dtype=dtype)
idx_1d = torch.randint(3, (10,), device=device)
src_shape = list(inp.shape)
src_shape[d] = 10
src = torch.randn(src_shape, device=device, dtype=dtype)
idx = idx_1d.unsqueeze(1 - d).expand(src_shape)
print(idx.stride())
idx_view_shape = [1] * inp.ndim
idx_view_shape[d] = 10
idx = idx_1d.view(idx_view_shape).expand(src_shape)
ref = inp.clone().scatter_add_(d, idx, src)
with DeterministicGuard(True):
res = inp.clone().scatter_add_(d, idx, src)

View File

@@ -1164,9 +1164,9 @@ class TestSparse(TestSparseBase):
"Concatenating sparse tensors, but a dense tensor was found at position 1."):
torch.cat((sp, dn))
@expectedFailureMPS
@coalescedonoff
@dtypes(torch.double, torch.cdouble)
@dtypesIfMPS(torch.float32, torch.complex64)
def test_unsqueeze(self, device, dtype, coalesced):
def test_shape(sparse_dims, nnz, sizes, unsqueeze_dim, fail_message=None):
x, _, _ = self._gen_sparse(sparse_dims, nnz, sizes, dtype, device, coalesced)
@@ -2353,14 +2353,14 @@ class TestSparse(TestSparseBase):
self.assertTrue(result.layout == torch.strided)
with self.assertRaisesRegex(
RuntimeError, r"Could not run 'aten::empty_strided' with arguments from the 'Sparse(CPU|CUDA)' backend"
RuntimeError, r"Could not run 'aten::empty_strided' with arguments from the 'Sparse(CPU|CUDA|MPS)' backend"
):
dense_tensor = sparse_tensor.to_dense()
result = torch.empty_like(dense_tensor, layout=torch.sparse_coo)
@coalescedonoff
@expectedFailureMPS
@dtypes(torch.double, torch.cdouble)
@dtypesIfMPS(torch.float32, torch.complex64)
def test_empty_like(self, device, dtype, coalesced):
# tests https://github.com/pytorch/pytorch/issues/43699
@@ -3310,7 +3310,6 @@ class TestSparse(TestSparseBase):
sp_tensor_loaded = pickle.loads(serialized)
self.assertEqual(sp_tensor, sp_tensor_loaded)
@expectedFailureMPS
def test_any(self, device):
t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([False, False]), device=device)
t_any = torch.tensor(False)

View File

@@ -1654,6 +1654,15 @@ class TestUnaryUfuncs(TestCase):
),
)
# empty input
# https://github.com/pytorch/pytorch/issues/162473
input_tensor = torch.tensor([], device=device)
static_size = 1
self.assertEqual(
torch.nonzero_static(input_tensor, size=static_size),
torch.tensor([[-1]], device=device),
)
# 1D input
input_tensor = torch.tensor([0, 8], device=device)
static_size = 1

View File

@@ -969,7 +969,7 @@ def saved_variables(
if nctype.type == OptionalCType(BaseCType(stringT)):
formula = re.sub(
rf"\b{name}\b",
f"{name}.has_value() ? std::optional<::std::string_view>({name}.value()) : std::nullopt",
f"{name}.has_value() ? std::optional<std::string_view>({name}.value()) : std::nullopt",
formula,
)

View File

@@ -12,7 +12,6 @@ BU
contiguities
contiguity
coo
DEPENDEES
deser
din
dout

View File

@@ -6,6 +6,7 @@ import inspect
import logging
import pickle
import types
from contextlib import AbstractContextManager, ExitStack
from dataclasses import dataclass
from typing import Any, Callable, Optional
@@ -264,48 +265,141 @@ def aot_compile_fullgraph(
assert check_fn.guards_state is not None
backend_input = capture_output.backend_input
assert backend_input is not None
backend_input.graph_module._backend_id = backend_input.backend_id # type: ignore[assignment]
output_graph = dynamo_output.tracer_output.output_graph
assert output_graph is not None
use_cuda = _graph_uses_non_cpu(output_graph.current_tracer.graph)
backend_input = capture_output.backend_input
assert backend_input is not None
backend_input.graph_module._backend_id = backend_input.backend_id # type: ignore[assignment]
output_graph = dynamo_output.tracer_output.output_graph
assert output_graph is not None
use_cuda = _graph_uses_non_cpu(output_graph.current_tracer.graph)
import_sources = output_graph.import_sources
with (
torch._guards.tracing(TracingContext(backend_input.fake_mode)),
torch._functorch.config.patch(
{
"bundled_autograd_cache": True,
"force_non_lazy_backward_lowering": True,
}
),
):
compiled_fn = backend(
backend_input.graph_module, backend_input.example_inputs
)
import_sources = output_graph.import_sources
with (
torch._guards.tracing(TracingContext(backend_input.fake_mode)),
torch._functorch.config.patch("bundled_autograd_cache", True),
):
compiled_fn = backend(backend_input.graph_module, backend_input.example_inputs)
# If Inductor backend is used, grab the compiled_fn from PrecompileContext
# TODO: this should be replaced once we make the backend return the SerializableCallable directly.
if isinstance(backend, torch._TorchCompileInductorWrapper):
compiled_fn = BundledAOTAutogradSerializableCallable.from_backend_id(
backend_input.backend_id
)
# If Inductor backend is used, grab the compiled_fn from PrecompileContext
# TODO: this should be replaced once we make the backend return the SerializableCallable directly.
if isinstance(backend, torch._TorchCompileInductorWrapper):
compiled_fn = BundledAOTAutogradSerializableCallable.from_backend_id(
backend_input.backend_id
if not isinstance(compiled_fn, SerializableCallable):
if hasattr(backend, "compiler_fn"):
compiler_fn = backend.compiler_fn
else:
compiler_fn = backend
raise RuntimeError(
f"Compiled function type {type(compiled_fn)} (produced "
+ f"from backend {compiler_fn}) does not implement SerializableCallable."
)
artifacts = CompileArtifacts(
signature=signature,
bytecode=dynamo_output.bytecode,
guard_manager=check_fn.guard_manager,
guards_state=check_fn.guards_state,
import_sources=import_sources,
backend_id=backend_input.backend_id,
compiled_fn=compiled_fn,
original_code=fn.__code__,
closure=fn.__closure__,
use_cuda=use_cuda,
)
aot_compiled_fn = AOTCompiledFunction(_artifacts=artifacts)
if not isinstance(compiled_fn, SerializableCallable):
if hasattr(backend, "compiler_fn"):
compiler_fn = backend.compiler_fn
else:
compiler_fn = backend
raise RuntimeError(
f"Compiled function type {type(compiled_fn)} (produced "
+ f"from backend {compiler_fn}) does not implement SerializableCallable."
)
artifacts = CompileArtifacts(
signature=signature,
bytecode=dynamo_output.bytecode,
guard_manager=check_fn.guard_manager,
guards_state=check_fn.guards_state,
import_sources=import_sources,
backend_id=backend_input.backend_id,
compiled_fn=compiled_fn,
original_code=fn.__code__,
closure=fn.__closure__,
use_cuda=use_cuda,
)
aot_compiled_fn = AOTCompiledFunction(_artifacts=artifacts)
return aot_compiled_fn

@dataclass
class ModelInput:
    """
    WIP type: represents a single model input, consisting of a tuple of
    arguments and a set of contexts in which to run the model.

    For each ModelInput we compile one full graph of the model, then use the
    generated guards to dispatch between the compiled graphs.
    """

    args: tuple[Any]
    kwargs: dict[str, Any]
    contexts: list[AbstractContextManager[Any]]


@dataclass
class AOTCompiledModel:
    # Represents a single forward function of a model along with dispatch logic.
    # compiled_results is serializable; the model itself is required again to deserialize.
    model: torch.nn.Module
    compiled_results: list[AOTCompiledFunction]

    def __call__(self, *args: Any, **kwargs: Any) -> Any:
        for result in self.compiled_results:
            if result.guard_check(self.model, *args, **kwargs):
                return result(self.model, *args, **kwargs)
        # All guards failed; just run one of them so the guard check error is raised.
        return self.compiled_results[0](self.model, *args, **kwargs)

    def serialize(self) -> bytes:
        data: list[bytes] = []
        for result in self.compiled_results:
            data.append(AOTCompiledFunction.serialize(result))
        return pickle.dumps(data)

    @classmethod
    def deserialize(cls, model: torch.nn.Module, data: bytes) -> "AOTCompiledModel":
        from torch._dynamo.utils import get_metrics_context
        from torch._guards import compile_context, CompileContext

        results: list[bytes] = pickle.loads(data)
        compiled_results = []
        for result in results:
            with (
                compile_context(CompileContext(convert_frame.get_compile_id({}))),
                get_metrics_context(),
            ):
                compiled_results.append(AOTCompiledFunction.deserialize(result))
        return cls(model, compiled_results)


def aot_compile_module(
    model: torch.nn.Module,
    inputs: list[ModelInput],
    hooks: Hooks,
    backend: Callable[[torch.fx.GraphModule, list[torch.Tensor]], SerializableCallable],
) -> AOTCompiledModel:
    """
    Compiles a single nn.Module with any number of inputs and returns a compiled forward function.
    """

    def compile_single_graph(model_input: ModelInput) -> AOTCompiledFunction:
        example_inputs = (model_input.args, model_input.kwargs)
        orig_forward = model.forward
        with ExitStack() as stack:
            for ctx in model_input.contexts:
                stack.enter_context(ctx)
            return aot_compile_fullgraph(
                orig_forward,
                example_inputs,
                hooks=hooks,
                backend=backend,
            )

    compiled_results = []
    for model_input in inputs:
        log.info("Compiling input %s...", model_input)
        compiled_results.append(compile_single_graph(model_input))

    assert len(compiled_results) > 0
    return AOTCompiledModel(model, compiled_results)
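
To make the dispatch contract above concrete, here is a small self-contained mock of the pattern AOTCompiledModel.__call__ implements (try each result's guard_check, and fall back to the first entry so its guard error surfaces). The _FakeCompiledResult class and dispatch helper are purely illustrative and not part of this diff.

# Toy stand-in for AOTCompiledFunction: a guard predicate plus a callable.
from dataclasses import dataclass
from typing import Any, Callable


@dataclass
class _FakeCompiledResult:
    guard: Callable[..., bool]
    fn: Callable[..., Any]

    def guard_check(self, model: Any, *args: Any, **kwargs: Any) -> bool:
        return self.guard(*args, **kwargs)

    def __call__(self, model: Any, *args: Any, **kwargs: Any) -> Any:
        return self.fn(*args, **kwargs)


def dispatch(model: Any, compiled_results: list, *args: Any, **kwargs: Any) -> Any:
    # Mirrors AOTCompiledModel.__call__: the first result whose guards pass wins;
    # if none pass, rerun the first one so its guard failure is what gets raised.
    for result in compiled_results:
        if result.guard_check(model, *args, **kwargs):
            return result(model, *args, **kwargs)
    return compiled_results[0](model, *args, **kwargs)


results = [
    _FakeCompiledResult(lambda x: len(x) == 2, lambda x: [v * 2 for v in x]),
    _FakeCompiledResult(lambda x: len(x) == 3, lambda x: [v * 3 for v in x]),
]
print(dispatch(None, results, [1, 2]))     # first guard passes -> [2, 4]
print(dispatch(None, results, [1, 2, 3]))  # dispatches to the second entry -> [3, 6, 9]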

View File

@@ -413,6 +413,57 @@ class OptimizedModule(torch.nn.Module):
        )
        return super().__call__(*args, **kwargs)

    def _aot_compile(self, inputs: list[torch._dynamo.aot_compile.ModelInput]) -> None:
        """
        Experimental: AOT compile a set of inputs and use the result as the forward function.
        """
        model = self._orig_mod
        hooks = self.dynamo_ctx._hooks
        assert hooks is not None
        if not config.enable_aot_compile:
            raise RuntimeError(
                "AOT Compile is not enabled, please set torch._dynamo.config.enable_aot_compile=True"
            )
        if not self.dynamo_ctx.fullgraph:
            raise RuntimeError(
                "Graph breaks are not supported with aot compile. Please use torch.compile(fullgraph=True)."
            )
        if not callable(self.dynamo_ctx.callback):
            raise RuntimeError("aot compile requires a callable dynamo callback.")
        backend = innermost_fn(
            self.dynamo_ctx.callback, unaltered_fn_attr="_torchdynamo_orig_backend"
        )
        from torch._dynamo.aot_compile import aot_compile_module

        self.forward = aot_compile_module(model, inputs, hooks, backend)

    def _save_aot_compiled_module(self, path: Optional[str] = None) -> bytes:
        if not config.enable_aot_compile:
            raise RuntimeError(
                "AOT Compile is not enabled, please set torch._dynamo.config.enable_aot_compile=True"
            )
        from torch._dynamo.aot_compile import AOTCompiledModel

        assert isinstance(self.forward, AOTCompiledModel)
        result: bytes = self.forward.serialize()
        if path is not None:
            with open(path, "wb") as f:
                f.write(result)
        return result

    def _load_aot_compiled_module(self, data: bytes) -> None:
        if not config.enable_aot_compile:
            raise RuntimeError(
                "AOT Compile is not enabled, please set torch._dynamo.config.enable_aot_compile=True"
            )
        from torch._dynamo.aot_compile import AOTCompiledModel

        compiled_forward = AOTCompiledModel.deserialize(self._orig_mod, data)
        assert isinstance(compiled_forward, AOTCompiledModel)
        self.forward = compiled_forward

    def __reduce__(
        self,
    ) -> tuple[type[OptimizedModule], tuple[torch.nn.Module, _TorchDynamoContext]]:
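
For orientation, a minimal end-to-end sketch of how these experimental hooks appear intended to be used. Every name below (torch._dynamo.config.enable_aot_compile, ModelInput, _aot_compile, _save_aot_compiled_module, _load_aot_compiled_module) is taken from the hunks in this diff; this is a private, in-flux API, so the snippet is illustrative only and not guaranteed to run on any particular build.

# Hedged sketch only: relies on the experimental API introduced in this diff.
import torch
from torch._dynamo.aot_compile import ModelInput

torch._dynamo.config.enable_aot_compile = True

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU())
opt = torch.compile(model, fullgraph=True)

# Compile one full graph per ModelInput; the guards generated for each graph
# are what AOTCompiledModel uses to dispatch at call time.
opt._aot_compile([ModelInput(args=(torch.randn(2, 8),), kwargs={}, contexts=[])])

blob = opt._save_aot_compiled_module()  # bytes; pass path= to also write a file
opt._load_aot_compiled_module(blob)     # restores forward from the serialized results
out = opt(torch.randn(2, 8))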

View File

@@ -306,6 +306,8 @@ class AOTAutogradCacheDetails(FxGraphHashDetails):
self,
gm: torch.fx.GraphModule,
):
assert has_triton_package(), "Triton is not available"
triton_kernels = []
for module in gm.modules():
if not isinstance(module, torch.fx.GraphModule):
@@ -331,6 +333,11 @@ class AOTAutogradCacheDetails(FxGraphHashDetails):
)
for kernel in triton_kernels:
from triton.runtime.autotuner import Autotuner
if isinstance(kernel, Autotuner):
# Grab the Inner JITFunction
kernel = kernel.fn
source_codes = user_defined_triton_kernel_transitive_closure_source_code(
kernel
)
@@ -355,7 +362,8 @@ class AOTAutogradCacheDetails(FxGraphHashDetails):
[],
[],
)
self.triton_kernel_source_codes = self.get_triton_source_codes_from_gm(gm)
if has_triton_package():
self.triton_kernel_source_codes = self.get_triton_source_codes_from_gm(gm)
if hasattr(gm, "saved_tensors_hooks_pack_0"):

View File

@@ -1072,6 +1072,7 @@ def aot_module_simplified(
boxed_forward_device_index,
ignore_shape_env,
flatten=False,
force_non_lazy_backward_lowering=config.force_non_lazy_backward_lowering,
)
compiled_fn = None

View File

@@ -296,6 +296,11 @@ fake_tensor_prefer_device_type: Optional[str] = None
# TODO: turn on by default
graphsafe_rng_functionalization = True
# Whether or not to eagerly compile the backward
# used by AOT compile and other settings
# TODO: once AOT compile calls aot autograd directly instead of
# through compile_fx, we can remove this
force_non_lazy_backward_lowering = False
# Error on BypassAOTAutogradCache instead of just a warning
# Used for tests

Some files were not shown because too many files have changed in this diff.