Compare commits


7 Commits

Author SHA1 Message Date
a3795cfaea lint 2025-09-16 11:00:20 -07:00
a9d5c00727 more memory? 2025-09-15 21:42:41 +00:00
c1102ca308 more memory? 2025-09-15 21:37:26 +00:00
7856f8d7f4 fixed i think 2025-09-13 13:52:39 +00:00
973c3b531a oops 2025-09-13 13:52:39 +00:00
d5a496e7f1 check 2025-09-13 13:52:39 +00:00
051e544ef6 [BE] Make PyObjectSlot use a global PyInterpreter
ghstack-source-id: 826f3f0c155cd0677776949b0c1c82395a95a1e8
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158409

[BE] Remove pyinterpreter struct from pyobjslot

ghstack-source-id: 826f3f0c155cd0677776949b0c1c82395a95a1e8
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159511
2025-09-13 13:52:39 +00:00
380 changed files with 3777 additions and 10082 deletions


@ -7,4 +7,4 @@ set -ex
SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh


@ -0,0 +1,40 @@
#!/bin/bash
# This is where the local pytorch install in the docker image is located
pt_checkout="/var/lib/jenkins/workspace"
source "$pt_checkout/.ci/pytorch/common_utils.sh"
echo "functorch_doc_push_script.sh: Invoked with $*"
set -ex -o pipefail
version=${DOCS_VERSION:-nightly}
echo "version: $version"
# Build functorch docs
pushd $pt_checkout/functorch/docs
make html
popd
git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages
pushd functorch_ghpages
if [ "$version" == "main" ]; then
version=nightly
fi
git rm -rf "$version" || true
mv "$pt_checkout/functorch/docs/build/html" "$version"
git add "$version" || true
git status
git config user.email "soumith+bot@pytorch.org"
git config user.name "pytorchbot"
# If there aren't changes, don't make a commit; push is no-op
git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true
git status
if [[ "${WITH_PUSH:-}" == true ]]; then
git push -u origin gh-pages
fi
popd


@ -1,25 +0,0 @@
From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001
From: Michael Wang <13521008+isVoid@users.noreply.github.com>
Date: Tue, 1 Apr 2025 17:28:05 -0700
Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage
(#185)
Co-authored-by: isVoid <isVoid@users.noreply.github.com>
---
numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
index 1641bf77..233e9ed7 100644
--- a/numba_cuda/numba/cuda/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -365,6 +365,9 @@ def _find_api(self, fname):
else:
variants = ('_v2', '')
+ if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
+ return getattr(self.lib, fname)
+
for variant in variants:
try:
return getattr(self.lib, f'{fname}{variant}')


@ -32,16 +32,6 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v
git config --global --add safe.directory /var/lib/jenkins/workspace
fi
# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
if [ -n "$NUMBA_CUDA_DIR" ]; then
NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
pushd "$NUMBA_CUDA_DIR"
patch -p4 <"$NUMBA_PATCH"
popd
fi
echo "Environment variables:"
env
@ -1582,7 +1572,6 @@ test_linux_aarch64() {
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
# Dynamo tests


@ -21,7 +21,6 @@ self-hosted-runner:
- linux.arm64.2xlarge.ephemeral
- linux.arm64.m7g.4xlarge
- linux.arm64.m7g.4xlarge.ephemeral
- linux.arm64.r7g.12xlarge.memory
- linux.4xlarge.nvidia.gpu
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu


@ -1 +1 @@
87ff22e49ed0e92576c4935ccb8c143daac4a3cd
caba63f0fa29ef9e3d566699f32f11c07c8bda4e


@ -1 +1 @@
d119fc86140785e7efc8f125c17153544d1e0f20
f510715882304796a96e33028b4f6de1b026c2c7


@ -1 +1 @@
c77852e117bdf056c8e9a087e51d6f65cf6ba53d
6c5478ff7c3d50dd1e3047d72ec5909bea474073


@ -82,10 +82,16 @@ RUN if command -v apt-get >/dev/null; then \
apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim; \
else \
dnf install -y git curl wget sudo; \
dnf install -y git curl wget sudo vim; \
fi \
&& python3 --version && python3 -m pip --version
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version >/dev/null 2>&1; then \
@ -214,16 +220,11 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..."; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
SCCACHE_ARCHIVE="sccache-v0.8.1-aarch64-unknown-linux-musl"; \
else \
SCCACHE_ARCHIVE="sccache-v0.8.1-x86_64-unknown-linux-musl"; \
fi; \
curl -L -o sccache.tar.gz "https://github.com/mozilla/sccache/releases/download/v0.8.1/${SCCACHE_ARCHIVE}.tar.gz" \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv "${SCCACHE_ARCHIVE}"/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz "${SCCACHE_ARCHIVE}" \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
@ -284,7 +285,7 @@ RUN if command -v apt-get >/dev/null; then \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
else \
dnf install -y git curl wget sudo; \
dnf install -y git curl wget sudo vim; \
fi \
&& python3 --version && python3 -m pip --version
@ -297,6 +298,12 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \


@ -84,9 +84,6 @@ repackage_wheel() {
rm -rf $package
}
# Require to re-package the wheel
${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1
pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do
repackage_wheel $package


@ -75,6 +75,10 @@ jobs:
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 30m to finish python docs unless there are issues
timeout-minutes: 30
- docs_type: functorch
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 15m to finish functorch docs unless there are issues
timeout-minutes: 15
# Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
# The current name requires updating the database last docs push query from test-infra every time the matrix is updated
name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}
@ -207,6 +211,16 @@ jobs:
path: cppdocs/
s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs
- name: Upload functorch Docs Preview
uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }}
with:
retention-days: 14
s3-bucket: doc-previews
if-no-files-found: error
path: functorch_ghpages/nightly/
s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()


@ -169,7 +169,7 @@ jobs:
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }}
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
- name: Setup GPU_FLAG for docker run


@ -62,11 +62,6 @@ on:
required: false
type: number
default: 1
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@ -81,9 +76,10 @@ jobs:
strategy:
matrix: ${{ fromJSON(inputs.test-matrix) }}
fail-fast: false
runs-on: ${{ matrix.runner }}
timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
runs-on: ${{ matrix.runner }}
steps:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
@ -135,9 +131,6 @@ jobs:
- name: Start monitoring script
id: monitor-script
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
env:
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
@ -145,6 +138,9 @@ jobs:
WORKFLOW_RUN_ID: ${{github.run_id}}
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
run: |
python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
@ -182,12 +178,6 @@ jobs:
run: |
echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
- name: Preserve github env variables for use in docker
shell: bash
run: |
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test
id: test
env:
@ -203,22 +193,20 @@ jobs:
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
run: |
set -x
@ -248,7 +236,6 @@ jobs:
-e GITHUB_RUN_ATTEMPT \
-e JOB_ID \
-e JOB_NAME \
-e BASE_SHA \
-e BRANCH \
-e SHA1 \
-e AWS_DEFAULT_REGION \
@ -266,12 +253,10 @@ jobs:
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
-e TESTS_TO_INCLUDE \
-e HUGGING_FACE_HUB_TOKEN \
-e DASHBOARD_TAG \
--env-file="${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" \
--ulimit stack=10485760:83886080 \
--ulimit core=0 \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="8g" \


@ -12,9 +12,6 @@ on:
paths:
- .github/workflows/build-vllm-wheel.yml
- .github/ci_commit_pins/vllm.txt
schedule:
# every morning at 01:30PM UTC, 9:30AM EST, 6:30AM PST
- cron: 30 13 * * *
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -27,33 +24,21 @@ jobs:
fail-fast: false
matrix:
python-version: [ '3.12' ]
# TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
# TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554
device: [ 'cu128', 'cu129' ]
runner: [ 'linux.12xlarge.memory' ]
include:
- platform: manylinux_2_28_x86_64
device: cu128
- device: cu128
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_x86_64
device: cu129
- device: cu129
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu128
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8'
runner: linux.arm64.r7g.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu129
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9'
runner: linux.arm64.r7g.12xlarge.memory
name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
name: "Build ${{ matrix.device }} vLLM wheel"
runs-on: ${{ matrix.runner }}
timeout-minutes: 480
env:
PY_VERS: ${{ matrix.python-version }}
MANYLINUX_IMAGE: ${{ matrix.manylinux-image }}
PLATFORM: ${{ matrix.platform }}
PLATFORM: 'manylinux_2_28_x86_64'
BUILD_DEVICE: ${{ matrix.device }}
steps:
- name: Setup SSH (Click me for login details)
@ -151,7 +136,7 @@ jobs:
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: vllm-wheel-${{ matrix.device }}-${{ matrix.platform }}-${{ matrix.python-version }}
name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }}
if-no-files-found: error
path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl
@ -161,29 +146,27 @@ jobs:
# Copied from build-triton-wheel workflow (mostly)
upload-wheel:
name: "Upload ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
name: "Upload ${{ matrix.device }} vLLM wheel"
needs:
- build-wheel
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129' ]
env:
PLATFORM: ${{ matrix.platform }}
BUILD_DEVICE: ${{ matrix.device }}
permissions:
id-token: write
contents: read
container:
image: continuumio/miniconda3:4.12.0
environment: ${{ ((github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && 'nightly-wheel-upload' || '' }}
environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }}
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Configure AWS credentials(PyTorch account) for main
if: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels
@ -207,15 +190,15 @@ jobs:
run: |
set -eux
mkdir -p "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-"${PLATFORM}"-*/* "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/"
- name: Set DRY_RUN
if: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
shell: bash
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}
shell: bash
run: |


@ -43,11 +43,6 @@ on:
required: false
type: boolean
default: false
freezing:
description: Run freezing?
required: false
type: boolean
default: true
benchmark_configs:
description: The list of configs used the benchmark
required: false
@ -107,7 +102,7 @@ jobs:
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
timeout-minutes: 720
@ -121,9 +116,10 @@ jobs:
name: inductor-test
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-${{ inputs.training || 'false' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'true' }}-aotinductor-${{ inputs.aotinductor || 'true' }}-freezing-${{ inputs.freezing || 'true' }}
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
timeout-minutes: 720


@ -39,7 +39,7 @@ jobs:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0;8.6'
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -62,7 +62,7 @@ jobs:
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },

.gitignore (vendored)

@ -389,5 +389,3 @@ android/pytorch_android_torchvision/.cxx
# Claude Code local configuration
CLAUDE.local.md
/test_*.py
/debug_*.py


@ -964,6 +964,7 @@ exclude_patterns = [
'test/jit/**', # should be run through test/test_jit.py
'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py
'test/fx/**', # should be run through test/test_fx.py
'test/bottleneck_test/**', # excluded by test/run_test.py
'test/package/**', # excluded by test/run_test.py
'test/distributed/argparse_util_test.py',
'test/distributed/bin/test_script.py',
@ -1409,6 +1410,8 @@ exclude_patterns = [
'torch/utils/benchmark/utils/timer.py',
'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py',
'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py',
'torch/utils/bottleneck/__init__.py',
'torch/utils/bottleneck/__main__.py',
'torch/utils/bundled_inputs.py',
'torch/utils/checkpoint.py',
'torch/utils/collect_env.py',


@ -874,7 +874,7 @@ cmake_dependent_option(
"Whether to build the flash_attention kernel for scaled dot product attention.\
Will be disabled if not supported by the platform"
ON
"(USE_CUDA AND NOT MSVC) OR USE_ROCM"
"USE_CUDA OR USE_ROCM"
OFF)
cmake_dependent_option(
@ -891,7 +891,7 @@ IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
endif()
# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100.
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32)
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a")
set(USE_FBGEMM_GENAI ON)
endif()


@ -65,24 +65,14 @@ DLDataType getDLDataType(const Tensor& t) {
break;
// TODO(#146647): use macro here instead of spelling out each shell dtype
case ScalarType::Float8_e5m2:
dtype.code = DLDataTypeCode::kDLFloat8_e5m2;
break;
case ScalarType::Float8_e5m2fnuz:
dtype.code = DLDataTypeCode::kDLFloat8_e5m2fnuz;
break;
case ScalarType::Float8_e4m3fn:
dtype.code = DLDataTypeCode::kDLFloat8_e4m3fn;
break;
case ScalarType::Float8_e4m3fnuz:
dtype.code = DLDataTypeCode::kDLFloat8_e4m3fnuz;
break;
case ScalarType::Float8_e8m0fnu:
dtype.code = DLDataTypeCode::kDLFloat8_e8m0fnu;
TORCH_CHECK_BUFFER(false, "float8 types are not supported by dlpack");
break;
case ScalarType::Float4_e2m1fn_x2:
dtype.code = DLDataTypeCode::kDLFloat4_e2m1fn;
dtype.lanes = 2;
dtype.bits = 4;
TORCH_CHECK_BUFFER(false, "float4 types are not supported by dlpack");
break;
case ScalarType::QInt8:
case ScalarType::QUInt8:
@ -187,11 +177,7 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat
ScalarType toScalarType(const DLDataType& dtype) {
ScalarType stype = ScalarType::Undefined;
if (dtype.code != DLDataTypeCode::kDLFloat4_e2m1fn) {
TORCH_CHECK_BUFFER(
dtype.lanes == 1,
"ATen does not support lanes != 1 for dtype code", std::to_string(dtype.code));
}
TORCH_CHECK_BUFFER(dtype.lanes == 1, "ATen does not support lanes != 1");
switch (dtype.code) {
case DLDataTypeCode::kDLUInt:
switch (dtype.bits) {
@ -283,73 +269,6 @@ ScalarType toScalarType(const DLDataType& dtype) {
false, "Unsupported kDLBool bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e5m2:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e5m2;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e5m2 bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e5m2fnuz:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e5m2fnuz;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e5m2fnuz bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e4m3fn:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e4m3fn;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e4m3fn bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e4m3fnuz:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e4m3fnuz;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e4m3fnuz bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e8m0fnu:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e8m0fnu;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e8m0fnu bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat4_e2m1fn:
switch (dtype.bits) {
case 4:
switch (dtype.lanes) {
case 2:
stype = ScalarType::Float4_e2m1fn_x2;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat4_e2m1fn lanes ", std::to_string(dtype.lanes));
}
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat4_e2m1fn bits ", std::to_string(dtype.bits));
}
break;
default:
TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code));
}
@ -435,8 +354,8 @@ T* toDLPackImpl(const Tensor& src) {
atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device());
atDLMTensor->tensor.dl_tensor.ndim = static_cast<int32_t>(src.dim());
atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src);
atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(view.sizes().data());
atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(view.strides().data());
atDLMTensor->tensor.dl_tensor.shape = view.sizes().data();
atDLMTensor->tensor.dl_tensor.strides = view.strides().data();
atDLMTensor->tensor.dl_tensor.byte_offset = 0;
fillVersion(&atDLMTensor->tensor);


@ -102,7 +102,7 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
// SparseTensorImpl has no storage, so we cannot query its nbytes.
// (original_storage_size is only used for storage resizing in fsdp anyway, which does not apply to sparse)
// Same for XLA
if (base.unsafeGetTensorImpl()->has_storage() && data_ptr().device().type() != c10::DeviceType::XLA) {
if (base.unsafeGetTensorImpl()->has_storage() && base.device().type() != c10::DeviceType::XLA) {
original_storage_size_ = base.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes();
} else {
original_storage_size_ = -1;


@ -266,14 +266,11 @@ CUDAGeneratorImpl::CUDAGeneratorImpl(
* See Note [Acquire lock when using random generators]
*/
void CUDAGeneratorImpl::set_current_seed(uint64_t seed) {
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
state_->seed_ = seed;
state_->philox_offset_per_thread_ = 0;
no_reset_rnn_state_.clear();
} else {
TORCH_CHECK(state_->seed_ == seed, "CUDAGeneratorImpl::set_current_seed can be called during stream capture only if new seed is the same as the original seed.");
// no-op case
}
at::cuda::assertNotCapturing(
"Cannot call CUDAGeneratorImpl::set_current_seed");
state_->seed_ = seed;
state_->philox_offset_per_thread_ = 0;
no_reset_rnn_state_.clear();
}
/**
@ -302,6 +299,9 @@ uint64_t CUDAGeneratorImpl::get_offset() const {
* Gets the current seed of CUDAGeneratorImpl.
*/
uint64_t CUDAGeneratorImpl::current_seed() const {
// Debatable if current_seed() should be allowed in captured regions.
// Conservatively disallow it for now.
at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed");
return state_->seed_;
}
@ -346,6 +346,8 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
* and size of the internal state.
*/
void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
at::cuda::assertNotCapturing(
"Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing.");
static const size_t seed_size = sizeof(uint64_t);
static const size_t offset_size = sizeof(int64_t);
static const size_t total_size = seed_size + offset_size;
@ -400,27 +402,15 @@ c10::intrusive_ptr<c10::GeneratorImpl> CUDAGeneratorImpl::graphsafe_get_state()
*/
void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) {
// see Note [Why enforce RNG offset % 4 == 0?]
// Note: If you use CUDNN RNN's, calling
// set_philox_offset_per_thread instead of set_offset will cause the
// cudnn RNN rng state to become stale.
TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4");
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
state_->philox_offset_per_thread_ = offset;
} else {
state_->offset_intragraph_ = offset;
}
state_->philox_offset_per_thread_ = offset;
}
/**
* Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl.
*/
uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const {
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
return state_->philox_offset_per_thread_;
} else {
return state_->offset_intragraph_;
}
return state_->philox_offset_per_thread_;
}
/**


@ -19,7 +19,7 @@
#define DLPACK_MAJOR_VERSION 1
/*! \brief The current minor version of dlpack */
#define DLPACK_MINOR_VERSION 1
#define DLPACK_MINOR_VERSION 0
/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
@ -32,7 +32,9 @@
#define DLPACK_DLL
#endif
// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stdint.h>
// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stddef.h>
#ifdef __cplusplus
@ -157,26 +159,6 @@ typedef enum {
kDLComplex = 5U,
/*! \brief boolean */
kDLBool = 6U,
/*! \brief FP8 data types */
kDLFloat8_e3m4 = 7U,
kDLFloat8_e4m3 = 8U,
kDLFloat8_e4m3b11fnuz = 9U,
kDLFloat8_e4m3fn = 10U,
kDLFloat8_e4m3fnuz = 11U,
kDLFloat8_e5m2 = 12U,
kDLFloat8_e5m2fnuz = 13U,
kDLFloat8_e8m0fnu = 14U,
/*! \brief FP6 data types
* Setting bits != 6 is currently unspecified, and the producer must ensure it is set
* while the consumer must stop importing if the value is unexpected.
*/
kDLFloat6_e2m3fn = 15U,
kDLFloat6_e3m2fn = 16U,
/*! \brief FP4 data types
* Setting bits != 4 is currently unspecified, and the producer must ensure it is set
* while the consumer must stop importing if the value is unexpected.
*/
kDLFloat4_e2m1fn = 17U,
} DLDataTypeCode;
/*!
@ -190,12 +172,6 @@ typedef enum {
* - int8: type_code = 0, bits = 8, lanes = 1
* - std::complex<float>: type_code = 5, bits = 64, lanes = 1
* - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
* - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory)
* - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory)
* - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory)
*
* When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e.,
* for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element.
*/
typedef struct {
/*!
@ -253,12 +229,12 @@ typedef struct {
/*! \brief The data type of the pointer*/
DLDataType dtype;
/*! \brief The shape of the tensor */
int64_t* shape;
const int64_t* shape;
/*!
* \brief strides of the tensor (in number of elements, not bytes)
* can be NULL, indicating tensor is compact and row-majored.
*/
int64_t* strides;
const int64_t* strides;
/*! \brief The offset in bytes to the beginning pointer to data */
uint64_t byte_offset;
} DLTensor;
@ -293,7 +269,7 @@ typedef struct DLManagedTensor {
void (*deleter)(struct DLManagedTensor * self);
} DLManagedTensor;
// bit masks used in the DLManagedTensorVersioned
// bit masks used in in the DLManagedTensorVersioned
/*! \brief bit mask to indicate that the tensor is read only. */
#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
@ -306,14 +282,6 @@ typedef struct DLManagedTensor {
*/
#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL)
/*
* \brief bit mask to indicate that whether a sub-byte type is packed or padded.
*
* The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can
* be set by the producer to signal that a tensor of sub-byte type is padded.
*/
#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL)
/*!
* \brief A versioned and managed C Tensor object, manage memory of DLTensor.
*


@ -171,8 +171,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
POINTWISE_BOXED(fill_.Scalar);
POINTWISE_BOXED(zero_);
// This is special because this op doesn't return anything
m.impl("_assert_tensor_metadata", native::_assert_tensor_metadata);
#undef UNARY_POINTWISE
#undef UNARY_POINTWISE_ALL

View File

@ -81,7 +81,7 @@ Tensor math_channel_shuffle(const Tensor& self, int64_t groups) {
// TODO: contiguous can be made to preserve the memory format
// of the input. However since the above reshape clobbers h and w
// it may not be safe to do that, since channels_last contiguous
// may think oc and the last dim correspond to h,w?
// may think oc and and the last dim correspond to h,w?
// It is not clear, however from initial looking around it feels that
// this may not be correct.
// In this case channels last will likely require custom implementation


@ -1,4 +1,3 @@
#pragma once
#include <ATen/core/Tensor.h>
#include <ATen/Config.h>
#include <cstdint>


@ -67,13 +67,13 @@ TORCH_PRECOMPUTE_META_FUNC(fractional_max_pool3d)(
int64_t inputH = input_.size(heightDim);
int64_t inputW = input_.size(widthDim);
TORCH_CHECK((poolSizeT <= inputT) && (outputT + poolSizeT - 1 < inputT),
TORCH_CHECK(outputT + poolSizeT - 1 < inputT,
"fractional_max_pool3d_out(): pool time ", poolSizeT,
" too large relative to input time ", inputT);
TORCH_CHECK((poolSizeW <= inputW) && (outputW + poolSizeW - 1 < inputW),
TORCH_CHECK(outputW + poolSizeW - 1 < inputW,
"fractional_max_pool3d_out(): pool width ", poolSizeW,
" too large relative to input width ", inputW);
TORCH_CHECK((poolSizeH <= inputH) && (outputH + poolSizeH - 1 < inputH),
TORCH_CHECK(outputH + poolSizeH - 1 < inputH,
"fractional_max_pool3d_out(): pool height ", poolSizeH,
" too large relative to input height ", inputH);


@ -2174,7 +2174,7 @@ static void _scatter_via_index_put(
if (self.dim() == 1 || broadcast_index) {
Tensor squeezed = index;
if (broadcast_index && index.dim() > 1) {
for (int64_t d = index.dim() - 1; d >= 0; --d) {
for (const auto d : c10::irange(index.dim())) {
if (d == dim) {
continue;
}


@ -52,7 +52,6 @@ void apply_triu_tril_single(
int64_t self_col_stride,
bool upper) {
constexpr int64_t zero = 0;
k = std::clamp(k, -n, m); // Clamp k to [-n, m] to prevent i + k arithmetic overflow, especially if k approaches INT64_MAX/INT64_MIN.
if (upper) {
parallel_for(0, n, 0, [&](int64_t start, int64_t end) {


@ -85,11 +85,11 @@ void cpu_max_unpool(
if constexpr (is_3d) {
TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
" (output volumes are of size ", output_depth,
"x", output_height, "x", output_width, ")");
"x", output_height, "x", output_width);
} else {
TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
" (output volumes are of size ", output_height,
"x", output_width, ")");
"x", output_width);
}
}


@ -317,17 +317,6 @@ void nonzero_static_cuda_out_impl(
out_temp =
Tensor(at::detail::empty_cuda({self.dim(), size}, out.options())).t();
}
// If input has zero elements, avoid kernel grid calculations (which can
// produce zero divisors) and just fill the output with fill_value.
if (self.numel() == 0) {
if (need_to_copy) {
out_temp.fill_(fill_value);
out.copy_(out_temp);
} else {
out.fill_(fill_value);
}
return;
}
int64_t* out_data_ptr = need_to_copy ? out_temp.mutable_data_ptr<int64_t>()
: out.mutable_data_ptr<int64_t>();


@ -416,7 +416,6 @@ struct ReduceOp {
if (config.should_block_y_reduce()) {
value = block_y_reduce<output_vec_size>(value, shared_memory);
}
__syncthreads();
if (config.should_block_x_reduce()) {
value = block_x_reduce<output_vec_size>(value, shared_memory);
}


@ -17,11 +17,12 @@ __global__ static void compute_cuda_kernel(
index_t* result_ptr,
int64_t size,
int64_t result_size) {
CUDA_KERNEL_ASSERT_PRINTF(
result_size == cumsum_ptr[size - 1],
if (C10_UNLIKELY((result_size != cumsum_ptr[size - 1]))) {
printf("%s:%d:%s: block: [%d,%d,%d], thread: [%d,%d,%d] "
"Invalid input! In `repeat_interleave`, the `output_size` argument (%ld) must be the same as the sum of the elements in the `repeats` tensor (%ld).\n",
result_size,
cumsum_ptr[size - 1]);
__FILE__, __LINE__, __func__,blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, result_size, cumsum_ptr[size - 1 ]);
CUDA_KERNEL_ASSERT(result_size == cumsum_ptr[size - 1])
}
int64_t idx = ((int64_t) blockIdx.x) * blockDim.x + threadIdx.x;
int64_t stride = (blockDim.x * gridDim.x) / C10_WARP_SIZE;


@ -5,20 +5,12 @@
namespace at::native {
__global__ void weight_int8pack_mm_kernel(
const float* x,
const int8_t* w,
const float* scale,
float* out,
int B,
int K,
int N) {
__global__ void weight_int8pack_mm_kernel(const float* x, const int8_t* w, const float* scale, float* out, int B, int K, int N) {
// one thread per output element: [B, N]
int b = blockIdx.y * blockDim.y + threadIdx.y;
int n = blockIdx.x * blockDim.x + threadIdx.x;
if (b >= B || n >= N)
return;
if (b >= B || n >= N) return;
float acc = 0.0f;
for (int k = 0; k < K; ++k) {
@ -28,11 +20,7 @@ __global__ void weight_int8pack_mm_kernel(
out[b * N + n] = acc * scale[n];
}
void launch_weight_int8pack_mm_cuda_kernel(
const Tensor& x,
const Tensor& w_int8,
const Tensor& scale,
Tensor& out) {
void launch_weight_int8pack_mm_cuda_kernel(const Tensor& x, const Tensor& w_int8, const Tensor& scale, Tensor& out) {
const int B = x.size(0);
const int K = x.size(1);
const int N = w_int8.size(0);
@ -47,16 +35,12 @@ void launch_weight_int8pack_mm_cuda_kernel(
w_int8.data_ptr<int8_t>(),
scale.data_ptr<float>(),
out.data_ptr<float>(),
B,
K,
N);
B, K, N);
}
// Main GPU entry point
at::Tensor _weight_int8pack_mm_cuda(
const at::Tensor& x,
const at::Tensor& w_int8,
const at::Tensor& scale) {
at::Tensor _weight_int8pack_mm_cuda(const at::Tensor& x, const at::Tensor& w_int8, const at::Tensor& scale) {
// --- Check inputs ---
TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor");
TORCH_CHECK(w_int8.is_cuda(), "w must be a CUDA tensor");
@ -66,16 +50,12 @@ at::Tensor _weight_int8pack_mm_cuda(
TORCH_CHECK(w_int8.dim() == 2, "w must be 2D");
TORCH_CHECK(scale.dim() == 1, "scale must be 1D");
TORCH_CHECK(
x.size(1) == w_int8.size(1),
"K dimension mismatch: x.size(1) != w.size(1)");
TORCH_CHECK(
w_int8.size(0) == scale.size(0),
"Output dim mismatch: w.size(0) != scale.size(0)");
TORCH_CHECK(x.size(1) == w_int8.size(1), "K dimension mismatch: x.size(1) != w.size(1)");
TORCH_CHECK(w_int8.size(0) == scale.size(0), "Output dim mismatch: w.size(0) != scale.size(0)");
// --- Determine shapes ---
auto B = x.size(0); // batch size
auto N = w_int8.size(0); // output dim
auto B = x.size(0); // batch size
auto N = w_int8.size(0); // output dim
// Ensure inputs are in the correct types for the kernel
auto x_f32 = x.to(at::kFloat);
@ -83,13 +63,12 @@ at::Tensor _weight_int8pack_mm_cuda(
auto scale_f32 = scale.to(at::kFloat);
// --- Allocate output ---
auto out = at::empty({B, N}, x_f32.options());
auto out = at::empty({B, N}, x.options().dtype(at::kFloat));
// --- Launch kernel ---
launch_weight_int8pack_mm_cuda_kernel(
x_f32, w_int8_contiguous, scale_f32, out);
launch_weight_int8pack_mm_cuda_kernel(x_f32, w_int8_contiguous, scale_f32, out);
return out.to(x.dtype());
return out;
}
} // namespace at::native


@ -482,9 +482,7 @@ auto build_graph(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA")
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
@ -704,9 +702,7 @@ auto build_graph_nestedtensor(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA_NESTEDTENSOR")
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale)
.set_seq_len_q(SEQ_LEN_Q_)


@ -1770,12 +1770,10 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> miopen_depthwise_convolution_back
// fusions
// ---------------------------------------------------------------------
void raw_miopen_convolution_add_relu_out(
void raw_miopen_convolution_relu_out(
const Tensor& output,
const Tensor& input,
const Tensor& weight,
const Tensor& z,
float alpha,
const Tensor& bias,
IntArrayRef stride,
IntArrayRef padding,
@ -1783,20 +1781,68 @@ void raw_miopen_convolution_add_relu_out(
int64_t groups,
bool benchmark,
bool deterministic) {
raw_miopen_convolution_forward_out(
output,
auto dataType = getMiopenDataType(input);
miopenConvolutionMode_t c_mode = miopenConvolution;
ConvolutionArgs args{ input, output, weight };
args.handle = getMiopenHandle();
at::MemoryFormat memory_format = miopen_conv_suggest_memory_format(input, weight);
setConvolutionParams(
&args.params,
args.handle,
input,
weight,
padding,
stride,
dilation,
groups,
deterministic,
memory_format);
args.idesc.set(input, memory_format);
args.wdesc.set(weight, memory_format, 0);
args.odesc.set(output, memory_format);
args.cdesc.set(
dataType,
c_mode,
input.dim() - 2,
args.params.padding,
args.params.stride,
args.params.dilation,
args.params.groups,
benchmark,
deterministic);
at::Tensor alpha_mul_z_add_bias =
at::native::reshape_bias(input.dim(), bias).add(z, alpha);
output.add_(alpha_mul_z_add_bias);
output.relu_();
TensorDescriptor bdesc;
bdesc.set(bias.expand({1, bias.size(0)}), output.dim());
// Create the fusion plan
miopenFusionPlanDescriptor_t fusePlanDesc;
miopenFusionOpDescriptor_t convoOp;
miopenFusionOpDescriptor_t biasOp;
miopenFusionOpDescriptor_t activOp;
MIOPEN_CHECK(miopenCreateFusionPlan(&fusePlanDesc, miopenVerticalFusion, args.idesc.desc()));
MIOPEN_CHECK(miopenCreateOpConvForward(fusePlanDesc, &convoOp, args.cdesc.desc(), args.wdesc.desc()));
MIOPEN_CHECK(miopenCreateOpBiasForward(fusePlanDesc, &biasOp, bdesc.desc()));
MIOPEN_CHECK(miopenCreateOpActivationForward(fusePlanDesc, &activOp, miopenActivationRELU));
// compile fusion plan
MIOPEN_CHECK(miopenCompileFusionPlan(args.handle, fusePlanDesc));
// Set the Args
float alpha = static_cast<float>(1);
float beta = static_cast<float>(0);
float activ_alpha = static_cast<float>(0);
float activ_beta = static_cast<float>(0);
float activ_gamma = static_cast<float>(0);
miopenOperatorArgs_t fusionArgs;
MIOPEN_CHECK(miopenCreateOperatorArgs(&fusionArgs));
MIOPEN_CHECK(miopenSetOpArgsConvForward(fusionArgs, convoOp, &alpha, &beta, weight.const_data_ptr()));
MIOPEN_CHECK(miopenSetOpArgsBiasForward(fusionArgs, biasOp, &alpha, &beta, bias.const_data_ptr()));
MIOPEN_CHECK(miopenSetOpArgsActivForward(fusionArgs, activOp, &alpha, &beta, activ_alpha, activ_beta, activ_gamma));
miopenExecuteFusionPlan(args.handle, fusePlanDesc, args.idesc.desc(), input.const_data_ptr(), args.odesc.desc(), output.data_ptr(), fusionArgs);
// Cleanup
miopenDestroyFusionPlan(fusePlanDesc);
}
static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat memory_format) {
@ -1809,107 +1855,171 @@ static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat m
Tensor miopen_convolution_add_relu(
const Tensor& input_t,
const Tensor& weight_t,
const Tensor& z_t,
const Tensor& z,
const std::optional<Scalar>& alpha,
const std::optional<Tensor>& bias_t,
const std::optional<Tensor>& bias,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
int64_t groups) {
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
const Tensor input = input_t.contiguous(memory_format);
const Tensor weight = weight_t.contiguous(memory_format);
Tensor z = z_t;
if (z.suggest_memory_format() != memory_format) {
z = z.to(memory_format);
}
z = z.contiguous(memory_format);
// FuseFrozenConvAddRelu performs some tensor shape checking
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input.sizes(), weight.sizes(), padding, stride, dilation),
input.options().memory_format(memory_format));
if (output_t.numel() == 0) {
return output_t;
}
// MIOpen does not support fusion of add, the alpha2 * z step of the below cuDNN function:
// y = act ( alpha1 * conv(x) + alpha2 * z + bias )
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
auto& ctx = at::globalContext();
bool benchmark = ctx.benchmarkCuDNN();
auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
auto _bias = bias_t.has_value()
? bias_t.value()
: at::zeros(
{output_t.size(1)},
optTypeMetaToScalarType(output_t.options().dtype_opt()),
output_t.options().layout_opt(),
output_t.options().device_opt(),
output_t.options().pinned_memory_opt());
raw_miopen_convolution_add_relu_out(
output_t,
TensorArg input { input_t, "input", 1 },
weight { weight_t, "weight", 2 };
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
input_t.options().memory_format(memory_format));
if (output_t.numel() == 0){
return output_t;
}
// Avoid ambiguity of "output" when this is being used as backwards
TensorArg output{output_t, "result", 0};
miopen_convolution_forward_out(
output,
"miopen_convolution_add_relu",
input,
weight,
z,
_alpha,
_bias,
stride,
padding,
stride,
dilation,
groups,
benchmark,
true); // deterministic
false // deterministic
);
return output_t;
auto contig_output_t = self_or_new_memory_format(output_t, memory_format);
if (!output_t.is_same(contig_output_t)) {
contig_output_t.copy_(output_t);
}
auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
auto _bias = bias.has_value()
? bias.value()
: at::zeros(
{contig_output_t.size(1)},
optTypeMetaToScalarType(contig_output_t.options().dtype_opt()),
contig_output_t.options().layout_opt(),
contig_output_t.options().device_opt(),
contig_output_t.options().pinned_memory_opt());
at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input_t.dim(), _bias).add(z, _alpha);
contig_output_t.add_(alpha_mul_z_add_bias);
contig_output_t.relu_();
return contig_output_t;
}
Tensor miopen_convolution_relu(
const Tensor& input_t,
const Tensor& weight_t,
const std::optional<Tensor>& bias_t,
const std::optional<Tensor>& bias,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
int64_t groups) {
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
const Tensor input = input_t.contiguous(memory_format);
const Tensor weight = weight_t.contiguous(memory_format);
// FuseFrozenConvAddRelu performs some tensor shape checking
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input.sizes(), weight.sizes(), padding, stride, dilation),
input.options().memory_format(memory_format));
if (output_t.numel() == 0) {
return output_t;
}
auto& ctx = at::globalContext();
bool benchmark = ctx.benchmarkCuDNN();
auto _bias = bias_t.has_value()
? bias_t.value()
: at::zeros(
{output_t.size(1)},
optTypeMetaToScalarType(output_t.options().dtype_opt()),
output_t.options().layout_opt(),
output_t.options().device_opt(),
output_t.options().pinned_memory_opt());
raw_miopen_convolution_add_relu_out(
output_t,
input,
weight,
output_t, // use output_t as z to satisfy MIOpen API
0, // alpha
_bias,
stride,
padding,
dilation,
groups,
benchmark, // benchmark
true); // deterministic
// MIOpen currently only supports MemoryFormat::Contiguous and fp32 and 2d
if (input_t.suggest_memory_format() == at::MemoryFormat::Contiguous
&& input_t.scalar_type() == at::kFloat
&& input_t.ndimension() == 4) {
return output_t;
// FuseFrozenConvAddRelu performs some tensor shape checking
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
input_t.options().memory_format(input_t.suggest_memory_format()));
if (output_t.numel() == 0) {
return output_t;
}
auto _bias = bias.has_value()
? bias.value()
: at::zeros(
{output_t.size(1)},
optTypeMetaToScalarType(output_t.options().dtype_opt()),
output_t.options().layout_opt(),
output_t.options().device_opt(),
output_t.options().pinned_memory_opt());
raw_miopen_convolution_relu_out(
output_t,
input_t,
weight_t,
_bias,
stride,
padding,
dilation,
groups,
benchmark, // benchmark
false // deterministic
);
return output_t;
}
else {
// fallback
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
TensorArg input { input_t, "input", 1 },
weight { weight_t, "weight", 2 };
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
input->options().memory_format(memory_format));
if (output_t.numel() == 0){
return output_t;
}
// Avoid ambiguity of "output" when this is being used as backwards
TensorArg output{output_t, "result", 0};
miopen_convolution_forward_out(
output,
"miopen_convolution_relu",
input,
weight,
padding,
stride,
dilation,
groups,
benchmark,
false // deterministic
);
auto contig_output_t = self_or_new_memory_format(output_t, memory_format);
if (!output_t.is_same(contig_output_t)) {
contig_output_t.copy_(output_t);
}
auto _bias = bias.has_value()
? bias.value()
: at::zeros(
{contig_output_t.size(1)},
optTypeMetaToScalarType(contig_output_t.options().dtype_opt()),
contig_output_t.options().layout_opt(),
contig_output_t.options().device_opt(),
contig_output_t.options().pinned_memory_opt());
at::Tensor reshaped_bias = at::native::reshape_bias(input_t.dim(), _bias);
contig_output_t.add_(reshaped_bias);
contig_output_t.relu_();
return contig_output_t;
}
}
REGISTER_CUDA_DISPATCH(miopen_convolution_backward_stub, &miopen_convolution_backward)


@ -0,0 +1,48 @@
#pragma once
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))
typedef NS_ENUM(NSUInteger, MPSGraphFFTScalingMode) {
MPSGraphFFTScalingModeNone = 0L,
MPSGraphFFTScalingModeSize = 1L,
MPSGraphFFTScalingModeUnitary = 2L,
};
@interface FakeMPSGraphFFTDescriptor : NSObject<NSCopying>
@property(readwrite, nonatomic) BOOL inverse;
@property(readwrite, nonatomic) MPSGraphFFTScalingMode scalingMode;
@property(readwrite, nonatomic) BOOL roundToOddHermitean;
+ (nullable instancetype)descriptor;
@end
@compatibility_alias MPSGraphFFTDescriptor FakeMPSGraphFFTDescriptor;
@interface MPSGraph (SonomaOps)
- (MPSGraphTensor* _Nonnull)conjugateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)realPartOfTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)fastFourierTransformWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)realToHermiteanFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)HermiteanToRealFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
@end
// define BFloat16 enums for MacOS13
#define MPSDataTypeBFloat16 ((MPSDataType)(MPSDataTypeAlternateEncodingBit | MPSDataTypeFloat16))
// define Metal version
#define MTLLanguageVersion3_1 ((MTLLanguageVersion)((3 << 16) + 1))
#endif


@ -0,0 +1,196 @@
#pragma once
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
// TODO: Remove me when moved to MacOS 13
#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))
@interface FakeMPSGraphConvolution3DOpDescriptor : NSObject<NSCopying>
@property(readwrite, nonatomic) NSUInteger strideInX;
@property(readwrite, nonatomic) NSUInteger strideInY;
@property(readwrite, nonatomic) NSUInteger strideInZ;
@property(readwrite, nonatomic) NSUInteger dilationRateInX;
@property(readwrite, nonatomic) NSUInteger dilationRateInY;
@property(readwrite, nonatomic) NSUInteger dilationRateInZ;
@property(readwrite, nonatomic) NSUInteger paddingLeft;
@property(readwrite, nonatomic) NSUInteger paddingRight;
@property(readwrite, nonatomic) NSUInteger paddingTop;
@property(readwrite, nonatomic) NSUInteger paddingBottom;
@property(readwrite, nonatomic) NSUInteger paddingFront;
@property(readwrite, nonatomic) NSUInteger paddingBack;
@property(readwrite, nonatomic) MPSGraphPaddingStyle paddingStyle;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout dataLayout;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout weightsLayout;
@property(readwrite, nonatomic) NSUInteger groups;
@end
@compatibility_alias MPSGraphConvolution3DOpDescriptor FakeMPSGraphConvolution3DOpDescriptor;
#endif
@interface MPSGraph (VenturaOps)
#if !defined(__MAC_13_0) && (!defined(MAC_OS_X_VERSION_13_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_0))
typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode) {
MPSGraphResizeNearestRoundingModeRoundPreferCeil = 0L,
MPSGraphResizeNearestRoundingModeRoundPreferFloor = 1L,
MPSGraphResizeNearestRoundingModeCeil = 2L,
MPSGraphResizeNearestRoundingModeFloor = 3L,
MPSGraphResizeNearestRoundingModeRoundToEven = 4L,
MPSGraphResizeNearestRoundingModeRoundToOdd = 5L,
};
// Define complex enums for MacOS 12
#define MPSDataTypeComplexBit 0x01000000
#define MPSDataTypeComplexFloat32 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 64))
#define MPSDataTypeComplexFloat16 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 32))
#endif
- (MPSGraphTensor* _Nonnull)convolution3DWithSourceTensor:(MPSGraphTensor* _Nonnull)source
weightsTensor:(MPSGraphTensor* _Nonnull)weights
descriptor:(MPSGraphConvolution3DOpDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)
convolution3DDataGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
weightsTensor:(MPSGraphTensor* _Nonnull)weights
outputShape:(MPSShape* _Nonnull)outputShape
forwardConvolutionDescriptor:
(MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)
convolution3DWeightsGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
sourceTensor:(MPSGraphTensor* _Nonnull)source
outputShape:(MPSShape* _Nonnull)outputShape
forwardConvolutionDescriptor:
(MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)cumulativeSumWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)inverseOfTensor:(MPSGraphTensor* _Nonnull)inputTensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
layout:(MPSGraphTensorNamedDataLayout)layout
normalizeCoordinates:(BOOL)normalizeCoordinates
relativeCoordinates:(BOOL)relativeCoordinates
alignCorners:(BOOL)alignCorners
paddingMode:(MPSGraphPaddingMode)paddingMode
samplingMode:(MPSGraphResizeMode)samplingMode
constantValue:(double)constantValue
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
layout:(MPSGraphTensorNamedDataLayout)layout
normalizeCoordinates:(BOOL)normalizeCoordinates
relativeCoordinates:(BOOL)relativeCoordinates
alignCorners:(BOOL)alignCorners
paddingMode:(MPSGraphPaddingMode)paddingMode
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
constantValue:(double)constantValue
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)truncateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
@end
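The declarations above are the Ventura-era MPSGraph entry points (sort, argSort, matrix inverse, nearest/bilinear resize, grid sampling) that the MPS backend targets on macOS 13+. A minimal Python-side sketch of the user-facing calls these roughly back, assuming an MPS-capable machine; the op-to-method mapping noted in the comments is approximate.

import torch

if torch.backends.mps.is_available():
    x = torch.randn(4, 8, device="mps")
    vals, idx = torch.sort(x, dim=1, descending=True)  # sortWithTensor:axis:descending:name:
    order = torch.argsort(x, dim=1)                     # argSortWithTensor:axis:name:
    up = torch.nn.functional.interpolate(
        x.view(1, 1, 4, 8), scale_factor=2, mode="nearest")  # resizeNearestWithTensor:...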

View File

@ -9,6 +9,8 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>
#include <fmt/ranges.h>
@ -568,7 +570,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,
MPSShape* mpsStrides = getMPSShape(_tensor.strides());
check_mps_shape(mpsShape);
auto storage_numel = src.storage().nbytes() / src.element_size() - src.storage_offset();
auto storage_numel = src.storage().nbytes() / src.element_size();
TORCH_CHECK(storage_numel <= std::numeric_limits<int32_t>::max(),
"MPSGaph does not support tensor dims larger than INT_MAX");
MPSNDArrayDescriptor* srcTensorDesc = [MPSNDArrayDescriptor descriptorWithDataType:dataType
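For reference, the quantities compared in the INT_MAX check above have direct Python-level counterparts; a small sketch with arbitrary shapes (the nonzero offset comes from slicing):

import torch

base = torch.randn(6)
view = base[2:]  # shares storage with a nonzero storage_offset
storage_numel = view.untyped_storage().nbytes() // view.element_size()
print(storage_numel, view.storage_offset())  # 6 and 2; the guard above is on the former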

View File

@ -1,25 +0,0 @@
#pragma once
#include <c10/metal/common.h>
#ifdef __METAL__
enum class EmbeddingBagMode { SUM = 0, MEAN, MAX };
#else
#include <ATen/native/EmbeddingBag.h>
using at::native::EmbeddingBagMode;
#endif
template <typename idx_type_t = uint32_t>
struct EmbeddingBagParams {
::c10::metal::array<idx_type_t, 2> weight_strides;
::c10::metal::array<idx_type_t, 2> output_strides;
::c10::metal::array<idx_type_t, 2> max_indices_strides;
idx_type_t per_sample_weights_strides;
idx_type_t num_indices;
idx_type_t num_bags;
idx_type_t feature_size;
EmbeddingBagMode mode;
int64_t padding_idx;
};

View File

@ -1,212 +0,0 @@
#include <ATen/native/mps/kernels/EmbeddingBag.h>
#include <c10/metal/utils.h>
#include <metal_array>
#include <metal_stdlib>
using namespace metal;
using namespace c10::metal;
template <EmbeddingBagMode M, typename T>
struct ReductionOpInit {
inline opmath_t<T> operator()() {
return 0;
}
};
template <typename T>
struct ReductionOpInit<EmbeddingBagMode::MAX, T> {
inline opmath_t<T> operator()() {
return static_cast<opmath_t<T>>(-INFINITY);
}
};
template <EmbeddingBagMode M, typename T>
struct ReductionOp {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_strides);
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::SUM, T> {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_strides) {
if (per_sample_weights_strides) {
T per_sample_weight = per_sample_weights
[per_sample_weights_strides * per_sample_weights_index];
return static_cast<opmath_t<T>>(per_sample_weight) *
static_cast<opmath_t<T>>(weight_val) +
out_val;
} else {
return static_cast<opmath_t<T>>(weight_val) + out_val;
}
}
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::MEAN, T> {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t,
constant T*,
uint32_t) {
return static_cast<opmath_t<T>>(weight_val) + out_val;
}
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::MAX, T> {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t,
constant T*,
uint32_t) {
return max(static_cast<opmath_t<T>>(weight_val), out_val);
}
};
template <EmbeddingBagMode M, typename T>
struct ReductionOpFinal {
inline T operator()(opmath_t<T> val, uint32_t) {
return static_cast<T>(val);
}
};
template <typename T>
struct ReductionOpFinal<EmbeddingBagMode::MEAN, T> {
inline T operator()(opmath_t<T> val, uint32_t count) {
auto out = val / count;
return static_cast<T>((count == 0) ? 0 : out);
}
};
template <typename T>
struct ReductionOpFinal<EmbeddingBagMode::MAX, T> {
inline T operator()(opmath_t<T> val, uint32_t count) {
return static_cast<T>((count == 0) ? 0 : val);
}
};
template <EmbeddingBagMode M, typename T, typename I>
void embedding_bag_impl(
constant T* weight,
constant I* indices,
constant I* offsets,
constant T* per_sample_weights,
device T* output,
device I* offset2bag,
device I* bag_size,
device I* max_indices,
constant EmbeddingBagParams<uint32_t>& params,
uint tid) {
auto num_indices = params.num_indices;
auto num_bags = params.num_bags;
auto feature_size = params.feature_size;
auto padding_idx = params.padding_idx;
auto per_sample_weights_strides = params.per_sample_weights_strides;
constant auto& output_strides = params.output_strides;
constant auto& weight_strides = params.weight_strides;
constant auto& max_indices_strides = params.max_indices_strides;
auto bag_idx = tid / feature_size;
auto feature_idx = tid % feature_size;
output += bag_idx * output_strides[0] + feature_idx * output_strides[1];
uint32_t offsets_end = min(bag_idx + 1, num_bags - 1);
bool is_last_bag = bag_idx + 1 == num_bags;
uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]);
uint32_t indices_end = is_last_bag * (num_indices) +
(!is_last_bag) * (static_cast<uint32_t>(offsets[offsets_end]));
auto out_val = ReductionOpInit<M, T>()();
uint32_t bag_size_ = 0;
for (uint32_t indices_idx = indices_start; indices_idx < indices_end;
indices_idx++) {
I weight_idx = indices[indices_idx];
bool pad = (weight_idx == padding_idx);
T weight_val = weight
[static_cast<uint32_t>(weight_idx) * weight_strides[0] +
feature_idx * weight_strides[1]];
bag_size_ += static_cast<uint32_t>(!pad);
auto tmp_val = ReductionOp<M, T>()(
weight_val,
out_val,
indices_idx,
per_sample_weights,
per_sample_weights_strides);
out_val = pad ? out_val : tmp_val;
}
*output = ReductionOpFinal<M, T>()(out_val, bag_size_);
}
#define DISPATCH_IMPL(MODE) \
return embedding_bag_impl<MODE>( \
weight, \
indices, \
offsets, \
per_sample_weights, \
output, \
offset2bag, \
bag_size, \
max_indices, \
params, \
tid)
template <typename T, typename I>
kernel void embedding_bag(
constant T* weight [[buffer(0)]],
constant I* indices [[buffer(1)]],
constant I* offsets [[buffer(2)]],
constant T* per_sample_weights [[buffer(3)]],
device T* output [[buffer(4)]],
device I* offset2bag [[buffer(5)]],
device I* bag_size [[buffer(6)]],
device I* max_indices [[buffer(7)]],
constant EmbeddingBagParams<uint32_t>& params [[buffer(8)]],
uint tid [[thread_position_in_grid]]) {
switch (params.mode) {
case EmbeddingBagMode::SUM:
DISPATCH_IMPL(EmbeddingBagMode::SUM);
case EmbeddingBagMode::MEAN:
DISPATCH_IMPL(EmbeddingBagMode::MEAN);
case EmbeddingBagMode::MAX:
DISPATCH_IMPL(EmbeddingBagMode::MAX);
}
}
#define REGISTER_EMBEDDING_BAG_OP(T, I) \
template [[host_name("embedding_bag_" #T "_" #I)]] \
kernel void embedding_bag<T, I>( \
constant T * weight [[buffer(0)]], \
constant I * indices [[buffer(1)]], \
constant I * offsets [[buffer(2)]], \
constant T * per_sample_weights [[buffer(3)]], \
device T * output [[buffer(4)]], \
device I * offset2bag [[buffer(5)]], \
device I * bag_size [[buffer(6)]], \
device I * max_indices [[buffer(7)]], \
constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \
uint tid [[thread_position_in_grid]]);
REGISTER_EMBEDDING_BAG_OP(float, int);
REGISTER_EMBEDDING_BAG_OP(float, long);
REGISTER_EMBEDDING_BAG_OP(half, int);
REGISTER_EMBEDDING_BAG_OP(half, long);
REGISTER_EMBEDDING_BAG_OP(bfloat, int);
REGISTER_EMBEDDING_BAG_OP(bfloat, long);
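The kernel removed above computes one (bag, feature) output element per thread, walking indices[offsets[bag] : offsets[bag + 1]] and applying the SUM/MEAN/MAX reduction. The same computation is reachable through the public API; a minimal sketch with made-up values:

import torch
import torch.nn.functional as F

weight = torch.randn(10, 3)
indices = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9])
offsets = torch.tensor([0, 4])  # two bags: indices[0:4] and indices[4:8]
out_sum = F.embedding_bag(indices, weight, offsets, mode="sum")
out_max = F.embedding_bag(indices, weight, offsets, mode="max")
print(out_sum.shape)  # torch.Size([2, 3])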

View File

@ -8,6 +8,8 @@
#include <ATen/native/TensorIterator.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/operations/BinaryKernel.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <fmt/format.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,12 +1,23 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/ConvUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_mps_convolution_native.h>
#include <ATen/ops/_mps_convolution_transpose_native.h>
#include <ATen/ops/mps_convolution_backward_native.h>
#include <ATen/ops/mps_convolution_transpose_backward_native.h>
#include <fmt/format.h>
#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))
@implementation FakeMPSGraphConvolution3DOpDescriptor
- (nonnull id)copyWithZone:(nullable NSZone*)zone {
return self;
}
@end
#endif
namespace at::native {
@ -39,9 +50,11 @@ static void fill_conv3d_desc(MPSGraphConvolution3DOpDescriptor* descriptor_,
descriptor_.paddingFront = paddingDepth;
descriptor_.paddingBack = paddingDepth;
descriptor_.dataLayout = MPSGraphTensorNamedDataLayoutNCDHW;
// PyTorch always uses NCDHW memory layout for 3D tensors
descriptor_.dataLayout = (MPSGraphTensorNamedDataLayout)7L; // MPSGraphTensorNamedDataLayoutNCDHW;
descriptor_.weightsLayout = MPSGraphTensorNamedDataLayoutOIDHW;
// PyTorch always uses OIDHW memory layout for 3D weights
descriptor_.weightsLayout = (MPSGraphTensorNamedDataLayout)9L; // MPSGraphTensorNamedDataLayoutOIDHW;
descriptor_.groups = groups; // not yet tested in Xcode/C++
}
@ -173,6 +186,18 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
if (bias_defined)
bias_shape = bias_opt.value().sizes();
std::string mem_format_key;
switch (memory_format) {
case at::MemoryFormat::Contiguous:
mem_format_key = "Contiguous";
break;
case at::MemoryFormat::ChannelsLast:
mem_format_key = "ChannelsLast";
break;
default:
assert(0 && "Check should have been done earlier\n");
}
std::string bias_shape_key;
if (bias_defined) {
bias_shape_key = std::to_string(bias_shape[0]);
@ -180,16 +205,20 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
bias_shape_key = "nobias";
}
std::string key = fmt::format("mps_{}convolution:{}:{}:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
is_channels_last,
mps::getTensorsStringKey({input_t, weight_t}),
bias_defined,
bias_shape_key);
std::string key;
if (is3DConv) {
key = "mps_3d_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;
} else {
key = "mps_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;
}
MPSShape* inputShape = mps::getMPSShape(input_t, memory_format);
MPSShape* outputShape = mps::getMPSShape(output_t, memory_format);
@ -371,15 +400,33 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
@autoreleasepool {
MPSStream* stream = getCurrentMPSStream();
std::string mem_format_key;
switch (memory_format) {
case at::MemoryFormat::Contiguous:
mem_format_key = "Contiguous";
break;
case at::MemoryFormat::ChannelsLast:
mem_format_key = "ChannelsLast";
break;
default:
assert(0 && "Check should have been done earlier\n");
}
MPSShape* mps_input_shape = getMPSShape(input_size);
std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
is_channels_last,
getTensorsStringKey({grad_output_t, weight_t}));
std::string key;
if (is3DConv) {
key = "mps_3d_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
":" + std::to_string(stride[2]) + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
getTensorsStringKey({grad_output_t, weight_t});
} else {
key = "mps_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
getTensorsStringKey({grad_output_t, weight_t});
}
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
auto gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t);
auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
@ -504,13 +551,19 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
MPSStream* stream = getCurrentMPSStream();
MPSShape* mps_weight_shape = getMPSShape(weight_size);
std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
getTensorsStringKey({grad_output_t, input_t, grad_weight_t}));
std::string key;
if (is3DConv) {
key = "mps_3d_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" +
getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
} else {
key = "mps_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" +
getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
}
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSShape* inputShape = getMPSShape(input_t);
bool isDepthwiseConv =
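The 3D descriptor setup and the string cache keys rebuilt above are exercised by ordinary Conv3d calls on the MPS device; a rough sketch, assuming macOS 13.2+ so the MPSGraphConvolution3DOpDescriptor gate above is satisfied (shapes arbitrary):

import torch

if torch.backends.mps.is_available():
    conv = torch.nn.Conv3d(2, 4, kernel_size=3, padding=1).to("mps")
    x = torch.randn(1, 2, 8, 8, 8, device="mps", requires_grad=True)
    y = conv(x)            # NCDHW input, OIDHW weights, as in fill_conv3d_desc above
    y.sum().backward()     # hits the backward-input / backward-weights graphs keyed above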

View File

@ -2,6 +2,7 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_copy_from_and_resize_native.h>
#include <ATen/ops/_copy_from_native.h>

View File

@ -5,6 +5,8 @@
#include <ATen/native/DistributionTemplates.h>
#include <ATen/native/Distributions.h>
#include <ATen/native/TensorFactories.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,179 +0,0 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/TensorUtils.h>
#include <ATen/core/Tensor.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/EmbeddingBag.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/EmbeddingBag.h>
#include <fmt/format.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_embedding_bag_forward_only_native.h>
#include <ATen/ops/_embedding_bag_native.h>
#include <ATen/ops/empty.h>
#endif
namespace at::native {
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/EmbeddingBag_metallib.h>
#endif
namespace {
std::pair<Tensor, Tensor> promoteIndicesAndOffsets(const Tensor& indices, const Tensor& offsets) {
const auto commonType = promoteTypes(offsets.scalar_type(), indices.scalar_type());
return {indices.scalar_type() == commonType ? indices : indices.toType(commonType),
offsets.scalar_type() == commonType ? offsets : offsets.toType(commonType)};
}
} // namespace
namespace mps {
static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
const Tensor& weight,
const Tensor& indices_,
const Tensor& offsets_,
const bool scale_grad_by_freq,
const int64_t mode,
bool sparse,
const std::optional<Tensor>& per_sample_weights_opt,
bool include_last_offset,
int64_t padding_idx) {
TORCH_CHECK(indices_.dim() == 1, "input has to be a 1D Tensor, but got Tensor of dimension ", indices_.dim());
if (indices_.dim() == 1) {
TORCH_CHECK(offsets_.dim() == 1, "offsets has to be a 1D Tensor, but got Tensor of dimension ", offsets_.dim());
}
TORCH_CHECK(weight.dim() == 2, "weight has to be a 2D Tensor, but got Tensor of dimension ", weight.dim());
Tensor indices, offsets;
std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_);
auto indices_arg = TensorArg(indices, "indices", 1);
checkScalarTypes("embedding_bag_mps", indices_arg, {kLong, kInt});
auto offsets_arg = TensorArg(offsets, "offsets", 1);
checkScalarTypes("embedding_bag_mps", offsets_arg, {kLong, kInt});
checkSameType("embedding_bag_mps", indices_arg, offsets_arg);
auto weight_arg = TensorArg(weight, "weight", 1);
int64_t num_indices = indices.size(0);
int64_t num_bags = offsets.size(0);
if (include_last_offset) {
num_bags -= 1;
}
int64_t feature_size = weight.size(1);
auto bag_size = at::empty(offsets.sizes(), indices.options());
auto offset2bag = at::empty({indices.size(0)}, indices.options());
auto output = at::empty({num_bags, feature_size}, weight.options());
Tensor max_indices;
if (mode == EmbeddingBagMode::MAX) {
max_indices = at::empty({num_bags, feature_size}, indices.options());
} else {
max_indices = at::empty({0}, indices.options());
}
EmbeddingBagParams<uint32_t> params;
for (const auto dim : c10::irange(weight.dim())) {
params.weight_strides[dim] = safe_downcast<uint32_t, int64_t>(weight.stride(dim));
params.output_strides[dim] = safe_downcast<uint32_t, int64_t>(output.stride(dim));
if (mode == EmbeddingBagMode::MAX) {
params.max_indices_strides[dim] = safe_downcast<uint32_t, int64_t>(max_indices.stride(dim));
}
}
bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
params.per_sample_weights_strides = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
params.num_indices = num_indices;
params.num_bags = num_bags;
params.feature_size = feature_size;
params.mode = static_cast<EmbeddingBagMode>(mode);
params.padding_idx = padding_idx;
auto num_threads = output.numel();
MPSStream* stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
auto pipeline_state = lib.getPipelineStateForFunc(
fmt::format("embedding_bag_{}_{}", scalarToMetalTypeString(weight), scalarToMetalTypeString(indices)));
getMPSProfiler().beginProfileKernel(pipeline_state, "embedding_bag", {weight, indices, offsets});
[computeEncoder setComputePipelineState:pipeline_state];
mtl_setArgs(computeEncoder,
weight,
indices,
offsets,
use_per_sample_weights ? per_sample_weights_opt : std::nullopt,
output,
offset2bag,
bag_size,
max_indices,
params);
mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads);
getMPSProfiler().endProfileKernel(pipeline_state);
}
});
return std::tuple<Tensor, Tensor, Tensor, Tensor>(
std::move(output), std::move(offset2bag), std::move(bag_size), std::move(max_indices));
}
} // namespace mps
std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps(const Tensor& weight,
const Tensor& indices,
const Tensor& offsets,
const bool scale_grad_by_freq,
const int64_t mode,
bool sparse,
const std::optional<Tensor>& per_sample_weights_opt,
bool include_last_offset,
int64_t padding_idx) {
return mps::_embedding_bag_mps_impl(weight,
indices,
offsets,
scale_grad_by_freq,
mode,
sparse,
per_sample_weights_opt,
include_last_offset,
padding_idx);
}
std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_forward_only_mps(
const Tensor& weight,
const Tensor& indices,
const Tensor& offsets,
const bool scale_grad_by_freq,
const int64_t mode,
bool sparse,
const std::optional<Tensor>& per_sample_weights_opt,
bool include_last_offset,
int64_t padding_idx) {
return _embedding_bag_mps(weight,
indices,
offsets,
scale_grad_by_freq,
mode,
sparse,
per_sample_weights_opt,
include_last_offset,
padding_idx);
}
} // namespace at::native
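One detail handled in the host code above is include_last_offset, which drops the implicit final bag; a short illustration of the resulting bag count, using arbitrary data:

import torch
import torch.nn.functional as F

weight = torch.randn(10, 3)
indices = torch.tensor([1, 2, 4, 5, 4, 3])
offsets = torch.tensor([0, 2, 6])  # last entry marks the end of the final bag
out = F.embedding_bag(indices, weight, offsets, mode="mean", include_last_offset=True)
print(out.shape)  # torch.Size([2, 3]): num_bags = offsets.numel() - 1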

View File

@ -1,4 +1,6 @@
#include <ATen/native/SpectralOpsUtils.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
@ -10,6 +12,20 @@
#include <ATen/ops/_fft_r2c_native.h>
#endif
#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))
@implementation FakeMPSGraphFFTDescriptor
+ (nullable instancetype)descriptor {
// Redispatch the constructor to the actual implementation
id desc = NSClassFromString(@"MPSGraphFFTDescriptor");
return (FakeMPSGraphFFTDescriptor*)[desc descriptor];
}
- (nonnull id)copyWithZone:(nullable NSZone*)zone {
return self;
}
@end
#endif
namespace at::native {
namespace {
MPSGraphFFTScalingMode normalization_to_ScalingMode(int64_t normalization) {

View File

@ -2,6 +2,7 @@
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/GridSamplerUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/GridSampler.h>

View File

@ -17,6 +17,7 @@
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TensorAdvancedIndexing.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <c10/util/SmallVector.h>
#include <c10/util/irange.h>
#include <fmt/format.h>

View File

@ -6,7 +6,9 @@
#include <ATen/native/LinearAlgebra.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -4,6 +4,7 @@
#include <ATen/TensorUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <c10/util/irange.h>

View File

@ -4,6 +4,7 @@
#include <ATen/WrapDimUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -5,6 +5,7 @@
#include <ATen/native/SortingUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -2,6 +2,8 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/UnaryOps.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,6 +1,7 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/Resize.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,6 +1,7 @@
// Copyright © 2023 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/UpSample.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>

View File

@ -4,6 +4,8 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>

View File

@ -2351,7 +2351,6 @@
dispatch:
CPU: _embedding_bag_forward_only_cpu
CUDA: _embedding_bag_forward_only_cuda
MPS: _embedding_bag_forward_only_mps
autogen: _embedding_bag_forward_only.out
- func: _rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor)
@ -2373,7 +2372,6 @@
dispatch:
CPU: _embedding_bag_cpu
CUDA: _embedding_bag_cuda
MPS: _embedding_bag_mps
autogen: _embedding_bag.out
tags: core
@ -2519,7 +2517,7 @@
dispatch:
CompositeExplicitAutograd: empty_like
QuantizedCPU, QuantizedCUDA: empty_like_quantized
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: empty_like_sparse_coo
SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: empty_like_nested
autogen: empty_like.out
@ -4374,7 +4372,7 @@
variants: function, method
dispatch:
CPU: narrow_copy_dense_cpu
SparseCPU, SparseCUDA, SparseMPS: narrow_copy_sparse
SparseCPU, SparseCUDA: narrow_copy_sparse
CompositeExplicitAutogradNonFunctional: narrow_copy_dense_symint
tags: view_copy
@ -6494,7 +6492,7 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: unsqueeze
SparseCPU, SparseCUDA, SparseMPS: unsqueeze_sparse
SparseCPU, SparseCUDA: unsqueeze_sparse
QuantizedCPU, QuantizedCUDA: unsqueeze_quantized
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: unsqueeze_nested
tags: core
@ -6662,7 +6660,7 @@
- func: zeros.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CompositeExplicitAutograd: zeros_out
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: zeros_sparse_out
SparseCPU, SparseCUDA, SparseMeta: zeros_sparse_out
- func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
dispatch:
@ -10261,7 +10259,7 @@
structured_delegate: any.all_out
variants: method, function
dispatch:
SparseCPU, SparseCUDA, SparseMPS: any_sparse
SparseCPU, SparseCUDA: any_sparse
tags: core
- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -10701,7 +10699,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow
CUDA: foreach_tensor_div_list_kernel_cuda
MTIA: foreach_tensor_div_list_kernel_mtia
- func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@ -10709,7 +10706,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow_
CUDA: foreach_tensor_div_list_kernel_cuda_
MTIA: foreach_tensor_div_list_kernel_mtia_
autogen: _foreach_div.List_out
- func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
@ -10733,7 +10729,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow
CUDA: foreach_tensor_div_tensor_kernel_cuda
MTIA: foreach_tensor_div_tensor_kernel_mtia
- func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@ -10741,7 +10736,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow_
CUDA: foreach_tensor_div_tensor_kernel_cuda_
MTIA: foreach_tensor_div_tensor_kernel_mtia_
autogen: _foreach_div.Tensor_out
- func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]

View File

@ -1,6 +1,5 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <ATen/ceil_div.h>
#include <ATen/native/cuda/Loops.cuh>
#include <c10/cuda/CUDAGuard.h>
@ -22,11 +21,10 @@
namespace at::native {
namespace {
template <typename T>
__global__ void ChooseQuantizationParamsKernelImpl(
const int64_t* fake_quant_on,
const T* x_min,
const T* x_max,
const float* x_min,
const float* x_max,
int32_t qmin,
int32_t qmax,
int size,
@ -95,44 +93,34 @@ __global__ void ChooseQuantizationParamsKernelImpl(
}
}
__device__ inline bool isinf_device(float v) {
return ::isinf(v);
}
__device__ inline bool isinf_device(c10::BFloat16 v) {
return ::isinf(static_cast<float>(v));
}
// CUDA kernel to compute Moving Average Min/Max of the tensor.
// It uses the running_min and running_max along with averaging const, c.
// The formula used to compute the new min/max is as follows
//
// running_min = (1 - c) * running_min + c * x_min, if running_min != inf
// running_min = x_min, if running_min == inf
template <typename T>
__global__ void MovingAverageMinMax(
const int64_t* observer_on,
const T* x_min,
const T* x_max,
T* running_min,
T* running_max,
const float* x_min,
const float* x_max,
float* running_min,
float* running_max,
const float averaging_const,
const int size) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (*observer_on == 1) {
if (i < size) {
T curr_min = x_min[i];
T curr_max = x_max[i];
float curr_min = x_min[i];
float curr_max = x_max[i];
T averaging_const_t = static_cast<T>(averaging_const);
float adjusted_min = ::isinf(running_min[i])
? curr_min
: (running_min[i]) + averaging_const * (curr_min - (running_min[i]));
T adjusted_min = isinf_device(running_min[i]) ? curr_min
: (running_min[i]) +
averaging_const_t * (curr_min - (running_min[i]));
T adjusted_max = isinf_device(running_max[i]) ? curr_max
: (running_max[i]) +
averaging_const_t * (curr_max - (running_max[i]));
float adjusted_max = ::isinf(running_max[i])
? curr_max
: (running_max[i]) + averaging_const * (curr_max - (running_max[i]));
running_min[i] = adjusted_min;
running_max[i] = adjusted_max;
@ -154,51 +142,40 @@ void _calculate_moving_average(
at::Tensor x_min, x_max;
int64_t* observer_on_data = observer_on.data_ptr<int64_t>();
float* running_min_data = running_min.data_ptr<float>();
float* running_max_data = running_max.data_ptr<float>();
cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();
if (per_row_fq) {
std::tie(x_min, x_max) = at::aminmax(x, 1);
float* x_min_data = x_min.data_ptr<float>();
float* x_max_data = x_max.data_ptr<float>();
int num_threads = std::min(size, (int64_t)512);
const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* x_min_data = x_min.data_ptr<scalar_t>();
scalar_t* x_max_data = x_max.data_ptr<scalar_t>();
scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
// Moving Average Min/Max observer for activations
MovingAverageMinMax<<<num_blocks, num_threads, 0, cuda_stream>>>(
observer_on_data,
x_min_data,
x_max_data,
running_min_data,
running_max_data,
averaging_const,
size);
});
// Moving Average Min/Max observer for activations
MovingAverageMinMax<<<num_blocks, num_threads, 0, cuda_stream>>>(
observer_on_data,
x_min_data,
x_max_data,
running_min_data,
running_max_data,
averaging_const,
size);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
std::tie(x_min, x_max) = at::aminmax(x);
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* x_min_data = x_min.data_ptr<scalar_t>();
scalar_t* x_max_data = x_max.data_ptr<scalar_t>();
scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
// Moving Average Min/Max observer for activations
MovingAverageMinMax<<<1, 1, 0, cuda_stream>>>(
observer_on_data,
x_min_data,
x_max_data,
running_min_data,
running_max_data,
averaging_const,
1 /*size*/);
});
float* x_min_data = x_min.data_ptr<float>();
float* x_max_data = x_max.data_ptr<float>();
// Moving Average Min/Max observer for activations
MovingAverageMinMax<<<1, 1, 0, cuda_stream>>>(
observer_on_data,
x_min_data,
x_max_data,
running_min_data,
running_max_data,
averaging_const,
1 /*size*/);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
@ -221,44 +198,34 @@ void _calc_moving_avg_qparams_helper(
cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();
int64_t* fake_quant_on_data = fake_quant_on.data_ptr<int64_t>();
if (per_row_fq) {
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
int num_threads = std::min(size, (int64_t)512);
const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
ChooseQuantizationParamsKernelImpl<<<
num_blocks,
num_threads,
0,
cuda_stream>>>(
fake_quant_on_data,
running_min_data,
running_max_data,
qmin,
qmax,
size,
symmetric_quant,
scale_ptr,
zp_ptr);
});
float* running_min_data = running_min.data_ptr<float>();
float* running_max_data = running_max.data_ptr<float>();
int num_threads = std::min(size, (int64_t)512);
const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
ChooseQuantizationParamsKernelImpl<<<num_blocks, num_threads, 0, cuda_stream>>>(
fake_quant_on_data,
running_min_data,
running_max_data,
qmin,
qmax,
size,
symmetric_quant,
scale_ptr,
zp_ptr);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
ChooseQuantizationParamsKernelImpl<<<1, 1, 0, cuda_stream>>>(
fake_quant_on_data,
running_min_data,
running_max_data,
qmin,
qmax,
1, // size
symmetric_quant, // preserve_sparsity
scale_ptr,
zp_ptr);
});
float* running_min_data = running_min.data_ptr<float>();
float* running_max_data = running_max.data_ptr<float>();
ChooseQuantizationParamsKernelImpl<<<1, 1, 0, cuda_stream>>>(
fake_quant_on_data,
running_min_data,
running_max_data,
qmin,
qmax,
1, // size
symmetric_quant, // preserve_sparsity
scale_ptr,
zp_ptr);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
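The running-min/max update implemented by MovingAverageMinMax above follows running = running + c * (current - running), with inf treated as "uninitialized". The same rule is exposed through the quantization observers; a small sketch (averaging constant and tensor values chosen arbitrarily):

import torch
from torch.ao.quantization.observer import MovingAverageMinMaxObserver

obs = MovingAverageMinMaxObserver(averaging_constant=0.01)
obs(torch.tensor([1.0, 2.0, 3.0]))   # first call: running stats take the batch min/max
obs(torch.tensor([0.0, 10.0]))       # then: running += 0.01 * (batch_stat - running)
print(obs.min_val, obs.max_val)      # 0.99 and 3.07
scale, zero_point = obs.calculate_qparams()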

View File

@ -42,7 +42,7 @@ TEST(MPSObjCInterfaceTest, MPSCustomKernel) {
id<MTLLibrary> customKernelLibrary = [device newLibraryWithSource: [NSString stringWithUTF8String:CUSTOM_KERNEL]
options: nil
error: &error];
TORCH_CHECK(customKernelLibrary, "Failed to create custom kernel library, error: ", error.localizedDescription.UTF8String);
TORCH_CHECK(customKernelLibrary, "Failed to to create custom kernel library, error: ", error.localizedDescription.UTF8String);
id<MTLFunction> customFunction = [customKernelLibrary newFunctionWithName: @"add_arrays"];
TORCH_CHECK(customFunction, "Failed to create function state object for the kernel");

View File

@ -76,23 +76,4 @@ int32_t getGlobalIdxFromDevice(DeviceIndex device) {
return device_global_idxs[device];
}
// Check if a device can access the memory of a peer device directly.
bool canDeviceAccessPeer(DeviceIndex device, DeviceIndex peer) {
if (device == -1) {
device = c10::xpu::current_device();
}
if (peer == -1) {
peer = c10::xpu::current_device();
}
check_device_index(device);
check_device_index(peer);
// A device can always access itself
if (device == peer) {
return true;
}
return c10::xpu::get_raw_device(device).ext_oneapi_can_access_peer(
c10::xpu::get_raw_device(peer),
sycl::ext::oneapi::peer_access::access_supported);
}
} // namespace at::xpu

View File

@ -17,6 +17,4 @@ TORCH_XPU_API DeviceProp* getDeviceProperties(DeviceIndex device);
TORCH_XPU_API int32_t getGlobalIdxFromDevice(DeviceIndex device);
TORCH_XPU_API bool canDeviceAccessPeer(DeviceIndex device, DeviceIndex peer);
} // namespace at::xpu

View File

@ -72,12 +72,6 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
"timm_vovnet",
"torchrec_dlrm",
"vgg16",
# LLM
"meta-llama/Llama-3.2-1B",
"google/gemma-2-2b",
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
}
)

View File

@ -55,12 +55,6 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
"timm_nfnet",
"torchrec_dlrm",
"vgg16",
# LLM
"meta-llama/Llama-3.2-1B",
"google/gemma-2-2b",
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
}
)

View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -167,23 +167,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,fail_accuracy,0
google/gemma-2-2b,fail_accuracy,0
google/gemma-3-4b-it,fail_accuracy,0
openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
google/gemma-2-2b,pass,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -167,23 +167,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,fail_accuracy,0
google/gemma-2-2b,fail_accuracy,0
google/gemma-3-4b-it,fail_accuracy,0
openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -205,7 +205,7 @@ llama,pass,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -205,7 +205,7 @@ llama,pass,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -733,7 +733,7 @@ def timed(
time_total = 0
# Don't collect outputs, to correctly measure timing
for i in range(times):
for _ in range(times):
# If batch_size is 1, it too often collides with other non batch size
# dimensions resulting in errors.
if batch_size and batch_size > 1:
@ -1106,13 +1106,7 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
elif args.torchscript_jit_trace:
frozen_model_iter_fn = torchscript_jit_trace(model, example_inputs)
else:
if kwargs["hf_llm"]:
# If it's an llm, we want to optimize model.forward, and use
# the generate function
model.forward = torch._dynamo.run(model)
frozen_model_iter_fn = model_iter_fn
else:
frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)
frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)
for rep in trange(args.repeat, desc="running benchmark"):
inputs = (
@ -1126,10 +1120,7 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
maybe_mark_step(args)
# interleave the runs to handle frequency scaling and load changes
with (
maybe_mark_profile(p=p, mark="expected"),
torch.compiler.set_stance("force_eager"),
):
with maybe_mark_profile(p=p, mark="expected"):
timings[rep, 0], expected_output = timed(
model,
model_iter_fn,
@ -2242,12 +2233,11 @@ class BenchmarkRunner:
reset_rng_state()
model_copy = None
try:
with torch.compiler.set_stance("force_eager"):
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
except Exception as e:
accuracy_status = (
"eager_1st_run_OOM"
@ -2264,12 +2254,11 @@ class BenchmarkRunner:
reset_rng_state()
model_copy = None
try:
with torch.compiler.set_stance("force_eager"):
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_rerun_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_rerun_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
except Exception as e:
accuracy_status = (
"eager_2nd_run_OOM"
@ -2553,11 +2542,7 @@ class BenchmarkRunner:
)
baseline_timings = experiment(
self.model_iter_fn,
model,
example_inputs,
mark="expected",
**experiment_kwargs,
model, example_inputs, mark="expected", **experiment_kwargs
)
if self.args.export_aot_inductor:
@ -2625,11 +2610,7 @@ class BenchmarkRunner:
)
backend_timings = experiment(
self.model_iter_fn,
model,
example_inputs,
mark="expected",
**experiment_kwargs,
model, example_inputs, mark="expected", **experiment_kwargs
)
timings = np.stack((baseline_timings, backend_timings), axis=1)
result_summary = latency_experiment_summary(
@ -2648,17 +2629,9 @@ class BenchmarkRunner:
tag=None,
batch_size=None,
):
niters = 5
if getattr(self, "hf_llm", False):
# If we're benchmarking an llm, we want to use the generate function
self.model_iter_fn = self.generate
niters = 1
if self.args.xla:
with self.pick_grad(name, self.args.training):
return experiment(
self.model_iter_fn, *self.maybe_cast(model, example_inputs)
)
return experiment(*self.maybe_cast(model, example_inputs))
def warmup(fn, model, example_inputs, mode, niters=5):
gc.collect()
@ -2723,22 +2696,17 @@ class BenchmarkRunner:
with maybe_snapshot_memory(
self.args.snapshot_memory, f"eager_{self.args.only}"
):
with torch.compiler.set_stance("force_eager"):
eager_latency, eager_peak_mem, _ = warmup(
eager_latency, eager_peak_mem, _ = warmup(
self.model_iter_fn, copy.deepcopy(model), example_inputs, "eager"
)
if self.args.use_warm_peak_memory:
_, eager_peak_mem, _ = warmup(
self.model_iter_fn,
copy.deepcopy(model),
example_inputs,
"eager",
niters=niters,
niters=1,
)
if self.args.use_warm_peak_memory:
_, eager_peak_mem, _ = warmup(
self.model_iter_fn,
copy.deepcopy(model),
example_inputs,
"eager",
niters=1,
)
if (
self.args.export_aot_inductor
@ -2747,13 +2715,7 @@ class BenchmarkRunner:
):
optimized_model_iter_fn = optimize_ctx
else:
if getattr(self, "hf_llm", False):
# If it's an llm, we want to optimize model.forward, and use
# the generate function
model = optimize_ctx(model)
optimized_model_iter_fn = self.model_iter_fn
else:
optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
with maybe_snapshot_memory(
self.args.snapshot_memory, f"compiled_{self.args.only}"
@ -2831,13 +2793,7 @@ class BenchmarkRunner:
f"{ok:3}/{total:3} +{frames_third_pass} frames {compilation_time:3.0f}s"
)
experiment_kwargs["hf_llm"] = getattr(self, "hf_llm", False)
results.append(
experiment(
self.model_iter_fn, model, example_inputs, **experiment_kwargs
)
)
results.append(experiment(model, example_inputs, **experiment_kwargs))
return " ".join(map(str, results))
def minify_model(
@ -4128,7 +4084,7 @@ def run(runner, args, original_dir=None):
# Overwrite 'translation_validation' config, if specified.
torch.fx.experimental._config.translation_validation = False
experiment = functools.partial(experiment, args)
experiment = functools.partial(experiment, args, runner.model_iter_fn)
if args.only and should_diff_branch(args):
import git
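For context, the two measurement paths toggled in the hunks above boil down to running the eager baseline under a forced-eager stance and running the compiled path through torch._dynamo.run; a stripped-down sketch with a dummy model and inputs:

import torch

def iter_fn(mod, inputs):
    return mod(*inputs)

model = torch.nn.Linear(8, 8)
inputs = (torch.randn(2, 8),)
frozen_iter_fn = torch._dynamo.run(iter_fn)       # reuse already-compiled code, no new compiles
with torch.compiler.set_stance("force_eager"):    # baseline measurement stays uncompiled
    expected = iter_fn(model, inputs)
actual = frozen_iter_fn(model, inputs)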

View File

@ -7,7 +7,6 @@ import os
import re
import subprocess
import sys
import types
import warnings
@ -129,12 +128,6 @@ with open(MODELS_FILENAME) as fh:
assert len(BATCH_SIZE_KNOWN_MODELS)
try:
from .huggingface_llm_models import HF_LLM_MODELS
except ImportError:
from huggingface_llm_models import HF_LLM_MODELS
def get_module_cls_by_model_name(model_cls_name):
_module_by_model_name = {
"Speech2Text2Decoder": "transformers.models.speech_to_text_2.modeling_speech_to_text_2",
@ -425,8 +418,11 @@ class HuggingfaceRunner(BenchmarkRunner):
use_eval_mode = self.args.use_eval_mode
dtype = torch.float32
reset_rng_state()
# Get batch size
model_cls, config = self._get_model_cls_and_config(model_name)
model = self._download_model(model_name)
model = model.to(device, dtype=dtype)
if self.args.enable_activation_checkpointing:
model.gradient_checkpointing_enable()
if model_name in BATCH_SIZE_KNOWN_MODELS:
batch_size_default = BATCH_SIZE_KNOWN_MODELS[model_name]
elif batch_size is None:
@ -444,46 +440,14 @@ class HuggingfaceRunner(BenchmarkRunner):
f"Running smaller batch size={batch_size} for {model_name}, orig batch_size={batch_size_default}" # noqa: G004
)
# Get model and example inputs
if model_name in HF_LLM_MODELS:
benchmark_cls = HF_LLM_MODELS[model_name]
model, example_inputs = benchmark_cls.get_model_and_inputs(
model_name, device
)
example_inputs = generate_inputs_for_model(
model_cls, model, model_name, batch_size, device, include_loss_args=True
)
# Set this flag so that when we test for speedup, we use
# model.generate instead of using model.forward
self.hf_llm = True
def generate(self, _, example_inputs, collect_outputs=True):
return model.generate(**example_inputs)
self.generate = types.MethodType(generate, self)
else:
self.hf_llm = False
model_cls, config = self._get_model_cls_and_config(model_name)
model = self._download_model(model_name)
model = model.to(device, dtype=dtype)
example_inputs = generate_inputs_for_model(
model_cls, model, model_name, batch_size, device, include_loss_args=True
)
# So we can check for correct gradients without eliminating the dropout computation
for attr in dir(config):
if "drop" in attr and isinstance(getattr(config, attr), float):
setattr(config, attr, 1e-30)
# Turning off kv cache for torchbench models. This is not the right
# thing to do, but the pt2 dashboard is outdated. Real transformers
# benchmarks will be added soon using a different infra.
if hasattr(model, "config") and hasattr(model.config, "use_cache"):
model.config.use_cache = False
if self.args.enable_activation_checkpointing:
model.gradient_checkpointing_enable()
# So we can check for correct gradients without eliminating the dropout computation
for attr in dir(config):
if "drop" in attr and isinstance(getattr(config, attr), float):
setattr(config, attr, 1e-30)
if (
is_training
@ -496,6 +460,12 @@ class HuggingfaceRunner(BenchmarkRunner):
else:
model.eval()
# Turning off kv cache for torchbench models. This is not the right
# thing to do, but the pt2 dashboard is outdated. Real transformers
# benchmarks will be added soon using a different infra.
if hasattr(model, "config") and hasattr(model.config, "use_cache"):
model.config.use_cache = False
self.validate_model(model, example_inputs)
return device, model_name, model, example_inputs, batch_size
@ -560,8 +530,7 @@ class HuggingfaceRunner(BenchmarkRunner):
def forward_pass(self, mod, inputs, collect_outputs=True):
with self.autocast(**self.autocast_arg):
res = mod(**inputs)
return res.logits if self.hf_llm else res
return mod(**inputs)
def forward_and_backward_pass(self, mod, inputs, collect_outputs=True):
cloned_inputs = clone_inputs(inputs)

View File

@ -9,16 +9,9 @@ skip:
# Fails with even batch size = 1
- GPTJForCausalLM
- GPTJForQuestionAnswering
# Model too big
- google/gemma-3-4b-it
device:
cpu:
- meta-llama/Llama-3.2-1B
- google/gemma-2-2b
- google/gemma-3-4b-it
- openai/whisper-tiny
- Qwen/Qwen3-0.6B
cpu: []
control_flow:
- AllenaiLongformerBase
@ -74,11 +67,6 @@ batch_size:
XGLMForCausalLM: 4
XLNetLMHeadModel: 2
YituTechConvBert: 2
meta-llama/Llama-3.2-1B: 8
google/gemma-2-2b: 8
google/gemma-3-4b-it: 8
openai/whisper-tiny: 8
Qwen/Qwen3-0.6B: 8
tolerance:

View File

@ -1,102 +0,0 @@
import subprocess
import sys
import torch
def pip_install(package):
subprocess.check_call([sys.executable, "-m", "pip", "install", package])
try:
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
WhisperForConditionalGeneration,
WhisperProcessor,
)
except ModuleNotFoundError:
print("Installing HuggingFace Transformers...")
pip_install("git+https://github.com/huggingface/transformers.git#egg=transformers")
finally:
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
WhisperForConditionalGeneration,
WhisperProcessor,
)
class Benchmark:
@staticmethod
def get_model_and_inputs(model_name, device):
raise NotImplementedError("get_model_and_inputs() not implemented")
class WhisperBenchmark(Benchmark):
SAMPLE_RATE = 16000
DURATION = 30.0 # seconds
@staticmethod
def get_model_and_inputs(model_name, device):
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
model.config.forced_decoder_ids = None
model.generation_config.do_sample = False
model.generation_config.temperature = 0.0
num_samples = int(WhisperBenchmark.DURATION * WhisperBenchmark.SAMPLE_RATE)
audio = torch.randn(num_samples) * 0.1
inputs = dict(
processor(
audio, sampling_rate=WhisperBenchmark.SAMPLE_RATE, return_tensors="pt"
)
)
inputs["input_features"] = inputs["input_features"].to(device)
decoder_start_token = model.config.decoder_start_token_id
inputs["decoder_input_ids"] = torch.tensor(
[[decoder_start_token]], device=device
)
return model, inputs
class TextGenerationBenchmark(Benchmark):
INPUT_LENGTH = 1000
OUTPUT_LENGTH = 2000
@staticmethod
def get_model_and_inputs(model_name, device):
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device)
model.eval()
model.generation_config.do_sample = False
model.generation_config.use_cache = True
model.generation_config.cache_implementation = "static"
model.generation_config.max_new_tokens = TextGenerationBenchmark.OUTPUT_LENGTH
model.generation_config.pad_token_id = tokenizer.eos_token_id
model.generation_config.temperature = 0.0
vocab_size = tokenizer.vocab_size
input_ids = torch.randint(
low=0,
high=vocab_size,
size=(1, TextGenerationBenchmark.INPUT_LENGTH),
device=device,
dtype=torch.long,
)
example_inputs = {"input_ids": input_ids}
return model, example_inputs
HF_LLM_MODELS: dict[str, Benchmark] = {
"meta-llama/Llama-3.2-1B": TextGenerationBenchmark,
"google/gemma-2-2b": TextGenerationBenchmark,
"google/gemma-3-4b-it": TextGenerationBenchmark,
"openai/whisper-tiny": WhisperBenchmark,
"Qwen/Qwen3-0.6B": TextGenerationBenchmark,
}
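
The module deleted above is a small registry: `HF_LLM_MODELS` maps a HuggingFace model id to a `Benchmark` subclass, and the runner only ever calls that subclass's `get_model_and_inputs` staticmethod. A stripped-down, runnable sketch of the dispatch is below; the dummy class and registry entries are illustrative stand-ins, not the real Whisper or text-generation benchmarks.

class Benchmark:
    @staticmethod
    def get_model_and_inputs(model_name, device):
        raise NotImplementedError("get_model_and_inputs() not implemented")


class DummyTextGenerationBenchmark(Benchmark):
    # Hypothetical stand-in: returns placeholders instead of a real HF model.
    @staticmethod
    def get_model_and_inputs(model_name, device):
        return f"<{model_name} on {device}>", {"input_ids": [0, 1, 2]}


# Registry keyed by model id, mirroring HF_LLM_MODELS in the deleted file.
REGISTRY = {"meta-llama/Llama-3.2-1B": DummyTextGenerationBenchmark}


def load(model_name, device):
    benchmark_cls = REGISTRY[model_name]
    # The runner never instantiates the class; the staticmethod does the work.
    return benchmark_cls.get_model_and_inputs(model_name, device)


model, example_inputs = load("meta-llama/Llama-3.2-1B", "cpu")
print(model, example_inputs)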

View File

@@ -46,8 +46,3 @@ TrOCRForCausalLM,64
XGLMForCausalLM,32
XLNetLMHeadModel,16
YituTechConvBert,32
meta-llama/Llama-3.2-1B,8
google/gemma-2-2b,8
google/gemma-3-4b-it,8
openai/whisper-tiny,8
Qwen/Qwen3-0.6B,8

View File

@@ -61,22 +61,6 @@ struct C10_API Storage {
allocator,
resizable)) {}
// Creates storage with pre-allocated memory buffer. Allocator is given for
// potential future reallocations, however it can be nullptr if the storage
// is non-resizable
Storage(
use_byte_size_t /*use_byte_size*/,
SymInt size_bytes,
at::DataPtr data_ptr,
at::Allocator* allocator = nullptr,
bool resizable = false)
: storage_impl_(c10::make_intrusive<StorageImpl>(
StorageImpl::use_byte_size_t(),
std::move(size_bytes),
std::move(data_ptr),
allocator,
resizable)) {}
protected:
explicit Storage(unsafe_borrow_t, const Storage& rhs)
: storage_impl_(c10::intrusive_ptr<c10::StorageImpl>::reclaim(

View File

@@ -10,9 +10,9 @@ namespace c10::cuda {
void c10_cuda_check_implementation(
const int32_t err,
const char* filename,
const char* function_name,
const uint32_t line_number,
const char* /*filename*/,
const char* /*function_name*/,
const int /*line_number*/,
const bool include_device_assertions) {
const auto cuda_error = static_cast<cudaError_t>(err);
const auto cuda_kernel_failure = include_device_assertions
@@ -41,7 +41,7 @@ void c10_cuda_check_implementation(
}
#endif
throw c10::AcceleratorError(
{function_name, filename, line_number}, err, check_message);
{__func__, __FILE__, int32_t(__LINE__)}, err, check_message);
}
} // namespace c10::cuda

View File

@@ -91,7 +91,7 @@ C10_CUDA_API void c10_cuda_check_implementation(
const int32_t err,
const char* filename,
const char* function_name,
const uint32_t line_number,
const int line_number,
const bool include_device_assertions);
} // namespace c10::cuda

View File

@@ -1,22 +0,0 @@
// Shim header for filesystem for compilers that are too old to have it not
// in the experimental namespace
#if __has_include(<filesystem>)
#include <filesystem>
#elif __has_include(<experimental/filesystem>)
#include <experimental/filesystem>
#else
#error "Neither <filesystem> nor <experimental/filesystem> is available."
#endif
namespace c10 {
#if __has_include(<filesystem>)
// NOLINTNEXTLINE(misc-unused-alias-decls)
namespace filesystem = std::filesystem;
#elif __has_include(<experimental/filesystem>)
// NOLINTNEXTLINE(misc-unused-alias-decls)
namespace filesystem = std::experimental::filesystem;
#endif
} // namespace c10

View File

@@ -52,7 +52,7 @@ struct maybe_bool {
template <typename src_t>
struct maybe_bool<true, src_t> {
C10_HOST_DEVICE static inline decltype(auto) apply(src_t src) {
// Don't use bool operator so as to also compile for ComplexHalf.
// Don't use bool operator so as to to also compile for ComplexHalf.
return src.real() || src.imag();
}
};

Some files were not shown because too many files have changed in this diff.