Update base for Update on "[map] support gen_schema for map"

In-place mutation may create inter-iteration dependencies that break the parallelism we rely on for map, so we ban input mutations.
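
A minimal plain-Python sketch (not the actual `map` higher-order op) of the failure mode: when the body writes to a shared input in place, each iteration observes whatever earlier iterations happened to do, so the result depends on execution order and the iterations can no longer run independently.

```python
import torch

xs = torch.arange(1.0, 5.0)     # one element per "loop iteration"
buf = torch.zeros(1)            # an input that the body mutates in place

def body(x):
    buf.add_(x)                 # in-place write to a shared input
    return buf.clone()

# Sequential execution bakes in an ordering: 1., 3., 6., 10.
seq = [body(x) for x in xs]

# Any parallel or reordered schedule of the same body yields different values;
# banning input mutation lets gen_schema treat every input as non-mutated and
# keeps each iteration a pure function of its own slice of xs.
```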


[ghstack-poisoned]
Yidi Wu
2025-08-11 15:52:21 -07:00
1392 changed files with 55537 additions and 65309 deletions

View File

@ -438,9 +438,7 @@ def build_torchvision(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += (
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
@ -495,9 +493,7 @@ def build_torchdata(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += (
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
@ -553,9 +549,7 @@ def build_torchtext(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += (
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
@ -613,9 +607,7 @@ def build_torchaudio(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += (
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

View File

@ -104,7 +104,6 @@ If your new Docker image needs a library installed from a specific pinned commit
```bash
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-new1)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
VISION=yes

View File

@ -93,7 +93,6 @@ tag=$(echo $image | awk -F':' '{print $2}')
case "$tag" in
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11)
CUDA_VERSION=12.4
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
@ -104,7 +103,6 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
@ -115,7 +113,6 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
@ -127,7 +124,6 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
@ -139,7 +135,6 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
@ -149,20 +144,8 @@ case "$tag" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9)
CUDA_VERSION=12.6.3
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
VISION=yes
@ -171,45 +154,8 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
@ -230,19 +176,7 @@ case "$tag" in
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.11-clang12)
ANACONDA_PYTHON_VERSION=3.11
CLANG_VERSION=12
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.9-gcc9)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=9
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3)
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3)
if [[ $tag =~ "jammy" ]]; then
ANACONDA_PYTHON_VERSION=3.10
else
@ -256,7 +190,9 @@ case "$tag" in
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
INDUCTOR_BENCHMARKS=yes
if [[ $tag =~ "benchmarks" ]]; then
INDUCTOR_BENCHMARKS=yes
fi
;;
pytorch-linux-noble-rocm-alpha-py3)
ANACONDA_PYTHON_VERSION=3.12
@ -268,7 +204,6 @@ case "$tag" in
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
INDUCTOR_BENCHMARKS=yes
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
;;
pytorch-linux-jammy-xpu-2025.0-py3)
@ -299,7 +234,6 @@ case "$tag" in
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
CLANG_VERSION=12
VISION=yes
TRITON=yes
@ -378,7 +312,6 @@ case "$tag" in
fi
if [[ "$image" == *cuda* ]]; then
extract_version_from_image_name cuda CUDA_VERSION
extract_version_from_image_name cudnn CUDNN_VERSION
fi
if [[ "$image" == *rocm* ]]; then
extract_version_from_image_name rocm ROCM_VERSION
@ -430,9 +363,6 @@ docker build \
--build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
--build-arg "GCC_VERSION=${GCC_VERSION}" \
--build-arg "CUDA_VERSION=${CUDA_VERSION}" \
--build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
--build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \
--build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
--build-arg "KATEX=${KATEX:-}" \
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \

View File

@ -1 +1 @@
11ec6354315768a85da41032535e3b7b99c5f706
f7888497a1eb9e98d4c07537f0d0bcfe180d1363

View File

@ -66,8 +66,9 @@ function do_cpython_build {
ln -s pip3 ${prefix}/bin/pip
fi
# install setuptools since, as of python 3.12, setuptools is required to use distutils
${prefix}/bin/pip install wheel==0.45.1 setuptools==80.9.0
local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))")
# packaging is needed to create symlink since wheel no longer provides needed information
${prefix}/bin/pip install packaging==25.0 wheel==0.45.1 setuptools==80.9.0
local abi_tag=$(${prefix}/bin/python -c "from packaging.tags import interpreter_name, interpreter_version; import sysconfig ; from sysconfig import get_config_var; print('{0}{1}-{0}{1}{2}'.format(interpreter_name(), interpreter_version(), 't' if sysconfig.get_config_var('Py_GIL_DISABLED') else ''))")
ln -sf ${prefix} /opt/python/${abi_tag}
}
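
A hedged sketch of the ABI-tag string the new one-liner builds with `packaging.tags`, assuming a CPython interpreter; the free-threaded `t` suffix only appears when `Py_GIL_DISABLED` is set.

```python
import sysconfig
from packaging.tags import interpreter_name, interpreter_version

suffix = "t" if sysconfig.get_config_var("Py_GIL_DISABLED") else ""
name_ver = f"{interpreter_name()}{interpreter_version()}"
abi_tag = f"{name_ver}-{name_ver}{suffix}"
# e.g. "cp312-cp312" on CPython 3.12, "cp313-cp313t" on a free-threaded 3.13 build
print(abi_tag)
```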

View File

@ -68,8 +68,8 @@ function install_nvshmem {
# download, unpack, install
wget -q "${url}"
tar xf "${filename}.tar.gz"
cp -a "libnvshmem/include/"* /usr/local/include/
cp -a "libnvshmem/lib/"* /usr/local/lib/
cp -a "libnvshmem/include/"* /usr/local/cuda/include/
cp -a "libnvshmem/lib/"* /usr/local/cuda/lib64/
# cleanup
cd ..

View File

@ -1,26 +0,0 @@
#!/bin/bash
if [[ -n "${CUDNN_VERSION}" ]]; then
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn
pushd tmp_cudnn
if [[ ${CUDA_VERSION:0:4} == "12.9" || ${CUDA_VERSION:0:4} == "12.8" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
else
print "Unsupported CUDA version ${CUDA_VERSION}"
exit 1
fi
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
tar xf ${CUDNN_NAME}.tar.xz
cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/
cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/
popd
rm -rf tmp_cudnn
ldconfig
fi

View File

@ -15,11 +15,37 @@ function install_timm() {
commit=$(get_pinned_commit timm)
pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
# Clean up
conda_run pip uninstall -y torch torchvision triton
}
function install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
python install.py --continue_on_fail
# TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
# is regressing speedup metric. This needs to be investigated further
pip install transformers==4.38.1
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
chown -R jenkins torchbench
}
# Pango is needed for weasyprint which is needed for doctr
conda_install pango
# Stable packages are ok here, just to satisfy TorchBench check
pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
install_torchbench
install_huggingface
install_timm
# Clean up
conda_run pip uninstall -y torch torchvision torchaudio triton torchao

View File

@ -30,7 +30,7 @@ EOF
# we want the patch version of 6.4 instead
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
ROCM_VERSION="${ROCM_VERSION}.1"
ROCM_VERSION="${ROCM_VERSION}.2"
fi
# Default url values
@ -85,16 +85,19 @@ EOF
# CI no longer builds for ROCm 6.3, but
# ROCm 6.4 did not yet fix the regression, also HIP branch names are different
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.4) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then
HIP_BRANCH=release/rocm-rel-6.4
CLR_HASH=606bc820b4b1f315d135da02a1f0b176ca50a92c # branch release/rocm-rel-6.4.1-statco-hotfix
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.2) ]]; then
HIP_TAG=rocm-6.4.2
CLR_HASH=74d78ba3ac4bac235d02bcb48511c30b5cfdd457 # branch release/rocm-rel-6.4.2-statco-hotfix
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then
HIP_TAG=rocm-6.4.1
CLR_HASH=efe6c35790b9206923bfeed1209902feff37f386 # branch release/rocm-rel-6.4.1-statco-hotfix
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
HIP_BRANCH=release/rocm-rel-6.4
HIP_TAG=rocm-6.4.0
CLR_HASH=600f5b0d2baed94d5121e2174a9de0851b040b0c # branch release/rocm-rel-6.4-statco-hotfix
fi
# clr build needs CppHeaderParser but can only find it using conda's python
python -m pip install CppHeaderParser
git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
git clone https://github.com/ROCm/HIP -b $HIP_TAG
HIP_COMMON_DIR=$(readlink -f HIP)
git clone https://github.com/jeffdaily/clr
pushd clr

View File

@ -34,18 +34,27 @@ function install_ubuntu() {
# The xpu-smi packages
apt-get install -y flex bison xpu-smi
# Compute and Media Runtimes
apt-get install -y \
intel-opencl-icd intel-level-zero-gpu level-zero \
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
apt-get install -y intel-ocloc
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
# Compute and Media Runtimes
apt-get install -y \
intel-opencl-icd intel-level-zero-gpu level-zero \
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
else # rolling driver
apt-get install -y \
intel-opencl-icd libze-intel-gpu1 libze1 \
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
fi
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
# Install Intel Support Packages
apt-get install -y ${XPU_PACKAGES}
@ -130,11 +139,11 @@ function install_sles() {
}
# Default use GPU driver LTS releases
XPU_DRIVER_VERSION="/lts/2350"
if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
# Use GPU driver rolling releases
XPU_DRIVER_VERSION=""
# Default use GPU driver rolling releases
XPU_DRIVER_VERSION=""
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
# Use GPU driver LTS releases
XPU_DRIVER_VERSION="/lts/2350"
fi
# Default use Intel® oneAPI Deep Learning Essentials 2025.0

View File

@ -41,7 +41,7 @@ case ${DOCKER_TAG_PREFIX} in
rocm*)
# we want the patch version of 6.4 instead
if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.1"
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
BASE_TARGET=rocm
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete

View File

@ -77,7 +77,7 @@ case ${image} in
manylinux2_28-builder:rocm*)
# we want the patch version of 6.4 instead
if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.1"
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
TARGET=rocm_final
MANY_LINUX_VERSION="2_28"

View File

@ -63,11 +63,12 @@ lark==0.12.0
#Pinned versions: 0.12.0
#test that import:
librosa>=0.6.2 ; python_version < "3.11"
librosa==0.10.2 ; python_version == "3.12"
librosa>=0.6.2 ; python_version < "3.11" and platform_machine != "s390x"
librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
#Description: A python package for music and audio analysis
#Pinned versions: >=0.6.2
#test that import: test_spectral_ops.py
#librosa depends on numba; disable it for s390x while numba is disabled too
#mkl #this breaks linux-bionic-rocm4.5-py3.7
#Description: Intel oneAPI Math Kernel Library
@ -110,14 +111,15 @@ ninja==1.11.1.3
#Pinned versions: 1.11.1.3
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
numba==0.49.0 ; python_version < "3.9"
numba==0.55.2 ; python_version == "3.9"
numba==0.55.2 ; python_version == "3.10"
numba==0.60.0 ; python_version == "3.12"
numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
#test that import: test_numba_integration.py
#For numba issue see https://github.com/pytorch/pytorch/issues/51511
#Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073
#numpy
#Description: Provides N-dimensional arrays and linear algebra
@ -221,9 +223,9 @@ pygments==2.15.0
#Pinned versions: 2.12.0
#test that import: the doctests
#PyYAML
#pyyaml
#Description: data serialization format
#Pinned versions:
#Pinned versions: 6.0.2
#test that import:
#requests
@ -233,7 +235,7 @@ pygments==2.15.0
#rich
#Description: rich text and beautiful formatting in the terminal
#Pinned versions: 10.9.0
#Pinned versions: 14.1.0
#test that import:
scikit-image==0.19.3 ; python_version < "3.10"
@ -307,7 +309,7 @@ pytest-cpp==2.3.0
#Pinned versions: 2.3.0
#test that import:
z3-solver==4.15.1.0
z3-solver==4.15.1.0 ; platform_machine != "s390x"
#Description: The Z3 Theorem Prover Project
#Pinned versions:
#test that import:
@ -361,7 +363,6 @@ pwlf==2.2.1
#Pinned versions: 2.2.1
#test that import: test_sac_estimator.py
# To build PyTorch itself
pyyaml
pyzstd
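
The `platform_machine != "s390x"` suffixes above are standard PEP 508 environment markers; a small sketch (using the `packaging` library, assumed available) of how pip decides whether such a pinned line applies on a given machine:

```python
from packaging.markers import Marker

marker = Marker('python_version == "3.12" and platform_machine != "s390x"')

# pip evaluates the marker against the current interpreter/platform; when it
# evaluates to False the requirement line is skipped entirely.
print(marker.evaluate())

# Evaluating against an explicit environment shows the s390x case:
print(marker.evaluate({"python_version": "3.12", "platform_machine": "s390x"}))  # False
```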

View File

@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
@ -50,8 +50,8 @@ IPython==8.12.0
#Pinned versions: 8.12.0
myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 0.13.2
#Description: This is used to generate PyTorch functorch and torch.compile docs.
#Pinned versions: 0.17.2
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
@ -59,4 +59,3 @@ sphinx-copybutton==0.5.0
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
myst-parser==0.18.1
myst-nb

View File

@ -98,8 +98,9 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
# (optional) Install non-default Ninja version
ARG NINJA_VERSION

View File

@ -98,8 +98,9 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
ARG TRITON
ARG TRITON_CPU

View File

@ -138,28 +138,11 @@ fi
echo "Calling setup.py bdist at $(date)"
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 \
time CMAKE_ARGS=${CMAKE_ARGS[@]} \
EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \
BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
CMAKE_FRESH=1 python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
else
time CMAKE_ARGS=${CMAKE_ARGS[@]} \
EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
fi
echo "Finished setup.py bdist at $(date)"
# Build libtorch packages
@ -272,10 +255,6 @@ ls /tmp/$WHEELHOUSE_DIR
mkdir -p "/$WHEELHOUSE_DIR"
mv /tmp/$WHEELHOUSE_DIR/torch*linux*.whl /$WHEELHOUSE_DIR/
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
mv /tmp/$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/ || true
fi
if [[ -n "$BUILD_PYTHONLESS" ]]; then
mkdir -p /$LIBTORCH_HOUSE_DIR
mv /tmp/$LIBTORCH_HOUSE_DIR/*.zip /$LIBTORCH_HOUSE_DIR
@ -452,16 +431,8 @@ if [[ -z "$BUILD_PYTHONLESS" ]]; then
pushd $PYTORCH_ROOT/test
# Install the wheel for this Python version
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
pip uninstall -y "$TORCH_NO_PYTHON_PACKAGE_NAME" || true
fi
pip uninstall -y "$TORCH_PACKAGE_NAME"
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
pip install "$TORCH_NO_PYTHON_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
fi
pip install "$TORCH_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
# Print info on the libraries installed in this wheel

View File

@ -194,7 +194,7 @@ ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library
ROCBLAS_LIB_DST=lib/rocblas/library
ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH)
ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx)
ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES)
ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $ROCBLAS_OTHER_FILES)
# hipblaslt library files
HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library

View File

@ -1,34 +0,0 @@
#!/usr/bin/env bash
# DO NOT ADD 'set -x' not to reveal CircleCI secret context environment variables
set -eu -o pipefail
# This script uses linux host toolchain + mobile build options in order to
# build & test mobile libtorch without having to setup Android/iOS
# toolchain/simulator.
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# shellcheck source=./common-build.sh
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
# Install torch & torchvision - used to download & trace test model.
# Ideally we should use the libtorch built on the PR so that backward
# incompatible changes won't break this script - but it will significantly slow
# down mobile CI jobs.
# Here we install nightly instead of stable so that we have an option to
# temporarily skip mobile CI jobs on BC-breaking PRs until they are in nightly.
retry pip install --pre torch torchvision \
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html \
--progress-bar off
# Run end-to-end process of building mobile library, linking into the predictor
# binary, and running forward pass with a real model.
if [[ "$BUILD_ENVIRONMENT" == *-mobile-custom-build-static* ]]; then
TEST_CUSTOM_BUILD_STATIC=1 test/mobile/custom_build/build.sh
elif [[ "$BUILD_ENVIRONMENT" == *-mobile-lightweight-dispatch* ]]; then
test/mobile/lightweight_dispatch/build.sh
else
TEST_DEFAULT_BUILD=1 test/mobile/custom_build/build.sh
fi
print_sccache_stats

View File

@ -11,10 +11,6 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# shellcheck source=./common-build.sh
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
if [[ "$BUILD_ENVIRONMENT" == *-mobile-*build* ]]; then
exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile.sh" "$@"
fi
echo "Python version:"
python --version
@ -54,9 +50,6 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
export ATEN_THREADING=NATIVE
fi
# Enable LLVM dependency for TensorExpr testing
export USE_LLVM=/opt/llvm
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
if ! which conda; then
# In ROCm CIs, we are doing cross compilation on build machines with
@ -124,26 +117,8 @@ if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then
fi
# Use special scripts for Android builds
if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
export ANDROID_NDK=/opt/ndk
build_args=()
if [[ "${BUILD_ENVIRONMENT}" == *-arm-v7a* ]]; then
build_args+=("-DANDROID_ABI=armeabi-v7a")
elif [[ "${BUILD_ENVIRONMENT}" == *-arm-v8a* ]]; then
build_args+=("-DANDROID_ABI=arm64-v8a")
elif [[ "${BUILD_ENVIRONMENT}" == *-x86_32* ]]; then
build_args+=("-DANDROID_ABI=x86")
elif [[ "${BUILD_ENVIRONMENT}" == *-x86_64* ]]; then
build_args+=("-DANDROID_ABI=x86_64")
fi
if [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then
build_args+=("-DUSE_VULKAN=ON")
fi
build_args+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF")
exec ./scripts/build_android.sh "${build_args[@]}" "$@"
fi
if [[ "$BUILD_ENVIRONMENT" != *android* && "$BUILD_ENVIRONMENT" == *vulkan* ]]; then
if [[ "$BUILD_ENVIRONMENT" == *vulkan* ]]; then
export USE_VULKAN=1
# shellcheck disable=SC1091
source /var/lib/jenkins/vulkansdk/setup-env.sh
@ -198,7 +173,7 @@ fi
# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
# memory to build and will OOM
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1; then
export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j 2"
fi
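
A rough Python equivalent of the new shell pipeline, to make the intent clear: the old check fed the whole space-separated arch list to `bc` at once, while the new one asks whether any single listed arch is >= 8.0 (the arch-list value below is hypothetical).

```python
torch_cuda_arch_list = "7.5 8.0 9.0"   # hypothetical TORCH_CUDA_ARCH_LIST value

def wants_flash_attention(arch_list: str) -> bool:
    # tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1  ==  "any arch >= 8.0"
    return any(float(arch) >= 8.0 for arch in arch_list.split())

print(wants_flash_attention(torch_cuda_arch_list))  # True
```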
@ -214,7 +189,6 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
export USE_ASAN=1
export REL_WITH_DEB_INFO=1
export UBSAN_FLAGS="-fno-sanitize-recover=all"
unset USE_LLVM
fi
if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then
@ -225,7 +199,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *-pch* ]]; then
export USE_PRECOMPILED_HEADERS=1
fi
if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
fi
@ -287,22 +261,13 @@ else
WERROR=1 python setup.py clean
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
python3 tools/packaging/split_wheel.py bdist_wheel
else
WERROR=1 python setup.py bdist_wheel
fi
WERROR=1 python setup.py bdist_wheel
else
python setup.py clean
if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
source .ci/pytorch/install_cache_xla.sh
fi
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
echo "USE_SPLIT_BUILD cannot be used with xla or rocm"
exit 1
else
python setup.py bdist_wheel
fi
python setup.py bdist_wheel
fi
pip_install_whl "$(echo dist/*.whl)"

View File

@ -229,7 +229,6 @@ function install_torchrec_and_fbgemm() {
pip_install tabulate # needed for newer fbgemm
pip_install patchelf # needed for rocm fbgemm
pushd /tmp
local wheel_dir=dist/fbgemm_gpu
local found_whl=0
@ -245,7 +244,7 @@ function install_torchrec_and_fbgemm() {
if [ "${found_whl}" == "0" ]; then
git clone --recursive https://github.com/pytorch/fbgemm
pushd fbgemm/fbgemm_gpu
git checkout "${fbgemm_commit}"
git checkout "${fbgemm_commit}" --recurse-submodules
python setup.py bdist_wheel \
--build-variant=rocm \
-DHIP_ROOT_DIR="${ROCM_PATH}" \
@ -264,7 +263,6 @@ function install_torchrec_and_fbgemm() {
done
rm -rf fbgemm
popd
else
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
@ -283,30 +281,6 @@ function clone_pytorch_xla() {
fi
}
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
if [ "$1" ]; then
python install.py --continue_on_fail models "$@"
else
# Occasionally the installation may fail on one model but it is ok to continue
# to install and test other models
python install.py --continue_on_fail
fi
# TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
# is regressing speedup metric. This needs to be investigated further
pip install transformers==4.38.1
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
function install_torchao() {
local commit
commit=$(get_pinned_commit torchao)

View File

@ -1,123 +0,0 @@
from datetime import datetime, timedelta, timezone
from tempfile import mkdtemp
from cryptography import x509
from cryptography.hazmat.primitives import hashes, serialization
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.x509.oid import NameOID
temp_dir = mkdtemp()
print(temp_dir)
def genrsa(path):
key = rsa.generate_private_key(
public_exponent=65537,
key_size=2048,
)
with open(path, "wb") as f:
f.write(
key.private_bytes(
encoding=serialization.Encoding.PEM,
format=serialization.PrivateFormat.TraditionalOpenSSL,
encryption_algorithm=serialization.NoEncryption(),
)
)
return key
def create_cert(path, C, ST, L, O, key):
subject = issuer = x509.Name(
[
x509.NameAttribute(NameOID.COUNTRY_NAME, C),
x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, ST),
x509.NameAttribute(NameOID.LOCALITY_NAME, L),
x509.NameAttribute(NameOID.ORGANIZATION_NAME, O),
]
)
cert = (
x509.CertificateBuilder()
.subject_name(subject)
.issuer_name(issuer)
.public_key(key.public_key())
.serial_number(x509.random_serial_number())
.not_valid_before(datetime.now(timezone.utc))
.not_valid_after(
# Our certificate will be valid for 10 days
datetime.now(timezone.utc) + timedelta(days=10)
)
.add_extension(
x509.BasicConstraints(ca=True, path_length=None),
critical=True,
)
.sign(key, hashes.SHA256())
)
# Write our certificate out to disk.
with open(path, "wb") as f:
f.write(cert.public_bytes(serialization.Encoding.PEM))
return cert
def create_req(path, C, ST, L, O, key):
csr = (
x509.CertificateSigningRequestBuilder()
.subject_name(
x509.Name(
[
# Provide various details about who we are.
x509.NameAttribute(NameOID.COUNTRY_NAME, C),
x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, ST),
x509.NameAttribute(NameOID.LOCALITY_NAME, L),
x509.NameAttribute(NameOID.ORGANIZATION_NAME, O),
]
)
)
.sign(key, hashes.SHA256())
)
with open(path, "wb") as f:
f.write(csr.public_bytes(serialization.Encoding.PEM))
return csr
def sign_certificate_request(path, csr_cert, ca_cert, private_ca_key):
cert = (
x509.CertificateBuilder()
.subject_name(csr_cert.subject)
.issuer_name(ca_cert.subject)
.public_key(csr_cert.public_key())
.serial_number(x509.random_serial_number())
.not_valid_before(datetime.now(timezone.utc))
.not_valid_after(
# Our certificate will be valid for 10 days
datetime.now(timezone.utc) + timedelta(days=10)
# Sign our certificate with our private key
)
.sign(private_ca_key, hashes.SHA256())
)
with open(path, "wb") as f:
f.write(cert.public_bytes(serialization.Encoding.PEM))
return cert
ca_key = genrsa(temp_dir + "/ca.key")
ca_cert = create_cert(
temp_dir + "/ca.pem",
"US",
"New York",
"New York",
"Gloo Certificate Authority",
ca_key,
)
pkey = genrsa(temp_dir + "/pkey.key")
csr = create_req(
temp_dir + "/csr.csr",
"US",
"California",
"San Francisco",
"Gloo Testing Company",
pkey,
)
cert = sign_certificate_request(temp_dir + "/cert.pem", csr, ca_cert, ca_key)

View File

@ -157,6 +157,29 @@ test_jit_hooks() {
assert_git_not_dirty
}
# Shellcheck doesn't like it when you pass no arguments to a function
# that can take args. See https://www.shellcheck.net/wiki/SC2120
# shellcheck disable=SC2120
checkout_install_torchbench() {
local commit
commit=$(cat .ci/docker/ci_commit_pins/torchbench.txt)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
if [ "$1" ]; then
python install.py --continue_on_fail models "$@"
else
# Occasionally the installation may fail on one model but it is ok to continue
# to install and test other models
python install.py --continue_on_fail
fi
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
torchbench_setup_macos() {
git clone --recursive https://github.com/pytorch/vision torchvision
git clone --recursive https://github.com/pytorch/audio torchaudio
@ -179,8 +202,6 @@ torchbench_setup_macos() {
USE_OPENMP=0 python setup.py develop
popd
# Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120
# shellcheck disable=SC2119,SC2120
checkout_install_torchbench
}

View File

@ -1,18 +0,0 @@
#!/bin/bash
CREATE_TEST_CERT="$(dirname "${BASH_SOURCE[0]}")/create_test_cert.py"
TMP_CERT_DIR=$(python "$CREATE_TEST_CERT")
openssl verify -CAfile "${TMP_CERT_DIR}/ca.pem" "${TMP_CERT_DIR}/cert.pem"
export GLOO_DEVICE_TRANSPORT=TCP_TLS
export GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY=${TMP_CERT_DIR}/pkey.key
export GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT=${TMP_CERT_DIR}/cert.pem
export GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE=${TMP_CERT_DIR}/ca.pem
time python test/run_test.py --include distributed/test_c10d_gloo --verbose -- ProcessGroupGlooTest
unset GLOO_DEVICE_TRANSPORT
unset GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY
unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT
unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE

View File

@ -385,6 +385,29 @@ def smoke_test_compile(device: str = "cpu") -> None:
x_pt2 = torch.compile(model, mode="max-autotune")(x)
def smoke_test_nvshmem() -> None:
if not torch.cuda.is_available():
print("CUDA is not available, skipping NVSHMEM test")
return
# Check if NVSHMEM is compiled in current build
try:
from torch._C._distributed_c10d import _is_nvshmem_available
except ImportError:
# Not built with NVSHMEM support.
# torch is not compiled with NVSHMEM prior to 2.9
if torch.__version__ < "2.9":
return
else:
# After 2.9: NVSHMEM is expected to be compiled in current build
raise RuntimeError("torch not compiled with NVSHMEM") from None
print("torch compiled with NVSHMEM")
# Check if NVSHMEM is available on current system.
print(f"NVSHMEM available at run time: {_is_nvshmem_available()}")
def smoke_test_modules():
cwd = os.getcwd()
for module in MODULES:
@ -479,6 +502,8 @@ def main() -> None:
options.pypi_pkg_check,
)
smoke_test_nvshmem()
if __name__ == "__main__":
main()

View File

@ -462,7 +462,7 @@ test_inductor_aoti() {
# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
}
test_inductor_cpp_wrapper_shard() {
@ -627,6 +627,8 @@ test_perf_for_dashboard() {
device=cuda_a10g
elif [[ "${TEST_CONFIG}" == *h100* ]]; then
device=cuda_h100
elif [[ "${TEST_CONFIG}" == *b200* ]]; then
device=cuda_b200
elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
device=rocm
fi
@ -801,6 +803,16 @@ test_dynamo_benchmark() {
if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
# TODO (huydhn): Just smoke test some sample models
if [[ "${TEST_CONFIG}" == *b200* ]]; then
if [[ "${suite}" == "huggingface" ]]; then
export TORCHBENCH_ONLY_MODELS="DistillGPT2"
elif [[ "${suite}" == "timm_models" ]]; then
export TORCHBENCH_ONLY_MODELS="inception_v3"
elif [[ "${suite}" == "torchbench" ]]; then
export TORCHBENCH_ONLY_MODELS="hf_Bert"
fi
fi
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
else
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
@ -928,12 +940,6 @@ test_torchbench_gcp_smoketest(){
popd
}
test_python_gloo_with_tls() {
source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh"
assert_git_not_dirty
}
test_aten() {
# Test ATen
# The following test(s) of ATen have already been skipped by caffe2 in rocm environment:
@ -980,6 +986,8 @@ test_without_numpy() {
if [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;torch.compile(lambda x:print(x))('Hello World')"
fi
# Regression test for https://github.com/pytorch/pytorch/pull/157734 (torch.onnx should be importable without numpy)
python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch; import torch.onnx"
popd
}
@ -1043,20 +1051,10 @@ test_libtorch_api() {
mkdir -p $TEST_REPORTS_DIR
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
"$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
else
# Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"
# On s390x, pytorch is built without llvm.
# Even if it would be built with llvm, llvm currently doesn't support used features on s390x and
# test fails with errors like:
# JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer
# unknown file: Failure
# C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) }
if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
fi
fi
# quantization is not fully supported on s390x yet
@ -1324,10 +1322,13 @@ EOF
# Step 2. Make sure that the public API test "test_correct_module_names" fails when an existing
# file is modified to introduce an invalid public API function.
EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/nn/parameter.py"
# The filepath here must not have __all__ defined in it, otherwise the test will pass.
# If your PR introduces __all__ to torch/cuda/streams.py please point this to another file
# that does not have __all__ defined.
EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/cuda/streams.py"
cp -v "${EXISTING_FILEPATH}" "${EXISTING_FILEPATH}.orig"
echo "${BAD_PUBLIC_FUNC}" >> "${EXISTING_FILEPATH}"
invalid_api="torch.nn.parameter.new_public_func"
invalid_api="torch.cuda.streams.new_public_func"
echo "Appended an invalid public API function to existing file ${EXISTING_FILEPATH}..."
check_public_api_test_fails \
@ -1561,7 +1562,7 @@ test_executorch() {
test_linux_aarch64() {
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops test_cpp_extensions_open_device_registration \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
# Dynamo tests
@ -1673,13 +1674,11 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
elif [[ "${TEST_CONFIG}" == cachebench ]]; then
install_torchaudio
install_torchvision
checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco
PYTHONPATH=$(pwd)/torchbench test_cachebench
PYTHONPATH=/torchbench test_cachebench
elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
install_torchaudio
install_torchvision
checkout_install_torchbench nanogpt
PYTHONPATH=$(pwd)/torchbench test_verify_cachebench
PYTHONPATH=/torchbench test_verify_cachebench
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
install_torchaudio
install_torchvision
@ -1688,28 +1687,22 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
# https://github.com/opencv/opencv-python/issues/885
pip_install opencv-python==4.8.0.74
if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
PYTHONPATH=/torchbench test_inductor_torchbench_smoketest_perf
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \
llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \
functorch_maml_omniglot yolov3 mobilenet_v2 resnext50_32x4d densenet121 mnasnet1_0
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf
PYTHONPATH=/torchbench test_inductor_torchbench_cpu_smoketest_perf
elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
checkout_install_torchbench
TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest
TORCHBENCHPATH=/torchbench test_torchbench_gcp_smoketest
else
checkout_install_torchbench
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti
fi

View File

@ -192,9 +192,6 @@ retry brew install libomp
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
export USE_DISTRIBUTED=1
if [[ -n "$CROSS_COMPILE_ARM64" ]]; then
export CMAKE_OSX_ARCHITECTURES=arm64
fi
export USE_MKLDNN=OFF
export USE_QNNPACK=OFF
export BUILD_TEST=OFF
@ -202,16 +199,7 @@ export BUILD_TEST=OFF
pushd "$pytorch_rootdir"
echo "Calling setup.py bdist_wheel at $(date)"
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel -d "$whl_tmp_dir"
echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 CMAKE_FRESH=1 python setup.py bdist_wheel -d "$whl_tmp_dir"
echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
else
python setup.py bdist_wheel -d "$whl_tmp_dir"
fi
python setup.py bdist_wheel -d "$whl_tmp_dir"
echo "Finished setup.py bdist_wheel at $(date)"

View File

@ -65,16 +65,8 @@ fi
if [[ "$PACKAGE_TYPE" != libtorch ]]; then
if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
pkg_no_python="$(ls -1 /final_pkgs/torch_no_python* | sort |tail -1)"
pkg_torch="$(ls -1 /final_pkgs/torch-* | sort |tail -1)"
# todo: after folder is populated use the pypi_pkg channel instead
pip install "\$pkg_no_python" "\$pkg_torch" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}_pypi_pkg"
retry pip install -q numpy protobuf typing-extensions
else
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
fi
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
else
pip install "\$pkg"
retry pip install -q numpy protobuf typing-extensions

View File

@ -134,7 +134,6 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}"
export DESIRED_CUDA="$DESIRED_CUDA"
export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}"
export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}"
export USE_SPLIT_BUILD="${USE_SPLIT_BUILD:-}"
if [[ "${OSTYPE}" == "msys" ]]; then
export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}"
if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then

View File

@ -23,10 +23,6 @@ if [[ "${DRY_RUN}" = "disabled" ]]; then
AWS_S3_CP="aws s3 cp"
fi
if [[ "${USE_SPLIT_BUILD:-false}" == "true" ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_pkg"
fi
# this is special build with all dependencies packaged
if [[ ${BUILD_NAME} == *-full* ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_full"

View File

@ -7,12 +7,12 @@ max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
ignore =
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
# these ignores are from flake8-bugbear; please fix!
B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907
B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907,B908,B910
# these ignores are from flake8-comprehensions; please fix!
C407,
# these ignores are from flake8-logging-format; please fix!

View File

@ -53,16 +53,12 @@ self-hosted-runner:
- linux.rocm.gpu.mi250
- linux.rocm.gpu.2
- linux.rocm.gpu.4
# MI300 runners
- linux.rocm.gpu.mi300.2
- linux.rocm.gpu.mi300.4
# gfx942 runners
- linux.rocm.gpu.gfx942.2
- linux.rocm.gpu.gfx942.4
- rocm-docker
# Repo-specific Apple hosted runners
- macos-m1-ultra
- macos-m2-14
# Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
- macos-m1-stable
- macos-m1-13
- macos-m1-14
# GitHub-hosted MacOS runners
- macos-latest-xlarge

View File

@ -1,78 +0,0 @@
name: build android
description: build android for a specific arch
inputs:
arch:
description: arch to build
required: true
arch-for-build-env:
description: |
arch to pass to build environment.
This is currently different than the arch name we use elsewhere, which
should be fixed.
required: true
github-secret:
description: github token
required: true
build-environment:
required: true
description: Top-level label for what's being built/tested.
docker-image:
required: true
description: Name of the base docker image to build with.
branch:
required: true
description: What branch we are building on.
outputs:
container_id:
description: Docker container identifier used to build the artifacts
value: ${{ steps.build.outputs.container_id }}
runs:
using: composite
steps:
- name: Build-${{ inputs.arch }}
id: build
shell: bash
env:
BRANCH: ${{ inputs.branch }}
BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-${{ inputs.arch-for-build-env }}-build"
AWS_DEFAULT_REGION: us-east-1
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
DOCKER_IMAGE: ${{ inputs.docker-image }}
MATRIX_ARCH: ${{ inputs.arch }}
run: |
# detached container should get cleaned up by teardown_ec2_linux
set -exo pipefail
export container_name
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e AWS_DEFAULT_REGION \
-e PR_NUMBER \
-e SHA1 \
-e BRANCH \
-e SCCACHE_BUCKET \
-e SCCACHE_REGION \
-e SKIP_SCCACHE_INITIALIZATION=1 \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--detach \
--user jenkins \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
git submodule sync && git submodule update -q --init --recursive --depth 1
docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace"
(echo "sudo chown -R jenkins . && .ci/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete" | docker exec -u jenkins -i "${container_name}" bash) 2>&1
# Copy install binaries back
mkdir -p "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}"
docker cp "${container_name}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}"
echo "container_id=${container_name}" >> "${GITHUB_OUTPUT}"

View File

@ -70,7 +70,7 @@ runs:
set -eux
# PyYAML 6.0 doesn't work with MacOS x86 anymore
# This must run on Python-3.7 (AmazonLinux2) so can't use request=3.32.2
python3 -m pip install requests==2.27.1 pyyaml==6.0.1
python3 -m pip install requests==2.27.1 pyyaml==6.0.2
- name: Parse ref
id: parse-ref

View File

@ -24,7 +24,6 @@ runs:
-e PYTORCH_FINAL_PACKAGE_DIR \
-e PYTORCH_ROOT \
-e SKIP_ALL_TESTS \
-e USE_SPLIT_BUILD \
--tty \
--detach \
-v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \

View File

@ -1 +1 @@
b6a3368a45aaafe05f1a6a9f10c68adc5e944d9e
e500f0cf88bc57ffd8b0029033da305eef24ae25

View File

@ -1 +1 @@
b77c7d327f2a463bb9ef8be36f30e920bc066502
35afe1b30b154114dc2ee8329e12f8cf3fe9f576

View File

@ -1 +1 @@
1c00dea2c9adb2137903c86b4191e8c247f8fda9
095faec1e7b6cc47220181e74ae9cde2605f9b00

View File

@ -131,21 +131,6 @@
- Lint
- pull
- name: Mobile
patterns:
- ios/**
- android/**
- test/mobile/**
approved_by:
- linbinyu
- IvanKobzarev
- dreiss
- raziel
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- name: PrimTorch
patterns:
- torch/_meta_registrations.py
@ -503,6 +488,10 @@
- torch/_dynamo/**
- torch/csrc/dynamo/**
- test/dynamo/**
- test/dynamo_expected_failures/**
- test/dynamo_skips/**
- test/inductor_expected_failures/**
- test/inductor_skips/**
approved_by:
- guilhermeleobas
mandatory_checks_name:

View File

@ -7,9 +7,9 @@
# .ci/docker/requirements-ci.txt
boto3==1.35.42
jinja2==3.1.6
lintrunner==0.10.7
lintrunner==0.12.7
ninja==1.10.0.post1
nvidia-ml-py==11.525.84
pyyaml==6.0
pyyaml==6.0.2
requests==2.32.4
rich==10.9.0
rich==14.1.0

View File

@ -2,7 +2,7 @@ boto3==1.35.42
cmake==3.27.*
expecttest==0.3.0
fbscribelogger==0.1.7
filelock==3.6.0
filelock==3.18.0
hypothesis==6.56.4
librosa>=0.6.2
mpmath==1.3.0

View File

@ -193,7 +193,7 @@ LIBTORCH_CONTAINER_IMAGES: dict[str, str] = {
"cpu": "libtorch-cxx11-builder:cpu",
}
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
@ -273,7 +273,6 @@ def generate_wheels_matrix(
os: str,
arches: Optional[list[str]] = None,
python_versions: Optional[list[str]] = None,
use_split_build: bool = False,
) -> list[dict[str, str]]:
package_type = "wheel"
if os == "linux" or os == "linux-aarch64" or os == "linux-s390x":
@ -315,15 +314,11 @@ def generate_wheels_matrix(
# TODO: Enable python 3.13t on cpu-s390x
if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
continue
if use_split_build and (
arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux"
# TODO: Enable python 3.14 on non linux OSes
if os != "linux" and (
python_version == "3.14" or python_version == "3.14t"
):
raise RuntimeError(
"Split build is only supported on linux with cuda 12* and cpu.\n"
f"Currently attempting to build on arch version {arch_version} and os {os}.\n"
"Please modify the matrix generation to exclude this combination."
)
continue
# cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
@ -339,7 +334,6 @@ def generate_wheels_matrix(
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": desired_cuda,
"use_split_build": "True" if use_split_build else "False",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version].split(
":"
)[0],
@ -372,7 +366,6 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True" if use_split_build else "False",
"container_image": WHEEL_CONTAINER_IMAGES[
arch_version
].split(":")[0],
@ -395,7 +388,6 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True" if use_split_build else "False",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version].split(
":"
)[0],

View File

@ -59,9 +59,7 @@ class BinaryBuildWorkflow:
is_scheduled: str = ""
branches: str = "nightly"
# Mainly for macos
cross_compile_arm64: bool = False
macos_runner: str = "macos-14-xlarge"
use_split_build: bool = False
# Mainly used for libtorch builds
build_variant: str = ""
@ -72,9 +70,6 @@ class BinaryBuildWorkflow:
for item in [self.os, "binary", self.package_type, self.build_variant]
if item != ""
)
if self.use_split_build:
# added to distinguish concurrency groups
self.build_environment += "-split"
def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
output_file_path = (
@ -117,21 +112,6 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
isolated_workflow=True,
),
),
# See https://github.com/pytorch/pytorch/issues/138750
# BinaryBuildWorkflow(
# os=OperatingSystem.LINUX,
# package_type="manywheel",
# build_configs=generate_binary_build_matrix.generate_wheels_matrix(
# OperatingSystem.LINUX,
# use_split_build=True,
# arches=["11.8", "12.1", "12.4", "cpu"],
# ),
# ciflow_config=CIFlowConfig(
# labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
# isolated_workflow=True,
# ),
# use_split_build=True,
# ),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="libtorch",
@ -175,27 +155,11 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["12.6", "12.8", "12.9"],
python_versions=["3.9"],
arches=["12.8"],
python_versions=["3.12"],
),
branches="main",
),
# See https://github.com/pytorch/pytorch/issues/138750
# BinaryBuildWorkflow(
# os=OperatingSystem.LINUX,
# package_type="manywheel",
# build_configs=generate_binary_build_matrix.generate_wheels_matrix(
# OperatingSystem.LINUX,
# arches=["11.8", "12.1", "12.4"],
# python_versions=["3.9"],
# use_split_build=True,
# ),
# ciflow_config=CIFlowConfig(
# labels={LABEL_CIFLOW_PERIODIC},
# ),
# branches="main",
# use_split_build=True,
# ),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="libtorch",
@ -338,7 +302,6 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
generate_binary_build_matrix.RELEASE,
libtorch_variants=["shared-with-deps"],
),
cross_compile_arm64=False,
macos_runner="macos-14-xlarge",
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
@ -351,7 +314,6 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.MACOS_ARM64
),
cross_compile_arm64=False,
macos_runner="macos-14-xlarge",
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},

View File

@ -2,7 +2,7 @@
set -ex
# Use uv to speed up lintrunner init
python3 -m pip install uv==0.1.45 setuptools
python3 -m pip install -U uv==0.8.* setuptools
CACHE_DIRECTORY="/tmp/.lintbin"
# Try to recover the cached binaries

View File

@ -262,7 +262,12 @@ def is_exception_branch(branch: str) -> bool:
"""
Branches that get opted out of experiments by default, until they're explicitly enabled.
"""
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
return branch.split("/", maxsplit=1)[0] in {
"main",
"nightly",
"release",
"landchecks",
}
def load_yaml(yaml_text: str) -> Any:

View File

@ -1891,7 +1891,9 @@ def validate_revert(
else pr.get_comment_by_id(comment_id)
)
if comment.editor_login is not None:
raise PostCommentError("Don't want to revert based on edited command")
raise PostCommentError(
"Halting the revert as the revert comment has been edited."
)
author_association = comment.author_association
author_login = comment.author_login
allowed_reverters = ["COLLABORATOR", "MEMBER", "OWNER"]

View File

@ -47,9 +47,6 @@ env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
SKIP_ALL_TESTS: 0
{%- if cross_compile_arm64 %}
CROSS_COMPILE_ARM64: 1
{% endif %}
!{{ common.concurrency(build_environment) }}
jobs:

View File

@ -25,11 +25,6 @@
DOCKER_IMAGE: !{{ config["container_image"] }}
DOCKER_IMAGE_TAG_PREFIX: !{{ config["container_image_tag_prefix"] }}
{%- endif %}
{%- if config["package_type"] == "manywheel" %}
{%- if config.use_split_build is defined %}
use_split_build: !{{ config["use_split_build"] }}
{%- endif %}
{%- endif %}
{%- if config["package_type"] == "libtorch" %}
{%- if config["libtorch_config"] %}
LIBTORCH_CONFIG: !{{ config["libtorch_config"] }}

View File

@ -26,13 +26,6 @@ on:
default: 240
type: number
description: timeout for the job
use_split_build:
description: |
[Experimental] Build a libtorch only wheel and build pytorch such that
are built from the libtorch wheel.
required: false
type: boolean
default: false
ALPINE_IMAGE:
required: false
type: string
@ -117,7 +110,6 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
steps:
- name: Make the env permanent during this workflow (but not the secrets)
shell: bash
@ -142,7 +134,6 @@ jobs:
echo "PR_NUMBER=${{ env.PR_NUMBER }}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
echo "SHA1=${{ env.SHA1 }}"
echo "USE_SPLIT_BUILD=${{ env.use_split_build }}"
} >> "${GITHUB_ENV} }}"
- name: List the env
@ -261,7 +252,6 @@ jobs:
-e PYTORCH_ROOT \
-e SKIP_ALL_TESTS \
-e PYTORCH_EXTRA_INSTALL_REQUIREMENTS \
-e USE_SPLIT_BUILD \
--tty \
--detach \
-v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \

View File

@ -64,13 +64,6 @@ on:
required: true
type: string
description: Hardware to run this job on. Valid values are linux.4xlarge, linux.4xlarge.nvidia.gpu, linux.arm64.2xlarge, and linux.rocm.gpu
use_split_build:
description: |
[Experimental] Build a libtorch-only wheel, and build pytorch such that
the pytorch binaries are built from the libtorch wheel.
required: false
type: boolean
default: false
secrets:
github-token:
required: true
@ -104,7 +97,6 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
steps:
- name: Make the env permanent during this workflow (but not the secrets)
shell: bash
@ -129,7 +121,6 @@ jobs:
echo "PR_NUMBER=${{ env.PR_NUMBER }}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
echo "SHA1=${{ env.SHA1 }}"
echo "USE_SPLIT_BUILD=${{ env.USE_SPLIT_BUILD }}"
} >> "${GITHUB_ENV} }}"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"

View File

@ -51,13 +51,6 @@ on:
required: false
type: string
description: Desired python version
use_split_build:
description: |
[Experimental] Build a libtorch-only wheel, and build pytorch such that
the pytorch binaries are built from the libtorch wheel.
required: false
type: boolean
default: false
secrets:
github-token:
required: true
@ -86,7 +79,6 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main

View File

@ -306,7 +306,6 @@ jobs:
-e OUR_GITHUB_JOB_ID \
-e HUGGING_FACE_HUB_TOKEN \
-e SCRIBE_GRAPHQL_ACCESS_TOKEN \
-e USE_SPLIT_BUILD \
-e BUILD_ADDITIONAL_PACKAGES \
--memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
--memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \

View File

@ -96,7 +96,7 @@ jobs:
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
if: ${{ matrix.runner != 'B200' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
instructions: |
@ -109,7 +109,7 @@ jobs:
no-sudo: true
- name: Setup Python
if: matrix.runner == 'B200'
if: contains(matrix.runner, 'b200')
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.12'
@ -117,7 +117,7 @@ jobs:
- name: Setup Linux
uses: ./.github/actions/setup-linux
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && matrix.runner != 'B200'
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && !contains(matrix.runner, 'b200')
- name: configure aws credentials
if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
@ -128,7 +128,7 @@ jobs:
aws-region: us-east-1
- name: Login to Amazon ECR
if: ${{ inputs.aws-role-to-assume != '' && matrix.runner == 'B200' }}
if: ${{ inputs.aws-role-to-assume != '' && contains(matrix.runner, 'b200') }}
id: login-ecr
continue-on-error: true
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
@ -166,17 +166,17 @@ jobs:
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && matrix.runner != 'B200' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
- name: Setup GPU_FLAG for docker run
id: setup-gpu-flag
run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || matrix.runner == 'B200') }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || contains(matrix.runner, 'b200')) }}
- name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
id: setup-sscache-port-flag
run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && matrix.runner != 'B200' }}
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && !contains(matrix.runner, 'b200') }}
- name: Lock NVIDIA A100 40GB Frequency
run: |
@ -277,8 +277,8 @@ jobs:
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
# Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
SCCACHE_BUCKET: ${{ matrix.runner != 'B200' && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ matrix.runner != 'B200' && 'us-east-1' || '' }}
SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
@ -403,7 +403,7 @@ jobs:
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
- name: Authenticate with AWS
if: ${{ matrix.runner == 'B200' }}
if: ${{ contains(matrix.runner, 'b200') }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
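A rough sketch of why these conditions move from the exact `matrix.runner != 'B200'` comparison to `contains(matrix.runner, 'b200')`: GitHub's `contains()` expression function matches case-insensitively, so a single check covers capitalized labels like `B200` as well as runner names such as `linux.dgx.b200`. The Python below only mirrors that intent and is not GitHub's implementation:

def runner_is_b200(runner: str) -> bool:
    # contains()-style check: case-insensitive substring match on the runner label
    return "b200" in runner.lower()

assert runner_is_b200("B200")
assert runner_is_b200("linux.dgx.b200")
assert not runner_is_b200("linux.4xlarge.nvidia.gpu")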

View File

@ -269,8 +269,8 @@ jobs:
# copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
- name: Change permissions (only needed for MI300 and MI355 kubernetes runners for now)
if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'mi300') || contains(matrix.runner, 'mi355')) }}
- name: Change permissions (only needed for kubernetes runners for now)
if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'gfx942') || contains(matrix.runner, 'mi355')) }}
run: |
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"

View File

@ -50,7 +50,7 @@ jobs:
strategy:
fail-fast: false
matrix:
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ]
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
device: ["cuda", "rocm", "xpu", "aarch64"]
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
include:
@ -126,6 +126,12 @@ jobs:
3.13t)
PYTHON_EXECUTABLE=/opt/python/cp313-cp313t/bin/python
;;
3.14)
PYTHON_EXECUTABLE=/opt/python/cp314-cp314/bin/python
;;
3.14t)
PYTHON_EXECUTABLE=/opt/python/cp314-cp314t/bin/python
;;
*)
echo "Unsupported python version ${PY_VERS}"
exit 1
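The new 3.14 and 3.14t arms follow the CPython tag convention the earlier arms already use: `cpXYZ-cpXYZ` for a regular interpreter and a trailing `t` for the free-threaded build. A small sketch of that mapping, assuming the `/opt/python/...` layout shown above:

def python_executable(py_vers: str) -> str:
    # "3.14" -> cp314-cp314, "3.14t" -> cp314-cp314t
    base = "cp" + py_vers.rstrip("t").replace(".", "")
    abi = base + ("t" if py_vers.endswith("t") else "")
    return f"/opt/python/{base}-{abi}/bin/python"

assert python_executable("3.13t") == "/opt/python/cp313-cp313t/bin/python"
assert python_executable("3.14") == "/opt/python/cp314-cp314/bin/python"
assert python_executable("3.14t") == "/opt/python/cp314-cp314t/bin/python"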

View File

@ -34,7 +34,8 @@ jobs:
contents: read
pull-requests: write
name: Check labels
if: github.repository_owner == 'pytorch'
# Disabling the job until https://github.com/pytorch/pytorch/issues/159825 is resolved
if: github.repository_owner == 'pytorch' && false
runs-on: linux.24_04.4x
steps:
- name: Checkout PyTorch

View File

@ -7,7 +7,8 @@ on:
jobs:
ghstack-mergeability-check:
if: github.repository_owner == 'pytorch'
# Disabling the job until https://github.com/pytorch/pytorch/issues/159825 is resolved
if: github.repository_owner == 'pytorch' && false
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@ -56,7 +57,7 @@ jobs:
cache: pip
architecture: x64
- run: pip install pyyaml==6.0
- run: pip install pyyaml==6.0.2
shell: bash
- name: Verify mergeability

View File

@ -26,7 +26,7 @@ jobs:
cache: pip
# Not the direct dependencies but the script uses trymerge
- run: pip install pyyaml==6.0
- run: pip install pyyaml==6.0.2
- name: Setup committer id
run: |

View File

@ -51,21 +51,17 @@ jobs:
docker-image-name: [
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.9-clang12,
pytorch-linux-jammy-py3.11-clang12,
pytorch-linux-jammy-py3.12-clang12,
pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-rocm-n-py3,
pytorch-linux-noble-rocm-n-py3,
pytorch-linux-noble-rocm-alpha-py3,
pytorch-linux-jammy-rocm-n-py3-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
pytorch-linux-jammy-py3.9-gcc11,
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,
@ -76,7 +72,8 @@ jobs:
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
pytorch-linux-jammy-py3-clang12-executorch,
# Executorch pin needs update
# pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu
]
include:

View File

@ -144,7 +144,7 @@ jobs:
run: |
make -f docker.Makefile "${BUILD_IMAGE_TYPE}-image"
- name: Push nightly tags
if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.build_platforms == 'linux/amd4' }}
if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.platform == 'linux/amd4' }}
run: |
PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-cuda${CUDA_VERSION_SHORT}-cudnn${CUDNN_VERSION}-runtime"
CUDA_SUFFIX="-cu${CUDA_VERSION}"

View File

@ -60,7 +60,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -84,7 +83,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -108,7 +106,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-aarch64
secrets:
@ -129,7 +126,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -156,7 +152,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda-aarch64-12_9
secrets:
@ -176,7 +171,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -200,7 +194,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -224,7 +217,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-aarch64
secrets:
@ -245,7 +237,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -272,7 +263,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda-aarch64-12_9
secrets:
@ -292,7 +282,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -316,7 +305,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -340,7 +328,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-aarch64
secrets:
@ -361,7 +348,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -388,7 +374,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda-aarch64-12_9
secrets:
@ -408,7 +393,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -432,7 +416,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -456,7 +439,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-aarch64
secrets:
@ -477,7 +459,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -504,7 +485,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda-aarch64-12_9
secrets:
@ -524,7 +504,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -548,7 +527,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -572,7 +550,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-aarch64
secrets:
@ -593,7 +570,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -620,7 +596,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda-aarch64-12_9
secrets:
@ -640,7 +615,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -664,7 +638,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -688,7 +661,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cpu-aarch64
secrets:
@ -709,7 +681,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -736,7 +707,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
secrets:

View File

@ -42,54 +42,7 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_9-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: 12.6
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_6-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: 12.6
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-build:
manywheel-py3_12-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
@ -103,18 +56,17 @@ jobs:
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
use_split_build: False
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_8
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-test: # Testing
manywheel-py3_12-cuda12_8-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_8-build
- manywheel-py3_12-cuda12_8-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
@ -127,56 +79,8 @@ jobs:
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 builds need an sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_9
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner

File diff suppressed because it is too large

View File

@ -58,7 +58,6 @@ jobs:
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-rocm6_4
@ -83,7 +82,6 @@ jobs:
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
use_split_build: False
DESIRED_PYTHON: "3.9"
steps:
- name: Setup ROCm

View File

@ -60,7 +60,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -84,7 +83,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -107,7 +105,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
secrets:
@ -127,7 +124,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -151,7 +147,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -174,7 +169,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-s390x
secrets:
@ -194,7 +188,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -218,7 +211,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -241,7 +233,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-s390x
secrets:
@ -261,7 +252,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -285,7 +275,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -308,7 +297,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-s390x
secrets:
@ -328,7 +316,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -352,7 +339,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -375,7 +361,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-s390x
secrets:

View File

@ -0,0 +1,154 @@
name: inductor-perf-b200
on:
schedule:
- cron: 0 7 * * 1-6
- cron: 0 7 * * 0
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let's try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
inputs:
training:
description: Run training (on by default)?
required: false
type: boolean
default: true
inference:
description: Run inference (on by default)?
required: false
type: boolean
default: true
default:
description: Run inductor_default?
required: false
type: boolean
default: false
dynamic:
description: Run inductor_dynamic_shapes?
required: false
type: boolean
default: false
cppwrapper:
description: Run inductor_cpp_wrapper?
required: false
type: boolean
default: false
cudagraphs:
description: Run inductor_cudagraphs?
required: false
type: boolean
default: true
freezing_cudagraphs:
description: Run inductor_cudagraphs with freezing for inference?
required: false
type: boolean
default: false
aotinductor:
description: Run aot_inductor for inference?
required: false
type: boolean
default: false
maxautotune:
description: Run inductor_max_autotune?
required: false
type: boolean
default: false
benchmark_configs:
description: The list of configs used by the benchmark
required: false
type: string
default: inductor_huggingface_perf_cuda_b200,inductor_timm_perf_cuda_b200,inductor_torchbench_perf_cuda_b200
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
build:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Use a bigger runner here because CUDA arch 10.0 is only built for B200-class
# GPUs, so it doesn't benefit much from existing compiler cache
# from trunk. Also use a memory-intensive runner here because memory is
# usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
{ config: "inductor_timm_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
{ config: "inductor_torchbench_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
test-periodically:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 1440
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
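For the manually dispatched run, the `test` job assembles its dashboard-tag directly from the boolean inputs declared at the top of this workflow (the two scheduled jobs hard-code theirs). A hedged sketch of that string composition; the input values below are only an illustrative stand-in:

inputs = {
    "training": True, "inference": True, "default": False, "dynamic": False,
    "cudagraphs": True, "cppwrapper": False, "aotinductor": False,
    "maxautotune": False, "freezing_cudagraphs": False,
}
order = [
    "training", "inference", "default", "dynamic", "cudagraphs",
    "cppwrapper", "aotinductor", "maxautotune", "freezing_cudagraphs",
]
# Mirrors the ${{ inputs.* }} interpolation in the dashboard-tag line; note that
# cudagraphs_low_precision reuses the cudagraphs flag, as the comment at the top explains.
tag = "-".join(f"{name}-{str(inputs[name]).lower()}" for name in order)
tag += f"-cudagraphs_low_precision-{str(inputs['cudagraphs']).lower()}"
print(tag)
# training-true-inference-true-default-false-dynamic-false-cudagraphs-true-...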

View File

@ -2,7 +2,7 @@ name: inductor-perf-nightly-h100
on:
schedule:
- cron: 15 0,4,8,12,16,20 * * 1-6
- cron: 15 0,12 * * 1-6
- cron: 0 7 * * 0
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let's try to run torchao cudagraphs_low_precision as part of cudagraphs
@ -126,7 +126,7 @@ jobs:
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '15 0,4,8,12,16,20 * * 1-6'
if: github.event.schedule == '15 0,12 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
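The two cron entries above now route to different jobs: '15 0,12 * * 1-6' fires at 00:15 and 12:15 UTC Monday through Saturday (replacing the previous every-four-hours cadence), while '0 7 * * 0' is the weekly Sunday run. A small sketch of the `if: github.event.schedule == ...` routing; the job names are taken from the sibling b200 workflow above and are assumed to match here:

routes = {
    "15 0,12 * * 1-6": "test-periodically",  # 00:15 and 12:15 UTC, Mon-Sat
    "0 7 * * 0": "test-weekly",              # 07:00 UTC, Sundays
}

def job_for(schedule):
    # mirrors the `if: github.event.schedule == '...'` guard on each test job
    return routes.get(schedule)

assert job_for("15 0,12 * * 1-6") == "test-periodically"
assert job_for("0 7 * * 0") == "test-weekly"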

View File

@ -85,26 +85,26 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
]}
secrets: inherit

View File

@ -81,21 +81,21 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
]}
secrets: inherit

View File

@ -47,8 +47,8 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
]}
secrets: inherit

View File

@ -28,7 +28,6 @@ jobs:
# than our AWS macos-m1-14 runners
test-matrix: |
{ include: [
{ config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m1-13" },
{ config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m1-14" },
{ config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m2-15" },
]}

View File

@ -75,10 +75,11 @@ jobs:
repo-owner: pytorch
branch: main
pin-folder: .github/ci_commit_pins
- repo-name: executorch
repo-owner: pytorch
branch: main
pin-folder: .ci/docker/ci_commit_pins
# executorch jobs are disabled since they need some manual work for the hash update
# - repo-name: executorch
# repo-owner: pytorch
# branch: main
# pin-folder: .ci/docker/ci_commit_pins
- repo-name: triton
repo-owner: triton-lang
branch: main

View File

@ -59,9 +59,9 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
]}
secrets: inherit

View File

@ -51,37 +51,6 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_4-py3_10-gcc11-sm89-build:
name: linux-jammy-cuda12.4-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.4-py3.10-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11
cuda-arch-list: 8.9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_4-py3_10-gcc11-sm89-test:
name: linux-jammy-cuda12.4-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_4-py3_10-gcc11-sm89-build
- target-determination
with:
build-environment: linux-jammy-cuda12.4-py3.10-gcc11-sm89
docker-image: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_4-py3_10-gcc11-build:
name: linux-jammy-cuda12.4-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml

View File

@ -254,36 +254,6 @@ jobs:
timeout-minutes: 600
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-build-distributed:
name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-test-distributed:
name: linux-jammy-cuda12.8-py3.10-gcc11-test
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-build-distributed
- target-determination
with:
timeout-minutes: 360
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-build:
name: linux-jammy-cuda12.8-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml
@ -292,13 +262,18 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '7.5 8.9'
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit
@ -329,30 +304,6 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cpu-py3_10-gcc11-bazel-test:
name: linux-jammy-cpu-py3.10-gcc11-bazel-test
uses: ./.github/workflows/_bazel-build-test.yml
@ -402,38 +353,8 @@ jobs:
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm89-build:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: 8.9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm89-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm89-build
- target-determination
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
if: false # Docker build needs pin update
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
@ -458,31 +379,6 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-xpu-2025_1-py3_9-build:
name: linux-jammy-xpu-2025.1-py3.9
uses: ./.github/workflows/_linux-build.yml

View File

@ -26,7 +26,7 @@ jobs:
architecture: x64
check-latest: false
cache: pip
- run: pip install pyyaml==6.0
- run: pip install pyyaml==6.0.2
- name: Setup committer id
run: |

View File

@ -48,12 +48,12 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
]}
secrets: inherit

View File

@ -3,7 +3,7 @@ name: rocm-mi355
on:
workflow_dispatch:
schedule:
- cron: 30 9 * * * # about 2:30am PDT
- cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}

View File

@ -10,6 +10,10 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-default-label-prefix:
if: github.repository_owner == 'pytorch'

View File

@ -94,7 +94,6 @@ jobs:
{ config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" },
{ config: "default", shard: 2, num_shards: 3, runner: "macos-m1-stable" },
{ config: "default", shard: 3, num_shards: 3, runner: "macos-m1-stable" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-13" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-15" },
]}
@ -206,7 +205,7 @@ jobs:
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },

View File

@ -28,7 +28,7 @@ jobs:
check-latest: false
cache: pip
architecture: x64
- run: pip install pyyaml==6.0
- run: pip install pyyaml==6.0.2
- name: Setup committer id
run: |

View File

@ -25,7 +25,7 @@ jobs:
architecture: x64
check-latest: false
cache: pip
- run: pip install pyyaml==6.0
- run: pip install pyyaml==6.0.2
- name: Setup committer id
run: |

View File

@ -12,7 +12,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions: read-all
permissions:
id-token: write
contents: read
jobs:
# There must be at least one job here to satisfy GitHub action workflow syntax
@ -51,3 +53,27 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit

View File

@ -23,7 +23,7 @@ jobs:
with:
repository: pytorch/pytorch
stable-branch: viable/strict
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\"]'
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\", \"linux-aarch64\"]'
secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }}
clickhouse-url: ${{ secrets.CLICKHOUSE_URL }}
clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }}

View File

@ -39,16 +39,16 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'flake8==6.1.0',
'flake8-bugbear==23.3.23',
'flake8-comprehensions==3.15.0',
'flake8==7.3.0',
'flake8-bugbear==24.12.12',
'flake8-comprehensions==3.16.0',
'flake8-executable==2.1.3',
'flake8-logging-format==0.9.0',
'flake8-pyi==23.3.1',
'flake8-simplify==0.19.3',
'flake8-logging-format==2024.24.12',
'flake8-pyi==25.5.0',
'flake8-simplify==0.22.0',
'mccabe==0.7.0',
'pycodestyle==2.11.1',
'pyflakes==3.1.0',
'pycodestyle==2.14.0',
'pyflakes==3.4.0',
'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"',
]
@ -158,16 +158,16 @@ init_command = [
'mypy==1.16.0',
'sympy==1.13.3',
'types-requests==2.27.25',
'types-pyyaml==6.0.1',
'types-pyyaml==6.0.2',
'types-tabulate==0.8.8',
'types-protobuf==5.29.1.20250403',
'types-setuptools==79.0.0.20250422',
'types-jinja2==2.11.9',
'types-colorama==0.4.6',
'filelock==3.13.1',
'filelock==3.18.0',
'junitparser==2.1.1',
'rich==10.9.0',
'pyyaml==6.0.1',
'rich==14.1.0',
'pyyaml==6.0.2',
'optree==0.13.0',
'dataclasses-json==0.6.7',
'pandas==2.2.3',
@ -1111,7 +1111,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'PyYAML==6.0.1',
'pyyaml==6.0.2',
]
[[linter]]
@ -1133,7 +1133,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'PyYAML==6.0.1',
'pyyaml==6.0.2',
]
[[linter]]
@ -1452,8 +1452,6 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'--no-black-binary',
'black==23.12.1',
'usort==1.0.8.post1',
'isort==6.0.1',
'ruff==0.12.2', # sync with RUFF

View File

@ -1 +1,17 @@
- This is the only AGENTS.md; there are no nested AGENTS.md files
- When you are working on a bug, first create a standalone file that
  reproduces the bug and verify that it fails in the expected way; use this to
  test whether your changes work (see the sketch after this list). Once the
  change is passing, find an appropriate test file to add the test to and
  follow the local conventions of that test file.
- If you are running the real test suite, DO NOT run the entire test suite.
  Instead, run only a single test case, e.g., 'python test/test_torch.py TestTorch.test_dir'
- Do NOT run setup.py; you do not have a working build environment
- Do NOT run pre-commit; it is not set up
- To run lint, run 'lintrunner -a' (which will auto-apply changes)
- Do NOT attempt to install dependencies; you do not have Internet access
- When you are ready to make a PR, do exactly these steps:
- git stash -u
- git reset --hard $(cat /tmp/orig_work.txt) # NB: reset to the LOCAL branch, do NOT fetch
- git stash pop
- Resolve conflicts if necessary
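
A minimal sketch of the repro-first workflow described above (the file name, the op under suspicion, and the expected failure are all hypothetical, not taken from this PR):

```python
# repro_broadcast_bug.py -- hypothetical standalone repro; run with: python repro_broadcast_bug.py
import torch

def main() -> None:
    a = torch.randn(4, 1)
    b = torch.randn(1, 4)
    out = a + b  # op under investigation (hypothetical)
    # Encode the expected behavior; on a buggy build this assertion should fail
    # in the expected way before the fix, and pass once the fix is in place.
    assert out.shape == (4, 4), f"unexpected shape: {tuple(out.shape)}"
    print("repro passed")

if __name__ == "__main__":
    main()
```

Once the fix passes here, the same check would move into an existing test file (following that file's conventions), as the list above suggests.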

View File

@ -679,6 +679,7 @@ cc_library(
[
"torch/*.h",
"torch/csrc/**/*.h",
"torch/nativert/**/*.h",
"torch/csrc/distributed/c10d/**/*.hpp",
"torch/lib/libshm/*.h",
],

View File

@ -240,6 +240,8 @@ cmake_dependent_option(
BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON
"USE_CUDA AND LINUX AND BUILD_PYTHON" OFF)
cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF)
cmake_dependent_option(USE_ROCM_CK_GEMM "Use ROCm Composable Kernel for GEMMs" ON "USE_ROCM;NOT WIN32" OFF)
option(USE_ROCM_CK_SDPA "Use ROCm Composable Kernel for SDPA" OFF)
option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF)
cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF)
cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF
@ -251,7 +253,6 @@ cmake_dependent_option(USE_CUFILE "Use cuFile" ON "USE_CUDA AND NOT WIN32" OFF)
option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
option(USE_KINETO "Use Kineto profiling library" ON)
option(USE_CUPTI_SO "Use CUPTI as a shared library" ON)
option(USE_FAKELOWP "Use FakeLowp operators" OFF)
option(USE_GFLAGS "Use GFLAGS" OFF)
option(USE_GLOG "Use GLOG" OFF)
option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
@ -260,8 +261,9 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
@ -322,7 +324,6 @@ set(MKLDNN_ENABLE_CONCURRENT_EXEC ${USE_MKLDNN})
cmake_dependent_option(USE_MKLDNN_CBLAS "Use CBLAS in MKLDNN" OFF "USE_MKLDNN"
OFF)
option(USE_STATIC_MKL "Prefer to link with MKL statically (Unix only)" OFF)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(
USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON
"USE_DISTRIBUTED" OFF)
@ -564,7 +565,7 @@ if(MSVC)
set(CMAKE_NINJA_CMCLDEPS_RC OFF)
if(MSVC_Z7_OVERRIDE)
# CMake set debug flags to use /Z7
set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded)
set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$<CONFIG:Debug,RelWithDebInfo>:Embedded>")
endif()
foreach(
flag_var
@ -834,10 +835,11 @@ include(ExternalProject)
# ---[ Dependencies ---[ FBGEMM doesn't work on x86 32bit and
# CMAKE_SYSTEM_PROCESSOR thinks its 64bit
if(USE_FBGEMM
AND((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND CMAKE_SIZEOF_VOID_P EQUAL
4)
OR CMAKE_SYSTEM_PROCESSOR STREQUAL "x86"))
if(USE_FBGEMM AND NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
message(WARNING
"x64 operating system is required for FBGEMM. "
"Not compiling with FBGEMM. "
"Turn this warning off by USE_FBGEMM=OFF.")
set(USE_FBGEMM OFF)
endif()
@ -872,6 +874,14 @@ cmake_dependent_option(
"USE_CUDA OR USE_ROCM;NOT MSVC"
OFF)
cmake_dependent_option(
USE_FBGEMM_GENAI
"Whether to build FBGEMM GenAI quantized GEMM kernels.\
Will be disabled if not supported by the platform"
OFF
"USE_CUDA OR USE_ROCM"
OFF)
# CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem
# Eff Attention won't
cmake_dependent_option(
@ -905,6 +915,10 @@ if(USE_FBGEMM)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM")
endif()
if(USE_FBGEMM_GENAI)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM_GENAI")
endif()
if(USE_PYTORCH_QNNPACK)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK")
endif()

View File

@ -14,7 +14,6 @@
/torch/csrc/autograd/ @albanD @soulitzer
/torch/autograd/ @albanD @soulitzer
/tools/autograd/ @albanD @soulitzer
/torch/header_only_apis.txt @janeyx99
/torch/nn/ @albanD @jbschlosser @mikaylagawarecki
/torch/optim/ @albanD @janeyx99
/test/test_public_bindings.py @albanD
@ -51,12 +50,12 @@ nn/qat/ @jerryzh168
/torch/csrc/distributed/c10d/Ops.* @kwen2501
# ONNX Export
/torch/_dynamo/backends/onnxrt.py @wschin
/torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1
/torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1
/torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1
/torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin
/test/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin
/torch/_dynamo/backends/onnxrt.py @titaiwangms @xadupre @justinchuby
/torch/csrc/jit/passes/onnx.h @titaiwangms @xadupre
/torch/csrc/jit/passes/onnx.cpp @titaiwangms @xadupre
/torch/csrc/jit/passes/onnx/ @titaiwangms @xadupre
/torch/onnx/ @titaiwangms @xadupre @justinchuby
/test/onnx/ @titaiwangms @xadupre @justinchuby
# CI
/.ci @pytorch/pytorch-dev-infra
@ -165,6 +164,7 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd
# torch.export
/torch/export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi
/torch/_export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi
/torch/_export/serde/schema.py @SherlockNoMad @zhxchen17
# Dynamic Shapes
/torch/fx/experimental/symbolic_shapes.py @bobrenjc93 @laithsakka
@ -196,3 +196,8 @@ torch/backends/cudnn/ @eqy @syed-ahmed
/torch/utils/_cxx_pytree.py @XuehaiPan
/torch/utils/pytree/ @XuehaiPan
/torch/_dynamo/polyfills/pytree.py @XuehaiPan
# Relating to libtorch ABI
/torch/csrc/stable/ @janeyx99 @mikaylagawarecki
/torch/headeronly/ @janeyx99
/torch/header_only_apis.txt @janeyx99

View File

@ -47,18 +47,6 @@ WORKDIR /opt/pytorch
COPY . .
RUN git submodule update --init --recursive
FROM conda as build
ARG CMAKE_VARS
WORKDIR /opt/pytorch
COPY --from=conda /opt/conda /opt/conda
COPY --from=submodule-update /opt/pytorch /opt/pytorch
RUN make triton
RUN --mount=type=cache,target=/opt/ccache \
export eval ${CMAKE_VARS} && \
TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
python -m pip install --no-build-isolation -v .
FROM conda as conda-installs
ARG PYTHON_VERSION=3.11
ARG CUDA_PATH=cu121
@ -109,4 +97,5 @@ WORKDIR /workspace
FROM official as dev
# Should override the already installed version from the official-image stage
COPY --from=build /opt/conda /opt/conda
COPY --from=conda /opt/conda /opt/conda
COPY --from=submodule-update /opt/pytorch /opt/pytorch

View File

@ -243,7 +243,7 @@ git submodule update --init --recursive
```bash
conda install cmake ninja
# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section below
# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section above
pip install -r requirements.txt
```
@ -276,7 +276,7 @@ conda install pkg-config libuv
pip install mkl-static mkl-include
# Add these packages if torch.distributed is needed.
# Distributed package support on Windows is a prototype feature and is subject to changes.
conda install -c conda-forge libuv=1.39
conda install -c conda-forge libuv
```
#### Install PyTorch
@ -560,7 +560,7 @@ To learn more about making a contribution to Pytorch, please see our [Contributi
PyTorch is a community-driven project with several skillful engineers and researchers contributing to it.
PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means.
PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), [Alban Desmaison](https://github.com/albanD), [Piotr Bialecki](https://github.com/ptrblck) and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means.
A non-exhaustive but growing list needs to mention: [Trevor Killeen](https://github.com/killeent), [Sasank Chilamkurthy](https://github.com/chsasank), [Sergey Zagoruyko](https://github.com/szagoruyko), [Adam Lerer](https://github.com/adamlerer), [Francisco Massa](https://github.com/fmassa), [Alykhan Tejani](https://github.com/alykhantejani), [Luca Antiga](https://github.com/lantiga), [Alban Desmaison](https://github.com/albanD), [Andreas Koepf](https://github.com/andreaskoepf), [James Bradbury](https://github.com/jekbradbury), [Zeming Lin](https://github.com/ebetica), [Yuandong Tian](https://github.com/yuandong-tian), [Guillaume Lample](https://github.com/glample), [Marat Dukhan](https://github.com/Maratyszcza), [Natalia Gimelshein](https://github.com/ngimel), [Christian Sarofeen](https://github.com/csarofeen), [Martin Raison](https://github.com/martinraison), [Edward Yang](https://github.com/ezyang), [Zachary Devito](https://github.com/zdevito). <!-- codespell:ignore -->
Note: This project is unrelated to [hughperkins/pytorch](https://github.com/hughperkins/pytorch) with the same name. Hugh is a valuable contributor to the Torch community and has helped with many things Torch and PyTorch.

View File

@ -119,6 +119,8 @@ file(GLOB_RECURSE native_mps_cpp "native/mps/*.cpp")
file(GLOB_RECURSE native_mps_mm "native/mps/*.mm")
file(GLOB_RECURSE native_mps_metal "native/mps/*.metal")
file(GLOB_RECURSE native_mps_h "native/mps/*.h")
file(GLOB_RECURSE native_sparse_mps_mm "native/sparse/mps/*.mm")
file(GLOB_RECURSE native_mps_sparse_metal "native/sparse/mps/*.metal")
file(GLOB native_sparse_cpp "native/sparse/*.cpp")
file(GLOB native_quantized_cpp
@ -178,26 +180,27 @@ file(GLOB native_flash_attn_api_cpp "native/transformers/cuda/flash_attn/flash_a
file(GLOB flash_attention_hip_hip "native/transformers/hip/flash_attn/*.hip")
# if USE_FLASH_ATTENTION is set, ensure CK instances get generated
if(USE_FLASH_ATTENTION)
if(DEFINED ENV{USE_CK_FLASH_ATTENTION})
set(USE_CK_FLASH_ATTENTION $ENV{USE_CK_FLASH_ATTENTION})
if(USE_CK_FLASH_ATTENTION STREQUAL "1")
if(DEFINED ENV{PYTORCH_ROCM_ARCH})
list(LENGTH PYTORCH_ROCM_ARCH NUM_ARCHS)
if(NUM_ARCHS GREATER 1)
message(WARNING "Building CK for multiple archs can increase build time considerably!
Consider setting PYTORCH_ROCM_ARCH env var value as the gfx arch you need to build for")
endif()
endif()
message(STATUS "USE_CK_FLASH_ATTENTION is set; building PyTorch with CK Flash Attention enabled")
message(STATUS "Generating CK kernel instances...")
add_subdirectory(native/transformers/hip/flash_attn/ck)
file(GLOB flash_attention_hip_ck_hip "native/transformers/hip/flash_attn/ck/*.hip")
list(APPEND native_transformers_hip_hip ${flash_attention_hip_ck_hip})
# FAv3 Generation
add_subdirectory(native/transformers/hip/flash_attn/ck/fav_v3)
file(GLOB flash_attention_v3_hip "native/transformers/hip/flash_attn/ck/fav_v3/*.hip")
list(APPEND native_transformers_hip_hip ${flash_attention_v3_hip})
if("$ENV{USE_CK_FLASH_ATTENTION}" STREQUAL "1")
message(STATUS "USE_CK_FLASH_ATTENTION is being deprecated. Please use USE_ROCM_CK_SDPA instead")
caffe2_update_option(USE_ROCM_CK_SDPA ON)
endif()
if(USE_ROCM_CK_SDPA)
if(DEFINED ENV{PYTORCH_ROCM_ARCH})
list(LENGTH PYTORCH_ROCM_ARCH NUM_ARCHS)
if(NUM_ARCHS GREATER 1)
message(WARNING "Building CK for multiple archs can increase build time considerably!
Consider setting PYTORCH_ROCM_ARCH env var value as the gfx arch you need to build for")
endif()
endif()
message(STATUS "USE_ROCM_CK_SDPA is set; building PyTorch with CK SDPA enabled")
message(STATUS "Generating CK kernel instances...")
add_subdirectory(native/transformers/hip/flash_attn/ck)
file(GLOB flash_attention_hip_ck_hip "native/transformers/hip/flash_attn/ck/*.hip")
list(APPEND native_transformers_hip_hip ${flash_attention_hip_ck_hip})
# FAv3 Generation
add_subdirectory(native/transformers/hip/flash_attn/ck/fav_v3)
file(GLOB flash_attention_v3_hip "native/transformers/hip/flash_attn/ck/fav_v3/*.hip")
list(APPEND native_transformers_hip_hip ${flash_attention_v3_hip})
endif()
file(GLOB flash_attention_hip_aot_hip "native/transformers/hip/flash_attn/aot/*.hip")
file(GLOB flash_attention_src_hip_hip "native/transformers/hip/flash_attn/src/*.hip")
@ -247,6 +250,50 @@ if(USE_MEM_EFF_ATTENTION)
list(APPEND ATen_ATTENTION_KERNEL_SRCS ${mem_eff_attention_cuda_kernels_cu})
endif()
IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF")
set(USE_FBGEMM_GENAI off)
endif()
# FBGEMM GenAI
IF(USE_FBGEMM_GENAI)
set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/)
set(FBGEMM_GENAI_DIR ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize)
if(USE_ROCM)
# Only include the kernels we want to build to avoid increasing binary size.
file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
"${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
"${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-amdgpu-coerce-illegal-types=1
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
target_include_directories(fbgemm_genai PUBLIC
# FBGEMM version of Composable Kernel is used due to some customizations
${FBGEMM_THIRD_PARTY}/composable_kernel/include
${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
${FBGEMM_GENAI_DIR}/include/
${FBGEMM_GENAI_DIR}/common/include/
)
endif()
endif()
# XNNPACK
file(GLOB native_xnnpack "native/xnnpack/*.cpp")
@ -372,39 +419,42 @@ if(USE_CUDA)
endif()
if(USE_ROCM)
# NOTE: The PyTorch build does not actually add_subdirectory
# third_party/composable_kernel or use it as a CMake library. What is used
# is header only, so this should be ok, except that the CMake build generates
# a ck/config.h. We just do that part here. Without this, the ck.h from the
# ROCM SDK may get accidentally used instead.
function(_pytorch_rocm_generate_ck_conf)
set(CK_ENABLE_INT8 "ON")
set(CK_ENABLE_FP16 "ON")
set(CK_ENABLE_FP32 "ON")
set(CK_ENABLE_FP64 "ON")
set(CK_ENABLE_BF16 "ON")
set(CK_ENABLE_FP8 "ON")
set(CK_ENABLE_BF8 "ON")
set(CK_USE_XDL "ON")
set(CK_USE_WMMA "ON")
configure_file(
"${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in"
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h"
)
endfunction()
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include)
_pytorch_rocm_generate_ck_conf()
if((USE_FLASH_ATTENTION AND USE_ROCM_CK_SDPA) OR USE_ROCM_CK_GEMM)
# NOTE: The PyTorch build does not actually add_subdirectory
# third_party/composable_kernel or use it as a CMake library. What is used
# is header only, so this should be ok, except that the CMake build generates
# a ck/config.h. We just do that part here. Without this, the ck.h from the
# ROCM SDK may get accidentally used instead.
function(_pytorch_rocm_generate_ck_conf)
set(CK_ENABLE_INT8 "ON")
set(CK_ENABLE_FP16 "ON")
set(CK_ENABLE_FP32 "ON")
set(CK_ENABLE_FP64 "ON")
set(CK_ENABLE_BF16 "ON")
set(CK_ENABLE_FP8 "ON")
set(CK_ENABLE_BF8 "ON")
set(CK_USE_XDL "ON")
set(CK_USE_WMMA "ON")
configure_file(
"${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in"
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h"
)
endfunction()
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/example/ck_tile/01_fmha)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include)
_pytorch_rocm_generate_ck_conf()
endif()
# Next two lines are needed because TunableOp uses third-party/fmt
list(APPEND ATen_HIP_INCLUDE $<TARGET_PROPERTY:fmt::fmt-header-only,INTERFACE_INCLUDE_DIRECTORIES>)
list(APPEND ATen_HIP_DEPENDENCY_LIBS fmt::fmt-header-only)
if(USE_FLASH_ATTENTION)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/transformers/hip/flash_attn/ck)
endif()
if(USE_FLASH_ATTENTION AND USE_ROCM_CK_SDPA)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/transformers/hip/flash_attn/ck)
endif()
list(APPEND ATen_HIP_SRCS
${ATen_HIP_SRCS}
${hip_hip}
@ -414,12 +464,17 @@ endif()
${native_quantized_hip_hip}
${native_transformers_hip_hip} ${native_transformers_src_hip_hip}
)
if(WIN32) # Windows doesn't support Composable Kernels
if(NOT USE_ROCM_CK_GEMM)
file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip")
file(GLOB native_hip_ck "native/hip/ck*.hip")
exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}"
${native_hip_bgemm} ${native_hip_ck})
endif()
if(WIN32) # Windows doesn't support Composable Kernels and Triton
exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}"
${native_transformers_hip_hip} ${native_transformers_hip_cpp})
endif()
# TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources)
list(APPEND all_hip_cpp
${native_nested_hip_cpp}
@ -586,17 +641,10 @@ if(USE_CUDA AND NOT USE_ROCM)
CUDA::cufft_static_nocallback
)
if(NOT BUILD_LAZY_CUDA_LINALG)
if(CUDA_VERSION_MAJOR LESS_EQUAL 11)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
CUDA::cusolver_static
${CUDAToolkit_LIBRARY_DIR}/liblapack_static.a # needed for libcusolver_static
)
elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 12)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
CUDA::cusolver_static
${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a # needed for libcusolver_static
)
endif()
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
CUDA::cusolver_static
${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a # needed for libcusolver_static
)
endif()
else()
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
@ -661,29 +709,25 @@ endif()
if(USE_MPS)
include(../../../cmake/Metal.cmake)
set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h})
set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h} ${native_sparse_mps_mm})
if(CAN_COMPILE_METAL)
foreach(SHADER ${native_mps_metal})
foreach(SHADER ${native_mps_metal} ${native_mps_sparse_metal})
cmake_path(GET SHADER STEM TGT_STEM)
string(CONCAT TGT_BASIC ${TGT_STEM} "_30.air")
string(CONCAT TGT_BFLOAT ${TGT_STEM} "_31.air")
string(CONCAT TGT_BASIC ${TGT_STEM} "_31.air")
list(APPEND AIR_BASIC ${TGT_BASIC})
list(APPEND AIR_BFLOAT ${TGT_BFLOAT})
metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.0")
metal_to_air(${SHADER} ${TGT_BFLOAT} "-std=metal3.1")
metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.1")
endforeach()
air_to_metallib(kernels_basic.metallib ${AIR_BASIC})
air_to_metallib(kernels_bfloat.metallib ${AIR_BFLOAT})
add_custom_command(
COMMAND echo "// $$(date)" > metallib_dummy.cpp
DEPENDS kernels_basic.metallib kernels_bfloat.metallib
DEPENDS kernels_basic.metallib
OUTPUT metallib_dummy.cpp
COMMENT "Updating metallibs timestamp")
add_custom_target(metallibs DEPENDS kernels_basic.metallib kernels_bfloat.metallib metallib_dummy.cpp)
add_custom_target(metallibs DEPENDS kernels_basic.metallib metallib_dummy.cpp)
else()
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/native/mps")
foreach(SHADER ${native_mps_metal})
foreach(SHADER ${native_mps_metal} ${native_mps_sparse_metal})
cmake_path(GET SHADER STEM TGT_STEM)
string(CONCAT SHADER_HDR_NAME "${CMAKE_CURRENT_BINARY_DIR}" /native/mps/ ${TGT_STEM} "_metallib.h")
metal_to_metallib_h(${SHADER} ${SHADER_HDR_NAME})

View File

@ -334,6 +334,14 @@ void Context::setBenchmarkLimitCuDNN(int b) {
benchmark_limit_cudnn = b;
}
bool Context::immediateMiopen() const {
return immediate_miopen;
}
void Context::setImmediateMiopen(bool b) {
immediate_miopen = b;
}
bool Context::allowTF32CuBLAS() const {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
@ -472,6 +480,9 @@ at::BlasBackend Context::blasPreferredBackend() {
// call site for blasPreferredBackend(), we set it to an actual value.
if (blas_preferred_backend == at::BlasBackend::Default) {
blas_preferred_backend = at::BlasBackend::Cublas;
// This logic sits in the getter because it needs to validate
// values set via env vars such as TORCH_BLAS_PREFER_CUBLASLT
// which initialize the backend without calling the setter
#ifdef USE_ROCM
// AMD Instinct targets prefer hipblaslt
static const bool hipblaslt_preferred = []() {
@ -501,10 +512,14 @@ at::BlasBackend Context::blasPreferredBackend() {
// hipblaslt support for all archs is not as complete as hipblas
if (blas_preferred_backend == at::BlasBackend::Cublaslt) {
static const bool hipblaslt_unsupported = []() {
if(!hasCuBLASLt())
{
return true;
}
static const std::vector<std::string> archs = {
"gfx90a", "gfx942",
#if ROCM_VERSION >= 60300
"gfx1100", "gfx1101", "gfx1200", "gfx1201",
"gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
#endif
#if ROCM_VERSION >= 60500
"gfx950"
@ -526,6 +541,24 @@ at::BlasBackend Context::blasPreferredBackend() {
return blas_preferred_backend;
}
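
For reference, the backend that this getter resolves and validates can also be driven from Python (or via the TORCH_BLAS_PREFER_CUBLASLT environment variable mentioned above). A hedged sketch, assuming a CUDA/ROCm build; the accepted string values reflect current torch.backends.cuda behavior and may vary by release:

```python
import torch

# Query the currently preferred BLAS backend; on ROCm, the getter above may
# rewrite the "Default" value to hipBLASLt on supported Instinct architectures.
print(torch.backends.cuda.preferred_blas_library())

# Request cuBLASLt explicitly; the setter validates that PyTorch was built
# with cuBLASLt support and warns that the knob is experimental.
torch.backends.cuda.preferred_blas_library("cublaslt")
```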
bool Context::ckSupported() {
#ifdef USE_ROCM
static const std::vector<std::string> supported_archs = {
"gfx90a", "gfx942", "gfx950"
};
for (auto index : c10::irange(detail::getCUDAHooks().deviceCount())) {
if(!detail::getCUDAHooks().isGPUArch(supported_archs, index)) {
TORCH_WARN_ONCE(
"Attempting to use CK on an unsupported architecture! Cannot set backend to CK");
return false;
}
}
return true;
#else
return false;
#endif
}
void Context::setBlasPreferredBackend(at::BlasBackend b) {
#ifdef _MSC_VER
TORCH_WARN_ONCE(
@ -535,8 +568,14 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) {
#else
TORCH_CHECK((b != at::BlasBackend::Cublaslt) || hasCuBLASLt(),
"Cannot set preferred backend to cuBLASLt if PyTorch has not been compiled with cuBLASLt.");
TORCH_CHECK((b != at::BlasBackend::Ck) || hasROCM(),
"Cannot set preferred backend to Ck if PyTorch has not been compiled for ROCm.");
#ifdef USE_ROCM
static const bool ckSupportedFlag = ckSupported();
static const bool hasCKGEMMFlag = hasCKGEMM();
TORCH_CHECK((b != at::BlasBackend::Ck) || (ckSupportedFlag && hasCKGEMMFlag),
"Cannot set preferred blas backend to CK since following conditions are not true: ",
"architecture supported for CK: ", ckSupportedFlag,
", PyTorch built with CK GEMM support: ", hasCKGEMMFlag);
#endif
if (b != at::BlasBackend::Default && b != at::BlasBackend::Cublas) {
TORCH_WARN_ONCE(
"torch.backends.cuda.preferred_blas_library is an experimental feature. "
@ -548,35 +587,40 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) {
#endif
}
at::ROCmFABackend Context::getROCmFAPreferredBackend() const {
at::ROCmFABackend Context::getROCmFAPreferredBackend() {
#ifdef USE_ROCM
// Set potential "Default" value so we don't have to interpret at call sites.
// We use aotriton backend as the default, for now.
if(rocm_fa_preferred_backend == at::ROCmFABackend::Default) {
rocm_fa_preferred_backend = at::ROCmFABackend::AOTriton;
} else if (rocm_fa_preferred_backend == at::ROCmFABackend::Ck) {
// This logic sits in the getter because it needs to validate
// values set via env vars such as TORCH_ROCM_FA_PREFER_CK
// which initialize the backend without calling the setter
// Perform validity checking
static const bool hasCKSDPAFlag = hasCKSDPA();
static const bool ckSupportedFlag = ckSupported();
if(!(hasCKSDPAFlag && ckSupportedFlag)){
TORCH_WARN_ONCE(
"Cannot set preferred SDPA backend to CK since following conditions are not true: ",
"architecture supported for CK: ", ckSupportedFlag,
", PyTorch built with CK SDPA support: ", hasCKSDPAFlag);
rocm_fa_preferred_backend = at::ROCmFABackend::AOTriton;
}
}
#endif
return rocm_fa_preferred_backend;
}
void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {
// TODO: add plumbing for hasCK for validity checking
TORCH_CHECK((b != at::ROCmFABackend::Ck) || hasROCM(),
"Cannot set preferred flash attention backend to Ck if PyTorch has not been compiled for ROCm.");
#ifdef USE_ROCM
if(b == at::ROCmFABackend::Ck) {
static const bool ck_unsupported = []() {
static const std::vector<std::string> archs = {
"gfx90a", "gfx942"
};
for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
if (!detail::getCUDAHooks().isGPUArch(archs, index)) {
TORCH_WARN_ONCE(
"Attempting to use CK on an unsupported architecture! Cannot set backend to CK");
return true;
}
}
return false;
}();
if(!ck_unsupported) rocm_fa_preferred_backend = b;
}
else {
rocm_fa_preferred_backend = b;
}
static const bool hasCKSDPAFlag = hasCKSDPA();
static const bool ckSupportedFlag = ckSupported();
TORCH_CHECK((b != at::ROCmFABackend::Ck) || (hasCKSDPAFlag && ckSupportedFlag),
"Cannot set preferred SDPA backend to CK since following conditions are not true: ",
"architecture supported for CK: ", ckSupportedFlag,
", PyTorch built with CK SDPA support: ", hasCKSDPAFlag);
#endif
rocm_fa_preferred_backend = b;
}
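
Similarly, the ROCm flash-attention backend validated above can be toggled from Python. A hedged sketch, assuming a ROCm build with CK SDPA compiled in; the preferred_rocm_fa_library name mirrors the C++ plumbing here and may differ across releases:

```python
import torch

# Prefer the Composable Kernel SDPA backend; on unsupported architectures or
# builds without CK SDPA, the checks above keep (or fall back to) AOTriton.
torch.backends.cuda.preferred_rocm_fa_library("ck")
print(torch.backends.cuda.preferred_rocm_fa_library())
```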

View File

@ -132,6 +132,7 @@ class TORCH_API Context {
static bool hasKleidiAI();
static bool hasLAPACK();
static bool hasMKLDNN();
static bool ckSupported();
static bool hasMAGMA() {
return detail::getCUDAHooks().hasMAGMA();
}
@ -162,6 +163,12 @@ class TORCH_API Context {
static bool hasROCM() {
return detail::getCUDAHooks().hasROCM();
}
static bool hasCKSDPA() {
return detail::getCUDAHooks().hasCKSDPA();
}
static bool hasCKGEMM() {
return detail::getCUDAHooks().hasCKGEMM();
}
static bool hasHIP() {
return detail::getHIPHooks().hasHIP();
}
@ -205,6 +212,8 @@ class TORCH_API Context {
void setBenchmarkCuDNN(bool);
int benchmarkLimitCuDNN() const;
void setBenchmarkLimitCuDNN(int);
bool immediateMiopen() const;
void setImmediateMiopen(bool);
bool deterministicCuDNN() const;
void setDeterministicCuDNN(bool);
bool deterministicMkldnn() const;
@ -250,7 +259,7 @@ class TORCH_API Context {
at::BlasBackend blasPreferredBackend();
void setBlasPreferredBackend(at::BlasBackend);
at::ROCmFABackend getROCmFAPreferredBackend() const;
at::ROCmFABackend getROCmFAPreferredBackend();
void setROCmFAPreferredBackend(at::ROCmFABackend);
// Note [Enabling Deterministic Operations]
@ -440,6 +449,7 @@ class TORCH_API Context {
bool enabled_overrideable = true;
bool allow_fp16_bf16_reduction_mathSDP = false;
bool benchmark_cudnn = false;
bool immediate_miopen = false;
Float32MatmulPrecision float32_matmul_precision =
c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true
? at::Float32MatmulPrecision::HIGH

View File

@ -132,6 +132,9 @@ DLDevice torchDeviceToDLDevice(at::Device device) {
case DeviceType::PrivateUse1:
ctx.device_type = DLDeviceType::kDLExtDev;
break;
case DeviceType::MPS:
ctx.device_type = DLDeviceType::kDLMetal;
break;
default:
TORCH_CHECK_BUFFER(false, "Cannot pack tensors on " + device.str());
}
@ -164,6 +167,8 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat
return at::Device(DeviceType::MAIA, index);
case DLDeviceType::kDLExtDev:
return at::Device(DeviceType::PrivateUse1, index);
case DLDeviceType::kDLMetal:
return at::Device(DeviceType::MPS, index);
default:
TORCH_CHECK_BUFFER(
false, "Unsupported device_type: ", std::to_string(type));
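
With the kDLMetal mapping added above, MPS tensors should be able to round-trip through DLPack. A hedged sketch, guarded because MPS availability and DLPack support for it depend on the build and macOS environment:

```python
import torch
from torch.utils.dlpack import from_dlpack, to_dlpack

if torch.backends.mps.is_available():
    x = torch.arange(4, device="mps", dtype=torch.float32)
    capsule = to_dlpack(x)    # packed with device_type kDLMetal per the mapping above
    y = from_dlpack(capsule)  # kDLMetal maps back to the MPS device
    print(y.device, torch.equal(x.cpu(), y.cpu()))
```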

View File

@ -1,5 +1,6 @@
#pragma once
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/core/DeviceType.h>
#include <c10/macros/Macros.h>
@ -72,6 +73,27 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index);
// original device index that was active before the change.
TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index);
TORCH_API inline void emptyCache() {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->emptyCache();
}
TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats(
c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
return at::getDeviceAllocator(device_type)->getDeviceStats(device_index);
}
TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index);
}
TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
}
} // namespace at::accelerator
namespace at {

View File

@ -31,7 +31,9 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) {
return at::globalContext().getPinnedMemoryAllocator(opt_device_type);
} else {
TORCH_CHECK(
false, "Need to provide pin_memory allocator to use pin memory.")
false,
"pin_memory=True requires a CUDA or other accelerator backend; "
"no pinned memory allocator is available on this system.")
}
}
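
For context, the reworded error above is what surfaces when pinned memory is requested without any accelerator backend. A small illustration, assuming a CPU-only build:

```python
import torch

try:
    # With no CUDA/ROCm/other accelerator available there is no pinned-memory
    # allocator, so this raises the error message shown above.
    torch.empty(8, pin_memory=True)
except RuntimeError as err:
    print(err)
```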

View File

@ -9,7 +9,36 @@
namespace at {
// TODO: add a note explaining the design decisions
/*
* Design:
* 1. ZeroTensors are regular tensors with TensorOptions, a storage
* pointing to nullptr and a ZeroTensor dispatch key set.
*
* 2. ZeroTensors are immutable. This is done to prevent data races under multithreading
* (e.g., when two threads read the same zero tensor and try to materialize it in place).
*
* 3. ZeroTensor has a boxed fallback that will be dispatched to any ops that don't
* have special ZeroTensor handling. This fallback materializes each ZeroTensor to
* `at::zeros({}, tensor.options()).expand(tensor.sizes())`.
* 4. ZeroTensors are handled above autograd. This is necessary because fallback
* operations are not differentiable.
* - Example: Consider add in the case where it uses the fallback: zerotensor_a + b.
* zerotensor_a would be materialized to c = torch.zeros_like(zerotensor_a) after
* passing through the fallback. If this happened above autograd, the
* gradients would be populated on c instead of zerotensor_a.
*
* 5. The grad field is always populated with an honest-to-goodness tensor. This
* materialization of ZeroTensors happens in:
* - AccumulateGrad for Backward Mode AD.
* - It is never required for Forward Mode AD.
* - This is because if all the tangents were undefined (efficient ZeroTensors),
* no computation will be performed (this is ensured via an existing pre-check).
*
* Today, ZeroTensors are primarily used to represent undefined gradients in forward AD.
* They do not perfectly handle NaNs and Infs, since we don't check the actual values
* and assume that they are non-zero, non-inf, non-NaN, etc.
*/
// ZeroTensors are designed to be immutable. Thus, we error out when an in-place operation is performed on ZeroTensors
static void zeroTensorFallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
const auto& arguments = op.schema().arguments();
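
A hedged Python-level illustration of the behavior described in the design note above; torch._efficientzerotensor and Tensor._is_zerotensor are private helpers and are assumed here purely for illustration:

```python
import torch

z = torch._efficientzerotensor(3)  # storage-less tensor carrying the ZeroTensor dispatch key
print(z._is_zerotensor())          # True

out = z + torch.ones(3)            # ops without dedicated handling hit the boxed fallback,
print(out)                         # which materializes z as zeros before dispatching

try:
    z.add_(1.0)                    # per the note, ZeroTensors are immutable, so in-place
except RuntimeError as err:        # mutation is expected to be rejected
    print(err)
```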

View File

@ -239,6 +239,7 @@ TORCH_LIBRARY_IMPL(aten, AutocastMPS, m) {
KERNEL_MPS(scaled_dot_product_attention, lower_precision_fp)
// fp32
KERNEL_MPS(conv_transpose3d, input, fp32)
KERNEL_MPS(acos, fp32)
KERNEL_MPS(asin, fp32)
KERNEL_MPS(cosh, fp32)
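
The KERNEL_MPS(acos, fp32) entry added above forces acos to run in float32 under MPS autocast. A hedged sketch, assuming an MPS-enabled macOS build with autocast support:

```python
import torch

if torch.backends.mps.is_available():
    x = torch.rand(4, device="mps")
    with torch.autocast(device_type="mps", dtype=torch.float16):
        y = torch.acos(x)                             # listed as fp32 above, so stays float32
        z = torch.mm(x.unsqueeze(0), x.unsqueeze(1))  # lower-precision op, runs in float16
    print(y.dtype, z.dtype)
```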

Some files were not shown because too many files have changed in this diff.