Previous work #158352 delivered a CUDAGraph memory footprint reduction with no replay-time impact, but capture time regressed (up to 20× slower) due to repeated full-graph traversals. See the previous benchmark results [here](https://github.com/pytorch/pytorch/pull/158352#issuecomment-3215947565).
This PR removes the capture/replay overhead while preserving the memory savings:
1. **Terminals as free markers**
We stop inserting empty nodes and instead record the current stream terminals as free markers. This avoids mutating the user’s graph and keeps semantics unchanged.
2. **Incremental, cached reachability**
We add a **per-graph reuse context** that caches reverse-traversal state:
* `graph_reuse_context[graph].visited[stream]` tracks nodes already seen from that stream’s terminal frontier.
* On each allocation during capture, we resume traversal from the latest terminals and only visit unseen nodes.
* A block is freed when all its recorded markers are in the visited set of its allocation stream—i.e., all markers are proven predecessors of future work.
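A minimal Python sketch of the cached reverse-traversal bookkeeping described above (the actual implementation lives in the C++ caching allocator; `alloc_stream` and `free_markers` on the block are illustrative stand-ins, not the PR's exact fields):
```python
from collections import defaultdict

class GraphReuseContext:
    """Sketch of graph_reuse_context[graph]: cached reachability per allocation stream."""

    def __init__(self):
        # stream -> set of nodes already proven to precede future work on that stream
        self.visited = defaultdict(set)

    def update_visited(self, stream, terminals, predecessors):
        # Resume the reverse traversal from the latest terminals, visiting only unseen nodes.
        seen = self.visited[stream]
        stack = [t for t in terminals if t not in seen]
        while stack:
            node = stack.pop()
            if node in seen:
                continue
            seen.add(node)
            stack.extend(predecessors(node))

    def block_can_be_freed(self, block):
        # A block is freed when all its recorded markers are in the visited set of its
        # allocation stream, i.e. all markers are proven predecessors of future work.
        return all(m in self.visited[block.alloc_stream] for m in block.free_markers)
```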
See [the performance results here](https://docs.google.com/spreadsheets/d/e/2PACX-1vRPvdd9Xa8W87ixbiA0da_qvOhrUAjUpFz0G-_j-MsDnoeRyhEa4_ut_W3rqcg1VVZVFJ-gucwov-3b/pubhtml?gid=1468302443&single=true). We sweep synthetic multi-stream CUDA Graphs built by `capture_benchmark.py` (same as before: we generate random interleavings of alloc/free/join with given probabilities; see [gist here](https://gist.github.com/eee4017/e2092d215b1d4bd46534148939af39e3)) and compare median capture/replay times and memory. On an NVIDIA H100 PCIe across 24 configs, the optimization preserves the reserved-memory reduction at ~24–98%, leaves allocated memory unchanged, and brings capture time back to baseline (0.96–1.04× vs. baseline) with replay time unchanged (0.97–1.11×).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162186
Approved by: https://github.com/eqy, https://github.com/ngimel
**Summary:** In order to ensure that replicate acts as intended (a specialized version of hsdp) we need to make sure that it can pass the same tests that fully_shard can for training. The first test verifies Replicate works with gradient accumulation properly. The second verifies that replicate works correctly with a One-Forward-One-Backward (1F1B) pipeline parallelism schedule
**Test Cases**
1. pytest test/distributed/_composable/test_replicate_training.py -k test_gradient_accumulation
2. pytest test/distributed/_composable/test_replicate_training.py -k test_1f1b_microbatching
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162839
Approved by: https://github.com/mori360
ghstack dependencies: #162830, #162836
Those were very useful in the past, because:
- CI builder jobs did not generate wheels, but rather ran `python setup.py develop` and shared docker layers; this is no longer the case, as all CI jobs now produce wheels
- CD jobs were targeting the pre-CXX11 ABI, but this is no longer the case after the manylinux2_28 migration
Existing, but acceptable gaps:
- Windows libtorch debug builds sometimes might fail, but IMO it's OK not to be able to produce those for a few days, as the number of libtorch users is somewhat small
- All CD jobs are based on AlmaLinux, while CI jobs are based on Ubuntu, but this could be adjusted if needed; besides, AlmaLinux 9 and Ubuntu 22.04 are pretty close in terms of glibc and gcc versions
- CD jobs build for all GPU architectures, while CI builds only for the one being tested, but there are now periodic H100 and B200 jobs, and not a lot of development happens for Volta or Pascal
Besides, there are better tools to alert about nightly failures
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164260
Approved by: https://github.com/seemethere, https://github.com/atalman
Fixes #147521
This modification allows users to pass a `var` of any size to `GaussianNLLLoss`, as long as `var` is broadcastable to the input/target's size.
Therefore, the demo code in #147521 will produce the expected behaviour and correct output.
This allows all input sizes that match:
`input.size = (..., n, ...), var.size = (..., 1, ...)`
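A short usage example of the broadcastable `var` pattern above (shapes are illustrative):
```python
import torch
import torch.nn as nn

loss = nn.GaussianNLLLoss()
input = torch.randn(16, 10, requires_grad=True)  # (..., n, ...) with n = 10
target = torch.randn(16, 10)
var = torch.ones(16, 1)                          # (..., 1, ...), broadcastable to (16, 10)
loss(input, target, var).backward()
```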
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147522
Approved by: https://github.com/mikaylagawarecki
Fixes #163702.
This fixes 2 issues:
1. The value may inconsistently be a shape or string. This normalizes to handle both of these.
2. 1D shapes should not transpose data. This fixes the order of operations to prevent this.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163966
Approved by: https://github.com/eellison
Summary:
As titled.
SDPA selects its backend based on a hardware check, which fails when exporting with CUDA under fake mode on a CUDA-less machine.
We guard the `at::cuda::getCurrentDeviceProperties()` call with an `at::cuda::is_available()` check and give warnings.
Test Plan: buck2 run mode/dev-nosan caffe2/test:test_export -- -r nn_functional_scaled_dot_product_attention
Differential Revision: D83496154
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164162
Approved by: https://github.com/SherlockNoMad
# Problem
Inductor's FX backend receives sympy expressions for Triton launch grids, and passes these to a tracer to generate equivalent FX IR. However, the tracer does not support all possible sympy expressions. In particular, it can't handle ops like `floor` and `Pow` which would be found in an expression like `floor(x / y)`. Instead, it expects `FloorDiv(x, y)`, which has the advantage that all intermediate values are integers, unlike `x / y`.
Inductor's Python backend uses a trick where `ceil(x / y)` is computed in Python as `-(x // -y)`, which is faster when evaluating Python launch grids at runtime. However, this trick generates more complex sympy expressions, so the FX backend introduced a `"python_slow"` mode using a more familiar form of ceil division. However, this mode is slower to evaluate, which increased production CPU usage. (Internal reviewers see T237853632.)
# Solution
To get the best of both worlds, this PR removes `"python_slow"` mode, and generalizes the `replace_floor_div` function to handle the more complex expressions resulting from the `"python"` grid mode. The new algorithm is conceptually similar to the existing one, except instead of analyzing only the first argument to a `sympy.Mul` op, it checks all factors, so it can handle expressions containing both `Rational` and `Pow` ops, among other cases. It also uses `Mul.make_args` to handle the case when the argument to `floor` is not a `Mul`. Finally, it uses `expr.is_positive` to check the sign of symbolic exponents.
This new algorithm is guaranteed to convert all `floor` ops to an equivalent expression using `FloorDiv`. (To see this, consider that `floor(x) == FloorDiv(x, 1)`.) Note it may not remove all `Pow` ops, with a counterexample being `floor(x / (2 + z ** y))`, but it covers everything we've seen in practice for symbolic launch grids. In particular, it covers the typical case where `Pow` is a factor of the argument to `floor`, and the exponent is `-1`. In this situation, we move the `Pow` to the denominator of `FloorDiv` and the exponent becomes `1`, eliminating the `Pow` op.
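A rough sympy sketch of the factor-by-factor rewrite described above; this is not the Inductor implementation, just the core idea of moving negative-power factors into the denominator so the result can be expressed as `FloorDiv(numerator, denominator)`:
```python
import sympy

x, y = sympy.symbols("x y", positive=True, integer=True)

expr = sympy.floor(x / y)                # the argument is Mul(x, Pow(y, -1))
arg = expr.args[0]

numer, denom = sympy.S.One, sympy.S.One
for factor in sympy.Mul.make_args(arg):  # handles the non-Mul case too
    base, exp = factor.as_base_exp()
    if exp.is_negative:                  # Pow with a negative exponent -> denominator
        denom *= base ** (-exp)
    else:
        numer *= factor

print(numer, denom)                      # x y  ->  becomes FloorDiv(x, y)
```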
# Test plan
This PR adds an end-to-end test for static padding with dynamic outer dimensions, which creates a difficult sympy expression that the existing algorithm would not be able to handle.
This PR also adds some unit tests for the `replace_floor_div` function. It can be difficult to construct end-to-end tests that expose all the trickiest expressions, as those tests have to pass through a number of other systems handling dynamic shapes. Therefore, it's easier to expose the edge cases with these new unit tests. The tests check that we can replace all `floor` ops in the input expression with `FloorDiv`, then they expand `FloorDiv` back to `floor` and check equality with the original expression.
Note this PR also requires some MTIA changes to pass internal tests. Those will be stacked onto the imported diff.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163828
Approved by: https://github.com/nandesuka, https://github.com/angelayi, https://github.com/jansel
Summary:
Under certain circumstances it seems reasonable to return a callable directly, without a guard check, when the user uses aot_compile on a function with a single compilation result.
When there are multiple entries (aot_compile_module), we should start enabling guard checks to differentiate the compiled functions.
Test Plan: CI
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163432
Approved by: https://github.com/dolpm, https://github.com/mlazos
# Summary
- Add a note to each `nn.LPPool*d` docstring explaining how `ceil_mode=True` interacts with right padding.
- Mirror the same clarification in the `torch.nn.functional.lp_pool*` docstrings so the rendered functional docs stay in sync.
# Motivation
The current PyTorch spec for **LPPool** does not fully match runtime behavior, which has led to downstream confusion in other specs (e.g., ONNX) and runtimes (e.g., [onnxruntime issue #25848](https://github.com/microsoft/onnxruntime/issues/25848)). A corresponding clarification was also made in the ONNX spec: [onnx/onnx#5741](https://github.com/onnx/onnx/pull/5741).
PyTorch’s **LPPool** implementation calls into **AvgPool**, which enforces the rule that windows starting entirely in the right padded region are ignored when `ceil_mode=True`. As a result, **LPPool** inherits the same behavior.
This is an edge case where the output size formula shown in the LPPool docs/spec is not sufficient on its own. Without the added caveat, the documentation is technically incorrect. This PR brings the LPPool docs in line with actual behavior.
Note that this is a trivial fix to the spec as all major implementers of the spec adhere to this caveat.
For comparison, both **MaxPool** and **AvgPool** already include this clarification in their spec. Their docstrings explicitly state:
> *When `ceil_mode=True`, sliding windows are allowed to go off-bounds if they start within the left padding or the input. Sliding windows that would start in the right padded region are ignored.*
Adding the same note to LPPool ensures consistency across all pooling operators.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163186
Approved by: https://github.com/mikaylagawarecki
This is a simple refactor that just moves some logic in `_precompile_config` to two new functions for separation of concerns. This will allow subclasses e.g. out of tree to configure options and metadata for triton.compile.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162406
Approved by: https://github.com/exclamaforte
Fixes #156052 and #156444.
This PR sets up the privateuseone key in Python to be used as a Python backend for PyTorch.
Meaning that, after calling `setup_privateuseone_for_python_backend('npy')`, one can use a subclass with that device to hold arbitrary Python data as "device data" and use `torch.library` to register ops that take that Tensor.
Changes done in this PR:
1. Register a vanilla Device Guard: I extended NoOpDeviceGuard to allow a device index of 0 and to not raise errors when event-related functions are accessed. Without those changes, calling backward would raise errors. (The CPU backend uses NoOpDeviceGuard just fine, although there seems to be special treatment of CPU in the autograd engine.)
2. The Tensor subclass is allowed to not have `__torch_dispatch__` if the device is not CUDA or CPU. The comment on the check suggests it was there to avoid a segfault when calling into ops that expect a storage. Here we have a different device, so we will not call into those ops.
3. A Python function that invokes the other incantations to set up the privateuseone backend.
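A hypothetical usage sketch of the flow above; the import path of the new helper and the subclass plumbing are assumptions, not the PR's literal API:
```python
import torch

# Assumed import path -- the PR text only gives the function name.
from torch.utils.backend_registration import setup_privateuseone_for_python_backend

# Activate the privateuseone key under the name "npy" (example name from the PR text).
setup_privateuseone_for_python_backend("npy")

# Register an op implementation under the PrivateUse1 dispatch key via torch.library.
lib = torch.library.Library("mylib", "DEF")
lib.define("double(Tensor x) -> Tensor")

def double_npy(x):
    # `x` would be the tensor subclass holding arbitrary Python data as its "device data";
    # a real backend would operate on the wrapped payload rather than on x directly.
    return x + x

lib.impl("double", double_npy, "PrivateUse1")
```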
This took inspiration of https://github.com/bdhirsh/pytorch_open_registration_example and https://github.com/tinygrad/tinygrad/blob/master/extra/torch_backend/wrapped_tensor.cpp; great thanks to @bdhirsh and @geohot.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157859
Approved by: https://github.com/albanD
I experimented with 3 paths to get a joint graph for a DTensorized module and input:
1. strict_export + aot_export_joint_with_descriptors
2. graph_capture + aot_export_joint_with_descriptors
3. aot_export_joint_with_descriptors alone
Added test to guard them.
Path 1 doesn't work, as the backward graph region is missing from the joint graph.
I am leaning towards making path 2 the recommended path.
If path 2 doesn't work going forward, we can fall back to path 3.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163609
Approved by: https://github.com/tugsbayasgalan
Co-authored-by: suo <suo@fb.com>
Summary:
Original commit changeset: 06888d7ebff0
Original Phabricator Diff: D82932788
Restricted the test to SM90 for scaled_grouped_mm
Test Plan: TBD (will share the linux CI results)
Differential Revision: D83283991
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163905
Approved by: https://github.com/angelayi
tl;dr performs bucketing while preserving comm-compute overlap.
In comm-compute overlap we will have a graph with:
```
def foo(...):
ag = all_gather(...)
hiding_compute = mm(...)
wait(ag)
```
There is no explicit dependency between the hiding compute and the collectives, but we want to add implicit dependencies from wait->hiding_compute, and from hiding_compute->all_gather to preserve overlap.
Additionally, while bucketing, we will merge collective starts and collective waits together. In this case, we will want to treat the two nodes as a single subgraph - each node in the merged set will have the union of all deps in the set.
We perform bucketing while augmenting the graph with these relationships. This can be done separably from comm-compute overlap, so long as the hiding compute relationships are passed in.
TODO:
- need to instrument fx graph so inductor respects these relationships.
- the compile time of the bucketing search can be sped up significantly by limiting what portion of the graph we traverse through
- more memory aware handling
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163960
Approved by: https://github.com/ruisizhang123, https://github.com/v0i0, https://github.com/IvanKobzarev
ghstack dependencies: #163215, #163754, #163959
In comm-compute overlap we will have a graph with:
```
def foo(...):
ag = all_gather(...)
hiding_compute = mm(...)
wait(ag)
```
There is no explicit dependency between the hiding compute and the collectives, but we want to add implicit dependencies from wait->hiding_compute, and from hiding_compute->all_gather to preserve overlap.
Additionally, while bucketing, we will merge collective starts and collective waits together. In this case, we will want to treat the two nodes as a single subgraph - each node in the merged set will have the union of all deps in the set.
This PR adds `AugmentedGraphHelper`, which provides those APIs and allows querying for dependencies on the augmented graph.
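A rough sketch of what such an augmented-dependency helper might look like (method names and structures here are illustrative, not the PR's exact API):
```python
from collections import defaultdict
import torch.fx as fx

class AugmentedDeps:
    """Extra edges and merged node-sets layered on top of an fx.Graph (illustrative only)."""

    def __init__(self, graph: fx.Graph):
        self.graph = graph
        self.extra_deps = defaultdict(set)  # node -> added predecessors (e.g. wait -> hiding_compute)
        self.merge_set = {}                 # node -> frozenset of nodes treated as one subgraph

    def add_dep(self, node, dep):
        self.extra_deps[node].add(dep)

    def merge(self, nodes):
        group = frozenset(nodes)
        for n in nodes:
            self.merge_set[n] = group

    def deps(self, node):
        # Each node in a merged set gets the union of all deps in the set.
        group = self.merge_set.get(node, frozenset([node]))
        out = set()
        for n in group:
            out |= set(n.all_input_nodes)   # real fx.Node data dependencies
            out |= self.extra_deps[n]
        return out - group
```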
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163959
Approved by: https://github.com/v0i0, https://github.com/IvanKobzarev
ghstack dependencies: #163215, #163754
This is the first part of the stack that does comm/compute reordering and then uses the exposure analysis to do bucketing.
Subsequent PRs will handle:
- use of exposure analysis to do bucketing
- make sure inductor respects comm/compute overlapping done at fx level
- non-profiling mm estimation/rank broadcasting of profile results
Other misc:
- Validate accuracy of NCCL estimations (use ruisi's profiling instead?)
For a llama 2d parallelism test, on forward, we overlap all but 2 of potentially hidden collectives. For backward, we overlap 217/269 of potentially hidden collectives. If you increase `compute_overlap_multipler` (for fudge factor of inaccurate comms estimation), that goes down to all but 16 of potentially hidden collectives.
fwd example: https://gist.github.com/eellison/76209c49d8829c5f1e323d34a3f040c3
bwd example: https://gist.github.com/eellison/6cfc2285df53a94cfa4012f5fdae5c51
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163215
Approved by: https://github.com/IvanKobzarev
----
- `cmake_dependent_option` condition should be `USE_ROCM OR (USE_CUDA AND NOT MSVC)` (similar to the one for flash attention)
- Default settings should be user overridable, i.e. even if one builds for SM_10, they should be able to pass `USE_FBGEMM_GENAI=0` and skip the build
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164165
Approved by: https://github.com/Skylion007
See also #163972, which was intended to be this PR.
Triton (release/3.5.x) by default ships CUDA12.8 ptxas.
This PR tries to bundle a ptxas version for cuda13, so that it can help https://github.com/pytorch/pytorch/issues/163801 when users run on new devices like THOR and Spark.
Fixes https://github.com/pytorch/pytorch/issues/163801
Test Plan:
Check binary size increase against nightly or v2.9RC.
Install the binary into a working THOR and GB200/GH100 machine (reproduce the original issue first on THOR), then install the binary built from this PR; we expect the issue to be gone without any additional user setting. Testing on GB200 is to ensure no regression.
Reference: https://github.com/pytorch/pytorch/pull/119750 and 5c814e2527
Note: with this PR, the pytorch world's torch.compile is supposed to find ptxas via "torch/_inductor/runtime/compile_tasks.py" and "_set_triton_ptxas_path". Use cases that do not go through "_set_triton_ptxas_path" may not be able to use the cuda13 ptxas binary.
However, as is, the Triton world does not know about the existence of this new CUDA 13 ptxas. So if a user thinks there is already a pytorch/bin/ptxas and deletes the ptxas shipped with Triton, then c6ad34f7eb/python/triton/knobs.py (L216) would still complain that ptxas is not found (if removed, Triton won't know this new one is available)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163988
Approved by: https://github.com/atalman
Previously we already replaced most use of `python setup.py develop/install`.
This PR also replaces the use of `setup.py bdist_wheel` with the modern `python -m build --wheel` alternative.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156712
Approved by: https://github.com/atalman
ghstack dependencies: #156711
* Changes some internal logic for grouping so hopefully it's slightly less annoying to write code for
* Changes the invoking file summary to just use file, which I think is correct most of the time
* Adds some fields to the file summary, like skips, errors, etc so I can reuse it for file report regression things
Output should be the same, maybe with slightly more fields since I got rid of some of the pops
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164016
Approved by: https://github.com/huydhn
Summary:
Previously, many arvr targets transitively depended on c10, not c10_ovrsource,
because they either explicitly depended on c10 (because they didn't know
better) or they depended on legacy Caffe2, which never got the ovrsource
treatment. So we found all these spots (driven by D82283623) and forced them
to query arvr mode to figure out which one they should use. The goal is you
NEVER have both targets in the same build rule at the same time.
This diff could be reverted if D82224960 works out but I haven't gotten it to work yet.
Test Plan: sandcastle
Reviewed By: EscapeZero
Differential Revision: D82390436
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164128
Approved by: https://github.com/albanD, https://github.com/malfet
# Problem
Inductor sometimes generates unbacked symints to handle things like mismatched branches of `torch.cond`. This code is represented by `pytree.KeyPath`, with special codegen logic to convert it to Python and C++. This was not previously supported by the FX backend.
# Feature
This PR adds support for unbacked symbol declarations to the FX backend. The implementation is fairly straightforward.
1. Instead of raw Python/C++, update the wrapper codegen method to emit a new Wrapper IR line called `UnbackedSymbolDefsLine`. This contains all the information needed to generate the Python and C++ code.
2. Move the existing Python/C++ codegen to a private method, which is invoked by `UnbackedSymbolDefsLine.codegen()`.
3. Implement a method to generate FX IR from unbacked symbol definitions. The implementation is based on recursive descent, consuming some keypath entries, emitting an FX IR node, and recursing to the rest of the keypath. It is conceptually identical to the existing algorithm for Python and C++, except it generates FX nodes.
4. The FX backend currently relies on size hints to generate autotuning arguments, and consequently autotuning does not support unbacked SymInts. At some point, we would like to generalize the autotuning logic to support these. But for now, simply emit a warning and skip autotuning when we see them.
5. The new test case exposed some tricky issues reconciling Triton call args with constants stored in `triton_meta`. This PR rewrites the relevant helper function to do this in a more principled way.
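An illustrative sketch of the recursive descent in step 3, simplified to the generic pytree keys (real unbacked-symbol keypaths involve additional key types, and this helper is not the PR's code):
```python
import operator
import torch.fx as fx
from torch.utils._pytree import SequenceKey, MappingKey, GetAttrKey

def bind_from_keypath(graph: fx.Graph, root: fx.Node, keypath) -> fx.Node:
    # Consume one keypath entry, emit the FX node that performs that access, then recurse.
    if not keypath:
        return root  # fully consumed: `root` now computes the unbacked symbol's value
    key, *rest = keypath
    if isinstance(key, SequenceKey):
        node = graph.call_function(operator.getitem, (root, key.idx))
    elif isinstance(key, MappingKey):
        node = graph.call_function(operator.getitem, (root, key.key))
    elif isinstance(key, GetAttrKey):
        node = graph.call_function(getattr, (root, key.name))
    else:
        raise NotImplementedError(f"unsupported keypath entry: {key!r}")
    return bind_from_keypath(graph, node, rest)
```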
# Test plan
This PR imports an existing control flow test to the FX backend's test suite. The test uses unbacked symbol definitions to handle mismatched dynamic shapes coming from `torch.cond` branches.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163729
Approved by: https://github.com/jansel
End the current compute encoder before returning a command buffer, as subsequent callers are very likely to allocate their own encoder, which otherwise results in the following runtime error
```
tryCoalescingPreviousComputeCommandEncoderWithConfig:nextEncoderClass:]:1090: failed assertion `A command encoder is already encoding to this command buffer'
```
Added regression test to `test_mps_extension`
Please note that `torch::mps::get_command_buffer()` should be called with the dispatch queue held, both before and after this change, but many implementations skip that
Fixes https://github.com/pytorch/pytorch/issues/163721
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164093
Approved by: https://github.com/atalman, https://github.com/Skylion007
Summary: Relax stride check for block-wise scaling (1x128, 128x128) when a dimension of the scaling factor is 1. When the scaling tensor has a dimension of size 1, the stride is effectively "meaningless" to PyTorch, i.e. PyTorch decides to replace its stride with a default of `[1, 1]`. However, the old stride check required the stride to match one of the scaling dimensions. Here, we relax the stride check when the effective stride is 1 in order to allow for cases in which `K <= 128` and `N <= 128`.
Test Plan:
```
pytest -s -v test/test_matmul_cuda.py::TestFP8MatmulCUDA::test_scaled_mm_vs_emulated_block_wise_float32_lhs_block_1_rhs_block_128_cuda 2>&1 | tee ~/personal/stride_check.log
```
Differential Revision: D83023706
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163829
Approved by: https://github.com/lw, https://github.com/eqy
Summary: For H100s and below, add `op_name="scaled_mm"` to the template heuristic for `CUDAScaledTMATemplateConfigHeuristic` such that `scaled_mm` persistent + TMA tests do not default to the "mm" heuristics.
Test Plan: `test_max_autotune.py`
Differential Revision: D83390775
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164019
Approved by: https://github.com/njriasan
Fixes #163597
- Updates fast SDPA implementations to take in query tensor stride info similar to key and value instead of assuming stride.
- Updated tests with additional transpose/permutation layouts. New tests catch the regression.
### Benchmarking with script found in [implementation PR](https://github.com/pytorch/pytorch/pull/152781#:~:text=19.8%25%20speed%20improvement-,Script%20to%20get%20perf%3A,-import%20torch%0Aimport)
Times are averaged over 100000 iterations. This change should not have any significant performance difference. Tested on an M3 Pro
### Vector Fast Path (q_len=1, k_len=256)
- Before: 0.160 ms
- After: 0.157 ms
### Vector 2-pass (q_len=1, k_len=4096)
- Before: 0.342 ms
- After: 0.339 ms
### Vector Fast Path (q_len=8, k_len=256)
- Before: 0.228 ms
- After: 0.231 ms
### Vector 2-pass (q_len=8, k_len=4096)
- Before: 0.432 ms
- After: 0.436 ms
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163598
Approved by: https://github.com/malfet
Summary:
ran into this when precompiling baidu/ERNIE-4.5-21B-A3B-PT
codegen after fix:
```py
import triton
import triton.language as tl
from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
from torch._C import _cuda_getCurrentRawStream as get_raw_stream
with torch.cuda._DeviceGuard(0):
    stream0 = get_raw_stream(0)
    ...
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163707
Approved by: https://github.com/jamesjwu
Summary: Partly importing and adapting https://github.com/pytorch/pytorch/pull/138388, adding SVE128 as an ISA.
Intention is to add SVE128 translation layers for Vectorized data types.
Idea is to have 1 PR per file, aside from the current one, plus a last one modifying cmake files to enable the new ISA selectively.
Tested current changes on a nightly run, to verify no regressions occur on systems leveraging SVE256.
No regressions spotted when running test_ops.py, a set of 34k unit tests. A machine leveraging SVE128 was used towards this testing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158932
Approved by: https://github.com/malfet
For https://github.com/pytorch/pytorch/issues/114850, we will port aten unit tests to Intel GPU. This PR works on some test cases in test/test_ops.py. We enable Intel GPU with the following methods, trying our best to keep the original code style:
1. Extended XPUTestBase.get_all_devices to support multiple devices
2. Added skipXPU decorator
3. Extended onlyOn to support device list
4. Enabled 'xpu' for some test paths
5. Added allow_xpu=True for supported test class.
6. Replaced onlyCUDA with onlyOn(['cuda', 'xpu']) for supported tests
7. Use skipIfXpu and skipXPU to disable unsupported tests.
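A hedged sketch of point 6 above, replacing `@onlyCUDA` with the extended `onlyOn` that accepts a device list (decorator and helper names as used in `common_device_type`; the list form is the extension added here):
```python
import torch
from torch.testing._internal.common_device_type import instantiate_device_type_tests, onlyOn
from torch.testing._internal.common_utils import TestCase, run_tests

class TestExample(TestCase):
    @onlyOn(["cuda", "xpu"])  # previously @onlyCUDA
    def test_add(self, device):
        x = torch.ones(4, device=device)
        self.assertEqual((x + x).sum().item(), 8.0)

instantiate_device_type_tests(TestExample, globals(), allow_xpu=True)

if __name__ == "__main__":
    run_tests()
```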
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159944
Approved by: https://github.com/guangyey, https://github.com/EikanWang, https://github.com/albanD
It's unclear why we had the disable in the first place. With
install_free_tensors, we are tracing into this hook. A better way would
be to place the tracer without any hook. For now, disable the checking
while dynamo is tracing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164084
Approved by: https://github.com/tugsbayasgalan
This PR introduces a new "operator microbenchmark" CI workflow and GitHub Actions for operator microbenchmarks, updating test scripts and job matrices to support new parameters, and broadening the operator benchmark tests to include more data types, larger shapes, and gradient tests. The benchmark configurations now focus more on different cuda hardware and multiple dtypes (bf16, fp16, fp32), for both compile and eager mode.
**Benchmark Configuration and Coverage:**
* Expanded operator benchmark configurations in `addmm_test.py`, `bmm_test.py`, `matmul_test.py`, and `mm_test.py` to benchmark multiple dtypes on CUDA devices, in eager and compile mode, for forward and backward run. The configs with tag "long" for the above mentioned files are being run in CI.
* The CI benchmarking is running on various hardwares: H100, A100.
* The CI job also uploads the microbenchmarking outputs to a [HUD](https://hud.pytorch.org/benchmark/llms?repoName=pytorch%2Fpytorch&benchmarkName=PyTorch+operator+microbenchmark) dashboard.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162530
Approved by: https://github.com/huydhn
Co-authored-by: Huy Do <huydhn@gmail.com>
This adds basic support for subclass inputs in export (specifically for non-strict). I had to make fakify a little more complicated, which risks further divergence from the Dynamo fakification. But the Dynamo one is so complex that I feel it is better to do it this way. Also improved the fake mode detection logic to recursively look into subclass inner tensors.
Differential Revision: [D83156489](https://our.internmc.facebook.com/intern/diff/D83156489)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163770
Approved by: https://github.com/avikchaudhuri
Replace the **runtime_error** vanilla C++ exceptions with **TORCH_CHECK** in **torch/nativert/***
Vanilla C++ exceptions should not exist in the core part of PyTorch because of its cross-language nature. Compared with vanilla C++ exceptions, TORCH_CHECK provides richer error context and a unified error-handling mechanism. This commit replaces runtime_error with TORCH_CHECK in the files under
torch/nativert/* .
Fixes part of #148114
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163308
Approved by: https://github.com/dolpm
Currently, the Cholesky factorization and least-squares operations default to MAGMA when PyTorch is compiled for ROCm. This shows suboptimal performance.
This change allows PyTorch to rely on hipSolver instead of Magma.
@jeffdaily
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163977
Approved by: https://github.com/Skylion007
Summary:
Running a toy example through `torch.compile(fullgraph=True, backend="inductor")` with default inductor config, I tried to see what passes are run in each of pre-grad, joint-graph, and post-grad phases by printing out the subsystem in `GraphTransformObserver`. However the subsystem showed up as None in a bunch of transforms that were run in each of those phases, so this PR adds some additional annotations.
Note that these annotations are probably not a complete set, since other transforms may run based on changes to the config that are not covered here.
Hopefully this doesn't change behavior. However, I did notice that bisecting relies on disabling various phases, which means that while before some passes would *not* be disabled (because their subsystem was `None`), now they would.
Test Plan: existing tests + manual test described in summary
Differential Revision: D83306676
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163922
Approved by: https://github.com/jansel
A DTensor that contains partial placement shouldn't be checkpointed (DCP.save) -- the result is not correct and DCP doesn't know how to handle it.
There are several APIs that are only used by checkpointing, e.g., `__create_write_items__`. These APIs should raise an exception if the DTensor, `self`, has a Partial placement.
Ideally, we want to add the following test:
```
with self.assertRaisesRegex(
RuntimeError, "Any checkpointing related operations are not supported for"
):
dcp.save({"dtensor": dtensor}, checkpoint_id=tempfile.gettempdir())
```
While we do see the RuntimeError is raised, the error was raised in another thread due to DTensor checkpoint APIs are called by DCP in a separate thread, which assertRaisesRegex cannot capture.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163941
Approved by: https://github.com/tianyu-l
As the title stated.
**Changes:**
- torch.cuda.amp.autocast
- torch.cpu.amp.autocast
- add explicit `__new__` and `__init_subclass__` for the classes above so that `inspect.signature` can retrieve the correct signature
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163654
Approved by: https://github.com/Skylion007
This adds a PR time benchmark that checks for runtime overhead on a very small graph. This will help track regressions in runtime overhead.
Example Results:
```
runtime_overhead_inductor,instruction_count,222645
runtime_overhead_inductor_inference_mode,instruction_count,234998
runtime_overhead_inductor_requires_grad,instruction_count,293556
runtime_overhead_inductor_requires_grad_backward,instruction_count,78181
runtime_overhead_inductor_dynamic,instruction_count,234870
runtime_overhead_inductor_inference_mode_dynamic,instruction_count,248711
runtime_overhead_inductor_requires_grad_dynamic,instruction_count,309979
runtime_overhead_inductor_requires_grad_backward_dynamic,instruction_count,77599
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163866
Approved by: https://github.com/jansel, https://github.com/mlazos, https://github.com/anijain2305
We were seeing instances of stdlib files in clang-tidy output so this
just essentially removes them from the things that lintrunner will
report up. Longer term fix here would be to just modify the clang-tidy
configuration in order to do the correct thing here but that requires a
bit more investigation as to why this is only happening in CI and is not
reproducible locally.
Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164008
Approved by: https://github.com/ZainRizvi
While refactoring the bookkeeping for DeviceMesh to leverage the CuTe layout, we found that we need two more util functions. One is to check whether a layout has internal overlap or not. For example, (2,2):(2,1) has no overlap while (2,2):(2,2) has overlap.
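A small sketch (not the PR's util) of what "internal overlap" means for a shape:stride layout, checking whether two distinct coordinates map to the same linear offset:
```python
from itertools import product

def has_overlap(shape, stride):
    offsets = set()
    for coord in product(*(range(s) for s in shape)):
        off = sum(c * st for c, st in zip(coord, stride))
        if off in offsets:
            return True
        offsets.add(off)
    return False

assert not has_overlap((2, 2), (2, 1))  # offsets {0, 1, 2, 3}: no overlap
assert has_overlap((2, 2), (2, 2))      # offset 2 is produced twice: overlap
```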
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163367
Approved by: https://github.com/fegin
ghstack dependencies: #163212, #163288, #163928, #163930
Fixes #160598, fixes #160551, fixes #160507
This PR fixes a bug in the `test_garbage_collect_expandable` unit test where the finally block incorrectly re-reads the current per-process memory fraction instead of restoring the original value. Without the fix, the other tests in the `test/test_cuda.py` suite were impacted and failed with an OOM error on ROCm; a sketch of the intended save/restore pattern is shown after the example error below.
This ensures proper cleanup and isolation of test state, maintaining test correctness and avoiding side effects like the below OOM error that it caused.
For example, `test_autocast_checkpointing` failed with the below error https://github.com/pytorch/pytorch/actions/runs/17982223758/job/51153974194 on ROCm
`torch.OutOfMemoryError: HIP out of memory. Tried to allocate 76.00 MiB. GPU 0 has a total capacity of 255.69 GiB of which 252.97 GiB is free. 1.20 GiB allowed; Of the allocated memory 1.14 GiB is allocated by PyTorch, with 17.00 MiB allocated in private pools (e.g., HIP Graphs), and 18.63 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)`
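A minimal sketch of the intended save/restore pattern (assuming `torch.cuda.get_per_process_memory_fraction` is available for reading the original value; otherwise it must be captured another way):
```python
import torch

def run_expandable_test_body(device=0):
    # Save the original fraction up front and restore it in `finally`,
    # instead of re-reading the (already modified) current value there.
    orig = torch.cuda.get_per_process_memory_fraction(device)
    try:
        torch.cuda.set_per_process_memory_fraction(0.5, device)
        ...  # test body that exercises expandable segments
    finally:
        torch.cuda.set_per_process_memory_fraction(orig, device)
```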
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164000
Approved by: https://github.com/jeffdaily
Fixes #163640
This PR avoids a mask left align check in the case that we're operating under torch.compile / torch.export. Originally, I planned to make a more invasive change to auto-disable the fast path entirely underneath torch.compile / torch.export, but I realized during testing that the fast path wasn't actually causing compile issues outside of the narrow issue identified here.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163773
Approved by: https://github.com/mikaylagawarecki
Summary:
As titled.
Without the diff, we got P1963055009
With the diff passing in the environment, we can do correct sym_int deduction:
https://fburl.com/mlhub/p5zy7o28
Test Plan:
```
buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:unbacked_symints -- test_sdfpa_unbacked_strides --print-passing-details --env TORCHDYNAMO_EXTENDED_DEBUG_CPP=1 --env TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="Eq(u0, 0)"
```
Without the fix: P1964887260
With the fix: P1964888579
Differential Revision: D83211018
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163925
Approved by: https://github.com/ColinPeppler
## Issue
From an internal use case, we found that if we have an equality rule like:
```
Max(15, u0) == s0 * Max(15, u0)
```
This would lead to a wrong substitution rule being generated in the substitution table; the result is that the process gets stuck in the substitution loop, hanging indefinitely as it does the following substitutions:
```
Max(15, u0)
--> s0 * Max(15, u0)
--> s0 ** 2 * Max(15, u0)
--> s0 ** 3 * Max(15, u0)
--> s0 ** 4 * Max(15, u0)
...
```
The root cause is with SymPy expression comparison: as `Max` is [not inside the op class table](https://github.com/sympy/sympy/blob/1.14/sympy/core/basic.py#L50-L86), it'll take the [UNKNOWN](https://github.com/sympy/sympy/blob/1.14/sympy/core/basic.py#L120) order, and considered bigger than any other types of expressions.
## Fix
1. Added a breaking-out from the substitution while-loop to warn about any excessive substitutions; what threshold should be used here and how to pass it are open to suggestion, so we use a hard-coded static value to keep it simple for now
2. Enhanced the sympy expression comparison logic, so that we first check if one expr "has" the other one or not, to help work around the issue with `Max` here
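A small sympy illustration of the runaway substitution described above (the symbols stand in for the backed/unbacked symints in the report):
```python
import sympy

u0, s0 = sympy.symbols("u0 s0", positive=True, integer=True)
lhs = sympy.Max(15, u0)
rhs = s0 * sympy.Max(15, u0)

# The right-hand side still contains the left-hand side, so naive repeated
# substitution never reaches a fixed point.
expr = sympy.Max(15, u0)
for _ in range(4):
    expr = expr.xreplace({lhs: rhs})
print(expr)  # s0**4*Max(15, u0) -- and it keeps growing
```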
## Testing
- with the unittest alone --> unittest stuck
- with the unittest and while-loop breakout, we could see tests finished with warning "**Substitution limit reached**":
```
test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleCpu::test_unbounded_expr_substitutions_cpu W0923 13:00:37.864000 46140 /data/users/q1l1/pytorch/torch/_export/__init__.py:70] +============================+
W0923 13:00:37.864000 46140 /data/users/q1l1/pytorch/torch/_export/__init__.py:71] | !!! WARNING !!! |
W0923 13:00:37.865000 46140 /data/users/q1l1/pytorch/torch/_export/__init__.py:72] +============================+
W0923 13:00:37.865000 46140 /data/users/q1l1/pytorch/torch/_export/__init__.py:73] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead.
stats [('calls_captured', 5), ('unique_graphs', 1)]
inductor [('extern_calls', 2)]
graph_break []
aten_mm_info [('aten.mm_Max(15, u0)_16_64', 1)]
PASSED [5.6947s]
test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleGpu::test_unbounded_expr_substitutions_cuda W0923 13:00:39.633000 46140 /data/users/q1l1/pytorch/torch/_inductor/sizevars.py:765] [0/0] Substitution limit (30) reached w/ u1**30*Max(15, u0)
W0923 13:00:39.679000 46140 /data/users/q1l1/pytorch/torch/_inductor/sizevars.py:765] [0/0] Substitution limit (30) reached w/ 64*u1**30*Max(15, u0)
stats [('calls_captured', 5), ('unique_graphs', 1)]
inductor [('extern_calls', 2), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('async_compile_cache_miss', 1)]
graph_break []
aten_mm_info [('aten.mm_Max(15, u0)_16_64', 1)]
PASSED [5.6278s]
test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleMps::test_unbounded_expr_substitutions_mps SKIPPED [0.0002s]
============================ 2 passed, 1 skipped, 870 deselected in 19.66s ============================
```
- with the unittest + comparison logic enhanced, we don't see the warning any more:
```
Running 3 items in this shard
test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleCpu::test_unbounded_expr_substitutions_cpu W0923 13:15:39.560000 290812 /data/users/q1l1/pytorch/torch/_export/__init__.py:70] +============================+
W0923 13:15:39.561000 290812 /data/users/q1l1/pytorch/torch/_export/__init__.py:71] | !!! WARNING !!! |
W0923 13:15:39.561000 290812 /data/users/q1l1/pytorch/torch/_export/__init__.py:72] +============================+
W0923 13:15:39.562000 290812 /data/users/q1l1/pytorch/torch/_export/__init__.py:73] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead.
stats [('calls_captured', 5), ('unique_graphs', 1)]
inductor [('extern_calls', 2)]
graph_break []
aten_mm_info [('aten.mm_Max(15, u0)_16_64', 1)]
PASSED [6.6093s]
test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleGpu::test_unbounded_expr_substitutions_cuda stats [('calls_captured', 5), ('unique_graphs', 1)]
inductor [('extern_calls', 2), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('async_compile_cache_miss', 1)]
graph_break []
aten_mm_info [('aten.mm_Max(15, u0)_16_64', 1)]
PASSED [6.0502s]
test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleMps::test_unbounded_expr_substitutions_mps SKIPPED [0.0002s]
============================ 2 passed, 1 skipped, 870 deselected in 21.99s ============================
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163685
Approved by: https://github.com/jansel
Differential Revision: [D82603767](https://our.internmc.facebook.com/intern/diff/D82603767)
Previously, I forgot to handle the call_module case, which now will have export_root prepended to their names. Basically I want to clean up something like:
```
graph():
%l_self_export_root_sub_mod = call_module[target=l_self_export_root_sub_mod](%x, %y)
%l_self_export_root_sub_mod_1 = call_module[target=l_self_export_root_sub_mod](%x, %y)
```
Dynamo graphs can have call_module nodes with messed-up names due to our wrapper.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163136
Approved by: https://github.com/avikchaudhuri
Adds dtypeIfMPS so that if an op is supported we get a proper error like an unexpected success. Before, we would never get an unexpected success because tests were run with the torch.double dtype, which always fails on MPS since MPS doesn't support that dtype.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163951
Approved by: https://github.com/malfet
Summary:
When unpickling a fake tensor in the FX graph pickler, it only sets the fake mode of the current tensor's metadata to the one that is consistent with the pickler's `unpickle_state`. However, it doesn't set the fake mode of a tensor's base tensor when that tensor is a view.
This will cause an issue when dumping and loading the following graph
```
class GraphModule(torch.nn.Module):
def forward(self, s77: "Sym(s77)", L_x_: "f32[s77, 8]"):
l_x_ = L_x_
chunk = l_x_.chunk(2, dim = -1); l_x_ = None
y: "f32[s77, 4]" = chunk[0]; chunk = None
y_repeat: "f32[s77, 8]" = y.repeat_interleave(2, dim = -1); y = None
return (y_repeat,)
```
because `repeat_interleave` will create an intermediate fake tensor of size `[s77, 2, 4]` and it will become the base of the node `y_repeat`'s `meta['val']`.
This causes issues during the deserialization phase when applying AOT precompile to DeepSeek in vLLM.
Test Plan:
This has been tested in vLLM with DeepSeek.
As for unittest, ideally it should be `test_aot_compile_repeat_interleave` with mark_dynamic turned on. However, that's leading to some other pickle issues.
```
python test/dynamo/test_aot_compile.py -k test_aot_compile_repeat_interleave
```
I have yet to figure out a more appropriate unittest. But a proof-of-concept demo would be the following:
```
import inspect
import sympy
import torch
from torch.fx._graph_pickler import GraphPickler
from torch.fx.experimental.symbolic_shapes import ShapeEnv
from torch._subclasses import FakeTensorMode
from torch.fx._graph_pickler import GraphPickler, Options
from unittest.mock import patch
class M(torch.nn.Module):
def forward(self, x):
chunk = x.chunk(2, dim=-1)
y = chunk[0]
y_repeat = y.repeat_interleave(2, dim=-1)
return y_repeat
def my_custom_backend(gm, example_inputs):
global gm_global
gm_global = gm
return gm.forward
m = M()
m_opt = torch.compile(m, backend=my_custom_backend, fullgraph=True)
sample_inputs = (torch.randn(2, 8),)
torch._dynamo.mark_dynamic(sample_inputs[0], [0])
opt_out = m_opt(*sample_inputs)
graph_reducer_override = GraphPickler.reducer_override
def _graph_reducer_override(self, obj):
if (inspect.isclass(obj) and issubclass(obj, sympy.Function)
and hasattr(obj, "_torch_unpickler")):
return obj._torch_unpickler, (obj._torch_handler_name, )
if isinstance(obj, FakeTensorMode):
return type(None), ()
return graph_reducer_override(self, obj)
with patch.object(GraphPickler, "reducer_override", _graph_reducer_override):
pickled_gm = GraphPickler.dumps(gm_global, Options(ops_filter=None))
fake_mode = FakeTensorMode(shape_env=ShapeEnv())
loaded_gm = GraphPickler.loads(pickled_gm, fake_mode)
```
Differential Revision: D83112599
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163738
Approved by: https://github.com/zhxchen17
Update the torch-xpu-ops commit to [intel/torch-xpu-ops@229e8b](229e8ba104), which includes:
- Revert tracking of Work status for FlightRecorder in ProcessGroupXCCL to fix memory leak
- Enable SYCL warnings on Linux
- Fix accuracy issues with CTC loss
- Enable aten::nonzero_static on XPU backend
- Stop recursive calculations in polynomial kernels if tensor has NaNs
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163758
Approved by: https://github.com/EikanWang
The call to `record_function` adds overhead even if profiling is disabled, which can as much as double the total runtime overhead of a compiled function. #163566 aims to make `record_function` more efficient, but doesn't fully eliminate the overhead. This change adds a check for whether profiling is active before using `record_function`, which avoids the issue altogether.
`TestExecutionTrace.test_execution_trace_with_pt2` in https://github.com/pytorch/pytorch/blob/main/test/profiler/test_execution_trace.py#L372 already checks that the `record_function` region is tracked during profiling.
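A hedged sketch of the kind of guard described above; the cheap profiler-active flag is assumed to be `torch.autograd.profiler._is_profiler_enabled` (read defensively via `getattr` in case the flag is named differently in a given build):
```python
import contextlib
import torch
from torch.autograd.profiler import record_function

def maybe_record_function(name: str):
    # Only pay record_function overhead when a profiler is actually active.
    profiling = getattr(torch.autograd.profiler, "_is_profiler_enabled", False)
    return record_function(name) if profiling else contextlib.nullcontext()

# Usage: the compiled-function wrapper enters the region only under a profiler.
with maybe_record_function("CompiledFunction"):
    pass  # run the compiled artifact here
```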
Comparison of the `benchmarks/dynamo/microbenchmarks/overheads.py ` results:
Before Change:
```
requires_grad=False
compiled 56.9us (warmup=10.7s)
requires_grad=True
compiled 99.4us (warmup=0.2s)
inference_mode()
compiled 55.7us (warmup=0.1s)
```
After Change:
```
requires_grad=False
eager 6.9us (warmup=0.0s)
compiled 23.9us (warmup=22.3s)
requires_grad=True
eager 8.7us (warmup=0.0s)
compiled 56.8us (warmup=0.1s)
inference_mode()
eager 6.3us (warmup=0.0s)
compiled 22.2us (warmup=0.1s)
```
Additionally, #163866 introduces an instruction count benchmark. Because that is not merged and activated yet, here is a comparison:
Before Change:
```
runtime_overhead_inductor,instruction_count,222645
runtime_overhead_inductor_inference_mode,instruction_count,234998
runtime_overhead_inductor_requires_grad,instruction_count,293556
runtime_overhead_inductor_requires_grad_backward,instruction_count,78181
runtime_overhead_inductor_dynamic,instruction_count,234870
runtime_overhead_inductor_inference_mode_dynamic,instruction_count,248711
runtime_overhead_inductor_requires_grad_dynamic,instruction_count,309979
runtime_overhead_inductor_requires_grad_backward_dynamic,instruction_count,77599
```
After Change:
```
runtime_overhead_inductor,instruction_count,149997
runtime_overhead_inductor_inference_mode,instruction_count,163397
runtime_overhead_inductor_requires_grad,instruction_count,220722
runtime_overhead_inductor_requires_grad_backward,instruction_count,78276
runtime_overhead_inductor_dynamic,instruction_count,161177
runtime_overhead_inductor_inference_mode_dynamic,instruction_count,175495
runtime_overhead_inductor_requires_grad_dynamic,instruction_count,235674
runtime_overhead_inductor_requires_grad_backward_dynamic,instruction_count,77475
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163747
Approved by: https://github.com/mlazos, https://github.com/anijain2305
This partially solves the issue https://github.com/pytorch/pytorch/issues/163641. We do not need to ban unbacked-to-unbacked replacements if all RHS symbols are inputs, since we know those symbols are seen by the whole program.
This issue was found as I was tracing some vLLM models with unbacked symbols, namely Qwen/Qwen2-1.5B-Instruct; those replacements make the reasoning logic easier.
As for the similar data-dependent pattern, I am thinking of creating a set of replacements that we apply only during static eval instead of none, to make reasoning better.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163652
Approved by: https://github.com/bobrenjc93
Summary:
Improve op coverage of exporting a CUDA model on a CPU-only machine under fake tensor mode.
For `torch.nn.functional.conv2d`, it will `_select_conv_backend` based on input and weight shapes.
When calling into `supportsDepthwiseConvolutionWithCuDNN()`, it calls `at::cuda::getCurrentDeviceProperties()` and fails on a CPU-only machine.
So we check if CUDA is actually enabled first.
Test Plan: TORCH_SHOW_CPP_STACKTRACES=1 buck2 run fbcode//caffe2/test:test_export -- --r nn_functional_conv2d
Reviewed By: angelayi, henryoier
Differential Revision: D80562984
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163912
Approved by: https://github.com/SherlockNoMad
Including:
- `torch/csrc/instruction_counter`
- `torch/csrc/lazy`
- `torch/csrc/monitor`
- `torch/csrc/profiler`
- `torch/csrc/dynamo`
Fixes part of #148114
Due to a personal mistake with PR #163317, this PR does the same thing, **and PR #163317 has already been approved by @albanD.**
This is a personal mistake on my part, and I'm so sorry about that. Hope you won't mind @albanD. 🥹
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163610
Approved by: https://github.com/albanD, https://github.com/Skylion007
Summary:
Triton templates tend to perform very poorly on large K, hence the introduction of decompose_k. As a result, when decompose_k is selected we disable exploring the Triton templates. We may want to consider an override in the future.
Note: Based on the timing results it may be desirable to better refine/prune the decompose k decisions.
Testing:
Tested by looking at the autotune/compilation time using a single shape in TritonBench.
`TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 ENABLE_PERSISTENT_TMA_MATMUL=1 python run --op gemm --rep 1000 --sleep 1.0 --m 512 --n 512 --k 300000 --only pt2_matmul_maxautotune`
Before this change:
`SingleProcess AUTOTUNE benchmarking takes 13.5368 seconds and 0.1595 seconds precompiling for 38 choices`
With this change:
`SingleProcess AUTOTUNE benchmarking takes 9.9626 seconds and 0.0020 seconds precompiling for 11 choices`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163781
Approved by: https://github.com/eellison, https://github.com/PaulZhang12
Old signature:
`all_to_all_vdev(Tensor input, Tensor(a!) out, Tensor(a!) in_out_splits, str group_name)`
New signature:
`all_to_all_vdev(Tensor input, Tensor(a!) out, Tensor in_splits, Tensor(a!) out_splits_offsets, str group_name)`
i.e. split `in_out_splits` into IN tensor and OUT tensor so that we can define the TORCH_LIBRARY signature better.
Also to be in line with the 2D version.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163837
Approved by: https://github.com/fduwjj
ghstack dependencies: #163886
Summary:
LSTM was not exportable with non-strict export as it failed at `_detect_attribute_assignment`
This is because the `_flat_weights` attribute in LSTM is a list of registered parameters and will be updated by the `_update_flat_weights` method in `forward`.
However, in `_detect_attribute_assignment`, we manually restore the state of the module by `mod.__dict__.update(snapshot)`. Therefore, it should be fine to turn the `ValueError` into a warning so that RNN models are exportable with non-strict export.
Added test to verify that there is no lifted tensor constant and no fake tensor leakage.
Test Plan: buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_export_rnn_variants_with_warning
Differential Revision: D83196971
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163809
Approved by: https://github.com/tugsbayasgalan
What started as a simple fix for `mps_convolution_backward_input` resulted in a pretty significant refactor and set of fixes:
- Updated `mps_conv_use_channels_last` to return channels last output if either input or weights are channels last
- Use the same primitive throughout `Convolution.mm` to determine whether the output should be allocated in channels-last format or not
But doing only those two, resulted in crash in `test_memory_format_nn_Conv2d_mps_float32`, when weights were backward, and bias is present:
```
% python -c "import torch;print(torch.nn.functional.conv2d(torch.rand(2, 4, 3, 4,device='mps'), torch.rand(5, 4, 3, 3,device='mps').to(memory_format=torch.channels_last), torch.rand(5,device='mps')))"
/AppleInternal/Library/BuildRoots/4~B5E4ugDCh2RsPWAjMEoPu8LC5w1yXEwd7XweDhg/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphExecutable.mm:3619: failed assertion `Error: MLIR pass manager failed'
zsh: abort python -c
```
Which requires a more thorough redesign/cleanup, namely:
- Do not alter the layout based on MacOS version, but rather do additional copies on MacOS-14 if inputs/output or weight are in channels last format (done by defining `std::optional<Tensor> output_c;` that contains a contiguous copy of the output tensor)
- Introduced `input_suggested_layout` which is set to ChannelsLast if and only if input is channels last and is running on MacOS-15+
- Delete unused `memory_layout` and `group` arguments from `fill_depthwise_conv_desc`
- Fix bias broadcasting logic for channels last
As a result, in addition to adding one more regression test, this change removes `expectedFailures` from:
- `TestModule.test_memory_format` for `Conv2d`, `ConvTranspose2d`, `LazyConv1d`, `LazyConvTranspose1d`
- `test_require_stride_expanded_dynamic_shapes`
- `test_mutable_custom_op_fixed_layout2` for MacOS-14
Fixes https://github.com/pytorch/pytorch/issues/161905
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162776
Approved by: https://github.com/Skylion007
**Summary:** In order to ensure that replicate acts as intended (a specialized version of hsdp) we need to make sure that it can pass the same tests that fully_shard can for training. This tests that replicate function works correctly when combined with activation checkpointing
**Test Case**
1. pytest test/distributed/_composable/test_replicate_training.py -k test_train_parity_with_activation_checkpointing
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162830
Approved by: https://github.com/mori360
Summary:
- Move the `provenance_level` flag check to inside the `set_kernel_post_grad_provenance_tracing` call to simplify the code
- Move the `set_kernel_post_grad_provenance_tracing` call and `write_provenance_debug_handle` call to `codegen_comment`.
- If some `call_kernel` call sites don't have a preceding `codegen_comment` call, add one. Now all `call_kernel` call sites are accompanied by a `codegen_comment` call.
- Add a `codegen_comment` method to BaseScheduling and remove the noop `codegen_comment` method in Scheduling
- Remove `debug_handle` from `call_kernel`.
Test Plan:
CI
```
buck run @//mode/opt-split-dwarf fbcode//caffe2/test/inductor:provenance_tracing
```
Differential Revision: D82839271
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163378
Approved by: https://github.com/angelayi
The previous uneven `_StridedShard` in https://github.com/pytorch/pytorch/pull/150490 seems to fail for cases like sharding `tensor = torch.arange(6)` with FSDP 2, TP 2.
This PR attempts to reinvent `_StridedShard`.
I didn't test nested `_StridedShard`, because there shouldn't be any use cases. I think it will become quite messy when it comes to **nested uneven** `_StridedShard`. We are probably going to deprecate it anyway after @zpcore 's work https://github.com/pytorch/pytorch/pull/160266 on ordered sharding, so IMO it's not worth making it too general.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163843
Approved by: https://github.com/ezyang
Summary:
The current implementation of the `histc` function on CPU doesn't take into account the nature of the floating point precision representation when two numbers have very different magnitudes.
In the code of `histc` there is a following logic, which tries to fix an issue when automatically calculated `min` and `max` are identical:
```
if (leftmost_edge == rightmost_edge) {
leftmost_edge -= 1;
rightmost_edge += 1;
}
...
TORCH_CHECK(leftmost_edge < rightmost_edge, "torch.histc: max must be larger than min");
```
However, expanding the range by exactly 1 does not, for all floating point values, produce a representable result that is different from the original value.
The test code:
```
import torch as th

info = th.finfo(th.float32)
f_min = info.min
test_tensor = th.ones((224, 224), dtype=th.float64) * f_min
res = th.histc(test_tensor, bins=10)
```
Actual result:
```
RuntimeError: torch.histc: max must be larger than min
```
Expected result:
Everything should work fine.
NOTICE: If we set `f_min` just to small enough number, code works, which demonstrates the correct purpose of the possible range correction.
In short, `f_min + 1 == f_min` evaluates to true, since we hit the limits of the floating point representation's precision.
Please note, this is not a limitation of the float32 data type, since all computations happen in float64 (the C++ `double` type). The magnitudes are just different enough that the simple `+/-1` approach cannot produce a distinct representable value.
Interestingly, the `histogram` function doesn't throw an exception, because its edge range selection is implemented differently.
The fix we propose is to use `std::nextafter`, which returns the next representable floating point value starting from the current one in the direction of the lowest or highest number. In theory, the mathematically correct approach is to use this function without constraints, but to maintain backward compatibility, in case there is code that relies on the current `+/-1` offset logic, we call `std::min` and `std::max` to pick the right representable value (i.e. for small floating point values the next representable value is less than 1 away, while for large values it is further than 1 away).
We could switch to the `histogram` implementation, but again, to avoid possible backward compatibility breaks, we decided to use the fix presented in this change.
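For illustration, here is a minimal Python sketch of the proposed edge-expansion rule (the actual fix is in the C++ `histc` code; the helper name here is made up):
```python
import math

def expand_equal_edges(leftmost: float, rightmost: float) -> tuple[float, float]:
    # Expand by at least 1, but never by less than one representable step,
    # so that leftmost < rightmost always holds afterwards.
    if leftmost == rightmost:
        leftmost = min(leftmost - 1, math.nextafter(leftmost, -math.inf))
        rightmost = max(rightmost + 1, math.nextafter(rightmost, math.inf))
    return leftmost, rightmost

f_min = -3.4028234663852886e38  # float32 lowest value, where a bare +/-1 is a no-op
lo, hi = expand_equal_edges(f_min, f_min)
assert lo < hi
```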
*The real use case scenario:*
In our project we use the well-known transformer from HuggingFace, which fills a buffer with the float32 min (please note this is not the smallest positive value close to 0, it is the lowest representable value, i.e. effectively `-max`).
The code where it sits is here:
https://github.com/huggingface/transformers/blob/v4.51.1/src/transformers/models/mimi/modeling_mimi.py#L1159
Switching to another version of the transformer would lead to other issues in our project, and the bug we fix here may appear in other projects and scenarios.
The real world problem appears when the CPU version of `histc` is called for such a tensor. In our use case, it happens because this tensor is an input to the softmax activation function, and as part of quantisation the input should go through the observer as well. In our case the default Histogram observer is selected, which calls `histc`.
Test Plan:
The simple test code snippet doesn't produce failure:
```
import torch as th

f_min = th.finfo(th.float32).min
test_tensor = th.ones((224, 224), dtype=th.float32) * f_min
th.histc(test_tensor, bins=10)
```
**Testing update:**
The `test_histc` has been updated accordingly.
Now, when all values of the tensor are +INF, the previous representable floating point value should be <max_float>, hence the assert message is changed from `[inf, inf]` to `[<max_float>|inf, inf]`.
The test is also extended to check the assert message when the tensor is filled with -INF and with a combination of (-INF, +INF).
The new regexp assert accepts either `inf` or any floating point number in scientific notation for one of the bin edges. We left `inf` as a possible value due to possible implementation differences between CPU and CUDA.
Differential Revision: D82955597
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163506
Approved by: https://github.com/jermenkoo, https://github.com/malfet
This reverts commit 5494b2a8d38c3ddbeb2d96a5ac990e20ec4c48fd.
Need to skip `test_sparse_csr.py::TestSparseCSRCUDA::test_sampled_addmm_zero_sized_cuda_*` again. Tests are now failing with a "core dumped" error:
```
python test_sparse_csr.py -v -k test_sampled_addmm_zero_sized_cuda_float64
test_sampled_addmm_zero_sized_cuda_float64 (__main__.TestSparseCSRCUDA) ... /tmp/pytorch/test/test_sparse_csr.py:2503: c = torch.empty(m, n, dtype=dtype, device=device, layout=torch.sparse_csr)
GPU core dump created: gpucore.186789
:0:rocdevice.cpp :2992: 4701819131755 us: Callback: Queue 0x760cdcd00000 aborting with error : HSA_STATUS_ERROR_EXCEPTION: An HSAIL operation resulted in a hardware exception. code: 0x1016
Aborted (core dumped)
```
These failures are linked to `test_sparse_csr.py::TestSparseCSRCUDA::test_select_SparseBSC_int32_cuda_*` due to incorrect test log parsing. We will also be able to close these issues:
- Fixes https://github.com/pytorch/pytorch/issues/163663
- Fixes https://github.com/pytorch/pytorch/issues/160786
- Fixes https://github.com/pytorch/pytorch/issues/160785
- Fixes https://github.com/pytorch/pytorch/issues/160784
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163848
Approved by: https://github.com/jeffdaily
As the title states, suffixes like `.dylib` and `lib` can be replaced by `CMAKE_SHARED_LIBRARY_SUFFIX`, and prefixes like `lib` can be replaced by `CMAKE_SHARED_LIBRARY_PREFIX` on Unix or `CMAKE_IMPORT_LIBRARY_PREFIX` on Windows.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163850
Approved by: https://github.com/albanD
As the title states, in practice we found that the output dtype of gather sometimes does not match across all ranks, which is undefined behavior. The same applies to broadcast and scatter. Since these collectives all completed, we should not treat them as errors and can skip them.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163839
Approved by: https://github.com/VieEeEw
```
import torch
import torch.fx.traceback as fx_traceback
import torch.export
class M(torch.nn.Module):
    def forward(self, x):
        with fx_traceback.annotate({"pp_stage": 0}):
            with fx_traceback.annotate({"fdsp_bucket": 0}):
                x = x + 1
            x = x - 2
            with fx_traceback.annotate({"cuda_stream": 2, "fsdp_bucket": 1}):
                x = x * 2
        x = x / 3
        return x

m = M()

with fx_traceback.preserve_node_meta():
    ep = torch.export.export(m, (torch.randn(10),))

for node in ep.graph.nodes:
    if node.op == "call_function":
        print(f"{node.target}, {node.meta.get('custom', {})}")
```
prints
```
aten.add.Tensor, {'pp_stage': 0, 'fdsp_bucket': 0}
aten.sub.Tensor, {'pp_stage': 0}
aten.mul.Tensor, {'pp_stage': 0, 'cuda_stream': 2, 'fsdp_bucket': 1}
aten.div.Tensor, {}
```
TODOs:
- run_decomposition is failing
- Need to test with the new full graph capture + aot_export_joint apis
- Need to make the annotation propagate through autograd engine to reach the bw nodes. Sample impl here: https://github.com/pytorch/pytorch/pull/83558
- Edward wants to restrict the keys in the custom field to be top-level singleton objects only
- also need to take care of metadata merging when passes are fusing nodes
Thanks @angelayi for contributing the dynamo fixes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163673
Approved by: https://github.com/albanD, https://github.com/angelayi
Previously, an eval() call before a training step() would not correctly initialize the backward pass of the pipeline stages, leading to errors during the subsequent training step. This PR ensures that the backward stages can still be initialized after an eval() call.
Fixes #162822
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162823
Approved by: https://github.com/dcci, https://github.com/H-Huang
## Overview
This PR allows the profiler users to access `Kineto` and `TorchOp` metadata in JSON string format through a new `metadata_json` attribute in `FunctionEvent` objects, which is triggered through a new `expose_kineto_event_metadata` flag in `ExperimentalConfig`.
## Testing
A unit test was added to validate functionality.
## Documentation
Added/updated function doc strings where appropriate.
## Example output
```python
import torch
from torch.profiler import profile
with profile(experimental_config=torch._C._profiler._ExperimentalConfig(expose_kineto_event_metadata=True)) as prof:
    res = torch.mm(torch.rand(1024, 1024), torch.rand(1024, 1024))

for event in prof.events():
    print(f'name: {event.key}, metadata: {event.metadata_json}')
```
```
name: aten::rand, metadata: "Ev Idx": 0
name: aten::empty, metadata: "Ev Idx": 1
name: aten::uniform_, metadata: "Ev Idx": 2
name: aten::rand, metadata: "Ev Idx": 3
name: aten::empty, metadata: "Ev Idx": 4
name: aten::uniform_, metadata: "Ev Idx": 5
name: aten::mm, metadata: "Ev Idx": 6
name: aten::resolve_conj, metadata: "Ev Idx": 7
name: aten::resolve_conj, metadata: "Ev Idx": 8
name: aten::resolve_conj, metadata: "Ev Idx": 9
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161624
Approved by: https://github.com/sraikund16
Current state: shape mismatch failure when fusing mm+rs with the scatter dim on the last mm dim.
This adds a separate path to handle the last dim for aten.mm; scaled_mm should be handled similarly, but that needs an additional PR.
So the scaled_mm case is disabled via the matmul filter function.
This also adds an inductor.config option for the new path, True by default, for fast debuggability.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162794
Approved by: https://github.com/fegin
The version finding logic triggered from `setup.py` generally tries to take the git information into account.
This is fine for most situations where we are building from a checkout, but it creates a problem in the case of sdists: the version is determined at sdist creation time, taking the git information into account, but is then recalculated when building wheels or installing from the sdist, now with the git information missing.
The solution is to take the version information directly from the sdist, which this PR adds by means of parsing the `PKG-INFO` which marks an unpacked sdist.
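For illustration, a rough sketch of reading the recorded version from `PKG-INFO` (the helper name is hypothetical; the actual parsing lives in the version-finding logic used by `setup.py`):
```python
from pathlib import Path
from typing import Optional

def sdist_version(root: str = ".") -> Optional[str]:
    # Read the version recorded at sdist-creation time from the PKG-INFO
    # core-metadata file that marks an unpacked sdist.
    pkg_info = Path(root) / "PKG-INFO"
    if not pkg_info.is_file():
        return None
    for line in pkg_info.read_text(encoding="utf-8").splitlines():
        if line.startswith("Version:"):
            return line.split(":", 1)[1].strip()
    return None
```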
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160315
Approved by: https://github.com/atalman
ghstack dependencies: #157814
1. Prevents unintended aliasing of `self._last_lr`/`get_last_lr(...)` with `group["lr"]` when `group["lr"]` is a tensor.
2. Prevents unintended aliasing of `LRScheduler.base_lrs` with the `group["initial_lr"]`s.
3. Updates `test/optim/test_lrscheduler.py` to test tensor LRs.
4. Changes type annotations for `_last_lr`, `get_last_lr()`, `base_lrs`, `get_lr()`, and `_get_closed_form_lr()` from `list[float]` to `list[float | Tensor]`; adds documentation.
Fixes #163103
LR schedulers can behave in unexpected ways when using a tensor LR due to patterns like this:
```python
self._last_lr: list[float] = [group["lr"] for group in self.optimizer.param_groups]
```
This PR adds a helper to address this:
```python
def _param_groups_val_list(optimizer: Optimizer, key: str) -> list[Any]:
    """Create a list containing group[key] for each optimizer param_group.

    Prevents aliasing when group[key] could be a Tensor.

    Raises a KeyError when group[key] does not exist.
    """
    return [
        group[key].clone() if isinstance(group[key], Tensor) else group[key]
        for group in optimizer.param_groups
    ]
```
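For context, a minimal hypothetical repro of the aliasing the `.clone()` guards against (assumes an optimizer constructed with a tensor LR):
```python
import torch

# Without a clone, the cached "last lr" is the very same tensor object as
# group["lr"], so it silently changes whenever the scheduler updates the group.
param = torch.nn.Parameter(torch.zeros(1))
opt = torch.optim.SGD([param], lr=torch.tensor(0.1))

last_lr = [group["lr"] for group in opt.param_groups]  # aliases the tensor LR
opt.param_groups[0]["lr"].mul_(0.5)                    # scheduler-style in-place update
print(last_lr[0])                                      # tensor(0.0500) -- mutated too
```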
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163120
Approved by: https://github.com/janeyx99
Fixes misleading warning messages when running on sm12x devices using binaries built with sm120.
A PyTorch binary built with sm120 is compatible with, e.g., sm121, so there is no need for the incompatibility warning.
Also allow the 'matched_cuda_warn' message to show when, e.g., the user is running a binary built with only sm90 on sm12x, so that the user is prompted to get a build that supports sm120.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161299
Approved by: https://github.com/eqy, https://github.com/atalman
Fixes #160520
Summary:
When running Inductor with cpp_wrapper under a DeviceContext, non-tensor arguments were being wrapped with torch.tensor(arg) without specifying the device, creating the tensor on the currently active device (like CUDA) and later fetching it back to the CPU via .item(), causing unnecessary host-device-host memory transfers.
This PR fixes the issue by explicitly creating scalar tensors on the CPU:
```
input_tensors = [
    arg if isinstance(arg, torch.Tensor) else torch.tensor(arg, device='cpu')
    for arg in args
]
```
impact: inductor, codegen
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160584
Approved by: https://github.com/benjaminglass1, https://github.com/desertfire, https://github.com/mlazos, https://github.com/jeffdaily
Avoid `at::alias` in the `repeat` op implementation
## Summary
This PR removes the usage of `at::alias` in the implementation and instead `permute`s + `reshape`s the tensor to fit the specs of the result.
This is a less hacky and a more readable way of implementing the op.
All the new ops we are using are view-only ops, which do not introduce the overhead of changing the storage.
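For intuition, a rough Python sketch of expressing `repeat` through view ops plus a final `reshape` (illustrative only; the actual ATen change operates on the C++ implementation and its exact op sequence may differ):
```python
import torch

def repeat_via_views(x: torch.Tensor, repeats: list[int]) -> torch.Tensor:
    assert len(repeats) >= x.dim()
    # Pad with leading singleton dims so len(repeats) == x.dim().
    x = x.reshape((1,) * (len(repeats) - x.dim()) + tuple(x.shape))
    expanded, target = [], []
    for r, s in zip(repeats, x.shape):
        expanded += [r, s]
        target.append(r * s)
    # Insert a size-1 axis before every dim, expand it to the repeat count,
    # then flatten each (repeat, dim) pair; the final reshape materializes a copy.
    y = x.reshape([1 if i % 2 == 0 else d for i, d in enumerate(expanded)])
    return y.expand(expanded).reshape(target)

x = torch.arange(6).reshape(2, 3)
assert torch.equal(repeat_via_views(x, [2, 2]), x.repeat(2, 2))
```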
## Who wants this
We are using a `PrivateUse1` accelerator, but this request to avoid `at::alias` in any op should be general enough for any backend that uses XLA or does not have explicit control over memory allocation on its devices.
## Why we/they need this
As we support TPU, we are overriding some ATen ops by binding them to PrivateUse1.
However, it is not recommended to override the `repeat` op directly as we saw the following in `RegistrationDeclaration.h`.
```
at::Tensor repeat(const at::Tensor & self, c10::SymIntArrayRef repeats); // {"schema": "aten::repeat(Tensor self, SymInt[] repeats) -> Tensor", "dispatch": "True", "default": "True"}
```
We had to reuse the existing implementation of `repeat` to decompose it into other ops.
However, we are unable to support the current implementation, which uses `at::alias`.
It has two tensors share the same storage, modifies one of them, and returns the other assuming it has changed too.
This is a problem because we do not have explicit control over the memory allocation of the tensors when using XLA/PJRT.
## Alternatives
We are open to alternative solutions that work for us if this PR is not in favor of the PyTorch community.
For example, we may just bind our version of the `repeat` op implementation to both `PrivateUse1` and `AutogradPrivateUse1`.
However, to my understanding, this would not work well with torch dynamo and `torch.compile`.
Would you mind guiding us on how to solve this?
Thanks!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163455
Approved by: https://github.com/Skylion007
Many extensions (including pybind helpers) call `Tensor.__dlpack__()` without a stream argument. Before #150217, `stream=None` behaved like “no cross-stream sync” and was safe inside CUDA Graph capture. After #150217, `stream=None` maps to the legacy default stream, adding a cross-stream wait that invalidates capture when running on a non-default stream.
See this example
```
import torch
s = torch.cuda.Stream()
x = torch.randn(8, device="cuda")
g = torch.cuda.CUDAGraph()
with torch.cuda.stream(s):
    with torch.cuda.graph(g):
        _ = x + 1
        cap = x.__dlpack__()
        _ = torch.utils.dlpack.from_dlpack(cap)
```
This PR partially reverts #150217 so that stream=None defaults to no sync.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163242
Approved by: https://github.com/ngimel
Potential issues
* gpt-oss-20b is probably too big (I can't run it on my devserver)
* Mistral requires HF authentication
* Mistral also takes a while to run the performance checks (need to wait for CI)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163565
Approved by: https://github.com/huydhn
# Problems
This PR fixes a few edge cases that the FX converter missed related to dynamic shapes.
1. Inductor graphs can sometimes take `sympy.Symbol` inputs. We have logic to convert these to FX placeholder nodes. However, this logic did not update the `self.expr_to_proxy` table mapping symbols to proxy nodes. (There was existing logic to do this for `ir.TensorBox` inputs, but not `sympy.Symbol`.) This caused sympy tracing to fail when these symbol inputs were used in other expressions.
2. We lacked codegen for `ShapeAsConstantBuffer`. This IR node is seen when the graph input or output is a scalar computed from dynamic shapes.
# Fixes
a. Update `self.expr_to_proxy` when generating placeholders for `sympy.Symbol` inputs. Change `SymbolBuffer.get_example` to convert the symbol to a `torch.SymInt`, so we can populate `meta["val"]` correctly and use the value in other computations.
b. Support `ShapeAsConstantBuffer` by tracing the sympy expression.
c. Move output generation inside the metadata hook, allowing us to populate `meta["val"]` for the nodes computing `ShapeAsConstantBuffer`.
# Test plan
Added several new CI tests:
1. `torch.cond` with dynamic shapes. This exposes both issues, as the predicate is a `ShapeAsConstantBuffer` and one of the subgraphs uses a symbol input, due to the closure. Also tests when the parent and subgraphs have different input shapes.
2. Output dynamic shape scalar. This tests `ShapeAsConstantBuffer` as an output.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163596
Approved by: https://github.com/angelayi, https://github.com/jansel
Summary: Enables support for epilogue subtiling in the blackwell ws template. This requires the ability to call `store_output` twice in the same kernel and reuse the same tensor descriptor across allocations.
Test Plan:
Tested with test_max_autotune.py on a Blackwell server.
Rollback Plan:
Differential Revision: D82610077
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163145
Approved by: https://github.com/eellison
There is only one substantive change: the branch on
`global_offset[shard_dim] <= local_offset[shard_dim]`
is removed because it is unnecessary: you can always treat the
first shard uniformly with the rest of the shards, because your
global offset is guaranteed to be zero in this case anyway.
I also switch the shard_size case to sym_ite, to make it possible
for LocalTensor to deal with the MPMD-ness here, but it's equivalent
to the old if-then-else.
I tried to rewrite the comments to be more clear what is going on
algorithmically here.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163344
Approved by: https://github.com/albanD, https://github.com/zpcore, https://github.com/tianyu-l
Fixes https://github.com/pytorch/pytorch/issues/162228
# Summary
The majority of our tests only compile flex-attention in isolation. This means that for fake tensor propagation the input primals and all captured buffers don't do any intermediate computation below autograd. As a result, they happen by chance to match the `requires_grad`-ness of the eager implementation and this check passes. However, if score_mod is the result of some other intermediate fake tensor prop, then it is not guaranteed to have accurate `requires_grad`-ness, which is what was happening here.
TL;DR: this was a belt-and-suspenders check that was actually harmful, and we should just let joint graph tracing create the correct joint graph.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163677
Approved by: https://github.com/ydwu4
Summary: When generating Triton kernels in the compile-time autotune blocks, it is useful to generate source information as code comments. Previously we ignored these comments for autotune code blocks because the generated main output code contains the same information, but that doesn't help if the generated autotune code crashes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163600
Approved by: https://github.com/yushangdi
Summary: Restricts subprocess benchmarking to only `TritonTemplateCaller`, which is expected by the underlying `target` method. This triggered a bug with large K shapes because decompose-k produces a `SubgraphChoiceCaller`.
Test Plan:
mm autotuning with a large k and `TORCHINDUCTOR_AUTOTUNE_IN_SUBPROC=1`
Rollback Plan:
Differential Revision: D82181924
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162688
Approved by: https://github.com/PaulZhang12, https://github.com/eellison, https://github.com/mlazos
Summary:
We add parsing for lists of strings. This is needed for AOTInductor profiling of Triton kernel input information.
Test Plan:
Included in commit.
test_profiler_op_event_kwargs_list_of_strings
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163593
Approved by: https://github.com/sraikund16
As per comment in source code:
```
# If we are coalescing on xblock (not ReductionHint.INNER) and this is not a tiny kernel
# (not ReductionHint.OUTER_TINY), do not use persistent reduction if it induces tile
# quantization. Persistent reduction forces rblock == rnumel; if the bounds between lower
# and upper are large, for the lower values we will be masking off large % of read/writes,
# when we could expand the coalescing xblock instead.
```
For the test case in question, this PR improves perf from 0.8573521325143717 -> 0.043151492193814305 because we were egregiously masking out rblock values (58/64 values).
Differential Revision: [D82853279](https://our.internmc.facebook.com/intern/diff/D82853279)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163365
Approved by: https://github.com/shunting314, https://github.com/PaulZhang12, https://github.com/jansel, https://github.com/v0i0
Landing this instead of https://github.com/pytorch/pytorch/pull/162994.
Here is how i think the whole dynamo + frame construction logic work:
1) There is no way to create a frame object in Python land, as this is created at runtime by CPython. That's why aot_compile creates FrameInfo this way (kind of like simulating the runtime). I guess you could write your own very simple eval_frame.c where you intercept the frame construction, but we probably don't want that.
2) When there is no wrapper (the old export or aot_compile), we first assign sources by iterating over f_locals, which contain both local args and closure variables (this is an implementation detail of CPython frame construction). That's why closure variables end up getting LocalSource names, as shown in this test case (f6ea41ead2/test/export/test_export.py (L1369)). Note that L["self"] here means we are referring to the local object self. The important thing to keep in mind is that this self is not actually the model's self, but the outer self.
3) When we switch to the wrapper case, we end up trying to inline the original inner module. When doing so, we need to track all locals and closures for this inner module, as can be seen here (f6ea41ead2/torch/_dynamo/variables/functions.py (L463)). Here we are not looking into the inner frame's f_locals but just directly looking at closures. I guess this is because we are one more frame up, so there is no access to the frame's f_locals at this point. And it is probably not a good idea to change dynamo's logic here. As a result, I get the following error message, which is different from the old export:
"While exporting, we found certain side effects happened in the model.forward. Here are the list of potential sources you can double check: ["L['self']._export_root.forward.__func__.__closure__[1].cell_contents.bank", "L['self']._export_root.forward.__func__.__closure__[1].cell_contents.bank_dict", "L['self']._export_root.forward.__func__.__closure__[0].cell_contents"]"
My initial attempt at solving this was taking the inner closures and putting them into f_locals for the frame I am constructing, which turned out to be too complicated because we would also need to muck around with bytecode instructions. So I am thinking we should just update the test to reflect the new names and follow up with a better post-processing step to produce better names.
Differential Revision: [D82582029](https://our.internmc.facebook.com/intern/diff/D82582029)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163107
Approved by: https://github.com/avikchaudhuri
Introduces a variant of size-hint multi-kernel where, for novel runtime shapes, instead of performing full benchmarking to determine the optimal kernel, we select one of many kernels pre-generated from multi-kernel hints, based on the similarity between hint and runtime input & output shapes (L1 distance in log2 space).
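For intuition, a small sketch of nearest-hint selection via L1 distance in log2 space (hypothetical helper, not the actual inductor code):
```python
import math

def pick_kernel(hint_shapes: list[tuple[int, ...]], runtime_shape: tuple[int, ...]) -> int:
    # L1 distance between shapes in log2 space; pick the pre-generated kernel
    # whose tuning shape is closest to the runtime shape.
    def dist(a, b):
        return sum(abs(math.log2(x) - math.log2(y)) for x, y in zip(a, b))
    return min(range(len(hint_shapes)), key=lambda i: dist(hint_shapes[i], runtime_shape))

# e.g. kernels pre-tuned for hints 64 and 256 on matmul([s0, s1], [s1, s2])
hints = [(64, 64, 64), (256, 256, 256)]
print(pick_kernel(hints, (48, 200, 72)))  # -> 0: closer to the 64 hint in log2 space
```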
Some caveats/changes:
- Size-hint multi-kernel now only kicks in if the kernel has dynamic shapes
- Pre-generation still only does 1-d search over specified hints, e.g. `matmul([s0, s1], [s1, s2])` with size-hints `[64, 256]` only generates 2 kernels - based on tuning shapes ([64, 64], [64, 64]) and ([256, 256], [256, 256]). Extending this to reasonable n-d search (via user API?) is an extension
Benchmarking results, compared to multi-kernel w/ full benchmarking (hints 64, 4096), and compiling with the ground truth hint:
<img width="1902" height="1222" alt="550541081_1088709150049684_6528797079439730237_n" src="https://github.com/user-attachments/assets/056cca48-c16a-4451-9b4a-fa13a7a058a9" />
Full benchmarking doing worse is extremely weird, but we did see similar spikes in #156628
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163090
Approved by: https://github.com/bobrenjc93
Fixes #160547
### Summary:
Bug repro:
```
def test_namedtuple(self):
    from collections import namedtuple

    Point = namedtuple('Point', 'x y')

    class M(torch.nn.Module):
        def forward(self, x, y):
            return x + y

    inp = Point(torch.ones(3), torch.ones(3))
    print(M()(*inp))

    # errors
    ep = torch.export.export(M(), inp, strict=False)
    print(ep)

    # succeeds
    ep = torch.export.export(M(), inp, strict=True)
    print(ep)

    # workaround could be to convert namedtuple to a kwarg
    inp_kwargs = {field: getattr(inp, field) for field in inp._fields}
    ep = torch.export.export(M(), (), inp_kwargs)
    print(ep)
```
Fix:
namedtuple is a subclass of tuple, but namedtuple was not expected here.
So, this change handles the namedtuple case.
I have added a 🧪 test case for this as well.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162959
Approved by: https://github.com/angelayi
Co-authored-by: Angela Yi <angelayi@meta.com>
Summary: `.contiguous()` will discard the original storage size of the tensor, and could lead to issues during loading.
Test Plan:
buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_1D_tensor_slicing
buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_2D_tensor_slicing
Differential Revision: D83016250
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163587
Approved by: https://github.com/angelayi
Fixes part of #163314
In particular bug: **Bug 1: H=None Broadcasting Produces Incorrect Results**
This fixes a shape bug when slicing BlockMask on the Q-tile axis with an int (**mask[:, :, i]**). That form of indexing collapses the Q dimension, so kv_num_blocks/kv_indices lose their expected [B, H, Q_tiles, …] shape. Because they lose that shape, even though the mask_mod remains "interpretable", the kernel's stride math reads wrong offsets, and we get silent numerical mismatches compared to regular SDPA, especially with single-position decoding / H broadcasting.
The case where B=None, H=None works is accidental: with singleton batch/head the kernel maps to index 0 via `sparse_idx_z = off_zq % 1` and `sparse_idx_hq = off_hq % 1`, and with a single Q tile `q_start // SPARSE_Q_MULTIPLE = 0`. The missing Q-tiles stride is multiplied by 0, so the bad offset from the collapsed Q axis doesn't move the pointer and it happens to read the first tile correctly. Once H > 1 or there are multiple Q tiles, those terms become nonzero and the kernel indexes with wrong strides, which causes silent errors.
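A plain-tensor illustration of the shape collapse (made-up sizes, not the actual BlockMask internals):
```python
import torch

kv_num_blocks = torch.zeros(2, 4, 8, 16)   # [B, H, Q_tiles, ...]
print(kv_num_blocks[:, :, 3].shape)        # torch.Size([2, 4, 16])    -- Q-tile axis collapsed
print(kv_num_blocks[:, :, 3:4].shape)      # torch.Size([2, 4, 1, 16]) -- layout preserved
```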
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163426
Approved by: https://github.com/drisspg
Differential Revision: D82933509
Over the weekend I realized that some of the cache implementation was a bit silly and too constrained to be actually generic. For example, InMemoryCache[str, bytes] was odd, since we'd probably want to be able to store more than just str keys with bytes values. So tl;dr: everything is now generic, with the one constraint that Key and Value must both be pickle-able types. This makes things a lot simpler for us, since all caches can now be str -> bytes caches under the hood if we'd like, and Key/Value just get pickled on the way in and out.
With this change, there were also some improvements made to the testing: mainly better coverage, but now we also test each cache across every combination of Key/Value types to ensure that they will work with the types we might specify later.
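As a sketch of that design (class and method names are assumptions, not the actual implementation):
```python
import pickle
from typing import Generic, Optional, TypeVar

K = TypeVar("K")
V = TypeVar("V")

class InMemoryCache(Generic[K, V]):
    """Generic cache: Key/Value only need to be picklable; storage is bytes -> bytes."""

    def __init__(self) -> None:
        self._store: dict[bytes, bytes] = {}

    def put(self, key: K, value: V) -> None:
        self._store[pickle.dumps(key)] = pickle.dumps(value)

    def get(self, key: K) -> Optional[V]:
        blob = self._store.get(pickle.dumps(key))
        return None if blob is None else pickle.loads(blob)
```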
I also hardened some things here and there; for example, we now use literal_eval (forgot who mentioned this on the first PR, but thank you for the suggestion!), and all errors coming from the caching will be wrapped in CacheError from now on (although we still raise from the original error context where possible).
Putting this PR up now for feedback. In the process of generalizing the code I did remove the documentation, since it was becoming outdated, but I will add that back in after the PR is green.
I have the next PR ready as well (it implements a fresh-cache context manager); I will export it once this lands.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163488
Approved by: https://github.com/aorenste, https://github.com/masnesral
## Why this PR?
I've tried to follow the guidance of the `OpenReg` [usage example](https://github.com/pytorch/pytorch/tree/main/test/cpp_extensions/open_registration_extension/torch_openreg/third_party/openreg) and found that the command for compiling `example.cpp` (`g++ -o out example/example.cpp -L ./build -lopenreg`) is not compatible with my `gcc` (v11.4).
I installed my `gcc` through `apt install build-essential`, which I think is a common way for developers to install `gcc`, so I believe it's necessary to slightly modify the command to add `-I ./` to explicitly indicate the header file search path.
## What I've changed?
- I added `-I ./` to correctly search for `./include/openreg.h`.
- I also added a `pwd` comment for better readability and removed unused imports in `example/example.cpp`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163235
Approved by: https://github.com/FFFrog, https://github.com/albanD
Co-authored-by: Jiawei Li <ljw1101.vip@gmail.com>
This PR removes import tricks of `SHARDING_PRIORITIES` and `ShardingFilterIterDataPipe` from `torch.utils.data.datapipes.iter.grouping`. They are declared to be removed in PyTorch 2.1 but not.
Before change:
```
from torch.utils.data.datapipes.iter.grouping import SHARDING_PRIORITIES
from torch.utils.data.datapipes.iter.grouping import ShardingFilterIterDataPipe
```
works
After change:
an ImportError exception is raised.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163438
Approved by: https://github.com/janeyx99
In various benchmarks scattered across the repo, the limits for flops/second and memory bandwidth are usually hardcoded for a single device. This utility could help in providing a more structured way to query the device capabilities. If this is approved, we can use it when reporting flops efficiency and bandwidth relative to peak in the benchmarks and tests. The intent is to add more devices, more parameters (e.g. L2 cache bandwidth, NVLink, etc.) for both CPUs and accelerators.
Testing:
```
import torch
if torch.cuda.is_available():
    device = torch.cuda.current_device()
    mod = torch.get_device_module('cuda')
    hw = mod._device_limits.GPULimits(device)

    print(hw.get_tflops_per_second(torch.float16))
    print(hw.get_tflops_per_second(torch.float32))
    print(hw.get_tflops_per_second(torch.float64))
    print(hw.get_tflops_per_second(torch.bfloat16))
    print(hw.get_tflops_per_second(torch.int8))
    print(hw.get_memory_bandwidth_Bps() / 1e9)
    print(hw.get_shared_memory_bandwidth_Bps() / 1e9)
# Output on an H100 GPU
1070.53056
535.26528
66.90816
1070.53056
2141.06112
4893.696
33454.08
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162942
Approved by: https://github.com/ngimel, https://github.com/albanD
## Context
An example from Qwen2-7B
- This comes from running torch.compile with a sequence length that is divisible by 8 (no padding needed). Call this `Run1`.
- If we then run the compiled model with a different length that isn't divisible by 8 (requires padding). Call this `Run2`.
- Then we'll see this error.
```
File "/var/tmp/torchinductor_nobody/2w/c2wby7ilxbna45xrtrrfjqpeutwouruviu2742ockunnd2bleeiz.py", line 1963, in call
buf24 = torch.ops.aten._scaled_dot_product_efficient_attention_backward.default(reinterpret_tensor(buf18, (s85, 3584 // s19, s48, 512 // (512 // s19)), (s48*(512 // (512 // s19))*(3584 // s19), 512 // (512 // s19), (512 // (512 // s19))*(3584 // s19), 1), 0), buf20, buf21, buf22, buf23, getitem, getitem_1, getitem_2, getitem_3, 0.0, [True, True, True, False], scale=0.08838834764831845)
File "torch/_ops.py", line 841, in __call__
return self._op(*args, **kwargs)
RuntimeError: attn_bias is not correctly aligned (strideM). attn_bias.stride(2) = 6102, and should be a multiple of 4.
```
- We only see the error because we did not recompile on `Run2`. Instead we ran the inputs on the same graph as `Run1`.
### A bit more on why.
Here we check whether to realize the unpadded buffer (unwrapped slice) which we want for `Run1` but not for `Run2`.
0897affcd5/torch/_inductor/lowering.py (L2687-L2694)
## Fix
Size hint doesn't guard, so the fix is to use `guard_or*` to guard.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163083
Approved by: https://github.com/eellison
Summary:
In some circumstances it seems reasonable to return a callable directly, without a guard check, when the user calls aot_compile on a function with a single compilation result.
When there are multiple entries (aot_compile_module), we should enable guard checks to differentiate the different compiled functions.
Test Plan: CI
Differential Revision: D82904540
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163432
Approved by: https://github.com/dolpm
We are seeing crashes of the form
```
Traceback (most recent call last):
File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/symbolic_convert.py", line 1487, in run
while self.step():
File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/symbolic_convert.py", line 1348, in step
self.dispatch_table[inst.opcode](self, inst)
File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/symbolic_convert.py", line 2437, in LOAD_ATTR
self._load_attr(inst)
File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/symbolic_convert.py", line 2425, in _load_attr
result = BuiltinVariable(getattr).call_function(
File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/builtin.py", line 1347, in call_function
return handler(tx, args, kwargs)
File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/builtin.py", line 967, in <lambda>
tx, [v.realize() for v in args], kwargs
File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/builtin.py", line 967, in <listcomp>
tx, [v.realize() for v in args], kwargs
File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/lazy.py", line 72, in realize
self._cache.realize()
File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/lazy.py", line 33, in realize
self.vt = builder.VariableBuilder(tx, self.source)(self.value)
File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/builder.py", line 445, in __call__
vt = self._wrap(value)
File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/builder.py", line 1043, in _wrap
torch._dynamo.utils.store_user_object_weakref(value)
File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/utils.py", line 4694, in store_user_object_weakref
user_obj_id_to_weakref[obj_id] = weakref.ref(obj)
torch._dynamo.exc.InternalTorchDynamoError: TypeError: cannot create weak reference to 'torch.Event' object
```
This pull request makes us gracefully graph break instead of explicitly crashing.
I've added a test which reproduces the issue. There is a side discussion re:
how did torch.Event support ever work here, since it appears you cannot take a
weakref to a torch.Event
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163168
Approved by: https://github.com/Lucaskabela, https://github.com/jansel
Summary:
Otherwise, we may hit
```
Exception: Expected all tensors to be on the same device, but got other is on cuda:0, different from other tensors on cpu (when checking argument in method wrapper_CUDA__equal)
```
Test Plan: UTs
Reviewed By: yushangdi
Differential Revision: D82974062
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163529
Approved by: https://github.com/yushangdi, https://github.com/Skylion007
Summary:
To support exporting a cuda model on a CPU-only machine under fake tensor mode.
Users commonly need to move sample inputs to the cuda device with a .to("cuda:0") or .to("cuda") call.
This diff supports this.
I expect the following pattern to work
```
with FakeTensorMode(allow_non_fake_inputs=True):
    cuda_module = module.to("cuda:0")
    cuda_sample_inputs = tuple([x.to("cuda:0") for x in sample_inputs])

    with torch.no_grad():
        ep = torch.export.export(cuda_module, cuda_sample_inputs)
```
Before: moving module.to("cuda:0") under fake tensor mode would leave parameters on the `meta` device.
After: parameters are on "cuda:0".
Test Plan: buck2 run fbcode//caffe2/test:fake_tensor -- --r test_move_module
Reviewed By: mikaylagawarecki
Differential Revision: D80102876
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163433
Approved by: https://github.com/albanD
This change restricts the DLPack stride normalization to apply only to 1D tensors of shape (1,).
### Rationale
The previous implementation normalized the strides for any multi-dimensional tensor containing a dimension of size 1. While well-intentioned, this "over-normalization" discards critical memory layout information, causing issues for downstream consumers who rely on strides to infer alignment and contiguity.
For example:
* A row-major tensor with `shape=(1, 128)` and `stride=(128, 1)` would be incorrectly normalized to `stride=(1, 1)`.
* A column-major tensor with `shape=(1024, 1)` and `stride=(1, 1024)` would also be normalized to `stride=(1, 1)`.
This loss of stride information makes it impossible for consumers to detect the original memory layout (e.g., row-major vs. column-major) and breaks assumptions about memory alignment needed for optimized indexing or specialized hardware APIs like GPU TMA.
The original intent of the normalization was to handle the simple case of a 1D tensor with shape=(1,) and a non-standard stride. This fix reverts to that specific, non-problematic behavior, ensuring that multi-dimensional tensors retain their precise stride information during DLPack export.
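A minimal Python sketch of the restricted rule (illustrative only; the real change lives in the C++ DLPack export path):
```python
def normalize_strides(shape: tuple[int, ...], strides: tuple[int, ...]) -> tuple[int, ...]:
    # Only a 1-D tensor of shape (1,) gets its stride normalized to (1,);
    # everything else keeps its strides so consumers can still infer
    # row-/column-major layout and alignment.
    if len(shape) == 1 and shape[0] == 1:
        return (1,)
    return strides

assert normalize_strides((1,), (1024,)) == (1,)           # 1-D singleton: normalized
assert normalize_strides((1, 128), (128, 1)) == (128, 1)  # row-major layout preserved
```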
### Related Issues
#163274
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163282
Approved by: https://github.com/eqy
A big pain point people have with custom ops is that they do not accept arbitrary inputs/outputs. In this PR we create the concept of an "OpaqueObject" which allows users to pass arbitrary python objects into custom operators.
Some still slightly annoying parts with this implementation:
- The schema of the operator is `__torch__.torch.classes.aten.OpaqueObject` instead of whatever python type
- `@torch.library.custom_op` doesn't work.. yet?
UX:
```python
from torch._library.opaque_object import make_opaque, get_payload

# your custom python class
class OpaqueQueue:
    def __init__(self, queue: list[torch.Tensor], init_tensor_: torch.Tensor) -> None:
        super().__init__()
        self.queue = queue
        self.init_tensor_ = init_tensor_

    def push(self, tensor: torch.Tensor) -> None:
        self.queue.append(tensor)

    def pop(self) -> torch.Tensor:
        if len(self.queue) > 0:
            return self.queue.pop(0)
        return self.init_tensor_

    def size(self) -> int:
        return len(self.queue)

queue = OpaqueQueue([], torch.zeros(3))
obj: torch._C.ScriptObject = make_opaque(queue)

# obj.payload stores a direct reference to this python queue object
self.assertEqual(get_payload(obj), queue)

# This is able to be passed through the dispatcher
torch.ops._TestOpaqueObject.queue_push(obj, torch.ones(3))
self.assertTrue(queue.size(), 1)
```
Authoring a custom op:
```python
lib = torch.library.Library("_TestOpaqueObject", "FRAGMENT")

torch.library.define(
    "_TestOpaqueObject::queue_push",
    "(__torch__.torch.classes.aten.OpaqueObject a, Tensor b) -> ()",
    tags=torch.Tag.pt2_compliant_tag,
    lib=lib,
)

@torch.library.impl("_TestOpaqueObject::queue_push", "CompositeExplicitAutograd", lib=lib)
def push_impl(q: torch._C.ScriptObject, b: torch.Tensor) -> None:
    # We can get the payload directly by get_payload(q)
    queue = get_payload(q)
    assert isinstance(queue, OpaqueQueue)
    queue.push(b)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162660
Approved by: https://github.com/zou3519
Summary:
Currently, we assume that refcount_ and weakcount_ are always stored in an 8-byte aligned address right next to each other. Based on this assumption, we load 8 bytes in intrusive_ptr::reset_ to check the values of both counts. However, that assumption is not part of the C++ language standard, so it's essentially undefined behavior.
This change eliminates that assumption by combining refcount_ and weakcount_ in a single 64-bit count and we use the lower 32 bits for refcount_ and upper 32 bits for the weakcount_.
In addition to eliminating the undefined behavior, the change also eliminates the read of weakcount_ after decrementing refcount_ in intrusive_ptr::reset_. This claws back lost performance introduced in https://github.com/pytorch/pytorch/pull/162784 for non-final refcount_ decrementing.
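As a toy illustration of the packing scheme (Python ints standing in for the atomic 64-bit word; not the actual c10 code):
```python
def pack(refcount: int, weakcount: int) -> int:
    # Low 32 bits hold refcount_, high 32 bits hold weakcount_.
    return (weakcount << 32) | (refcount & 0xFFFFFFFF)

def unpack(combined: int) -> tuple[int, int]:
    return combined & 0xFFFFFFFF, combined >> 32

assert unpack(pack(3, 1)) == (3, 1)
```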
Reviewed By: yfeldblum
Differential Revision: D82869192
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163394
Approved by: https://github.com/Skylion007
All details are in readme.md.
Note: one thing I want to do soon is to switch to a graph representation instead of a stack representation for the fuzzed ops; it should make things easier as things get more complicated.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163417
Approved by: https://github.com/bobrenjc93
Fixes #159855; this was not triggered in other tests since it took more than one round of fusion to get to the problematic code which prunes WeakDeps. WeakDeps are important to inhibit fusion of kernels that read/write data into mutated buffers with different indexing.
We modify the code to a) always prune before fusion, rather
than after, which improves its coverage and makes our basic
vertical fusion tests surface this issue as well and b)
check whether the weak dep is fusable before eliminating it
(which basically means checking that the producing code and
the consuming code are sufficiently compatible).
The test that triggers this with change (a) is test_fusing_write_into_disjoint_read, introduced in #118210.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162316
Approved by: https://github.com/eellison, https://github.com/mlazos, https://github.com/shunting314
### Issue
The previous `enable_triton` UI requires the user-defined Triton kernel to have "nvshmem" in its name.
If users did not do so, the kernel would miss the NVSHMEM init and silently hit a CUDA IMA.
The `@requires_nvshmem` decorator eliminates the above name requirement (and the `enable_triton` call).
### Usage:
```
@requires_nvshmem
@triton.jit
def foo(...):
    ...

foo[(1, 1)](...)
```
It also removes the need to pass `extern_lib` to `foo` (this is handled by the decorator now).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163423
Approved by: https://github.com/ngimel
ghstack dependencies: #163025, #163152, #163194
No need for an unnecessary copy of std::vectors. This Tensor list is copied throughout the foreach paths, and this code is on a hot path for torch optimizers. Automatic move elision will not happen on the return statement since it's a subelement of a vector that needs to be copied out before the std::vector is destroyed. This should eliminate quite a few list copies along this path.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163416
Approved by: https://github.com/ezyang
The big semantic change (and the reason for this port) is that we no longer monkeypatch Tensor with torchdim's special methods. The new algorithm for handling dispatch is that we first land in `__torch_function__` and we see if a special FCD implementation needs to be dispatch to first, and if there is nothing we fallback to the standard level strategy.
Because there is no longer C binding equivalent of classes, we've condensed _C.Dim and Dim together, and similar for Tensor. This resulted in some bugs as the Python API is sometimes different from the C API. I've attempted to disambiguate these but there may still be mistakes (many early bugs were due to this problem). Dim and DimEntry are especially painful as Dim must abide by Tensor equality semantics, but is pointer equality in C (DimEntry doesn't have this problem). Another difference between C/Python that is subtle is we no longer get implicit conversions from Dim to DimEntry, this also caused some bugs.
Much of the mechanical porting work was done by claude code. I have a separate PR that deletes functorch._C, but it was useful having dim.cpp to point claude at it so I haven't done it in this PR. From a reviewing perspective, I need to re-review that I didn't forget to port anything, some noticeably missing "small" things are patched_dim_method. I am still in progress of carefully doing a side-by-side review of ports; "simplifications" from claude code were also a major source of bugs.
There are two major feature gaps in the implementation:
- DelayedTensor and dot handling are not implemented yet. This should be reasonably easy, just need to do it. However, for the purposes of sharded propagation it is actually better not to reconstruct matmuls.
- Splitting dimensions with an index like `[x, y]` doesn't work. The problem is that `__getitem__` interprets this as advanced indexing and sends the list to torch.tensor to turn into a tensor, instead of being eligible for `__torch_function__`. I think I might need to hard code a special case for this or something?
Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160236
Approved by: https://github.com/zdevito, https://github.com/albanD
Summary:
Today `fullgraph_capture` takes a frame, but clients usually take a callable (`nn.Module`, function, or method) and example inputs (args and kwargs) and then explicitly set up the frame to pass. This is boilerplate—and potentially tricky to get right—that can be hidden inside the API.
The original `fullgraph_capture` now becomes `_fullgraph_capture_frame`.
Test Plan:
existing tests
Rollback Plan:
Differential Revision: D82339400
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162849
Approved by: https://github.com/zhxchen17
Summary:
To support exporting a cuda model on a CPU-only machine under fake tensor mode.
Users commonly need to move sample inputs to the cuda device with a .to("cuda:0") call.
This diff supports this.
Notice that .to("cuda") doesn't work yet, as it queries the current device index by calling a cuda API.
I expect the following pattern to work
```
with FakeTensorMode(allow_non_fake_inputs=True):
    cuda_module = module.to("cuda:0")
    cuda_sample_inputs = tuple([x.to("cuda:0") for x in sample_inputs])

    with torch.no_grad():
        ep = torch.export.export(cuda_module, cuda_sample_inputs)
```
Test Plan:
buck2 run fbcode//caffe2/test:fake_tensor -- --r test_fake_gpu_no_init
Rollback Plan:
Differential Revision: D80101283
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160431
Approved by: https://github.com/henryoier, https://github.com/ezyang