Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-31 20:34:54 +08:00)

Compare commits: mlazos/hc9...context_te (74 commits)
| SHA1 |
|---|
| 08ed338270 |
| 99138ee941 |
| 1341794745 |
| 073912749d |
| 0c236f3c72 |
| c7ff78dfc0 |
| 894909a613 |
| ef2b1390ed |
| 3f236f1903 |
| 35f1e76212 |
| a6321d6227 |
| 1cc51c640a |
| 28ca4dd77d |
| 06c6a81a98 |
| 3b61d5d4e3 |
| 8b6bc59e95 |
| c2ccaa3c21 |
| 6569576c4e |
| 5416dff2b2 |
| 791265114e |
| 7ad8bc7e8b |
| f2ee3f4847 |
| dfd39fe14f |
| b766c0200a |
| a3cd7b0cc4 |
| 8522972133 |
| c4b98c8364 |
| d10ffd76db |
| 53a13e553d |
| 5745d6a770 |
| 60ddcd803e |
| f2b3b5c453 |
| 71fa7def26 |
| 1a6c192dc4 |
| e691e92297 |
| 2b73f403c7 |
| 697cd9bbb1 |
| 64ca70f83c |
| 1b84fd1503 |
| 6b27e11a5b |
| 18a926f547 |
| ecd434bea9 |
| 5bed3fafc7 |
| 9b4f085526 |
| d29e4c81d9 |
| 8d2186cd79 |
| b04d8358d9 |
| d80afc07f0 |
| 84210a82ef |
| 4268b2f40a |
| 12a6d2a0b8 |
| 464432ec47 |
| 1f612dafb5 |
| f63def6ac7 |
| 3a8e623a9b |
| bf727425a0 |
| 8c7dbc939f |
| 644fdbad95 |
| fb027c5692 |
| 3b87bd8b82 |
| 89b098a677 |
| 4cc4302b32 |
| c632e4fdb8 |
| b23bfae9f7 |
| 1b8f496f87 |
| c236b602ff |
| 6926f30654 |
| 483980d7f3 |
| 7173a73cf4 |
| 7bab7354df |
| b1940b5867 |
| abebbd5113 |
| cdd7a2c72b |
| d94ea2647c |
@@ -19,11 +19,13 @@ import boto3

# AMI images for us-east-1, change the following based on your ~/.aws/config
os_amis = {
    "ubuntu18_04": "ami-078eece1d8119409f",  # login_name: ubuntu
    "ubuntu20_04": "ami-052eac90edaa9d08f",  # login_name: ubuntu
    "ubuntu22_04": "ami-0c6c29c5125214c77",  # login_name: ubuntu
    "redhat8": "ami-0698b90665a2ddcf1",  # login_name: ec2-user
}

ubuntu18_04_ami = os_amis["ubuntu18_04"]
ubuntu20_04_ami = os_amis["ubuntu20_04"]


@@ -657,6 +659,18 @@ def configure_system(
            "sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
        )
    host.run_cmd("pip3 install dataclasses typing-extensions")
    # Install and switch to gcc-8 on Ubuntu-18.04
    if not host.using_docker() and host.ami == ubuntu18_04_ami and compiler == "gcc-8":
        host.run_cmd("sudo apt-get install -y g++-8 gfortran-8")
        host.run_cmd(
            "sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 100"
        )
        host.run_cmd(
            "sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-8 100"
        )
        host.run_cmd(
            "sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-8 100"
        )
    if not use_conda:
        print("Installing Cython + numpy from PyPy")
        host.run_cmd("sudo pip3 install Cython")
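For context on the `update-alternatives` calls added above: each candidate compiler is registered under a generic name with a priority, and in automatic mode the highest priority wins. A minimal standalone sketch (the gcc-7 entry is hypothetical, for illustration only):

```bash
# Register two gcc candidates; in auto mode the higher priority (100) is selected.
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 50   # hypothetical second candidate
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 100
update-alternatives --display gcc                  # list candidates and the current choice
sudo update-alternatives --set gcc /usr/bin/gcc-7  # switch manually if needed
gcc --version                                      # confirm what /usr/bin/gcc resolves to
```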
@@ -1012,7 +1026,7 @@ if __name__ == "__main__":
        install_condaforge_python(host, args.python_version)
        sys.exit(0)

    python_version = args.python_version if args.python_version is not None else "3.9"
    python_version = args.python_version if args.python_version is not None else "3.8"

    if args.use_torch_from_pypi:
        configure_system(host, compiler=args.compiler, python_version=python_version)

@@ -44,8 +44,6 @@ FROM base as cuda
ARG CUDA_VERSION=12.4
RUN rm -rf /usr/local/cuda-*
ADD ./common/install_cuda.sh install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
# Preserve CUDA_VERSION for the builds
ENV CUDA_VERSION=${CUDA_VERSION}

@@ -105,6 +105,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -118,6 +119,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -132,6 +134,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -146,6 +149,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.13
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -160,6 +164,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -173,6 +178,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -187,6 +193,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -201,6 +208,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.13
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -215,6 +223,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -226,6 +235,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=10
    PROTOBUF=yes
    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    ONNX=yes
@@ -234,7 +244,10 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=10
    PROTOBUF=yes
    DB=yes
    VISION=yes
    VULKAN_SDK_VERSION=1.2.162.1
    SWIFTSHADER=yes
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
@@ -242,7 +255,10 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.11
    CLANG_VERSION=10
    PROTOBUF=yes
    DB=yes
    VISION=yes
    VULKAN_SDK_VERSION=1.2.162.1
    SWIFTSHADER=yes
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
@@ -250,6 +266,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    TRITON=yes
@@ -258,6 +275,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    PROTOBUF=yes
    DB=yes
    VISION=yes
    ROCM_VERSION=6.2.4
    NINJA_VERSION=1.9.0
@@ -272,6 +290,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    PROTOBUF=yes
    DB=yes
    VISION=yes
    ROCM_VERSION=6.3
    NINJA_VERSION=1.9.0
@@ -286,6 +305,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    PROTOBUF=yes
    DB=yes
    VISION=yes
    XPU_VERSION=0.5
    NINJA_VERSION=1.9.0
@@ -296,6 +316,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    PROTOBUF=yes
    DB=yes
    VISION=yes
    XPU_VERSION=2025.0
    NINJA_VERSION=1.9.0
@@ -306,6 +327,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    CONDA_CMAKE=yes
@@ -319,6 +341,7 @@ case "$image" in
    CUDNN_VERSION=9
    CLANG_VERSION=12
    PROTOBUF=yes
    DB=yes
    VISION=yes
    TRITON=yes
    ;;
@@ -326,6 +349,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=12
    PROTOBUF=yes
    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    TRITON=yes
@@ -346,6 +370,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    CONDA_CMAKE=yes
@@ -378,19 +403,20 @@ case "$image" in
    # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
    # We will need to update mypy version eventually, but that's for another day. The task
    # would be to upgrade mypy to 1.0.0 with Python 3.11
    PYTHON_VERSION=3.9
    PIP_CMAKE=yes
    ANACONDA_PYTHON_VERSION=3.9
    CONDA_CMAKE=yes
    ;;
  pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter)
    PYTHON_VERSION=3.9
    ANACONDA_PYTHON_VERSION=3.9
    CUDA_VERSION=11.8
    PIP_CMAKE=yes
    CONDA_CMAKE=yes
    ;;
  pytorch-linux-jammy-aarch64-py3.10-gcc11)
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    ACL=yes
    PROTOBUF=yes
    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    # snadampal: skipping llvm src build install because the current version
@@ -402,6 +428,7 @@ case "$image" in
    GCC_VERSION=11
    ACL=yes
    PROTOBUF=yes
    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    # snadampal: skipping llvm src build install because the current version
@@ -412,6 +439,7 @@ case "$image" in
  *)
    # Catch-all for builds that are not hardcoded.
    PROTOBUF=yes
    DB=yes
    VISION=yes
    echo "image '$image' did not match an existing build configuration"
    if [[ "$image" == *py* ]]; then
@@ -460,21 +488,14 @@ if [[ "$image" == *cuda*  && ${OS} == "ubuntu" ]]; then
  fi
fi

no_cache_flag=""
progress_flag=""
# Do not use cache and progress=plain when in CI
if [[ -n "${CI:-}" ]]; then
  no_cache_flag="--no-cache"
  progress_flag="--progress=plain"
fi

# Build image
docker build \
       ${no_cache_flag} \
       ${progress_flag} \
       --no-cache \
       --progress=plain \
       --build-arg "BUILD_ENVIRONMENT=${image}" \
       --build-arg "PROTOBUF=${PROTOBUF:-}" \
       --build-arg "LLVMDEV=${LLVMDEV:-}" \
       --build-arg "DB=${DB:-}" \
       --build-arg "VISION=${VISION:-}" \
       --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \
       --build-arg "CENTOS_VERSION=${CENTOS_VERSION}" \
@@ -482,12 +503,13 @@ docker build \
       --build-arg "GLIBC_VERSION=${GLIBC_VERSION}" \
       --build-arg "CLANG_VERSION=${CLANG_VERSION}" \
       --build-arg "ANACONDA_PYTHON_VERSION=${ANACONDA_PYTHON_VERSION}" \
       --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
       --build-arg "GCC_VERSION=${GCC_VERSION}" \
       --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
       --build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
       --build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \
       --build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \
       --build-arg "VULKAN_SDK_VERSION=${VULKAN_SDK_VERSION}" \
       --build-arg "SWIFTSHADER=${SWIFTSHADER}" \
       --build-arg "CMAKE_VERSION=${CMAKE_VERSION:-}" \
       --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
       --build-arg "KATEX=${KATEX:-}" \
@@ -497,7 +519,6 @@ docker build \
       --build-arg "UCX_COMMIT=${UCX_COMMIT}" \
       --build-arg "UCC_COMMIT=${UCC_COMMIT}" \
       --build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \
       --build-arg "PIP_CMAKE=${PIP_CMAKE}" \
       --build-arg "TRITON=${TRITON}" \
       --build-arg "TRITON_CPU=${TRITON_CPU}" \
       --build-arg "ONNX=${ONNX}" \
@@ -523,7 +544,7 @@ docker build \
UBUNTU_VERSION=$(echo ${UBUNTU_VERSION} | sed 's/-rc$//')

function drun() {
  docker run --rm "$tmp_tag" "$@"
  docker run --rm "$tmp_tag" $*
}

if [[ "$OS" == "ubuntu" ]]; then
@@ -571,14 +592,3 @@ if [ -n "$KATEX" ]; then
    exit 1
  fi
fi

HAS_TRITON=$(drun python -c "import triton" > /dev/null 2>&1 && echo "yes" || echo "no")
if [[ -n "$TRITON" || -n "$TRITON_CPU" ]]; then
  if [ "$HAS_TRITON" = "no" ]; then
    echo "expecting triton to be installed, but it is not"
    exit 1
  fi
elif [ "$HAS_TRITON" = "yes" ]; then
  echo "expecting triton to not be installed, but it is"
  exit 1
fi
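The removed triton check above uses the `cmd && echo yes || echo no` idiom to turn an import attempt into a flag. A generalized sketch of the same probe, assuming the `drun` helper defined earlier in this script:

```bash
# Probe whether a Python module is importable inside the built image.
has_module() {
  drun python -c "import $1" > /dev/null 2>&1 && echo "yes" || echo "no"
}

if [[ "$(has_module triton)" == "no" ]]; then
  echo "expecting triton to be installed, but it is not"
  exit 1
fi
```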

@@ -55,6 +55,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
RUN rm install_protobuf.sh
ENV INSTALLED_PROTOBUF ${PROTOBUF}

# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}

# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@@ -68,7 +75,7 @@ COPY ./common/install_rocm.sh install_rocm.sh
RUN bash ./install_rocm.sh
RUN rm install_rocm.sh
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
RUN bash ./install_rocm_magma.sh
RUN rm install_rocm_magma.sh
COPY ./common/install_amdsmi.sh install_amdsmi.sh
RUN bash ./install_amdsmi.sh

@@ -1 +1 @@
4022ff142a5392aa5197e05f4dfe85d356f742bf
01a22b6f16d117454b7d21ebdc691b0785b84a7f

@@ -4,10 +4,16 @@ set -ex

if [ -n "$CLANG_VERSION" ]; then

  if [[ $UBUNTU_VERSION == 22.04 ]]; then
  if [[ $CLANG_VERSION == 9 && $UBUNTU_VERSION == 18.04 ]]; then
    sudo apt-get update
    # gpg-agent is not available by default on 18.04
    sudo apt-get install  -y --no-install-recommends gpg-agent
    wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add  -
    apt-add-repository "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-${CLANG_VERSION} main"
  elif [[ $UBUNTU_VERSION == 22.04 ]]; then
    # work around ubuntu apt-get conflicts
    sudo apt-get -y -f install
    wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
    wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add  -
    if [[ $CLANG_VERSION == 18 ]]; then
      apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main"
    fi
@@ -35,7 +41,7 @@ if [ -n "$CLANG_VERSION" ]; then
  # clang's packaging is a little messed up (the runtime libs aren't
  # added into the linker path), so give it a little help
  clang_lib=("/usr/lib/llvm-$CLANG_VERSION/lib/clang/"*"/lib/linux")
  echo "$clang_lib" >/etc/ld.so.conf.d/clang.conf
  echo "$clang_lib" > /etc/ld.so.conf.d/clang.conf
  ldconfig

  # Cleanup package manager

@@ -62,7 +62,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then

  # libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30
  # which is provided in libstdcxx 12 and up.
  conda_install libstdcxx-ng=12.3.0 --update-deps -c conda-forge
  conda_install libstdcxx-ng=12.3.0 -c conda-forge

  # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
  if [[ $(uname -m) == "aarch64" ]]; then

@@ -7,7 +7,7 @@ PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/hea
GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py

# Python versions to be installed in /opt/$VERSION_NO
CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"}
CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"}

function check_var {
    if [ -z "$1" ]; then

@@ -2,6 +2,7 @@

set -ex

NCCL_VERSION=v2.26.2-1
CUDNN_VERSION=9.5.1.17

function install_cusparselt_040 {
@@ -39,7 +40,8 @@ function install_cusparselt_063 {

function install_118 {
    CUDNN_VERSION=9.1.0.70
    echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.4.0"
    NCCL_VERSION=v2.21.5-1
    echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
    rm -rf /usr/local/cuda-11.8 /usr/local/cuda
    # install CUDA 11.8.0 in the same container
    wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
@@ -57,7 +59,14 @@ function install_118 {
    cd ..
    rm -rf tmp_cudnn

    CUDA_VERSION=11.8 bash install_nccl.sh
    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
    git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
    cd nccl && make -j src.build
    cp -a build/include/* /usr/local/cuda/include/
    cp -a build/lib/* /usr/local/cuda/lib64/
    cd ..
    rm -rf nccl

    install_cusparselt_040

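One way to confirm which NCCL the resulting toolchain picked up after a source build like the one above (a sketch; assumes a PyTorch build with CUDA is present):

```bash
# Inspect the headers that the build copied into the CUDA tree ...
grep -E '#define NCCL_(MAJOR|MINOR|PATCH)' /usr/local/cuda/include/nccl.h
# ... or ask PyTorch which NCCL it was linked against.
python -c "import torch; print(torch.cuda.nccl.version())"
```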
@@ -66,7 +75,7 @@ function install_118 {

function install_124 {
  CUDNN_VERSION=9.1.0.70
  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.2"
  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
  # install CUDA 12.4.1 in the same container
  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run
@@ -84,7 +93,14 @@ function install_124 {
  cd ..
  rm -rf tmp_cudnn

  CUDA_VERSION=12.4 bash install_nccl.sh
  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
  cd nccl && make -j src.build
  cp -a build/include/* /usr/local/cuda/include/
  cp -a build/lib/* /usr/local/cuda/lib64/
  cd ..
  rm -rf nccl

  install_cusparselt_062

@@ -92,7 +108,7 @@ function install_124 {
}

function install_126 {
  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
  rm -rf /usr/local/cuda-12.6 /usr/local/cuda
  # install CUDA 12.6.3 in the same container
  wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run
@@ -110,7 +126,14 @@ function install_126 {
  cd ..
  rm -rf tmp_cudnn

  CUDA_VERSION=12.6 bash install_nccl.sh
  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
  cd nccl && make -j src.build
  cp -a build/include/* /usr/local/cuda/include/
  cp -a build/lib/* /usr/local/cuda/lib64/
  cd ..
  rm -rf nccl

  install_cusparselt_063

@@ -217,8 +240,8 @@ function prune_126 {
}

function install_128 {
  CUDNN_VERSION=9.8.0.87
  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
  CUDNN_VERSION=9.7.1.26
  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
  rm -rf /usr/local/cuda-12.8 /usr/local/cuda
  # install CUDA 12.8.0 in the same container
  wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux.run
@@ -236,7 +259,14 @@ function install_128 {
  cd ..
  rm -rf tmp_cudnn

  CUDA_VERSION=12.8 bash install_nccl.sh
  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
  cd nccl && make -j src.build
  cp -a build/include/* /usr/local/cuda/include/
  cp -a build/lib/* /usr/local/cuda/lib64/
  cd ..
  rm -rf nccl

  install_cusparselt_063


@@ -3,6 +3,7 @@

set -ex

NCCL_VERSION=v2.26.2-1
CUDNN_VERSION=9.8.0.87

function install_cusparselt_063 {
@@ -17,7 +18,7 @@ function install_cusparselt_063 {
}

function install_128 {
  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
  rm -rf /usr/local/cuda-12.8 /usr/local/cuda
  # install CUDA 12.8.0 in the same container
  wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux_sbsa.run
@@ -35,7 +36,14 @@ function install_128 {
  cd ..
  rm -rf tmp_cudnn

  CUDA_VERSION=12.8 bash install_nccl.sh
  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
  git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
  cd nccl && make -j src.build
  cp -a build/include/* /usr/local/cuda/include/
  cp -a build/lib/* /usr/local/cuda/lib64/
  cd ..
  rm -rf nccl

  install_cusparselt_063


@@ -5,7 +5,7 @@ if [[ -n "${CUDNN_VERSION}" ]]; then
    mkdir tmp_cudnn
    pushd tmp_cudnn
    if [[ ${CUDA_VERSION:0:4} == "12.8" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.8.0.87_cuda12-archive"
        CUDNN_NAME="cudnn-linux-x86_64-9.7.1.26_cuda12-archive"
    elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.5.1.17_cuda12-archive"
    elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then

.ci/docker/common/install_db.sh (new executable file, 38 lines)
@@ -0,0 +1,38 @@
#!/bin/bash

set -ex

install_ubuntu() {
  apt-get update

  # Cleanup
  apt-get autoclean && apt-get clean
  rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
}

install_centos() {
  # Need EPEL for many packages we depend on.
  # See http://fedoraproject.org/wiki/EPEL
  yum --enablerepo=extras install -y epel-release

  # Cleanup
  yum clean all
  rm -rf /var/cache/yum
  rm -rf /var/lib/yum/yumdb
  rm -rf /var/lib/yum/history
}

# Install base packages depending on the base OS
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
  ubuntu)
    install_ubuntu
    ;;
  centos)
    install_centos
    ;;
  *)
    echo "Unable to determine OS..."
    exit 1
    ;;
esac
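The OS detection in the new `install_db.sh` parses `/etc/os-release` with `grep -oP`. An equivalent sketch that sources the file instead, since it is itself valid shell:

```bash
# /etc/os-release is shell-parsable; sourcing it defines ID, VERSION_ID, etc.
. /etc/os-release
case "$ID" in
  ubuntu)  echo "Debian family: use apt-get" ;;
  centos)  echo "RHEL family: use yum" ;;
  *)       echo "Unable to determine OS..." >&2; exit 1 ;;
esac
```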
@@ -50,7 +50,8 @@ setup_executorch() {
  pushd executorch

  export PYTHON_EXECUTABLE=python
  export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
  export EXECUTORCH_BUILD_PYBIND=ON
  export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

  as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
  popd

@@ -35,9 +35,7 @@ git clone https://github.com/halide/Halide.git
pushd Halide
git checkout ${COMMIT} && git submodule update --init --recursive
pip_install -r requirements.txt
# NOTE: pybind has a requirement for cmake > 3.5 so set the minimum cmake version here with a flag
#       Context: https://github.com/pytorch/pytorch/issues/150420
cmake -G Ninja -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE=Release -S . -B build
cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build
cmake --build build
test -e ${CONDA_PREFIX}/lib/python3 || ln -s python${ANACONDA_PYTHON_VERSION} ${CONDA_PREFIX}/lib/python3
cmake --install build --prefix ${CONDA_PREFIX}

@@ -2,6 +2,8 @@

set -ex

source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

if [ -n "${UBUNTU_VERSION}" ]; then
  apt update
  apt-get install -y clang doxygen git graphviz nodejs npm libtinfo5
@@ -13,8 +15,8 @@ chown -R jenkins pytorch

pushd pytorch
# Install all linter dependencies
pip install -r requirements.txt
lintrunner init
pip_install -r requirements.txt
conda_run lintrunner init

# Cache .lintbin directory as part of the Docker image
cp -r .lintbin /tmp

@@ -1,26 +0,0 @@
#!/bin/bash

set -ex

NCCL_VERSION=""
if [[ ${CUDA_VERSION:0:2} == "11" ]]; then
  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt)
elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt)
else
  echo "Unexpected CUDA_VERSION ${CUDA_VERSION}"
  exit 1
fi

if [[ -n "${NCCL_VERSION}" ]]; then
  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
  pushd nccl
  make -j src.build
  cp -a build/include/* /usr/local/cuda/include/
  cp -a build/lib/* /usr/local/cuda/lib64/
  popd
  rm -rf nccl
  ldconfig
fi
@@ -1,18 +0,0 @@
#!/bin/bash
set -ex

apt-get update
# Use deadsnakes in case we need an older python version
sudo add-apt-repository ppa:deadsnakes/ppa
apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python3-pip python${PYTHON_VERSION}-venv

# Use a venv because uv and some other package managers don't support --user install
ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
python -m venv /var/lib/jenkins/ci_env
source /var/lib/jenkins/ci_env/bin/activate

python -mpip install --upgrade pip
python -mpip install -r /opt/requirements-ci.txt
if [ -n "${PIP_CMAKE}" ]; then
  python -mpip install cmake==3.31.6
fi
@@ -8,6 +8,10 @@ ver() {

install_ubuntu() {
    apt-get update
    if [[ $UBUNTU_VERSION == 18.04 ]]; then
      # gpg-agent is not available by default on 18.04
      apt-get install -y --no-install-recommends gpg-agent
    fi
    if [[ $UBUNTU_VERSION == 20.04 ]]; then
      # gpg-agent is not available by default on 20.04
      apt-get install -y --no-install-recommends gpg-agent

@@ -25,9 +25,7 @@ python3 -m pip install meson ninja
###########################
### clone repo
###########################
# TEMPORARY FIX: https://gitlab.freedesktop.org/mesa/drm.git is down until 2025/03/22
# GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
GIT_SSL_NO_VERIFY=true git clone git://anongit.freedesktop.org/mesa/drm
GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
pushd drm

###########################

@@ -1,28 +1,50 @@
#!/usr/bin/env bash
# Script used only in CD pipeline
#!/bin/bash
# Script used in CI and CD pipeline

set -eou pipefail
set -ex

function do_install() {
    rocm_version=$1
    rocm_version_nodot=${1//./}
# Magma build scripts need `python`
ln -sf /usr/bin/python3 /usr/bin/python

    # Version 2.7.2 + ROCm related updates
    MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
    magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
  almalinux)
    yum install -y gcc-gfortran
    ;;
  *)
    echo "No preinstalls to build magma..."
    ;;
esac

    rocm_dir="/opt/rocm"
    (
        set -x
        tmp_dir=$(mktemp -d)
        pushd ${tmp_dir}
        curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
        tar -xvf "${magma_archive}"
        mkdir -p "${rocm_dir}/magma"
        mv include "${rocm_dir}/magma/include"
        mv lib "${rocm_dir}/magma/lib"
        popd
    )
}
MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}

do_install $1
# "install" hipMAGMA into /opt/rocm/magma by copying after build
git clone https://bitbucket.org/icl/magma.git
pushd magma

# Version 2.7.2 + ROCm related updates
git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6

cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then
    echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc
fi
echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc
echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc
export PATH="${PATH}:/opt/rocm/bin"
if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
  amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'`
else
  amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs`
fi
for arch in $amdgpu_targets; do
  echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc
done
# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition
sed -i 's/^FOPENMP/#FOPENMP/g' make.inc
make -f make.gen.hipMAGMA -j $(nproc)
LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}"
make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}"
popd
mv magma /opt/rocm
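The removed `do_install()` derived its dot-free version string with bash pattern substitution (`${1//./}`); a standalone illustration of the expansion:

```bash
rocm_version="6.2.4"
echo "${rocm_version//./}"   # 624  -- '//' replaces every '.'
echo "${rocm_version/./}"    # 62.4 -- single '/' replaces only the first '.'
```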

.ci/docker/common/install_swiftshader.sh (new executable file, 24 lines)
@@ -0,0 +1,24 @@
#!/bin/bash

set -ex

[ -n "${SWIFTSHADER}" ]

retry () {
    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}

_https_amazon_aws=https://ossci-android.s3.amazonaws.com

# SwiftShader
_swiftshader_dir=/var/lib/jenkins/swiftshader
_swiftshader_file_targz=swiftshader-abe07b943-prebuilt.tar.gz
mkdir -p $_swiftshader_dir
_tmp_swiftshader_targz="/tmp/${_swiftshader_file_targz}"

curl --silent --show-error --location --fail --retry 3 \
  --output "${_tmp_swiftshader_targz}" "$_https_amazon_aws/${_swiftshader_file_targz}"

tar -C "${_swiftshader_dir}" -xzf "${_tmp_swiftshader_targz}"

export VK_ICD_FILENAMES="${_swiftshader_dir}/build/Linux/vk_swiftshader_icd.json"
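The `retry` helper above chains `$*` through `||` with doubling sleeps. A loop-based variant with the same backoff, shown as a sketch (`"$@"` avoids the word-splitting caveat noted earlier; the URL is a placeholder):

```bash
retry_loop() {
  local delay=1 attempt
  for attempt in 1 2 3 4 5; do
    "$@" && return 0                 # success: stop retrying
    (( attempt < 5 )) && sleep "$delay"
    delay=$((delay * 2))             # sleeps 1s, 2s, 4s, 8s between attempts
  done
  return 1
}

retry_loop curl --fail --silent --output /tmp/archive.tar.gz "https://example.com/archive.tar.gz"
```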
@@ -2,12 +2,6 @@

set -ex

mkdir -p /opt/triton
if [ -z "${TRITON}" ] && [ -z "${TRITON_CPU}" ]; then
  echo "TRITON and TRITON_CPU are not set. Exiting..."
  exit 0
fi

source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

get_conda_version() {
@@ -58,7 +52,6 @@ cd triton
as_jenkins git checkout ${TRITON_PINNED_COMMIT}
as_jenkins git submodule update --init --recursive
cd python
pip_install pybind11==2.13.6

# TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527
as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py
@@ -67,22 +60,17 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}"
  # Triton needs at least gcc-9 to build
  apt-get install -y g++-9

  CXX=g++-9 conda_run python setup.py bdist_wheel
  CXX=g++-9 pip_install .
elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then
  # Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
  add-apt-repository -y ppa:ubuntu-toolchain-r/test
  apt-get install -y g++-9

  CXX=g++-9 conda_run python setup.py bdist_wheel
  CXX=g++-9 pip_install .
else
  conda_run python setup.py bdist_wheel
  pip_install .
fi

# Copy the wheel to /opt for multi stage docker builds
cp dist/*.whl /opt/triton
# Install the wheel for docker builds that don't use multi stage
pip_install dist/*.whl

if [ -n "${CONDA_CMAKE}" ]; then
  # TODO: This is to make sure that the same cmake and numpy version from install conda
  # script is used. Without this step, the newer cmake version (3.25.2) downloaded by

.ci/docker/common/install_vulkan_sdk.sh (new executable file, 24 lines)
@@ -0,0 +1,24 @@
#!/bin/bash

set -ex

[ -n "${VULKAN_SDK_VERSION}" ]

retry () {
    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}

_vulkansdk_dir=/var/lib/jenkins/vulkansdk
_tmp_vulkansdk_targz=/tmp/vulkansdk.tar.gz

curl \
  --silent \
  --show-error \
  --location \
  --fail \
  --retry 3 \
  --output "${_tmp_vulkansdk_targz}" "https://ossci-android.s3.amazonaws.com/vulkansdk-linux-x86_64-${VULKAN_SDK_VERSION}.tar.gz"

mkdir -p "${_vulkansdk_dir}"
tar -C "${_vulkansdk_dir}" -xzf "${_tmp_vulkansdk_targz}" --strip-components 1
rm -rf "${_tmp_vulkansdk_targz}"
@@ -47,6 +47,9 @@ function install_ubuntu() {
    # Development Packages
    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
    # Install Intel Support Packages
    if [[ "$XPU_VERSION" == "2025.0" ]]; then
        XPU_PACKAGES="${XPU_PACKAGES} intel-oneapi-dnnl=2025.0.1-6"
    fi
    apt-get install -y ${XPU_PACKAGES}

    # Cleanup
@@ -82,6 +85,9 @@ gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.
EOF

    # Install Intel Support Packages
    if [[ "$XPU_VERSION" == "2025.0" ]]; then
        XPU_PACKAGES="${XPU_PACKAGES} intel-oneapi-dnnl-2025.0.1-6"
    fi
    yum install -y ${XPU_PACKAGES}
    # The xpu-smi packages
    dnf install -y xpu-smi
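Note the two pinning syntaxes in the hunks above: apt pins with `name=version`, while yum/dnf joins name and version with a dash. For reference (a sketch showing the same package both ways):

```bash
apt-get install -y intel-oneapi-dnnl=2025.0.1-6   # Debian/Ubuntu: '=' separates name and version
yum install -y intel-oneapi-dnnl-2025.0.1-6       # RHEL/CentOS: '-' joins name and version
```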

@@ -49,8 +49,6 @@ RUN bash ./install_mkl.sh && rm install_mkl.sh
FROM cpu as cuda
ADD ./common/install_cuda.sh install_cuda.sh
ADD ./common/install_magma.sh install_magma.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
ENV CUDA_HOME /usr/local/cuda

FROM cuda as cuda11.8
@@ -74,7 +72,6 @@ RUN bash ./install_magma.sh 12.8
RUN ln -sf /usr/local/cuda-12.8 /usr/local/cuda

FROM cpu as rocm
ARG ROCM_VERSION
ARG PYTORCH_ROCM_ARCH
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
ENV MKLROOT /opt/intel
@@ -93,7 +90,7 @@ RUN apt-get update -y && \
    apt-get clean

RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh

FROM ${BASE_TARGET} as final
COPY --from=openssl            /opt/openssl           /opt/openssl

@@ -40,7 +40,7 @@ case ${GPU_ARCH_TYPE} in
        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
        GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
        ;;
    *)
        echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"

@@ -18,30 +18,28 @@ COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh

# Install conda and other packages (e.g., numpy, pytest)
ARG PYTHON_VERSION
ARG PIP_CMAKE
# Put venv into the env vars so users don't need to activate it
ENV PATH /var/lib/jenkins/ci_env/bin:$PATH
ENV VIRTUAL_ENV /var/lib/jenkins/ci_env
COPY requirements-ci.txt /opt/requirements-ci.txt
COPY ./common/install_python.sh install_python.sh
RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt
ARG ANACONDA_PYTHON_VERSION
ARG CONDA_CMAKE
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
COPY requirements-ci.txt /opt/conda/requirements-ci.txt
COPY ./common/install_conda.sh install_conda.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ./common/install_magma_conda.sh install_magma_conda.sh
RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt

# Install cuda and cudnn
ARG CUDA_VERSION
COPY ./common/install_cuda.sh install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
ENV DESIRED_CUDA ${CUDA_VERSION}
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

# Note that Docker build forbids copying file outside the build context
COPY ./common/install_linter.sh install_linter.sh
COPY ./common/common_utils.sh common_utils.sh
RUN bash ./install_linter.sh
RUN rm install_linter.sh

RUN chown -R jenkins:jenkins /var/lib/jenkins/ci_env
RUN rm install_linter.sh common_utils.sh

USER jenkins
CMD ["bash"]

@@ -15,18 +15,20 @@ COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh

# Install conda and other packages (e.g., numpy, pytest)
ARG PYTHON_VERSION
ARG PIP_CMAKE
ENV PATH /var/lib/jenkins/ci_env/bin:$PATH
ENV VIRTUAL_ENV /var/lib/jenkins/ci_env
COPY requirements-ci.txt /opt/requirements-ci.txt
COPY ./common/install_python.sh install_python.sh
RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt
ARG ANACONDA_PYTHON_VERSION
ARG CONDA_CMAKE
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
COPY requirements-ci.txt /opt/conda/requirements-ci.txt
COPY ./common/install_conda.sh install_conda.sh
COPY ./common/common_utils.sh common_utils.sh
RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt

# Note that Docker build forbids copying file outside the build context
COPY ./common/install_linter.sh install_linter.sh
COPY ./common/common_utils.sh common_utils.sh
RUN bash ./install_linter.sh
RUN rm install_linter.sh
RUN rm install_linter.sh common_utils.sh

USER jenkins
CMD ["bash"]

@@ -64,9 +64,7 @@ FROM base as cuda
ARG BASE_CUDA_VERSION=10.2
# Install CUDA
ADD ./common/install_cuda.sh install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh

FROM base as intel
# MKL
@@ -197,6 +195,6 @@ RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
# cmake3 is needed for the MIOpen build
RUN ln -sf /usr/local/bin/cmake /usr/bin/cmake3
ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
ADD ./common/install_miopen.sh install_miopen.sh
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh

@@ -36,9 +36,7 @@ FROM base as cuda
ARG BASE_CUDA_VERSION=11.8
# Install CUDA
ADD ./common/install_cuda.sh install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu*
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh

FROM base as intel
# MKL
@@ -160,7 +158,7 @@ ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
ENV MKLROOT /opt/intel
ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
ADD ./common/install_miopen.sh install_miopen.sh
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh


@@ -67,9 +67,7 @@ FROM base as cuda
ARG BASE_CUDA_VERSION
# Install CUDA
ADD ./common/install_cuda_aarch64.sh install_cuda_aarch64.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh install_nccl.sh ci_commit_pins/nccl-cu*
RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh

FROM base as magma
ARG BASE_CUDA_VERSION

@@ -42,7 +42,6 @@ RUN yum install -y \
  llvm-devel \
  libzstd-devel \
  python3.12-devel \
  python3.12-test \
  python3.12-setuptools \
  python3.12-pip \
  python3-virtualenv \
@@ -102,33 +101,24 @@ CMD ["/bin/bash"]

# install test dependencies:
# - grpcio requires system openssl, bundled crypto fails to build
# - ml_dtypes 0.4.0 requires some fixes provided in later commits to build
RUN dnf install -y \
  protobuf-devel \
  protobuf-c-devel \
  protobuf-lite-devel \
  hdf5-devel \
  python3-h5py \
  git
  wget \
  patch

RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio

# cmake-3.28.0 from pip for onnxruntime
RUN python3 -mpip install cmake==3.28.0

# build onnxruntime 1.21.0 from sources.
# it is not possible to build it from sources using pip,
# so just build it from upstream repository.
# h5py is dependency of onnxruntime_training.
# h5py==3.11.0 builds with hdf5-devel 1.10.5 from repository.
# install newest flatbuffers version first:
# for some reason old version is getting pulled in otherwise.
# packaging package is required for onnxruntime wheel build.
RUN pip3 install flatbuffers && \
  pip3 install h5py==3.11.0 && \
  pip3 install packaging && \
  git clone https://github.com/microsoft/onnxruntime && \
  cd onnxruntime && git checkout v1.21.0 && \
RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio==1.65.4
RUN cd ~ && \
  git clone https://github.com/jax-ml/ml_dtypes && \
  cd ml_dtypes && \
  git checkout v0.4.0 && \
  git submodule update --init --recursive && \
  ./build.sh --config Release --parallel 0 --enable_pybind --build_wheel --enable_training --enable_training_apis --enable_training_ops --skip_tests --allow_running_as_root && \
  pip3 install ./build/Linux/Release/dist/onnxruntime_training-*.whl && \
  cd .. && /bin/rm -rf ./onnxruntime
  wget https://github.com/jax-ml/ml_dtypes/commit/b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \
  wget https://github.com/jax-ml/ml_dtypes/commit/d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \
  patch -p1 < b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \
  patch -p1 < d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \
  python3 setup.py bdist_wheel && \
  pip3 install dist/*.whl && \
  rm -rf ml_dtypes
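The ml_dtypes recipe above fetches GitHub commit patches with wget and applies them with `patch -p1`. The same commits could be applied in one step with git, as an alternative sketch (the SHAs are the ones referenced above; run from inside the ml_dtypes checkout):

```bash
curl -fsSL https://github.com/jax-ml/ml_dtypes/commit/b969f76914d6b30676721bc92bf0f6021a0d1321.patch | git apply
curl -fsSL https://github.com/jax-ml/ml_dtypes/commit/d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch | git apply
```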

@@ -41,14 +41,11 @@ fbscribelogger==0.1.7
#Pinned versions: 0.1.6
#test that import:

flatbuffers==2.0 ; platform_machine != "s390x"
flatbuffers==2.0
#Description: cross platform serialization library
#Pinned versions: 2.0
#test that import:

flatbuffers ; platform_machine == "s390x"
#Description: cross platform serialization library; Newer version is required on s390x for new python version

hypothesis==5.35.1
# Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
#Description: advanced library for generating parametrized tests
@@ -105,10 +102,10 @@ networkx==2.8.8
#Pinned versions: 2.8.8
#test that import: functorch

ninja==1.11.1.3
#Description: build system. Used in some tests. Used in build to generate build
#time tracing information
#Pinned versions: 1.11.1.3
#ninja
#Description: build system.  Note that it install from
#here breaks things so it is commented out
#Pinned versions: 1.10.0.post1
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py

numba==0.49.0 ; python_version < "3.9"
@@ -356,7 +353,7 @@ parameterized==0.8.1
#Pinned versions: 1.24.0
#test that import: test_sac_estimator.py

pwlf==2.2.1
pwlf==2.2.1 ; python_version >= "3.8"
#Description: required for testing torch/distributed/_tools/sac_estimator.py
#Pinned versions: 2.2.1
#test that import: test_sac_estimator.py
@@ -368,9 +365,10 @@ PyYAML
pyzstd
setuptools

ninja==1.11.1 ; platform_machine == "aarch64"
scons==4.5.2 ; platform_machine == "aarch64"

pulp==2.9.0
pulp==2.9.0 ; python_version >= "3.8"
#Description: required for testing ilp formulaiton under torch/distributed/_tools
#Pinned versions: 2.9.0
#test that import: test_sac_ilp.py
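The `; platform_machine == "s390x"` and `; python_version >= "3.8"` suffixes above are PEP 508 environment markers, evaluated by pip at install time. To see what the markers resolve to on the current host (a sketch):

```bash
python -c 'import platform; print("platform_machine =", platform.machine())'
python -c 'import platform; print("python_version   =", platform.python_version())'
```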

@@ -2,7 +2,7 @@ ARG UBUNTU_VERSION
ARG CUDA_VERSION
ARG IMAGE_NAME

FROM ${IMAGE_NAME} as base
FROM ${IMAGE_NAME}

ARG UBUNTU_VERSION
ARG CUDA_VERSION
@@ -50,6 +50,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
RUN rm install_protobuf.sh
ENV INSTALLED_PROTOBUF ${PROTOBUF}

# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}

# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@@ -90,20 +97,14 @@ RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
RUN rm install_cmake.sh

ARG TRITON

FROM base as triton-builder
# Install triton, this needs to be done before sccache because the latter will
# try to reach out to S3, which docker build runners don't have access
COPY ./common/install_triton.sh install_triton.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/triton.txt triton.txt
COPY triton_version.txt triton_version.txt
RUN bash ./install_triton.sh

FROM base as final
COPY --from=triton-builder /opt/triton /opt/triton
RUN if [ -n "${TRITON}" ]; then pip install /opt/triton/*.whl; chown -R jenkins:jenkins /opt/conda; fi
RUN rm -rf /opt/triton
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt

ARG HALIDE
# Build and install halide
@@ -158,16 +159,6 @@ COPY ./common/install_cusparselt.sh install_cusparselt.sh
RUN bash install_cusparselt.sh
RUN rm install_cusparselt.sh

# Install NCCL
ARG CUDA_VERSION
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash install_nccl.sh
RUN rm install_nccl.sh /ci_commit_pins/nccl-cu*
ENV USE_SYSTEM_NCCL=1
ENV NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"

# Install CUDSS
ARG CUDA_VERSION
COPY ./common/install_cudss.sh install_cudss.sh

@@ -50,6 +50,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
RUN rm install_protobuf.sh
ENV INSTALLED_PROTOBUF ${PROTOBUF}

# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}

# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@@ -63,7 +70,7 @@ COPY ./common/install_rocm.sh install_rocm.sh
RUN bash ./install_rocm.sh
RUN rm install_rocm.sh
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
RUN bash ./install_rocm_magma.sh
RUN rm install_rocm_magma.sh
ADD ./common/install_miopen.sh install_miopen.sh
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh

@@ -77,6 +77,13 @@ COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt

# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}

# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./

@@ -1,6 +1,6 @@
ARG UBUNTU_VERSION

FROM ubuntu:${UBUNTU_VERSION} as base
FROM ubuntu:${UBUNTU_VERSION}

ARG UBUNTU_VERSION

@@ -52,16 +52,9 @@ RUN  bash ./install_lcov.sh && rm install_lcov.sh
# Install cuda and cudnn
ARG CUDA_VERSION
COPY ./common/install_cuda.sh install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
ENV DESIRED_CUDA ${CUDA_VERSION}
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
# No effect if cuda not installed
ENV USE_SYSTEM_NCCL=1
ENV NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"


# (optional) Install UCC
ARG UCX_COMMIT
@@ -81,6 +74,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
RUN rm install_protobuf.sh
ENV INSTALLED_PROTOBUF ${PROTOBUF}

# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}

# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@@ -88,6 +88,18 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
ENV INSTALLED_VISION ${VISION}

# (optional) Install Vulkan SDK
ARG VULKAN_SDK_VERSION
COPY ./common/install_vulkan_sdk.sh install_vulkan_sdk.sh
RUN if [ -n "${VULKAN_SDK_VERSION}" ]; then bash ./install_vulkan_sdk.sh; fi
RUN rm install_vulkan_sdk.sh

# (optional) Install swiftshader
ARG SWIFTSHADER
COPY ./common/install_swiftshader.sh install_swiftshader.sh
RUN if [ -n "${SWIFTSHADER}" ]; then bash ./install_swiftshader.sh; fi
| RUN rm install_swiftshader.sh | ||||
|  | ||||
| # (optional) Install non-default CMake version | ||||
| ARG CMAKE_VERSION | ||||
| COPY ./common/install_cmake.sh install_cmake.sh | ||||
| @ -115,21 +127,20 @@ RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_d | ||||
| RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt | ||||
|  | ||||
| ARG TRITON | ||||
| ARG TRITON_CPU | ||||
|  | ||||
| # Create a separate stage for building Triton and Triton-CPU.  install_triton | ||||
| # will check for the presence of env vars | ||||
| FROM base as triton-builder | ||||
| # Install triton; this needs to be done before sccache because the latter will | ||||
| # try to reach out to S3, which docker build runners don't have access to | ||||
| COPY ./common/install_triton.sh install_triton.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ci_commit_pins/triton.txt triton.txt | ||||
| COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt | ||||
| RUN bash ./install_triton.sh | ||||
| RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi | ||||
| RUN rm install_triton.sh common_utils.sh triton.txt | ||||
|  | ||||
| FROM base as final | ||||
| COPY --from=triton-builder /opt/triton /opt/triton | ||||
| RUN if [ -n "${TRITON}" ] || [ -n "${TRITON_CPU}" ]; then pip install /opt/triton/*.whl; chown -R jenkins:jenkins /opt/conda; fi | ||||
| RUN rm -rf /opt/triton | ||||
| ARG TRITON_CPU | ||||
| COPY ./common/install_triton.sh install_triton.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt | ||||
| RUN if [ -n "${TRITON_CPU}" ]; then bash ./install_triton.sh; fi | ||||
| RUN rm install_triton.sh common_utils.sh triton-cpu.txt | ||||
|  | ||||
| ARG EXECUTORCH | ||||
| # Build and install executorch | ||||
|  | ||||
							
								
								
									
.ci/magma-rocm/.gitignore (vendored, 2 lines changed)
							| @ -1,2 +0,0 @@ | ||||
| output/ | ||||
| magma-rocm*/ | ||||
| @ -1,35 +0,0 @@ | ||||
| SHELL=/usr/bin/env bash | ||||
|  | ||||
| DOCKER_CMD ?= docker | ||||
| DESIRED_ROCM ?= 6.3 | ||||
| DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM)) | ||||
| PACKAGE_NAME = magma-rocm | ||||
| # inherit this from the underlying docker image; do not pass this env var to docker | ||||
| #PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201 | ||||
|  | ||||
| DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ | ||||
| 	-v $(shell git rev-parse --show-toplevel)/.ci:/builder \ | ||||
| 	-w /builder \ | ||||
| 	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_ROCM_SHORT} \ | ||||
| 	-e DESIRED_ROCM=${DESIRED_ROCM} \ | ||||
| 	"pytorch/manylinux2_28-builder:rocm${DESIRED_ROCM}-main" \ | ||||
| 	magma-rocm/build_magma.sh | ||||
|  | ||||
| .PHONY: all | ||||
| all: magma-rocm63 | ||||
| all: magma-rocm624 | ||||
|  | ||||
| .PHONY: | ||||
| clean: | ||||
| 	$(RM) -r magma-* | ||||
| 	$(RM) -r output | ||||
|  | ||||
| .PHONY: magma-rocm63 | ||||
| magma-rocm63: DESIRED_ROCM := 6.3 | ||||
| magma-rocm63: | ||||
| 	$(DOCKER_RUN) | ||||
|  | ||||
| .PHONY: magma-rocm624 | ||||
| magma-rocm624: DESIRED_ROCM := 6.2.4 | ||||
| magma-rocm624: | ||||
| 	$(DOCKER_RUN) | ||||
| @ -1,48 +0,0 @@ | ||||
| # Magma ROCm | ||||
|  | ||||
| This folder contains the scripts and configurations to build libmagma.so, linked for various versions of ROCm. | ||||
|  | ||||
| ## Building | ||||
|  | ||||
| Look in the `Makefile` for available targets to build. To build any target, for example `magma-rocm63`, run | ||||
|  | ||||
| ``` | ||||
| # Using `docker` | ||||
| make magma-rocm63 | ||||
|  | ||||
| # Using `podman` | ||||
| DOCKER_CMD=podman make magma-rocm63 | ||||
| ``` | ||||
|  | ||||
| This spawns a container from the `pytorch/manylinux2_28-builder:rocm<version>` docker image, which has the required `devtoolset` and ROCm versions installed. | ||||
| Within the container, it runs `build_magma.sh` with the correct environment variables set, which packages the necessary files | ||||
| into a tarball with the following structure: | ||||
|  | ||||
| ``` | ||||
| . | ||||
| ├── include       # header files | ||||
| ├── lib           # libmagma.so | ||||
| ├── info | ||||
| │   ├── licenses  # license file | ||||
| │   └── recipe    # build script | ||||
| ``` | ||||
|  | ||||
| More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the ROCm version. | ||||
| The resulting binaries end up in the `output` folder. | ||||
|  | ||||
|  | ||||
| ## Pushing | ||||
|  | ||||
| Packages can be uploaded to an S3 bucket using: | ||||
|  | ||||
| ``` | ||||
| aws s3 cp output/*/magma-rocm*.bz2 <bucket-with-path> | ||||
| ``` | ||||
|  | ||||
| If you do not have upload permissions, please ping @seemethere or @soumith to gain access | ||||
|  | ||||
| ## New versions | ||||
|  | ||||
| New ROCm versions can be added by creating a new make target with the next desired version. For ROCm version N.n, the target should be named `magma-rocmNn`. | ||||
|  | ||||
| Make sure to edit the appropriate environment variables (e.g., DESIRED_ROCM) in the `Makefile` accordingly. Remember also to check `build_magma.sh` to ensure the logic for copying over the files remains correct. | ||||
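|  | ||||
| For example, a hypothetical ROCm 6.4 target would follow the same pattern as the existing ones (the version number below is purely illustrative): | ||||
|  | ||||
| ``` | ||||
| .PHONY: magma-rocm64 | ||||
| all: magma-rocm64 | ||||
|  | ||||
| magma-rocm64: DESIRED_ROCM := 6.4 | ||||
| magma-rocm64: | ||||
| 	$(DOCKER_RUN) | ||||
| ``` | ||||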
| @ -1,42 +0,0 @@ | ||||
| #!/usr/bin/env bash | ||||
|  | ||||
| set -eou pipefail | ||||
|  | ||||
| # Environment variables | ||||
| # The script expects DESIRED_CUDA and PACKAGE_NAME to be set | ||||
| ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" | ||||
|  | ||||
| # Version 2.7.2 + ROCm related updates | ||||
| MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 | ||||
|  | ||||
| # Folders for the build | ||||
| PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata | ||||
| PACKAGE_DIR=${ROOT_DIR}/magma-rocm/${PACKAGE_NAME} # build workspace | ||||
| PACKAGE_OUTPUT=${ROOT_DIR}/magma-rocm/output # where tarballs are stored | ||||
| PACKAGE_BUILD=${PACKAGE_DIR} # where the content of the tarball is prepared | ||||
| PACKAGE_RECIPE=${PACKAGE_BUILD}/info/recipe | ||||
| PACKAGE_LICENSE=${PACKAGE_BUILD}/info/licenses | ||||
| mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RECIPE} ${PACKAGE_LICENSE} | ||||
|  | ||||
| # Fetch magma sources at the pinned commit | ||||
| pushd ${PACKAGE_DIR} | ||||
| git clone https://bitbucket.org/icl/magma.git | ||||
| pushd magma | ||||
| git checkout ${MAGMA_VERSION} | ||||
| popd | ||||
| popd | ||||
|  | ||||
| # build | ||||
| pushd ${PACKAGE_DIR}/magma | ||||
| # The build.sh script expects to be executed from the sources root folder | ||||
| INSTALL_DIR=${PACKAGE_BUILD} ${PACKAGE_FILES}/build.sh | ||||
| popd | ||||
|  | ||||
| # Package recipe, license and tarball | ||||
| # Folder and package name are backward compatible for the build workflow | ||||
| cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh | ||||
| cp ${PACKAGE_DIR}/magma/COPYRIGHT ${PACKAGE_LICENSE}/COPYRIGHT | ||||
| pushd ${PACKAGE_BUILD} | ||||
| tar cjf ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2 include lib info | ||||
| echo Built in ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2 | ||||
| popd | ||||
| @ -1,38 +0,0 @@ | ||||
| # Magma build scripts need `python` | ||||
| ln -sf /usr/bin/python3 /usr/bin/python | ||||
|  | ||||
| ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') | ||||
| case "$ID" in | ||||
|   almalinux) | ||||
|     yum install -y gcc-gfortran | ||||
|     ;; | ||||
|   *) | ||||
|     echo "No preinstalls to build magma..." | ||||
|     ;; | ||||
| esac | ||||
|  | ||||
| MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION} | ||||
|  | ||||
| cp make.inc-examples/make.inc.hip-gcc-mkl make.inc | ||||
| echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc | ||||
| if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then | ||||
|     echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc | ||||
| fi | ||||
| echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc | ||||
| echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc | ||||
| export PATH="${PATH}:/opt/rocm/bin" | ||||
| if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then | ||||
|   amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'` | ||||
| else | ||||
|   amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs` | ||||
| fi | ||||
| for arch in $amdgpu_targets; do | ||||
|   echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc | ||||
| done | ||||
| # hipcc with the openmp flag may cause isnan() on __device__ not to be found; depending on context, the compiler may attempt to match it with the host definition | ||||
| sed -i 's/^FOPENMP/#FOPENMP/g' make.inc | ||||
| make -f make.gen.hipMAGMA -j $(nproc) | ||||
| LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}" | ||||
| make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}" | ||||
| cp -R lib ${INSTALL_DIR} | ||||
| cp -R include ${INSTALL_DIR} | ||||
| @ -111,6 +111,12 @@ case ${DESIRED_PYTHON} in | ||||
|     ;; | ||||
| esac | ||||
|  | ||||
| if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then | ||||
|     export _GLIBCXX_USE_CXX11_ABI=1 | ||||
| else | ||||
|     export _GLIBCXX_USE_CXX11_ABI=0 | ||||
| fi | ||||
|  | ||||
| if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then | ||||
|     echo "Calling build_amd.py at $(date)" | ||||
|     python tools/amd_build/build_amd.py | ||||
| @ -203,6 +209,12 @@ if [[ -n "$BUILD_PYTHONLESS" ]]; then | ||||
|  | ||||
|     mkdir -p /tmp/$LIBTORCH_HOUSE_DIR | ||||
|  | ||||
|     if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then | ||||
|         LIBTORCH_ABI="cxx11-abi-" | ||||
|     else | ||||
|         LIBTORCH_ABI= | ||||
|     fi | ||||
|  | ||||
|     zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch | ||||
|     cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \ | ||||
|        /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip | ||||
|  | ||||
| @ -95,6 +95,12 @@ python setup.py clean | ||||
| retry pip install -qr requirements.txt | ||||
| retry pip install -q numpy==2.0.1 | ||||
|  | ||||
| if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then | ||||
|     export _GLIBCXX_USE_CXX11_ABI=1 | ||||
| else | ||||
|     export _GLIBCXX_USE_CXX11_ABI=0 | ||||
| fi | ||||
|  | ||||
| if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then | ||||
|     echo "Calling build_amd.py at $(date)" | ||||
|     python tools/amd_build/build_amd.py | ||||
| @ -163,6 +169,12 @@ fi | ||||
|  | ||||
| ) | ||||
|  | ||||
| if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then | ||||
|     LIBTORCH_ABI="cxx11-abi-" | ||||
| else | ||||
|     LIBTORCH_ABI= | ||||
| fi | ||||
|  | ||||
| ( | ||||
|     set -x | ||||
|  | ||||
|  | ||||
| @ -35,7 +35,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then | ||||
| fi | ||||
|  | ||||
| if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then | ||||
|   if [[ "$BUILD_ENVIRONMENT" != *clang* ]]; then | ||||
|   if [[ "$BUILD_ENVIRONMENT" != *cuda11.3* && "$BUILD_ENVIRONMENT" != *clang* ]]; then | ||||
|     # TODO: there is a linking issue when building with UCC using clang, | ||||
|     # disable it for now; to be fixed later. | ||||
|     # TODO: disable UCC temporarily to enable CUDA 12.1 in CI | ||||
| @ -277,8 +277,10 @@ else | ||||
|     # or building non-XLA tests. | ||||
|     if [[ "$BUILD_ENVIRONMENT" != *rocm*  && | ||||
|           "$BUILD_ENVIRONMENT" != *xla* ]]; then | ||||
|       # Install numpy-2.0.2 for builds which are backward compatible with 1.X | ||||
|       python -mpip install numpy==2.0.2 | ||||
|       if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then | ||||
|         # Install numpy-2.0.2 for builds which are backward compatible with 1.X | ||||
|         python -mpip install numpy==2.0.2 | ||||
|       fi | ||||
|  | ||||
|       WERROR=1 python setup.py clean | ||||
|  | ||||
| @ -301,18 +303,6 @@ else | ||||
|     fi | ||||
|     pip_install_whl "$(echo dist/*.whl)" | ||||
|  | ||||
|     if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then | ||||
|       echo "Checking that xpu is compiled" | ||||
|       pushd dist/ | ||||
|       if python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)'; then | ||||
|         echo "XPU support is compiled in." | ||||
|       else | ||||
|         echo "XPU support is NOT compiled in." | ||||
|         exit 1 | ||||
|       fi | ||||
|       popd | ||||
|     fi | ||||
|  | ||||
|     # TODO: I'm not sure why, but somehow we lose verbose commands | ||||
|     set -x | ||||
|  | ||||
|  | ||||
| @ -59,16 +59,78 @@ else | ||||
|   export install_root="$(dirname $(which python))/../lib/python${py_dot}/site-packages/torch/" | ||||
| fi | ||||
|  | ||||
| ############################################################################### | ||||
| # Setup XPU ENV | ||||
| ############################################################################### | ||||
| if [[ "$DESIRED_CUDA" == 'xpu' ]]; then | ||||
|   set +u | ||||
|   # Refer https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html | ||||
|   source /opt/intel/oneapi/compiler/latest/env/vars.sh | ||||
|   source /opt/intel/oneapi/pti/latest/env/vars.sh | ||||
| fi | ||||
|  | ||||
| ############################################################################### | ||||
| # Check GCC ABI | ||||
| ############################################################################### | ||||
|  | ||||
| # NOTE: As of https://github.com/pytorch/pytorch/issues/126551 we only produce | ||||
| #       wheels with cxx11-abi | ||||
| # NOTE [ Building libtorch with old vs. new gcc ABI ] | ||||
| # | ||||
| # Packages built with one version of ABI could not be linked against by client | ||||
| # C++ libraries that were compiled using the other version of ABI. Since both | ||||
| # gcc ABIs are still common in the wild, we need to support both ABIs. Currently: | ||||
| # | ||||
| # - All the nightlies built on CentOS 7 + devtoolset7 use the old gcc ABI. | ||||
| # - All the nightlies built on Ubuntu 16.04 + gcc 5.4 use the new gcc ABI. | ||||
|  | ||||
| echo "Checking that the gcc ABI is what we expect" | ||||
| if [[ "$(uname)" != 'Darwin' ]]; then | ||||
|   # We also check that there are cxx11 symbols in libtorch | ||||
|   function is_expected() { | ||||
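|     # Echoes "1" when the parsed ABI flag ($1) matches what this build is | ||||
|     # expected to use; echoes nothing otherwise. | ||||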
|     if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* || "$DESIRED_CUDA" == *"rocm"* ]]; then | ||||
|       if [[ "$1" -gt 0 || "$1" == "ON " ]]; then | ||||
|         echo 1 | ||||
|       fi | ||||
|     else | ||||
|       if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then | ||||
|         echo 1 | ||||
|       fi | ||||
|     fi | ||||
|   } | ||||
|  | ||||
|   # First we check that the env var in TorchConfig.cmake is correct | ||||
|  | ||||
|   # We search for D_GLIBCXX_USE_CXX11_ABI=1 in torch/TorchConfig.cmake | ||||
|   torch_config="${install_root}/share/cmake/Torch/TorchConfig.cmake" | ||||
|   if [[ ! -f "$torch_config" ]]; then | ||||
|     echo "No TorchConfig.cmake found!" | ||||
|     ls -lah "$install_root/share/cmake/Torch" | ||||
|     exit 1 | ||||
|   fi | ||||
|   echo "Checking the TorchConfig.cmake" | ||||
|   cat "$torch_config" | ||||
|  | ||||
|   # The sed call below: | ||||
|   # -n | ||||
|   #   don't print lines by default (only print the line we want) | ||||
|   # e | ||||
|   #   execute the following expression | ||||
|   # s/.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/\1/p | ||||
|   #   replace lines that match with the first capture group and print. | ||||
|   #   The pattern is: any characters, D_GLIBCXX_USE_CXX11_ABI=, exactly one | ||||
|   #   character, a quote, then any characters. | ||||
|   #   Note the exactly one character after the '='. If the variable is not | ||||
|   #   set, the '=' is followed immediately by a '"', the line fails the | ||||
|   #   match, and nothing is printed; this is what we want. Otherwise the | ||||
|   #   0 or 1 after the '=' is captured, the matched line is replaced with | ||||
|   #   the capture group, and it is printed. | ||||
|   actual_gcc_abi="$(sed -ne 's/.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/\1/p' < "$torch_config")" | ||||
|   if [[ "$(is_expected "$actual_gcc_abi")" != 1 ]]; then | ||||
|     echo "gcc ABI $actual_gcc_abi not as expected." | ||||
|     exit 1 | ||||
|   fi | ||||
|  | ||||
|   # We also check that there are [not] cxx11 symbols in libtorch | ||||
|   # | ||||
|   echo "Checking that symbols in libtorch.so have the right gcc abi" | ||||
|   python3 "$(dirname ${BASH_SOURCE[0]})/smoke_test/check_binary_symbols.py" | ||||
| @ -146,11 +208,35 @@ setup_link_flags () { | ||||
|  | ||||
| TEST_CODE_DIR="$(dirname $(realpath ${BASH_SOURCE[0]}))/test_example_code" | ||||
| build_and_run_example_cpp () { | ||||
|   if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then | ||||
|     GLIBCXX_USE_CXX11_ABI=1 | ||||
|   else | ||||
|     GLIBCXX_USE_CXX11_ABI=0 | ||||
|   fi | ||||
|   setup_link_flags | ||||
|   g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1 | ||||
|   g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -D_GLIBCXX_USE_CXX11_ABI=$GLIBCXX_USE_CXX11_ABI -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1 | ||||
|   ./$1 | ||||
| } | ||||
|  | ||||
| build_example_cpp_with_incorrect_abi () { | ||||
|   if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then | ||||
|     GLIBCXX_USE_CXX11_ABI=0 | ||||
|   else | ||||
|     GLIBCXX_USE_CXX11_ABI=1 | ||||
|   fi | ||||
|   set +e | ||||
|   setup_link_flags | ||||
|   g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -D_GLIBCXX_USE_CXX11_ABI=$GLIBCXX_USE_CXX11_ABI -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1 | ||||
|   ERRCODE=$? | ||||
|   set -e | ||||
|   if [ "$ERRCODE" -eq "0" ]; then | ||||
|     echo "Building example with incorrect ABI didn't throw error. Aborting." | ||||
|     exit 1 | ||||
|   else | ||||
|     echo "Building example with incorrect ABI throws expected error. Proceeding." | ||||
|   fi | ||||
| } | ||||
|  | ||||
| ############################################################################### | ||||
| # Check simple Python/C++ calls | ||||
| ############################################################################### | ||||
| @ -160,6 +246,11 @@ if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then | ||||
|     export LD_LIBRARY_PATH=/usr/local/cuda/lib64 | ||||
|   fi | ||||
|   build_and_run_example_cpp simple-torch-test | ||||
|   # `_GLIBCXX_USE_CXX11_ABI` is always ignored by gcc in devtoolset7, so we test | ||||
|   # the expected failure case for Ubuntu 16.04 + gcc 5.4 only. | ||||
|   if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then | ||||
|     build_example_cpp_with_incorrect_abi simple-torch-test | ||||
|   fi | ||||
| else | ||||
|   pushd /tmp | ||||
|   python -c 'import torch' | ||||
| @ -216,14 +307,6 @@ else | ||||
|   fi | ||||
| fi | ||||
|  | ||||
| ############################################################################### | ||||
| # Check XPU configured correctly | ||||
| ############################################################################### | ||||
| if [[ "$DESIRED_CUDA" == 'xpu' && "$PACKAGE_TYPE" != 'libtorch' ]]; then | ||||
|   echo "Checking that xpu is compiled" | ||||
|   python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)' | ||||
| fi | ||||
|  | ||||
| ############################################################################### | ||||
| # Check CUDA configured correctly | ||||
| ############################################################################### | ||||
| @ -302,19 +385,10 @@ except RuntimeError as e: | ||||
| fi | ||||
|  | ||||
| ############################################################################### | ||||
| # Check for C++ ABI compatibility to GCC-11 | ||||
| # Check for C++ ABI compatibility between gcc7 and gcc9 compiled binaries | ||||
| ############################################################################### | ||||
| if [[ "$(uname)" == 'Linux' &&  "$PACKAGE_TYPE" == 'manywheel' ]]; then | ||||
|   pushd /tmp | ||||
|   # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html gcc-11 is ABI16 | ||||
|   # Though manylinux_2.28 should have been build with gcc-14, per | ||||
|   # https://github.com/pypa/manylinux?tab=readme-ov-file#manylinux_2_28-almalinux-8-based | ||||
|   # On s390x gcc 14 is used because it contains a fix for the interaction | ||||
|   # between precompiled headers and vectorization builtins. | ||||
|   # This fix is not available in earlier gcc versions. | ||||
|   # gcc-14 uses ABI19. | ||||
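|   # Hence the _cxxabi1016 check below is skipped on s390x. | ||||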
|   if [[ "$(uname -m)" != "s390x" ]]; then | ||||
|     python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi1016' else 1)" | ||||
|   fi | ||||
|   python -c "import torch; exit(0 if torch.compiled_with_cxx11_abi() else (0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi1011' else 1))" | ||||
|   popd | ||||
| fi | ||||
|  | ||||
| @ -202,7 +202,7 @@ function install_torchrec_and_fbgemm() { | ||||
|  | ||||
| function clone_pytorch_xla() { | ||||
|   if [[ ! -d ./xla ]]; then | ||||
|     git clone --recursive --quiet https://github.com/pytorch/xla.git | ||||
|     git clone --recursive -b r2.7 https://github.com/pytorch/xla.git | ||||
|     pushd xla | ||||
|     # pin the xla hash so that we don't get broken by changes to xla | ||||
|     git checkout "$(cat ../.github/ci_commit_pins/xla.txt)" | ||||
|  | ||||
| @ -33,11 +33,55 @@ if which sccache > /dev/null; then | ||||
|   export PATH="${tmp_dir}:$PATH" | ||||
| fi | ||||
|  | ||||
| cross_compile_arm64() { | ||||
|   # Cross compilation for arm64 | ||||
|   # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests | ||||
|   # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448 | ||||
|   USE_DISTRIBUTED=0 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel | ||||
| } | ||||
|  | ||||
| compile_arm64() { | ||||
|   # Compilation for arm64 | ||||
|   # TODO: Compile with OpenMP support (but this causes CI regressions as cross-compilation were done with OpenMP disabled) | ||||
|   USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel | ||||
| } | ||||
|  | ||||
| compile_x86_64() { | ||||
|   USE_DISTRIBUTED=0 WERROR=1 python setup.py bdist_wheel --plat-name=macosx_10_9_x86_64 | ||||
| } | ||||
|  | ||||
| build_lite_interpreter() { | ||||
|     echo "Testing libtorch (lite interpreter)." | ||||
|  | ||||
|     CPP_BUILD="$(pwd)/../cpp_build" | ||||
|     # Ensure the removal of the tmp directory | ||||
|     trap 'rm -rfv ${CPP_BUILD}' EXIT | ||||
|     rm -rf "${CPP_BUILD}" | ||||
|     mkdir -p "${CPP_BUILD}/caffe2" | ||||
|  | ||||
|     # It looks like libtorch needs to be built in the "${CPP_BUILD}/caffe2" folder. | ||||
|     BUILD_LIBTORCH_PY=$PWD/tools/build_libtorch.py | ||||
|     pushd "${CPP_BUILD}/caffe2" || exit | ||||
|     VERBOSE=1 DEBUG=1 python "${BUILD_LIBTORCH_PY}" | ||||
|     popd || exit | ||||
|  | ||||
|     "${CPP_BUILD}/caffe2/build/bin/test_lite_interpreter_runtime" | ||||
| } | ||||
|  | ||||
| print_cmake_info | ||||
|  | ||||
| # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests | ||||
| # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448 | ||||
| USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel | ||||
| if [[ ${BUILD_ENVIRONMENT} = *arm64* ]]; then | ||||
|   if [[ $(uname -m) == "arm64" ]]; then | ||||
|     compile_arm64 | ||||
|   else | ||||
|     cross_compile_arm64 | ||||
|   fi | ||||
| elif [[ ${BUILD_ENVIRONMENT} = *lite-interpreter* ]]; then | ||||
|   export BUILD_LITE_INTERPRETER=1 | ||||
|   build_lite_interpreter | ||||
| else | ||||
|   compile_x86_64 | ||||
| fi | ||||
|  | ||||
| if which sccache > /dev/null; then | ||||
|   print_sccache_stats | ||||
|  | ||||
| @ -80,7 +80,7 @@ def grep_symbols(lib: str, patterns: list[Any]) -> list[str]: | ||||
|         return functools.reduce(list.__add__, (x.result() for x in tasks), []) | ||||
|  | ||||
|  | ||||
| def check_lib_symbols_for_abi_correctness(lib: str) -> None: | ||||
| def check_lib_symbols_for_abi_correctness(lib: str, pre_cxx11_abi: bool = True) -> None: | ||||
|     print(f"lib: {lib}") | ||||
|     cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS) | ||||
|     pre_cxx11_symbols = grep_symbols(lib, LIBTORCH_PRE_CXX11_PATTERNS) | ||||
| @ -88,12 +88,28 @@ def check_lib_symbols_for_abi_correctness(lib: str) -> None: | ||||
|     num_pre_cxx11_symbols = len(pre_cxx11_symbols) | ||||
|     print(f"num_cxx11_symbols: {num_cxx11_symbols}") | ||||
|     print(f"num_pre_cxx11_symbols: {num_pre_cxx11_symbols}") | ||||
|     if num_pre_cxx11_symbols > 0: | ||||
|         raise RuntimeError( | ||||
|             f"Found pre-cxx11 symbols, but there shouldn't be any, see: {pre_cxx11_symbols[:100]}" | ||||
|     if pre_cxx11_abi: | ||||
|         if num_cxx11_symbols > 0: | ||||
|             raise RuntimeError( | ||||
|                 f"Found cxx11 symbols, but there shouldn't be any, see: {cxx11_symbols[:100]}" | ||||
|             ) | ||||
|         if num_pre_cxx11_symbols < 1000: | ||||
|             raise RuntimeError("Didn't find enough pre-cxx11 symbols.") | ||||
|         # Check for no recursive iterators, regression test for https://github.com/pytorch/pytorch/issues/133437 | ||||
|         rec_iter_symbols = grep_symbols( | ||||
|             lib, [re.compile("std::filesystem::recursive_directory_iterator.*")] | ||||
|         ) | ||||
|     if num_cxx11_symbols < 100: | ||||
|         raise RuntimeError("Didn't find enought cxx11 symbols") | ||||
|         if len(rec_iter_symbols) > 0: | ||||
|             raise RuntimeError( | ||||
|                 f"recursive_directory_iterator in used pre-CXX11 binaries, see; {rec_iter_symbols}" | ||||
|             ) | ||||
|     else: | ||||
|         if num_pre_cxx11_symbols > 0: | ||||
|             raise RuntimeError( | ||||
|                 f"Found pre-cxx11 symbols, but there shouldn't be any, see: {pre_cxx11_symbols[:100]}" | ||||
|             ) | ||||
|         if num_cxx11_symbols < 100: | ||||
|             raise RuntimeError("Didn't find enought cxx11 symbols") | ||||
|  | ||||
|  | ||||
| def main() -> None: | ||||
| @ -105,8 +121,9 @@ def main() -> None: | ||||
|         else: | ||||
|             install_root = Path(distutils.sysconfig.get_python_lib()) / "torch" | ||||
|  | ||||
|     libtorch_cpu_path = str(install_root / "lib" / "libtorch_cpu.so") | ||||
|     check_lib_symbols_for_abi_correctness(libtorch_cpu_path) | ||||
|     libtorch_cpu_path = install_root / "lib" / "libtorch_cpu.so" | ||||
|     pre_cxx11_abi = "cxx11-abi" not in os.getenv("DESIRED_DEVTOOLSET", "") | ||||
|     check_lib_symbols_for_abi_correctness(libtorch_cpu_path, pre_cxx11_abi) | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|  | ||||
| @ -7,7 +7,6 @@ import subprocess | ||||
| import sys | ||||
| from pathlib import Path | ||||
| from tempfile import NamedTemporaryFile | ||||
| from typing import Optional | ||||
|  | ||||
| import torch | ||||
| import torch._dynamo | ||||
| @ -77,13 +76,10 @@ def read_release_matrix(): | ||||
|  | ||||
|  | ||||
| def test_numpy(): | ||||
|     try: | ||||
|         import numpy as np | ||||
|     import numpy as np | ||||
|  | ||||
|         x = np.arange(5) | ||||
|         torch.tensor(x) | ||||
|     except ImportError: | ||||
|         print("Numpy check skipped. Numpy is not installed.") | ||||
|     x = np.arange(5) | ||||
|     torch.tensor(x) | ||||
|  | ||||
|  | ||||
| def check_version(package: str) -> None: | ||||
| @ -196,41 +192,8 @@ def test_cuda_gds_errors_captured() -> None: | ||||
|         ) | ||||
|  | ||||
|  | ||||
| def find_pypi_package_version(package: str) -> Optional[str]: | ||||
|     from importlib import metadata | ||||
|  | ||||
|     dists = metadata.distributions() | ||||
|     for dist in dists: | ||||
|         if dist.metadata["Name"].startswith(package): | ||||
|             return dist.version | ||||
|     return None | ||||
|  | ||||
|  | ||||
| def cudnn_to_version_str(cudnn_version: int) -> str: | ||||
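|     # e.g. torch.backends.cudnn.version() == 90800 maps to "9.8.0" | ||||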
|     patch = int(cudnn_version % 10) | ||||
|     minor = int((cudnn_version / 100) % 100) | ||||
|     major = int((cudnn_version / 10000) % 10000) | ||||
|     return f"{major}.{minor}.{patch}" | ||||
|  | ||||
|  | ||||
| def compare_pypi_to_torch_versions( | ||||
|     package: str, pypi_version: str, torch_version: str | ||||
| ) -> None: | ||||
|     if pypi_version is None: | ||||
|         raise RuntimeError(f"Can't find {package} in PyPI for Torch: {torch_version}") | ||||
|     if pypi_version.startswith(torch_version): | ||||
|         print(f"Found matching {package}. Torch: {torch_version} PyPI {pypi_version}") | ||||
|     else: | ||||
|         raise RuntimeError( | ||||
|             f"Wrong {package} version. Torch: {torch_version} PyPI: {pypi_version}" | ||||
|         ) | ||||
|  | ||||
|  | ||||
| def smoke_test_cuda( | ||||
|     package: str, | ||||
|     runtime_error_check: str, | ||||
|     torch_compile_check: str, | ||||
|     pypi_pkg_check: str, | ||||
|     package: str, runtime_error_check: str, torch_compile_check: str | ||||
| ) -> None: | ||||
|     if not torch.cuda.is_available() and is_cuda_system: | ||||
|         raise RuntimeError(f"Expected CUDA {gpu_arch_ver}. However CUDA is not loaded.") | ||||
| @ -260,30 +223,20 @@ def smoke_test_cuda( | ||||
|             raise RuntimeError( | ||||
|                 f"Wrong CUDA version. Loaded: {torch.version.cuda} Expected: {gpu_arch_ver}" | ||||
|             ) | ||||
|  | ||||
|         print(f"torch cuda: {torch.version.cuda}") | ||||
|         # todo add cudnn version validation | ||||
|         print(f"torch cudnn: {torch.backends.cudnn.version()}") | ||||
|         print(f"cuDNN enabled? {torch.backends.cudnn.enabled}") | ||||
|  | ||||
|         torch.cuda.init() | ||||
|         print("CUDA initialized successfully") | ||||
|         print(f"Number of CUDA devices: {torch.cuda.device_count()}") | ||||
|         for i in range(torch.cuda.device_count()): | ||||
|             print(f"Device {i}: {torch.cuda.get_device_name(i)}") | ||||
|  | ||||
|         print(f"cuDNN enabled? {torch.backends.cudnn.enabled}") | ||||
|         torch_cudnn_version = cudnn_to_version_str(torch.backends.cudnn.version()) | ||||
|         print(f"Torch cuDNN version: {torch_cudnn_version}") | ||||
|  | ||||
|         # nccl is available only on Linux | ||||
|         if sys.platform in ["linux", "linux2"]: | ||||
|             torch_nccl_version = ".".join(str(v) for v in torch.cuda.nccl.version()) | ||||
|             print(f"Torch nccl; version: {torch_nccl_version}") | ||||
|  | ||||
|         # PyPI dependencies are installed on Linux only, and nccl is available only on Linux. | ||||
|         if pypi_pkg_check == "enabled" and sys.platform in ["linux", "linux2"]: | ||||
|             compare_pypi_to_torch_versions( | ||||
|                 "cudnn", find_pypi_package_version("nvidia-cudnn"), torch_cudnn_version | ||||
|             ) | ||||
|             compare_pypi_to_torch_versions( | ||||
|                 "nccl", find_pypi_package_version("nvidia-nccl"), torch_nccl_version | ||||
|             ) | ||||
|             print(f"torch nccl version: {torch.cuda.nccl.version()}") | ||||
|  | ||||
|         if runtime_error_check == "enabled": | ||||
|             test_cuda_runtime_errors_captured() | ||||
| @ -442,13 +395,6 @@ def parse_args(): | ||||
|         choices=["enabled", "disabled"], | ||||
|         default="enabled", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--pypi-pkg-check", | ||||
|         help="Check pypi package versions cudnn and nccl", | ||||
|         type=str, | ||||
|         choices=["enabled", "disabled"], | ||||
|         default="enabled", | ||||
|     ) | ||||
|     return parser.parse_args() | ||||
|  | ||||
|  | ||||
| @ -464,7 +410,6 @@ def main() -> None: | ||||
|     smoke_test_conv2d() | ||||
|     test_linalg() | ||||
|     test_numpy() | ||||
|  | ||||
|     if is_cuda_system: | ||||
|         test_linalg("cuda") | ||||
|         test_cuda_gds_errors_captured() | ||||
| @ -473,10 +418,7 @@ def main() -> None: | ||||
|         smoke_test_modules() | ||||
|  | ||||
|     smoke_test_cuda( | ||||
|         options.package, | ||||
|         options.runtime_error_check, | ||||
|         options.torch_compile_check, | ||||
|         options.pypi_pkg_check, | ||||
|         options.package, options.runtime_error_check, options.torch_compile_check | ||||
|     ) | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -1175,6 +1175,7 @@ build_xla() { | ||||
|   # These functions are defined in .circleci/common.sh in pytorch/xla repo | ||||
|   retry install_pre_deps_pytorch_xla $XLA_DIR $USE_CACHE | ||||
|   CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch:${CMAKE_PREFIX_PATH}" XLA_SANDBOX_BUILD=1 build_torch_xla $XLA_DIR | ||||
|   retry install_post_deps_pytorch_xla | ||||
|   assert_git_not_dirty | ||||
| } | ||||
|  | ||||
| @ -1474,7 +1475,8 @@ test_executorch() { | ||||
|   pushd /executorch | ||||
|  | ||||
|   export PYTHON_EXECUTABLE=python | ||||
|   export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" | ||||
|   export EXECUTORCH_BUILD_PYBIND=ON | ||||
|   export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" | ||||
|  | ||||
|   # For llama3 | ||||
|   bash examples/models/llama3_2_vision/install_requirements.sh | ||||
| @ -1526,27 +1528,6 @@ test_linux_aarch64() { | ||||
|        --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose | ||||
| } | ||||
|  | ||||
| test_operator_benchmark() { | ||||
|   TEST_REPORTS_DIR=$(pwd)/test/test-reports | ||||
|   mkdir -p "$TEST_REPORTS_DIR" | ||||
|   TEST_DIR=$(pwd) | ||||
|  | ||||
|   test_inductor_set_cpu_affinity | ||||
|  | ||||
|   cd benchmarks/operator_benchmark/pt_extension | ||||
|   python setup.py install | ||||
|  | ||||
|   cd "${TEST_DIR}"/benchmarks/operator_benchmark | ||||
|   $TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \ | ||||
|       --output-dir "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" | ||||
|  | ||||
|   pip_install pandas | ||||
|   python check_perf_csv.py \ | ||||
|       --actual "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \ | ||||
|       --expected "expected_ci_operator_benchmark_eager_float32_cpu.csv" | ||||
| } | ||||
|  | ||||
|  | ||||
| if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then | ||||
|   (cd test && python -c "import torch; print(torch.__config__.show())") | ||||
|   (cd test && python -c "import torch; print(torch.__config__.parallel_info())") | ||||
| @ -1577,19 +1558,6 @@ elif [[ "$TEST_CONFIG" == distributed ]]; then | ||||
|   if [[ "${SHARD_NUMBER}" == 1 ]]; then | ||||
|     test_rpc | ||||
|   fi | ||||
| elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then | ||||
|   TEST_MODE="short" | ||||
|  | ||||
|   if [[ "${TEST_CONFIG}" == *cpu* ]]; then | ||||
|     if [[ "${TEST_CONFIG}" == *long* ]]; then | ||||
|       TEST_MODE="long" | ||||
|     elif [[ "${TEST_CONFIG}" == *all* ]]; then | ||||
|       TEST_MODE="all" | ||||
|     fi | ||||
|  | ||||
|     test_operator_benchmark cpu ${TEST_MODE} | ||||
|  | ||||
|   fi | ||||
| elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then | ||||
|   test_inductor_distributed | ||||
| elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then | ||||
| @ -1652,7 +1620,6 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then | ||||
|   install_torchvision | ||||
|   checkout_install_torchbench hf_T5 llama moco | ||||
|   PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER" | ||||
|   test_inductor_aoti | ||||
| elif [[ "${TEST_CONFIG}" == *inductor* ]]; then | ||||
|   install_torchvision | ||||
|   test_inductor_shard "${SHARD_NUMBER}" | ||||
|  | ||||
| @ -42,6 +42,7 @@ if "%DESIRED_PYTHON%" == "3.12" set "PYTHON_INSTALLER_URL=https://www.python.org | ||||
| if "%DESIRED_PYTHON%" == "3.11" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.11.0/python-3.11.0-amd64.exe" | ||||
| if "%DESIRED_PYTHON%" == "3.10" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.10.0/python-3.10.0-amd64.exe" | ||||
| if "%DESIRED_PYTHON%" == "3.9" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.9.0/python-3.9.0-amd64.exe" | ||||
| if "%DESIRED_PYTHON%" == "3.8" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.8.2/python-3.8.2-amd64.exe" | ||||
| if "%PYTHON_INSTALLER_URL%" == "" ( | ||||
|     echo Python %DESIRED_PYTHON% not supported yet | ||||
| ) | ||||
|  | ||||
| @ -90,17 +90,8 @@ fi | ||||
| /pytorch/.ci/pytorch/check_binary.sh | ||||
|  | ||||
| if [[ "\$GPU_ARCH_TYPE" != *s390x* && "\$GPU_ARCH_TYPE" != *xpu* && "\$GPU_ARCH_TYPE" != *rocm*  && "$PACKAGE_TYPE" != libtorch ]]; then | ||||
|  | ||||
|   torch_pkg_size="$(ls -1 /final_pkgs/torch-* | sort |tail -1 |xargs wc -c |cut -d ' ' -f1)" | ||||
|   # todo: implement check for large binaries | ||||
|   # if the package is larger than 1.5GB, we disable the pypi check. | ||||
|   # this package contains all libraries packaged in torch libs folder | ||||
|   # example of such package is https://download.pytorch.org/whl/cu126_full/torch | ||||
|   if [[ "\$torch_pkg_size" -gt  1500000000 ]]; then | ||||
|     python /pytorch/.ci/pytorch/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled --pypi-pkg-check disabled | ||||
|   else | ||||
|     python /pytorch/.ci/pytorch/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled $extra_parameters | ||||
|   fi | ||||
|   # Exclude s390x, xpu, rocm and libtorch builds from smoke testing | ||||
|   python /pytorch/.ci/pytorch/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled | ||||
| fi | ||||
|  | ||||
| # Clean temp files | ||||
|  | ||||
| @ -55,16 +55,12 @@ s3_upload() { | ||||
|     s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/" | ||||
|   fi | ||||
|   ( | ||||
|     cache_control_flag="" | ||||
|     if [[ "${UPLOAD_CHANNEL}" = "test" ]]; then | ||||
|       cache_control_flag="--cache-control='no-cache,no-store,must-revalidate'" | ||||
|     fi | ||||
|     for pkg in ${PKG_DIR}/*.${extension}; do | ||||
|       ( | ||||
|         set -x | ||||
|         shm_id=$(sha256sum "${pkg}" | awk '{print $1}') | ||||
|         ${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}" \ | ||||
|           --metadata "checksum-sha256=${shm_id}" ${cache_control_flag} | ||||
|           --metadata "checksum-sha256=${shm_id}" | ||||
|       ) | ||||
|     done | ||||
|   ) | ||||
|  | ||||
| @ -48,10 +48,12 @@ misc-*, | ||||
| -misc-no-recursion, | ||||
| -misc-non-private-member-variables-in-classes, | ||||
| -misc-unused-using-decls, | ||||
| -misc-use-internal-linkage, | ||||
| modernize-*, | ||||
| -modernize-macro-to-enum, | ||||
| -modernize-return-braced-init-list, | ||||
| -modernize-use-auto, | ||||
| -modernize-use-default-member-init, | ||||
| -modernize-use-using, | ||||
| -modernize-use-trailing-return-type, | ||||
| -modernize-use-nodiscard, | ||||
|  | ||||
| @ -1,14 +0,0 @@ | ||||
| root = true | ||||
|  | ||||
| [*] | ||||
| end_of_line = lf | ||||
| insert_final_newline = true | ||||
|  | ||||
| # Python | ||||
| [*.py] | ||||
| indent_style = space | ||||
| indent_size = 4 | ||||
|  | ||||
| # Make | ||||
| [Makefile] | ||||
| indent_style = tab | ||||
							
								
								
									
.github/ISSUE_TEMPLATE/pt2-bug-report.yml (vendored, 2 lines changed)
							| @ -20,7 +20,7 @@ body: | ||||
|  | ||||
|         - Don't compare indices of max/min etc, because that avoids the above requirement | ||||
|  | ||||
|         - When comparing eager and torch.compile, use a higher precision result as a baseline. `torch._dynamo.utils.same` with fp64_ref will handle this comparison. | ||||
|         - If comparing eager and torch.compile at fp16/bf16, you should use fp32 as baseline | ||||
|  | ||||
|         - Ensure rng state used to compare results is equivalent. Use `torch._inductor.config.fallback_random=True` and reset the torch rng seed between comparisons | ||||
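|  | ||||
|         A rough sketch of the higher-precision comparison described above (the toy function `f` here is made up for illustration): | ||||
|  | ||||
|         ```python | ||||
|         import torch | ||||
|         from torch._dynamo.utils import same | ||||
|  | ||||
|         def f(x): | ||||
|             return torch.sin(x) + x | ||||
|  | ||||
|         x = torch.randn(8, dtype=torch.bfloat16) | ||||
|         eager = f(x) | ||||
|         compiled = torch.compile(f)(x) | ||||
|         fp64_ref = f(x.double())  # higher-precision baseline | ||||
|         assert same(eager, compiled, fp64_ref=fp64_ref) | ||||
|         ``` | ||||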
|  | ||||
|  | ||||
							
								
								
									
.github/actionlint.yaml (vendored, 11 lines changed)
							| @ -3,8 +3,8 @@ self-hosted-runner: | ||||
|     # GitHub hosted runner that actionlint doesn't recognize because actionlint version (1.6.21) is too old | ||||
|     - ubuntu-24.04 | ||||
|     # GitHub hosted x86 Linux runners | ||||
|     - linux.24_04.4x | ||||
|     - linux.24_04.16x | ||||
|     - linux.20_04.4x | ||||
|     - linux.20_04.16x | ||||
|     # Organization-wide AWS Linux Runners | ||||
|     - linux.large | ||||
|     - linux.2xlarge | ||||
| @ -45,15 +45,10 @@ self-hosted-runner: | ||||
|     - windows.g5.4xlarge.nvidia.gpu | ||||
|     # Windows ARM64 runners | ||||
|     - windows-11-arm64 | ||||
|     # Organization-wide AMD-hosted runners | ||||
|     # MI2xx runners | ||||
|     # Organization-wide AMD hosted runners | ||||
|     - linux.rocm.gpu | ||||
|     - linux.rocm.gpu.2 | ||||
|     - linux.rocm.gpu.4 | ||||
|     # MI300 runners | ||||
|     - linux.rocm.gpu.mi300.2 | ||||
|     - linux.rocm.gpu.mi300.4 | ||||
|     - rocm-docker | ||||
|     # Repo-specific Apple hosted  runners | ||||
|     - macos-m1-ultra | ||||
|     - macos-m2-14 | ||||
|  | ||||
							
								
								
									
.github/actions/checkout-pytorch/action.yml (vendored, 46 lines changed)
							| @ -23,44 +23,9 @@ runs: | ||||
|       id: check_container_runner | ||||
|       run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT" | ||||
|  | ||||
|     - name: Set up parallel fetch and clean workspace | ||||
|       id: first-clean | ||||
|       continue-on-error: true | ||||
|     - name: Clean workspace | ||||
|       shell: bash | ||||
|       if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} | ||||
|       env: | ||||
|         NO_SUDO: ${{ inputs.no-sudo }} | ||||
|       run: | | ||||
|         # Use all available CPUs for fetching | ||||
|         cd "${GITHUB_WORKSPACE}" | ||||
|         git config --global fetch.parallel 0 | ||||
|         git config --global submodule.fetchJobs 0 | ||||
|  | ||||
|         # Clean workspace. The default checkout action should also do this, but | ||||
|         # do it here as well just in case | ||||
|         if [[ -d .git ]]; then | ||||
|           if [ -z "${NO_SUDO}" ]; then | ||||
|             sudo git clean -ffdx | ||||
|           else | ||||
|             git clean -ffdx | ||||
|           fi | ||||
|         fi | ||||
|  | ||||
|     - name: Checkout PyTorch | ||||
|       id: first-checkout-attempt | ||||
|       continue-on-error: true | ||||
|       uses: actions/checkout@v4 | ||||
|       with: | ||||
|         ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} | ||||
|         # --depth=1 for speed, manually fetch history and other refs as necessary | ||||
|         fetch-depth: ${{ inputs.fetch-depth }} | ||||
|         submodules: ${{ inputs.submodules }} | ||||
|         show-progress: false | ||||
|  | ||||
|     - name: Clean workspace (try again) | ||||
|       if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && | ||||
|         (steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }} | ||||
|       shell: bash | ||||
|       env: | ||||
|         NO_SUDO: ${{ inputs.no-sudo }} | ||||
|       run: | | ||||
| @ -75,11 +40,16 @@ runs: | ||||
|         fi | ||||
|         mkdir "${GITHUB_WORKSPACE}" | ||||
|  | ||||
|     - name: Checkout PyTorch (try again) | ||||
|         # Use all available CPUs for fetching | ||||
|         cd "${GITHUB_WORKSPACE}" | ||||
|         git config --global fetch.parallel 0 | ||||
|         git config --global submodule.fetchJobs 0 | ||||
|  | ||||
|     - name: Checkout PyTorch | ||||
|       uses: actions/checkout@v4 | ||||
|       if: ${{ steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success' }} | ||||
|       with: | ||||
|         ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} | ||||
|         # --depth=1 for speed, manually fetch history and other refs as necessary | ||||
|         fetch-depth: ${{ inputs.fetch-depth }} | ||||
|         submodules: ${{ inputs.submodules }} | ||||
|         show-progress: false | ||||
|  | ||||
							
								
								
									
.github/actions/linux-test/action.yml (vendored, 2 lines changed)
							| @ -66,7 +66,7 @@ runs: | ||||
|  | ||||
|     - name: configure aws credentials | ||||
|       if : ${{ inputs.aws-role-to-assume != '' }} | ||||
|       uses: aws-actions/configure-aws-credentials@v4 | ||||
|       uses: aws-actions/configure-aws-credentials@v3 | ||||
|       with: | ||||
|         role-to-assume: ${{ inputs.aws-role-to-assume }} | ||||
|         role-session-name: gha-linux-test | ||||
|  | ||||
| @ -15,6 +15,7 @@ runs: | ||||
|           -e BINARY_ENV_FILE \ | ||||
|           -e BUILD_ENVIRONMENT \ | ||||
|           -e DESIRED_CUDA \ | ||||
|           -e DESIRED_DEVTOOLSET \ | ||||
|           -e DESIRED_PYTHON \ | ||||
|           -e GITHUB_ACTIONS \ | ||||
|           -e GPU_ARCH_TYPE \ | ||||
|  | ||||
							
								
								
									
.github/actions/upload-test-artifacts/action.yml (vendored, 10 lines changed)
							| @ -48,8 +48,14 @@ runs: | ||||
|       run: | | ||||
|         # Remove any previous usage logs if they exist | ||||
|         rm -f logs-*.zip | ||||
|         zip "logs-${FILE_SUFFIX}.zip" 'usage_log.txt' || true | ||||
|         zip -r "logs-${FILE_SUFFIX}.zip" test/test-reports -i '*.log' || true | ||||
|         # this workflow is also run in the bazel build test, but we don't generate usage reports for it | ||||
|         # so check to see if the file exists first | ||||
|         if [ -f 'usage_log.txt' ]; then | ||||
|             zip "logs-${FILE_SUFFIX}.zip" 'usage_log.txt' | ||||
|         fi | ||||
|         if find "test/test-reports" -name "*.log" 2>/dev/null | grep -q .; then | ||||
|             zip -r "logs-${FILE_SUFFIX}.zip" test/test-reports -i '*.log' | ||||
|         fi | ||||
|  | ||||
|     - name: Zip debugging artifacts for upload | ||||
|       if: runner.os != 'Windows' && !inputs.use-gha | ||||
|  | ||||
							
								
								
									
.github/ci_commit_pins/audio.txt (vendored, 2 lines changed)
							| @ -1 +1 @@ | ||||
| bccaa454a54c3c648697cc2f46a4fb0500b1f01b | ||||
| c670ad81fda266b6598aeeef434583eb98197ae8 | ||||
|  | ||||
							
								
								
									
.github/ci_commit_pins/xla.txt (vendored, 2 lines changed)
							| @ -1 +1 @@ | ||||
| ac9a39f4b768cef09b9d2be8e074be496d7783b6 | ||||
| r2.7 | ||||
|  | ||||
							
								
								
									
.github/merge_rules.yaml (vendored, 3 lines changed)
							| @ -501,9 +501,7 @@ | ||||
| - name: XPU | ||||
|   patterns: | ||||
|   - '**xpu**' | ||||
|   - '**XPU**' | ||||
|   - '**sycl**' | ||||
|   - '**SYCL**' | ||||
|   approved_by: | ||||
|   - EikanWang | ||||
|   - jgong5 | ||||
| @ -540,7 +538,6 @@ | ||||
|   - bdhirsh | ||||
|   - zou3519 | ||||
|   - isuruf | ||||
|   - Chillee | ||||
|   mandatory_checks_name: | ||||
|   - EasyCLA | ||||
|   - Lint | ||||
|  | ||||
							
								
								
									
.github/pytorch-probot.yml (vendored, 2 lines changed)
							| @ -16,7 +16,6 @@ ciflow_push_tags: | ||||
| - ciflow/mps | ||||
| - ciflow/nightly | ||||
| - ciflow/periodic | ||||
| - ciflow/periodic-rocm-mi300 | ||||
| - ciflow/rocm | ||||
| - ciflow/rocm-mi300 | ||||
| - ciflow/s390 | ||||
| @ -26,7 +25,6 @@ ciflow_push_tags: | ||||
| - ciflow/xpu | ||||
| - ciflow/torchbench | ||||
| - ciflow/autoformat | ||||
| - ciflow/op-benchmark | ||||
| retryable_workflows: | ||||
| - pull | ||||
| - trunk | ||||
|  | ||||
							
								
								
									
.github/scripts/amd/package_triton_wheel.sh (vendored, 6 lines changed)
							| @ -61,14 +61,10 @@ fi | ||||
| ROCM_SO=( | ||||
|     "${libamdhip}" | ||||
|     "libhsa-runtime64.so.1" | ||||
|     "libamd_comgr.so.2" | ||||
|     "libdrm.so.2" | ||||
|     "libdrm_amdgpu.so.1" | ||||
| ) | ||||
| if [[ $ROCM_INT -ge 60400 ]]; then | ||||
|     ROCM_SO+=("libamd_comgr.so.3") | ||||
| else | ||||
|     ROCM_SO+=("libamd_comgr.so.2") | ||||
| fi | ||||
|  | ||||
| if [[ $ROCM_INT -ge 60100 ]]; then | ||||
|     ROCM_SO+=("librocprofiler-register.so.0") | ||||
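For context on the ROCM_INT gate above: these scripts appear to encode a ROCm version as major*10000 + minor*100 + patch, so 60400 corresponds to ROCm 6.4.0, which is why one side of the hunk selects libamd_comgr.so.3 only at 60400 and above. A sketch of that encoding, assuming this scheme:

    def rocm_int(version: str) -> int:
        # "6.2.4" -> 60204, "6.4" -> 60400
        parts = [int(p) for p in version.split(".")] + [0, 0]
        return parts[0] * 10000 + parts[1] * 100 + parts[2]

    # The gate from the script, restated in Python:
    comgr = "libamd_comgr.so.3" if rocm_int("6.4.0") >= 60400 else "libamd_comgr.so.2"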
|  | ||||
.github/scripts/filter_test_configs.py (4 changed lines, vendored)
| @ -39,9 +39,9 @@ SUPPORTED_PERIODICAL_MODES: dict[str, Callable[[Optional[str]], bool]] = { | ||||
| } | ||||
|  | ||||
| # The link to the published list of disabled jobs | ||||
| DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json" | ||||
| DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=n.FT07XR3dLMwOLBwmRNquyYSeGk8Het" | ||||
| # and unstable jobs | ||||
| UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json" | ||||
| UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=.Ox7WAXa21I1PVqadHyPfhMRPhl0aCnD" | ||||
|  | ||||
| # Some constants used to handle disabled and unstable jobs | ||||
| JOB_NAME_SEP = "/" | ||||
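Pinning versionId on these S3 URLs freezes the disabled/unstable job lists to one specific uploaded object, so the release branch no longer tracks later uploads. A short sketch of fetching such a pinned object with the standard library (URL taken from the diff):

    import json
    import urllib.request

    URL = (
        "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json"
        "?versionId=n.FT07XR3dLMwOLBwmRNquyYSeGk8Het"
    )
    with urllib.request.urlopen(URL) as resp:
        disabled_jobs = json.load(resp)  # same payload forever, even after newer uploads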
|  | ||||
.github/scripts/generate_binary_build_matrix.py (50 changed lines, vendored)
| @ -34,6 +34,8 @@ ROCM_ARCHES = ["6.2.4", "6.3"] | ||||
|  | ||||
| XPU_ARCHES = ["xpu"] | ||||
|  | ||||
| CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"] | ||||
|  | ||||
| CPU_AARCH64_ARCH = ["cpu-aarch64"] | ||||
|  | ||||
| CPU_S390X_ARCH = ["cpu-s390x"] | ||||
| @ -75,7 +77,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { | ||||
|         "nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
| @ -144,6 +146,8 @@ def arch_type(arch_version: str) -> str: | ||||
|         return "rocm" | ||||
|     elif arch_version in XPU_ARCHES: | ||||
|         return "xpu" | ||||
|     elif arch_version in CPU_CXX11_ABI_ARCH: | ||||
|         return "cpu-cxx11-abi" | ||||
|     elif arch_version in CPU_AARCH64_ARCH: | ||||
|         return "cpu-aarch64" | ||||
|     elif arch_version in CPU_S390X_ARCH: | ||||
| @ -172,23 +176,31 @@ WHEEL_CONTAINER_IMAGES = { | ||||
|     }, | ||||
|     "xpu": f"pytorch/manylinux2_28-builder:xpu-{DEFAULT_TAG}", | ||||
|     "cpu": f"pytorch/manylinux2_28-builder:cpu-{DEFAULT_TAG}", | ||||
|     "cpu-cxx11-abi": f"pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-{DEFAULT_TAG}", | ||||
|     "cpu-aarch64": f"pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-{DEFAULT_TAG}", | ||||
|     "cpu-s390x": f"pytorch/manylinuxs390x-builder:cpu-s390x-{DEFAULT_TAG}", | ||||
| } | ||||
|  | ||||
| CXX11_ABI = "cxx11-abi" | ||||
| RELEASE = "release" | ||||
| DEBUG = "debug" | ||||
|  | ||||
| LIBTORCH_CONTAINER_IMAGES: dict[str, str] = { | ||||
| LIBTORCH_CONTAINER_IMAGES: dict[tuple[str, str], str] = { | ||||
|     **{ | ||||
|         gpu_arch: f"pytorch/libtorch-cxx11-builder:cuda{gpu_arch}-{DEFAULT_TAG}" | ||||
|         ( | ||||
|             gpu_arch, | ||||
|             CXX11_ABI, | ||||
|         ): f"pytorch/libtorch-cxx11-builder:cuda{gpu_arch}-{DEFAULT_TAG}" | ||||
|         for gpu_arch in CUDA_ARCHES | ||||
|     }, | ||||
|     **{ | ||||
|         gpu_arch: f"pytorch/libtorch-cxx11-builder:rocm{gpu_arch}-{DEFAULT_TAG}" | ||||
|         ( | ||||
|             gpu_arch, | ||||
|             CXX11_ABI, | ||||
|         ): f"pytorch/libtorch-cxx11-builder:rocm{gpu_arch}-{DEFAULT_TAG}" | ||||
|         for gpu_arch in ROCM_ARCHES | ||||
|     }, | ||||
|     "cpu": f"pytorch/libtorch-cxx11-builder:cpu-{DEFAULT_TAG}", | ||||
|     ("cpu", CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cpu-{DEFAULT_TAG}", | ||||
| } | ||||
|  | ||||
| FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"] | ||||
| @ -198,6 +210,7 @@ def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str: | ||||
|     return { | ||||
|         "cpu": "cpu", | ||||
|         "cpu-aarch64": "cpu", | ||||
|         "cpu-cxx11-abi": "cpu-cxx11-abi", | ||||
|         "cpu-s390x": "cpu", | ||||
|         "cuda": f"cu{gpu_arch_version.replace('.', '')}", | ||||
|         "cuda-aarch64": f"cu{gpu_arch_version.replace('-aarch64', '').replace('.', '')}", | ||||
| @ -212,7 +225,7 @@ def list_without(in_list: list[str], without: list[str]) -> list[str]: | ||||
|  | ||||
| def generate_libtorch_matrix( | ||||
|     os: str, | ||||
|     release_type: str, | ||||
|     abi_version: str, | ||||
|     arches: Optional[list[str]] = None, | ||||
|     libtorch_variants: Optional[list[str]] = None, | ||||
| ) -> list[dict[str, str]]: | ||||
| @ -234,6 +247,9 @@ def generate_libtorch_matrix( | ||||
|     ret: list[dict[str, str]] = [] | ||||
|     for arch_version in arches: | ||||
|         for libtorch_variant in libtorch_variants: | ||||
|             # one of the values in the following list must be exactly | ||||
|             # CXX11_ABI, but the precise value of the other one doesn't | ||||
|             # matter | ||||
|             gpu_arch_type = arch_type(arch_version) | ||||
|             gpu_arch_version = "" if arch_version == "cpu" else arch_version | ||||
|             # ROCm builds without-deps failed even in ROCm runners; skip for now | ||||
| @ -246,15 +262,20 @@ def generate_libtorch_matrix( | ||||
|                     "desired_cuda": translate_desired_cuda( | ||||
|                         gpu_arch_type, gpu_arch_version | ||||
|                     ), | ||||
|                     "libtorch_config": release_type, | ||||
|                     "libtorch_variant": libtorch_variant, | ||||
|                     "libtorch_config": abi_version | ||||
|                     if os in ("windows", "windows-arm64") | ||||
|                     else "", | ||||
|                     "devtoolset": abi_version | ||||
|                     if os not in ("windows", "windows-arm64") | ||||
|                     else "", | ||||
|                     "container_image": ( | ||||
|                         LIBTORCH_CONTAINER_IMAGES[arch_version] | ||||
|                         LIBTORCH_CONTAINER_IMAGES[(arch_version, abi_version)] | ||||
|                         if os not in ("windows", "windows-arm64") | ||||
|                         else "" | ||||
|                     ), | ||||
|                     "package_type": "libtorch", | ||||
|                     "build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{release_type}".replace( | ||||
|                     "build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{abi_version}".replace( | ||||
|                         ".", "_" | ||||
|                     ), | ||||
|                 } | ||||
| @ -280,7 +301,7 @@ def generate_wheels_matrix( | ||||
|         # Define default compute archivectures | ||||
|         arches = ["cpu"] | ||||
|         if os == "linux": | ||||
|             arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES | ||||
|             arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES | ||||
|         elif os == "windows": | ||||
|             arches += CUDA_ARCHES + XPU_ARCHES | ||||
|         elif os == "linux-aarch64": | ||||
| @ -299,6 +320,7 @@ def generate_wheels_matrix( | ||||
|             gpu_arch_version = ( | ||||
|                 "" | ||||
|                 if arch_version == "cpu" | ||||
|                 or arch_version == "cpu-cxx11-abi" | ||||
|                 or arch_version == "cpu-aarch64" | ||||
|                 or arch_version == "cpu-s390x" | ||||
|                 or arch_version == "xpu" | ||||
| @ -333,6 +355,7 @@ def generate_wheels_matrix( | ||||
|                         "gpu_arch_version": gpu_arch_version, | ||||
|                         "desired_cuda": desired_cuda, | ||||
|                         "use_split_build": "True" if use_split_build else "False", | ||||
|                         "devtoolset": "cxx11-abi", | ||||
|                         "container_image": WHEEL_CONTAINER_IMAGES[arch_version], | ||||
|                         "package_type": package_type, | ||||
|                         "pytorch_extra_install_requirements": ( | ||||
| @ -361,6 +384,7 @@ def generate_wheels_matrix( | ||||
|                                 gpu_arch_type, gpu_arch_version | ||||
|                             ), | ||||
|                             "use_split_build": "True" if use_split_build else "False", | ||||
|                             "devtoolset": "", | ||||
|                             "container_image": WHEEL_CONTAINER_IMAGES[arch_version], | ||||
|                             "package_type": package_type, | ||||
|                             "pytorch_extra_install_requirements": "", | ||||
| @ -379,6 +403,12 @@ def generate_wheels_matrix( | ||||
|                             gpu_arch_type, gpu_arch_version | ||||
|                         ), | ||||
|                         "use_split_build": "True" if use_split_build else "False", | ||||
|                         "devtoolset": ( | ||||
|                             "cxx11-abi" | ||||
|                             if (arch_version in ["cpu-cxx11-abi", "cpu-aarch64"]) | ||||
|                             or os == "linux" | ||||
|                             else "" | ||||
|                         ), | ||||
|                         "container_image": WHEEL_CONTAINER_IMAGES[arch_version], | ||||
|                         "package_type": package_type, | ||||
|                         "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace( | ||||
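The key change in this file is that LIBTORCH_CONTAINER_IMAGES is now keyed by an (arch_version, abi_version) pair, so a libtorch config can only resolve to an image built for the requested ABI, and a missing combination fails fast with a KeyError instead of silently picking the wrong image. A reduced sketch of the lookup, with values drawn from the diff (DEFAULT_TAG is a stand-in here; the real tag is computed elsewhere in the script):

    DEFAULT_TAG = "main"  # placeholder value
    CXX11_ABI = "cxx11-abi"

    LIBTORCH_CONTAINER_IMAGES = {
        ("cpu", CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cpu-{DEFAULT_TAG}",
        ("6.3", CXX11_ABI): f"pytorch/libtorch-cxx11-builder:rocm6.3-{DEFAULT_TAG}",
    }

    LIBTORCH_CONTAINER_IMAGES[("cpu", CXX11_ABI)]  # "pytorch/libtorch-cxx11-builder:cpu-main"
    # LIBTORCH_CONTAINER_IMAGES[("cpu", "pre-cxx11")] would raise KeyError: no such ABI build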
|  | ||||
.github/scripts/generate_ci_workflows.py (36 changed lines, vendored)
| @ -54,6 +54,7 @@ class BinaryBuildWorkflow: | ||||
|  | ||||
|     # Optional fields | ||||
|     build_environment: str = "" | ||||
|     abi_version: str = "" | ||||
|     ciflow_config: CIFlowConfig = field(default_factory=CIFlowConfig) | ||||
|     is_scheduled: str = "" | ||||
|     branches: str = "nightly" | ||||
| @ -61,16 +62,14 @@ class BinaryBuildWorkflow: | ||||
|     cross_compile_arm64: bool = False | ||||
|     macos_runner: str = "macos-14-xlarge" | ||||
|     use_split_build: bool = False | ||||
|     # Mainly used for libtorch builds | ||||
|     build_variant: str = "" | ||||
|  | ||||
|     def __post_init__(self) -> None: | ||||
|         if self.build_environment == "": | ||||
|             self.build_environment = "-".join( | ||||
|                 item | ||||
|                 for item in [self.os, "binary", self.package_type, self.build_variant] | ||||
|                 if item != "" | ||||
|         if self.abi_version: | ||||
|             self.build_environment = ( | ||||
|                 f"{self.os}-binary-{self.package_type}-{self.abi_version}" | ||||
|             ) | ||||
|         else: | ||||
|             self.build_environment = f"{self.os}-binary-{self.package_type}" | ||||
|         if self.use_split_build: | ||||
|             # added to distinguish concurrency groups | ||||
|             self.build_environment += "-split" | ||||
| @ -134,9 +133,10 @@ LINUX_BINARY_BUILD_WORFKLOWS = [ | ||||
|     BinaryBuildWorkflow( | ||||
|         os=OperatingSystem.LINUX, | ||||
|         package_type="libtorch", | ||||
|         abi_version=generate_binary_build_matrix.CXX11_ABI, | ||||
|         build_configs=generate_binary_build_matrix.generate_libtorch_matrix( | ||||
|             OperatingSystem.LINUX, | ||||
|             generate_binary_build_matrix.RELEASE, | ||||
|             generate_binary_build_matrix.CXX11_ABI, | ||||
|             libtorch_variants=["shared-with-deps"], | ||||
|         ), | ||||
|         ciflow_config=CIFlowConfig( | ||||
| @ -176,10 +176,10 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [ | ||||
|     BinaryBuildWorkflow( | ||||
|         os=OperatingSystem.LINUX, | ||||
|         package_type="libtorch", | ||||
|         build_variant=generate_binary_build_matrix.RELEASE, | ||||
|         abi_version=generate_binary_build_matrix.CXX11_ABI, | ||||
|         build_configs=generate_binary_build_matrix.generate_libtorch_matrix( | ||||
|             OperatingSystem.LINUX, | ||||
|             generate_binary_build_matrix.RELEASE, | ||||
|             generate_binary_build_matrix.CXX11_ABI, | ||||
|             arches=["cpu"], | ||||
|             libtorch_variants=["shared-with-deps"], | ||||
|         ), | ||||
| @ -202,7 +202,7 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [ | ||||
|     BinaryBuildWorkflow( | ||||
|         os=OperatingSystem.WINDOWS, | ||||
|         package_type="libtorch", | ||||
|         build_variant=generate_binary_build_matrix.RELEASE, | ||||
|         abi_version=generate_binary_build_matrix.RELEASE, | ||||
|         build_configs=generate_binary_build_matrix.generate_libtorch_matrix( | ||||
|             OperatingSystem.WINDOWS, | ||||
|             generate_binary_build_matrix.RELEASE, | ||||
| @ -216,7 +216,7 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [ | ||||
|     BinaryBuildWorkflow( | ||||
|         os=OperatingSystem.WINDOWS, | ||||
|         package_type="libtorch", | ||||
|         build_variant=generate_binary_build_matrix.DEBUG, | ||||
|         abi_version=generate_binary_build_matrix.DEBUG, | ||||
|         build_configs=generate_binary_build_matrix.generate_libtorch_matrix( | ||||
|             OperatingSystem.WINDOWS, | ||||
|             generate_binary_build_matrix.DEBUG, | ||||
| @ -233,7 +233,7 @@ WINDOWS_BINARY_SMOKE_WORKFLOWS = [ | ||||
|     BinaryBuildWorkflow( | ||||
|         os=OperatingSystem.WINDOWS, | ||||
|         package_type="libtorch", | ||||
|         build_variant=generate_binary_build_matrix.RELEASE, | ||||
|         abi_version=generate_binary_build_matrix.RELEASE, | ||||
|         build_configs=generate_binary_build_matrix.generate_libtorch_matrix( | ||||
|             OperatingSystem.WINDOWS, | ||||
|             generate_binary_build_matrix.RELEASE, | ||||
| @ -248,7 +248,7 @@ WINDOWS_BINARY_SMOKE_WORKFLOWS = [ | ||||
|     BinaryBuildWorkflow( | ||||
|         os=OperatingSystem.WINDOWS, | ||||
|         package_type="libtorch", | ||||
|         build_variant=generate_binary_build_matrix.DEBUG, | ||||
|         abi_version=generate_binary_build_matrix.DEBUG, | ||||
|         build_configs=generate_binary_build_matrix.generate_libtorch_matrix( | ||||
|             OperatingSystem.WINDOWS, | ||||
|             generate_binary_build_matrix.DEBUG, | ||||
| @ -279,7 +279,7 @@ WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS = [ | ||||
|     BinaryBuildWorkflow( | ||||
|         os=OperatingSystem.WINDOWS_ARM64, | ||||
|         package_type="libtorch", | ||||
|         build_variant=generate_binary_build_matrix.RELEASE, | ||||
|         abi_version=generate_binary_build_matrix.RELEASE, | ||||
|         build_configs=generate_binary_build_matrix.generate_libtorch_matrix( | ||||
|             OperatingSystem.WINDOWS_ARM64, | ||||
|             generate_binary_build_matrix.RELEASE, | ||||
| @ -294,7 +294,7 @@ WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS = [ | ||||
|     BinaryBuildWorkflow( | ||||
|         os=OperatingSystem.WINDOWS_ARM64, | ||||
|         package_type="libtorch", | ||||
|         build_variant=generate_binary_build_matrix.DEBUG, | ||||
|         abi_version=generate_binary_build_matrix.DEBUG, | ||||
|         build_configs=generate_binary_build_matrix.generate_libtorch_matrix( | ||||
|             OperatingSystem.WINDOWS_ARM64, | ||||
|             generate_binary_build_matrix.DEBUG, | ||||
| @ -312,10 +312,10 @@ MACOS_BINARY_BUILD_WORKFLOWS = [ | ||||
|     BinaryBuildWorkflow( | ||||
|         os=OperatingSystem.MACOS_ARM64, | ||||
|         package_type="libtorch", | ||||
|         build_variant=generate_binary_build_matrix.RELEASE, | ||||
|         abi_version=generate_binary_build_matrix.CXX11_ABI, | ||||
|         build_configs=generate_binary_build_matrix.generate_libtorch_matrix( | ||||
|             OperatingSystem.MACOS, | ||||
|             generate_binary_build_matrix.RELEASE, | ||||
|             generate_binary_build_matrix.CXX11_ABI, | ||||
|             libtorch_variants=["shared-with-deps"], | ||||
|         ), | ||||
|         cross_compile_arm64=False, | ||||
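With abi_version replacing the generic build_variant, every workflow's build_environment is derived the same way. A condensed, runnable sketch of the naming rule from __post_init__ above:

    from dataclasses import dataclass

    @dataclass
    class BinaryBuildWorkflow:
        os: str
        package_type: str
        abi_version: str = ""
        use_split_build: bool = False

        def __post_init__(self) -> None:
            suffix = f"-{self.abi_version}" if self.abi_version else ""
            self.build_environment = f"{self.os}-binary-{self.package_type}{suffix}"
            if self.use_split_build:
                self.build_environment += "-split"  # distinguishes concurrency groups

    BinaryBuildWorkflow("linux", "libtorch", "cxx11-abi").build_environment
    # -> "linux-binary-libtorch-cxx11-abi"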
|  | ||||
.github/scripts/lintrunner.sh (5 changed lines, vendored)
| @ -1,6 +1,11 @@ | ||||
| #!/usr/bin/env bash | ||||
| set -ex | ||||
|  | ||||
| # The generic Linux job chooses to use base env, not the one setup by the image | ||||
| CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") | ||||
| eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)" | ||||
| conda activate "${CONDA_ENV}" | ||||
|  | ||||
| # Use uv to speed up lintrunner init | ||||
| python3 -m pip install uv==0.1.45 | ||||
|  | ||||
|  | ||||
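The new lintrunner.sh preamble works around the generic Linux runner starting jobs in the base conda env: it picks the last environment reported by `conda env list --json` and activates it. The same selection, sketched in Python instead of jq (assumes conda is on PATH):

    import json
    import subprocess

    out = subprocess.run(
        ["conda", "env", "list", "--json"], capture_output=True, text=True, check=True
    ).stdout
    env_path = json.loads(out)["envs"][-1]  # the jq '.envs | .[-1]' selection
    print(f"conda activate {env_path}")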
| @ -5,50 +5,6 @@ FROM --platform=linux/amd64 docker.io/ubuntu:24.04 as ld-prefix | ||||
| ENV DEBIAN_FRONTEND=noninteractive | ||||
| RUN apt-get update && apt-get -y install ca-certificates libicu74 libssl3 | ||||
|  | ||||
| # Patched podman | ||||
| FROM --platform=linux/s390x docker.io/ubuntu:24.04 as podman | ||||
| ENV DEBIAN_FRONTEND=noninteractive | ||||
| RUN sed -i 's/^Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/ubuntu.sources | ||||
| RUN apt-get update && \ | ||||
|     apt-get install -y \ | ||||
|         cmake \ | ||||
|         curl \ | ||||
|         devscripts \ | ||||
|         dpkg-dev \ | ||||
|         gdb \ | ||||
|         less \ | ||||
|         make \ | ||||
|         python3 \ | ||||
|         python3-pip \ | ||||
|         quilt \ | ||||
|         rsync \ | ||||
|         software-properties-common \ | ||||
|         stress-ng \ | ||||
|         vim \ | ||||
|         nano \ | ||||
|         wget && \ | ||||
|     apt-get build-dep -y podman && \ | ||||
|     apt-get source podman | ||||
|  | ||||
| COPY podman-patches/podman-25245.patch /tmp/podman-25245.patch | ||||
| COPY podman-patches/podman-25102-backport.patch /tmp/podman-25102-backport.patch | ||||
|  | ||||
| # import and apply patches | ||||
| # patches: | ||||
| # https://github.com/containers/podman/pull/25102 | ||||
| # https://github.com/containers/podman/pull/25245 | ||||
| RUN cd /libpod-* && \ | ||||
|     quilt import /tmp/podman-25245.patch && quilt push && \ | ||||
|     quilt import /tmp/podman-25102-backport.patch && quilt push && \ | ||||
|     dch -i "Fix podman deadlock and add option to clean up build leftovers" && \ | ||||
|     /bin/rm /tmp/podman-25245.patch /tmp/podman-25102-backport.patch | ||||
|  | ||||
| # build patched podman | ||||
| RUN cd /libpod-* && \ | ||||
|     debuild -i -us -uc -b && \ | ||||
|     /bin/rm /podman-remote_*.deb && \ | ||||
|     mkdir /tmp/podman && cp -v /podman*.deb /tmp/podman | ||||
|  | ||||
| # Main image. | ||||
| FROM --platform=linux/s390x docker.io/ubuntu:24.04 | ||||
|  | ||||
| @ -89,11 +45,7 @@ COPY fs/ / | ||||
| RUN chmod +x /usr/bin/actions-runner /usr/bin/entrypoint | ||||
|  | ||||
| # install podman | ||||
| # RUN apt-get update && apt -y install podman podman-docker | ||||
|  | ||||
| # install patched podman | ||||
| COPY --from=podman /tmp/podman /tmp/podman | ||||
| RUN apt-get update && apt -y install /tmp/podman/*.deb && /bin/rm -rfv /tmp/podman | ||||
| RUN apt -y install podman podman-docker | ||||
|  | ||||
| # amd64 Github Actions Runner. | ||||
| RUN useradd -m actions-runner | ||||
| @ -113,7 +65,7 @@ RUN virtualenv --system-site-packages venv | ||||
| # | ||||
| COPY --chown=actions-runner:actions-runner manywheel-s390x.tar /home/actions-runner/manywheel-s390x.tar | ||||
|  | ||||
| RUN curl -L https://github.com/actions/runner/releases/download/v2.322.0/actions-runner-linux-x64-2.322.0.tar.gz | tar -xz | ||||
| RUN curl -L https://github.com/actions/runner/releases/download/v2.317.0/actions-runner-linux-x64-2.317.0.tar.gz | tar -xz | ||||
|  | ||||
| ENTRYPOINT ["/usr/bin/entrypoint"] | ||||
| CMD ["/usr/bin/actions-runner"] | ||||
|  | ||||
| @ -1,358 +0,0 @@ | ||||
| diff --git a/cmd/podman/system/prune.go b/cmd/podman/system/prune.go | ||||
| index f7cf7b551..739f87cde 100644 | ||||
| --- a/cmd/podman/system/prune.go | ||||
| +++ b/cmd/podman/system/prune.go | ||||
| @@ -48,6 +48,7 @@ func init() { | ||||
|  	flags.BoolVarP(&force, "force", "f", false, "Do not prompt for confirmation.  The default is false") | ||||
|  	flags.BoolVarP(&pruneOptions.All, "all", "a", false, "Remove all unused data") | ||||
|  	flags.BoolVar(&pruneOptions.External, "external", false, "Remove container data in storage not controlled by podman") | ||||
| +	flags.BoolVar(&pruneOptions.Build, "build", false, "Remove build containers") | ||||
|  	flags.BoolVar(&pruneOptions.Volume, "volumes", false, "Prune volumes") | ||||
|  	filterFlagName := "filter" | ||||
|  	flags.StringArrayVar(&filters, filterFlagName, []string{}, "Provide filter values (e.g. 'label=<key>=<value>')") | ||||
| @@ -64,8 +65,12 @@ func prune(cmd *cobra.Command, args []string) error { | ||||
|  			volumeString = ` | ||||
|  	- all volumes not used by at least one container` | ||||
|  		} | ||||
| - | ||||
| -		fmt.Printf(createPruneWarningMessage(pruneOptions), volumeString, "Are you sure you want to continue? [y/N] ") | ||||
| +		buildString := "" | ||||
| +		if pruneOptions.Build { | ||||
| +			buildString = ` | ||||
| +	- all build containers` | ||||
| +		} | ||||
| +		fmt.Printf(createPruneWarningMessage(pruneOptions), volumeString, buildString, "Are you sure you want to continue? [y/N] ") | ||||
|   | ||||
|  		answer, err := reader.ReadString('\n') | ||||
|  		if err != nil { | ||||
| @@ -124,7 +129,7 @@ func createPruneWarningMessage(pruneOpts entities.SystemPruneOptions) string { | ||||
|  	if pruneOpts.All { | ||||
|  		return `WARNING! This command removes: | ||||
|  	- all stopped containers | ||||
| -	- all networks not used by at least one container%s | ||||
| +	- all networks not used by at least one container%s%s | ||||
|  	- all images without at least one container associated with them | ||||
|  	- all build cache | ||||
|   | ||||
| @@ -132,7 +137,7 @@ func createPruneWarningMessage(pruneOpts entities.SystemPruneOptions) string { | ||||
|  	} | ||||
|  	return `WARNING! This command removes: | ||||
|  	- all stopped containers | ||||
| -	- all networks not used by at least one container%s | ||||
| +	- all networks not used by at least one container%s%s | ||||
|  	- all dangling images | ||||
|  	- all dangling build cache | ||||
|   | ||||
| diff --git a/docs/source/markdown/podman-system-prune.1.md b/docs/source/markdown/podman-system-prune.1.md | ||||
| index 52f9ec1c7..95099d018 100644 | ||||
| --- a/docs/source/markdown/podman-system-prune.1.md | ||||
| +++ b/docs/source/markdown/podman-system-prune.1.md | ||||
| @@ -7,20 +7,28 @@ podman\-system\-prune - Remove all unused pods, containers, images, networks, an | ||||
|  **podman system prune** [*options*] | ||||
|   | ||||
|  ## DESCRIPTION | ||||
| -**podman system prune** removes all unused containers (both dangling and unreferenced), pods, networks, and optionally, volumes from local storage. | ||||
| +**podman system prune** removes all unused containers (both dangling and unreferenced), build containers, pods, networks, and optionally, volumes from local storage. | ||||
|   | ||||
|  Use the **--all** option to delete all unused images.  Unused images are dangling images as well as any image that does not have any containers based on it. | ||||
|   | ||||
|  By default, volumes are not removed to prevent important data from being deleted if there is currently no container using the volume. Use the **--volumes** flag when running the command to prune volumes as well. | ||||
|   | ||||
| +By default, build containers are not removed to prevent interference with builds in progress. Use the **--build** flag when running the command to remove build containers as well. | ||||
| + | ||||
|  ## OPTIONS | ||||
|  #### **--all**, **-a** | ||||
|   | ||||
|  Recursively remove all unused pods, containers, images, networks, and volume data. (Maximum 50 iterations.) | ||||
|   | ||||
| +#### **--build** | ||||
| + | ||||
| +Removes any build containers that were created during the build, but were not removed because the build was unexpectedly terminated. | ||||
| + | ||||
| +Note: **This is not safe operation and should be executed only when no builds are in progress. It can interfere with builds in progress.** | ||||
| + | ||||
|  #### **--external** | ||||
|   | ||||
| -Removes all leftover container storage files from local storage not managed by Podman. In normal circumstances, no such data exists, but in case of an unclean shutdown, the Podman database may be corrupted and cause this. | ||||
| +Tries to clean up remainders of previous containers or layers that are not references in the storage json files. These can happen in the case of unclean shutdowns or regular restarts in transient storage mode. | ||||
|   | ||||
|  However, when using transient storage mode, the Podman database does not persist. This means containers leave the writable layers on disk after a reboot. When using a transient store, it is recommended that the **podman system prune --external** command is run during boot. | ||||
|   | ||||
| diff --git a/libpod/runtime.go b/libpod/runtime.go | ||||
| index 986e40f60..609fbba57 100644 | ||||
| --- a/libpod/runtime.go | ||||
| +++ b/libpod/runtime.go | ||||
| @@ -33,6 +33,7 @@ import ( | ||||
|  	"github.com/containers/podman/v4/libpod/lock" | ||||
|  	"github.com/containers/podman/v4/libpod/plugin" | ||||
|  	"github.com/containers/podman/v4/libpod/shutdown" | ||||
| +	"github.com/containers/podman/v4/pkg/domain/entities/reports" | ||||
|  	"github.com/containers/podman/v4/pkg/rootless" | ||||
|  	"github.com/containers/podman/v4/pkg/systemd" | ||||
|  	"github.com/containers/podman/v4/pkg/util" | ||||
| @@ -1250,3 +1251,52 @@ func (r *Runtime) LockConflicts() (map[uint32][]string, []uint32, error) { | ||||
|   | ||||
|  	return toReturn, locksHeld, nil | ||||
|  } | ||||
| + | ||||
| +// Exists checks whether a file or directory exists at the given path. | ||||
| +// If the path is a symlink, the symlink is followed. | ||||
| +func Exists(path string) error { | ||||
| +	// It uses unix.Faccessat which is a faster operation compared to os.Stat for | ||||
| +	// simply checking the existence of a file. | ||||
| +	err := unix.Faccessat(unix.AT_FDCWD, path, unix.F_OK, 0) | ||||
| +	if err != nil { | ||||
| +		return &os.PathError{Op: "faccessat", Path: path, Err: err} | ||||
| +	} | ||||
| +	return nil | ||||
| +} | ||||
| + | ||||
| +// PruneBuildContainers removes any build containers that were created during the build, | ||||
| +// but were not removed because the build was unexpectedly terminated. | ||||
| +// | ||||
| +// Note: This is not safe operation and should be executed only when no builds are in progress. It can interfere with builds in progress. | ||||
| +func (r *Runtime) PruneBuildContainers() ([]*reports.PruneReport, error) { | ||||
| +	stageContainersPruneReports := []*reports.PruneReport{} | ||||
| + | ||||
| +	containers, err := r.store.Containers() | ||||
| +	if err != nil { | ||||
| +		return stageContainersPruneReports, err | ||||
| +	} | ||||
| +	for _, container := range containers { | ||||
| +		path, err := r.store.ContainerDirectory(container.ID) | ||||
| +		if err != nil { | ||||
| +			return stageContainersPruneReports, err | ||||
| +		} | ||||
| +		if err := Exists(filepath.Join(path, "buildah.json")); err != nil { | ||||
| +			continue | ||||
| +		} | ||||
| + | ||||
| +		report := &reports.PruneReport{ | ||||
| +			Id: container.ID, | ||||
| +		} | ||||
| +		size, err := r.store.ContainerSize(container.ID) | ||||
| +		if err != nil { | ||||
| +			report.Err = err | ||||
| +		} | ||||
| +		report.Size = uint64(size) | ||||
| + | ||||
| +		if err := r.store.DeleteContainer(container.ID); err != nil { | ||||
| +			report.Err = errors.Join(report.Err, err) | ||||
| +		} | ||||
| +		stageContainersPruneReports = append(stageContainersPruneReports, report) | ||||
| +	} | ||||
| +	return stageContainersPruneReports, nil | ||||
| +} | ||||
| diff --git a/pkg/api/handlers/libpod/system.go b/pkg/api/handlers/libpod/system.go | ||||
| index 70d4493f8..7c129b1ba 100644 | ||||
| --- a/pkg/api/handlers/libpod/system.go | ||||
| +++ b/pkg/api/handlers/libpod/system.go | ||||
| @@ -22,6 +22,7 @@ func SystemPrune(w http.ResponseWriter, r *http.Request) { | ||||
|  		All      bool `schema:"all"` | ||||
|  		Volumes  bool `schema:"volumes"` | ||||
|  		External bool `schema:"external"` | ||||
| +		Build    bool `schema:"build"` | ||||
|  	}{} | ||||
|   | ||||
|  	if err := decoder.Decode(&query, r.URL.Query()); err != nil { | ||||
| @@ -43,6 +44,7 @@ func SystemPrune(w http.ResponseWriter, r *http.Request) { | ||||
|  		Volume:   query.Volumes, | ||||
|  		Filters:  *filterMap, | ||||
|  		External: query.External, | ||||
| +		Build:    query.Build, | ||||
|  	} | ||||
|  	report, err := containerEngine.SystemPrune(r.Context(), pruneOptions) | ||||
|  	if err != nil { | ||||
| diff --git a/pkg/bindings/system/types.go b/pkg/bindings/system/types.go | ||||
| index 89e093f68..b4a4ff064 100644 | ||||
| --- a/pkg/bindings/system/types.go | ||||
| +++ b/pkg/bindings/system/types.go | ||||
| @@ -18,6 +18,7 @@ type PruneOptions struct { | ||||
|  	Filters  map[string][]string | ||||
|  	Volumes  *bool | ||||
|  	External *bool | ||||
| +	Build    *bool | ||||
|  } | ||||
|   | ||||
|  // VersionOptions are optional options for getting version info | ||||
| diff --git a/pkg/bindings/system/types_prune_options.go b/pkg/bindings/system/types_prune_options.go | ||||
| index d00498520..5f3bd652c 100644 | ||||
| --- a/pkg/bindings/system/types_prune_options.go | ||||
| +++ b/pkg/bindings/system/types_prune_options.go | ||||
| @@ -76,3 +76,18 @@ func (o *PruneOptions) GetExternal() bool { | ||||
|  	} | ||||
|  	return *o.External | ||||
|  } | ||||
| + | ||||
| +// WithBuild set field Build to given value | ||||
| +func (o *PruneOptions) WithBuild(value bool) *PruneOptions { | ||||
| +	o.Build = &value | ||||
| +	return o | ||||
| +} | ||||
| + | ||||
| +// GetBuild returns value of field Build | ||||
| +func (o *PruneOptions) GetBuild() bool { | ||||
| +	if o.Build == nil { | ||||
| +		var z bool | ||||
| +		return z | ||||
| +	} | ||||
| +	return *o.Build | ||||
| +} | ||||
| diff --git a/pkg/domain/entities/system.go b/pkg/domain/entities/system.go | ||||
| index 473db3530..f6938652a 100644 | ||||
| --- a/pkg/domain/entities/system.go | ||||
| +++ b/pkg/domain/entities/system.go | ||||
| @@ -22,6 +22,7 @@ type SystemPruneOptions struct { | ||||
|  	Volume   bool | ||||
|  	Filters  map[string][]string `json:"filters" schema:"filters"` | ||||
|  	External bool | ||||
| +	Build    bool | ||||
|  } | ||||
|   | ||||
|  // SystemPruneReport provides report after system prune is executed. | ||||
| diff --git a/pkg/domain/infra/abi/system.go b/pkg/domain/infra/abi/system.go | ||||
| index 24ee64d29..ea3e5f203 100644 | ||||
| --- a/pkg/domain/infra/abi/system.go | ||||
| +++ b/pkg/domain/infra/abi/system.go | ||||
| @@ -150,16 +150,16 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool) | ||||
|  	return nil | ||||
|  } | ||||
|   | ||||
| -// SystemPrune removes unused data from the system. Pruning pods, containers, networks, volumes and images. | ||||
| +// SystemPrune removes unused data from the system. Pruning pods, containers, build container, networks, volumes and images. | ||||
|  func (ic *ContainerEngine) SystemPrune(ctx context.Context, options entities.SystemPruneOptions) (*entities.SystemPruneReport, error) { | ||||
|  	var systemPruneReport = new(entities.SystemPruneReport) | ||||
|   | ||||
|  	if options.External { | ||||
| -		if options.All || options.Volume || len(options.Filters) > 0 { | ||||
| +		if options.All || options.Volume || len(options.Filters) > 0 || options.Build { | ||||
|  			return nil, fmt.Errorf("system prune --external cannot be combined with other options") | ||||
|  		} | ||||
| -		err := ic.Libpod.GarbageCollect() | ||||
| -		if err != nil { | ||||
| + | ||||
| +		if err := ic.Libpod.GarbageCollect(); err != nil { | ||||
|  			return nil, err | ||||
|  		} | ||||
|  		return systemPruneReport, nil | ||||
| @@ -170,6 +170,17 @@ func (ic *ContainerEngine) SystemPrune(ctx context.Context, options entities.Sys | ||||
|  		filters = append(filters, fmt.Sprintf("%s=%s", k, v[0])) | ||||
|  	} | ||||
|  	reclaimedSpace := (uint64)(0) | ||||
| + | ||||
| +	// Prune Build Containers | ||||
| +	if options.Build { | ||||
| +		stageContainersPruneReports, err := ic.Libpod.PruneBuildContainers() | ||||
| +		if err != nil { | ||||
| +			return nil, err | ||||
| +		} | ||||
| +		reclaimedSpace += reports.PruneReportsSize(stageContainersPruneReports) | ||||
| +		systemPruneReport.ContainerPruneReports = append(systemPruneReport.ContainerPruneReports, stageContainersPruneReports...) | ||||
| +	} | ||||
| + | ||||
|  	found := true | ||||
|  	for found { | ||||
|  		found = false | ||||
| diff --git a/pkg/domain/infra/tunnel/system.go b/pkg/domain/infra/tunnel/system.go | ||||
| index fc82e7b2b..142a9fa5c 100644 | ||||
| --- a/pkg/domain/infra/tunnel/system.go | ||||
| +++ b/pkg/domain/infra/tunnel/system.go | ||||
| @@ -19,7 +19,7 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool) | ||||
|   | ||||
|  // SystemPrune prunes unused data from the system. | ||||
|  func (ic *ContainerEngine) SystemPrune(ctx context.Context, opts entities.SystemPruneOptions) (*entities.SystemPruneReport, error) { | ||||
| -	options := new(system.PruneOptions).WithAll(opts.All).WithVolumes(opts.Volume).WithFilters(opts.Filters).WithExternal(opts.External) | ||||
| +	options := new(system.PruneOptions).WithAll(opts.All).WithVolumes(opts.Volume).WithFilters(opts.Filters).WithExternal(opts.External).WithBuild(opts.Build) | ||||
|  	return system.Prune(ic.ClientCtx, options) | ||||
|  } | ||||
|   | ||||
| diff --git a/test/e2e/prune_test.go b/test/e2e/prune_test.go | ||||
| index 01e848478..57bd5582d 100644 | ||||
| --- a/test/e2e/prune_test.go | ||||
| +++ b/test/e2e/prune_test.go | ||||
| @@ -4,6 +4,8 @@ import ( | ||||
|  	"fmt" | ||||
|  	"os" | ||||
|  	"path/filepath" | ||||
| +	"syscall" | ||||
| +	"time" | ||||
|   | ||||
|  	. "github.com/containers/podman/v4/test/utils" | ||||
|  	. "github.com/onsi/ginkgo/v2" | ||||
| @@ -22,6 +24,11 @@ FROM scratch | ||||
|  ENV test1=test1 | ||||
|  ENV test2=test2` | ||||
|   | ||||
| +var longBuildImage = fmt.Sprintf(` | ||||
| +FROM %s | ||||
| +RUN echo "Hello, World!" | ||||
| +RUN RUN echo "Please use signal 9 this will never ends" && sleep 10000s`, ALPINE) | ||||
| + | ||||
|  var _ = Describe("Podman prune", func() { | ||||
|   | ||||
|  	It("podman container prune containers", func() { | ||||
| @@ -593,4 +600,63 @@ var _ = Describe("Podman prune", func() { | ||||
|  		Expect(err).ToNot(HaveOccurred()) | ||||
|  		Expect(dirents).To(HaveLen(3)) | ||||
|  	}) | ||||
| + | ||||
| +	It("podman system prune --build clean up after terminated build", func() { | ||||
| +		useCustomNetworkDir(podmanTest, tempdir) | ||||
| + | ||||
| +		podmanTest.BuildImage(pruneImage, "alpine_notleaker:latest", "false") | ||||
| + | ||||
| +		create := podmanTest.Podman([]string{"create", "--name", "test", BB, "sleep", "10000"}) | ||||
| +		create.WaitWithDefaultTimeout() | ||||
| +		Expect(create).Should(ExitCleanly()) | ||||
| + | ||||
| +		containerFilePath := filepath.Join(podmanTest.TempDir, "ContainerFile-podman-leaker") | ||||
| +		err := os.WriteFile(containerFilePath, []byte(longBuildImage), 0755) | ||||
| +		Expect(err).ToNot(HaveOccurred()) | ||||
| + | ||||
| +		build := podmanTest.Podman([]string{"build", "-f", containerFilePath, "-t", "podmanleaker"}) | ||||
| +		// Build will never finish so let's wait for build to ask for SIGKILL to simulate a failed build that leaves stage containers. | ||||
| +		matchedOutput := false | ||||
| +		for range 900 { | ||||
| +			if build.LineInOutputContains("Please use signal 9") { | ||||
| +				matchedOutput = true | ||||
| +				build.Signal(syscall.SIGKILL) | ||||
| +				break | ||||
| +			} | ||||
| +			time.Sleep(100 * time.Millisecond) | ||||
| +		} | ||||
| +		if !matchedOutput { | ||||
| +			Fail("Did not match special string in podman build") | ||||
| +		} | ||||
| + | ||||
| +		// Check Intermediate image of stage container | ||||
| +		none := podmanTest.Podman([]string{"images", "-a"}) | ||||
| +		none.WaitWithDefaultTimeout() | ||||
| +		Expect(none).Should(ExitCleanly()) | ||||
| +		Expect(none.OutputToString()).Should(ContainSubstring("none")) | ||||
| + | ||||
| +		// Check if Container and Stage Container exist | ||||
| +		count := podmanTest.Podman([]string{"ps", "-aq", "--external"}) | ||||
| +		count.WaitWithDefaultTimeout() | ||||
| +		Expect(count).Should(ExitCleanly()) | ||||
| +		Expect(count.OutputToStringArray()).To(HaveLen(3)) | ||||
| + | ||||
| +		prune := podmanTest.Podman([]string{"system", "prune", "--build", "-f"}) | ||||
| +		prune.WaitWithDefaultTimeout() | ||||
| +		Expect(prune).Should(ExitCleanly()) | ||||
| + | ||||
| +		// Container should still exist, but no stage containers | ||||
| +		count = podmanTest.Podman([]string{"ps", "-aq", "--external"}) | ||||
| +		count.WaitWithDefaultTimeout() | ||||
| +		Expect(count).Should(ExitCleanly()) | ||||
| +		Expect(count.OutputToString()).To(BeEmpty()) | ||||
| + | ||||
| +		Expect(podmanTest.NumberOfContainers()).To(Equal(0)) | ||||
| + | ||||
| +		after := podmanTest.Podman([]string{"images", "-a"}) | ||||
| +		after.WaitWithDefaultTimeout() | ||||
| +		Expect(after).Should(ExitCleanly()) | ||||
| +		Expect(after.OutputToString()).ShouldNot(ContainSubstring("none")) | ||||
| +		Expect(after.OutputToString()).Should(ContainSubstring("notleaker")) | ||||
| +	}) | ||||
|  }) | ||||
|  | ||||
| @ -1,21 +0,0 @@ | ||||
| diff --git a/pkg/rootless/rootless_linux.c b/pkg/rootless/rootless_linux.c | ||||
| index 4f71d49e5c..3d74af6a6c 100644 | ||||
| --- a/pkg/rootless/rootless_linux.c | ||||
| +++ b/pkg/rootless/rootless_linux.c | ||||
| @@ -658,7 +658,7 @@ create_pause_process (const char *pause_pid_file_path, char **argv) | ||||
|    if (pipe (p) < 0) | ||||
|      return -1; | ||||
|  | ||||
| -  pid = fork (); | ||||
| +  pid = syscall_clone (SIGCHLD, NULL); | ||||
|    if (pid < 0) | ||||
|      { | ||||
|        close (p[0]); | ||||
| @@ -689,7 +689,7 @@ create_pause_process (const char *pause_pid_file_path, char **argv) | ||||
|        close (p[0]); | ||||
|  | ||||
|        setsid (); | ||||
| -      pid = fork (); | ||||
| +      pid = syscall_clone (SIGCHLD, NULL); | ||||
|        if (pid < 0) | ||||
|          _exit (EXIT_FAILURE); | ||||

.github/scripts/trymerge.py (10 changed lines, vendored)
| @ -819,9 +819,10 @@ class GitHubPR: | ||||
|                     cursor=info["reviews"]["pageInfo"]["startCursor"], | ||||
|                 ) | ||||
|                 info = rc["data"]["repository"]["pullRequest"] | ||||
|         reviews = { | ||||
|             author: state for author, state in self._reviews if state != "COMMENTED" | ||||
|         } | ||||
|         reviews = {} | ||||
|         for author, state in self._reviews: | ||||
|             if state != "COMMENTED": | ||||
|                 reviews[author] = state | ||||
|         return list(reviews.items()) | ||||
|  | ||||
|     def get_approved_by(self) -> list[str]: | ||||
| @ -2281,8 +2282,7 @@ def merge( | ||||
|         except MandatoryChecksMissingError as ex: | ||||
|             last_exception = str(ex) | ||||
|             print( | ||||
|                 f"Merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} failed due to: {ex}. Retrying in 5 min", | ||||
|                 flush=True, | ||||
|                 f"Merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} failed due to: {ex}. Retrying in 5 min" | ||||
|             ) | ||||
|             time.sleep(5 * 60) | ||||
|     # Finally report timeout back | ||||
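On the reviews change above: the dict comprehension and the explicit loop are behaviorally identical. Both keep, per author, the most recent non-COMMENTED state, because later (author, state) pairs overwrite earlier ones under the same key; the loop form just spells that out. For illustration:

    _reviews = [
        ("alice", "COMMENTED"),
        ("bob", "APPROVED"),
        ("alice", "APPROVED"),
        ("bob", "CHANGES_REQUESTED"),
    ]

    reviews = {}
    for author, state in _reviews:
        if state != "COMMENTED":
            reviews[author] = state

    list(reviews.items())
    # -> [('bob', 'CHANGES_REQUESTED'), ('alice', 'APPROVED')]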
|  | ||||
.github/scripts/windows/build_magma.bat (3 changed lines, vendored)
| @ -54,8 +54,7 @@ cmake .. -DGPU_TARGET="%GPU_TARGET%" ^ | ||||
|             -DCMAKE_BUILD_TYPE=%CONFIG% ^ | ||||
|             -DCMAKE_GENERATOR=Ninja ^ | ||||
|             -DCMAKE_INSTALL_PREFIX=..\install\ ^ | ||||
|             -DCUDA_ARCH_LIST="%CUDA_ARCH_LIST%" ^ | ||||
|             -DCMAKE_POLICY_VERSION_MINIMUM=3.5 | ||||
|             -DCUDA_ARCH_LIST="%CUDA_ARCH_LIST%" | ||||
| if errorlevel 1 exit /b 1 | ||||
|  | ||||
| cmake --build . --target install --config %CONFIG% -- -j%NUMBER_OF_PROCESSORS% | ||||
|  | ||||
.github/templates/common.yml.j2 (2 changed lines, vendored)
| @ -32,7 +32,7 @@ concurrency: | ||||
| {%- macro setup_ec2_windows() -%} | ||||
|       !{{ display_ec2_information() }} | ||||
|       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@main | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 | ||||
|         continue-on-error: true | ||||
|         with: | ||||
|           github-secret: ${{ secrets.GITHUB_TOKEN }} | ||||
|  | ||||
| @ -53,7 +53,7 @@ jobs: | ||||
|   get-label-type: | ||||
|     if: github.repository_owner == 'pytorch' | ||||
|     name: get-label-type | ||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main | ||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 | ||||
|     with: | ||||
|       triggering_actor: ${{ github.triggering_actor }} | ||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||
| @ -135,7 +135,7 @@ jobs: | ||||
|         uses: ./.github/actions/setup-xpu | ||||
|       - name: configure aws credentials | ||||
|         id: aws_creds | ||||
|         uses: aws-actions/configure-aws-credentials@v4 | ||||
|         uses: aws-actions/configure-aws-credentials@v1.7.0 | ||||
|         with: | ||||
|           role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only | ||||
|           aws-region: us-east-1 | ||||
| @ -147,9 +147,9 @@ jobs: | ||||
|         with: | ||||
|           name: !{{ config["build_name"] }} | ||||
|           path: "${{ runner.temp }}/artifacts/" | ||||
|       !{{ common.checkout(deep_clone=False, directory="pytorch") }} | ||||
|       !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} | ||||
|       - name: Pull Docker image | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 | ||||
|         with: | ||||
|           docker-image: !{{ config["container_image"] }} | ||||
|       - name: Test Pytorch binary | ||||
| @ -168,12 +168,12 @@ jobs: | ||||
|         with: | ||||
|           name: !{{ config["build_name"] }} | ||||
|           path: "${{ runner.temp }}/artifacts/" | ||||
|       !{{ common.checkout(deep_clone=False, directory="pytorch") }} | ||||
|       !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} | ||||
|       - name: ROCm set GPU_FLAG | ||||
|         run: | | ||||
|           echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" | ||||
|       - name: Pull Docker image | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 | ||||
|         with: | ||||
|           docker-image: !{{ config["container_image"] }} | ||||
|       - name: Test Pytorch binary | ||||
|  | ||||
| @ -76,7 +76,7 @@ jobs: | ||||
|           elif [ -d "/Applications/Xcode_13.3.1.app" ]; then | ||||
|             echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" | ||||
|           fi | ||||
|       !{{ common.checkout(deep_clone=False, directory="pytorch") }} | ||||
|       !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} | ||||
|       - name: Populate binary env | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|  | ||||
.github/templates/upload.yml.j2 (6 changed lines, vendored)
| @ -25,6 +25,9 @@ | ||||
|       DOCKER_IMAGE: !{{ config["container_image"] }} | ||||
| {%- endif %} | ||||
| {%- if config["package_type"] == "manywheel" %} | ||||
|   {%- if config["devtoolset"] %} | ||||
|       DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }} | ||||
|   {%- endif %} | ||||
|   {%- if config.use_split_build is defined %} | ||||
|       use_split_build: !{{ config["use_split_build"] }} | ||||
|   {%- endif %} | ||||
| @ -34,6 +37,9 @@ | ||||
|       LIBTORCH_CONFIG: !{{ config["libtorch_config"] }} | ||||
|   {%- endif %} | ||||
|       LIBTORCH_VARIANT: !{{ config["libtorch_variant"] }} | ||||
|   {%- if config["devtoolset"] %} | ||||
|       DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }} | ||||
|   {%- endif %} | ||||
|   {%- if is_windows %} | ||||
|       # This is a dummy value for libtorch to work correctly with our batch scripts | ||||
|       # without this value pip does not get installed for some reason | ||||
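The `!{{ ... }}` markers above are the custom Jinja2 variable delimiters these templates are rendered with, so the added `{%- if config["devtoolset"] %}` block emits DESIRED_DEVTOOLSET only when the matrix entry defines a devtoolset. A small rendering sketch (assuming the jinja2 package; the delimiter configuration is inferred from the templates, not taken from this diff):

    import jinja2

    env = jinja2.Environment(variable_start_string="!{{", variable_end_string="}}")
    tmpl = env.from_string(
        '{%- if config["devtoolset"] %}\n'
        '      DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }}\n'
        "{%- endif %}"
    )
    tmpl.render(config={"devtoolset": "cxx11-abi"})
    # -> '\n      DESIRED_DEVTOOLSET: cxx11-abi'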
|  | ||||
| @ -55,7 +55,7 @@ jobs: | ||||
|   get-label-type: | ||||
|     if: github.repository_owner == 'pytorch' | ||||
|     name: get-label-type | ||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main | ||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 | ||||
|     with: | ||||
|       triggering_actor: ${{ github.triggering_actor }} | ||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||
|  | ||||
| @ -55,7 +55,7 @@ jobs: | ||||
|   get-label-type: | ||||
|     if: github.repository_owner == 'pytorch' | ||||
|     name: get-label-type | ||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main | ||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 | ||||
|     with: | ||||
|       triggering_actor: ${{ github.triggering_actor }} | ||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||
| @ -79,7 +79,7 @@ jobs: | ||||
|     steps: | ||||
|       !{{ common.setup_ec2_windows() }} | ||||
|       !{{ set_runner_specific_vars() }} | ||||
|       !{{ common.checkout(deep_clone=False, directory="pytorch") }} | ||||
|       !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} | ||||
|       - name: Populate binary env | ||||
|         shell: bash | ||||
|         run: | | ||||
| @ -124,7 +124,7 @@ jobs: | ||||
|         with: | ||||
|           name: !{{ config["build_name"] }} | ||||
|           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" | ||||
|       !{{ common.checkout(deep_clone=False, directory="pytorch") }} | ||||
|       !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} | ||||
|       - name: Populate binary env | ||||
|         shell: bash | ||||
|         run: | | ||||
|  | ||||
.github/workflows/_bazel-build-test.yml (14 changed lines, vendored)
| @ -47,7 +47,7 @@ jobs: | ||||
|       reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }} | ||||
|     steps: | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|         with: | ||||
|           fetch-depth: 1 | ||||
|           submodules: false | ||||
| @ -69,25 +69,25 @@ jobs: | ||||
|     runs-on: ${{ matrix.runner }} | ||||
|     steps: | ||||
|       - name: Setup SSH (Click me for login details) | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@main | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 | ||||
|         with: | ||||
|           github-secret: ${{ secrets.GITHUB_TOKEN }} | ||||
|  | ||||
|       # [see note: pytorch repo ref] | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|  | ||||
|       - name: Setup Linux | ||||
|         uses: ./.github/actions/setup-linux | ||||
|  | ||||
|       - name: Calculate docker image | ||||
|         id: calculate-docker-image | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 | ||||
|         with: | ||||
|           docker-image-name: ${{ inputs.docker-image-name }} | ||||
|  | ||||
|       - name: Pull docker image | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 | ||||
|         with: | ||||
|           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} | ||||
|  | ||||
| @ -97,7 +97,7 @@ jobs: | ||||
|         run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT" | ||||
|  | ||||
|       - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG | ||||
|         uses: pytorch/test-infra/.github/actions/setup-nvidia@main | ||||
|         uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.7 | ||||
|         if: ${{ inputs.cuda-version != 'cpu' && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} | ||||
|  | ||||
|       - name: Output disk space left | ||||
| @ -209,5 +209,5 @@ jobs: | ||||
|           file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }} | ||||
|  | ||||
|       - name: Teardown Linux | ||||
|         uses: pytorch/test-infra/.github/actions/teardown-linux@main | ||||
|         uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 | ||||
|         if: always() | ||||
|  | ||||
.github/workflows/_binary-build-linux.yml (18 changed lines, vendored)
| @ -23,7 +23,7 @@ on: | ||||
|         description: Hardware to run this "build" job on, linux.12xlarge or linux.arm64.2xlarge. | ||||
|       timeout-minutes: | ||||
|         required: false | ||||
|         default: 240 | ||||
|         default: 210 | ||||
|         type: number | ||||
|         description: timeout for the job | ||||
|       use_split_build: | ||||
| @ -70,6 +70,10 @@ on: | ||||
|         required: false | ||||
|         type: string | ||||
|         description: Desired libtorch variant (for libtorch builds only) | ||||
|       DESIRED_DEVTOOLSET: | ||||
|         required: false | ||||
|         type: string | ||||
|         description: Desired dev toolset | ||||
|       DESIRED_PYTHON: | ||||
|         required: false | ||||
|         type: string | ||||
| @ -100,6 +104,7 @@ jobs: | ||||
|       SKIP_ALL_TESTS: 1 | ||||
|       LIBTORCH_CONFIG: ${{ inputs.LIBTORCH_CONFIG }} | ||||
|       LIBTORCH_VARIANT: ${{ inputs.LIBTORCH_VARIANT }} | ||||
|       DESIRED_DEVTOOLSET: ${{ inputs.DESIRED_DEVTOOLSET }} | ||||
|       DESIRED_PYTHON: ${{ inputs.DESIRED_PYTHON }} | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: ${{ inputs.PYTORCH_EXTRA_INSTALL_REQUIREMENTS }} | ||||
|       ALPINE_IMAGE: ${{ inputs.ALPINE_IMAGE }} | ||||
| @ -125,6 +130,7 @@ jobs: | ||||
|             echo "SKIP_ALL_TESTS=${{ env.SKIP_ALL_TESTS }}" | ||||
|             echo "LIBTORCH_CONFIG=${{ env.LIBTORCH_CONFIG }}" | ||||
|             echo "LIBTORCH_VARIANT=${{ env.LIBTORCH_VARIANT }}" | ||||
|             echo "DESIRED_DEVTOOLSET=${{ env.DESIRED_DEVTOOLSET }}" | ||||
|             echo "DESIRED_PYTHON=${{ env.DESIRED_PYTHON }}" | ||||
|             echo "PYTORCH_EXTRA_INSTALL_REQUIREMENTS=${{ env.PYTORCH_EXTRA_INSTALL_REQUIREMENTS }}" | ||||
|             echo "ALPINE_IMAGE=${{ env.ALPINE_IMAGE }}" | ||||
| @ -144,13 +150,13 @@ jobs: | ||||
|  | ||||
|       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" | ||||
|         if: inputs.build_environment != 'linux-s390x-binary-manywheel' | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@main | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 | ||||
|         continue-on-error: true | ||||
|         with: | ||||
|           github-secret: ${{ secrets.github-token }} | ||||
|  | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|         with: | ||||
|           no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }} | ||||
|  | ||||
| @ -180,7 +186,6 @@ jobs: | ||||
|       - name: Checkout PyTorch to pytorch dir | ||||
|         uses: actions/checkout@v4 | ||||
|         with: | ||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} | ||||
|           submodules: recursive | ||||
|           path: pytorch | ||||
|           show-progress: false | ||||
| @ -205,7 +210,7 @@ jobs: | ||||
|  | ||||
|       - name: Pull Docker image | ||||
|         if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 | ||||
|         with: | ||||
|           docker-image: ${{ inputs.DOCKER_IMAGE }} | ||||
|  | ||||
| @ -218,6 +223,7 @@ jobs: | ||||
|             -e BINARY_ENV_FILE \ | ||||
|             -e BUILD_ENVIRONMENT \ | ||||
|             -e DESIRED_CUDA \ | ||||
|             -e DESIRED_DEVTOOLSET \ | ||||
|             -e DESIRED_PYTHON \ | ||||
|             -e GITHUB_ACTIONS \ | ||||
|             -e GPU_ARCH_TYPE \ | ||||
| @ -260,7 +266,7 @@ jobs: | ||||
|  | ||||
|       - name: Teardown Linux | ||||
|         if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' | ||||
|         uses: pytorch/test-infra/.github/actions/teardown-linux@main | ||||
|         uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 | ||||
|  | ||||
|       - name: Chown workspace | ||||
|         if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' | ||||
|  | ||||

17  .github/workflows/_binary-test-linux.yml  vendored

| @ -47,6 +47,10 @@ on: | ||||
|         required: false | ||||
|         type: string | ||||
|         description: Desired libtorch variant (for libtorch builds only) | ||||
|       DESIRED_DEVTOOLSET: | ||||
|         required: false | ||||
|         type: string | ||||
|         description: Desired dev toolset | ||||
|       DESIRED_PYTHON: | ||||
|         required: false | ||||
|         type: string | ||||
| @ -88,6 +92,7 @@ jobs: | ||||
|       SKIP_ALL_TESTS: 1 | ||||
|       LIBTORCH_CONFIG: ${{ inputs.LIBTORCH_CONFIG }} | ||||
|       LIBTORCH_VARIANT: ${{ inputs.LIBTORCH_VARIANT }} | ||||
|       DESIRED_DEVTOOLSET: ${{ inputs.DESIRED_DEVTOOLSET }} | ||||
|       DESIRED_PYTHON: ${{ inputs.DESIRED_PYTHON }} | ||||
|       ALPINE_IMAGE: ${{ inputs.ALPINE_IMAGE }} | ||||
|       AWS_DEFAULT_REGION: us-east-1 | ||||
| @ -113,6 +118,7 @@ jobs: | ||||
|             echo "SKIP_ALL_TESTS=${{ env.SKIP_ALL_TESTS }}" | ||||
|             echo "LIBTORCH_CONFIG=${{ env.LIBTORCH_CONFIG }}" | ||||
|             echo "LIBTORCH_VARIANT=${{ env.LIBTORCH_VARIANT }}" | ||||
|             echo "DESIRED_DEVTOOLSET=${{ env.DESIRED_DEVTOOLSET }}" | ||||
|             echo "DESIRED_PYTHON=${{ env.DESIRED_PYTHON }}" | ||||
|  | ||||
|             echo "ALPINE_IMAGE=${{ env.ALPINE_IMAGE }}" | ||||
| @ -127,14 +133,14 @@ jobs: | ||||
|  | ||||
|       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" | ||||
|         if: inputs.build_environment != 'linux-s390x-binary-manywheel' | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@main | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 | ||||
|         continue-on-error: true | ||||
|         with: | ||||
|           github-secret: ${{ secrets.github-token }} | ||||
|  | ||||
|         # Setup the environment | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|         with: | ||||
|           no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }} | ||||
|  | ||||
| @ -157,7 +163,6 @@ jobs: | ||||
|       - name: Checkout PyTorch to pytorch dir | ||||
|         uses: actions/checkout@v4 | ||||
|         with: | ||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} | ||||
|           submodules: recursive | ||||
|           show-progress: false | ||||
|           path: pytorch | ||||
| @ -188,12 +193,12 @@ jobs: | ||||
|           path: "${{ runner.temp }}/artifacts/" | ||||
|  | ||||
|       - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG | ||||
|         uses: pytorch/test-infra/.github/actions/setup-nvidia@main | ||||
|         uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.7 | ||||
|         if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }} | ||||
|  | ||||
|       - name: Pull Docker image | ||||
|         if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 | ||||
|         with: | ||||
|           docker-image: ${{ inputs.DOCKER_IMAGE }} | ||||
|  | ||||
| @ -203,7 +208,7 @@ jobs: | ||||
|  | ||||
|       - name: Teardown Linux | ||||
|         if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' | ||||
|         uses: pytorch/test-infra/.github/actions/teardown-linux@main | ||||
|         uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 | ||||
|  | ||||
|       - name: Chown workspace | ||||
|         if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' | ||||
|  | ||||

11  .github/workflows/_binary-upload.yml  vendored

| @ -43,6 +43,10 @@ on: | ||||
|         required: false | ||||
|         type: string | ||||
|         description: Desired libtorch variant (for libtorch builds only) | ||||
|       DESIRED_DEVTOOLSET: | ||||
|         required: false | ||||
|         type: string | ||||
|         description: Desired dev toolset | ||||
|       DESIRED_PYTHON: | ||||
|         required: false | ||||
|         type: string | ||||
| @ -76,6 +80,7 @@ jobs: | ||||
|       SKIP_ALL_TESTS: 1 | ||||
|       LIBTORCH_CONFIG: ${{ inputs.LIBTORCH_CONFIG }} | ||||
|       LIBTORCH_VARIANT: ${{ inputs.LIBTORCH_VARIANT }} | ||||
|       DESIRED_DEVTOOLSET: ${{ inputs.DESIRED_DEVTOOLSET }} | ||||
|       DESIRED_PYTHON: ${{ inputs.DESIRED_PYTHON }} | ||||
|       BINARY_ENV_FILE: /tmp/env | ||||
|       GITHUB_TOKEN: ${{ secrets.github-token }} | ||||
| @ -85,20 +90,20 @@ jobs: | ||||
|       USE_SPLIT_BUILD: ${{ inputs.use_split_build }} | ||||
|     steps: | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|         with: | ||||
|           no-sudo: true | ||||
|  | ||||
|       - name: Configure AWS credentials(PyTorch account) for nightly | ||||
|         if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/nightly' }} | ||||
|         uses: aws-actions/configure-aws-credentials@v4 | ||||
|         uses: aws-actions/configure-aws-credentials@v3 | ||||
|         with: | ||||
|           role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels | ||||
|           aws-region: us-east-1 | ||||
|  | ||||
|       - name: Configure AWS credentials(PyTorch account) for RC builds | ||||
|         if: ${{ github.event_name == 'push' &&  (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }} | ||||
|         uses: aws-actions/configure-aws-credentials@v4 | ||||
|         uses: aws-actions/configure-aws-credentials@v3 | ||||
|         with: | ||||
|           role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels | ||||
|           aws-region: us-east-1 | ||||
|  | ||||

14  .github/workflows/_docs.yml  vendored

| @ -84,7 +84,7 @@ jobs: | ||||
|     name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }} | ||||
|     steps: | ||||
|       - name: Setup SSH (Click me for login details) | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@main | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 | ||||
|         with: | ||||
|           github-secret: ${{ secrets.GITHUB_TOKEN }} | ||||
|           instructions: | | ||||
| @ -95,14 +95,14 @@ jobs: | ||||
|  | ||||
|       # [see note: pytorch repo ref] | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|  | ||||
|       - name: Setup Linux | ||||
|         uses: ./.github/actions/setup-linux | ||||
|  | ||||
|       - name: configure aws credentials | ||||
|         if : ${{ inputs.aws-role-to-assume != '' }} | ||||
|         uses: aws-actions/configure-aws-credentials@v4 | ||||
|         uses: aws-actions/configure-aws-credentials@v3 | ||||
|         with: | ||||
|           role-to-assume: ${{ inputs.aws-role-to-assume }} | ||||
|           role-session-name: gha-linux-test | ||||
| @ -110,12 +110,12 @@ jobs: | ||||
|  | ||||
|       - name: Calculate docker image | ||||
|         id: calculate-docker-image | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 | ||||
|         with: | ||||
|           docker-image-name: ${{ inputs.docker-image }} | ||||
|  | ||||
|       - name: Pull docker image | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 | ||||
|         with: | ||||
|           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} | ||||
|  | ||||
| @ -185,7 +185,7 @@ jobs: | ||||
|  | ||||
|       - name: configure aws credentials | ||||
|         if : ${{ inputs.upload-aws-role-to-assume != '' }} | ||||
|         uses: aws-actions/configure-aws-credentials@v4 | ||||
|         uses: aws-actions/configure-aws-credentials@v3 | ||||
|         with: | ||||
|           role-to-assume: ${{ inputs.upload-aws-role-to-assume }} | ||||
|           role-session-name: gha-linux-test | ||||
| @ -222,5 +222,5 @@ jobs: | ||||
|           s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs | ||||
|  | ||||
|       - name: Teardown Linux | ||||
|         uses: pytorch/test-infra/.github/actions/teardown-linux@main | ||||
|         uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 | ||||
|         if: always() | ||||
|  | ||||

12  .github/workflows/_linux-build.yml  vendored

| @ -106,7 +106,7 @@ jobs: | ||||
|       test-matrix: ${{ steps.filter.outputs.test-matrix }} | ||||
|     steps: | ||||
|       - name: Setup SSH (Click me for login details) | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@main | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 | ||||
|         if: inputs.build-environment != 'linux-s390x-binary-manywheel' | ||||
|         with: | ||||
|           github-secret: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -116,7 +116,7 @@ jobs: | ||||
|       # checkout because when we run this action we don't *have* a local | ||||
|       # checkout. In other cases you should prefer a local checkout. | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|         with: | ||||
|           no-sudo: true | ||||
|  | ||||
| @ -125,7 +125,7 @@ jobs: | ||||
|         if: inputs.build-environment != 'linux-s390x-binary-manywheel' | ||||
|  | ||||
|       - name: configure aws credentials | ||||
|         uses: aws-actions/configure-aws-credentials@v4 | ||||
|         uses: aws-actions/configure-aws-credentials@v3 | ||||
|         if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }} | ||||
|         with: | ||||
|           role-to-assume: ${{ inputs.aws-role-to-assume }} | ||||
| @ -134,7 +134,7 @@ jobs: | ||||
|  | ||||
|       - name: Calculate docker image | ||||
|         id: calculate-docker-image | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 | ||||
|         if: inputs.build-environment != 'linux-s390x-binary-manywheel' | ||||
|         with: | ||||
|           docker-image-name: ${{ inputs.docker-image-name }} | ||||
| @ -150,7 +150,7 @@ jobs: | ||||
|           echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" | ||||
|  | ||||
|       - name: Pull docker image | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 | ||||
|         if: inputs.build-environment != 'linux-s390x-binary-manywheel' | ||||
|         with: | ||||
|           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} | ||||
| @ -312,7 +312,7 @@ jobs: | ||||
|           build-time: ${{ steps.build.outputs.build_time }} | ||||
|  | ||||
|       - name: Teardown Linux | ||||
|         uses: pytorch/test-infra/.github/actions/teardown-linux@main | ||||
|         uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 | ||||
|         if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel' | ||||
|  | ||||
|       - name: Cleanup docker | ||||
|  | ||||

18  .github/workflows/_linux-test.yml  vendored

| @ -80,7 +80,7 @@ jobs: | ||||
|     timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} | ||||
|     steps: | ||||
|       - name: Setup SSH (Click me for login details) | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@main | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 | ||||
|         if: ${{ !contains(matrix.runner, 'gcp.a100') && inputs.build-environment != 'linux-s390x-binary-manywheel' }} | ||||
|         with: | ||||
|           github-secret: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -89,7 +89,7 @@ jobs: | ||||
|               docker exec -it $(docker container ps --format '{{.ID}}') bash | ||||
|  | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|         with: | ||||
|           no-sudo: true | ||||
|  | ||||
| @ -99,7 +99,7 @@ jobs: | ||||
|  | ||||
|       - name: configure aws credentials | ||||
|         if : ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }} | ||||
|         uses: aws-actions/configure-aws-credentials@v4 | ||||
|         uses: aws-actions/configure-aws-credentials@v3 | ||||
|         with: | ||||
|           role-to-assume: ${{ inputs.aws-role-to-assume }} | ||||
|           role-session-name: gha-linux-test | ||||
| @ -107,7 +107,7 @@ jobs: | ||||
|  | ||||
|       - name: Calculate docker image | ||||
|         id: calculate-docker-image | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 | ||||
|         if: inputs.build-environment != 'linux-s390x-binary-manywheel' | ||||
|         with: | ||||
|           docker-image-name: ${{ inputs.docker-image }} | ||||
| @ -123,7 +123,7 @@ jobs: | ||||
|           echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" | ||||
|  | ||||
|       - name: Pull docker image | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 | ||||
|         if: inputs.build-environment != 'linux-s390x-binary-manywheel' | ||||
|         with: | ||||
|           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} | ||||
| @ -135,7 +135,7 @@ jobs: | ||||
|  | ||||
|       - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG | ||||
|         id: install-nvidia-driver | ||||
|         uses: pytorch/test-infra/.github/actions/setup-nvidia@main | ||||
|         uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.7 | ||||
|         if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} | ||||
|  | ||||
|       - name: Setup GPU_FLAG for docker run | ||||
| @ -371,7 +371,7 @@ jobs: | ||||
|           job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} | ||||
|  | ||||
|       - name: Upload the benchmark results | ||||
|         uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main | ||||
|         uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.7 | ||||
|         with: | ||||
|           benchmark-results-dir: test/test-reports | ||||
|           dry-run: false | ||||
| @ -428,7 +428,7 @@ jobs: | ||||
|           workflow_attempt: ${{github.run_attempt}} | ||||
|  | ||||
|       - name: Teardown Linux | ||||
|         uses: pytorch/test-infra/.github/actions/teardown-linux@main | ||||
|         uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 | ||||
|         if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' | ||||
|  | ||||
|       # NB: We are currently having an intermittent GPU-related issue on G5 runners with | ||||
| @ -445,6 +445,8 @@ jobs: | ||||
|       - name: Check NVIDIA driver installation step | ||||
|         if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped' | ||||
|         shell: bash | ||||
|         env: | ||||
|           RUNNER_WORKSPACE: ${{ runner.workspace }} | ||||
|         run: | | ||||
|           set +e | ||||
|           set -x | ||||
|  | ||||

23  .github/workflows/_mac-build.yml  vendored

| @ -33,6 +33,10 @@ on: | ||||
|         default: "3.9" | ||||
|         description: | | ||||
|           The python version to be used. Will be 3.9 by default | ||||
|       environment-file: | ||||
|         required: false | ||||
|         type: string | ||||
|         description: Set the conda environment file used to setup macOS build. | ||||
|       test-matrix: | ||||
|         required: false | ||||
|         type: string | ||||
| @ -67,11 +71,11 @@ jobs: | ||||
|       test-matrix: ${{ steps.filter.outputs.test-matrix }} | ||||
|     steps: | ||||
|       - name: Clean up disk space before running MacOS workflow | ||||
|         uses: pytorch/test-infra/.github/actions/check-disk-space@main | ||||
|         uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7 | ||||
|  | ||||
|       # [see note: pytorch repo ref] | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|  | ||||
|       - name: Set xcode version | ||||
|         env: | ||||
| @ -82,12 +86,23 @@ jobs: | ||||
|           fi | ||||
|  | ||||
|       - name: Setup miniconda | ||||
|         uses: pytorch/test-infra/.github/actions/setup-miniconda@main | ||||
|         if: inputs.environment-file == '' | ||||
|         uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.7 | ||||
|         with: | ||||
|           python-version: ${{ inputs.python-version }} | ||||
|           environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }} | ||||
|           pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt | ||||
|  | ||||
|       # This option is used when cross-compiling arm64 from x86-64. Specifically, we need arm64 conda | ||||
|       # environment even though the arch is x86-64 | ||||
|       - name: Setup miniconda using the provided environment file | ||||
|         if: inputs.environment-file != '' | ||||
|         uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.7 | ||||
|         with: | ||||
|           python-version: ${{ inputs.python-version }} | ||||
|           environment-file: ${{ inputs.environment-file }} | ||||
|           pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt | ||||
|  | ||||
|       - name: Install sccache (only for non-forked PRs, and pushes to trunk) | ||||
|         uses: nick-fields/retry@v3.0.0 | ||||
|         if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} | ||||
| @ -192,4 +207,4 @@ jobs: | ||||
|       - name: Clean up disk space | ||||
|         if: always() | ||||
|         continue-on-error: true | ||||
|         uses: pytorch/test-infra/.github/actions/check-disk-space@main | ||||
|         uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7 | ||||
|  | ||||

14  .github/workflows/_mac-test-mps.yml  vendored

| @ -41,7 +41,7 @@ jobs: | ||||
|       reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }} | ||||
|     steps: | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|         with: | ||||
|           submodules: false | ||||
|  | ||||
| @ -82,7 +82,7 @@ jobs: | ||||
|           use-gha: true | ||||
|  | ||||
|       - name: Setup miniconda | ||||
|         uses: pytorch/test-infra/.github/actions/setup-miniconda@main | ||||
|         uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.7 | ||||
|         with: | ||||
|           python-version: ${{ inputs.python-version }} | ||||
|           environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }} | ||||
| @ -160,14 +160,6 @@ jobs: | ||||
|         run: | | ||||
|           cat test/**/*_toprint.log || true | ||||
|  | ||||
|       - name: Run OP benchmark | ||||
|         run: | | ||||
|           if [[ -n "$CONDA_ENV" ]]; then | ||||
|             # Use binaries under conda environment | ||||
|             export PATH="$CONDA_ENV/bin":$PATH | ||||
|           fi | ||||
|           ${CONDA_RUN} python3 test/bench_mps_ops.py | ||||
|  | ||||
|       - name: Upload test artifacts | ||||
|         uses: ./.github/actions/upload-test-artifacts | ||||
|         if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped' | ||||
| @ -178,4 +170,4 @@ jobs: | ||||
|       - name: Clean up disk space | ||||
|         if: always() | ||||
|         continue-on-error: true | ||||
|         uses: pytorch/test-infra/.github/actions/check-disk-space@main | ||||
|         uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7 | ||||
|  | ||||

10  .github/workflows/_mac-test.yml  vendored

| @ -82,11 +82,11 @@ jobs: | ||||
|           done | ||||
|  | ||||
|       - name: Clean up disk space before running MacOS workflow | ||||
|         uses: pytorch/test-infra/.github/actions/check-disk-space@main | ||||
|         uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7 | ||||
|  | ||||
|       # [see note: pytorch repo ref] | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|  | ||||
|       - name: Start monitoring script | ||||
|         id: monitor-script | ||||
| @ -109,7 +109,7 @@ jobs: | ||||
|           use-gha: true | ||||
|  | ||||
|       - name: Setup miniconda | ||||
|         uses: pytorch/test-infra/.github/actions/setup-miniconda@main | ||||
|         uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.7 | ||||
|         with: | ||||
|           python-version: ${{ inputs.python-version }} | ||||
|           environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }} | ||||
| @ -224,7 +224,7 @@ jobs: | ||||
|           file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} | ||||
|  | ||||
|       - name: Upload the benchmark results | ||||
|         uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main | ||||
|         uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.7 | ||||
|         with: | ||||
|           benchmark-results-dir: test/test-reports | ||||
|           dry-run: false | ||||
| @ -234,4 +234,4 @@ jobs: | ||||
|       - name: Clean up disk space | ||||
|         if: always() | ||||
|         continue-on-error: true | ||||
|         uses: pytorch/test-infra/.github/actions/check-disk-space@main | ||||
|         uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7 | ||||
|  | ||||

8  .github/workflows/_rocm-test.yml  vendored

| @ -70,7 +70,7 @@ jobs: | ||||
|     steps: | ||||
|       # [see note: pytorch repo ref] | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|         with: | ||||
|           no-sudo: true | ||||
|  | ||||
| @ -92,12 +92,12 @@ jobs: | ||||
|  | ||||
|       - name: Calculate docker image | ||||
|         id: calculate-docker-image | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 | ||||
|         with: | ||||
|           docker-image-name: ${{ inputs.docker-image }} | ||||
|  | ||||
|       - name: Pull docker image | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 | ||||
|         with: | ||||
|           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} | ||||
|  | ||||
| @ -302,7 +302,7 @@ jobs: | ||||
|           aws-region: us-east-1 | ||||
|  | ||||
|       - name: Upload the benchmark results | ||||
|         uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main | ||||
|         uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.7 | ||||
|         with: | ||||
|           benchmark-results-dir: test/test-reports | ||||
|           dry-run: false | ||||
|  | ||||

2  .github/workflows/_runner-determinator.yml  vendored

| @ -54,7 +54,7 @@ jobs: | ||||
|       PR_NUMBER: ${{ github.event.pull_request.number }} | ||||
|     steps: | ||||
|       # - name: Checkout PyTorch | ||||
|       #   uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|       #   uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|       #   with: | ||||
|       #     fetch-depth: 1 | ||||
|       #     submodules: true | ||||
|  | ||||

6  .github/workflows/_win-build.yml  vendored

| @ -84,10 +84,10 @@ jobs: | ||||
|           git config --global core.fsmonitor false | ||||
|  | ||||
|       - name: Clean up leftover processes on non-ephemeral Windows runner | ||||
|         uses: pytorch/test-infra/.github/actions/cleanup-runner@main | ||||
|         uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.7 | ||||
|  | ||||
|       - name: Setup SSH (Click me for login details) | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@main | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 | ||||
|         with: | ||||
|           github-secret: ${{ secrets.GITHUB_TOKEN }} | ||||
|           instructions: | | ||||
| @ -102,7 +102,7 @@ jobs: | ||||
|  | ||||
|       # [see note: pytorch repo ref] | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|         with: | ||||
|           no-sudo: true | ||||
|  | ||||
|  | ||||

6  .github/workflows/_win-test.yml  vendored

| @ -66,10 +66,10 @@ jobs: | ||||
|           git config --global core.fsmonitor false | ||||
|  | ||||
|       - name: Clean up leftover processes on non-ephemeral Windows runner | ||||
|         uses: pytorch/test-infra/.github/actions/cleanup-runner@main | ||||
|         uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.7 | ||||
|  | ||||
|       - name: Setup SSH (Click me for login details) | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@main | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 | ||||
|         with: | ||||
|           github-secret: ${{ secrets.GITHUB_TOKEN }} | ||||
|           instructions: | | ||||
| @ -85,7 +85,7 @@ jobs: | ||||
|  | ||||
|       # [see note: pytorch repo ref] | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|         with: | ||||
|           no-sudo: true | ||||
|  | ||||
|  | ||||

22  .github/workflows/_xpu-test.yml  vendored

| @ -62,14 +62,14 @@ jobs: | ||||
|     steps: | ||||
|       # [see note: pytorch repo ref] | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|  | ||||
|       - name: Setup XPU | ||||
|         uses: ./.github/actions/setup-xpu | ||||
|  | ||||
|       - name: configure aws credentials | ||||
|         id: aws_creds | ||||
|         uses: aws-actions/configure-aws-credentials@v4 | ||||
|         uses: aws-actions/configure-aws-credentials@v1.7.0 | ||||
|         with: | ||||
|           role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only | ||||
|           aws-region: us-east-1 | ||||
| @ -80,21 +80,12 @@ jobs: | ||||
|  | ||||
|       - name: Calculate docker image | ||||
|         id: calculate-docker-image | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 | ||||
|         with: | ||||
|           docker-image-name: ${{ inputs.docker-image }} | ||||
|  | ||||
|       - name: Use following to pull public copy of the image | ||||
|         id: print-ghcr-mirror | ||||
|         env: | ||||
|           ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} | ||||
|         shell: bash | ||||
|         run: | | ||||
|           tag=${ECR_DOCKER_IMAGE##*/} | ||||
|           echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" | ||||
|  | ||||
|       - name: Pull docker image | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 | ||||
|         with: | ||||
|           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} | ||||
|  | ||||
| @ -253,11 +244,6 @@ jobs: | ||||
|           # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct | ||||
|           docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" | ||||
|  | ||||
|       - name: Change permissions | ||||
|         if: ${{ always() && steps.test.conclusion }} | ||||
|         run: | | ||||
|           docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" | ||||
|  | ||||
|       - name: Print remaining test logs | ||||
|         shell: bash | ||||
|         if: always() && steps.test.conclusion | ||||
|  | ||||

2  .github/workflows/assigntome-docathon.yml  vendored

| @ -12,7 +12,7 @@ jobs: | ||||
|       issues: write | ||||
|     steps: | ||||
|       - name: Check for "/assigntome" in comment | ||||
|         uses: actions/github-script@v7 | ||||
|         uses: actions/github-script@v6 | ||||
|         env: | ||||
|           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||||
|         with: | ||||
|  | ||||

4  .github/workflows/build-almalinux-images.yml  vendored

| @ -41,12 +41,12 @@ jobs: | ||||
|       CUDA_VERSION: ${{ matrix.cuda_version }} | ||||
|     steps: | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|         with: | ||||
|           submodules: false | ||||
|       - name: Calculate docker image | ||||
|         if: env.WITH_PUSH == 'false' | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 | ||||
|         with: | ||||
|             docker-image-name: almalinux-builder${{ matrix.cuda_version == 'cpu' && '-' || '-cuda' }}${{matrix.cuda_version}} | ||||
|             docker-build-dir:  .ci/docker/almalinux | ||||
|  | ||||

14  .github/workflows/build-libtorch-images.yml  vendored

| @ -32,7 +32,7 @@ jobs: | ||||
|   get-label-type: | ||||
|     if: github.repository_owner == 'pytorch' | ||||
|     name: get-label-type | ||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main | ||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 | ||||
|     with: | ||||
|       triggering_actor: ${{ github.triggering_actor }} | ||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||
| @ -51,12 +51,12 @@ jobs: | ||||
|       GPU_ARCH_VERSION: ${{ matrix.cuda_version }} | ||||
|     steps: | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|         with: | ||||
|           submodules: false | ||||
|       - name: Calculate docker image | ||||
|         if: env.WITH_PUSH == 'false' | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 | ||||
|         with: | ||||
|             docker-image-name: libtorch-cxx11-builder-cuda${{matrix.cuda_version}} | ||||
|             docker-build-dir:  .ci/docker/libtorch | ||||
| @ -93,12 +93,12 @@ jobs: | ||||
|       GPU_ARCH_VERSION: ${{ matrix.rocm_version }} | ||||
|     steps: | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|         with: | ||||
|           submodules: false | ||||
|       - name: Calculate docker image | ||||
|         if: env.WITH_PUSH == 'false' | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 | ||||
|         with: | ||||
|             docker-image-name: libtorch-cxx11-builder-rocm${{matrix.rocm_version}} | ||||
|             docker-build-dir:  .ci/docker/libtorch | ||||
| @ -129,12 +129,12 @@ jobs: | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral" | ||||
|     steps: | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 | ||||
|         with: | ||||
|           submodules: false | ||||
|       - name: Calculate docker image | ||||
|         if: env.WITH_PUSH == 'false' | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@main | ||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 | ||||
|         with: | ||||
|             docker-image-name: libtorch-cxx11-builder-cpu | ||||
|             docker-build-dir:  .ci/docker/libtorch | ||||
|  | ||||
Some files were not shown because too many files have changed in this diff.