CUDA 13.0 Windows Nvidia Driver Update to 580.88 (#162501 )

CUDA 13.0 Windows Nvidia Driver Update to 580.88 (#162425) Related to https://github.com/pytorch/pytorch/issues/162333 https://github.com/pytorch/pytorch/issues/159779 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162425 Approved by: https://github.com/tinglvv, https://github.com/malfet (cherry picked from commit e38e953432764e00f16999c8b7df6346ad357a16) Co-authored-by: atalman <atalman@fb.com>
[Release 2.9] Release only changes (#162493 )
2025-10-23 06:34:55 +08:00 · 2025-09-09 14:27:57 -04:00 · 2025-09-09 11:15:20 -07:00
1993 changed files with 25469 additions and 60468 deletions
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@ -5,9 +5,9 @@ GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}

 # Set CUDA architecture lists to match x86 build_cuda.sh
 if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0"
+    export TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;8.0;9.0"
 elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
+    export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0;10.0;12.0"
 elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
 fi
@ -15,8 +15,6 @@ fi
 # Compress the fatbin with -compress-mode=size for CUDA 13
 if [[ "$DESIRED_CUDA" == *"13"* ]]; then
    export TORCH_NVCC_FLAGS="-compress-mode=size"
-    # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
-    export BUILD_BUNDLE_PTXAS=1
 fi

 SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
@ -33,7 +31,8 @@ pip install -r /pytorch/requirements.txt
 pip install auditwheel==6.2.0 wheel
 if [ "$DESIRED_CUDA" = "cpu" ]; then
    echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
-    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
+    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
+    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
 else
    echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
    export USE_SYSTEM_NCCL=1
@ -43,9 +42,13 @@ else
        echo "Bundling CUDA libraries with wheel for aarch64."
    else
        echo "Using nvidia libs from pypi for aarch64."
+        # Fix platform constraints in PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64
+        # Replace 'platform_machine == "x86_64"' with 'platform_machine == "aarch64"'
+        export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS//platform_machine == \'x86_64\'/platform_machine == \'aarch64\'}"
        echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
        export USE_NVIDIA_PYPI_LIBS=1
    fi

-    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
+    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
+    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
 fi
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -138,8 +138,6 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
    folder = os.path.dirname(wheel_path)
    os.mkdir(f"{folder}/tmp")
    os.system(f"unzip {wheel_path} -d {folder}/tmp")
-    # Delete original wheel since it will be repackaged
-    os.system(f"rm {wheel_path}")

    # Check if we should use PyPI NVIDIA libraries or bundle system libraries
    use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
@ -213,8 +211,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
        ]

        # CUDA version-specific libraries
-        if "13" in desired_cuda:
-            minor_version = desired_cuda[-1]
+        if "130" in desired_cuda:
            version_specific_libs = [
                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
                "/usr/local/cuda/lib64/libcublas.so.13",
@ -224,7 +221,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
                "/usr/local/cuda/lib64/libcusolver.so.12",
                "/usr/local/cuda/lib64/libnvJitLink.so.13",
                "/usr/local/cuda/lib64/libnvrtc.so.13",
-                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
+                "/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0",
            ]
        elif "12" in desired_cuda:
            # Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
@ -240,8 +237,6 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
                "/usr/local/cuda/lib64/libnvrtc.so.12",
                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
            ]
-        else:
-            raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")

        # Combine all libraries
        libs_to_copy = common_libs + version_specific_libs
@ -280,7 +275,14 @@ def complete_wheel(folder: str) -> str:
            f"/{folder}/dist/{repaired_wheel_name}",
        )
    else:
-        repaired_wheel_name = list_dir(f"/{folder}/dist")[0]
+        repaired_wheel_name = wheel_name.replace(
+            "linux_aarch64", "manylinux_2_28_aarch64"
+        )
+        print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}")
+        os.rename(
+            f"/{folder}/dist/{wheel_name}",
+            f"/{folder}/dist/{repaired_wheel_name}",
+        )

    print(f"Copying {repaired_wheel_name} to artifacts")
    shutil.copy2(
@ -317,7 +319,7 @@ if __name__ == "__main__":
    ).decode()

    print("Building PyTorch wheel")
-    build_vars = ""
+    build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
    # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
    if enable_cuda:
        build_vars += "MAX_JOBS=5 "
@ -372,7 +374,7 @@ if __name__ == "__main__":
    else:
        print("build pytorch without mkldnn backend")

-    os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation")
+    os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel")
    if enable_cuda:
        print("Updating Cuda Dependency")
        filename = os.listdir("/pytorch/dist/")
--- a/.ci/aarch64_linux/build_aarch64_wheel.py
+++ b/.ci/aarch64_linux/build_aarch64_wheel.py
@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
        try:
            with socket.create_connection((addr, port), timeout=timeout):
                return
-        except (ConnectionRefusedError, TimeoutError):  # noqa: PERF203
+        except (ConnectionRefusedError, socket.timeout):  # noqa: PERF203
            if i == attempt_cnt - 1:
                raise
            time.sleep(timeout)
@ -442,7 +442,7 @@ def build_torchvision(
    if host.using_docker():
        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

-    host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation")
+    host.run_cmd(f"cd vision && {build_vars} python3 setup.py bdist_wheel")
    vision_wheel_name = host.list_dir("vision/dist")[0]
    embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name))

@ -497,7 +497,7 @@ def build_torchdata(
    if host.using_docker():
        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

-    host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation")
+    host.run_cmd(f"cd data && {build_vars} python3 setup.py bdist_wheel")
    wheel_name = host.list_dir("data/dist")[0]
    embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name))

@ -553,7 +553,7 @@ def build_torchtext(
    if host.using_docker():
        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

-    host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation")
+    host.run_cmd(f"cd text && {build_vars} python3 setup.py bdist_wheel")
    wheel_name = host.list_dir("text/dist")[0]
    embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name))

@ -614,7 +614,7 @@ def build_torchaudio(
    host.run_cmd(
        f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \
        && ./packaging/ffmpeg/build.sh \
-        && {build_vars} python3 -m build --wheel --no-isolation"
+        && {build_vars} python3 setup.py bdist_wheel"
    )

    wheel_name = host.list_dir("audio/dist")[0]
@ -726,7 +726,7 @@ def start_build(
    print("Building PyTorch wheel")
    build_opts = ""
    if pytorch_build_number is not None:
-        build_opts += f" -C--build-option=--build-number={pytorch_build_number}"
+        build_opts += f" --build-number {pytorch_build_number}"
    # Breakpad build fails on aarch64
    build_vars = "USE_BREAKPAD=0 "
    if branch == "nightly":
@ -747,8 +747,7 @@ def start_build(
        print("build pytorch with mkldnn+acl backend")
        build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
        host.run_cmd(
-            f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && "
-            f"{build_vars} python3 -m build --wheel --no-isolation{build_opts}"
+            f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}"
        )
        print("Repair the wheel")
        pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
@ -764,7 +763,7 @@ def start_build(
    else:
        print("build pytorch without mkldnn backend")
        host.run_cmd(
-            f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
+            f"cd pytorch && {build_vars} python3 setup.py bdist_wheel{build_opts}"
        )

    print("Deleting build folder")
@ -1005,7 +1004,7 @@ if __name__ == "__main__":
        install_condaforge_python(host, args.python_version)
        sys.exit(0)

-    python_version = args.python_version if args.python_version is not None else "3.10"
+    python_version = args.python_version if args.python_version is not None else "3.9"

    if args.use_torch_from_pypi:
        configure_system(host, compiler=args.compiler, python_version=python_version)
--- a/.ci/docker/almalinux/Dockerfile
+++ b/.ci/docker/almalinux/Dockerfile
@ -69,8 +69,7 @@ RUN bash ./install_cuda.sh 13.0
 ENV DESIRED_CUDA=13.0

 FROM ${ROCM_IMAGE} as rocm
-ARG PYTORCH_ROCM_ARCH
-ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
+ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
 ADD ./common/install_mkl.sh install_mkl.sh
 RUN bash ./install_mkl.sh && rm install_mkl.sh
 ENV MKLROOT /opt/intel
--- a/.ci/docker/almalinux/build.sh
+++ b/.ci/docker/almalinux/build.sh
@ -36,12 +36,6 @@ case ${DOCKER_TAG_PREFIX} in
    ;;
  rocm*)
    BASE_TARGET=rocm
-    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-    # add gfx950 conditionally starting in ROCm 7.0
-    if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
-        PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
-    fi
-    EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
    ;;
  *)
    echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -84,8 +84,8 @@ fi
 _UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152
 _UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96
 if [[ "$image" == *rocm* ]]; then
-  _UCX_COMMIT=29831d319e6be55cb8c768ca61de335c934ca39e
-  _UCC_COMMIT=9f4b242cbbd8b1462cbc732eb29316cdfa124b77
+  _UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
+  _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
 fi

 tag=$(echo $image | awk -F':' '{print $2}')
@ -175,6 +175,20 @@ case "$tag" in
    fi
    GCC_VERSION=11
    VISION=yes
+    ROCM_VERSION=6.4
+    NINJA_VERSION=1.9.0
+    TRITON=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    if [[ $tag =~ "benchmarks" ]]; then
+      INDUCTOR_BENCHMARKS=yes
+    fi
+    ;;
+  pytorch-linux-noble-rocm-alpha-py3)
+    ANACONDA_PYTHON_VERSION=3.12
+    GCC_VERSION=11
+    VISION=yes
    ROCM_VERSION=7.0
    NINJA_VERSION=1.9.0
    TRITON=yes
@ -182,9 +196,6 @@ case "$tag" in
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
-    if [[ $tag =~ "benchmarks" ]]; then
-      INDUCTOR_BENCHMARKS=yes
-    fi
    ;;
  pytorch-linux-jammy-xpu-n-1-py3)
    ANACONDA_PYTHON_VERSION=3.10
@ -203,7 +214,8 @@ case "$tag" in
    TRITON=yes
    ;;
  pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
-    ANACONDA_PYTHON_VERSION=3.10
+    # TODO (huydhn): Upgrade this to Python >= 3.10
+    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    VISION=yes
    KATEX=yes
@ -251,10 +263,13 @@ case "$tag" in
    TRITON_CPU=yes
    ;;
  pytorch-linux-jammy-linter)
-    PYTHON_VERSION=3.10
+    # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
+    # We will need to update mypy version eventually, but that's for another day. The task
+    # would be to upgrade mypy to 1.0.0 with Python 3.11
+    PYTHON_VERSION=3.9
    ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter)
-    PYTHON_VERSION=3.10
+  pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
+    PYTHON_VERSION=3.9
    CUDA_VERSION=12.8.1
    ;;
  pytorch-linux-jammy-aarch64-py3.10-gcc11)
@ -441,3 +456,12 @@ elif [ "$HAS_TRITON" = "yes" ]; then
  echo "expecting triton to not be installed, but it is"
  exit 1
 fi
+
+# Sanity check cmake version.  Executorch reinstalls cmake and I'm not sure if
+# they support 4.0.0 yet, so exclude them from this check.
+CMAKE_VERSION=$(drun cmake --version)
+if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then
+  echo "CMake version is not 4.0.0:"
+  drun cmake --version
+  exit 1
+fi
--- a/.ci/docker/centos-rocm/Dockerfile
+++ b/.ci/docker/centos-rocm/Dockerfile
@ -56,13 +56,9 @@ ENV INSTALLED_VISION ${VISION}

 # Install rocm
 ARG ROCM_VERSION
-RUN mkdir ci_commit_pins
-COPY ./common/common_utils.sh common_utils.sh
-COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt
 COPY ./common/install_rocm.sh install_rocm.sh
 RUN bash ./install_rocm.sh
-RUN rm install_rocm.sh common_utils.sh
-RUN rm -r ci_commit_pins
+RUN rm install_rocm.sh
 COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
 RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
 RUN rm install_rocm_magma.sh
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +1 @@
-e0dda9059d082537cee36be6c5e4fe3b18c880c0
+56392aa978594cc155fa8af48cd949f5b5f1823a
--- a/.ci/docker/ci_commit_pins/huggingface-requirements.txt
+++ b/.ci/docker/ci_commit_pins/huggingface-requirements.txt
@ -1,2 +1,2 @@
-transformers==4.56.0
+transformers==4.54.0
 soxr==0.5.0
--- a/.ci/docker/ci_commit_pins/nccl-cu12.txt
+++ b/.ci/docker/ci_commit_pins/nccl-cu12.txt
@ -1 +1 @@
-v2.28.3-1
+v2.27.5-1
--- a/.ci/docker/ci_commit_pins/nccl-cu13.txt
+++ b/.ci/docker/ci_commit_pins/nccl-cu13.txt
@ -1 +1 @@
-v2.28.3-1
+v2.27.7-1
--- a/.ci/docker/ci_commit_pins/rocm-composable-kernel.txt
+++ b/.ci/docker/ci_commit_pins/rocm-composable-kernel.txt
@ -1 +0,0 @@
-7fe50dc3da2069d6645d9deb8c017a876472a977
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-bbb06c0334a6772b92d24bde54956e675c8c6604
+fccfc522864cf8bc172abe0cd58ae5581e2d44b9
--- a/.ci/docker/common/install_executorch.sh
+++ b/.ci/docker/common/install_executorch.sh
@ -42,27 +42,22 @@ install_pip_dependencies() {
  # A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current
  # numba and scipy version used in PyTorch CI
  conda_run pip uninstall -y numba scipy
-  # Yaspin is needed for running CI test (get_benchmark_analysis_data.py)
-  pip_install yaspin==3.1.0

  popd
 }

 setup_executorch() {
+  pushd executorch
+
  export PYTHON_EXECUTABLE=python
-  export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON"
+  export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

  as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
+  popd
 }

-if [ $# -eq 0 ]; then
-  clone_executorch
-  install_buck2
-  install_conda_dependencies
-  install_pip_dependencies
-  pushd executorch
-  setup_executorch
-  popd
-else
-  "$@"
-fi
+clone_executorch
+install_buck2
+install_conda_dependencies
+install_pip_dependencies
+setup_executorch
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@ -2,11 +2,6 @@

 set -ex

-# for pip_install function
-source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
-
-ROCM_COMPOSABLE_KERNEL_VERSION="$(cat $(dirname $0)/../ci_commit_pins/rocm-composable-kernel.txt)"
-
 ver() {
    printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
 }
@ -42,6 +37,12 @@ EOF
    rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
    amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"

+    # Special case for ROCM_VERSION == 7.0
+    if [[ $(ver "$ROCM_VERSION") -eq $(ver 7.0) ]]; then
+        rocm_baseurl="https://repo.radeon.com/rocm/apt/7.0_alpha2"
+        amdgpu_baseurl="https://repo.radeon.com/amdgpu/30.10_alpha2/ubuntu"
+    fi
+
    # Add amdgpu repository
    UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
    echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
@ -112,8 +113,6 @@ EOF
        rm -rf HIP clr
    fi

-    pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"
-
    # Cleanup
    apt-get autoclean && apt-get clean
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
@ -177,8 +176,6 @@ install_centos() {
      sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
  done

-  pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"
-
  # Cleanup
  yum clean all
  rm -rf /var/cache/yum
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@ -12,8 +12,8 @@ function do_install() {

    rocm_version_nodot=${rocm_version//./}

-    # https://github.com/icl-utk-edu/magma/pull/65
-    MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+    # Version 2.7.2 + ROCm related updates
+    MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
    magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"

    rocm_dir="/opt/rocm"
--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@ -66,15 +66,15 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}"
  # Triton needs at least gcc-9 to build
  apt-get install -y g++-9

-  CXX=g++-9 conda_run python -m build --wheel --no-isolation
+  CXX=g++-9 conda_run python setup.py bdist_wheel
 elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then
  # Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
  add-apt-repository -y ppa:ubuntu-toolchain-r/test
  apt-get install -y g++-9

-  CXX=g++-9 conda_run python -m build --wheel --no-isolation
+  CXX=g++-9 conda_run python setup.py bdist_wheel
 else
-  conda_run python -m build --wheel --no-isolation
+  conda_run python setup.py bdist_wheel
 fi

 # Copy the wheel to /opt for multi stage docker builds
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -40,16 +40,12 @@ case ${DOCKER_TAG_PREFIX} in
        ;;
    rocm*)
        # we want the patch version of 6.4 instead
-        if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
+        if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
            GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
        fi
        BASE_TARGET=rocm
        GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-        # add gfx950 conditionally starting in ROCm 7.0
-        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
-        fi
        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
        ;;
    *)
--- a/.ci/docker/manywheel/Dockerfile_cxx11-abi
+++ b/.ci/docker/manywheel/Dockerfile_cxx11-abi
@ -0,0 +1,71 @@
+FROM centos:8 as base
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+ENV PATH /opt/rh/gcc-toolset-11/root/bin/:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+
+# change to a valid repo
+RUN sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-Linux-*.repo
+# enable to install ninja-build
+RUN sed -i 's|enabled=0|enabled=1|g' /etc/yum.repos.d/CentOS-Linux-PowerTools.repo
+
+RUN yum -y update
+RUN yum install -y wget curl perl util-linux xz bzip2 git patch which zlib-devel sudo
+RUN yum install -y autoconf automake make cmake gdb gcc-toolset-11-gcc-c++
+
+
+FROM base as openssl
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
+# Install python
+FROM base as python
+RUN yum install -y openssl-devel zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel
+ADD common/install_cpython.sh install_cpython.sh
+RUN bash ./install_cpython.sh && rm install_cpython.sh
+
+FROM base as conda
+ADD ./common/install_conda_docker.sh install_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh
+RUN /opt/conda/bin/conda install -y cmake
+
+FROM base as intel
+# Install MKL
+COPY --from=python             /opt/python                           /opt/python
+COPY --from=python             /opt/_internal                        /opt/_internal
+COPY --from=conda              /opt/conda                            /opt/conda
+ENV PATH=/opt/conda/bin:$PATH
+ADD ./common/install_mkl.sh install_mkl.sh
+RUN bash ./install_mkl.sh && rm install_mkl.sh
+
+FROM base as patchelf
+ADD ./common/install_patchelf.sh install_patchelf.sh
+RUN bash ./install_patchelf.sh && rm install_patchelf.sh
+RUN cp $(which patchelf) /patchelf
+
+FROM base as jni
+ADD ./common/install_jni.sh install_jni.sh
+ADD ./java/jni.h jni.h
+RUN bash ./install_jni.sh && rm install_jni.sh
+
+FROM base as libpng
+ADD ./common/install_libpng.sh install_libpng.sh
+RUN bash ./install_libpng.sh && rm install_libpng.sh
+
+FROM base as final
+COPY --from=openssl            /opt/openssl                          /opt/openssl
+COPY --from=python             /opt/python                           /opt/python
+COPY --from=python             /opt/_internal                        /opt/_internal
+COPY --from=intel              /opt/intel                            /opt/intel
+COPY --from=conda              /opt/conda                            /opt/conda
+COPY --from=patchelf           /usr/local/bin/patchelf               /usr/local/bin/patchelf
+COPY --from=jni                /usr/local/include/jni.h              /usr/local/include/jni.h
+COPY --from=libpng             /usr/local/bin/png*                   /usr/local/bin/
+COPY --from=libpng             /usr/local/bin/libpng*                /usr/local/bin/
+COPY --from=libpng             /usr/local/include/png*               /usr/local/include/
+COPY --from=libpng             /usr/local/include/libpng*            /usr/local/include/
+COPY --from=libpng             /usr/local/lib/libpng*                /usr/local/lib/
+COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/lib/pkgconfig
+
+RUN yum install -y ninja-build
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -43,6 +43,12 @@ case ${image} in
        MANY_LINUX_VERSION="2_28_aarch64"
        OPENBLAS_VERSION="v0.3.30"
        ;;
+    manylinuxcxx11-abi-builder:cpu-cxx11-abi)
+        TARGET=final
+        GPU_IMAGE=""
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
+        MANY_LINUX_VERSION="cxx11-abi"
+        ;;
    manylinuxs390x-builder:cpu-s390x)
        TARGET=final
        GPU_IMAGE=s390x/almalinux:8
@ -76,7 +82,7 @@ case ${image} in
        ;;
    manylinux2_28-builder:rocm*)
        # we want the patch version of 6.4 instead
-        if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
+        if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
            GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
        fi
        TARGET=rocm_final
@ -84,10 +90,6 @@ case ${image} in
        DEVTOOLSET_VERSION="11"
        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-        # add gfx950 conditionally starting in ROCm 7.0
-        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
-        fi
        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
        ;;
    manylinux2_28-builder:xpu)
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -10,11 +10,6 @@ boto3==1.35.42
 #Pinned versions: 1.19.12, 1.16.34
 #test that import:

-build==1.3.0
-#Description: A simple, correct Python build frontend.
-#Pinned versions: 1.3.0
-#test that import:
-
 click
 #Description: Command Line Interface Creation Kit
 #Pinned versions:
@ -98,9 +93,8 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
 #Pinned versions:
 #test that import:

-mypy==1.16.0 ; platform_system != "Windows"
+mypy==1.16.0
 # Pin MyPy version because new errors are likely to appear with each release
-# Skip on Windows as lots of type annotations are POSIX specific
 #Description: linter
 #Pinned versions: 1.16.0
 #test that import: test_typing.py, test_type_hints.py
@ -111,12 +105,14 @@ networkx==2.8.8
 #Pinned versions: 2.8.8
 #test that import: functorch

-ninja==1.11.1.4
+ninja==1.11.1.3
 #Description: build system. Used in some tests. Used in build to generate build
 #time tracing information
-#Pinned versions: 1.11.1.4
+#Pinned versions: 1.11.1.3
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py

+numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
+numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
 numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
 numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #Description: Just-In-Time Compiler for Numerical Functions
@ -137,7 +133,7 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
 #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
 #test_binary_ufuncs.py
-numpy==1.22.4; python_version == "3.10"
+numpy==1.22.4; python_version == "3.9" or python_version == "3.10"
 numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
 numpy==2.1.2; python_version >= "3.13"

@ -172,9 +168,9 @@ pillow==11.0.0
 #Pinned versions: 10.3.0
 #test that import:

-protobuf==5.29.5
+protobuf==5.29.4
 #Description:  Google's data interchange format
-#Pinned versions: 5.29.5
+#Pinned versions: 5.29.4
 #test that import: test_tensorboard.py, test/onnx/*

 psutil
@ -329,6 +325,8 @@ pywavelets==1.7.0 ; python_version >= "3.12"
 lxml==5.3.0
 #Description: This is a requirement of unittest-xml-reporting

+# Python-3.9 binaries
+
 PyGithub==2.3.0

 sympy==1.13.3
@ -378,7 +376,7 @@ dataclasses_json==0.6.7
 #Pinned versions: 0.6.7
 #test that import:

-cmake==3.31.6
+cmake==4.0.0
 #Description: required for building

 tlparse==0.4.0
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -1,15 +1,8 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2

-standard-imghdr==3.13.0; python_version >= "3.13"
-#Description: This is needed by Sphinx, so it needs to be added here.
-# The reasons are as follows:
-# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
-# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
-# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
-
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably
 # something related to Docker setup. We can investigate this later.
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@ -52,13 +52,9 @@ ENV INSTALLED_VISION ${VISION}

 # Install rocm
 ARG ROCM_VERSION
-RUN mkdir ci_commit_pins
-COPY ./common/common_utils.sh common_utils.sh
-COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt
 COPY ./common/install_rocm.sh install_rocm.sh
 RUN bash ./install_rocm.sh
-RUN rm install_rocm.sh common_utils.sh
-RUN rm -r ci_commit_pins
+RUN rm install_rocm.sh
 COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
 RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
 RUN rm install_rocm_magma.sh
--- a/.ci/libtorch/build.sh
+++ b/.ci/libtorch/build.sh
@ -7,4 +7,4 @@ set -ex

 SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

-USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh
+USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
--- a/.ci/lumen_cli/cli/lib/core/vllm/lib.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py
@ -41,6 +41,7 @@ def sample_vllm_test_library():
                "pytest -v -s basic_correctness/test_cumem.py",
                "pytest -v -s basic_correctness/test_basic_correctness.py",
                "pytest -v -s basic_correctness/test_cpu_offload.py",
+                "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py",
            ],
        },
        "vllm_basic_models_test": {
@ -67,12 +68,15 @@ def sample_vllm_test_library():
                        "-v",
                        "-s",
                        "entrypoints/llm",
+                        "--ignore=entrypoints/llm/test_lazy_outlines.py",
                        "--ignore=entrypoints/llm/test_generate.py",
+                        "--ignore=entrypoints/llm/test_generate_multiple_loras.py",
                        "--ignore=entrypoints/llm/test_collective_rpc.py",
                    ]
                ),
-                "pytest -v -s entrypoints/llm/test_generate.py",
-                "pytest -v -s entrypoints/offline_mode",
+                "pytest -v -s entrypoints/llm/test_lazy_outlines.py",
+                "pytest -v -s entrypoints/llm/test_generate.py ",
+                "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
            ],
        },
        "vllm_regression_test": {
--- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py
@ -66,11 +66,6 @@ class VllmBuildParameters:
        "DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm"
    )

-    # the cleaning script to remove torch dependencies from pip
-    cleaning_script: Path = env_path_field(
-        "cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
-    )
-
    # OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
    output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm")

@ -165,7 +160,6 @@ class VllmBuildRunner(BaseRunner):
        logger.info("Running vllm build with inputs: %s", inputs)
        vllm_commit = clone_vllm()

-        self.cp_torch_cleaning_script(inputs)
        self.cp_dockerfile_if_exist(inputs)
        # cp torch wheels from root direct to vllm workspace if exist
        self.cp_torch_whls_if_exist(inputs)
@ -211,11 +205,6 @@ class VllmBuildRunner(BaseRunner):
        copy(inputs.torch_whls_path, tmp_dir)
        return tmp_dir

-    def cp_torch_cleaning_script(self, inputs: VllmBuildParameters):
-        script = get_path(inputs.cleaning_script, resolve=True)
-        vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py")
-        copy(script, vllm_script)
-
    def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters):
        if not inputs.use_local_dockerfile:
            logger.info("using vllm default dockerfile.torch_nightly for build")
--- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py
@ -11,7 +11,7 @@ from typing import Any

 from cli.lib.common.cli_helper import BaseRunner
 from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env
-from cli.lib.common.path_helper import copy, get_path, remove_dir
+from cli.lib.common.path_helper import copy, remove_dir
 from cli.lib.common.pip_helper import (
    pip_install_first_match,
    pip_install_packages,
@ -43,10 +43,6 @@ class VllmTestParameters:

    torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")

-    cleaning_script: Path = env_path_field(
-        "cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
-    )
-
    def __post_init__(self):
        if not self.torch_whls_path.exists():
            raise ValueError("missing torch_whls_path")
@ -96,13 +92,11 @@ class VllmTestRunner(BaseRunner):
        self._set_envs(params)

        clone_vllm(dst=self.work_directory)
-        self.cp_torch_cleaning_script(params)
        with working_directory(self.work_directory):
            remove_dir(Path("vllm"))
            self._install_wheels(params)
            self._install_dependencies()
        # verify the torches are not overridden by test dependencies
-
        check_versions()

    def run(self):
@ -131,11 +125,6 @@ class VllmTestRunner(BaseRunner):
            # double check the torches are not overridden by other packages
            check_versions()

-    def cp_torch_cleaning_script(self, params: VllmTestParameters):
-        script = get_path(params.cleaning_script, resolve=True)
-        vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py")
-        copy(script, vllm_script)
-
    def _install_wheels(self, params: VllmTestParameters):
        logger.info("Running vllm test with inputs: %s", params)
        if not pkg_exists("torch"):
--- a/.ci/magma-rocm/Makefile
+++ b/.ci/magma-rocm/Makefile
@ -1,11 +1,11 @@
 SHELL=/usr/bin/env bash

 DOCKER_CMD ?= docker
-DESIRED_ROCM ?= 7.0
+DESIRED_ROCM ?= 6.4
 DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
 PACKAGE_NAME = magma-rocm
 # inherit this from underlying docker image, do not pass this env var to docker
-#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
+#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201

 DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
@ -16,7 +16,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	magma-rocm/build_magma.sh

 .PHONY: all
-all: magma-rocm70
 all: magma-rocm64
 all: magma-rocm63

@ -25,11 +24,6 @@ clean:
 	$(RM) -r magma-*
 	$(RM) -r output

-.PHONY: magma-rocm70
-magma-rocm70: DESIRED_ROCM := 7.0
-magma-rocm70:
-	$(DOCKER_RUN)
-
 .PHONY: magma-rocm64
 magma-rocm64: DESIRED_ROCM := 6.4
 magma-rocm64:
--- a/.ci/magma-rocm/build_magma.sh
+++ b/.ci/magma-rocm/build_magma.sh
@ -6,8 +6,8 @@ set -eou pipefail
 # The script expects DESIRED_CUDA and PACKAGE_NAME to be set
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

-# https://github.com/icl-utk-edu/magma/pull/65
-MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+# Version 2.7.2 + ROCm related updates
+MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6

 # Folders for the build
 PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE

 # Fetch magma sources and verify checksum
 pushd ${PACKAGE_DIR}
-git clone https://github.com/jeffdaily/magma
+git clone https://bitbucket.org/icl/magma.git
 pushd magma
 git checkout ${MAGMA_VERSION}
 popd
--- a/.ci/manywheel/build_common.sh
+++ b/.ci/manywheel/build_common.sh
@ -142,7 +142,7 @@ time CMAKE_ARGS=${CMAKE_ARGS[@]} \
    EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
    BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
    USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
-    python -m build --wheel --no-isolation --outdir /tmp/$WHEELHOUSE_DIR
+    python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
 echo "Finished setup.py bdist at $(date)"

 # Build libtorch packages
--- a/.ci/manywheel/build_libtorch.sh
+++ b/.ci/manywheel/build_libtorch.sh
@ -104,7 +104,7 @@ if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
    export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr
 fi

-echo "Calling -m pip install . -v --no-build-isolation at $(date)"
+echo "Calling 'python -m pip install .' at $(date)"

 if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
    STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -290,13 +290,13 @@ else

      WERROR=1 python setup.py clean

-      WERROR=1 python -m build --wheel --no-isolation
+      WERROR=1 python setup.py bdist_wheel
    else
      python setup.py clean
      if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
        source .ci/pytorch/install_cache_xla.sh
      fi
-      python -m build --wheel --no-isolation
+      python setup.py bdist_wheel
    fi
    pip_install_whl "$(echo dist/*.whl)"

--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -258,19 +258,11 @@ function install_torchrec_and_fbgemm() {
      git clone --recursive https://github.com/pytorch/fbgemm
      pushd fbgemm/fbgemm_gpu
      git checkout "${fbgemm_commit}" --recurse-submodules
-      # until the fbgemm_commit includes the tbb patch
-      patch <<'EOF'
--- a/FbgemmGpu.cmake
-+++ b/FbgemmGpu.cmake
-@@ -184,5 +184,6 @@ gpu_cpp_library(
-     fbgemm_gpu_tbe_cache
-     fbgemm_gpu_tbe_optimizers
-     fbgemm_gpu_tbe_utils
-+    tbb
-   DESTINATION
-     fbgemm_gpu)
-EOF
-      python setup.py bdist_wheel --build-variant=rocm
+      python setup.py bdist_wheel \
+        --build-variant=rocm \
+        -DHIP_ROOT_DIR="${ROCM_PATH}" \
+        -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
+        -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
      popd

      # Save the wheel before cleaning up
@ -292,7 +284,7 @@ EOF

 function clone_pytorch_xla() {
  if [[ ! -d ./xla ]]; then
-    git clone --recursive --quiet https://github.com/pytorch/xla.git
+    git clone --recursive -b r2.9 https://github.com/pytorch/xla.git
    pushd xla
    # pin the xla hash so that we don't get broken by changes to xla
    git checkout "$(cat ../.github/ci_commit_pins/xla.txt)"
--- a/.ci/pytorch/cpp_doc_push_script.sh
+++ b/.ci/pytorch/cpp_doc_push_script.sh
@ -58,7 +58,7 @@ time python tools/setup_helpers/generate_code.py \

 # Build the docs
 pushd docs/cpp
-time make VERBOSE=1 html
+time make VERBOSE=1 html -j

 popd
 popd
--- a/.ci/pytorch/functorch_doc_push_script.sh
+++ b/.ci/pytorch/functorch_doc_push_script.sh
@ -0,0 +1,40 @@
+#!/bin/bash
+
+# This is where the local pytorch install in the docker image is located
+pt_checkout="/var/lib/jenkins/workspace"
+source "$pt_checkout/.ci/pytorch/common_utils.sh"
+echo "functorch_doc_push_script.sh: Invoked with $*"
+
+set -ex -o pipefail
+
+version=${DOCS_VERSION:-nightly}
+echo "version: $version"
+
+# Build functorch docs
+pushd $pt_checkout/functorch/docs
+make html
+popd
+
+git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages
+pushd functorch_ghpages
+
+if [ "$version" == "main" ]; then
+  version=nightly
+fi
+
+git rm -rf "$version" || true
+mv "$pt_checkout/functorch/docs/build/html" "$version"
+
+git add "$version" || true
+git status
+git config user.email "soumith+bot@pytorch.org"
+git config user.name "pytorchbot"
+# If there aren't changes, don't make a commit; push is no-op
+git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true
+git status
+
+if [[ "${WITH_PUSH:-}" == true ]]; then
+  git push -u origin gh-pages
+fi
+
+popd
--- a/.ci/pytorch/macos-build.sh
+++ b/.ci/pytorch/macos-build.sh
@ -35,12 +35,11 @@ fi

 print_cmake_info
 if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
-  # Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls
-  USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python -m build --wheel --no-isolation
+  USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
 else
-  # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
-  # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
-  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python -m build --wheel --no-isolation -C--build-option=--plat-name=macosx_11_0_arm64
+  # NB: we always build with distributed; USE_DISTRIBUTED turns off all
+  # backends (specifically the gloo backend), so test that this case works too
+  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
 fi
 if which sccache > /dev/null; then
  print_sccache_stats
--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -13,9 +13,13 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
 fi
 popd

+python -mpip install -r requirements.txt
+
 # enable debug asserts in serialization
 export TORCH_SERIALIZATION_DEBUG=1

+python -mpip install --no-input -r requirements.txt
+
 setup_test_python() {
  # The CircleCI worker hostname doesn't resolve to an address.
  # This environment variable makes ProcessGroupGloo default to
@ -55,7 +59,7 @@ test_python_shard() {

  setup_test_python

-  time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS"
+  time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS"

  assert_git_not_dirty
 }
--- a/.ci/pytorch/multigpu-test.sh
+++ b/.ci/pytorch/multigpu-test.sh
@ -26,7 +26,6 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then
    time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
    time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
    time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering
-    time python test/run_test.py --verbose -i distributed/test_aten_comm_compute_reordering
    time python test/run_test.py --verbose -i distributed/test_store
    time python test/run_test.py --verbose -i distributed/test_symmetric_memory
    time python test/run_test.py --verbose -i distributed/test_pg_wrapper
--- a/.ci/pytorch/numba-cuda-13.patch
+++ b/.ci/pytorch/numba-cuda-13.patch
@ -1,25 +0,0 @@
-From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001
-From: Michael Wang <13521008+isVoid@users.noreply.github.com>
-Date: Tue, 1 Apr 2025 17:28:05 -0700
-Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage
- (#185)
-
-Co-authored-by: isVoid <isVoid@users.noreply.github.com>
---
- numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++
- 1 file changed, 3 insertions(+)
-
-diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
-index 1641bf77..233e9ed7 100644
--- a/numba_cuda/numba/cuda/cudadrv/driver.py
-+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
-@@ -365,6 +365,9 @@ def _find_api(self, fname):
-         else:
-             variants = ('_v2', '')
- 
-+        if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
-+            return getattr(self.lib, fname)
-+
-         for variant in variants:
-             try:
-                 return getattr(self.lib, f'{fname}{variant}')
--- a/.ci/pytorch/smoke_test/smoke_test.py
+++ b/.ci/pytorch/smoke_test/smoke_test.py
@ -386,8 +386,8 @@ def smoke_test_compile(device: str = "cpu") -> None:


 def smoke_test_nvshmem() -> None:
-    if not torch.cuda.is_available() or target_os == "windows":
-        print("Windows platform or CUDA is not available, skipping NVSHMEM test")
+    if not torch.cuda.is_available():
+        print("CUDA is not available, skipping NVSHMEM test")
        return

    # Check if NVSHMEM is compiled in current build
@ -396,9 +396,7 @@ def smoke_test_nvshmem() -> None:
    except ImportError:
        # Not built with NVSHMEM support.
        # torch is not compiled with NVSHMEM prior to 2.9
-        from torch.torch_version import TorchVersion
-
-        if TorchVersion(torch.__version__) < (2, 9):
+        if torch.__version__ < "2.9":
            return
        else:
            # After 2.9: NVSHMEM is expected to be compiled in current build
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -32,16 +32,6 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v
  git config --global --add safe.directory /var/lib/jenkins/workspace
 fi

-
-# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
-NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
-if [ -n "$NUMBA_CUDA_DIR" ]; then
-  NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
-  pushd "$NUMBA_CUDA_DIR"
-  patch -p4 <"$NUMBA_PATCH"
-  popd
-fi
-
 echo "Environment variables:"
 env

@ -322,29 +312,23 @@ test_python_shard() {

  # modify LD_LIBRARY_PATH to ensure it has the conda env.
  # This set of tests has been shown to be buggy without it for the split-build
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running

  assert_git_not_dirty
 }

 test_python() {
  # shellcheck disable=SC2086
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
  assert_git_not_dirty
 }

 test_python_smoke() {
-  # Smoke tests for H100/B200
+  # Smoke tests for H100
  time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
 }

-test_python_smoke_b200() {
-  # Targeted smoke tests for B200 - staged approach to avoid too many failures
-  time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
-  assert_git_not_dirty
-}
-
 test_h100_distributed() {
  # Distributed tests at H100
  time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py  $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
@ -390,7 +374,6 @@ test_dynamo_wrapped_shard() {
    --exclude-distributed-tests \
    --exclude-torch-export-tests \
    --exclude-aot-dispatch-tests \
-    --exclude-quantization-tests \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose \
    --upload-artifacts-while-running
@ -435,7 +418,7 @@ test_inductor_distributed() {

  # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
  # with if required # gpus aren't available
-  python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_aten_comm_compute_reordering distributed/test_compute_comm_reordering --verbose
+  python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_compute_comm_reordering --verbose
  assert_git_not_dirty
 }

@ -1163,12 +1146,6 @@ test_distributed() {
  fi
 }

-test_quantization() {
-  echo "Testing quantization"
-
-  python test/test_quantization.py
-}
-
 test_rpc() {
  echo "Testing RPC C++ tests"
  # NB: the ending test_rpc must match the current function name for the current
@ -1415,7 +1392,7 @@ EOF
  pip3 install -r requirements.txt
  # shellcheck source=./common-build.sh
  source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
-  python -m build --wheel --no-isolation -C--build-option=--bdist-dir="base_bdist_tmp" --outdir "base_dist"
+  python setup.py bdist_wheel --bdist-dir="base_bdist_tmp" --dist-dir="base_dist"
  python -mpip install base_dist/*.whl
  echo "::endgroup::"

@ -1563,10 +1540,14 @@ test_executorch() {
  install_torchvision
  install_torchaudio

-  INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh"
-
  pushd /executorch
-  "${INSTALL_SCRIPT}" setup_executorch
+
+  export PYTHON_EXECUTABLE=python
+  export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
+
+  # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
+  # from the PR
+  bash .ci/scripts/setup-linux.sh --build-tool cmake

  echo "Run ExecuTorch unit tests"
  pytest -v -n auto
@ -1580,14 +1561,17 @@ test_executorch() {

  popd

+  # Test torchgen generated code for Executorch.
+  echo "Testing ExecuTorch op registration"
+  "$BUILD_BIN_DIR"/test_edge_op_registration
+
  assert_git_not_dirty
 }

 test_linux_aarch64() {
  python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
        test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
-        test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \
-        distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
+        test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
        --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose

  # Dynamo tests
@ -1617,7 +1601,7 @@ test_operator_benchmark() {
  test_inductor_set_cpu_affinity

  cd benchmarks/operator_benchmark/pt_extension
-  python -m pip install . -v --no-build-isolation
+  python -m pip install .

  cd "${TEST_DIR}"/benchmarks/operator_benchmark
  $TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \
@ -1630,25 +1614,6 @@ test_operator_benchmark() {
      --expected "expected_ci_operator_benchmark_eager_float32_cpu.csv"
 }

-test_operator_microbenchmark() {
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  mkdir -p "$TEST_REPORTS_DIR"
-  TEST_DIR=$(pwd)
-
-  cd benchmarks/operator_benchmark/pt_extension
-  python -m pip install .
-
-  cd "${TEST_DIR}"/benchmarks/operator_benchmark
-
-  for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do
-    $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
-      --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
-      --benchmark-name "PyTorch operator microbenchmark" --use-compile
-    $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
-      --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}.json" \
-      --benchmark-name "PyTorch operator microbenchmark"
-  done
-}

 if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
  (cd test && python -c "import torch; print(torch.__config__.show())")
@ -1681,8 +1646,6 @@ elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
  test_executorch
 elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
  test_python_legacy_jit
-elif [[ "$TEST_CONFIG" == 'quantization' ]]; then
-  test_quantization
 elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
  # TODO: run some C++ tests
  echo "no-op at the moment"
@ -1705,8 +1668,6 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then
    test_operator_benchmark cpu ${TEST_MODE}

  fi
-elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then
-  test_operator_microbenchmark
 elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
 elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
@ -1760,6 +1721,11 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
 elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
  install_torchvision
  test_inductor_shard "${SHARD_NUMBER}"
+  if [[ "${SHARD_NUMBER}" == 1 ]]; then
+    if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then
+      test_inductor_distributed
+    fi
+  fi
 elif [[ "${TEST_CONFIG}" == *einops* ]]; then
  test_einops
 elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
@ -1809,14 +1775,10 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
  test_xpu_bin
 elif [[ "${TEST_CONFIG}" == smoke ]]; then
  test_python_smoke
-elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then
-  test_python_smoke_b200
 elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
  test_h100_distributed
 elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
  test_h100_symm_mem
-elif [[ "${TEST_CONFIG}" == "b200-symm-mem" ]]; then
-  test_h100_symm_mem
 elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then
  test_h100_cutlass_backend
 else
--- a/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1
+++ b/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1
@ -70,7 +70,7 @@ sccache --zero-stats
 sccache --show-stats

 # Build the wheel
-python -m build --wheel --no-build-isolation
+python setup.py bdist_wheel
 if ($LASTEXITCODE -ne 0) { exit 1 }

 # Install the wheel locally
--- a/.ci/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat
@ -130,14 +130,14 @@ if "%USE_CUDA%"=="1" (
 :: Print all existing environment variable for debugging
 set

-python -m build --wheel --no-isolation
+python setup.py bdist_wheel
 if errorlevel 1 goto fail
 if not errorlevel 0 goto fail
 sccache --show-stats
 python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])"
 (
  if "%BUILD_ENVIRONMENT%"=="" (
-    echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash.
+    echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash.
  ) else (
    copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%"

--- a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
+++ b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" (
 ) else (
  set CONDA_PARENT_DIR=C:\Jenkins
 )
-set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3
+

 :: Be conservative here when rolling out the new AMI with conda. This will try
 :: to install conda as before if it couldn't find the conda installation. This
 :: can be removed eventually after we gain enough confidence in the AMI
-if not exist %CONDA_ROOT_DIR% (
+if not exist %CONDA_PARENT_DIR%\Miniconda3 (
  set INSTALL_FRESH_CONDA=1
 )

@ -17,14 +17,10 @@ if "%INSTALL_FRESH_CONDA%"=="1" (
  if errorlevel 1 exit /b
  if not errorlevel 0 exit /b

-  %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR%
+  %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
  if errorlevel 1 exit /b
  if not errorlevel 0 exit /b
 )

 :: Activate conda so that we can use its commands, i.e. conda, python, pip
-call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%
-:: Activate conda so that we can use its commands, i.e. conda, python, pip
-call conda activate py_tmp
-
-call pip install -r .ci/docker/requirements-ci.txt
+call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3
--- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
+++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
@ -14,7 +14,7 @@ if not errorlevel 0 exit /b
 :: build\torch. Rather than changing all these references, making a copy of torch folder
 :: from conda to the current workspace is easier. The workspace will be cleaned up after
 :: the job anyway
-xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
+xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\

 pushd .
 if "%VC_VERSION%" == "" (
--- a/.ci/pytorch/win-test-helpers/test_python_shard.bat
+++ b/.ci/pytorch/win-test-helpers/test_python_shard.bat
@ -25,7 +25,7 @@ echo Copying over test times file
 robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files"

 echo Run nn tests
-python run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
+python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
 if ERRORLEVEL 1 goto fail

 popd
--- a/.ci/pytorch/win-test.sh
+++ b/.ci/pytorch/win-test.sh
@ -38,14 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
 fi

 # TODO: Move both of them to Windows AMI
-python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
-
-# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments
-# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node'
-# scipy from 1.6.3 to 1.10
-# expecttest from 0.1.3 to 0.3.0
-# xdoctest from 1.0.2 to 1.3.0
-python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42"
+python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1

 # Install Z3 optional dependency for Windows builds.
 python -m pip install z3-solver==4.15.1.0
@ -59,6 +52,9 @@ python -m pip install parameterized==0.8.1
 # Install pulp for testing ilps under torch\distributed\_tools
 python -m pip install pulp==2.9.0

+# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308
+python -m pip install expecttest==0.3.0
+
 run_tests() {
    # Run nvidia-smi if available
    for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
--- a/.ci/pytorch/windows/arm64/build_pytorch.bat
+++ b/.ci/pytorch/windows/arm64/build_pytorch.bat
@ -48,7 +48,7 @@ sccache --zero-stats
 sccache --show-stats

 :: Call PyTorch build script
-python -m build --wheel --no-isolation --outdir "%PYTORCH_FINAL_PACKAGE_DIR%"
+python setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%"

 :: show sccache stats
 sccache --show-stats
--- a/.ci/pytorch/windows/internal/install_python.bat
+++ b/.ci/pytorch/windows/internal/install_python.bat
@ -28,5 +28,5 @@ start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_t
 if errorlevel 1 exit /b 1

 set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%"
-%PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel build
+%PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel
 if errorlevel 1 exit /b 1
--- a/.ci/pytorch/windows/internal/setup.bat
+++ b/.ci/pytorch/windows/internal/setup.bat
@ -86,7 +86,7 @@ copy /Y "%LIBTORCH_PREFIX%-%PYTORCH_BUILD_VERSION%.zip" "%PYTORCH_FINAL_PACKAGE_
 goto build_end

 :pytorch
-%PYTHON_EXEC% -m build --wheel --no-isolation --outdir "%PYTORCH_FINAL_PACKAGE_DIR%"
+%PYTHON_EXEC% setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%"

 :build_end
 IF ERRORLEVEL 1 exit /b 1
--- a/.ci/pytorch/windows/internal/static_lib_test.bat
+++ b/.ci/pytorch/windows/internal/static_lib_test.bat
@ -63,7 +63,7 @@ if errorlevel 1 exit /b 1
 call %CONDA_HOME%\condabin\activate.bat testenv
 if errorlevel 1 exit /b 1

-call conda install  -y -q -c conda-forge libuv=1.51
+call conda install  -y -q -c conda-forge libuv=1.39
 call conda install -y -q intel-openmp

 echo "install and test libtorch"
--- a/.ci/pytorch/windows/setup_build.bat
+++ b/.ci/pytorch/windows/setup_build.bat
@ -18,7 +18,7 @@ if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake

 %PYTHON_EXEC% -m pip install pyyaml
 %PYTHON_EXEC% -m pip install mkl-include mkl-static
-%PYTHON_EXEC% -m pip install boto3 requests ninja typing_extensions setuptools==72.1.0
+%PYTHON_EXEC% -m pip install boto3 ninja typing_extensions setuptools==72.1.0

 where cmake.exe

--- a/.ci/wheel/build_wheel.sh
+++ b/.ci/wheel/build_wheel.sh
@ -85,7 +85,7 @@ mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
 # Create an isolated directory to store this builds pytorch checkout and conda
 # installation
 if [[ -z "$MAC_PACKAGE_WORK_DIR" ]]; then
-    MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_${DESIRED_PYTHON}_$(date +%H%M%S)"
+    MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_conda_${DESIRED_PYTHON}_$(date +%H%M%S)"
 fi
 mkdir -p "$MAC_PACKAGE_WORK_DIR" || true
 if [[ -n ${GITHUB_ACTIONS} ]]; then
@ -96,11 +96,11 @@ fi
 whl_tmp_dir="${MAC_PACKAGE_WORK_DIR}/dist"
 mkdir -p "$whl_tmp_dir"

-mac_version='macosx-11_0-arm64'
+mac_version='macosx_11_0_arm64'
 libtorch_arch='arm64'

 # Create a consistent wheel package name to rename the wheel to
-wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version//[-,]/_}.whl"
+wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version}.whl"

 ###########################################################

@ -125,6 +125,7 @@ popd
 export TH_BINARY_BUILD=1
 export INSTALL_TEST=0 # dont install test binaries into site-packages
 export MACOSX_DEPLOYMENT_TARGET=11.0
+export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}

 EXTRA_CONDA_INSTALL_FLAGS=""
 CONDA_ENV_CREATE_FLAGS=""
@ -132,20 +133,25 @@ RENAME_WHEEL=true
 case $desired_python in
    3.14t)
        echo "Using 3.14 deps"
-        mac_version='macosx-11.0-arm64'
        NUMPY_PINNED_VERSION="==2.1.0"
+        CONDA_ENV_CREATE_FLAGS="python-freethreading"
+        EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+        desired_python="3.14.0rc1"
        RENAME_WHEEL=false
        ;;
    3.14)
        echo "Using 3.14t deps"
-        mac_version='macosx-11.0-arm64'
        NUMPY_PINNED_VERSION="==2.1.0"
+        EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+        desired_python="3.14.0rc1"
        RENAME_WHEEL=false
        ;;
    3.13t)
-        echo "Using 3.13t deps"
-        mac_version='macosx-11.0-arm64'
+        echo "Using 3.13 deps"
        NUMPY_PINNED_VERSION="==2.1.0"
+        CONDA_ENV_CREATE_FLAGS="python-freethreading"
+        EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
+        desired_python="3.13"
        RENAME_WHEEL=false
        ;;
    3.13)
@ -170,15 +176,21 @@ case $desired_python in
        ;;
 esac

+# Install into a fresh env
+tmp_env_name="wheel_py$python_nodot"
+conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS}
+source activate "$tmp_env_name"
+
 PINNED_PACKAGES=(
    "numpy${NUMPY_PINNED_VERSION}"
 )
-python -mvenv ~/${desired_python}-build
-source ~/${desired_python}-build/bin/activate
-retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
+retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt"
+pip install requests ninja typing-extensions
+retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
 retry brew install libomp

-# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
+# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
+# is build as part of tensorpipe submodule
 export USE_DISTRIBUTED=1

 export USE_MKLDNN=OFF
@ -186,11 +198,11 @@ export USE_QNNPACK=OFF
 export BUILD_TEST=OFF

 pushd "$pytorch_rootdir"
-echo "Calling -m build --wheel --no-isolation at $(date)"
+echo "Calling setup.py bdist_wheel at $(date)"

-_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python -m build --wheel --no-isolation --outdir "$whl_tmp_dir" -C--plat-name="${mac_version//[-.]/_}"
+python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name ${mac_version}

-echo "Finished -m build --wheel --no-isolation at $(date)"
+echo "Finished setup.py bdist_wheel at $(date)"

 if [[ $package_type != 'libtorch' ]]; then
    echo "delocating wheel dependencies"
--- a/.circleci/scripts/functorch_doc_push_script.sh
+++ b/.circleci/scripts/functorch_doc_push_script.sh
@ -0,0 +1,47 @@
+#!/bin/bash
+# =================== The following code **should** be executed inside Docker container ===================
+
+# Install dependencies
+sudo apt-get -y update
+sudo apt-get -y install expect-dev
+
+# This is where the local pytorch install in the docker image is located
+pt_checkout="/var/lib/jenkins/workspace"
+source "$pt_checkout/.ci/pytorch/common_utils.sh"
+echo "functorch_doc_push_script.sh: Invoked with $*"
+
+set -ex
+
+version=${DOCS_VERSION:-nightly}
+echo "version: $version"
+
+# Build functorch docs
+pushd $pt_checkout/functorch/docs
+pip -q install -r requirements.txt
+make html
+popd
+
+git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages
+pushd functorch_ghpages
+
+if [ $version == "main" ]; then
+  version=nightly
+fi
+
+git rm -rf "$version" || true
+mv "$pt_checkout/functorch/docs/build/html" "$version"
+
+git add "$version" || true
+git status
+git config user.email "soumith+bot@pytorch.org"
+git config user.name "pytorchbot"
+# If there aren't changes, don't make a commit; push is no-op
+git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true
+git status
+
+if [[ "${WITH_PUSH:-}" == true ]]; then
+  git push -u origin gh-pages
+fi
+
+popd
+# =================== The above code **should** be executed inside Docker container ===================
--- a/.clang-tidy
+++ b/.clang-tidy
@ -69,8 +69,6 @@ readability-string-compare,
 '
 HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
 WarningsAsErrors: '*'
-LineFilter:
-  - name: '/usr/include/.*'
 CheckOptions:
  cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true
  cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true
--- a/.flake8
+++ b/.flake8
@ -73,7 +73,7 @@ exclude =
    ./docs/src,
    ./functorch/docs,
    ./functorch/examples,
-    ./functorch/docs/source/tutorials,
+    ./functorch/notebooks,
    ./scripts,
    ./test/generated_type_hints_smoketest.py,
    ./third_party,
--- a/.github/ISSUE_TEMPLATE/ci-sev.md
+++ b/.github/ISSUE_TEMPLATE/ci-sev.md
@ -1,10 +1,6 @@
 ---
 name: "⚠️ CI SEV"
 about: Tracking incidents for PyTorch's CI infra.
-title: ''
-labels: ''
-assignees: ''
-
 ---

 > NOTE: Remember to label this issue with "`ci: sev`"
--- a/.github/ISSUE_TEMPLATE/disable-autorevert.md
+++ b/.github/ISSUE_TEMPLATE/disable-autorevert.md
@ -1,18 +0,0 @@
---
-name: DISABLE AUTOREVERT
-about: Disables autorevert when open
-title: "❌\U0001F519 [DISABLE AUTOREVERT]"
-labels: 'ci: disable-autorevert'
-assignees: ''
-
---
-
-This issue, while open, disables the autorevert functionality.
-
-More details can be found [here](https://github.com/pytorch/test-infra/blob/main/aws/lambda/pytorch-auto-revert/README.md)
-
-
-## Why are you disabling autorevert?
-
-
-## Links to any issues/commits/errors that shows the source of problem
--- a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md
+++ b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md
@ -1,10 +1,8 @@
 ---
 name: Disable CI jobs (PyTorch Dev Infra only)
 about: Use this template to disable CI jobs
-title: DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]
-labels: 'module: ci'
-assignees: ''
-
+title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]"
+labels: "module: ci"
 ---

 > For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@ -21,10 +21,6 @@ self-hosted-runner:
    - linux.arm64.2xlarge.ephemeral
    - linux.arm64.m7g.4xlarge
    - linux.arm64.m7g.4xlarge.ephemeral
-    - linux.arm64.r7g.12xlarge.memory
-    - linux.aws.h100
-    - linux.aws.h100.4
-    - linux.aws.h100.8
    - linux.4xlarge.nvidia.gpu
    - linux.8xlarge.nvidia.gpu
    - linux.16xlarge.nvidia.gpu
--- a/.github/actions/reuse-old-whl/reuse_old_whl.py
+++ b/.github/actions/reuse-old-whl/reuse_old_whl.py
@ -264,7 +264,7 @@ def unzip_artifact_and_replace_files() -> None:
        change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py")

        for file in Path(f"artifacts/dist/{old_stem}").glob(
-            "*.dist-info/*",
+            "*.dist-info/**",
        ):
            change_content_to_new_version(file)

--- a/.github/actions/setup-win/action.yml
+++ b/.github/actions/setup-win/action.yml
@ -6,12 +6,6 @@ inputs:
  cuda-version:
    description: which cuda version to install, 'cpu' for none
    required: true
-  python-version:
-    required: false
-    type: string
-    default: "3.10"
-    description: |
-      The python version to be used. Will be 3.10 by default

 runs:
  using: composite
@ -44,24 +38,18 @@ runs:
        CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat"

        {
-          echo "CONDA=${CONDA}";
          echo "CONDA_RUN=${CONDA} run --no-capture-output";
          echo "CONDA_BUILD=${CONDA} run conda-build";
          echo "CONDA_INSTALL=${CONDA} install";
        } >> "${GITHUB_ENV}"

    - name: Setup Python3
-      env:
-          PYTHON_VERSION: ${{ inputs.python-version }}
      shell: bash
      run: |
        set +e
        set -x

-        # Create new py_tmp env with python-version
-        ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv
-
-        PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
+        PYTHON3=$(${CONDA_RUN} which python3)
        EXIT_CODE=$?

        if [[ "${EXIT_CODE}" == "0" ]]; then
@ -74,7 +62,7 @@ runs:
          # installation, which is Python 3 based. Its Python is default to Python 3. Further, there
          # is also the Miniconda installation that is Python 2 based, and both can be installed if
          # needed. In both cases, Python binary is just called python
-          PYTHON=$(${CONDA_RUN} -n py_tmp which python)
+          PYTHON=$(${CONDA_RUN} which python)
          EXIT_CODE=$?

          if [[ "${EXIT_CODE}" == "0" ]]; then
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-87ff22e49ed0e92576c4935ccb8c143daac4a3cd
+27fc2493d383354a008106f22f3be232badee9a1
--- a/.github/ci_commit_pins/fbgemm_rocm.txt
+++ b/.github/ci_commit_pins/fbgemm_rocm.txt
@ -1 +1 @@
-08ae0af1395c8d8471f4025deb6af9aef90b342f
+7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8
--- a/.github/ci_commit_pins/vllm.txt
+++ b/.github/ci_commit_pins/vllm.txt
@ -1 +1 @@
-78a47f87ce259a48f0391fa9ae15add05ea7432b
+e10fef08838612b4560e9c72e5cb1414a5edfa13
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-0fc62aa26a30ed7ca419d285f285cb5ba02c4394
+r2.9
--- a/.github/ci_configs/vllm/Dockerfile.tmp_vllm
+++ b/.github/ci_configs/vllm/Dockerfile.tmp_vllm
@ -82,10 +82,16 @@ RUN if command -v apt-get >/dev/null; then \
        apt-get update -y \
        && apt-get install -y ccache software-properties-common git curl wget sudo vim; \
    else \
-        dnf install -y git curl wget sudo; \
+        dnf install -y git curl wget sudo vim; \
    fi \
    && python3 --version && python3 -m pip --version

+# Workaround for https://github.com/openai/triton/issues/2507 and
+# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
+# this won't be needed for future versions of this docker image
+# or future versions of triton.
+RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
+
 # Install uv for faster pip installs if not existed
 RUN --mount=type=cache,target=/root/.cache/uv \
    if ! python3 -m uv --version >/dev/null 2>&1; then \
@ -214,16 +220,11 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" = "1" ]; then \
-        echo "Installing sccache..."; \
-        if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-            SCCACHE_ARCHIVE="sccache-v0.8.1-aarch64-unknown-linux-musl"; \
-        else \
-            SCCACHE_ARCHIVE="sccache-v0.8.1-x86_64-unknown-linux-musl"; \
-        fi; \
-        curl -L -o sccache.tar.gz "https://github.com/mozilla/sccache/releases/download/v0.8.1/${SCCACHE_ARCHIVE}.tar.gz" \
+        echo "Installing sccache..." \
+        && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
        && tar -xzf sccache.tar.gz \
-        && sudo mv "${SCCACHE_ARCHIVE}"/sccache /usr/bin/sccache \
-        && rm -rf sccache.tar.gz "${SCCACHE_ARCHIVE}" \
+        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
+        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
@ -284,7 +285,7 @@ RUN if command -v apt-get >/dev/null; then \
        && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
        && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
    else \
-        dnf install -y git curl wget sudo; \
+        dnf install -y git curl wget sudo vim; \
    fi \
    && python3 --version && python3 -m pip --version

@ -297,6 +298,12 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
    echo "[INFO] Showing torch_build_versions.txt content:" && \
    cat torch_build_versions.txt

+# Workaround for https://github.com/openai/triton/issues/2507 and
+# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
+# this won't be needed for future versions of this docker image
+# or future versions of triton.
+RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
+
 # Install uv for faster pip installs if not existed
 RUN --mount=type=cache,target=/root/.cache/uv \
    if ! python3 -m uv --version > /dev/null 2>&1; then \
--- a/.github/ci_configs/vllm/use_existing_torch.py
+++ b/.github/ci_configs/vllm/use_existing_torch.py
@ -1,17 +0,0 @@
-import glob
-
-
-requires_files = glob.glob("requirements/*.txt")
-requires_files += ["pyproject.toml"]
-for file in requires_files:
-    print(f">>> cleaning {file}")
-    with open(file) as f:
-        lines = f.readlines()
-    if "torch" in "".join(lines).lower():
-        print("removed:")
-        with open(file, "w") as f:
-            for line in lines:
-                if "torch" not in line.lower():
-                    f.write(line)
-    print(f"<<< done cleaning {file}")
-    print()
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -130,6 +130,3 @@
 - torch/csrc/inductor/aoti_include/**
 - torchgen/aoti/**
 - torchgen/gen_aoti_c_shim.py
-
-"ciflow/vllm":
- .github/ci_commit_pins/vllm.txt
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@ -525,21 +525,6 @@
  - Lint
  - pull

- name: typechecking
-  patterns:
-  - 'pyrefly.toml'
-  - 'mypy.ini'
-  - 'mypy-strict.ini'
-  approved_by:
-  - lolpack
-  - maggiemoss
-  - ndmitchell
-  - kinto0
-  mandatory_checks_name:
-  - EasyCLA
-  - Lint
-  - pull
-
 - name: superuser
  patterns:
  - '*'
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -1,44 +1,41 @@
 tracking_issue: 24422
 ciflow_tracking_issue: 64124
 ciflow_push_tags:
- ciflow/b200
- ciflow/b200-symm-mem
 - ciflow/binaries
 - ciflow/binaries_libtorch
 - ciflow/binaries_wheel
- ciflow/h100
- ciflow/h100-cutlass-backend
- ciflow/h100-distributed
- ciflow/h100-symm-mem
+- ciflow/triton_binaries
 - ciflow/inductor
- ciflow/inductor-cu126
- ciflow/inductor-micro-benchmark
- ciflow/inductor-micro-benchmark-cpu-x86
- ciflow/inductor-perf-compare
- ciflow/inductor-perf-test-nightly-rocm
- ciflow/inductor-perf-test-nightly-x86-zen
 - ciflow/inductor-periodic
 - ciflow/inductor-rocm
+- ciflow/inductor-perf-test-nightly-rocm
+- ciflow/inductor-perf-compare
+- ciflow/inductor-micro-benchmark
+- ciflow/inductor-micro-benchmark-cpu-x86
+- ciflow/inductor-perf-test-nightly-x86-zen
+- ciflow/inductor-cu126
 - ciflow/linux-aarch64
 - ciflow/mps
 - ciflow/nightly
- ciflow/op-benchmark
 - ciflow/periodic
 - ciflow/periodic-rocm-mi300
- ciflow/pull
- ciflow/quantization-periodic
- ciflow/riscv64
 - ciflow/rocm
 - ciflow/rocm-mi300
 - ciflow/s390
+- ciflow/riscv64
 - ciflow/slow
- ciflow/torchbench
- ciflow/triton_binaries
 - ciflow/trunk
 - ciflow/unstable
- ciflow/vllm
- ciflow/win-arm64
 - ciflow/xpu
+- ciflow/vllm
+- ciflow/torchbench
+- ciflow/op-benchmark
+- ciflow/pull
+- ciflow/h100
+- ciflow/h100-distributed
+- ciflow/win-arm64
+- ciflow/h100-symm-mem
+- ciflow/h100-cutlass-backend
 retryable_workflows:
 - pull
 - trunk
@ -47,4 +44,4 @@ retryable_workflows:
 - inductor-A100-perf-nightly
 labeler_config: labeler.yml
 label_to_label_config: label_to_label.yml
-mergebot: true
+mergebot: True
--- a/.github/requirements/pip-requirements-macOS.txt
+++ b/.github/requirements/pip-requirements-macOS.txt
@ -1,5 +1,4 @@
 boto3==1.35.42
-build==1.2.2.post1
 cmake==3.27.*
 expecttest==0.3.0
 fbscribelogger==0.1.7
@ -16,7 +15,7 @@ optree==0.13.0
 packaging==23.1
 parameterized==0.8.1
 pillow==10.3.0
-protobuf==5.29.5
+protobuf==5.29.4
 psutil==5.9.8
 pygments==2.15.0
 pytest-cpp==2.3.0
@ -27,7 +26,7 @@ pytest-xdist==3.3.1
 pytest==7.3.2
 pyyaml==6.0.2
 scipy==1.12.0
-setuptools==78.1.1
+setuptools==72.1.0
 sympy==1.13.3
 tlparse==0.4.0
 tensorboard==2.13.0
--- a/.github/scripts/docathon-label-sync.py
+++ b/.github/scripts/docathon-label-sync.py
@ -39,9 +39,7 @@ def main() -> None:
    pull_request_label_names = [label.name for label in pull_request_labels]
    issue_label_names = [label.name for label in issue_labels]
    labels_to_add = [
-        label
-        for label in issue_label_names
-        if label not in pull_request_label_names and label != "actionable"
+        label for label in issue_label_names if label not in pull_request_label_names
    ]
    if not labels_to_add:
        print("The pull request already has the same labels.")
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@ -41,9 +41,9 @@ SUPPORTED_PERIODICAL_MODES: dict[str, Callable[[Optional[str]], bool]] = {
 }

 # The link to the published list of disabled jobs
-DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json"
+DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=hjktHz2WOejHpxKpkqpDknTt5rMTM9KK"
 # and unstable jobs
-UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json"
+UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=wrjdvvQTJxgvMO.rGw5MEuMsj6XbjuV7"

 # Some constants used to handle disabled and unstable jobs
 JOB_NAME_SEP = "/"
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -30,7 +30,7 @@ CUDA_ARCHES_CUDNN_VERSION = {
 }

 # NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
-ROCM_ARCHES = ["6.4", "7.0"]
+ROCM_ARCHES = ["6.3", "6.4"]

 XPU_ARCHES = ["xpu"]

@ -43,55 +43,55 @@ CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"]

 PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
    "12.6": (
-        "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | "
-        "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | "
-        "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | "
-        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
-        "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | "
-        "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | "
-        "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | "
-        "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | "
-        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
-        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
-        "nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
-        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
-        "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
-        "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
+        "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
    "12.8": (
-        "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | "
-        "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | "
-        "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | "
-        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
-        "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | "
-        "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | "
-        "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | "
-        "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | "
-        "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
-        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
-        "nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
-        "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
-        "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
-        "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
+        "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
    "13.0": (
-        "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
-        "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
-        "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | "
-        "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
-        "nvidia-cublas==13.0.0.19; platform_system == 'Linux' | "
-        "nvidia-cufft==12.0.0.15; platform_system == 'Linux' | "
-        "nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
-        "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
-        "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
-        "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
-        "nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
-        "nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
-        "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
-        "nvidia-cufile==1.15.0.42; platform_system == 'Linux'"
+        "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
    "xpu": (
        "intel-cmplr-lib-rt==2025.2.1 | "
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@ -127,6 +127,53 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
    ),
 ]

+ROCM_SMOKE_WORKFLOWS = [
+    BinaryBuildWorkflow(
+        os=OperatingSystem.LINUX,
+        package_type="manywheel",
+        build_variant="rocm",
+        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
+            OperatingSystem.LINUX,
+            arches=["6.4"],
+            python_versions=["3.9"],
+        ),
+        ciflow_config=CIFlowConfig(
+            labels={
+                LABEL_CIFLOW_BINARIES,
+                LABEL_CIFLOW_BINARIES_WHEEL,
+                LABEL_CIFLOW_ROCM,
+            },
+            isolated_workflow=True,
+        ),
+        branches="main",
+    ),
+]
+
+LINUX_BINARY_SMOKE_WORKFLOWS = [
+    BinaryBuildWorkflow(
+        os=OperatingSystem.LINUX,
+        package_type="manywheel",
+        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
+            OperatingSystem.LINUX,
+            arches=["12.8"],
+            python_versions=["3.12"],
+        ),
+        branches="main",
+    ),
+    BinaryBuildWorkflow(
+        os=OperatingSystem.LINUX,
+        package_type="libtorch",
+        build_variant=generate_binary_build_matrix.RELEASE,
+        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
+            OperatingSystem.LINUX,
+            generate_binary_build_matrix.RELEASE,
+            arches=["cpu"],
+            libtorch_variants=["shared-with-deps"],
+        ),
+        branches="main",
+    ),
+]
+
 WINDOWS_BINARY_BUILD_WORKFLOWS = [
    BinaryBuildWorkflow(
        os=OperatingSystem.WINDOWS,
@ -212,6 +259,39 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
    ),
 ]

+WINDOWS_BINARY_SMOKE_WORKFLOWS = [
+    BinaryBuildWorkflow(
+        os=OperatingSystem.WINDOWS,
+        package_type="libtorch",
+        build_variant=generate_binary_build_matrix.RELEASE,
+        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
+            OperatingSystem.WINDOWS,
+            generate_binary_build_matrix.RELEASE,
+            arches=["cpu"],
+            libtorch_variants=["shared-with-deps"],
+        ),
+        branches="main",
+        ciflow_config=CIFlowConfig(
+            isolated_workflow=True,
+        ),
+    ),
+    BinaryBuildWorkflow(
+        os=OperatingSystem.WINDOWS,
+        package_type="libtorch",
+        build_variant=generate_binary_build_matrix.DEBUG,
+        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
+            OperatingSystem.WINDOWS,
+            generate_binary_build_matrix.DEBUG,
+            arches=["cpu"],
+            libtorch_variants=["shared-with-deps"],
+        ),
+        branches="main",
+        ciflow_config=CIFlowConfig(
+            isolated_workflow=True,
+        ),
+    ),
+]
+
 MACOS_BINARY_BUILD_WORKFLOWS = [
    BinaryBuildWorkflow(
        os=OperatingSystem.MACOS_ARM64,
@ -292,10 +372,23 @@ def main() -> None:
            jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
            S390X_BINARY_BUILD_WORKFLOWS,
        ),
+        (
+            # Give rocm it's own workflow file
+            jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
+            ROCM_SMOKE_WORKFLOWS,
+        ),
+        (
+            jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
+            LINUX_BINARY_SMOKE_WORKFLOWS,
+        ),
        (
            jinja_env.get_template("windows_binary_build_workflow.yml.j2"),
            WINDOWS_BINARY_BUILD_WORKFLOWS,
        ),
+        (
+            jinja_env.get_template("windows_binary_build_workflow.yml.j2"),
+            WINDOWS_BINARY_SMOKE_WORKFLOWS,
+        ),
        (
            jinja_env.get_template("macos_binary_build_workflow.yml.j2"),
            MACOS_BINARY_BUILD_WORKFLOWS,
--- a/.github/scripts/prepare_vllm_wheels.sh
+++ b/.github/scripts/prepare_vllm_wheels.sh
@ -1,94 +0,0 @@
-#!/usr/bin/env bash
-
-set -eux
-
-torch_version=$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-nightly=$(echo ${torch_version} | cut -d'.' -f4)
-
-# Copied from .ci/manywheel/build_common.sh
-make_wheel_record() {
-  fpath=$1
-  if echo $fpath | grep RECORD >/dev/null 2>&1; then
-    echo "$fpath,,"
-  else
-    fhash=$(openssl dgst -sha256 -binary $fpath | openssl base64 | sed -e 's/+/-/g' | sed -e 's/\//_/g' | sed -e 's/=//g')
-    fsize=$(ls -nl $fpath | awk '{print $5}')
-    echo "$fpath,sha256=$fhash,$fsize"
-  fi
-}
-
-change_wheel_version() {
-  local package=$1
-  local wheel=$2
-  local f_version=$3
-  local t_version=$4
-
-  # Extract the wheel
-  ${PYTHON_EXECUTABLE} -mwheel unpack $wheel
-
-  mv "${package}-${f_version}" "${package}-${t_version}"
-  # Change the version from f_version to t_version in the dist-info dir
-  pushd "${package}-${t_version}"
-  mv "${package}-${f_version}.dist-info" "${package}-${t_version}.dist-info"
-
-  pushd "${package}-${t_version}.dist-info"
-  sed -i "s/${package}-${f_version}.dist-info/${package}-${t_version}.dist-info/g" RECORD
-
-  # Update the version in METADATA and its SHA256 hash
-  sed -i "s/Version: ${f_version}/Version: ${t_version}/g" METADATA
-  # then add PyTorch nightly dependency of vLLM
-  if [[ "${package}" == vllm ]] || [[ "${package}" == xformers ]]; then
-    sed -i "/License-File/a\Requires-Dist: torch==${torch_version}" METADATA
-  fi
-  sed -i '/METADATA,sha256/d' RECORD
-  popd
-
-  make_wheel_record "${package}-${t_version}.dist-info/METADATA" >> "${package}-${t_version}.dist-info/RECORD"
-  popd
-
-  # Repack the wheel
-  ${PYTHON_EXECUTABLE} -mwheel pack "${package}-${t_version}"
-
-  # Clean up
-  rm -rf "${package}-${t_version}"
-}
-
-repackage_wheel() {
-  local package=$1
-  pushd $package
-
-  local orig_wheel=$(find . -name *${package//-/_}*)
-  local orig_version=$(unzip -p $orig_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-
-  local version=""
-  if [[ "${package}" == vllm ]]; then
-    # Copied from vllm/.buildkite/scripts/upload-wheels.sh
-    version=1.0.0
-  else
-    version=$(echo $orig_version | tr '.+' '.' | cut -d'.' -f1-3)
-  fi
-  local nightly_version=$version.$nightly
-
-  # Use nightly version
-  change_wheel_version ${package//-/_} $orig_wheel $orig_version $nightly_version
-  # Clean up
-  rm "${orig_wheel}"
-
-  auditwheel repair --plat $PLATFORM *.whl \
-    --exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv*
-  local repair_wheel=$(find wheelhouse -name *${PLATFORM}*)
-  local repair_wheel=$(basename ${repair_wheel})
-  popd
-
-  cp ${package}/wheelhouse/${repair_wheel} .
-  rm -rf $package
-}
-
-# Require to re-package the wheel
-${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1
-
-pushd externals/vllm/wheels
-for package in xformers flashinfer-python vllm; do
-  repackage_wheel $package
-done
-popd
--- a/.github/templates/common.yml.j2
+++ b/.github/templates/common.yml.j2
@ -32,7 +32,7 @@ concurrency:
 {%- macro setup_ec2_windows() -%}
      !{{ display_ec2_information() }}
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/templates/linux_binary_build_workflow.yml.j2
+++ b/.github/templates/linux_binary_build_workflow.yml.j2
@ -56,7 +56,7 @@ jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -71,15 +71,12 @@ jobs:
    with:!{{ upload.binary_env_as_input(config) }}
      {%- if "aarch64" in build_environment %}
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      {%- elif "s390x" in build_environment %}
      runs_on: linux.s390x
      ALPINE_IMAGE: "docker.io/s390x/alpine"
      timeout-minutes: 420
-      {%- elif config["gpu_arch_type"] == "rocm" %}
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.24xlarge.ephemeral
@ -138,7 +135,7 @@ jobs:
      contents: read
    steps:
      - name: Setup XPU
-        uses: pytorch/pytorch/.github/actions/setup-xpu@main
+        uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9
      - name: configure aws credentials
        id: aws_creds
        uses: aws-actions/configure-aws-credentials@v4
@ -153,10 +150,10 @@ jobs:
        with:
          name: !{{ config["build_name"] }}
          path: "${{ runner.temp }}/artifacts/"
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
      - name: Calculate docker image
        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9
        with:
          docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
          docker-image-name: !{{ config["container_image"] }}
@ -164,7 +161,7 @@ jobs:
          docker-build-dir: .ci/docker
          working-directory: pytorch
      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
      - name: Test Pytorch binary
@ -185,7 +182,7 @@ jobs:
        with:
          name: !{{ config["build_name"] }}
          path: "${{ runner.temp }}/artifacts/"
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
      - name: ROCm set GPU_FLAG
        run: |
          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
@ -199,7 +196,7 @@ jobs:
          role-duration-seconds: 18000
      - name: Calculate docker image
        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9
        with:
          docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
          docker-image-name: !{{ config["container_image"] }}
@ -207,7 +204,7 @@ jobs:
          docker-build-dir: .ci/docker
          working-directory: pytorch
      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
      - name: Test Pytorch binary
--- a/.github/templates/macos_binary_build_workflow.yml.j2
+++ b/.github/templates/macos_binary_build_workflow.yml.j2
@ -22,16 +22,6 @@ name: !{{ build_environment }}
          echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
 {%- endmacro %}

-{%- macro setup_python(py_ver) -%}
-      - name: Setup Python
-        uses: actions/setup-python@v6
-        with:
-          # TODO: Removeme once 3.14 is out
-          # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
-          python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}"
-          freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
-{%- endmacro %}
-
 on:
 # TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321
  push:
@ -71,13 +61,23 @@ jobs:
    {%- endif %}
    steps:
      !{{ set_runner_specific_vars() }}
-      !{{ setup_python(config.get("python_version", "3.10")) }}
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      - name: Install conda and dependencies
+        run: |
+          # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
+          chmod +x "${RUNNER_TEMP}/conda.sh"
+          /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
+          echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
      - name: Populate binary env
        run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
      - name: Build PyTorch binary
        run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
          set -eux -o pipefail
          # shellcheck disable=SC1090
          source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -94,6 +94,8 @@ jobs:
 {%- if config["package_type"] == "wheel" %}
      - name: Test PyTorch wheel
        run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
          set -eux -o pipefail
          # shellcheck disable=SC1090
          source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -104,9 +106,33 @@ jobs:

          SMOKE_TEST_PARAMS=""

+          EXTRA_CONDA_INSTALL_FLAGS=""
+          CONDA_ENV_CREATE_FLAGS=""
+          # shellcheck disable=SC2153
+          case $DESIRED_PYTHON in
+            3.14t)
+              CONDA_ENV_CREATE_FLAGS="python-freethreading"
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+              desired_python="3.14.0rc1"
+              ;;
+            3.14)
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+              desired_python="3.14.0rc1"
+              ;;
+            3.13t)
+              CONDA_ENV_CREATE_FLAGS="python-freethreading"
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
+              desired_python="3.13"
+              ;;
+            *)
+              # shellcheck disable=SC2153
+              desired_python=${DESIRED_PYTHON}
+              ;;
+          esac
+
          # shellcheck disable=SC2086
-          python -mvenv test_venv
-          source test_venv/bin/activate
+          conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
+          conda activate test_conda_env
          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

          # shellcheck disable=SC2086
--- a/.github/templates/windows_binary_build_workflow.yml.j2
+++ b/.github/templates/windows_binary_build_workflow.yml.j2
@ -64,7 +64,7 @@ jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -135,7 +135,7 @@ jobs:
 {%- else %}
      !{{ set_runner_specific_vars() }}
      !{{ common.setup_ec2_windows() }}
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
 {%- endif %}
      - name: Populate binary env
        shell: bash
@ -211,7 +211,7 @@ jobs:
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
 {%- else %}
      !{{ common.setup_ec2_windows() }}
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
      !{{ set_runner_specific_vars() }}
 {%- endif %}
      - uses: !{{ common.download_artifact_action }}
--- a/.github/workflows/_bazel-build-test.yml
+++ b/.github/workflows/_bazel-build-test.yml
@ -47,7 +47,7 @@ jobs:
      reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }}
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9
        with:
          fetch-depth: 1
          submodules: false
@ -69,25 +69,25 @@ jobs:
    runs-on: ${{ matrix.runner }}
    steps:
      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9

      - name: Setup Linux
        uses: ./.github/actions/setup-linux

      - name: Calculate docker image
        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9
        with:
          docker-image-name: ${{ inputs.docker-image-name }}

      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

@ -97,7 +97,7 @@ jobs:
        run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
-        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
+        uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.9
        if: ${{ inputs.cuda-version != 'cpu' && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}

      - name: Output disk space left
@ -209,5 +209,5 @@ jobs:
          file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }}

      - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9
        if: always()
--- a/.github/workflows/_binary-build-linux.yml
+++ b/.github/workflows/_binary-build-linux.yml
@ -142,13 +142,13 @@ jobs:

      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        if: inputs.build_environment != 'linux-s390x-binary-manywheel'
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9
        continue-on-error: true
        with:
          github-secret: ${{ secrets.github-token }}

      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9
        with:
          no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }}

@ -178,7 +178,6 @@ jobs:
      - name: Checkout PyTorch to pytorch dir
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          submodules: recursive
          path: pytorch
          show-progress: false
@ -213,9 +212,9 @@ jobs:
      - name: Calculate docker image
        id: calculate-docker-image
        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9
        with:
-          # If doing this in main or release branch, use docker.io. Otherwise
+          # If doing this in release/2.9 or release branch, use docker.io. Otherwise
          # use ECR
          docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
          docker-image-name: ${{ inputs.DOCKER_IMAGE }}
@ -227,7 +226,7 @@ jobs:

      - name: Pull Docker image
        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

@ -283,7 +282,7 @@ jobs:

      - name: Teardown Linux
        if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel'
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9

      - name: Chown workspace
        if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel'
--- a/.github/workflows/_binary-test-linux.yml
+++ b/.github/workflows/_binary-test-linux.yml
@ -125,14 +125,14 @@ jobs:

      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        if: inputs.build_environment != 'linux-s390x-binary-manywheel'
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9
        continue-on-error: true
        with:
          github-secret: ${{ secrets.github-token }}

        # Setup the environment
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9
        with:
          no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }}

@ -155,7 +155,6 @@ jobs:
      - name: Checkout PyTorch to pytorch dir
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          submodules: recursive
          show-progress: false
          path: pytorch
@ -186,7 +185,9 @@ jobs:
          path: "${{ runner.temp }}/artifacts/"

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
-        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
+        uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.9
+        with:
+          driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }}
        if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}

      - name: configure aws credentials
@ -201,7 +202,7 @@ jobs:
      - name: Calculate docker image
        id: calculate-docker-image
        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9
        with:
          docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
          docker-image-name: ${{ inputs.DOCKER_IMAGE }}
@ -211,7 +212,7 @@ jobs:

      - name: Pull Docker image
        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

@ -223,7 +224,7 @@ jobs:

      - name: Teardown Linux
        if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel'
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9

      - name: Chown workspace
        if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel'
--- a/.github/workflows/_binary-upload.yml
+++ b/.github/workflows/_binary-upload.yml
@ -81,7 +81,7 @@ jobs:
      SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9
        with:
          no-sudo: true

--- a/.github/workflows/_docs.yml
+++ b/.github/workflows/_docs.yml
@ -67,7 +67,7 @@ jobs:
            # an OOM issue when running the job, so this upgrades the runner from 4xlarge
            # to the next available tier of 12xlarge. So much memory just to generate cpp
            # doc
-            runner: ${{ inputs.runner_prefix }}linux.12xlarge.memory
+            runner: ${{ inputs.runner_prefix }}linux.12xlarge
            # TODO: Nightly cpp docs take longer and longer to finish (more than 3h now)
            # Let's try to figure out how this can be improved
            timeout-minutes: 360
@ -75,12 +75,16 @@ jobs:
            runner: ${{ inputs.runner_prefix }}linux.2xlarge
            # It takes less than 30m to finish python docs unless there are issues
            timeout-minutes: 30
+          - docs_type: functorch
+            runner: ${{ inputs.runner_prefix }}linux.2xlarge
+            # It takes less than 15m to finish functorch docs unless there are issues
+            timeout-minutes: 15
    # Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
    # The current name requires updating the database last docs push query from test-infra every time the matrix is updated
    name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}
    steps:
      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
          instructions: |
@ -91,7 +95,7 @@ jobs:

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9

      - name: Setup Linux
        uses: ./.github/actions/setup-linux
@ -106,12 +110,12 @@ jobs:

      - name: Calculate docker image
        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9
        with:
          docker-image-name: ${{ inputs.docker-image }}

      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

@ -207,6 +211,16 @@ jobs:
          path: cppdocs/
          s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs

+      - name: Upload functorch Docs Preview
+        uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
+        if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }}
+        with:
+          retention-days: 14
+          s3-bucket: doc-previews
+          if-no-files-found: error
+          path: functorch_ghpages/nightly/
+          s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs
+
      - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9
        if: always()
--- a/.github/workflows/_get-changed-files.yml
+++ b/.github/workflows/_get-changed-files.yml
@ -2,12 +2,6 @@ name: Get Changed Files

 on:
  workflow_call:
-    inputs:
-      all_files:
-        description: "Whether to return all files instead of just changed files"
-        required: false
-        type: boolean
-        default: false
    outputs:
      changed-files:
        description: "List of changed files (space-separated) or '*' if not in a PR"
@ -32,23 +26,17 @@ jobs:
            # Get the PR number from the github context
            PR_NUMBER="${{ github.event.number }}"

-            # Check if all_files is requested
-            if [ "${{ inputs.all_files }}" = "true" ]; then
-              echo "all_files input is true, returning all files"
-              echo "changed-files=*" >> "$GITHUB_OUTPUT"
-            else
-              # Use gh CLI to get changed files in the PR with explicit repo
-              CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
+            # Use gh CLI to get changed files in the PR with explicit repo
+            CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')

-              if [ -z "$CHANGED_FILES" ]; then
-                echo "No changed files found, setting to '*'"
-                CHANGED_FILES="*"
-              fi
-
-              echo "Changed files: $CHANGED_FILES"
-              echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
+            if [ -z "$CHANGED_FILES" ]; then
+              echo "No changed files found, setting to '*'"
+              CHANGED_FILES="*"
            fi

+            echo "Changed files: $CHANGED_FILES"
+            echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
+
          else
            echo "Not in PR context, setting changed files to '*'"
            echo "changed-files=*" >> "$GITHUB_OUTPUT"
--- a/.github/workflows/_link_check.yml
+++ b/.github/workflows/_link_check.yml
@ -11,7 +11,7 @@ on:
 jobs:
  lint-urls:
    if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-url-lint') }}
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9
    with:
      job-name: lint-urls
      timeout: 120
@ -37,7 +37,7 @@ jobs:

  lint-xrefs:
    if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-xref-lint') }}
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9
    with:
      job-name: lint-xrefs
      timeout: 60
--- a/.github/workflows/_linux-build.yml
+++ b/.github/workflows/_linux-build.yml
@ -134,7 +134,7 @@ jobs:
      test-matrix: ${{ steps.filter.outputs.test-matrix }}
    steps:
      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
@ -147,7 +147,7 @@ jobs:
      # checkout because when we run this action we don't *have* a local
      # checkout. In other cases you should prefer a local checkout.
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9
        with:
          no-sudo: true

@ -183,7 +183,7 @@ jobs:

      - name: Calculate docker image
        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          docker-image-name: ${{ inputs.docker-image-name }}
@ -199,7 +199,7 @@ jobs:
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9
        if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true'
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
@ -457,7 +457,7 @@ jobs:
          artifact_prefix: usage_log_build_${{ steps.get-job-id.outputs.job-id }}

      - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9
        if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel'

      - name: Cleanup docker
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@ -99,7 +99,7 @@ jobs:
      contents: read
    steps:
      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9
        if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
@ -108,7 +108,7 @@ jobs:
              docker exec -it $(docker container ps --format '{{.ID}}') bash

      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9
        with:
          no-sudo: true

@ -139,7 +139,7 @@ jobs:

      - name: Calculate docker image
        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          docker-image-name: ${{ inputs.docker-image }}
@ -155,7 +155,7 @@ jobs:
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
@ -167,9 +167,9 @@ jobs:

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        id: install-nvidia-driver
-        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
+        uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.9
        with:
-          driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }}
+          driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}

      - name: Setup GPU_FLAG for docker run
@ -273,8 +273,6 @@ jobs:
          TEST_CONFIG: ${{ matrix.config }}
          SHARD_NUMBER: ${{ matrix.shard }}
          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
-          EXTRA_FLAGS: ${{ matrix.extra_flags || '' }}
-          OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }}
          REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
          CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
@ -420,7 +418,7 @@ jobs:
          aws-region: us-east-1

      - name: Upload the benchmark results
-        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.9
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          benchmark-results-dir: test/test-reports
@ -478,7 +476,7 @@ jobs:
          workflow_attempt: ${{github.run_attempt}}

      - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9
        if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'

      # NB: We are currently having an intermittent GPU-related issue on G5 runners with
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@ -67,11 +67,11 @@ jobs:
      test-matrix: ${{ steps.filter.outputs.test-matrix }}
    steps:
      - name: Clean up disk space before running MacOS workflow
-        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+        uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.9

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9

      - name: Set xcode version
        env:
@ -82,7 +82,7 @@ jobs:
          fi

      - name: Setup Python
-        uses: pytorch/test-infra/.github/actions/setup-python@main
+        uses: pytorch/test-infra/.github/actions/setup-python@release/2.9
        with:
          python-version: ${{ inputs.python-version }}
          pip-requirements-file: .github/requirements/pip-requirements-macOS.txt
@ -188,4 +188,4 @@ jobs:
      - name: Clean up disk space
        if: always()
        continue-on-error: true
-        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+        uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.9
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@ -105,11 +105,11 @@ jobs:
          done

      - name: Clean up disk space before running MacOS workflow
-        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+        uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.9

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9

      - name: Get workflow job id
        id: get-job-id
@ -119,7 +119,7 @@ jobs:
          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Setup Python
-        uses: pytorch/test-infra/.github/actions/setup-python@main
+        uses: pytorch/test-infra/.github/actions/setup-python@release/2.9
        with:
          python-version: ${{ inputs.python-version }}
          pip-requirements-file: .github/requirements/pip-requirements-macOS.txt
@ -257,7 +257,7 @@ jobs:
          file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}

      - name: Upload the benchmark results
-        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.9
        with:
          benchmark-results-dir: test/test-reports
          dry-run: false
@ -287,4 +287,4 @@ jobs:
      - name: Clean up disk space
        if: always()
        continue-on-error: true
-        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+        uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.9
--- a/.github/workflows/_rocm-test.yml
+++ b/.github/workflows/_rocm-test.yml
@ -62,11 +62,6 @@ on:
        required: false
        type: number
        default: 1
-    secrets:
-      HUGGING_FACE_HUB_TOKEN:
-        required: false
-        description: |
-          HF Auth token to avoid rate limits when downloading models or datasets from hub
 env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

@ -81,11 +76,12 @@ jobs:
    strategy:
      matrix: ${{ fromJSON(inputs.test-matrix) }}
      fail-fast: false
-    runs-on: ${{ matrix.runner }}
    timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
+    runs-on: ${{ matrix.runner }}
    steps:
+      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9
        with:
          no-sudo: true

@ -117,12 +113,12 @@ jobs:

      - name: Calculate docker image
        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9
        with:
          docker-image-name: ${{ inputs.docker-image }}

      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

@ -135,9 +131,6 @@ jobs:

      - name: Start monitoring script
        id: monitor-script
-        if: ${{ !inputs.disable-monitor }}
-        shell: bash
-        continue-on-error: true
        env:
          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
@ -145,6 +138,9 @@ jobs:
          WORKFLOW_RUN_ID: ${{github.run_id}}
          MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
          MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
+        if: ${{ !inputs.disable-monitor }}
+        shell: bash
+        continue-on-error: true
        run: |
          python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7
          python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
@ -182,12 +178,6 @@ jobs:
        run: |
          echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"

-      - name: Preserve github env variables for use in docker
-        shell: bash
-        run: |
-          env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
-          env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
-
      - name: Test
        id: test
        env:
@ -203,22 +193,20 @@ jobs:
          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-          BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
-          TEST_CONFIG: ${{ matrix.config }}
-          SHARD_NUMBER: ${{ matrix.shard }}
-          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
-          REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
          CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
          TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
          NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
          NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
+          TEST_CONFIG: ${{ matrix.config }}
+          SHARD_NUMBER: ${{ matrix.shard }}
+          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
+          REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
          DOCKER_IMAGE: ${{ inputs.docker-image }}
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
          TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
          DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
-          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
        run: |
          set -x
@ -248,7 +236,6 @@ jobs:
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e JOB_NAME \
-            -e BASE_SHA \
            -e BRANCH \
            -e SHA1 \
            -e AWS_DEFAULT_REGION \
@ -266,12 +253,10 @@ jobs:
            -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
            -e TESTS_TO_INCLUDE \
-            -e HUGGING_FACE_HUB_TOKEN \
            -e DASHBOARD_TAG \
            --env-file="${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" \
            --ulimit stack=10485760:83886080 \
            --ulimit core=0 \
-            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="8g" \
@ -345,7 +330,7 @@ jobs:
          aws-region: us-east-1

      - name: Upload the benchmark results
-        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.9
        with:
          benchmark-results-dir: test/test-reports
          dry-run: false
--- a/.github/workflows/_runner-determinator.yml
+++ b/.github/workflows/_runner-determinator.yml
@ -59,7 +59,7 @@ jobs:
      PR_NUMBER: ${{ github.event.pull_request.number }}
    steps:
      # - name: Checkout PyTorch
-      #   uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+      #   uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9
      #   with:
      #     fetch-depth: 1
      #     submodules: true
--- a/.github/workflows/_win-build.yml
+++ b/.github/workflows/_win-build.yml
@ -85,10 +85,10 @@ jobs:
          git config --global core.fsmonitor false

      - name: Clean up leftover processes on non-ephemeral Windows runner
-        uses: pytorch/test-infra/.github/actions/cleanup-runner@main
+        uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.9

      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
          instructions: |
@ -103,7 +103,7 @@ jobs:

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9
        with:
          no-sudo: true

@ -151,7 +151,7 @@ jobs:
          BUILD_WHEEL: 1
          MAX_JOBS: 8
          CUDA_VERSION: ${{ inputs.cuda-version }}
-          PYTHON_VERSION: "3.10"
+          PYTHON_VERSION: "3.9"
          SCCACHE_BUCKET: "ossci-compiler-cache"
          SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
          SCCACHE_REGION: us-east-1
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@ -78,10 +78,10 @@ jobs:
          git config --global core.fsmonitor false

      - name: Clean up leftover processes on non-ephemeral Windows runner
-        uses: pytorch/test-infra/.github/actions/cleanup-runner@main
+        uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.9

      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
          instructions: |
@ -97,7 +97,7 @@ jobs:

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9
        with:
          no-sudo: true

@ -184,7 +184,7 @@ jobs:
        env:
          USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }}
          INSTALL_WINDOWS_SDK: 1
-          PYTHON_VERSION: "3.10"
+          PYTHON_VERSION: 3.9
          CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
          TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
--- a/.github/workflows/_xpu-test.yml
+++ b/.github/workflows/_xpu-test.yml
@ -77,7 +77,7 @@ jobs:
    steps:
      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9

      - name: Setup XPU
        uses: ./.github/actions/setup-xpu
@ -95,7 +95,7 @@ jobs:

      - name: Calculate docker image
        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9
        with:
          docker-image-name: ${{ inputs.docker-image }}

@ -109,7 +109,7 @@ jobs:
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

--- a/.github/workflows/b200-symm-mem.yml
+++ b/.github/workflows/b200-symm-mem.yml
@ -1,60 +0,0 @@
-name: Limited CI for symmetric memory tests on B200
-
-on:
-  pull_request:
-    paths:
-      - .github/workflows/b200-symm-mem.yml
-  workflow_dispatch:
-  push:
-    tags:
-      - ciflow/b200-symm-mem/*
-  schedule:
-    - cron: 22 8 * * *  # about 1:22am PDT
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-
-  get-label-type:
-    if: github.repository_owner == 'pytorch'
-    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-
-  linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm:
-    name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      cuda-arch-list: '10.0'
-      test-matrix: |
-        { include: [
-          { config: "b200-symm-mem", shard: 1, num_shards: 1, runner: "linux.dgx.b200.8" },
-        ]}
-    secrets: inherit
-
-  linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
-    name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
-    uses: ./.github/workflows/_linux-test.yml
-    needs:
-      - linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm
-    with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
-      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.test-matrix }}
-      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-    secrets: inherit
--- a/.github/workflows/build-almalinux-images.yml
+++ b/.github/workflows/build-almalinux-images.yml
@ -36,10 +36,10 @@ jobs:
    runs-on: linux.9xlarge.ephemeral
    strategy:
      matrix:
-        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"]
+        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"]
    steps:
      - name: Build docker image
-        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
+        uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.9
        with:
          docker-image-name: almalinux-builder
          custom-tag-prefix: ${{matrix.tag}}
--- a/.github/workflows/build-libtorch-images.yml
+++ b/.github/workflows/build-libtorch-images.yml
@ -32,7 +32,7 @@ jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -52,13 +52,13 @@ jobs:
          { tag: "cuda12.9" },
          { tag: "cuda12.8" },
          { tag: "cuda12.6" },
+          { tag: "rocm6.3"  },
          { tag: "rocm6.4"  },
-          { tag: "rocm7.0"  },
          { tag: "cpu"      },
        ]
    steps:
      - name: Build docker image
-        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
+        uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.9
        with:
          docker-image-name: libtorch-cxx11-builder
          custom-tag-prefix: ${{ matrix.tag }}
--- a/Show More
+++ b/Show More