## Summary
This PR adds a missing `#include <fstream>` to fix a compilation error that occurred with the clang compiler on the standard *Google internal compile setup* (built with bazel).
## Details
The `std::ofstream` type was implicitly instantiated, which can cause compilation to fail with certain compilers. In this case, the clang compiler within the Google internal compile setup failed with an implicit instantiation error of `std::basic_ofstream<char>`. By explicitly including the `<fstream>` header, this PR resolves the error and ensures proper compilation in a wider range of setups and compilers.
## Error message:
```
torch/csrc/distributed/c10d/FlightRecorder.cpp:8:17: error: implicit instantiation of undefined template 'std::basic_ofstream<char>'
8 | std::ofstream file(filename_, std::ios::binary);
| ^
libcxx/include/__fwd/fstream.h:26:7: note: template is declared here
26 | class basic_ofstream;
| ^
1 error generated.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162421
Approved by: https://github.com/ezyang
Fixes #159590
This is similar to the reverted commit #156868, except it resolves an issue with two caches becoming misaligned, leading to incorrect objects for stateful placements (i.e. `_MaskPartial`) as in issue #159601. This adds little to no overhead in eager ([see past benchmarks](https://github.com/pytorch/pytorch/pull/156868#issuecomment-3047831149)).
This also handles cases such as #159590 where dynamo is disabled during tracing, by entering the Python Dispatcher ahead of the sharding propagation during compile. Tests are added/modified to cover these cases, as well as list/tuple inputs with the cat op.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160798
Approved by: https://github.com/bdhirsh
This PR is quite large in that it covers most of the rough edges in the new strict export flow:
1. Handle nn_module_stack correctly now that we are tracing the wrapper module.
2. module_call_spec needs to be queried from the source directly because we are not running the bytecode anymore.
3. Correct input and output handling.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162183
Approved by: https://github.com/zhxchen17
ghstack dependencies: #162167
Summary:
When compiled code contains a generator, `code.co_firstlineno` will be inconsistent with the result of `inspect.getsource`, which returns the source of the top-level enclosing code rather than the inner code location.
In this case, it seems simpler to just use the top-level enclosing code location rather than the `co_firstlineno` field.
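For illustration, here is a minimal, hypothetical sketch (not from this PR) of the mismatch: a generator expression compiles into its own nested code object whose `co_firstlineno` points at the inner line, while the enclosing function's source starts at the `def` line.
```python
import inspect

def fn():
    # the generator expression below compiles into its own nested code object
    return sum(i * i for i in range(4))

genexpr_code = next(c for c in fn.__code__.co_consts if inspect.iscode(c))
print(genexpr_code.co_firstlineno)            # line of the generator expression
print(fn.__code__.co_firstlineno)             # line of the enclosing `def fn():`
# run this from a file so inspect can locate the source text
print(inspect.getsource(fn).splitlines()[0])  # source starts at the enclosing def
```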
Test Plan:
test_package.py -k test_code_with_generator
Rollback Plan:
Differential Revision: D81929751
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162389
Approved by: https://github.com/dolpm, https://github.com/hrithick-codes
[relanding again after fixing internal build]
Summary:
This might cause some new DDEs at call sites that do not use is_contiguous_or_false() or sym_is_contiguous(),
but we want to find those call sites and handle them properly by explicitly calling is_contiguous_or_false() instead of is_contiguous() when appropriate.
I had to fix one issue after removing the implicit size-oblivious reasoning. Here is the context:
In https://github.com/pytorch/pytorch/pull/157472 we defined `sym_is_contiguous` to be the function that computes contiguity for dynamic shapes in C++. It returns a symbolic expression that represents contiguity and is guaranteed not to throw a DDE.
When people call `is_contiguous`, we do `sym_is_contiguous().guard_bool()`.
When people call `is_contiguous_or_false`, we do `sym_is_contiguous().guard_or_false()`.
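A minimal, self-contained sketch of the two resolution modes described above; `SymBoolSketch` is a stand-in for the real SymBool, not PyTorch's implementation.
```python
class SymBoolSketch:
    """Stand-in for a SymBool; value=None models an unbacked/unknown symbol."""

    def __init__(self, value):
        self.value = value

    def guard_bool(self):
        # is_contiguous(): may throw a data-dependent error (DDE) when unbacked
        if self.value is None:
            raise RuntimeError("data-dependent error: unbacked SymBool")
        return self.value

    def guard_or_false(self):
        # is_contiguous_or_false(): never throws; unknown resolves to False
        return False if self.value is None else self.value

unbacked = SymBoolSketch(None)
print(unbacked.guard_or_false())  # False, no DDE
# unbacked.guard_bool()           # would raise the DDE described above
```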
One path that was not handled well was this:
```
c10::SymBool TensorImpl::sym_is_contiguous_custom(
    at::MemoryFormat memory_format) const {
  if (C10_UNLIKELY(matches_python_custom(SizesStridesPolicy::CustomStrides))) {
    return pyobj_slot_.load_pyobj_interpreter()->is_contiguous(
        this, memory_format);
  }
  return sym_is_contiguous_default(memory_format);
}
```
Namely, if we call `sym_is_contiguous_custom` and `matches_python_custom(SizesStridesPolicy::CustomStrides)` returns true, then we used to call `is_contiguous(this, memory_format)`.
This used to go through `load_pyobj_interpreter` and end up calling the Python `is_contiguous`, which used implicit size-oblivious reasoning.
Once we removed that implicit size-oblivious reasoning, the right thing is to call
`return pyobj_slot_.load_pyobj_interpreter()->sym_is_contiguous(this, memory_format);`
otherwise we would get a DDE even if the caller is doing `sym_is_contiguous`.
So I had to define it for the pyinterpreter, and then override it for nested tensors.
Approved by: https://github.com/ezyang
Test Plan:
contbuild & OSS CI, see e444cd24d4
Rollback Plan:
Differential Revision: D80435179
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160869
Approved by: https://github.com/ezyang
# Summary
### Update
API
```Py
from typing import NamedTuple, Optional

from torch import Tensor
from torch.nn.attention.flex_attention import flex_attention


class AuxRequest(NamedTuple):
    """Request which auxiliary outputs to compute from flex_attention.

    Each field is a boolean indicating whether that auxiliary output should be computed.
    """

    lse: bool = False
    max_scores: bool = False


class AuxOutput(NamedTuple):
    """Auxiliary outputs from flex_attention operation.

    Fields will be None if not requested, or contain the tensor if requested.
    """

    lse: Optional[Tensor] = None
    max_scores: Optional[Tensor] = None


out_only = flex_attention(query, key, value, score_mod)
out_max, aux_max = flex_attention(
    query,
    key,
    value,
    score_mod,
    return_aux=AuxRequest(max_scores=True),
)
out_both, aux_both = flex_attention(
    query,
    key,
    value,
    score_mod,
    return_aux=AuxRequest(lse=True, max_scores=True),
)
```
Returns the max post mod scores from flex attention.
Not being able to break BC is kind of annoying here, since we end up with a combinatorial problem: if we need to add any more return values, we need new kwargs that gate whether they get returned by the function, and we need to support the 2**N possible return groups.
Ideally there isn't much more we need to return, but we might want to think about how best to set this up for expansion in the future. I added a kwarg-only argument for now.
Maybe we make an `ExtraReturns`-type kwarg that can grow so we don't need to keep adding new top-level args.
We could also return a struct that holds all the extra tensors and start a deprecation cycle for logsumexp, eventually returning just one `ExtraReturns`-like struct with the tensors.
### Req Grad
I currently don't return a max_scores that supports backpropagating gradients. I think this might be feasible, but since max is essentially a one-hot on the inputs plus a reduction, we would either need to save another `max_location` from the forward, or find the max_score and only apply it to the first occurrence if there are multiple equivalent scores (need to check whether that's what we define for the vanilla max op in torch).
For now no grad, we can re-visit if needed.
## Perf
I am going to disable this for flex_decode, since at least initially the motivation is for training. It is also harder than it should be to have ops return None or optional tensors. If returning max is disabled, we should probably just create a tensor of size zero so that we don't slow down the hot path.
```Shell
🔝 Top 5 TFlops Deltas (by absolute %):
shape: (5, 7)
┌────────────────┬────────────────┬───────────────────────┬───────────────┬──────────────┬───────────┬───────────┐
│ attn_type ┆ dtype ┆ shape(B,Hq,M,Hkv,N,D) ┆ TFlops (base) ┆ TFlops (max) ┆ delta ┆ pct_delta │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞════════════════╪════════════════╪═══════════════════════╪═══════════════╪══════════════╪═══════════╪═══════════╡
│ causal ┆ torch.bfloat16 ┆ (4, 16, 2048, 16, ┆ 249.514658 ┆ 243.078974 ┆ 6.435684 ┆ 2.647569 │
│ ┆ ┆ 2048, 64) ┆ ┆ ┆ ┆ │
│ alibi ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, ┆ 57.971274 ┆ 56.633641 ┆ 1.337633 ┆ 2.361905 │
│ ┆ ┆ 1024, 64) ┆ ┆ ┆ ┆ │
│ noop ┆ torch.bfloat16 ┆ (4, 16, 1024, 16, ┆ 244.052884 ┆ 248.65129 ┆ -4.598406 ┆ -1.849339 │
│ ┆ ┆ 1024, 64) ┆ ┆ ┆ ┆ │
│ noop ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, ┆ 280.71254 ┆ 275.686991 ┆ 5.025549 ┆ 1.822918 │
│ ┆ ┆ 1024, 128) ┆ ┆ ┆ ┆ │
│ sliding_window ┆ torch.bfloat16 ┆ (2, 16, 16384, 16, ┆ 152.970031 ┆ 150.489109 ┆ 2.480923 ┆ 1.648573 │
│ ┆ ┆ 16384, 64) ┆ ┆ ┆ ┆ │
└────────────────┴────────────────┴───────────────────────┴───────────────┴──────────────┴───────────┴───────────┘
🔺 Top 5 Positive TFlops Deltas (highest +%):
shape: (5, 7)
┌────────────────┬────────────────┬────────────────────────┬───────────────┬──────────────┬──────────┬───────────┐
│ attn_type ┆ dtype ┆ shape(B,Hq,M,Hkv,N,D) ┆ TFlops (base) ┆ TFlops (max) ┆ delta ┆ pct_delta │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞════════════════╪════════════════╪════════════════════════╪═══════════════╪══════════════╪══════════╪═══════════╡
│ causal ┆ torch.bfloat16 ┆ (4, 16, 2048, 16, ┆ 249.514658 ┆ 243.078974 ┆ 6.435684 ┆ 2.647569 │
│ ┆ ┆ 2048, 64) ┆ ┆ ┆ ┆ │
│ alibi ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, ┆ 57.971274 ┆ 56.633641 ┆ 1.337633 ┆ 2.361905 │
│ ┆ ┆ 1024, 64) ┆ ┆ ┆ ┆ │
│ noop ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, ┆ 280.71254 ┆ 275.686991 ┆ 5.025549 ┆ 1.822918 │
│ ┆ ┆ 1024, 128) ┆ ┆ ┆ ┆ │
│ sliding_window ┆ torch.bfloat16 ┆ (2, 16, 16384, 16, ┆ 152.970031 ┆ 150.489109 ┆ 2.480923 ┆ 1.648573 │
│ ┆ ┆ 16384, 64) ┆ ┆ ┆ ┆ │
│ causal ┆ torch.bfloat16 ┆ (4, 16, 1024, 16, ┆ 161.031318 ┆ 158.597808 ┆ 2.43351 ┆ 1.534391 │
│ ┆ ┆ 1024, 64) ┆ ┆ ┆ ┆ │
└────────────────┴────────────────┴────────────────────────┴───────────────┴──────────────┴──────────┴───────────┘
🔻 Top 5 Negative TFlops Deltas (lowest -%):
shape: (5, 7)
┌────────────────┬────────────────┬───────────────────────┬───────────────┬──────────────┬───────────┬───────────┐
│ attn_type ┆ dtype ┆ shape(B,Hq,M,Hkv,N,D) ┆ TFlops (base) ┆ TFlops (max) ┆ delta ┆ pct_delta │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞════════════════╪════════════════╪═══════════════════════╪═══════════════╪══════════════╪═══════════╪═══════════╡
│ noop ┆ torch.bfloat16 ┆ (4, 16, 1024, 16, ┆ 244.052884 ┆ 248.65129 ┆ -4.598406 ┆ -1.849339 │
│ ┆ ┆ 1024, 64) ┆ ┆ ┆ ┆ │
│ alibi ┆ torch.bfloat16 ┆ (2, 16, 1024, 4, ┆ 175.546923 ┆ 177.81205 ┆ -2.265127 ┆ -1.273888 │
│ ┆ ┆ 1024, 128) ┆ ┆ ┆ ┆ │
│ sliding_window ┆ torch.bfloat16 ┆ (4, 16, 16384, 4, ┆ 156.282597 ┆ 158.209134 ┆ -1.926537 ┆ -1.217715 │
│ ┆ ┆ 16384, 64) ┆ ┆ ┆ ┆ │
│ sliding_window ┆ torch.bfloat16 ┆ (2, 16, 2048, 16, ┆ 232.542929 ┆ 235.140136 ┆ -2.597207 ┆ -1.104536 │
│ ┆ ┆ 2048, 128) ┆ ┆ ┆ ┆ │
│ alibi ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, ┆ 169.652791 ┆ 171.475986 ┆ -1.823195 ┆ -1.063236 │
│ ┆ ┆ 1024, 128) ┆ ┆ ┆ ┆ │
└────────────────┴────────────────┴───────────────────────┴───────────────┴──────────────┴───────────┴───────────┘
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161667
Approved by: https://github.com/Chillee, https://github.com/BoyuanFeng
Summary: This PR introduces shape guards to export. Previously only value ranges, equalities, and specializations would be tracked for symbolic expressions, and we had a forward hook to check them. Instead now we create a function to check shape guards and call it in the exported program.
Test Plan:
updated several tests
Rollback Plan:
Differential Revision: D80713603
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161178
Approved by: https://github.com/tugsbayasgalan
Summary:
A tool to track events in graph split, specifically how nodes end up in acc or cpu subgraphs.
Usage: use env var to specify a mode and necessary arguments.
FX_NET_ACC_SPLITTER_TRACKER_MODE: Tracker mode.
```
Different modes of the event tracker:
"0": Tracker not enabled (by default)
"1": Tracker enabled but no dumps. Information available by setting breakpoints and visually inspect in pdb.
"2": Tracker enabled and dumps all events to DUMP_PREFIX_all.txt
"3": In addition to events dump, track nodes specified by ENV_FX_NET_ACC_SPLITTER_TRACKER_TRACKED_NODES recusrively and dump to DUMP_PREFIX_nodex.txt
"4:: In addition to events dump, track all nodes with more than 1 event recusrively and dump to DUMP_PREFIX_nodex.txt
```
FX_NET_ACC_SPLITTER_TRACKER_DUMP_PATH: overriding dump path. Leave empty for `~`.
FX_NET_ACC_SPLITTER_TRACKER_TRACKED_NODES: Nodes to track for mode "3".
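As a rough usage sketch (assuming the splitter is invoked from Python in the same process), the environment variables above can be set before the split runs; the dump directory below is illustrative.
```python
import os

# Enable full event dumping (mode "2"); the dump goes to DUMP_PREFIX_all.txt
# under the override path instead of the default `~`.
os.environ["FX_NET_ACC_SPLITTER_TRACKER_MODE"] = "2"
os.environ["FX_NET_ACC_SPLITTER_TRACKER_DUMP_PATH"] = "/tmp/splitter_dumps"
```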
Test Plan: New unit test
Reviewed By: georgiaphillips
Differential Revision: D79203595
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159795
Approved by: https://github.com/ezyang
Fixes a few bugs introduced in CUDNN 1.11 which affect all our CUDA 13 builds. Also adds support for new CUDNN features whenever we choose to update. @eqy pretty sure this addresses the concern you had over the previous upgrade, since that bugfix is now merged. This is a simple header-only update.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162347
Approved by: https://github.com/eqy, https://github.com/atalman
F.one_hot(dtensor) used to run into a mixed DTensor-Tensor operation due
to an arange call creating a new Tensor (not DTensor). This PR fixes it
by allowing implicit replication of Tensors for the arange call and the
one consumer of the arange call (the at::eq call).
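A hedged sketch of the user-facing call this fixes; it assumes an initialized 2-rank process group, and the mesh/placement setup is illustrative only.
```python
import torch
import torch.nn.functional as F
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Replicate, distribute_tensor

mesh = init_device_mesh("cpu", (2,))  # assumes torch.distributed is set up for 2 ranks
labels = distribute_tensor(torch.tensor([0, 2, 1, 3]), mesh, [Replicate()])
# Previously hit a mixed DTensor/Tensor error from the internal arange call.
one_hot = F.one_hot(labels, num_classes=4)
```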
Test Plan:
- new test. Also, F.one_hot(num_classes=-1) is broken so we skip that.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162307
Approved by: https://github.com/ezyang
ghstack dependencies: #162117
LOAF previously could skip these fusion opportunities and cause some tests to fail.
Test:
- TORCHINDUCTOR_LOOP_ORDERING_AFTER_FUSION=1 python test/inductor/test_torchinductor_strided_blocks.py TritonBlockPointerTestGPU.test_2d_reduction_odd_shapes_view_size4_num_block_pointers_1_num_triton_kernels_1_reduction_op4_cuda
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162311
Approved by: https://github.com/jansel
Update PyTorch to the latest Triton release candidate branch (release/3.5.x in triton-lang/triton)
Notably:
* this does *not* include the version number bump from 3.4 -> 3.5 (we'll do that in a follow-up PR)
* sam_fast is still failing, so we've disabled it temporarily https://github.com/pytorch/pytorch/issues/162282 and we are committed to fixing it, ideally before the branch cut but possibly as a cherry-pick into the release branch.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162278
Approved by: https://github.com/atalman
ghstack dependencies: #162244, #162309
The original implementation set beta to 1, which causes the out tensor (C) to be added to the output. Thus, if the output is not initialized to zero beforehand, the output can be incorrect.
Removing the alpha and beta fixes the issue.
Thanks to @ngimel for figuring out the root cause.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162040
Approved by: https://github.com/danielvegamyhre
Fixes static cuda launcher after https://github.com/triton-lang/triton/pull/7866.
Static cuda launcher checks to make sure that no hook knobs are set (and if they are, it throws an error). But Triton has changed the semantics of hooks so that "empty hooks" are now represented by empty `HookChain`s instead of being represented by `None`. This PR changes the way we define "empty hooks" to account for HookChains.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162309
Approved by: https://github.com/aakhundov
ghstack dependencies: #162244
Follow-up to #161768.
Context: ProcessPool pickles the outputs before sending them back to the main process. Triton kernels have some un-pickleable fields, so `prepare_for_pickle()` is used to strip out those fields. Previously, in the standard case (without triton_bundler.py), `prepare_for_pickle()` would strip out the un-pickleable fields and they would never be added back after unpickling, because the un-pickleable fields were not actually needed after compilation finished.
In #161768 we updated `prepare_for_pickle` to also strip out the `fn._hash_lock` field, a newly added field in JITCallable instances which is a `threading.RLock()` and therefore not pickleable.
It turns out that we do need to restore the `fn._hash_lock` field, even in the non-triton_bundler case - the MultiKernel case uses the hash lock.
To do this, we add `restore_after_unpickle()`, which will restore the fields (or, if the old fields are not provided, initialize just the hash lock).
Compile-time benchmarks look good, with maybe a very minor regression (see the comment below on the PR).
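A minimal sketch of the restore step described above; the name `restore_after_unpickle` comes from this PR, but the signature and field handling here are assumptions, not the actual implementation.
```python
import threading

def restore_after_unpickle(kernel_fn, stripped_fields=None):
    # Put back any fields that prepare_for_pickle() stripped, if provided...
    if stripped_fields:
        for name, value in stripped_fields.items():
            setattr(kernel_fn, name, value)
    # ...and at minimum re-create the hash lock, which MultiKernel relies on.
    if getattr(kernel_fn, "_hash_lock", None) is None:
        kernel_fn._hash_lock = threading.RLock()
    return kernel_fn
```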
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162244
Approved by: https://github.com/atalman
This PR hooks up the python wrapper inductor backend to aot_compile. This is *not* the best way for us to grab the output of AOTAutograd; that involves a refactor to make AOTAutograd itself return a serializable callable. I'll do that refactor soon, but I want a basic interface to test with for now.
In the medium term, we'll want aot_compile to call AOTAutograd directly, instead of using the TorchInductorWrapper's callback through compile_fx.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162170
Approved by: https://github.com/zhxchen17
ghstack dependencies: #162169
The goal of this PR stack is to be able to implement `aot_compile_module`, which AOT precompiles a torch.nn.Module.
Step 1 is a simple refactor to make CompileArtifacts itself the callable, which makes it easier to use directly.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162169
Approved by: https://github.com/zhxchen17
## Summary
This PR improves typing in ONNX-related modules by replacing TypeVar bound to Callable[..., Any] with ParamSpec to preserve parameter types and avoid type erasure in decorator functions.
## Changes
- `torch/onnx/_internal/exporter/_flags.py`: Replace TCallable TypeVar with ParamSpec
- `torch/onnx/ops/_impl.py`: Replace _T TypeVar with ParamSpec for _onnx_op decorator
- `torch/onnx/_internal/exporter/_torchlib/_torchlib_registry.py`: Replace _T TypeVar with ParamSpec
## Motivation
The previous implementation used TypeVar bound to Callable which erased parameter type information to Any. ParamSpec preserves the exact parameter types and return types, providing better type safety and IDE support.
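A generic before/after sketch of the typing change (not the exact decorators from these files):
```python
from typing import Any, Callable, ParamSpec, TypeVar

P = ParamSpec("P")
R = TypeVar("R")

# Before: a TypeVar bound to Callable[..., Any] erases the decorated signature.
TCallable = TypeVar("TCallable", bound=Callable[..., Any])

# After: ParamSpec threads the exact parameter and return types through.
def with_flag(func: Callable[P, R]) -> Callable[P, R]:
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        return func(*args, **kwargs)
    return wrapper
```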
## Testing
- Verified all changes compile and import correctly
- Created comprehensive test suite to validate ParamSpec functionality
- No linting errors introduced
- Maintains backward compatibility
Fixes #142306
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162332
Approved by: https://github.com/Skylion007
This uses the same approach as building triton wheel where we publish a nightly wheel for vLLM whenever its pinned commit is updated. The key change is to use `pytorch/manylinux2_28-builder` as the base image to build vLLM, so there are a couple of changes on the vLLM Dockerfile used by lumen_cli
1. `pytorch/manylinux2_28-builder` is RedHat instead of Debian-based, so no apt-get
2. Fix a bug in `.github/actions/build-external-packages/action.yml` where `CUDA_VERSION` is not set correctly, preventing CUDA 12.9 build
3. Fix a bug in `.github/actions/build-external-packages/action.yml` where `TORCH_WHEELS_PATH` is not set correctly and always defaulted to `dist`
4. In vLLM Dockerfile, use the correct index for the selected CUDA version, i.e. https://download.pytorch.org/whl/nightly/cu12[89] for CUDA 12.[89]
5. Install torch, vision, audio in one command. Unlike the CI image `pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm`, `pytorch/manylinux2_28-builder` doesn't have any torch dependencies preinstalled
6. Bump xformers version to 0.0.32.post2 now that PyTorch 2.8.0 has landed on vLLM
We need to prepare 3 wheels for vLLM, xformers, and flashinfer-python. I rename them following the same convention as PyTorch nightlies (`MAJOR.MINOR.PATCH.devYYYYMMDD`) so that vLLM nightlies will work with torch nightlies from the same date.
### Usage
* Install latest nightlies
```
pip install --pre torch torchvision torchaudio vllm xformers flashinfer_python \
--index-url https://download.pytorch.org/whl/nightly/cu129
```
* Install a specific version
```
pip install --pre torch==2.9.0.dev20250903 torchvision torchaudio \
vllm==1.0.0.dev20250903 \
xformers==0.0.33.dev20250903 \
flashinfer_python==0.2.14.dev20250903 \
--index-url https://download.pytorch.org/whl/nightly/cu129
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162000
Approved by: https://github.com/atalman
Summary:
A demo for creating AOTI delegate for NativeRT in OSS.
- It supports full graph lowering only.
- It leverages `executorch_call_delegate` HOP but doesn't rely on `executorch`.
- The delegate graph is obtained by tracing a `LoweredBackendModule` whose forward function calls `executorch_call_delegate`.
- The main difference between `executorch_call_delegate` and `aoti_call_delegate` is that the delegate graph from `executorch_call_delegate` doesn't have weights lifted as inputs.
- original_ep and delegate_ep are treated as a flat EP dictionary and there is no nested structure.
- The naming contract is enforced by `model_name` and `backend_id`.
Test Plan:
CI
Rollback Plan:
Differential Revision: D81641157
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162285
Approved by: https://github.com/dolpm
I am unable to write a test that would fail here. The reason is that when we do `_dynamo.disable(fn)` in the compiled frame, the id of the disabled function changes, but currently we guard on the original function `fn`, whose id does not change. This PR guards on `fn.__code__` just to be more precise.
Thanks to @thenumberouscode for pointing this out.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162247
Approved by: https://github.com/StrongerXi, https://github.com/jansel
Summary:
If I have an EP that's exported on CPU and want to AOTI compile it for CUDA, I need to use `move_to_device_pass`.
But `torch._inductor.aoti_compile_and_package()` directly uses the `example_inputs` attached to the EP, so we should move the example inputs as well, if applicable.
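A hedged sketch of the workflow this targets; the file name is illustrative and `move_to_device_pass` is assumed to live in `torch.export.passes`.
```python
import torch
from torch._inductor import aoti_compile_and_package
from torch.export.passes import move_to_device_pass

ep = torch.export.load("model_exported_on_cpu.pt2")  # hypothetical artifact
ep = move_to_device_pass(ep, "cuda")                  # move the EP to CUDA
# With this PR, the EP's example_inputs follow the move before AOTI compilation.
aoti_compile_and_package(ep)
```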
Test Plan:
buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_move_device_example_inputs
Rollback Plan:
Differential Revision: D81812366
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162301
Approved by: https://github.com/angelayi
Skipping renaming causes wrong dependencies when mutations are involved.
Test:
CUDA_VISIBLE_DEVICES=4,5,6 TORCHINDUCTOR_LOOP_ORDERING_AFTER_FUSION=1 python test/distributed/test_compute_comm_reordering.py TestComputeCommReorderingMultiProc.test_reorder_compute_for_overlap
Both the all-reduce and wait-tensor IR nodes contain a MutationBuffer for this test.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162303
Approved by: https://github.com/eellison, https://github.com/jansel
ghstack dependencies: #162028, #162221
## Summary
- We just landed 2d-2d support for mxfp8 grouped gemm in FBGEMM: https://github.com/pytorch/FBGEMM/pull/4816
- This is needed for backward pass of mxfp8 MoE training with grouped gemms
- Changes:
- Add dispatching + input validation for mxfp8 grouped gemm in `torch._scaled_grouped_mm`
- Add meta registration input validation for mxfp8 grouped gemm, for composability with compile
- Add unit tests exercising torch._scaled_grouped_mm with mxfp8 inputs
- Bump FBGEMM third party submodule to include:
- https://github.com/pytorch/FBGEMM/pull/4816
- https://github.com/pytorch/FBGEMM/pull/4820
- https://github.com/pytorch/FBGEMM/pull/4821
- https://github.com/pytorch/FBGEMM/pull/4823
#### How fbgemm dependency was bumped
Documenting this since I haven't found it documented elsewhere:
- `cd ~/pytorch/third_party/fbgemm`
- `git fetch`
- `git checkout <hash>`
- `cd ~/pytorch`
- `git add third_party/fbgemm`
## Test plan
#### Test build
```
USE_FBGEMM_GENAI=1 python -m pip install --no-build-isolation -v -e .
...
Successfully installed torch-2.9.0a0+gitf5070f3
```
[full build log](https://www.internalfb.com/phabricator/paste/view/P1933787581)
#### Unit tests
```
pytest test/test_matmul_cuda.py -k test_mxfp8_scaled_grouped_mm_
...
test/test_matmul_cuda.py ......... [100%]
============================================================== 9 passed, 1668 deselected in 5.34s ===============================================================
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162209
Approved by: https://github.com/ngimel
# Feature
Currently, `torch._inductor.compile_aot` always uses the `WrapperFxCodegen` class. In contrast, Python and C++ codegen allow users to register custom backends. This PR brings that feature to FX codegen.
# Test plan
Added a CI test registering a custom FX backend.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162317
Approved by: https://github.com/jansel
When running the bazel build, we (Google) run into the following error.
The `-Wctad-maybe-unsupported` warning is promoted to an error and breaks the build in certain cases.
So we propose suppressing the warning to make the bazel build smoother.
This is the error message we got:
```
c10/util/IntrusiveList.h:166:12: error: 'std::reverse_iterator' may not intend to support class template argument deduction [-Werror,-Wctad-maybe-unsupported]
166 | return std::reverse_iterator{end()};
| ^
c10/test/util/IntrusiveList_test.cpp:24:18: note: in instantiation of member function 'c10::IntrusiveList<(anonymous namespace)::ListItem>::rbegin' requested here
24 | auto it = c1.rbegin();
| ^
c10/test/util/IntrusiveList_test.cpp:43:5: note: in instantiation of function template specialization '(anonymous namespace)::check_containers_equal<(anonymous namespace)::ListItem>' requested here
43 | check_containers_equal(l, v);
| ^
libcxx/include/__iterator/reverse_iterator.h:51:7: note: add a deduction guide to suppress this warning
51 | class reverse_iterator
| ^
1 error generated.
```
@haifeng-jin
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162223
Approved by: https://github.com/ezyang
Fix the `DeviceMesh._flatten` docstring usage example. An alternative fix would be to replace `mesh_3d["dp", "cp"]` with `mesh_3d["cp", "tp"]`.
(I verified the fix using the `gloo` backend)
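For reference, a hedged sketch of the docstring pattern being fixed (assumes an 8-rank gloo process group; dim names are illustrative):
```python
from torch.distributed.device_mesh import init_device_mesh

mesh_3d = init_device_mesh("cpu", (2, 2, 2), mesh_dim_names=("dp", "cp", "tp"))
# Flatten the two leading dims into a single "dp_cp" dim, as in the docstring.
dp_cp_mesh = mesh_3d["dp", "cp"]._flatten("dp_cp")
```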
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162277
Approved by: https://github.com/ezyang
`vmap(F.embedding)(DTensor, DTensor)` was failing because F.embedding's
batching rule generates a new tensor via at::arange, at::arange
generates a regular tensor, and DTensor rightfully errors on mixed
DTensor-regular Tensor operations.
This PR fixes the problem by activating DTensor implicit replication on
just the at::arange and the subsequent add operation.
In order to accomplish this I move the DTensor implicit replication flag
to C++ (most batching rules are in C++).
Test Plan:
- new test
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162117
Approved by: https://github.com/bdhirsh
- Enable communication of tensors with Complex datatype in ProcessGroupGloo, similar to how ProcessGroupNCCL handles it.
- Move a function, which checks if Complex datatype is supported by a reduce operation, from ProcessGroupNCCL.cpp into a new file to be shared with ProcessGroupGloo.
Fixes #156632
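A short sketch of what this enables (assumes a gloo process group has already been initialized):
```python
import torch
import torch.distributed as dist

t = torch.ones(4, dtype=torch.complex64)
dist.all_reduce(t, op=dist.ReduceOp.SUM)  # complex tensors now work over gloo
```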
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156633
Approved by: https://github.com/d4l3k
Adding a test that is closer to a real use case. Thanks @mlazos for fixing a few issues so this test works for most cases.
We still have to skip the AOTI and dynamic cases due to accuracy issues.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160782
Approved by: https://github.com/mlazos
# why
- gather everything up to make choices, without running
potentially expensive generators
- enables overrides where we toss the entire list of configs
from inductor, without having to enumerate it (expensive)
# what
- add a holding class that just gets all the components necessary
to generate a ChoiceCaller
- use that class to generate ChoiceCallers
- this does not (yet) add the override function, but just prepares
the scene
```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v
```
Differential Revision: [D81520569](https://our.internmc.facebook.com/intern/diff/D81520569)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161347
Approved by: https://github.com/eellison
ghstack dependencies: #162075, #161340, #161341, #161342, #161343, #161344, #161345, #161346
# why
- heuristics providers now decide whether to add choices (and which ones)
in the max-autotune case
- enables an eventual override point to gracefully fallback to the
standard behavior
# what
- max-autotune is determined inside V.choices.get_mm_configs.
Because it's mm-only right now, we can just do
`config.max_autotune or config.max_autotune_gemm`;
a TODO indicates that this can change in the future when this
expands to more templates
# testing
```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v
```
Differential Revision: [D81520573](https://our.internmc.facebook.com/intern/diff/D81520573)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161344
Approved by: https://github.com/jansel
ghstack dependencies: #162075, #161340, #161341, #161342, #161343
# why
- central point to analyze and override all generated choices
# what
- add a pseudo heuristic for aten that just yields a single, empty
kwargs
- add a pseudo heuristic with the bias_addmm logic for it
- add an addmm specific heuristic that yields a single choice, but
also expands it with alpha and beta kwargs
- replace all the aten.bind calls with V.choices.get_mm_configs
using the now matching API for aten
# testing
```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v
```
Differential Revision: [D81520580](https://our.internmc.facebook.com/intern/diff/D81520580)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161342
Approved by: https://github.com/jansel
ghstack dependencies: #162075, #161340, #161341
# why
- to have a central registry of templates/ExternKernelChoices and
match them to heuristics etc., they need unique names
- mm is both the triton template name and the aten_mm name
# what
- add a uid() to KernelTemplate/ExternKernelChoice that returns name
- override in ExternKernel to prepend "aten::"
- override in TritonTemplate to prepend "triton::"
This id is just used to find template heuristics, so it has no other
impact (see the sketch below).
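A minimal sketch of the uid scheme described above; the `*Sketch` class names are stand-ins, not the real inductor classes.
```python
class KernelTemplateSketch:
    def __init__(self, name: str):
        self.name = name

    def uid(self) -> str:
        return self.name

class ExternKernelChoiceSketch(KernelTemplateSketch):
    def uid(self) -> str:
        return f"aten::{self.name}"    # e.g. "aten::mm"

class TritonTemplateSketch(KernelTemplateSketch):
    def uid(self) -> str:
        return f"triton::{self.name}"  # e.g. "triton::mm"
```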
# testing
```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v
```
Differential Revision: [D81520579](https://our.internmc.facebook.com/intern/diff/D81520579)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161341
Approved by: https://github.com/jansel, https://github.com/eellison
ghstack dependencies: #162075, #161340
# why
- a step towards a unified interface for all choices, where any
adjustment to nodes (e.g. unsqueezing) happens as part of
choice specific preprocessing, behind a common point
# what
- move the unsqueeze logic for triton nodes for scaled_mm inside
the new hookup for adjusting the kernel inputs for template
heuristics
# testing
```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v -k "scale"
```
Differential Revision: [D81520582](https://our.internmc.facebook.com/intern/diff/D81520582)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161340
Approved by: https://github.com/jansel, https://github.com/eellison
ghstack dependencies: #162075
Summary:
The weight vector needs to be upcast since some FP8 formats (like Float8_e4m3fn) don't have CPU implementations in PyTorch. Reference: https://docs.pytorch.org/docs/stable/tensors.html#id13
We will use FP32 for the scale vector multiplication and convert to the target dtype.
Upcasting helps with the following:
1. **Full CPU support**: `float32` has complete CPU kernel implementations for all operations
2. **Numerical stability**: `float32` provides more precision during intermediate calculations
3. **Compatibility**: Works across all devices (CPU/GPU) and PyTorch versions
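A small sketch of the upcast-then-downcast pattern described above (tensor shapes are arbitrary):
```python
import torch

weight_fp8 = torch.randn(16).to(torch.float8_e4m3fn)
scale = torch.randn(16)
# Float8_e4m3fn lacks CPU kernels for the multiply, so compute in fp32
# and cast the result back to the target dtype.
scaled = (weight_fp8.to(torch.float32) * scale).to(torch.float8_e4m3fn)
```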
Test Plan:
UTs
Rollback Plan:
Differential Revision: D81711093
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162202
Approved by: https://github.com/wwwjn
Avoid merges from an extra PGO key if the same source has a different rank. This is unlikely to happen (it needs a code hash match and the source variable type to change), but we are being safe.
Differential Revision: D81299840
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162097
Approved by: https://github.com/bobrenjc93
Summary: When running coordinate descent tuning, the logging is difficult to parse if the results are parallelized at all. This change includes the kernel name in each step so post-processing can unify the results, even if run in parallel.
Test Plan:
NFC. Just a logging change.
Rollback Plan:
Differential Revision: D80942794
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161409
Approved by: https://github.com/PaulZhang12
Summary:
The binary that torch is running inside of can be larger than needed, and in certain
situations this can cause a loss of memory.
Test Plan:
We've manually run tests via
```
TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 TORCHINDUCTOR_WORKER_SUPPRESS_LOGGING=0
make mc8-train-publish-cint-datafm-toy -C
minimal_viable_ai/models/ifr_mtml/main_v1/ 2>&1 | tee ~/run_out
```
and overriding the binary used to be the built fbpkg in /packages.
We've also kicked off manual runs at
```
fire-feid-20250903-1051-ae8c6827
```
Which do show the binary running - https://fburl.com/scuba/procprint/e6lwv32m
Rollback Plan:
steps:
- jk.update:
jk: pytorch/compiler:subproc_worker_binary
constant_bool: null
consistent_pass_rate: null
fractional_host_rollout: null
sampling_rate: null
- manual.note:
content: ''
Differential Revision: D81616624
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162093
Approved by: https://github.com/masnesral
This is a reposting of PR #128519.
This change is important to how we maintain PyTorch at Google.
From the previous PR:
"
This will make the script more flexible for the directory where it is executed.
...
We plan to use the deprecated_yaml from a blaze genrule that invokes pyi.py. As input to pyi.py, the genrule requires the input file to be explicitly listed out. When we feed the value of tools/autograd/deprecated.yaml to the genrule, it failed to resolve since tools/autograd is a package from blaze's perspective. Any file under a blaze package needs a proper blaze target to be accessed.
"
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161772
Approved by: https://github.com/albanD
Co-authored-by: Haifeng Jin <haifeng-jin@users.noreply.github.com>
Summary:
Fix memory leak in AOTI when calling `aoti_torch_as_strided`
If you have something like `AtenTensorHandle buf_handle;` and you allocated memory to it, you have to make it a `RAIIAtenTensorHandle` to release the ownership. Otherwise you have leaked the memory, because even when the program ends, there's still a pointer pointing to the underlying storage of `buf_handle_restrided`, and the storage is never freed.
Test Plan:
```
buck run fbcode//mode/dev-nosan fbcode//caffe2/test/inductor:test_aot_inductor -- -r test_pad_non_zero_memory_leak
```
Also verified by looking at `print(f"Allocated memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")`
Differential Revision: D81640339
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162118
Approved by: https://github.com/angelayi
This should fix https://x.com/wightmanr/status/1953147089518772254?t=ng_R4t0-tRhO_qQE8NqOhw&s=19. Still working on adding a reasonable test.
You can see more of a description of the problem in the code comments. But the TLDR is that:
* When using DDPOptimizer, we partition the graph and compile several subgraphs. So 1 dynamo graphs becomes N AOT/inductor artifacts
* We have some existing logic to stash graph metadata (`fw_metadata`) in dynamo's TracingContext. When using DDPOptimizer, we generate one `fw_metadata` per **AOT** graph, and we stash it on the 1 TracingContext from dynamo. So we end up clobbering the `fw_metadata` for graph i-1 when AOT and inductor start compiling graph i
* This is normally ok, but it becomes a problem if inductor ever wants to read from this `fw_metadata` during **backward compilation**. Why? We (by default) compile the backwards lazily. So when using DDPOptimizer, we will compile backward graph N, then bw graph N-1, etc. But... at the time that we have started compiling bw graph N-1, its corresponding fw_metadata has already been clobbered! So we end up reusing graph N's metadata for all of our backward graph compilations. With donated buffer metadata, that means we end up donating and writing into incorrect input buffers
The fix that I added was to add more dedicated DDPOptimizer metadata into the TracingContext, so we can properly switch between these N different `fw_metadata` objects in the backward.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160745
Approved by: https://github.com/ezyang, https://github.com/zou3519
Summary:
X-link: https://github.com/pytorch/FBGEMM/pull/4775
Without this change, Arm64 OSS pytorch build with FBGEMM failed with the following error.
```
Undefined symbols for architecture arm64:
"fbgemm::FindMinMax(float const*, float*, float*, long long)", referenced from:
at::native::fbgemm_linear_int8_weight_fp32_activation(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::Scalar const&, c10::Scalar const&, at::Tensor const&) in QuantizedLinear.cpp.o
at::native::fbgemm_linear_quantize_weight(at::Tensor const&) in QuantizedLinear.cpp.o
PackedConvWeight<2>::apply_dynamic(at::Tensor const&, bool) in qconv_dynamic.cpp.o
PackedConvWeight<3>::apply_dynamic(at::Tensor const&, bool) in qconv_dynamic.cpp.o
at::Tensor PackedLinearWeight::apply_dynamic_impl<false>(at::Tensor, bool) in qlinear_dynamic.cpp.o
at::Tensor PackedLinearWeight::apply_dynamic_impl<true>(at::Tensor, bool) in qlinear_dynamic.cpp.o
ld: symbol(s) not found for architecture arm64
```
This change fixed the issue by moving FindMinMax's implementation from QuantUtilsAvx2.cc to QuantUtils.cc. FindMinMax is a platform-agnostic function with AVX2-specific optimizations so conceptually it can be put in QuantUtils.cc.
Test Plan:
With this change, Arm64 OSS pytorch built successfully with FBGEMM enabled.
Rollback Plan:
Reviewed By: q10
Differential Revision: D81052327
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161527
Approved by: https://github.com/q10
…h.is_complex.
The PR proposes adding a simple, self-explanatory example to the documentation page. The example demonstrates the function's output for tensors with various data types, showing both True and False return values.
Fixes #161859
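A sketch of the kind of example being added (the exact docs wording may differ):
```python
import torch

print(torch.is_complex(torch.tensor([1 + 2j], dtype=torch.complex64)))  # True
print(torch.is_complex(torch.tensor([1.0, 2.0], dtype=torch.float32)))  # False
```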
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161951
Approved by: https://github.com/zou3519
Fixes#161080
torch.export.export fails with `TypeError: expand() got an unexpected keyword argument 'implicit'` when calling torch.expand_copy(..., implicit=True). This happens because `expand_copy = _make_copy_from_view(aten.expand)` registers aten.expand as the decomposition path for aten.expand_copy, and that path doesn't accept the implicit argument.
I have added an explicit decomposition for aten.expand_copy in torch/_decomp/decompositions.py that ignores the implicit argument, and a simple unit test to demonstrate the bug being fixed.
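A hedged repro sketch of the failure mode described above (the module and shapes are illustrative):
```python
import torch

class M(torch.nn.Module):
    def forward(self, x):
        # expand_copy with implicit=True used to break the export decomposition
        return torch.expand_copy(x, (2, 3), implicit=True)

ep = torch.export.export(M(), (torch.randn(1, 3),))
```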
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161688
Approved by: https://github.com/angelayi, https://github.com/can-gaa-hou
Summary:
Enables `torch.float32` and `torch.float16` options in
`torch._grouped_mm`. Note that the fast path is only enabled if `mat_a`,
`mat_b`, and `out_dtype` are `torch.bfloat16`.
Saving for future PRs:
1. enabling testing on more platforms
2. supporting out_dtype != mat_a.dtype
3. opinfo
4. better compile support
Test Plan:
```bash
// on A100 and H100
pytest test/test_matmul_cuda.py -s -k test_grouped_gemm -x
// on H100
pytest test/test_matmul_cuda.py -s -k test_scaled_grouped_gemm -x
```
Reviewers:
Subscribers:
Tasks:
Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162059
Approved by: https://github.com/ngimel, https://github.com/eqy
ghstack dependencies: #161407, #161717
Summary:
Moves the `torch._grouped_mm` fallback from cuda-only code to a place
where it can be used by multiple backends. Specifically:
1. make the fallback path and util functions reusable and move them to
`ATen/native/GroupedMMUtils.h`
2. register a backend-agnostic kernel to composite explicit autograd key
3. refactor the grouped_mm tests to their own test case and enable CPU
At the end of this PR, here is the support matrix:
* CUDA SM90+: fast path with test coverage (no change)
* CUDA SM80+: fallback with test coverage (no change)
* CPU: fallback works, but without test coverage (new in this PR)
* other SM versions and other backends: will probably already work, but
let's leave this to future PRs
* float32/float16: will probably already work, but let's leave this to
future PRs
Test Plan:
```bash
pytest test/test_matmul_cuda.py -s -k test_grouped_gemm -x
```
Reviewers:
Subscribers:
Tasks:
Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161717
Approved by: https://github.com/ngimel, https://github.com/drisspg
ghstack dependencies: #161407
Summary:
Creates a fallback path for `torch._grouped_mm`, using the naive for
loop implementation (or bmm).
For the sake of keeping the PR small, this PR only enables SM80+ (CUDA
capability 8.0 and up), since I am testing this on an A100 machine. In
future PRs, we can increase the coverage of the fallback to:
1. float32 and float16, which will extend the GPU coverage
2. cpu
Test Plan:
```bash
pytest test/test_matmul_cuda.py -s -k test_grouped_gemm_2d_3d -x
pytest test/test_matmul_cuda.py -s -k test_grouped_gemm_3d_2d -x
pytest test/test_matmul_cuda.py -s -k test_grouped_gemm_2d_2d -x
pytest test/test_matmul_cuda.py -s -k test_grouped_gemm_3d_3d -x
```
Reviewers:
Subscribers:
Tasks:
Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161407
Approved by: https://github.com/drisspg, https://github.com/eqy
## Introduction
During CUDA Graph capture, the CUDA caching allocator currently defers reclaiming blocks until capture ends. This is because CUDA forbids querying events recorded during capture (the CUDA operation is not executed during the capture stage), so the allocator cannot use its normal event-based logic. However, capture records a DAG (we call it the **capturing graph**) of work. We can use the capturing graph to determine when a block's old lifetime is fully before future work, and safely reuse it within the same capture.
This PR adds an experimental flag `graph_capture_record_stream_reuse: True|False (default: False)`. When enabled, the allocator inserts lightweight free markers and uses capture ordering to decide if a freed block is safe to reuse during capture. If the proof cannot be established, we fall back to the existing post-capture path.
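A hedged sketch of opting in from Python; the key name comes from this PR, and `_set_allocator_settings` is a private API whose accepted keys may differ by build.
```python
import torch

# Experimental: allow in-capture block reuse as described below.
# Assumes a build that recognizes this allocator setting key.
torch.cuda.memory._set_allocator_settings("graph_capture_record_stream_reuse:True")
```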
## Terms
* **Free marker**: A capture-legal no-op (created with `cudaGraphAddEmptyNode`) inserted after the last captured use of the block on each stream that used it.
* **Terminal**: The set of the latest operations of the stream (or the capturing graph). Any newly captured op on that stream will attach after all nodes in this set. For a stream currently capturing, it is the set of nodes returned in `dependencies_out` by `cudaStreamGetCaptureInfo`.
## When can we reuse a block during capture?
### Strong Rule (Graph-Wide Safety)
This rule provides a universal guarantee that a block is safe for reuse by any stream in the graph.
> A block is safe to reuse if every free marker is a predecessor of every terminal of all active streams in the graph.
Why it's safe:
This rule establishes a strict global ordering. Since any new operation on any stream must be appended after that stream's terminals, this condition guarantees that the block's new lifetime begins only after its old lifetime has completely ended everywhere. This prevents lifetime overlaps when the graph is replayed, ensuring correctness.
### Per-stream Rule (A Practical Optimization)
The strong rule, while safe, is often unnecessarily restrictive. The `DeviceCachingAllocator` introduces a crucial constraint that allows for a simpler check.
In `DeviceCachingAllocator`, `get_free_block` only returns blocks whose `block->stream == p.stream()`. In other words, we never reuse a block on a stream different from the allocation stream. This means we don't need to verify safety across the entire graph. We only need to confirm that the block is safe to reuse from the perspective of its own allocation stream.
> Reuse a block for allocations on stream S if every free marker is a predecessor of every node in the terminal set of S.
In short, a block is considered **reusable** on stream S as long as all marker marking it "free" are guaranteed to complete before any new work that might need it on stream S begins.
## Implementation
* On `free(block)` during capture
* For each stream in `block->stream_uses` and the allocation stream, insert a free marker (empty node) and make it that stream’s tail.
* If we cannot place markers for all such streams (for example, a stream is not in capture), defer to the post-capture path.
* Otherwise, store the marker handles and keep the block in the capture-private structures.
* On `allocate(stream)` during capture (attempt per-stream reclaim)
* Query the allocation stream S’s terminal via `cudaStreamGetCaptureInfo`.
* For each deferred block, check whether it is allocated on this stream, and each of its free markers is a predecessor of the terminal.
* If yes, hand the block to S for immediate reuse within the same capture.
* If no, keep it deferred; it will be reconsidered as capture progresses and S’s terminal advances.
* On capture end
* Any still-deferred blocks follow the existing post-capture reclamation (event insertion/polling). External behavior remains unchanged if we cannot prove safety during capture.
## Examples (2 streams)
<img width="641" height="801" alt="pytorch-remove-cudagraph-defer-reclaiming (6)" src="https://github.com/user-attachments/assets/41adc835-d448-483b-99ba-b4341cb7d2a2" />
* Case 0 — Unsafe
The two frees are not ordered with respect to each other. For stream 1, the other stream’s free marker does not precede this stream’s terminal, so the per-stream condition fails.
Counterexample intuition for the unsafe setups: imagine `f2(x)` runs for a long time. If DeviceCachingAllocator reused block `x` on a stream whose terminal is not ordered after the free markers, the new lifetime could overlap the old one on replay, risking use-after-free or data corruption. The per-stream rule prevents exactly this.
* Case 1 — Reusable on stream 1
Stream 1’s terminal is after both frees, so every free marker precedes stream 1’s terminal. The block is reusable for allocations on stream 1.
* Case 2 — Not reusable on stream 2, but this cannot occur in `DeviceCachingAllocator`
This depicts reusing the block on stream 2 while stream 1’s free is not yet ordered before stream 2’s terminal. Though the block is not safe to reuse on stream 2, DeviceCachingAllocator will not choose that block for stream 2 anyway: `get_free_block` rejects blocks whose `stream != p.stream()`. So this case is unreachable.
* Case 3 — Safe (strong rule holds)
In this scenario, the terminal nodes of all streams are positioned after the block's free markers, satisfying the strong rule. This guarantees the block is safe for reuse by any stream in the capturing graph. However, since `DeviceCachingAllocator ` only reuses a block on its original allocation stream, verifying this strong condition is unnecessary. We only need to ensure the per-stream rule is met for the specific stream requesting the block.
* Case 4 — Freeing after a join
See the note below.
## Edge Case: Freeing after a join
Our current dependency tracking has a limitation in scenarios where a block is freed after a stream join, see @galv's [comments here](https://github.com/pytorch/pytorch/pull/158352#pullrequestreview-3112565198).
In case 4, we have a missed opportunity. Because the block's usage is not explicitly marked, we cannot determine that the block's actual last use may have occurred much earlier, long before the join. We must therefore wait for the subsequent join before the block can be reused.
## Thanks
Thanks to @galv for his great idea around graph parsing and empty nodes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158352
Approved by: https://github.com/ngimel, https://github.com/eqy
Co-authored-by: Jeff Daily <jeff.daily@amd.com>
This PR implements the semantics change to `torch._dynamo.error_on_graph_break`:
- ~`torch.compile` now has a new `error_on_graph_break` kwarg that serves as a lower-priority toggle for erroring/continuing on graph breaks~
- `error_on_graph_break` is a new internal `torch.compile` setting that is lower-priority than `fullgraph`. It allows the user to toggle erroring/continuing on graph breaks.
- `error_on_graph_break` does nothing when `fullgraph=True`
- `error_on_graph_break` does NOT guarantee a single graph
Followup [DONE]: need to change the programming model docs to reflect the 3 graph break modes for compilation:
- `fullgraph=True`: enforce one graph, no graph breaks, cannot be toggled
- `fullgraph=False, error_on_graph_break=True`: errors on graph breaks, latter can be toggled during compile time
- `fullgraph=False, error_on_graph_break=False`: resumes tracing on graph breaks, latter can be toggled during compile time
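A hedged sketch of the middle mode; the exact surface for toggling `error_on_graph_break` (context manager vs. decorator) is an assumption here, not confirmed by this PR text.
```python
import torch

@torch.compile(fullgraph=False)
def f(x):
    torch._dynamo.graph_break()  # would normally split the graph and continue
    return x + 1

# Assumed context-manager usage: with error_on_graph_break enabled, the same
# graph break errors instead of falling back for the rest of the frame.
with torch._dynamo.error_on_graph_break(True):
    f(torch.randn(4))
```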
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161747
Approved by: https://github.com/mlazos
ghstack dependencies: #161739
Update the torch-xpu-ops commit to [intel/torch-xpu-ops@83c5a5](83c5a5a551), which includes:
- Revert "Disable xccl timer avoid drlm hang" because the XPU time event issue has been fixed
- Fallback lu_factor kernel to CPU for single batch
- Enable aten::linalg_inv and aten::linalg_inv_ex on XPU
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162062
Approved by: https://github.com/EikanWang
`CMAKE_PREFIX_PATH` is a list of paths used to find dependencies. The test overwrites it with a single path, causing dependencies such as protobuf or Abseil not to be found.
Instead, prepend the path to the existing value.
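A minimal sketch of the prepend (assuming the path is passed through the environment; `extra_prefix` is a stand-in for the package path the test adds):
```python
import os

extra_prefix = "/path/to/aoti/package"  # stand-in for the path the test needs
existing = os.environ.get("CMAKE_PREFIX_PATH", "")
# prepend instead of overwriting, so protobuf, Abseil and other deps stay discoverable
os.environ["CMAKE_PREFIX_PATH"] = (
    extra_prefix if not existing else extra_prefix + os.pathsep + existing
)
```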
This fixes a test failure:
> pytorch-v2.7.1/test/inductor/test_aot_inductor_package.py", line 242, in test_compile_after_package
> self.assertTrue(so_path.exists())
> AssertionError: False is not true
Caused by:
```
/software/binutils/2.42-GCCcore-13.3.0/bin/ld: cannot find -labsl::utility: No such file or directory
/software/binutils/2.42-GCCcore-13.3.0/bin/ld: cannot find -labsl::variant: No such file or directory
collect2: error: ld returned 1 exit status
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161907
Approved by: https://github.com/Skylion007
Many users want a config to force all CUDA ops to be captured by cudagraph; when that is not possible, PT2 should error.
This PR adds `torch._inductor.triton.cudagraph_or_error` for that (defaulting to False). Also added an environment variable `TORCHINDUCTOR_CUDAGRAPH_OR_ERROR` to control it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161862
Approved by: https://github.com/ezyang, https://github.com/mlazos
On Zen 2 (AMD EPYC) and Intel Sapphire Rapids this fails with small differences when compiled with native-target optimizations, i.e. it fails with `-march=znver2` but succeeds with `-march=znver1`.
I assume some operator fusing is being used by GCC. Small differences like using `vmovdqa` can be seen in the minimized code of the baddbmm kernel: https://godbolt.org/z/jsxMa91Wb
The greatest differences are consistent and the same on both CPU architectures:
```
Greatest absolute difference: 3.43852152582258e-05 at index (1, 2, 1) (up to 1e-05 allowed)
Greatest relative difference: 3.6034286949870875e-06 at index (1, 2, 1) (up to 1.3e-06 allowed)
```
Hence I assume this is within the expected tolerances, especially as `complex128` and all other types pass.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/152424
Approved by: https://github.com/malfet
For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU. This PR works on some test files under test/distributed. We enable Intel GPU with the following methods, trying our best to keep the original code style:
- instantiate_device_type_tests()
- use "torch.accelerator.current_accelerator()" to determine the accelerator backend
- use requires_accelerator_dist_backend to allow both nccl and xccl tests
- enable XPU for some test paths
- change the hardcoded world_size according to device_count
- unify some common code under torch/testing/_internal for multiple backends, for example:
  add xpu to Backend.backend_capability and dist.Backend.register_backend()
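A small sketch of the device/backend selection pattern described above, using the public `torch.accelerator` API (the backend mapping is an illustration, not the exact helper used in the tests):
```python
import torch

# pick the generic accelerator when one is available, otherwise fall back to CPU
acc = torch.accelerator.current_accelerator() if torch.accelerator.is_available() else None
device_type = acc.type if acc is not None else "cpu"

# choose the matching distributed backend and world size for the device
backend = {"cuda": "nccl", "xpu": "xccl"}.get(device_type, "gloo")
world_size = torch.accelerator.device_count() if acc is not None else 1
```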
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159473
Approved by: https://github.com/guangyey, https://github.com/d4l3k
Summary:
When we have a user defined triton kernel, it marks the mutated outputs as `MutationOutput` with a NoneLayout. This MutationOutput may later be used as input to another inductor-generated triton kernel.
When we determine whether to use int32 or int64 for the Inductor-generated Triton kernel, we need to look at the number of elements for all buffers involved. If one of the buffers is a MutationOutput, we should still consider its number of elements instead of skipping it.
To get a hint on the MutationOutput size, we look at the buffers corresponding to `mutation_names` in MutationOutput.
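A hedged sketch of that decision (the helper below is illustrative, not Inductor's actual internals):
```python
INT32_MAX = 2**31 - 1

def use_int64_indexing(buffer_numels):
    # if any buffer involved in the kernel, including mutated outputs, can exceed
    # the int32 range, the generated Triton kernel must index with int64
    return any(numel > INT32_MAX for numel in buffer_numels)
```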
Test Plan:
```
buck run mode/opt fbcode//caffe2/test/inductor:test_aot_inductor -- -r test_autotune_int64_user_defined_triton_kernel
```
Differential Revision: D81530083
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162020
Approved by: https://github.com/davidberard98, https://github.com/eellison
Summary: This is a reland of D80285441, fixed the unit test.
Test Plan:
```
buck2 run mode/opt-amd-gpu -m rocm641 -c fbcode.split-dwarf=true -c fbcode.use_link_groups=true -c fbcode.enable_gpu_sections=true //hpc/new/models/feed/benchmark:feed_lower_benchmark -- --load=manifold://ads_storage_fblearner/tree/user/facebook/fblearner/predictor/894698382/0/gpu_lowering/new_input8 --skip-eager --skip-flop-estimation --sync-mode=0 --lower-backend=AOT_INDUCTOR
```
will succeed after this diff.
Rollback Plan:
Differential Revision: D80971224
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161521
Approved by: https://github.com/frank-wei
Print the amp target dtype so that custom backends can more easily find out the expected dtype during integration.
## Test Result
### Before
```python
In [1]: import torch
...: import torch_openreg
...:
...: a = torch.randn(3, 4)
...: b = torch.randn(4, 2)
...: with torch.autocast("openreg", dtype=torch.float16):
...: torch.mm(a, b)
...:
/home/coder/code/pytorch/torch/amp/autocast_mode.py:332: UserWarning: In openreg autocast, but the target dtype is not supported. Disabling autocast.
openreg Autocast only supports dtypes of torch.float32 currently.
warnings.warn(error_message
```
### After
```python
In [1]: import torch
...: import torch_openreg
...:
...: a = torch.randn(3, 4)
...: b = torch.randn(4, 2)
...: with torch.autocast("openreg", dtype=torch.float16):
...: torch.mm(a, b)
...:
/home/coder/code/pytorch/torch/amp/autocast_mode.py:332: UserWarning: In openreg autocast, but the target dtype torch.float16 is not supported. Disabling autocast.
openreg Autocast only supports dtypes of torch.float32 currently.
warnings.warn(error_message)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162037
Approved by: https://github.com/zou3519
In this PR, we port test/distributed/tensor test files for Intel GPU.
We enable Intel GPU with the following methods, trying our best to keep the original code style:
- Use torch.accelerator for general GPU support
- Skip the case if running on XPU, which has known issues
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161604
Approved by: https://github.com/guangyey, https://github.com/d4l3k
Previously, in gh-83069, the toDLPack converter introduced a normalization step that changes the strides to 1 when shape[i] == 1.
This step, however, calls as_strided during toDLPack and can slow toDLPack down by about 3x, raising PyTorch's DLPack conversion overhead to around 0.6 us per call from under 0.2 us.
This PR updates the logic by adding a need_normalize_strides check to first confirm whether stride normalization is necessary. In the most common case, when the tensor is contiguous, such normalization is not necessary.
We confirmed that this additional check recovers the speed of toDLPack to below 0.2 us and can significantly speed up eager-mode integration of DLPack with PyTorch.
If we detect that normalization is needed, the older path is invoked.
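The actual check lives in the C++ converter; a Python rendition of the condition, for illustration only:
```python
def need_normalize_strides(t):
    # normalization only matters when a size-1 dimension carries a stride != 1;
    # for contiguous tensors this is never the case, so as_strided is skipped
    return any(size == 1 and stride != 1 for size, stride in zip(t.shape, t.stride()))
```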
Fixes #162113
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162111
Approved by: https://github.com/msaroufim
## Summary
Adds a subgraph decomposition for addmm and mm that performs well when `K` is large compared to `M` and `N`, and functions well as an alternative to `split-k` on AMD (transposed only), since `split-k` does not currently support AMD.
## Background
On AMD (MI300x), for a matmul A * B, if B is non-contiguous, the resulting matmul is quite a bit slower.
For example:
```
args[0]: TensorBox(StorageBox(
InputBuffer(name='arg0_1', layout=FixedLayout('cuda:0', torch.float16, size=[1024, 178176], stride=[178176, 1]))
))
args[1]: TensorBox(StorageBox(
InputBuffer(name='arg1_1', layout=FixedLayout('cuda:0', torch.float16, size=[178176, 6144], stride=[1, 178176]))
))
```
is a lot slower than:
```
args[0]: TensorBox(StorageBox(
InputBuffer(name='arg0_1', layout=FixedLayout('cuda:0', torch.float16, size=[1024, 178176], stride=[178176, 1]))
))
args[1]: TensorBox(StorageBox(
InputBuffer(name='arg1_1', layout=FixedLayout('cuda:0', torch.float16, size=[178176, 6144], stride=[6144, 1]))
))
```
This PR adds a subgraph decomposition to test out whether making B contiguous is faster than just using the normal kernels.
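Conceptually, the decomposition autotuned against the regular kernels is just (a sketch, not the Inductor implementation):
```python
import torch

def contiguous_addmm(bias, a, b):
    # materialize B contiguously first, then run the ordinary addmm
    return torch.addmm(bias, a, b.contiguous())
```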
## Data
I ran this on unique non-contiguous shapes from torchbench/huggingface and got these speedups:
```
Parsed 420 unique shapes from benchmark output
addmm improvements when best:
addmm_16448x512x2048: +0.14%
addmm_128x2048x2048: +0.01%
addmm_128x768x1000: +0.75%
addmm_12672x3072x768: +1.08%
addmm_512x768x32000: +0.62%
addmm_12608x384x384: +0.00%
addmm_4160x1024x4096: +0.90%
addmm_16x768x2: +0.56%
addmm_12608x3072x768: +0.09%
addmm_64x4096x1000: +2.77%
addmm_256x1024x512: +1.99%
addmm_30x256x256: +1.12%
addmm_100480x128x384: +0.91%
addmm_6400x2048x512: +0.25%
addmm_61568x1024x256: +0.08%
addmm_1x768x768: +0.93%
addmm_12544x384x384: +0.19%
addmm_128x512x1000: +0.77%
addmm_2048x128x128: +1.32%
addmm_128x3072x1000: +0.24%
addmm_7936x512x2048: +0.07%
addmm_8192x512x2048: +0.33%
addmm_64x1024x1000: +1.43%
addmm_128x2304x1000: +0.01%
addmm_32768x256x2: +0.75%
addmm_64x384x1152: +0.79%
addmm_64x640x1000: +0.01%
addmm_100480x128x128: +0.87%
addmm_1152x3072x768: +1.13%
addmm_8192x256x2048: +1.40%
addmm_4096x128x768: +0.01%
addmm_128x2560x1000: +0.01%
addmm_12544x2048x512: +0.43%
addmm_200704x24x96: +0.14%
addmm_8448x512x2048: +0.96%
addmm_50176x256x1024: +0.62%
addmm_4160x4096x1024: +0.22%
addmm_4096x768x768: +0.32%
addmm_220x2048x512: +0.56%
addmm_8x2048x1000: +1.12%
addmm_256x197951x512: +26.99%
addmm_401536x64x192: +0.60%
addmm_2040x2048x512: +0.47%
addmm_512x1024x256: +1.32%
addmm_128x4096x1000: +1.67%
addmm_12672x768x768: +0.34%
addmm_128x368x1000: +0.77%
addmm_96x1280x1000: +0.01%
addmm_12544x512x2048: +0.41%
addmm_6272x320x1280: +0.76%
addmm_12544x3072x768: +0.09%
addmm_64x384x1000: +0.39%
mm improvements when best:
mm_200704x128x512: +1.29%
mm_663552x16x16: +0.80%
mm_4096x768x768: +0.51%
mm_131072x64x31: +0.24%
mm_12544x1152x384: +0.11%
mm_128x2048x2: +0.46%
mm_262144x16x23: +0.62%
mm_50176x576x192: +0.37%
mm_131072x16x31: +0.26%
================================================================================
BENCHMARK ANALYSIS RESULTS
================================================================================
Operation: addmm
----------------------------------------
Total shapes analyzed: 247
Average Subgraph placement: 3.38
Median Subgraph placement: 2.0
Subgraph is best choice: 52/247 shapes (21.1%)
Average improvement when best: 1.15%
Median improvement when best: 0.58%
Largest improvement when best: +26.99%
Operation: bmm
----------------------------------------
Total shapes analyzed: 85
Average Subgraph placement: 24.00
Median Subgraph placement: 21.0
Subgraph is best choice: 0/85 shapes (0.0%)
Average improvement when best: N/A (never best)
Median improvement when best: N/A (never best)
Largest improvement when best: N/A (never best)
Operation: mm
----------------------------------------
Total shapes analyzed: 88
Average Subgraph placement: 15.08
Median Subgraph placement: 4.0
Subgraph is best choice: 9/88 shapes (10.2%)
Average improvement when best: 0.52%
Median improvement when best: 0.46%
Largest improvement when best: +1.29%
```
## Results
The largest shape gain, `256,197951,512`, seemed to be driven by a case where the extern kernel is way faster than the best triton configs on the recursive autotune:
```
addmm,Extern,extern_kernels.addmm,256,197951,512,0.38024500012397766
addmm,Triton,256,197951,512,32,256,16,2,2,4,2.005444049835205
addmm,Triton,256,197951,512,32,128,32,2,4,8,2.04189395904541
addmm,Triton,256,197951,512,64,128,16,2,4,8,2.1911399364471436
addmm,Triton,256,197951,512,64,128,32,2,4,8,2.496040105819702
addmm,Triton,256,197951,512,64,128,64,2,8,16,2.9306790828704834
addmm,Triton,256,197951,512,64,64,32,2,4,8,3.0347819328308105
...
```
Compared to the non-transposed autotune:
```
addmm,Subgraph,contiguous_addmm_1384,256,197951,512,0.5024129748344421
addmm,Extern,extern_kernels.addmm,256,197951,512,0.6881489753723145
addmm,Triton,256,197951,512,32,256,16,2,2,4,2.5115010738372803
addmm,Triton,256,197951,512,32,128,32,2,4,8,2.5167479515075684
addmm,Triton,256,197951,512,64,128,16,2,4,8,2.9507460594177246
addmm,Triton,256,197951,512,64,256,64,2,8,4,2.9673290252685547
addmm,Triton,256,197951,512,64,128,64,2,8,16,3.3906331062316895
addmm,Triton,256,197951,512,64,128,32,2,4,8,3.496859073638916
```
It seems to perform really well for high values of `K` vs `N` and `M`.
Testing this hypothesis with some custom shapes:
```
Parsed 64 unique shapes from benchmark output
addmm improvements when best:
addmm_128x16384x128: +0.18%
addmm_128x262144x256: +38.24%
addmm_128x200000x512: +14.76%
addmm_256x800000x128: +0.06%
addmm_131072x128x256: +0.27%
addmm_128x256x131072: +0.25%
addmm_2048x200000x64: +12.45%
mm improvements when best:
mm_128x16384x128: +0.18%
mm_128x262144x256: +38.05%
mm_128x200000x512: +9.47%
mm_256x800000x128: +0.99%
mm_512x6400000x256: +3.17%
mm_524288x64x64: +0.29%
mm_2048x200000x64: +11.19%
mm_8192x1000000x256: +34.14%
mm_128x4096x100000: +0.40%
mm_128x3072x150000: +0.27%
================================================================================
BENCHMARK ANALYSIS RESULTS
================================================================================
Operation: addmm
----------------------------------------
Total shapes analyzed: 33
Average Subgraph placement: 4.39
Median Subgraph placement: 2.0
Subgraph is best choice: 7/33 shapes (21.2%)
Average improvement when best: 9.46%
Median improvement when best: 0.27%
Largest improvement when best: +38.24%
Operation: mm
----------------------------------------
Total shapes analyzed: 30
Average Subgraph placement: 7.63
Median Subgraph placement: 2.0
Subgraph is best choice: 10/30 shapes (33.3%)
Average improvement when best: 9.81%
Median improvement when best: 2.08%
Largest improvement when best: +38.05%
```
## Conclusion
Contiguous subgraph decomposition seems worthwhile for `mm` and `addmm`, but not `bmm`, and has a very large improvement on low-`M`, low-`N`, high-`K` shapes.
Data gathering scripts:
https://gist.github.com/exclamaforte/4a896c064d301b27bf5ca0a4f8fc3866
## Test Plan:
New unit tests.
Differential Revision: D80771648
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161241
Approved by: https://github.com/eellison
This is only user outputs, which is what we want. Spoke to @zhxchen17 though, and it seems like nativeRT might have some bugs propagating updates to things like input mutation or buffer mutation. Something to take a look at in a follow-up.
Also I have no idea where the nativeRT tests are. Any pointers @zhxchen17 @SherlockNoMad
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161723
Approved by: https://github.com/zhxchen17
Renaming `set_fullgraph` to `error_on_graph_break` for now. There are no semantic differences yet. In a followup PR, we will introduce a new `torch.compile` option `error_on_graph_break` that has lower priority than `fullgraph` so that `fullgraph` really returns 1 graph.
I could keep `set_fullgraph` as a deprecated alias for `error_on_graph_break` for now, but I'm hoping that won't be necessary since it's still private API (there are no internal callsites yet, and there are no significant OSS callsites yet).
cc @albanD @voznesenskym @penguinwu @EikanWang @jgong5 @Guobing-Chen @XiaobingSuper @zhuhaozhe @blzheng @wenzhe-nrv @jiayisunx @chenyang78 @kadeng @chauhang @amjames @Lucaskabela @mlazos @guilhermeleobas @xmfan as primary users for `set_fullgraph`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161739
Approved by: https://github.com/xmfan, https://github.com/Lucaskabela, https://github.com/anijain2305, https://github.com/mlazos
This PR introduces the QuantizedHuggingFaceReader component, which enables reading and dequantizing the quantized tensors in a SafeTensors checkpoint. The following capabilities are introduced:
- Configuration of the target DType and the block size.
- Multi-threaded dequantization for efficiency.
Test Plan:
buck test //caffe2/test/distributed/checkpoint\:test_quantized_hf_storage
```
Time elapsed: 2:34.1s
Tests finished: Pass 31. Fail 0. Fatal 0. Skip 0. Build failure 0
```
Differential Revision: D80174674
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160682
Approved by: https://github.com/ankitageorge
Summary:
[reland]
Since `allow_complex_guards_as_runtime_asserts` is now sync'd with `prefer_deferred_runtime_asserts_over_guards`, we can kill the former (especially since it was an export-only concept).
Test Plan:
updated tests
Rollback Plan:
Differential Revision: D81334984
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161794
Approved by: https://github.com/zhxchen17
pytest test/dynamo/test_aot_compile.py -k test_aot_compile_graph_break_error_fmt
before
```
Traceback (most recent call last):
File "/data/users/$USER/vllm-tests/graph-break.py", line 15, in <module>
aot_compiled_fn = compiled.aot_compile((example_inputs, {}))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/users/$USER/pytorch/torch/_dynamo/eval_frame.py", line 717, in aot_compile
return aot_compile_fullgraph(
^^^^^^^^^^^^^^^^^^^^^^
File "/data/users/$USER/pytorch/torch/_dynamo/aot_compile.py", line 132, in aot_compile_fullgraph
capture_output = convert_frame.fullgraph_capture(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/users/$USER/pytorch/torch/_dynamo/convert_frame.py", line 947, in fullgraph_capture
dynamo_output = compile_frame(
^^^^^^^^^^^^^^
File "/data/users/$USER/pytorch/torch/_dynamo/convert_frame.py", line 1020, in compile_frame
bytecode, tracer_output = transform_code_object(code, transform)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/users/$USER/pytorch/torch/_dynamo/bytecode_transformation.py", line 1592, in transform_code_object
tracer_output = transformations(instructions, code_options)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/users/$USER/pytorch/torch/_dynamo/convert_frame.py", line 992, in transform
tracer_output = trace_frame(
^^^^^^^^^^^^
File "/data/users/$USER/pytorch/torch/_dynamo/convert_frame.py", line 312, in _fn
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/data/users/$USER/pytorch/torch/_dynamo/convert_frame.py", line 821, in trace_frame
run_tracer()
File "/data/users/$USER/pytorch/torch/_dynamo/convert_frame.py", line 803, in run_tracer
tracer.run()
File "/data/users/$USER/pytorch/torch/_dynamo/symbolic_convert.py", line 1472, in run
while self.step():
^^^^^^^^^^^
File "/data/users/$USER/pytorch/torch/_dynamo/symbolic_convert.py", line 1342, in step
self.dispatch_table[inst.opcode](self, inst)
File "/data/users/$USER/pytorch/torch/_dynamo/symbolic_convert.py", line 902, in wrapper
return inner_fn(self, inst)
^^^^^^^^^^^^^^^^^^^^
File "/data/users/$USER/pytorch/torch/_dynamo/symbolic_convert.py", line 3364, in CALL
self._call(inst)
File "/data/users/$USER/pytorch/torch/_dynamo/symbolic_convert.py", line 3358, in _call
self.call_function(fn, args, kwargs)
File "/data/users/$USER/pytorch/torch/_dynamo/symbolic_convert.py", line 1260, in call_function
self.push(fn.call_function(self, args, kwargs)) # type: ignore[arg-type]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/users/$USER/pytorch/torch/_dynamo/variables/lazy.py", line 212, in realize_and_forward
return getattr(self.realize(), name)(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/users/$USER/pytorch/torch/_dynamo/variables/functions.py", line 1513, in call_function
unimplemented_v2(
File "/data/users/$USER/pytorch/torch/_dynamo/exc.py", line 596, in unimplemented_v2
raise Unsupported(msg)
torch._dynamo.exc.Unsupported: Call to `torch._dynamo.graph_break()`
Explanation: User-inserted graph break. Message: None
Hint: Remove the `torch._dynamo.graph_break()` call.
Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}`
For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0025.html
```
after
```
Traceback (most recent call last):
File "/data/users/$USER/vllm-tests/graph-break.py", line 15, in <module>
aot_compiled_fn = compiled.aot_compile((example_inputs, {}))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/users/$USER/pytorch/torch/_dynamo/eval_frame.py", line 737, in aot_compile
raise e.with_traceback(None) from e.__cause__ # User compiler error
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch._dynamo.exc.Unsupported: Call to `torch._dynamo.graph_break()`
Explanation: User-inserted graph break. Message: None
Hint: Remove the `torch._dynamo.graph_break()` call.
Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}`
For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0025.html
from user code:
File "/data/users/$USER/vllm-tests/graph-break.py", line 5, in foo
torch._dynamo.graph_break()
Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
```
consistent w/ std torch.compile
```
Traceback (most recent call last):
File "/data/users/$USER/vllm-tests/graph-break.py", line 16, in <module>
res = compiled(*example_inputs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/users/$USER/pytorch/torch/_dynamo/eval_frame.py", line 850, in compile_wrapper
raise e.with_traceback(None) from e.__cause__ # User compiler error
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch._dynamo.exc.Unsupported: Call to `torch._dynamo.graph_break()`
Explanation: User-inserted graph break. Message: None
Hint: Remove the `torch._dynamo.graph_break()` call.
Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}`
For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0025.html
from user code:
File "/data/users/$USER/vllm-tests/graph-break.py", line 5, in foo
torch._dynamo.graph_break()
Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162005
Approved by: https://github.com/zhxchen17, https://github.com/tugsbayasgalan
Reason:
ROCm binary builds should not block the viable/strict upgrade. They are queuing/canceled, so viable/strict is 1.2 days old.
Tested by mangling the workflow file to get to the actual call of the python script `python ../test-infra/tools/scripts/fetch_latest_green_commit.py --required-checks '["pull", "trunk", "lint", "^linux-binary-manywheel$", "^linux-binary-libtorch-release$", "linux-aarch64"]' --viable-strict-branch viable/strict --main-branch master`, which I then ran locally where I have credentials. It returned d64718503728001a1e78168fd7f2d4ff23e57285 which is green. Without this change, it returns 5e5870e858f60ff4bf87d03f3592097e934a9580, which is pretty old
The other solution would have been to mark it as unstable I think
Side note: why is it master, and how is it working like that?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162100
Approved by: https://github.com/huydhn
Notable new features/optimizations for SDPA operators on AMD systems from AOTriton 0.11b:
* Invoke AITER Assembly kernels on gfx942/gfx950 when inputs meet requirements
- AITER ASM kernels deliver over 500TFLOPS training performance. See
[AOTriton 0.11b Release Page](https://github.com/ROCm/aotriton/releases/tag/0.11b) for more
details.
* Now returns a natural-base `logsumexp` tensor, matching CUDA's behavior
- PR #156903 is reverted in this PR as well since it is not needed anymore.
* Enables `CausalVariant.LOWER_RIGHT`
The build system changes drastically along with new packaging scheme of
AOTriton 0.11
* AOTriton 0.11 packs GPU images separately from AOTriton runtime
* `aotriton.cmake` now selectively downloads image packs according to
`PYTORCH_ROCM_ARCH`
* `aotriton.cmake` now only uses a pre-compiled runtime library that exactly
matches the ROCm version in the build environment. For PyTorch builds with ROCm
versions not listed in the file, the build process will build the AOTriton
runtime from source without GPU images
- This avoids any further ABI breaks like ROCM 6.4 -> 7.0
- recursive git clone is disabled since building AOTriton runtime does not
require submodules.
Bug fixes:
* Fix a kernel bug introduced when implementing SWA
Known Problems:
* gfx1100 target (Radeon RX 7000 Series) is moved back to experimental status
due to accuracy issues. Triton compiler fixes are needed to restore the
support status.
* Enabling TF32 tests affects accuracy for later non-TF32 tests on ROCM 7.0.
This issue is under investigation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161754
Approved by: https://github.com/jithunnair-amd, https://github.com/jeffdaily
Summary:
The saving (serialization) part of PT2 archive weight refactoring.
The loading (deserialization part) has been landed in D80035490
Test Plan:
CI
Rollback Plan:
Differential Revision: D80970931
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161520
Approved by: https://github.com/SherlockNoMad
**Summary:** Previously, we called `assert_and_get_unique_device` once per node in both prepare and convert. This is expensive and unnecessary since the model device is the same across all nodes, so we should just call this once at the beginning and reuse the same model device across all the nodes.
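A sketch of the change for a GraphModule (assuming the helper lives in `torch.ao.quantization.utils`; the per-node work is a stand-in):
```python
from torch.ao.quantization.utils import assert_and_get_unique_device

def process_nodes(model, handle_node):
    # compute the model device once up front instead of once per node
    device = assert_and_get_unique_device(model)
    for node in model.graph.nodes:
        handle_node(node, device)  # stand-in for the real prepare/convert logic
```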
**Test Plan:**
python test/test_quantization.py -k TestQuantizePT2E
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159901
Approved by: https://github.com/jerryzh168
Summary:
Our strategy for detecting fake tensor leakage in non-strict for outside scope (side effects happening outside of model.forward) is:
1. We do gc.collect() before export and get the alive fake tensors
2. We dump the proxy to fake tensor map from make_fx tracer
3. We query gc again to get alive fake tensors
4. We take the delta between (1) and (3)
5. Filter out fake tensors that are:
1. Associated with `TrackedFake` (input tracking thing in symbolic_shapes)
2. Associated with `gm.meta`
6. Do an ID match with the proxies and emit their stack traces.
We rely on (https://github.com/pytorch/pytorch/pull/159923) for other sources of leakages such as:
1. We failed to proxy an operator (like param.data)
2. We cache some tensor in model.forward (https://github.com/pytorch/pytorch/issues/155114)
In general, we notice that `gc.collect()` and querying gc for live objects are quite slow, so we turn on this feature only under an env variable. We should document in the export public-facing docs that if users run into weird errors regarding fake tensors, they should look into turning on this env variable for further analysis.
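A rough sketch of the delta computation in steps (1)-(4) above (the filtering from step (5) is omitted):
```python
import gc
from torch._subclasses.fake_tensor import FakeTensor

def alive_fake_tensor_ids():
    gc.collect()
    return {id(obj) for obj in gc.get_objects() if isinstance(obj, FakeTensor)}

before = alive_fake_tensor_ids()
# ... run export here ...
leaked = alive_fake_tensor_ids() - before  # candidates to ID-match against proxies
```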
Test Plan:
Test plan
Rollback Plan:
Differential Revision: D80003204
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160456
Approved by: https://github.com/pianpwk
### Motivation
* MI250 Cirrascale runners are currently having network timeouts, leading to a huge queue of binary smoke test jobs:
<img width="483" height="133" alt="image" src="https://github.com/user-attachments/assets/17293002-78ad-4fc9-954f-ddd518bf0a43" />
* MI210 Hollywood runners (with runner names such as `pytorch-rocm-hw-*`) are not suitable for these jobs, because they seem to take much longer to download artifacts: https://github.com/pytorch/pytorch/pull/153287#issuecomment-2918420345 (this is why these jobs were specifically targeting Cirrascale runners). However, it doesn't seem like Cirrascale runners are necessarily doing much better either e.g. [this recent build](https://github.com/pytorch/pytorch/actions/runs/17332256791/job/49231006755).
* Moving to MI325 runners should address the stability part at least, while also reducing load on limited MI2xx runner capacity.
* However, I'm not sure if the MI325 runners will do any better on the artifact download part (this may need to be investigated more) cc @amdfaa
* Also removing `ciflow/binaries` and `ciflow/binaries_wheel` label/tag triggers for `generated-linux-binary-manywheel-rocm-main.yml` because we already trigger ROCm binary build/test jobs via these labels/tags in `generated-linux-binary-manywheel-nightly.yml`. And for developers who want to trigger ROCm binary build/test jobs on their PRs, they can use the `ciflow/rocm-mi300` label/tag as per this PR.
### TODOs (cc @amdfaa):
* Check that the workflow runs successfully on the MI325 runners in this PR. Note how long the test jobs take esp. the "Download Build Artifacts" step
* Once this PR is merged, clear the queue of jobs targeting `linux.rocm.gpu.mi250`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162044
Approved by: https://github.com/jeffdaily
Co-authored-by: Jeff Daily <jeff.daily@amd.com>
# why
- some templates e.g. scale_mm need to unsqueeze/squeeze the nodes
for codegen and heuristics
- unified place where we can just adjust them for the template
# what
- inside get_mm_configs, return not the passed in kernel inputs,
but allow the template heuristic to adjust them if necessary
- the default implementation right now just passes them back
this diff just adds the functionality, but does not exercise it
other than the default (passthrough)
# testing
```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v
```
Differential Revision: [D81520572](https://our.internmc.facebook.com/intern/diff/D81520572)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161339
Approved by: https://github.com/eellison, https://github.com/jansel
ghstack dependencies: #161123, #161124, #161125, #161126, #161336, #161338
## Summary
Adds a subgraph decomposition for addmm and mm that performs well when `K` is large compared to `M` and `N`, and functions well as an alternative to `split-k` on AMD (transposed only), since `split-k` does not currently support AMD.
## Background
On AMD (MI300x), for a matmul A * B, if B is non-contiguous, the resulting matmul is quite a bit slower.
For example:
```
args[0]: TensorBox(StorageBox(
InputBuffer(name='arg0_1', layout=FixedLayout('cuda:0', torch.float16, size=[1024, 178176], stride=[178176, 1]))
))
args[1]: TensorBox(StorageBox(
InputBuffer(name='arg1_1', layout=FixedLayout('cuda:0', torch.float16, size=[178176, 6144], stride=[1, 178176]))
))
```
is a lot slower than:
```
args[0]: TensorBox(StorageBox(
InputBuffer(name='arg0_1', layout=FixedLayout('cuda:0', torch.float16, size=[1024, 178176], stride=[178176, 1]))
))
args[1]: TensorBox(StorageBox(
InputBuffer(name='arg1_1', layout=FixedLayout('cuda:0', torch.float16, size=[178176, 6144], stride=[6144, 1]))
))
```
This PR adds a subgraph decomposition to test out whether making B contiguous is faster than just using the normal kernels.
## Data
I ran this on unique non-contiguous shapes from torchbench/huggingface and got these speedups:
```
Parsed 420 unique shapes from benchmark output
addmm improvements when best:
addmm_16448x512x2048: +0.14%
addmm_128x2048x2048: +0.01%
addmm_128x768x1000: +0.75%
addmm_12672x3072x768: +1.08%
addmm_512x768x32000: +0.62%
addmm_12608x384x384: +0.00%
addmm_4160x1024x4096: +0.90%
addmm_16x768x2: +0.56%
addmm_12608x3072x768: +0.09%
addmm_64x4096x1000: +2.77%
addmm_256x1024x512: +1.99%
addmm_30x256x256: +1.12%
addmm_100480x128x384: +0.91%
addmm_6400x2048x512: +0.25%
addmm_61568x1024x256: +0.08%
addmm_1x768x768: +0.93%
addmm_12544x384x384: +0.19%
addmm_128x512x1000: +0.77%
addmm_2048x128x128: +1.32%
addmm_128x3072x1000: +0.24%
addmm_7936x512x2048: +0.07%
addmm_8192x512x2048: +0.33%
addmm_64x1024x1000: +1.43%
addmm_128x2304x1000: +0.01%
addmm_32768x256x2: +0.75%
addmm_64x384x1152: +0.79%
addmm_64x640x1000: +0.01%
addmm_100480x128x128: +0.87%
addmm_1152x3072x768: +1.13%
addmm_8192x256x2048: +1.40%
addmm_4096x128x768: +0.01%
addmm_128x2560x1000: +0.01%
addmm_12544x2048x512: +0.43%
addmm_200704x24x96: +0.14%
addmm_8448x512x2048: +0.96%
addmm_50176x256x1024: +0.62%
addmm_4160x4096x1024: +0.22%
addmm_4096x768x768: +0.32%
addmm_220x2048x512: +0.56%
addmm_8x2048x1000: +1.12%
addmm_256x197951x512: +26.99%
addmm_401536x64x192: +0.60%
addmm_2040x2048x512: +0.47%
addmm_512x1024x256: +1.32%
addmm_128x4096x1000: +1.67%
addmm_12672x768x768: +0.34%
addmm_128x368x1000: +0.77%
addmm_96x1280x1000: +0.01%
addmm_12544x512x2048: +0.41%
addmm_6272x320x1280: +0.76%
addmm_12544x3072x768: +0.09%
addmm_64x384x1000: +0.39%
mm improvements when best:
mm_200704x128x512: +1.29%
mm_663552x16x16: +0.80%
mm_4096x768x768: +0.51%
mm_131072x64x31: +0.24%
mm_12544x1152x384: +0.11%
mm_128x2048x2: +0.46%
mm_262144x16x23: +0.62%
mm_50176x576x192: +0.37%
mm_131072x16x31: +0.26%
================================================================================
BENCHMARK ANALYSIS RESULTS
================================================================================
Operation: addmm
----------------------------------------
Total shapes analyzed: 247
Average Subgraph placement: 3.38
Median Subgraph placement: 2.0
Subgraph is best choice: 52/247 shapes (21.1%)
Average improvement when best: 1.15%
Median improvement when best: 0.58%
Largest improvement when best: +26.99%
Operation: bmm
----------------------------------------
Total shapes analyzed: 85
Average Subgraph placement: 24.00
Median Subgraph placement: 21.0
Subgraph is best choice: 0/85 shapes (0.0%)
Average improvement when best: N/A (never best)
Median improvement when best: N/A (never best)
Largest improvement when best: N/A (never best)
Operation: mm
----------------------------------------
Total shapes analyzed: 88
Average Subgraph placement: 15.08
Median Subgraph placement: 4.0
Subgraph is best choice: 9/88 shapes (10.2%)
Average improvement when best: 0.52%
Median improvement when best: 0.46%
Largest improvement when best: +1.29%
```
## Results
The largest shape gain, `256,197951,512`, seemed to be driven by a case where the extern kernel is way faster than the best triton configs on the recursive autotune:
```
addmm,Extern,extern_kernels.addmm,256,197951,512,0.38024500012397766
addmm,Triton,256,197951,512,32,256,16,2,2,4,2.005444049835205
addmm,Triton,256,197951,512,32,128,32,2,4,8,2.04189395904541
addmm,Triton,256,197951,512,64,128,16,2,4,8,2.1911399364471436
addmm,Triton,256,197951,512,64,128,32,2,4,8,2.496040105819702
addmm,Triton,256,197951,512,64,128,64,2,8,16,2.9306790828704834
addmm,Triton,256,197951,512,64,64,32,2,4,8,3.0347819328308105
...
```
Compared to the non-transposed autotune:
```
addmm,Subgraph,contiguous_addmm_1384,256,197951,512,0.5024129748344421
addmm,Extern,extern_kernels.addmm,256,197951,512,0.6881489753723145
addmm,Triton,256,197951,512,32,256,16,2,2,4,2.5115010738372803
addmm,Triton,256,197951,512,32,128,32,2,4,8,2.5167479515075684
addmm,Triton,256,197951,512,64,128,16,2,4,8,2.9507460594177246
addmm,Triton,256,197951,512,64,256,64,2,8,4,2.9673290252685547
addmm,Triton,256,197951,512,64,128,64,2,8,16,3.3906331062316895
addmm,Triton,256,197951,512,64,128,32,2,4,8,3.496859073638916
```
It seems to perform really well for high values of `K` vs `N` and `M`.
Testing this hypothesis with some custom shapes:
```
Parsed 64 unique shapes from benchmark output
addmm improvements when best:
addmm_128x16384x128: +0.18%
addmm_128x262144x256: +38.24%
addmm_128x200000x512: +14.76%
addmm_256x800000x128: +0.06%
addmm_131072x128x256: +0.27%
addmm_128x256x131072: +0.25%
addmm_2048x200000x64: +12.45%
mm improvements when best:
mm_128x16384x128: +0.18%
mm_128x262144x256: +38.05%
mm_128x200000x512: +9.47%
mm_256x800000x128: +0.99%
mm_512x6400000x256: +3.17%
mm_524288x64x64: +0.29%
mm_2048x200000x64: +11.19%
mm_8192x1000000x256: +34.14%
mm_128x4096x100000: +0.40%
mm_128x3072x150000: +0.27%
================================================================================
BENCHMARK ANALYSIS RESULTS
================================================================================
Operation: addmm
----------------------------------------
Total shapes analyzed: 33
Average Subgraph placement: 4.39
Median Subgraph placement: 2.0
Subgraph is best choice: 7/33 shapes (21.2%)
Average improvement when best: 9.46%
Median improvement when best: 0.27%
Largest improvement when best: +38.24%
Operation: mm
----------------------------------------
Total shapes analyzed: 30
Average Subgraph placement: 7.63
Median Subgraph placement: 2.0
Subgraph is best choice: 10/30 shapes (33.3%)
Average improvement when best: 9.81%
Median improvement when best: 2.08%
Largest improvement when best: +38.05%
```
## Conclusion
Contiguous subgraph decomposition seems worthwhile for `mm` and `addmm`, but not `bmm`, and has a very large improvement on low-`M`, low-`N`, high-`K` shapes.
Data gathering scripts:
https://gist.github.com/exclamaforte/4a896c064d301b27bf5ca0a4f8fc3866
## Test Plan:
New unit tests.
Differential Revision: D80771648
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161241
Approved by: https://github.com/eellison
Summary:
Inductor has the following configurations:
config.comprehensive_padding
config.padding_alignment_bytes
config.padding_stride_threshold
In the static-shape case, enabling these three options makes Inductor generate code for FlexibleLayout tensors that tries to pad every stride dimension to a multiple of config.padding_alignment_bytes, for strides above config.padding_stride_threshold. When dynamic shapes are enabled, no padding is done today.
This PR introduces the following configuration, which allows the user to specify that they wish to generate padded strides even for dynamic-shape operations. This is mainly done so we don't break the previous behaviour of not padding dynamic-shape use cases. config.padding_stride_threshold does not apply since the stride values are dynamic.
config.pad_dynamic_shapes
In addition, a new mode "python_slow" has been added for launch-grid calculation, which achieves the same ceildiv behaviour that is generally applicable to integer division. This is done to prevent test regressions and make wrapper_fxir codegen more generic.
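For reference, the ceildiv behaviour mentioned above is the usual integer ceiling division (a sketch):
```python
def ceildiv(numel, block):
    # integer ceiling division without floating point, for positive integers
    return -(-numel // block)
```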
Test Plan:
CI
Rollback Plan:
Differential Revision: D80468808
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160997
Approved by: https://github.com/blaine-rister, https://github.com/jansel
In this PR, we port 4 test files under test/distributed/parallel and 1 test file under test/distributed/debug for Intel GPU.
We enable Intel GPU with the following methods, trying our best to keep the original code style:
1. Use torch.accelerator for general GPU support
2. Skip the case if running on XPU, which has known issues
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161261
Approved by: https://github.com/guangyey, https://github.com/d4l3k
Summary: Save the config args that Inductor burns into `inductor_metadata` so we can optionally pass them to any Jit Hooks that are set. This allows us to pass them to Tritonparse.
Reviewed By: davidberard98, FindHao
Differential Revision: D80994791
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161953
Approved by: https://github.com/FindHao
Summary:
we have
```
std::vector<size_t> constants_internal_offset(
num_constants - num_folded_constants);
```
but the for loop does not consider it
```
for (size_t i = 0; i < num_constants; i++) {
...
constants_internal_offset[i]
...
```
even in the for loop, it does
```
bool from_folded = this->constant_from_folded(i);
if (from_folded) {
continue;
}
```
but `i` could still be wrong
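One way to keep the two indices consistent (sketched in Python for brevity; the real code is C++, and `compute_offset` is a stand-in):
```python
def assign_offsets(num_constants, constant_from_folded, compute_offset, constants_internal_offset):
    # advance a separate output index only for non-folded constants,
    # instead of reusing the loop variable i to index the smaller vector
    offset_idx = 0
    for i in range(num_constants):
        if constant_from_folded(i):
            continue
        constants_internal_offset[offset_idx] = compute_offset(i)
        offset_idx += 1
```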
Rollback Plan:
Differential Revision: D81425007
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161887
Approved by: https://github.com/angelayi
If SymInt::maybe_as_int() returns non-empty, then we get an inline
fast path. The philosophy here (as with the previous PR) is to
preserve performance in the "plain old ints" case.
Observed time spent in SymInt functions in computeStorageNBytes to drop
(and not shift cost elsewhere in the function) after this change, when
profiling detach() with Linux perf using code similar to the benchmark
from #160580.
Differential Revision: [D81530107](https://our.internmc.facebook.com/intern/diff/D81530107)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161586
Approved by: https://github.com/ezyang
ghstack dependencies: #161466
Summary:
This is a re-land of [PR161040](https://github.com/pytorch/pytorch/pull/161040), which had previously caused test failures on AMD GPUs. The tests are now configured to target only NVIDIA GPUs.
This diff removes configurations that exceed the hardware shared memory limit, which causes the following compilation error:
```
No valid triton configs. OutOfMemoryError: out of resource: triton_mm Required: 327680 Hardware limit:232448 Reducing block sizes or `num_stages` may help.
```
Test Plan:
```
pytest test/inductor/test_max_autotune.py
pytest test/inductor/test_triton_heuristics.py
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161996
Approved by: https://github.com/coconutruben
As the title states.
As the document grows, its content keeps increasing, so to make it easier for users to read and for developers to maintain, we have split this file into several separate files placed in a dedicated directory called "accelerator".
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161845
Approved by: https://github.com/albanD
Adds fallback commands for the following:
* python setup.py install
* python setup.py develop
Ideally these should just work and should provide backwards compat.
The thought process here is that many people rely on these commands, and just because setuptools wants to drop support for them, I don't think many of our downstream users who build from source expect them to be gone.
This should give developers some room to move away from these commands until we have a unified frontend that abstracts most of them away.
Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162009
Approved by: https://github.com/clee2000, https://github.com/atalman
# why
- some templates e.g. scale_mm need to unsqueeze/squeeze the nodes
for codegen and heuristics
- unified place where we can just adjust them for the template
# what
- inside get_mm_configs, return not the passed in kernel inputs,
but allow the template heuristic to adjust them if necessary
- the default implementation right now just passes them back
this diff just adds the functionality, but does not exercise it
other than the default (passthrough)
# testing
```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v
```
Differential Revision: [D81520572](https://our.internmc.facebook.com/intern/diff/D81520572)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161339
Approved by: https://github.com/eellison, https://github.com/jansel
ghstack dependencies: #161123, #161124, #161125, #161126, #161336, #161338
# why
- another step towards get_mm_configs providing
all the kwargs needed to add a choice from
a template. This in turn will allow us to send
all templates through one single call, and handle modifications
# what
- use the infrastructure for template heuristics to provide extra kwargs
that are fixed for a template/op pair to provide the suffix args
and epilogue function/fn for scaled_mm
# testing
```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v
```
Differential Revision: [D80670914](https://our.internmc.facebook.com/intern/diff/D80670914)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161126
Approved by: https://github.com/jansel
ghstack dependencies: #161123, #161124, #161125
# why
- another step towards get_mm_configs providing
all the kwargs needed to add a choice from
a template. This in turn will allow us to send
all templates through one single call, and handle modifications
# what
- use the infrastructure for template heuristics to provide extra kwargs
that are fixed for a template/op pair to provide the prefix args
and epilogue function/fn for addmm/baddbmm
- expand kernelinputs to also be able to shuttle around non tensor
inputs (scalars) as is needed for alpha and beta
# testing
```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v -k addmm
```
Differential Revision: [D80670912](https://our.internmc.facebook.com/intern/diff/D80670912)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161125
Approved by: https://github.com/jansel
ghstack dependencies: #161123, #161124
# why
- another step towards get_mm_configs providing
all the kwargs needed to add a choice from
a template. This in turn will allow us to send
all templates through one single call, and handle modifications
# what
use the infrastructure for template heuristics to provide extra kwargs
that are fixed for a template/op pair to provide the workspace_arg for
all the tma templates
# testing
```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v -k tma
```
Differential Revision: [D80670915](https://our.internmc.facebook.com/intern/diff/D80670915)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161124
Approved by: https://github.com/jansel
ghstack dependencies: #161123
# why
- some kwargs are choice independent but rather
always the same for a specific op or template
- this enables us to track those differently than the
choice ones, and thus enables interception of them
cleaner
- maybe_append_choices can then be simplified to
just pass through the kwargs
# what
- hookup for template heuristics to have per template/op extra
kwargs that are always the same, for all choices
- hookup for the caller of get_mm_configs to provide template/op
kwargs to override some of the template/choice kwargs
this pr does not use the new machinery, and everything is empty
for now. subsequent prs start using it to simplify ops
# testing
```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v
```
Differential Revision: [D80670916](https://our.internmc.facebook.com/intern/diff/D80670916)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161123
Approved by: https://github.com/jansel
The functions here are declared to return pointers, but currently aren't returning anything. Make them return NULL.
The properties array needs an extra set of braces: one pair for the array, another for the first item in the array.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161792
Approved by: https://github.com/Skylion007
As titled, so that the `getmem` calls in the loop are non-blocking and we max out the issuance rate.
Also added a single `nvshmem_quiet()` at the end to make sure all the getmem calls complete.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162006
Approved by: https://github.com/ngimel
Previously, without calling `torch.empty` before NVSHMEM init, we saw the error below:
```
src/host/init/init.cu:nvshmemi_check_state_and_init:1117: nvshmem initialization failed, exiting
src/host/util/cs.cpp:21: non-zero status: 16: Device or resource busy, exiting... mutex destroy failed
```
Fixing it by calling a `cudaFree(nullptr)` to make sure CUDA runtime is initialized before NVSHMEM init.
Removing all `torch.empty(1)` calls from tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161232
Approved by: https://github.com/ngimel
ghstack dependencies: #161214
We previously assumed AOT precompile should only work on non-closures. This is hard to enforce in practice because we will see a lot of cases with decorators (e.g. Hugging Face models):
```
def check_inputs(fn):
    def _fn(*args, **kwargs):
        for arg in args:
            assert arg.shape[0] > 1
        return fn(*args, **kwargs)
    return _fn

@check_inputs
def foo(x, y):
    a = x + x
    b = y + y
    c = a + b
    return c
```
It doesn't make sense not to support these cases since they are straightforward to handle.
This PR adds the logic to handle closure and make sure they can be precompiled properly.
Differential Revision: [D81509535](https://our.internmc.facebook.com/intern/diff/D81509535/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161990
Approved by: https://github.com/angelayi
This patch enables hipblaslt backend tests for test_mm_bmm_dtype_overload and test_addmm_baddmm_dtype_overload.
Tests were disabled as part of #150812
Rocblas backend tests are not enabled yet, WIP.
Test command
PYTORCH_TEST_WITH_ROCM=1 pytest test/test_matmul_cuda.py -k 'test_mm_bmm_dtype_overload' -v
PYTORCH_TEST_WITH_ROCM=1 pytest test/test_matmul_cuda.py -k 'test_addmm_baddmm_dtype_overload' -v
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161540
Approved by: https://github.com/jeffdaily
We basically follow the same pattern we do for tensor arguments. The major downside is that we now have to traverse the entirety of the int list, etc., where previously we didn't have to. Benchmarks suggest a 2% regression for relevant workloads.
Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160256
Approved by: https://github.com/albanD
Refactor the TorchScript-based exporter logic to move it to a single (private) location for better code management. The original public module and method APIs are preserved.
- Updated module paths in `torch/csrc/autograd/python_function.cpp` accordingly
- Removed `check_onnx_broadcast` from `torch/autograd/_functions/utils.py` because it is private and unused
@albanD / @soulitzer could you review changes in `torch/csrc/autograd/python_function.cpp` and
`torch/autograd/_functions/utils.py`? Thanks!
## BC Breaking
- **Deprecated members in `torch.onnx.verification` are removed**
Differential Revision: [D81236421](https://our.internmc.facebook.com/intern/diff/D81236421)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161323
Approved by: https://github.com/titaiwangms, https://github.com/angelayi
Many users want a config to force all CUDA ops to be captured by cudagraph; when that is not possible, PT2 should error.
This PR adds `torch._inductor.triton.cudagraph_or_error` for that (defaulting to False). Also added an environment variable `TORCHINDUCTOR_CUDAGRAPH_OR_ERROR` to control it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161862
Approved by: https://github.com/ezyang
Fixes #160053
The previous error message, `Only 2D, 3D, 4D, 5D padding with non-constant padding are supported for now`, was not clear.
Now we have:
```
python3
Python 3.13.5 | packaged by conda-forge | (main, Jun 16 2025, 08:27:50) [GCC 13.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
... import torch.nn.functional as F
... a = torch.empty(2,2,2,2)
... F.pad(a, (1,1), mode="circular")
...
Traceback (most recent call last):
File "<python-input-0>", line 4, in <module>
F.pad(a, (1,1), mode="circular")
~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/rrathaur/Desktop/pytorch/torch/nn/functional.py", line 5294, in pad
return torch._C._nn.pad(input, pad, mode, value)
~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
NotImplementedError: Padding size 2 is not supported for 4D input tensor.
Supported combinations for non-constant padding:
- 2D or 3D input: padding size = 2 (pads last dimension)
- 3D or 4D input: padding size = 4 (pads last 2 dimensions)
- 4D or 5D input: padding size = 6 (pads last 3 dimensions)
>>>
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160866
Approved by: https://github.com/mikaylagawarecki
Before this PR, `_select_strategy` always selects the first strategy with minimum redistribute cost. This causes unexpected behavior when
- multiple strategies have 0 redistribute costs
- the first one with 0 redistribute cost may perform local chunking
E.g. in memory efficient SDPA, the default orders of candidate strategies have a `Shard(2)` one before the `Replicate()` one. https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/_ops/_matrix_ops.py#L500-L512
When the input is `Replicate()`, `_select_strategy` will pick the `Shard(2)` strategy and do local chunking first, before local computation. This is clearly unexpected to users.
In this PR, we improve `_select_strategy` so that when multiple strategies have 0 redistribute cost, we prioritize the one that keeps the input unchanged.
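A hedged sketch of the new tie-breaking rule (attribute names are illustrative, not DTensor's actual strategy schema):
```python
def select_strategy(strategies, current_input_spec):
    # among zero-redistribute-cost candidates, prefer the one whose input spec
    # matches the current placement, so no local chunking is performed
    zero_cost = [s for s in strategies if s.redistribute_cost == 0]
    for s in zero_cost:
        if s.input_spec == current_input_spec:
            return s
    if zero_cost:
        return zero_cost[0]
    return min(strategies, key=lambda s: s.redistribute_cost)
```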
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161882
Approved by: https://github.com/ezyang
Fixes #161375
The "Using the visualizer" section in torch_cuda_memory.md had a link to https://pytorch.org/memory_viz written in inline Markdown link form. Strangely, the same syntax worked earlier on the page, as mentioned in the issue, but in this spot it rendered as a broken link.
I wasn't able to pinpoint why the second occurrence was treated differently, but switching it to the Markdown autolink form fixes the problem consistently. I tested this by rebuilding the docs locally with make html and serving the HTML with a local http.server. With the autolink, the link resolves correctly.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161426
Approved by: https://github.com/soulitzer
The error message here implies that we can only call `self.persistent_load(...)` for ints or tuples, but due to the second part of the type check being inverted, the weights-only unpickler throws an exception iff `pid` is an int.
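For illustration only (not the actual source), the shape of the inverted check and its fix:
```python
import pickle

def check_pid_buggy(pid):
    # raises exactly when pid IS an int, contradicting the error message
    if not isinstance(pid, tuple) and isinstance(pid, int):
        raise pickle.UnpicklingError("persistent_load only supports int or tuple pids")

def check_pid_fixed(pid):
    # raises only when pid is neither a tuple nor an int
    if not isinstance(pid, tuple) and not isinstance(pid, int):
        raise pickle.UnpicklingError("persistent_load only supports int or tuple pids")
```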
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161661
Approved by: https://github.com/Skylion007
We have 4 different version of inductor benchmark Docker images used in CI at the moment:
1. `pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks` is used by almost all inductor jobs including nightly benchmark
2. `pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks` runs inductor unit tests with python 3.12
3. `pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks` runs inductor unit tests with python 3.13
4. `pytorch-linux-jammy-py3-gcc11-inductor-benchmarks` runs inductor unit tests on CPU
My proposal here is to clean up (2) and (3) and to keep (1) under the same setup from https://ghcr.io/pytorch/torchbench. Simplicity is the key here as inductor workflows are getting more and more complex:
1. Unit tests for Python variant like 3.12 and 3.13 were useful when they were first added to CI. They are much less useful now. [Flambeau](https://hud.pytorch.org/flambeau/s/3876ec7b-43f0-42c6-bfbf-899035e5bb77) shows a 0.97 correlation between them. And we are also moving to 3.14 nowadays. I want to choose 3.12 for (1), but will do this separately. This is also what TorchBench and vLLM are using on CI.
1. We are gradually cleaning up 3.9 on CI https://github.com/pytorch/pytorch/issues/161167
Another BE change here is to rename the jobs in various inductor workflows, because I think names like `linux-jammy-cuda12_8-py3_10-gcc9-inductor-build` are too long and confusing to look at; better to just use human-friendly names like `inductor-build`. Other information is already spelled out in the build environment.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161536
Approved by: https://github.com/zou3519
## Introduction
During CUDA Graph capture, the CUDA caching allocator currently defers reclaiming blocks until capture ends. This is because CUDA forbids querying events recorded during capture (the CUDA operation is not executed during the capture stage), so the allocator cannot use its normal event-based logic. However, capture records a DAG of work (we call it the **capturing graph**). We can use the capturing graph to determine when a block’s old lifetime is fully before future work, and safely reuse the block within the same capture.
This PR adds an experimental flag `graph_capture_record_stream_reuse: True|False (default: False)`. When enabled, the allocator inserts lightweight free markers and uses capture ordering to decide if a freed block is safe to reuse during capture. If the proof cannot be established, we fall back to the existing post-capture path.
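A minimal sketch of opting in, assuming the flag is surfaced through the allocator settings the way other caching-allocator options are (the flag name comes from the description above):
```python
import os

# must be set before the CUDA caching allocator is initialized; when safety
# cannot be proven during capture, the existing post-capture path is used
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "graph_capture_record_stream_reuse:True"

import torch  # noqa: E402  (imported after setting the env var)
```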
## Terms
* **Free marker**: A capture-legal no-op (created with `cudaGraphAddEmptyNode`) inserted after the last captured use of the block on each stream that used it.
* **Terminal**: The set of the latest operations of a stream (or of the capturing graph). Any newly captured op on that stream will attach after all nodes in this set. For a stream currently capturing, it is the set of nodes returned in `dependencies_out` by `cudaStreamGetCaptureInfo`.
## When can we reuse a block during capture?
### Strong Rule (Graph-Wide Safety)
This rule provides a universal guarantee that a block is safe for reuse by any stream in the graph.
> A block is safe to reuse if every free marker is a predecessor of every terminal of all active streams in the graph.
Why it's safe:
This rule establishes a strict global ordering. Since any new operation on any stream must be appended after that stream's terminals, this condition guarantees that the block's new lifetime begins only after its old lifetime has completely ended everywhere. This prevents lifetime overlaps when the graph is replayed, ensuring correctness.
### Per-stream Rule (A Practical Optimization)
The strong rule, while safe, is often unnecessarily restrictive. The `DeviceCachingAllocator` introduces a crucial constraint that allows for a simpler check.
In `DeviceCachingAllocator`, `get_free_block` only returns blocks whose `block->stream == p.stream()`. In other words, we never reuse a block on a stream different from the allocation stream. This means we don't need to verify safety across the entire graph. We only need to confirm that the block is safe to reuse from the perspective of its own allocation stream.
> Reuse a block for allocations on stream S if every free marker is a predecessor of every node in the terminal set of S.
In short, a block is considered **reusable** on stream S as long as all markers marking it "free" are guaranteed to complete before any new work that might need it on stream S begins.
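For illustration, here is a minimal Python sketch of the per-stream check as a reachability test on the capturing graph (the `preds` adjacency map and all names are illustrative; the real implementation walks cudaGraph nodes in C++):
```python
def is_predecessor(preds, candidate, node):
    """Return True if `candidate` is reachable by walking predecessors from `node`."""
    stack, seen = [node], set()
    while stack:
        cur = stack.pop()
        if cur == candidate:
            return True
        if cur in seen:
            continue
        seen.add(cur)
        stack.extend(preds.get(cur, ()))
    return False

def reusable_on_stream(free_markers, terminals_of_s, preds):
    # Per-stream rule: every free marker must precede every terminal of stream S.
    return all(
        is_predecessor(preds, marker, term)
        for marker in free_markers
        for term in terminals_of_s
    )

# Toy capturing graph: marker "m" -> op "a" -> terminal "t".
preds = {"t": ["a"], "a": ["m"], "m": []}
print(reusable_on_stream(["m"], ["t"], preds))  # True
```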
## Implementation
* On `free(block)` during capture
* For each stream in `block->stream_uses` and the allocation stream, insert a free marker (empty node) and make it that stream’s tail.
* If we cannot place markers for all such streams (for example, a stream is not in capture), defer to the post-capture path.
* Otherwise, store the marker handles and keep the block in the capture-private structures.
* On `allocate(stream)` during capture (attempt per-stream reclaim)
* Query the allocation stream S’s terminal via `cudaStreamGetCaptureInfo`.
* For each deferred block, check whether it was allocated on this stream and whether each of its free markers is a predecessor of that terminal.
* If yes, hand the block to S for immediate reuse within the same capture.
* If no, keep it deferred; it will be reconsidered as capture progresses and S’s terminal advances.
* On capture end
* Any still-deferred blocks follow the existing post-capture reclamation (event insertion/polling). External behavior remains unchanged if we cannot prove safety during capture.
## Examples (2 streams)
<img width="641" height="801" alt="pytorch-remove-cudagraph-defer-reclaiming (6)" src="https://github.com/user-attachments/assets/41adc835-d448-483b-99ba-b4341cb7d2a2" />
* Case 0 — Unsafe
The two frees are not ordered with respect to each other. For stream 1, the other stream’s free marker does not precede this stream’s terminal, so the per-stream condition fails.
Counterexample intuition for the unsafe setups: imagine `f2(x)` runs for a long time. If DeviceCachingAllocator reused block `x` on a stream whose terminal is not ordered after the free markers, the new lifetime could overlap the old one on replay, risking use-after-free or data corruption. The per-stream rule prevents exactly this.
* Case 1 — Reusable on stream 1
Stream 1’s terminal is after both frees, so every free marker precedes stream 1’s terminal. The block is reusable for allocations on stream 1.
* Case 2 — Not reusable on stream 2, but this cannot occur in `DeviceCachingAllocator`
This depicts reusing the block on stream 2 while stream 1’s free is not yet ordered before stream 2’s terminal. Though the block is not safe to reuse on stream 2, DeviceCachingAllocator will not choose that block for stream 2 anyway: `get_free_block` rejects blocks whose `stream != p.stream()`. So this case is unreachable.
* Case 3 — Safe (strong rule holds)
In this scenario, the terminal nodes of all streams are positioned after the block's free markers, satisfying the strong rule. This guarantees the block is safe for reuse by any stream in the capturing graph. However, since `DeviceCachingAllocator` only reuses a block on its original allocation stream, verifying this strong condition is unnecessary. We only need to ensure the per-stream rule is met for the specific stream requesting the block.
* Case 4 — Freeing after a join
See the note below.
## Edge Case: Freeing after a join
Our current dependency tracking has a limitation in scenarios where a block is freed after a stream join; see @galv's [comments here](https://github.com/pytorch/pytorch/pull/158352#pullrequestreview-3112565198).
In case 4, we have a missed opportunity. Because the block's usage is not explicitly marked, we cannot determine that its actual last use may have occurred much earlier, long before the join. As a result, we must wait for the subsequent join before the block can be reused.
## Thanks
Thanks to @galv for his great idea around graph parsing and empty nodes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158352
Approved by: https://github.com/ngimel
Co-authored-by: Jeff Daily <jeff.daily@amd.com>
Fixes#160437
Summary:
This PR avoids compiling empty FX graphs generated during graph breaks. If there are no calls in the graph, we can just return the empty list of instructions.
More precisely,
In `compile_and_call_fx_graph`, if the FX graph contains no calls (`count_calls(self.graph) == 0`) and the return value list is empty, we now return an empty instruction list immediately.
Impact:
module: dynamo
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160536
Approved by: https://github.com/Lucaskabela
Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):
`get_remote_tensor`: returns a symmetric tensor given a peer rank.
The difference between the `get_buffer` API and the `get_remote_tensor` API:
- the former accepts an offset, whereas the latter doesn't
- the latter returns a symmetric tensor at `hdl.offset` on `peer`.
As a refactoring, this PR also moves the implementation of `get_buffer` and `get_signal_pad` to the `SymmetricMemory` level, as their code is common to all backends.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161533
Approved by: https://github.com/ngimel
ghstack dependencies: #161470, #161471, #161532
(Porting most of #161008)
This hooks the SymmetricMemory allocator up to MemPool so that users can create symmetric tensors with regular factories such as `torch.zeros` and `torch.arange`, and so that our ops can have functional variants that create `out` tensors on symmetric memory.
To end users, this PR supports a python UI as follows:
```python
import torch
import torch.distributed._symmetric_memory as symm_mem  # assumed import alias

# numel, dtype, and device stand in for the caller's values
allocator = symm_mem.get_mempool_allocator(device)
mempool = torch.cuda.MemPool(allocator)
with torch.cuda.use_mem_pool(mempool):
    tensor = torch.arange(numel, dtype=dtype, device=device)
```
Added tests for both use cases above.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161471
Approved by: https://github.com/ngimel
ghstack dependencies: #161470
**Summary**
This PR improves the performance of A16W8 GEMM template by
- Removing the config with block_n=48 & block_m=16 as it is not very efficient.
- Using AMX microkernel when M >= 5 so that we use AMX instead of AVX512 for M=5~31.
- Converting int8 values to bf16 with intrinsics instead of `at::vec::convert` as the latter does not have optimized implementation for this case.
We saw up to >10% performance gains in various cases when running Llama-3.1-8b-instruct.
**Test plan**
Already covered by UT.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161148
Approved by: https://github.com/CaoE, https://github.com/jansel
As AsyncAllreduceCUDADeviceWork is currently implemented, using it forces a copy of the `shared_ptr<gloo::Context>`, because `std::move` does nothing for a const ref.
This PR changes the parameter type to `shared_ptr<>` instead of the const ref, which allows more efficient parameter passing.
Here's an example that demonstrates the issue:
```cpp
#include <memory>
#include <iostream>

struct Foo {};

void useFoo_ref(const std::shared_ptr<Foo>& f) {
  // std::move on a const ref is a no-op, so this actually copies.
  std::shared_ptr<Foo> internal = std::move(f);
  std::cout << "use_count: " << internal.use_count() << '\n';
}

void useFoo_val(std::shared_ptr<Foo> f) {
  // Pass-by-value lets the caller's std::move take effect; this really moves.
  std::shared_ptr<Foo> internal = std::move(f);
  std::cout << "use_count: " << internal.use_count() << '\n';
}

int main() {
  std::shared_ptr<Foo> f1 = std::make_shared<Foo>();
  useFoo_ref(std::move(f1)); // prints "use_count: 2"
  std::shared_ptr<Foo> f2 = std::make_shared<Foo>();
  useFoo_val(std::move(f2)); // prints "use_count: 1"
}
```
This also aligns well with [C++ Core Guidelines][1] for handling
smart pointers.
[1]: https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines?utm_source=chatgpt.com#Rr-summary-smartptrs
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161834
Approved by: https://github.com/Skylion007, https://github.com/eqy, https://github.com/kwen2501
Referring to the signatures and semantics of `Stream` and `Event` in CUDA, we use CPU multithreading and condition variables to implement equivalent capabilities as the underlying foundation of torch_openreg; a minimal Python sketch of the idea follows the change list below.
**Changes:**
- Add stream capabilities for OpenReg
- Add event capabilities for OpenReg
- Add kernel launch entrypoint for OpenReg
- Add testcases about stream and event for OpenReg
- Add example for OpenReg
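As a rough illustration of the idea (plain Python with a worker thread and a condition variable; the actual torch_openreg implementation is in C++ and follows the CUDA `Stream`/`Event` signatures):
```python
import queue
import threading

class Event:
    """Becomes signaled once all work recorded before it has executed."""
    def __init__(self):
        self._cv = threading.Condition()
        self._done = False

    def _signal(self):
        with self._cv:
            self._done = True
            self._cv.notify_all()

    def synchronize(self):
        with self._cv:
            self._cv.wait_for(lambda: self._done)

class Stream:
    """Executes submitted work in order on a dedicated worker thread."""
    def __init__(self):
        self._work = queue.Queue()
        threading.Thread(target=self._run, daemon=True).start()

    def _run(self):
        while True:
            self._work.get()()

    def launch(self, fn, *args):
        self._work.put(lambda: fn(*args))

    def record_event(self, event):
        self._work.put(event._signal)

s, e = Stream(), Event()
s.launch(print, "kernel 1")
s.record_event(e)
e.synchronize()  # returns only after "kernel 1" has run
```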
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160099
Approved by: https://github.com/albanD
ghstack dependencies: #161603
This will allow more cases to use tensor descriptors. For example, before this change the following block params would not match because the innermost dimension does not have stride 1:
```python
block_params=BlockParameters(shape=[64, 4, 1, 1], block_shape=[((XBLOCK + 3)//4), Min(4, XBLOCK), 1, 1], strides=[0, 1, 0, 0], offsets=[(xoffset//4), ModularIndexing(xoffset, 1, 4), 0, 0])
```
After broadcasting dimensions and singleton dimensions are removed:
```python
block_params=BlockParameters(shape=[4], block_shape=[Min(4, XBLOCK)], strides=[1], offsets=[ModularIndexing(xoffset, 1, 4)])
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161602
Approved by: https://github.com/jansel
Update the torch-xpu-ops commit to [8b58040ee32689487f660462f655085f31506dab](8b58040ee3), includes:
- Add vectorization path on maxpool forward channel last
- Add FlightRecorder support for ProcessGroupXCCL
- Fix random build failure on codegen
- Suppress dllexport warning on Windows
- Make torch-xpu-ops build depend on ATen XPU
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161152
Approved by: https://github.com/EikanWang
Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>
- Empty containers are Falsey
- Hoist cheap checks first
- Microbenchmarked single-element set access method
Benchmark code:
```
import timeit

to_test = [
    ('list(x)', 'x = set([3])'),
    ('x[0]', 'x = [3]'),
    ('list(x)[0]', 'x = set([3])'),
    ('next(iter(x))', 'x = set([3])'),
]
for (stmt, setup) in to_test:
    res = timeit.timeit(stmt=stmt, setup=setup)
    print(f"Time for `{stmt}`: {res}")
```
Result with Python 3.13 on Mac (with excess digits manually trimmed; directionally matches result on Linux)
```
Time for `list(x)`: 0.03418
Time for `x[0]`: 0.00852
Time for `list(x)[0]`: 0.03561
Time for `next(iter(x))`: 0.02278
```
FWIW, I was surprised by this result, so I guess I'm glad I wrote the benchmark!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161308
Approved by: https://github.com/Skylion007, https://github.com/bdhirsh
ghstack dependencies: #161301, #161292, #161304
This took me a bit to figure out, and I'm pretty sure I've looked at this code before. Pybind uses `return_value_policy::reference_internal` for `def_property`, which [causes the owning object to be kept alive for the lifespan of the return value](https://pybind11.readthedocs.io/en/stable/advanced/functions.html), allowing the getter to safely avoid copying the property value. However, lambdas act like they return `auto`, not `decltype(auto)`, so our lambdas themselves were forcing copies!
Testing: observed `std::vector<Argument>` copying disappear in a Linux perf profile of someOpInfo._schema.arguments/returns (in `_python_dispatch.correct_storage_aliasing`).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161301
Approved by: https://github.com/Skylion007, https://github.com/malfet, https://github.com/wconstab
If the number of m x n blocks cannot occupy all the threads, using a smaller register block size gives better performance, since the compute size per thread is smaller.
It may yield ~20% performance improvement for the real case `m1_n512_k4096`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161144
Approved by: https://github.com/leslie-fang-intel
Prior to this PR, we have:
```
[Default Behavior] uses `tl.math.exp({x})`:
eager diff: tensor(2.6935e-06, device='cuda:0', dtype=torch.float64)
compile diff: tensor(9.2757e-06, device='cuda:0', dtype=torch.float64)
eager_latency:0.0013996509159580942, compile_latency:0.0013981951951980592
TORCHINDUCTOR_USE_FAST_MATH=1 uses `tl.extra.libdevice.exp2(tmp0 * 1.4426950408889634)`:
eager diff: tensor(2.2315e-06, device='cuda:0', dtype=torch.float64)
compile diff: tensor(3.5329e-06, device='cuda:0', dtype=torch.float64)
eager_latency:0.0013982331859319662, compile_latency:0.0013824134564199367
Update inductor to use `tl.extra.libdevice.exp(tmp0)`:
eager diff: tensor(2.3421e-06, device='cuda:0', dtype=torch.float64)
compile diff: tensor(2.3421e-06, device='cuda:0', dtype=torch.float64)
eager_latency:0.0014109122834153282, compile_latency:0.0014062877025520593
```
Since `tl.extra.libdevice.exp` leads to both better precision and on-par latency, we use it by default now.
Note that `tl.extra.libdevice.exp` used to have a perf issue in [January 2025](https://github.com/triton-lang/triton/issues/5735) because it used `ex2.approx.f32` instead of `ex2.approx.ftz.f32`, so `tl.extra.libdevice.exp2(tmp0 * 1.4426950408889634)` was used as a workaround. I double-checked that the issue is resolved and `tl.extra.libdevice.exp` also uses [ex2.approx.ftz.f32](https://github.com/triton-lang/triton/issues/5735#issuecomment-3238421293) today.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161829
Approved by: https://github.com/jansel
Fixes#159247
Issue 1: Accuracy Problem with Non-Divisible KV Sequences
---------------------------------------------------------
### Background
Paged attention in flex decoding produced inaccurate results when KV sequence length is not divisible by block size. For example, when `KV_S = 64` and `block_size = 128`, the output didn't match standard attention accuracy.
### Root Cause
The current paged attention does not apply an upper-bound mask mod when converting from a logical to a physical mask mod. Instead, it uses a noop_mask by default, which leaves all values unmasked and leads to an accuracy mismatch. Adding an upper-bound mask mod based on the original actual kv_len (64 in this test case) resolves the issue.
### Solution
* **Applied proper upper bound masking**: Updated all calls to `convert_logical_block_mask` to pass `kv_len` as a tensor with proper shape `[B, KV_S]` to provide information of actual batched KV sequence length. The function now correctly applies upper bound checks using the actual KV sequence lengths for each batch
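A simplified sketch of the upper-bound composition (per-batch lengths collapsed to a 1-D tensor for brevity; the actual change threads `kv_len` through `get_mask_mod`/`convert_logical_block_mask`):
```python
import torch

def with_kv_len_bound(inner_mask_mod, kv_len):
    # kv_len: actual KV sequence length per batch element.
    def mask_mod(b, h, q_idx, kv_idx):
        return (kv_idx < kv_len[b]) & inner_mask_mod(b, h, q_idx, kv_idx)
    return mask_mod

noop = lambda b, h, q_idx, kv_idx: torch.ones((), dtype=torch.bool)
bounded = with_kv_len_bound(noop, torch.tensor([64]))
print(bounded(torch.tensor(0), 0, torch.tensor(0), torch.tensor(100)))  # tensor(False): past kv_len
```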
### Files Modified
* `torch/nn/attention/experimental/_paged_attention.py`: Added `kv_len` parameter as a tensor to `get_mask_mod` and applied upper mask to the new mask mod.
* `test/inductor/test_flex_attention.py`: Fixed all related `kv_len` parameter calls in the tests
* `test/inductor/test_flex_decoding.py`: Fixed all related `kv_len` parameter calls in the tests
Issue 2: Invalid Memory Access (IMA) in Triton Kernels
------------------------------------------------------
### Background
The Triton kernel for flex attention was experiencing invalid memory access errors when running with compute sanitizers, particularly with short KV sequences and small batch sizes.
### Root Cause
* Kernel launches CTAs (Cooperative Thread Arrays) proportional to GPU's multi-processor count (108 via `SPLIT_KV`)
* With small workloads, many CTAs remain idle but still attempt to access `kv_indices` with invalid `indices_idx` values
* This caused out-of-bounds memory access violations
### Solution
Implemented boundary checks with early exit:
1. **Added `MAX_VALID_KV_IDX` parameter** in `torch/_inductor/kernel/flex/flex_decoding.py`
* Calculate maximum valid KV index based on actual `kv_indices` tensor size and pass it to Triton template
2. **Added early exit logic** in `torch/_inductor/kernel/flex/templates/flex_decode.py.jinja`
* Boundary checks before accessing `kv_indices` in both normal and full blocks
* Idle CTAs with invalid `indices_idx` skip computation entirely
This prevents invalid memory access while reducing wasted computation on idle thread blocks.
Testing & Validation
--------------------
### Accuracy Tests
* Added comprehensive test cases covering KV sequences not divisible by block sizes
* Verified output matches standard attention for various sequence length combinations
### Sanitizer Results
```
========= COMPUTE-SANITIZER
Starting standalone test_max_autotune...
Running test_max_autotune on device: cuda
max_autotune config: True
test_max_autotune completed successfully!
Test passed!
========= ERROR SUMMARY: 0 errors
```
**Before**: More than 13720 invalid memory access errors with sanitizers
**After**: Clean execution with 0 errors
Both fixes work together to ensure paged attention produces accurate results while running safely without memory access violations.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160861
Approved by: https://github.com/BoyuanFeng
`send_object_list` and `recv_object_list` use regular `send`/`recv` P2P ops which means that they will create 2-rank NCCL communicators between ranks if the communicators have not been initialized.
This adds an option `use_batch`, which issues the send/recv via `batch_isend_irecv` so that the communicators already initialized for collectives in the group are reused (a usage sketch follows below).
---
BatchP2P ops create (or reuse an existing) communicator keyed by device index.
Regular P2P ops create (or reuse existing) dedicated 2-rank communicators keyed by "rank1:rank2".
See:
c8205cb354/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp (L3980-L4008)
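A hedged usage sketch, assuming the option lands under the `use_batch` name and the default process group is already initialized with NCCL:
```python
import torch.distributed as dist

objs = [{"step": 1}] if dist.get_rank() == 0 else [None]
if dist.get_rank() == 0:
    # Reuses the group's existing communicator instead of creating a 2-rank one.
    dist.send_object_list(objs, dst=1, use_batch=True)
else:
    dist.recv_object_list(objs, src=0, use_batch=True)
```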
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160342
Approved by: https://github.com/wconstab
Fixes https://github.com/pytorch/pytorch/issues/161510
Test plan:
```
% cd third_party/kineto
% git checkout fe80f9319479265f7a208e615e16a363b993d50c; git submodule update --init --recursive
M libkineto/third_party/dynolog
M libkineto/third_party/fmt
M libkineto/third_party/googletest
Previous HEAD position was 5e75018 Fix Local Time on Windows Builds (#1104)
HEAD is now at fe80f93 Fix MSVC Error (#1134)
Submodule path 'libkineto/third_party/dynolog': checked out 'd2ffe0a4e3acace628db49974246b66fc3e85fb1'
Submodule path 'libkineto/third_party/dynolog/third_party/googletest': checked out '52eb8108c5bdec04579160ae17225d66034bd723'
Submodule path 'libkineto/third_party/dynolog/third_party/prometheus-cpp': checked out 'b1234816facfdda29845c46696a02998a4af115a'
Submodule path 'libkineto/third_party/dynolog/third_party/prometheus-cpp/3rdparty/civetweb': checked out 'd7ba35bbb649209c66e582d5a0244ba988a15159'
Submodule path 'libkineto/third_party/dynolog/third_party/prometheus-cpp/3rdparty/googletest': checked out 'e2239ee6043f73722e7aa812a459f54a28552929'
Submodule path 'libkineto/third_party/fmt': checked out '40626af88bd7df9a5fb80be7b25ac85b122d6c21'
Submodule path 'libkineto/third_party/googletest': checked out '52eb8108c5bdec04579160ae17225d66034bd723'
% git checkout 5e75018; git submodule update --init --recursive
M libkineto/third_party/dynolog
M libkineto/third_party/fmt
M libkineto/third_party/googletest
Previous HEAD position was fe80f93 Fix MSVC Error (#1134)
HEAD is now at 5e75018 Fix Local Time on Windows Builds (#1104)
warning: unable to rmdir 'third_party/prometheus-cpp': Directory not empty
Submodule path 'libkineto/third_party/dynolog': checked out '7d04a0053a845370ae06ce317a22a48e9edcc74e'
Submodule path 'libkineto/third_party/dynolog/third_party/googletest': checked out '58d77fa8070e8cec2dc1ed015d66b454c8d78850'
Submodule path 'libkineto/third_party/fmt': checked out '0041a40c1350ba702d475b9c4ad62da77caea164'
Submodule path 'libkineto/third_party/googletest': checked out '7aca84427f224eeed3144123d5230d5871e93347'
% cd ../..
% git status
HEAD detached from 649e397c6de
Changes not staged for commit:
(use "git add <file>..." to update what will be committed)
(use "git restore <file>..." to discard changes in working directory)
(commit or discard the untracked or modified content in submodules)
modified: third_party/kineto (untracked content)
% time git submodule foreach --recursive git clean -ffdx
...
git submodule foreach --recursive git clean -ffdx 0.47s user 0.96s system 88% cpu 1.625 total
% git status
HEAD detached from 649e397c6de
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161748
Approved by: https://github.com/atalman
Gives 18% speedup on rms norm (2048, 32768). And we have seen other instances where inductor is not aggressive enough about codegening persistent reductions - e.g. 39% on [this kernel from torch ao](https://github.com/pytorch/pytorch/issues/159769#issuecomment-3188568335).
Codegen-ing persistent reductions can be risky if you run out of registers. Here, I'm effectively making persistent reductions an option of looped reductions by setting RBLOCK == rnumel, so that we can still fall back to looped reductions as needed.
As criteria:
- there needs to be significant memory savings from doing a persistent reduction (by keeping memory in register and avoiding another iteration over input)
- we should not be coalescing on x dimension, otherwise large rblock will inhibit coalescing
- we should not be especially register or arithmetic intensive (this last part uses mem_ops_per_thread, but could be improved).
Still need to do dashboard run, although I'm not sure we get a lot of large rblock in our benchmarks.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161055
Approved by: https://github.com/jansel
This heavily borrows implementation logic from `topk`.
As this method is non-deterministic, the logic for the CPU-op indices comparison was modified to just an equality check, as by default the random numbers picked for the input tensor allow for quite a lot of overlaps.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161817
Approved by: https://github.com/dcci
We move the AsyncTP tests to a separate test suite because 1) Async TP ops are not the core symmetric memory APIs, they are more like applications, and 2) MultiProcContinuousTest will skip all the following tests if a test fails (we should fix this too). We still want to get the test signals for the core symmetric memory APIs when Async TP ops fail.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161820
Approved by: https://github.com/kwen2501
[#RFC153024](https://github.com/pytorch/pytorch/issues/153024)
**Motivation**
1. Attention has been the critical performance bottleneck in current LLM models, and FlexAttention is a good choice to cover the broad variants in the transformers series of models. With FlexAttention, it is easy for us to enable paged attention and fused SDPA in the transformers repo on the XPU device. Besides, it also provides a candidate for processing attention in LLM ecosystem libraries, e.g., vLLM and SGLang, on the XPU device.
2. FlexAttention is a good starting point for maturing the Intel Triton-based GEMM kernels. FlexAttention provides both a flexattention kernel and a flexdecoding kernel to cover compute-bound and memory-bound GEMM computation, and different shapes should also be supported to serve LLM inference, e.g. head_dim=64, 96, 128, 256.
**What does this PR do?**
1. Enable the device type for the FlexAttention kernel and UTs to ensure all important UTs pass on the XPU device.
2. For E2E model inference, ensure that LLM model inference with FlexAttention is functional.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143553
Approved by: https://github.com/EikanWang, https://github.com/drisspg
Co-authored-by: Mao Yunfei <yunfei.mao@intel.com>
Co-authored-by: Xingyuan Li <xingyuan.li@intel.com>
Co-authored-by: majing <jing1.ma@intel.com>
Co-authored-by: Xiao, Wang <wang.xiao@intel.com>
# why
- untested so far
# what
- add an empty config heuristic for all devices for decompose k
- the cuda heuristic, because it is more specific, will still be picked up
- add notes explaining how to enable on other devices
# testing
```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v -k "decompose_k"
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161795
Approved by: https://github.com/PaulZhang12
ghstack dependencies: #161767
# why
- not having a heuristic is an error but should not crash, just provide 0 configs
- some heuristics are cross device type
- cleaner to be explicit about being cross device type than having to enumerate every possible device type
# what
- on registration, supply device_type=None (explicitly) to say this heuristic is cross device
- test to guard the heuristics hierarchies
# testing
```
python3 -bb -m pytest test/inductor/test_template_heuristics_registry.py
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161767
Approved by: https://github.com/PaulZhang12
Fixes#161729
Written by codex.
This won't produce contiguous inputs for all einsum applications, because we flatten all right-only and left-only dimensions, so if right and left operand dimensions are interleaved in the output, we cannot (with the current algo) produce contiguous output. However, for common cases like the one in the linked issue it works. Let's see what CI says.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161755
Approved by: https://github.com/malfet, https://github.com/albanD
test_empty_strided_p2p_persistent allocates persistent symm memory tensors. However, it uses the same alloc_id for different tests, which could cause trouble if these tests are run in the same process. This PR fixes the issue by using a different alloc_id for each test.
https://github.com/pytorch/pytorch/pull/161668 should also fix the issue but we can land this PR for a safer test.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161677
Approved by: https://github.com/kwen2501
ghstack dependencies: #161676
**Summary**
When `device_id` is not None, barrier() will choose the accelerator with the highest priority, which means that if the test specifies CPU for testing while CUDA is available on the host, barrier() will use CUDA. To avoid this and better respect `self.device_type`, we add this branch to force barrier() to use CPU when `self.device_type` is CPU and another accelerator is also available.
**Test**
`pytest test/distributed/tensor/test_dtensor_testbase.py`
**Debugging Output**
```
# from init_process_group()
init pg: backend=gloo, device_id = None
default_pg has backend: gloo, device_types: [device(type='cuda'), device(type='cpu')]
# from barrier()
barrier: device_ids = [10], devices = [], device = None, PG=[device(type='cuda'), device(type='cpu')]
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161015
Approved by: https://github.com/tianyu-l
This PR refactors the XPU quantization ops to align their code structure with the CPU implementation for consistency. It also adds necessary header files to enable future integration with AOTI.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157430
Approved by: https://github.com/angelayi
Summary:
Fixes#160749
For a model like
```
class M(torch.nn.Module):
def forward(self, x):
s = torch.sin(x)
z = 1j * s
return z
```
Its graph will be
```
graph():
%x : [num_users=1] = placeholder[target=x]
%sin : [num_users=1] = call_function[target=torch.ops.aten.sin.default](args = (%x,), kwargs = {})
%mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sin, 1j), kwargs = {})
return (mul,)
```
`1j` will appear as a constant complex argument in the `aten.mul`
Test Plan:
buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_complex_constant
Rollback Plan:
Differential Revision: D80672323
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161517
Approved by: https://github.com/angelayi
Summary: this will only trigger it in the event that we are serializing a triton HOP. There are a few tests that do weird mocking that this function doesn't like, so this prevents it from being called there.
Test Plan:
att
Rollback Plan:
Differential Revision: D81261486
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161737
Approved by: https://github.com/angelayi
SAC interaction with triton kernel:
- In eager, triton ops are not dispatchable, and so it is always ignored by SAC, i.e., always recomputed.
- In compile, although we wrap triton kernels into HOPs, allowing us to intercept them, we still recompute by default rather than save by default, so that compile maintains the invariant of using less memory than eager.
- If you want to do something else (e.g. save the output of your triton kernel) you should wrap it in a custom op.
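A minimal eager-mode sketch of that last point, with a placeholder op standing in for the Triton kernel (the `mylib::fused_scale` name and its body are purely illustrative):
```python
import torch
from torch.utils.checkpoint import (
    CheckpointPolicy,
    checkpoint,
    create_selective_checkpoint_contexts,
)

@torch.library.custom_op("mylib::fused_scale", mutates_args=())
def fused_scale(x: torch.Tensor) -> torch.Tensor:
    return x * 2.0  # in practice this body would call your Triton kernel

@fused_scale.register_fake
def _(x):
    return torch.empty_like(x)

fused_scale.register_autograd(
    lambda ctx, grad: grad * 2.0,
    setup_context=lambda ctx, inputs, output: None,
)

def policy(ctx, op, *args, **kwargs):
    # Now that SAC can see the op, choose to save its output instead of recomputing.
    if op == torch.ops.mylib.fused_scale.default:
        return CheckpointPolicy.MUST_SAVE
    return CheckpointPolicy.PREFER_RECOMPUTE

def fn(x):
    return fused_scale(torch.sin(x)).sum()

x = torch.randn(8, requires_grad=True)
out = checkpoint(
    fn, x, use_reentrant=False,
    context_fn=lambda: create_selective_checkpoint_contexts(policy),
)
out.backward()
```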
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161541
Approved by: https://github.com/drisspg, https://github.com/zou3519, https://github.com/xmfan
# why
- enable it to go through commont template heuristics point
- make easier to use in common extension point e.g. lookup table
# what
- break template heuristic into base + triton
- move k_split generation logic into a templateheuristic for decompose k
- register through normal mechanism
- to make testing work, add a context manager to temporarily set template heuristics for a template/op to empty (effectively skipping it). This is used for decompose k test to disable triton choices
# testing
```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v
```
Differential Revision: [D80670918](https://our.internmc.facebook.com/intern/diff/D80670918)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161098
Approved by: https://github.com/jansel
ghstack dependencies: #161026, #161097
# why
- make it easier to integrate into lookup table later
# what
- current version generates templates on the fly and uses them to generate a single choice
- lookup table and performance model work best when there is a stable set of templates (with predictable names) and those are then parametrized
- this change makes it so that there is a single DecomposeK template with a stable name, and the k split is the only parametrization we do
# testing
```
python3 -bb -m pytest test/inductor/test_max_autotune.py::TestMaxAutotune::test_max_autotune_decompose_k_dynamic_False_bfloat16_sizes1 -v
```
Differential Revision: [D80670913](https://our.internmc.facebook.com/intern/diff/D80670913)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161026
Approved by: https://github.com/PaulZhang12, https://github.com/jansel
Summary:
Add fastResizeToZero whenever we are reusing output tensors. Otherwise it keeps throwing this warning:
```
Warning: An output with one or more elements was resized since it had shape [10], which does not match the required output shape [181]. This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0). (function _resize_output_check)
```
Test Plan:
Run local replayer.
```
MODEL_TYPE=ads_mtml_offsite_cvr_oba_optout_dedicated_model
MODEL_ENTITY_ID=786096203
SNAPSHOT_ID=11
HARDWARE_TYPE=1 ./sigrid/predictor/scripts/start_gpu_with_gif.sh ${MODEL_ENTITY_ID}_${SNAPSHOT_ID} /data/users/$USER/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID} 3443 2>&1 | tee ~/logs/${MODEL_TYPE}/predictor_${MODEL_ENTITY_ID}_${SNAPSHOT_ID}
sigrid/predictor/scripts/start_gpu_replayer_localhost_with_gif.sh ${MODEL_ENTITY_ID}_${SNAPSHOT_ID} 1000 ${MODEL_TYPE} /data/users/$USER/requests/filter_requests_ads_mtml_offsite_cvr_oba_optout_dedicated_model_100 localhost /data/users/$USER/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID} false 3443 false 2>&1 | tee ~/logs/${MODEL_TYPE}/replayer_${MODEL_ENTITY_ID}_${SNAPSHOT_ID}
```
Before: P1921177565
After: P1921178087
Rollback Plan:
Differential Revision: D81177596
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161679
Approved by: https://github.com/henryoier
Summary: Since `allow_complex_guards_as_runtime_asserts` is now sync'd with `prefer_deferred_runtime_asserts_over_guards`, we can kill the former (especially since it was a export-only concept).
Test Plan:
updated tests
Rollback Plan:
Differential Revision: D79903317
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160198
Approved by: https://github.com/ezyang
This PR fixes:
- NumPy >= 2.1 version detection (instead of Python 3.13 version detection) to skip some tests, since NumPy 2.1 can be installed for older Python versions (see the sketch after this list):
```
test_quantization.py::TestDynamicQuantizedOps::test_qlinear
test_quantization.py::TestDynamicQuantizedOps::test_qlinear_legacy
test_quantization.py::TestQuantizedLinear::test_qlinear
test_quantization.py::TestQuantizedLinear::test_qlinear_leaky_relu
test_quantization.py::TestQuantizedLinear::test_qlinear_relu
test_quantization.py::TestQuantizedLinear::test_qlinear_tanh
test_quantization.py::TestQuantizedLinear::test_qlinear_with_input_q_dq_qweight_dq_output_fp32
```
- A couple of SDPA tests on MI355 by adjusting fudge_factors:
```
test_transformers.py::TestSDPACudaOnlyCUDA::test_mem_efficient_attention_attn_mask_vs_math_ref_grads_batch_size_1_seq_len_q_2048_seq_len_k_8_head_dim_8_is_causal_False_dropout_p_0_0_float32_scale_l1_cuda_float32
test_transformers.py::TestSDPACudaOnlyCUDA::test_mem_efficient_attention_vs_math_ref_grads_batch_size_8_seq_len_q_2048_seq_len_k_8_head_dim_128_is_causal_True_dropout_p_0_0_float32_scale0_cuda_float32
```
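An illustrative sketch of the version gate (the real tests use the repo's own test decorators; the class and helper below are hypothetical):
```python
import unittest

import numpy as np

# Gate on the installed NumPy version rather than on the Python version.
NUMPY_2_1_PLUS = tuple(int(p) for p in np.__version__.split(".")[:2]) >= (2, 1)

class TestQuantizedLinear(unittest.TestCase):
    @unittest.skipIf(NUMPY_2_1_PLUS, "not supported with NumPy >= 2.1")
    def test_qlinear(self):
        ...
```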
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161429
Approved by: https://github.com/jeffdaily
This PR introduces a device_assert op to trigger device-side assertions within torch.compile. This implementation is based on the suggestion in [this comment](https://github.com/pytorch/pytorch/issues/147282#issuecomment-2756056084); a usage sketch follows the change list below.
Changes Included
- Implemented the device_assert op and overrode has_side_effect to return True to avoid removal by dead code elimination.
- Commented out the assert_async_msg_decomp and functional_assert_async_msg_decomp decompositions to disable the default assert decomposition inside Inductor.
- Added lowering for torch.ops.aten._assert_async.msg to convert assert calls into the ops_handler.
- Implemented the codegen method for the device_assert op. This supports generating C++ and Triton code.
- Added test cases to verify both "should throw" and "should not throw" scenarios.
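For reference, a small usage sketch of the behavior this enables (the wrapper function is hypothetical; `torch.ops.aten._assert_async.msg` is the op named above, and a CUDA device is assumed so the assert runs on-device):
```python
import torch

@torch.compile
def guarded_relu(x):
    # Lowered to a device-side assert instead of being decomposed away.
    torch.ops.aten._assert_async.msg(x.isfinite().all(), "input must be finite")
    return torch.relu(x)

out = guarded_relu(torch.randn(8, device="cuda"))
```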
Fixes#147282
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160677
Approved by: https://github.com/mlazos, https://github.com/atalman
And actually use the right function, as [`torch.round`](https://docs.pytorch.org/docs/stable/generated/torch.round.html) doesn't use `std::round`, but rather `std::rint`, which can be easily seen by running something like
```python
import torch
print(torch.arange(-3., 3., step=.5, device='mps').round())
print(torch.arange(-3., 3., step=.5, device='mps').cpu().round())
```
Before this change it printed
```
tensor([-3., -3., -2., -2., -1., -1., 0., 1., 1., 2., 2., 3.], device='mps:0')
tensor([-3., -2., -2., -2., -1., -0., 0., 0., 1., 2., 2., 2.])
```
But after this change results match
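For reference, the two conventions differ only on ties; a quick Python illustration (Python's built-in `round` is round-half-to-even, like `std::rint` and `torch.round`, while `std::round` rounds halves away from zero):
```python
import math

def round_half_away(x):  # std::round-style rounding
    return math.floor(x + 0.5) if x >= 0 else math.ceil(x - 0.5)

for v in (-2.5, -1.5, -0.5, 0.5, 1.5, 2.5):
    print(v, round_half_away(v), round(v))  # e.g. 2.5 -> 3 vs 2
```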
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161712
Approved by: https://github.com/dcci
Mainly, this helps tell the user more info about the operator that failed to run if it fails during sharding propagation.
Previously, only this exception would be raised:
```
RuntimeError: ('Attempted to flatten sharded dimension 1, ', 'but only the leftmost dim of a Flatten can be sharded.')
```
Now you get both the above exception as well as
```
The above exception was the direct cause of the following exception:
RuntimeError: Sharding propagation failed for Op(op=aten.view.default, args_schema=Spec((Replicate(), Shard(dim=0), Shard(dim=1), Shard(dim=2)) on (8, 8, 4)), [64, 4] @ mesh: (1, 2, 2, 2))
```
<stacktrace omitted>
<details><summary>detailed error</summary>
```
======================================================================
ERROR: test_linear (__main__.TestDTensor)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/data/users/whc/pytorch/torch/testing/_internal/common_distributed.py", line 668, in wrapper
self._join_processes(fn)
File "/data/users/whc/pytorch/torch/testing/_internal/common_distributed.py", line 932, in _join_processes
self._check_return_codes(fn, elapsed_time)
File "/data/users/whc/pytorch/torch/testing/_internal/common_distributed.py", line 972, in _check_return_codes
raise RuntimeError(error)
RuntimeError: Process 4 exited with error code 10 and exception:
Traceback (most recent call last):
File "/data/users/whc/pytorch/torch/distributed/tensor/_dispatch.py", line 150, in dispatch
self.sharding_propagator.propagate(op_info)
File "/data/users/whc/pytorch/torch/distributed/tensor/_sharding_prop.py", line 309, in propagate
OutputSharding, self.propagate_op_sharding(op_info.schema)
File "/data/users/whc/pytorch/torch/distributed/tensor/_sharding_prop.py", line 45, in __call__
return self.cache(*args, **kwargs)
File "/data/users/whc/pytorch/torch/distributed/tensor/_sharding_prop.py", line 329, in propagate_op_sharding_non_cached
op_strategy = self.op_strategy_funcs[op_schema.op](strategy_schema)
File "/data/users/whc/pytorch/torch/distributed/tensor/_ops/_view_ops.py", line 673, in reshape_strategy
input_tgt_placements, output_placements = propagate_shape_and_sharding(
File "/data/users/whc/pytorch/torch/distributed/tensor/_ops/_view_ops.py", line 601, in propagate_shape_and_sharding
in_dim = get_in_dim_to_shard(cmd)
File "/data/users/whc/pytorch/torch/distributed/tensor/_ops/_view_ops.py", line 537, in get_in_dim_to_shard
raise RuntimeError(
RuntimeError: ('Attempted to flatten sharded dimension 1, ', 'but only the leftmost dim of a Flatten can be sharded.')
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/data/users/whc/pytorch/torch/testing/_internal/common_distributed.py", line 816, in run_test
getattr(self, test_name)()
File "/data/users/whc/pytorch/torch/testing/_internal/common_distributed.py", line 670, in wrapper
fn()
File "/data/users/whc/pytorch/torch/testing/_internal/common_utils.py", line 3224, in wrapper
method(*args, **kwargs)
File "/data/users/whc/pytorch/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 490, in wrapper
raise e
File "/data/users/whc/pytorch/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 487, in wrapper
func(self, *args, **kwargs) # type: ignore[misc]
File "/data/users/whc/pytorch/test.py", line 60, in test_linear
print("results: ", distributed_linear(distributed_input))
File "/data/users/whc/pytorch/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/data/users/whc/pytorch/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
File "/data/users/whc/pytorch/torch/nn/modules/linear.py", line 134, in forward
return F.linear(input, self.weight, self.bias)
File "/data/users/whc/pytorch/torch/_compile.py", line 53, in inner
return disable_fn(*args, **kwargs)
File "/data/users/whc/pytorch/torch/_dynamo/eval_frame.py", line 1005, in _fn
return fn(*args, **kwargs)
File "/data/users/whc/pytorch/torch/distributed/tensor/_api.py", line 358, in __torch_dispatch__
return DTensor._op_dispatcher.dispatch(
File "/data/users/whc/pytorch/torch/distributed/tensor/_dispatch.py", line 163, in dispatch
raise RuntimeError(
RuntimeError: Sharding propagation failed for Op(op=aten.view.default, args_schema=Spec((Replicate(), Shard(dim=0), Shard(dim=1), Shard(dim=2)) on (8, 8, 4)), [64, 4] @ mesh: (1, 2, 2, 2))
```
</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161574
Approved by: https://github.com/zpcore, https://github.com/XilunWu
Reland of https://github.com/pytorch/pytorch/pull/159923
Couple of fixes:
1. When we run into an operation we didn't proxy, we end up emitting fake constants. We detect this and warn using the FQN of the lifted constant. We warn because some internal users complained it was regressing their exportability.
2. Previous attribute mutation detection logic in non-strict didn't account for nested module structure. This fixes silent incorrectness issue of exporting esm and qwen in non-strict
3. We modify yolov3 to fix the previously silently incorrect behaviour
4. We use strict export for levit_128 because it errors in non-strict due to more strict side effect checking
When upgrading the torchbench pin, opacus_cifar10 seems to not run on eager anymore. I verified this by pushing a temporary PR on master with the new pin. So I added it to the expect_fail list.
Differential Revision: [D81133908](https://our.internmc.facebook.com/intern/diff/D81133908)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161589
Approved by: https://github.com/avikchaudhuri
`test_symmetric_memory.py` hangs like this:
```
SymmetricMemoryTest::test_empty_strided_p2p_persistent_set_device_False PASSED [5.6364s]
SymmetricMemoryTest::test_empty_strided_p2p_persistent_set_device_True ...
```
This set of tests parameterizes whether user sets the device before calling `symm_mem.emtpy`.
However, such parametrization does not work well with `MultiProcContinuousTest` because the set device will "contaminate" the next test function.
Solution is to move the "set device" tests to a separate test suite using the traditional `MultiProcessTestCase`, which would respawn processes every time.
Hang is gone now.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161668
Approved by: https://github.com/fegin
SAC interaction with triton kernel:
- In eager, triton ops are not dispatchable, and so it is always ignored by SAC, i.e., always recomputed.
- In compile, although we wrap triton kernels into HOPs, allowing us to intercept them, we still recompute by default rather than save by default, so that compile maintains the invariant of using less memory than eager.
- If you want to do something else (e.g. save the output of your triton kernel) you should wrap it in a custom op.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161541
Approved by: https://github.com/drisspg, https://github.com/zou3519
ghstack dependencies: #160781
The PR #160222 replaced @skipCUDAIf with @requires_cuda_and_triton in test_torchinductor_opinfo.py, which caused the CI jobs for other devices to skip this large test suite. We attempted to revert #160222 but ran into conflicts. I then opened #160936 to revert the changes from #160222, but that resulted in CPU CI job timeouts. I also filed issue #161132 for assistance, but haven’t received a response yet.
To minimize the impact, this PR re-enables the test suite on XPU first. I will continue to seek help on re-enabling it for CPU afterwards.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161477
Approved by: https://github.com/jansel
If TorchDispatchMode.ignore_compile_internals() is True, then we turn off the TorchDispatchMode during the compilation process, instead turning it back on during runtime of the compiled artifact.
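A hedged sketch of opting a mode out of compile internals (the classmethod name follows the description above; the logging mode itself is illustrative):
```python
import torch
from torch.utils._python_dispatch import TorchDispatchMode

class LoggingMode(TorchDispatchMode):
    @classmethod
    def ignore_compile_internals(cls):
        return True  # stay out of the way while Dynamo/Inductor compile

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        print("dispatch:", func)
        return func(*args, **(kwargs or {}))

@torch.compile
def f(x):
    return torch.sin(x) + 1

with LoggingMode():
    f(torch.randn(4))  # ops are logged when the compiled artifact runs, not during compilation
```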
Test Plan:
- new test
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161648
Approved by: https://github.com/bdhirsh
Adds the pre-dispatch handling for the AC HOP. This lets the HOP be pre-dispatch exported without actually pre-dispatch tracing into it. However, this is not sufficient to support AC in export:
- because the HOP body will still be in torch IR, so it will fail export verifiers
- the exported module also can't be ran in eager because the AC HOP relies on partitioner to embed RNG state saving/restoring
So it must be lowered by AOT Autograd into post-dispatch first before being executed. It suffices for my purposes, though.
If users had checkpoint API use in their exported model, the behavior goes from silently incorrect to a validation error.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161479
Approved by: https://github.com/ydwu4
ghstack dependencies: #161353
# Context
In #161183, we added NUMA-binding support for `Callable` entrypoints to `elastic_launch`.
However, we would raise an exception if the subprocesses would be spawned in parallel via `ThreadPoolExecutor`, which is an option configurable via the `TORCH_MP_PARALLEL_START` environment variable (see diff).
The logic here was that `os.sched_setaffinity`, which we used to set CPU affinities, is [per process](https://docs.python.org/3/library/os.html#os.sched_setaffinity), so there could be a race condition during a parallel start:
> Restrict the process with PID pid (or the current process if zero) to a set of CPUs. mask is an iterable of integers representing the set of CPUs to which the process should be restricted.
But on further reading, the Linux docs say [`sched_setaffinity` is per *thread*.](https://man7.org/linux/man-pages/man2/sched_setaffinity.2.html) As it turns out, the Python doc is misleading here.
I [verified that `sched_setaffinity` only affects the calling thread, not the entire calling process.](https://gist.github.com/pdesupinski/7e2de3cbe5bb48d489f257b83ccddf07)
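A condensed sketch of the same check (Linux-only; pid 0 targets the calling thread):
```python
import os
import threading

def pin_and_report(cpu):
    os.sched_setaffinity(0, {cpu})  # affects only this worker thread
    print("worker affinity:", os.sched_getaffinity(0))

t = threading.Thread(target=pin_and_report, args=(0,))
t.start()
t.join()
print("main affinity:  ", os.sched_getaffinity(0))  # unchanged by the worker
```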
The upshot is that we actually *can* safely use the inheritance trick from #161183 even with parallel start, since the setting will be inherited from the calling thread, and `os.sched_setaffinity` only affects the calling thread.
# This PR
Remove restrictions against parallel start for NUMA binding.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161576
Approved by: https://github.com/d4l3k
Prints ranges of ranks succinctly.
e.g.
For a strided list of ranks, summarizes down to start:stop:step
```
0:4096:512
```
Omits step if it's 1
```
0:8
```
Note: endpoints are exclusive. This may not be intuitive to everyone, but in the first example above the last rank is 3584, and in the second it is 7.
Currently, this does not support combinations of striding _and_ ranges (e.g. it cannot generate a representation like "0:2, 4:6, ..., 12:14"). Is this needed/useful? If so, it could be added.
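A toy sketch of the summarization for the simple strided case (illustrative helper, not the c10d code):
```python
def summarize_ranks(ranks):
    ranks = list(ranks)
    if len(ranks) >= 2:
        step = ranks[1] - ranks[0]
        if step > 0 and ranks == list(range(ranks[0], ranks[-1] + 1, step)):
            stop = ranks[-1] + step  # exclusive endpoint
            return f"{ranks[0]}:{stop}" if step == 1 else f"{ranks[0]}:{stop}:{step}"
    return ",".join(map(str, ranks))

print(summarize_ranks(range(0, 4096, 512)))  # 0:4096:512
print(summarize_ranks(range(0, 8)))          # 0:8
```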
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160284
Approved by: https://github.com/XilunWu
ProfilingGraphExecutor works like this:
1. do some unrelated JIT optimizations
2. Add profiling nodes to collect JIT information like tensor dtypes and shapes
3. Do some more unrelated JIT optimizations
4. Remove the profiling nodes and extract the tensor info, and then use the JIT tensor info to do optimizations.
This PR is intended to fix a bug in Step 4, where the profiling nodes were removed. It was previously assumed that all the things that were profiled were either Tensors or Optional[Tensor]s - otherwise, step 2 would not have introduced a profiling node.
However, we saw a case where step 3 would replace Optional[Tensor] inputs with `None` inputs (e.g. if a conditional that returned a Tensor or a None could be statically known to only follow the `None` branch).
To fix this, we essentially just modify the RemoveProfileNodesAndSpecializeTypes assert so that it accepts Tensors, Optional[Tensor]s, or None (the new part).
Note that this issue is probably somewhat uncommon (maybe why we didn't see it for the first 4 years that this code existed). I expect that, typically, any time that step 3 would convert `Optional[Tensor] -> None`, step 1 would have already done that. So it's difficult to reproduce in an end-to-end TorchScript workload.
Differential Revision: [D81068172](https://our.internmc.facebook.com/intern/diff/D81068172)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161538
Approved by: https://github.com/nmacchioni
Summary:
In preparation for checking shape guards in export, this PR effectively switches `prefer_deferred_runtime_asserts_over_guards` to `False`, matching Dynamo.
Actually that's a lie: we switch it to `allow_complex_guards_as_runtime_asserts`, which is `False` by default but can be controlled via an internally API to be `True`. This makes the two flags synchronized, so we should be able to kill `allow_complex_guards_as_runtime_asserts` at this point.
Test Plan:
updated tests
Rollback Plan:
Differential Revision: D79734206
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160111
Approved by: https://github.com/tugsbayasgalan
Fixes#161640
Check if tensors are contiguous before using the no-graph implementation. Using the script in the issue above with this change I get expected results.
```
MPS contiguous result sample: tensor([ 1.3600, -2.9516, 1.3207, -3.5132, 1.7061], device='mps:0')
MPS non-contig result sample: tensor([ 1.3600, -2.9516, 1.3207, -3.5132, 1.7061], device='mps:0')
CPU non-contig result sample: tensor([ 1.3600, -2.9516, 1.3207, -3.5132, 1.7061])
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161641
Approved by: https://github.com/malfet
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Summary: When performing constant folding, we must skip over operators that have symbolic `fill_value`.
Test Plan:
CI
Rollback Plan:
Reviewed By: kalpit-meta-1
Differential Revision: D80965936
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161437
Approved by: https://github.com/StellarrZ
Nested continuation function code objects are now unique w.r.t. stack trace below (and including) the current code object.
Without this change, e.g. in the added test, `f3` would be recompiled on the second graph break.
Followup: we can skip guards on continuation functions.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159786
Approved by: https://github.com/anijain2305
ghstack dependencies: #159329, #159678, #159817, #160138
Fix comments to reflect that we no longer codegen cells to be sent to resume function as inputs - they are instead codegen'd after the unsupported instruction in order to build resume functions that are closures.
Also simplify some codegen.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160138
Approved by: https://github.com/anijain2305
ghstack dependencies: #159329, #159678, #159817
This method never triggered. It's used in 2 tests and they pass, so no serious
concern.
Note that I did introduce and fix a latent bug: if we called shutdown_compile_workers, jobs would crash with this change because ready_future would already be finished when we called wait.
However, we only call wait in tests, so that bug is harmless.
The other behaviour is that if you called shutdown, I believe we could block on your first triton compile after that, until the pool was ready. This change should correctly switch to direct mode until the pool is ready on later warmups.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161561
Approved by: https://github.com/masnesral
ghstack dependencies: #161452
Added an optional name argument to SubprocPool.submit.
We record this in a dictionary, and when raising exceptions, add the name.
We manage the lifecycle the same as the pending futures.
Added a specific testcase to make sure this logs correctly.
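A minimal sketch of the general pattern (illustrative only; this is not the actual `SubprocPool` API or its internals): remember a name per submitted job with the same lifecycle as the pending future, and attach it when surfacing a failure.
```python
import concurrent.futures

class NamedPool:
    """Toy wrapper: remember a name per submitted job and attach it to errors."""

    def __init__(self, max_workers=4):
        self._pool = concurrent.futures.ProcessPoolExecutor(max_workers)
        self._names = {}  # future -> name; cleaned up alongside the pending future

    def submit(self, fn, *args, name=None):
        fut = self._pool.submit(fn, *args)
        if name is not None:
            self._names[fut] = name
        return fut

    def result(self, fut):
        try:
            return fut.result()
        except Exception as e:
            raise RuntimeError(f"job {self._names.get(fut, '<unnamed>')!r} failed") from e
        finally:
            self._names.pop(fut, None)  # same lifecycle as the pending future
```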
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161452
Approved by: https://github.com/masnesral
# Feature
2d launch grids with dynamic shapes can contain sympy expressions like `floor(x / 128 + y / 128)`. This breaks the dynamic shapes tracer which only supports `FloorDiv`, and not `floor`. To handle this case, call `sympy.together` prior to pattern matching to convert this to `floor((x + y) / 128)`. Then, we can recognize the pattern and map it to `FloorDiv(x + y, 128)`.
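For reference, a small standalone sympy check of the rewrite described above (the Inductor pattern matcher does its own wrapping around this):
```python
import sympy

x, y = sympy.symbols("x y", integer=True, nonnegative=True)

inner = x / 128 + y / 128
combined = sympy.together(inner)
print(combined)               # (x + y)/128
print(sympy.floor(combined))  # floor((x + y)/128), which maps onto FloorDiv(x + y, 128)
```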
# Test plan
Added a custom Triton test exposing this. The test calls a 2d autotuned kernel with dynamic shapes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161582
Approved by: https://github.com/nandesuka
Adding a new feature to torch.compile(fullgraph=True) which "aot_compile" a function with given example inputs.
On user side it should look like:
```
def foo(x, y):
    return x + y
compiled_fn = torch.compile(fullgraph=True).aot_compile(((torch.randn(3, 4), torch.randn(3, 4)), {}))
```
This is different from the traditional `torch.compile` workflow where compiled object will be a drop-in replacement for the original eager model:
```
tensor input -> torch.compile() -> tensor output (and populates the cache entry)
```
`aot_compile` will instead return a compiled function as result, and it's purely functional and doesn't populate the compile cache entry in dynamo:
```
tensor input -> aot_compile() -> compiled function
```
The aot compiled function will be savable and loadable on disk as well:
```
torch.compile(fullgraph=True).aot_compile(...).save_compiled_function('my/path')
compiled_fn = torch.compiler.load_compiled_function("my/path")
```
Right now we treat the compiler backend as a blackbox and it needs to implement the following interface to make compile artifacts serializable:
```
class SerializableCallable:
    def save_compile_artifacts(): ....
    def load_compile_artifacts(): ....
```
We haven't implemented this for inductor yet, but this shouldn't be an issue since we gate this feature through `torch._dynamo.config.aot_compile` (which defaults to False); this will be left as a follow-up to the current PR.
Differential Revision: [D80914270](https://our.internmc.facebook.com/intern/diff/D80914270/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161383
Approved by: https://github.com/tugsbayasgalan
Including below changes,
- Add XPU support package 2025.2 build and test in CI for both Linux and Windows
- Keep XPU support package 2025.1 build in CI to ensure no break issue until PyTorch 2.9 release
- Upgrade XPU support package from 2025.1 to 2025.2 in CD for both Linux and Windows
- Rename Linux CI job name & image name to n & n-1
- Update XPU runtime pypi packages dependencies of CD wheels
- Remove deprecated support package version docker image build
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158733
Approved by: https://github.com/EikanWang, https://github.com/atalman
Summary:
It's hard to understand how it's working in most of our models, but in general it looks like `aten::copy_` is replaced incorrectly.
There are two schemas for `aten::copy_`:
1. `aten::copy_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)`
2. `aten::copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)`
According to the logic in the comments, we don't need one of the parameters for `aten::index_put_`.
It seems the logic was inferred from the ordinary `aten::copy`, where there can be a third parameter, the `non_blocking` flag.
Depending on the execution environment, the sliced copy can be replaced either by the first schema or by the second schema with the default parameter explicitly set to `False`.
If the first schema is selected, it leads to a crash (which is easy to catch in our prod env). If the second schema is selected, there is no crash, but the third parameter is treated as the `accumulate` parameter of `index_put_`, which doesn't make sense.
So, in any case, the third parameter must be dropped from the `aten::copy_` replacement.
For more details, check this post:
https://fb.workplace.com/groups/1405155842844877/permalink/25337687649165028/
Test Plan:
The test fails in the production environment only.
In the test env the `non_blocking` flag is mapped as `False` to the `accumulate` flag, which doesn't cause the test to fail, but makes no sense in terms of flag mapping.
The export works without errors; before the fix it was failing with an out-of-bounds vector access, like this:
```
1095 _C._jit_onnx_log("Torch IR graph at exception: ", graph)
File ~/.bento/kernels/bento_kernel_gaia_ml/1578/bento_kernel_gaia_ml_binary-inplace#link-tree/torch/onnx/utils.py:636, in _optimize_graph(graph, operator_export_type, _disable_torch_constant_prop, fixed_batch_size, params_dict, dynamic_axes, input_names, module)
629 _C._jit_pass_lower_all_tuples(graph)
630 # in _jit_pass_onnx, symbolic functions are called for each node for conversion.
631 # However, there are nodes that cannot be converted without additional context.
632 # For example, the number of outputs from split (and whether it is static or dynamic) is unknown
633 # until the point where it is unpacked by listUnpack node.
634 # This pass does a preprocess, and prepares the nodes such that enough context can be received
635 # by the symbolic function.
--> 636 _C._jit_pass_onnx_remove_inplace_ops_for_onnx(graph, module)
637 _C._jit_pass_onnx_preprocess(graph)
639 # onnx does not support tuples, so try to remove them
RuntimeError: vector::_M_range_check: __n (which is 2) >= this->size() (which is 2)
```
The test script:
```
import torch as th
import tempfile
class CopyTest(th.nn.Module):
    def forward(
        self,
        input_th: th.Tensor,
    ):
        to_fill = th.ones((3, 3))
        to_fill[:, 0] = input_th[:, 0]
        return to_fill
m = CopyTest()
test_tensor = th.zeros((3, 3))
with tempfile.NamedTemporaryFile() as f:
    th.onnx.export(
        m,
        (test_tensor,),
        f,
        export_params=True,
        opset_version=17,
        do_constant_folding=True,
        input_names=["input"],
        output_names=["features"],
        dynamo=False,
    )
```
The exported model test:
```
import torch
import onnx
import onnxruntime
model_name = '/home/ironsided/test_model.onnx'
onnx_model = onnx.load(model_name)
onnx.checker.check_model(onnx_model)
example_inputs = (torch.zeros(3, 3),)
onnx_inputs = [tensor.numpy(force=True) for tensor in example_inputs]
print(f"Input length: {len(onnx_inputs)}")
print(f"Sample input: {onnx_inputs}")
ort_session = onnxruntime.InferenceSession(
    model_name, providers=["CPUExecutionProvider"]
)
onnxruntime_input = {input_arg.name: input_value for input_arg, input_value in zip(ort_session.get_inputs(), onnx_inputs)}
# ONNX Runtime returns a list of outputs
onnxruntime_outputs = ort_session.run(None, onnxruntime_input)[0]
print(onnxruntime_outputs)
```
The produced result is correct:
```
Input length: 1
Sample input: [array([[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.]], dtype=float32)]
[[0. 1. 1.]
[0. 1. 1.]
[0. 1. 1.]]
```
Rollback Plan:
Differential Revision: D80797028
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161263
Approved by: https://github.com/justinchuby, https://github.com/jermenkoo
Summary:
- Emit a structured trace per compiled graph execution to reconstruct execution order in TLParse.
- Adds debug.log_graph_execution(name) called from `CompiledFxGraph.__call__`, producing an artifact named inductor_graph_execution with payload {"graph": "graph_<id>"}.
Testing:
- Add inline test to verify structure and output
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160448
Approved by: https://github.com/xmfan
This PR introduces a device_assert op to trigger device-side assertions within torch.compile. This implementation is based on the suggestion in [this comment](https://github.com/pytorch/pytorch/issues/147282#issuecomment-2756056084).
Changes Included
- Implemented the device_assert op and overrode has_side_effect to return True to avoid removal by dead code elimination.
- Commented out the assert_async_msg_decomp and functional_assert_async_msg_decomp decompositions to disable the default assert decomposition inside Inductor.
- Added lowering for torch.ops.aten._assert_async.msg to convert assert calls into the ops_handler.
- Implemented the codegen method for the device_assert op. This supports generating C++ and Triton code.
- Added test cases to verify both "should throw" and "should not throw" scenarios.
Fixes#147282
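A hedged usage sketch of what this enables inside a compiled region (requires a CUDA device; the exact user-facing spelling may differ, but this targets the `_assert_async.msg` lowering added here):
```python
import torch

def f(x):
    # Device-side check: the assert fires on the GPU if the condition tensor is false.
    torch.ops.aten._assert_async.msg((x > 0).all(), "x must be strictly positive")
    return x.sqrt()

compiled = torch.compile(f)
compiled(torch.rand(8, device="cuda") + 0.1)  # passes
# compiled(torch.zeros(8, device="cuda"))     # would trigger the device-side assertion
```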
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160677
Approved by: https://github.com/mlazos
To facilitate the integration of new backends, we plan to publish a new development note that details all the key components, hoping to speed up the development of other accelerators.
This PR is the beginning of that note; it covers the registration of operators, and we will gradually improve it and keep it in sync with OpenReg's code.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158644
Approved by: https://github.com/albanD
Summary:
Use debug handle on kernel names to distinguish different calls to the same kernel.
Previous kernel name: kernel_name
New kernel name: kernel_name:debug_handle
We add the debug handle to the tlparse artifacts: `inductor_provenance_tracking_node_mappings` and `inductor_provenance_tracking_kernel_stack_traces`.
We also add debug handles in the comments of the generated code so we can map to them in the provenance tracking highlighter tool: https://github.com/pytorch/tlparse/pull/134
Example output code is below. If a kernel doesn't have a debug handle, the `[Provenance debug handles]` comment line will not be written.
```
# Topologically Sorted Source Nodes: [y, z], Original ATen: [aten.addmm, aten.gelu]
# [Provenance debug handles] triton_poi_fused_addmm_gelu_2:3
stream0 = get_raw_stream(0)
triton_poi_fused_addmm_gelu_2.run(buf4, primals_5, 300, stream=stream0)
```
The debug handles will also be used by downstream profilers such as zoomer.
Test Plan:
```
buck run mode/opt fbcode//caffe2/test/inductor:provenance_tracing
```
Rollback Plan:
Differential Revision: D78994959
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161110
Approved by: https://github.com/angelayi
(Porting most of #161008)
Hooking the SymmetricMemory allocator up to MemPool so that users can create symmetric tensors with regular factories such as `torch.zeros`, `torch.arange`, etc., and so that our ops can have functional variants that create `out` tensors on symmetric memory.
To end users, this PR supports a Python UI as follows:
```
allocator = symm_mem.get_mempool_allocator(device)
mempool = torch.cuda.MemPool(allocator)
with torch.cuda.use_mem_pool(mempool):
    tensor = torch.arange(numel, dtype=dtype, device=device)
```
Added tests for both use cases above.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161471
Approved by: https://github.com/ngimel
ghstack dependencies: #161470
For kernels that need peer pointers directly, the rendezvous handle should let the user get the offset of the tensor w.r.t. the base allocation address, hence the need to add an `offset` field to the SymmMem handle.
But we don't want to cache all the handles just because they have different offsets, hence the search-and-cache logic below:
(i) At rendezvous, the search key is still `x.storage().data_ptr()`, like now, but the search happens in two parts: the first is a plain dictionary lookup, like today; if that fails, we search `allocations_` to see whether the storage ptr falls in one of the segments. This is possible because we record all segments during alloc.
(ii) If this segment hasn't been rendezvoused, we rendezvous it and cache it in the `symm_mem_` map with its base address as the key.
(iii) We still need to return a handle for the current tensor, with a corresponding offset. This handle will be a shallow copy of the base handle, with the offset adjusted.
Some impl details:
(i.1) If we find a matching allocation, we can immediately use the allocation base address to do a re-search in `symm_mem_`.
(iii.1) To make the handle copy shallow, we move the common information -- base ptrs, base signal pad, etc. -- into a structure referenced by both handles. The structure is called `NVSHMEMPeerAllocInfo`. Copying a handle just adds one more `intrusive_ptr` to it. The handle copy constructor accepts an `offset` argument.
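In Python-flavored pseudocode, the lookup in (i)–(iii) is roughly the following (a sketch only; the real implementation is C++ and the helper names here are made up):
```python
class SymmMemCache:
    def __init__(self):
        self.symm_mem_ = {}      # base address -> rendezvoused base handle
        self.allocations_ = {}   # base address -> allocation size, recorded at alloc time

    def rendezvous(self, storage_ptr):
        # (i) fast path: the storage pointer is itself a cached base address
        if storage_ptr in self.symm_mem_:
            return self.symm_mem_[storage_ptr]
        # (i) slow path: find the recorded segment containing this pointer
        base = next(b for b, size in self.allocations_.items() if b <= storage_ptr < b + size)
        # (i.1)/(ii) rendezvous the base allocation once, cached by base address only
        handle = self.symm_mem_.setdefault(base, self._do_rendezvous(base))
        # (iii) shallow copy sharing the peer info, carrying only a different offset
        return handle.with_offset(storage_ptr - base)

    def _do_rendezvous(self, base):
        raise NotImplementedError  # exchanges peer ptrs / signal pads in the real code
```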
Test:
Existing tests should not fail.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161470
Approved by: https://github.com/ngimel
# Feature
Add support for custom Triton kernels to the FX backend. This turned out not to require any new features, except for a minor change to handle `tl.constexpr` arguments which are not part of the autotuning config.
# Caveat
This may not cover every possible case. For example, we might need more features for autotuning custom Triton code. This PR entirely skips the [custom codegen ](https://github.com/pytorch/pytorch/blob/main/torch/_higher_order_ops/triton_kernel_wrap.py#L1034-L1039) for user-defined grid functions, but there may be edge cases requiring this logic. However, this PR seems to do a reasonable job as many of the grids end up being written into Inductor/Triton metadata and don't require special codegen.
As a follow up, I'm planning to test this against all of AOTI's custom Triton kernel tests.
# Test plan
Added a CI test using a custom Triton kernel.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161474
Approved by: https://github.com/angelayi
We already have a context manager "set_checkpoint_early_stop". This PR adds a kwarg that toggles the same setting.
It is also useful to have a kwarg version of the setting in addition to the context manager because it is annoying to apply a context manager when AC is being applied via CheckpointWrapper.
Similar to the "debug" kwarg and the corresponding "set_checkpoint_debug_enabled" context manager, the context manager defaults to None and overrides the local setting when non-None.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160781
Approved by: https://github.com/tianyu-l
If the ONNX exporter falls back to draft_export with big models, it takes forever for users and can spam the printout, which keeps users from seeing their stack trace with strict=False.
We could consider making draft_export a separate debugging API, or combining it with report=True when the model is small.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161454
Approved by: https://github.com/justinchuby
This change removes the need for fences in global_reduce by converting the stores to reduce_buffer[] into atomics + return. This is crucial for performance on architectures with split caches (e.g. MI300), where fences are inherently costly.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161180
Approved by: https://github.com/jeffdaily
Co-authored-by: Jeff Daily <jeff.daily@amd.com>
Summary:
convert_frame.compile_frame used to take a callback transform function that captures the frame object it holds, but the frame information was not passed directly into the compile_frame function.
This PR changes the signature of compile_frame so that frame information is directly passed in the function without taking a callback. This makes it easier to build fullgraph capture API on top of compile_frame.
Test Plan:
CI
Rollback Plan:
Differential Revision: D81041296
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161514
Approved by: https://github.com/tugsbayasgalan
```python
import torch
torch._dynamo.config.capture_scalar_outputs = True
class M(torch.nn.Module):
    def forward(self, idx, x):
        u0 = idx.item()
        x0 = x.select(0, u0)

        def fn():
            return x0.sin()

        return torch.cond(x0.sum() > 0, fn, fn)
m = M()
out = torch.compile(m, fullgraph=True)(torch.tensor(0, dtype=torch.int64, device="cuda"), torch.randn(3, 3, device="cuda"))
print(out)
```
Before this PR, we didn't track the storage_offset symbol of a tensor. After https://github.com/pytorch/pytorch/pull/157605, we create an unbacked symint for the storage_offset of the result of select. So when we try to lift the free basic symbols of x0 while speculating fn, we find a free symbol that's not bound to a proxy.
This PR tracks the storage_offset symbols and associates them with a proxy using torch.ops.aten.storage_offset.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161199
Approved by: https://github.com/zou3519
ghstack dependencies: #161198
Before the change in this PR, we have an error for the following code
```python
import torch
torch._dynamo.config.capture_scalar_outputs = True
class M(torch.nn.Module):
    def forward(self, idx, x):
        u0 = idx.item()
        x0 = x.select(0, u0)

        def fn():
            return x0.sin()

        return torch.cond(x0.sum() > 0, fn, fn)
m = M()
out = torch.compile(m, fullgraph=True)(torch.tensor(0, dtype=torch.int64), torch.randn(3, 3))
```
The error occurs when speculating fn: we try to lift the symbol of x0.storage_offset() but find that the symbol doesn't have a source associated with it.
What really happens is that when the input tensor is a scalar tensor of int type and resides on CPU, we have a shortcut that creates a normal symint when .item() is called; see https://github.com/pytorch/pytorch/pull/126245.
However, previously we only tracked the unbacked symint output of an operation, because we believed every backed symint must have a source associated with it and has already been lifted as an input at the top level. Now this invariant no longer holds, so we end up with an error saying the symbol doesn't have a source (because only inputs and symbols derived from inputs have sources, and the result of .item() doesn't have one).
In this PR, we start to also track the normal symint with the proxy that created it (i.e. in this case the proxy .item()).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161198
Approved by: https://github.com/zou3519
A performance optimization: use `torch.addmm`, which fuses `matrix multiply + scale + add` into one op.
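For reference, `torch.addmm(input, mat1, mat2, beta=b, alpha=a)` computes `b * input + a * (mat1 @ mat2)` in a single kernel; a quick equivalence check:
```python
import torch

torch.manual_seed(0)
buf = torch.randn(4, 6)
m1 = torch.randn(4, 5)
m2 = torch.randn(5, 6)

fused = torch.addmm(buf, m1, m2, beta=0.95, alpha=0.1)
unfused = 0.95 * buf + 0.1 * (m1 @ m2)
torch.testing.assert_close(fused, unfused)
```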
**Benchmark**
In a QWEN-like 0.5B model training we observed average `optimizer.step()` latency speedup: matmul ~44.5 ms -> addmm ~27.4 ms: a **1.62×** speedup.
matmul
<img width="1403" height="600" alt="Screenshot 2025-08-24 at 3 15 37 PM" src="https://github.com/user-attachments/assets/a77a68d4-da3c-473a-97f0-e6ef0a3b46d9" />
addmm
<img width="1426" height="602" alt="Screenshot 2025-08-24 at 3 13 42 PM" src="https://github.com/user-attachments/assets/e493af36-44d3-4026-9f7c-fd0f9cdbc7e5" />
**Testing**
End-to-end training:
We used a training script that pre-trains a QWEN-like model on the `openwebtext-100k` dataset. We trained for one epoch, and the resulting loss curves show consistency between the normal matmul and addmm.
<img width="1035" height="434" alt="Screenshot 2025-08-24 at 2 56 21 PM" src="https://github.com/user-attachments/assets/b96b13e3-0a01-4908-853c-d917b41f3d75" />
Unit test:
```python
import copy

import torch
from torch.nn import Linear, MSELoss
# Muon is the optimizer under test; import it from wherever it lives in your setup.

# dummy model and data
model0 = Linear(10, 10, bias=False)
model1 = copy.deepcopy(model0)
inputs = torch.randn(8, 10)
targets = torch.randn(8, 10)
loss = MSELoss()
lr = 1e-3
wd = 0.1
momentum = 0.95
nesterov = True  # used below but not defined in the original snippet
opt_ref_muon = Muon(
    params=model0.parameters(),
    lr=lr,
    weight_decay=wd,
    momentum=momentum,
    nesterov=nesterov,
    adjust_lr_fn="original",
)
opt_exp_muon = Muon(
    params=model1.parameters(),
    lr=lr,
    weight_decay=wd,
    momentum=momentum,
    nesterov=nesterov,
    adjust_lr_fn="original",
    use_addmm=True,
)
out_ref = model0(inputs)
loss_ref = loss(out_ref, targets)
opt_ref_muon.zero_grad()
loss_ref.backward()
opt_ref_muon.step()
out_exp = model1(inputs)
loss_exp = loss(out_exp, targets)
opt_exp_muon.zero_grad()
loss_exp.backward()
opt_exp_muon.step()
for p_ref, p_exp in zip(model0.parameters(), model1.parameters()):
    torch.testing.assert_close(p_ref, p_exp)
```
shows a numeric difference, but this is expected at bf16 precision:
```
Mismatched elements: 96 / 100 (96.0%)
Greatest absolute difference: 8.985400199890137e-05 at index (1, 9) (up to 1e-06 allowed)
Greatest relative difference: 0.007370449136942625 at index (0, 6) (up to 1e-05 allowed)
```
~~Introduced a flag that allows users to opt in, as there are numerical differences relative to the original implementation.~~
Update: since `addmm` fuses the math ops, there are fewer intermediate roundings, so it is more numerically accurate than the original form. Based on this, we opt to make `addmm` the default and only option.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161379
Approved by: https://github.com/janeyx99
Summary: Since Inductor skips JIT compilation for Triton kernels, we need to manually invoke `knobs.runtime.jit_post_compile_hook` if one exists. Here, we do this to enable Tritonparse to extract launch metadata from Inductor-launched kernels. We can control whether Inductor will run the hook with a new `TORCHINDUCTOR_RUN_JIT_POST_COMPILE_HOOK=1` config variable.
Reviewed By: davidberard98
Differential Revision: D80624932
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161443
Approved by: https://github.com/FindHao
Summary:
We split the refactoring into two parts due to forward compatibility concerns:
First, we land the deserialization (loading) part.
Then, we land the serialization (saving) part.
Save weights and constants as individual files in the PT2 archive. Each weight/constant will be saved as raw bytes, unless it is a custom object (TorchBind object) or a non-fake tensor subclass; for these two special cases we still save them using pickle.
The metadata of saved tensors along with the file name will be saved as `PayloadMeta`.
The mapping from FQN to `PayloadMeta` will be saved as `PayloadConfig` under `WEIGHTS_CONFIG_FORMAT` and `CONTANTS_CONFIG_FORMAT`
This changes the serialization on the Python side when calling `torch.export.save()`.
For deserialization on the Python side via `torch.export.load()`, we make it BC-safe by allowing loading of the legacy-format weights/constants.
For deserialization in C++ (`torch/nativert/ModelRunner.cpp`), we make this a BC-breaking change, as the OSS ModelRunner API is not currently being used.
The file structure
```
├── archive_format
├── archive_version
├── byteorder
├── .data
│ ├── serialization_id
│ └── version
├── data
│ ├── sample_inputs
│ │ └── model.pt
│ ├── constants
│ │ ├── tensor_0
│ │ ├── tensor_1
│ │ └── model_constants_config.json
│ └── weights
│ ├── weight_0
│ ├── weight_1
│ ├── weight_2
│ ├── weight_3
│ └── model_weights_config.json
└── models
└── model.json
```
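Assuming the saved artifact is a zip container (as PT2 archives are), a quick way to inspect the layout above after `torch.export.save()` (the file name and printed paths are illustrative):
```python
import zipfile

with zipfile.ZipFile("model.pt2") as z:
    names = z.namelist()
    print([n for n in names if "/weights/" in n])    # weight_0, ..., model_weights_config.json
    print([n for n in names if "/constants/" in n])  # tensor_0, ..., model_constants_config.json
```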
Test Plan:
CI
Rollback Plan:
Differential Revision: D80035490
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160394
Approved by: https://github.com/SherlockNoMad
Summary: In `tuned_scaled_mm()`, we unsqueeze any scalar scale from [] -> [1, 1]. Later, when we are determining how to set the `SCALING_ROWWISE` kernel attribute, we check whether the scale has 2 dimensions. However, since we previously unsqueezed any scalar scales, this will always evaluate to True.
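A standalone illustration of the mismatch (illustrative only; the actual fix lives in `tuned_scaled_mm()`): the row-wise check has to look at the scale's dimensionality before the unsqueeze, not after.
```python
import torch

scale = torch.tensor(0.5)                 # scalar (tensorwise) scale, dim() == 0
is_rowwise_before = scale.dim() == 2      # False -- the correct answer

unsqueezed = scale.reshape(1, 1)          # analogous to the [] -> [1, 1] unsqueeze before dispatch
is_rowwise_after = unsqueezed.dim() == 2  # True -- would wrongly set SCALING_ROWWISE

assert not is_rowwise_before and is_rowwise_after
```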
Test Plan:
Run the following tests in test/inductor/test_fp8.py:
test_tensorwise_scaling_tma_template
test_rowwise_scaling_tma_template
Rollback Plan:
Differential Revision: D80108117
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160450
Approved by: https://github.com/eellison
Summary:
This diff removes configs that require more shared memory than the hardware limit, which would otherwise cause the following compilation error (a sketch of the pruning idea follows the error message):
```
No valid triton configs. OutOfMemoryError: out of resource: triton_mm Required: 327680 Hardware limit:232448 Reducing block sizes or `num_stages` may help.
```
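A sketch of the pruning idea with a toy shared-memory estimate (the config fields and the estimate are hypothetical; the real heuristic depends on the kernel):
```python
def estimate_shared_memory(cfg):
    # Toy model: bytes of the A and B tiles kept in shared memory per pipeline stage.
    bytes_per_elem = 2  # fp16 / bf16
    tile_elems = cfg["BLOCK_M"] * cfg["BLOCK_K"] + cfg["BLOCK_K"] * cfg["BLOCK_N"]
    return tile_elems * bytes_per_elem * cfg["num_stages"]

def prune_configs(configs, shared_mem_limit):
    return [c for c in configs if estimate_shared_memory(c) <= shared_mem_limit]

configs = [
    {"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 64, "num_stages": 5},  # exceeds a 232448-byte limit
    {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "num_stages": 3},
]
print(prune_configs(configs, shared_mem_limit=232448))  # keeps only the second config
```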
Test Plan:
```
buck2 test mode/dev-nosan fbcode//caffe2/test/inductor:max_autotune -- test_max_autotune_prune_choices -v 1,stderr
```
Rollback Plan:
Differential Revision: D80594562
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161040
Approved by: https://github.com/eellison
This is far simpler than #155164 since we never destroy the cudaGraphExec_t.
The request comes from TRT-LLM specifically. The motivation is that some power users would like to mutate specific kernel parameters via APIs like `cudaGraphExec*SetParams` after a cuda graph has been instantiated. For example, a common request has been to be able to change the sequence length of attention kernels, after having captured a graph for the largest possible sequence length. It turns out that the host overhead you eliminate via cuda graphs in LLM inference ends up causing an increase in computation time when you size your kernels to the maximum possible sequence length (which I believe is done in both TRT-LLM and vLLM). Attention is the most problematic kernel because its computation time is quadratic in the sequence length, rather than linear.
This can work if your attention kernel can work for arbitrary shapes (this is not the case for all attention implementations! Many of them specialize with templates), and you have a persistent kernel that allocates only as many blocks as you have SM's (so you don't have to figure out how many blocks to allocate for a specific sequence length). Using a conditional SWITCH node is a better generic approach to this problem, but that requires more infrastructure work.
Note that this requires knowledge of the exact location of the value in your kernel's parameter buffer to mutate. It won't work with arbitrary stream capture code whose kernels you don't know beforehand. So I expect this code path to be rarely used.
Testing:
```
pytest -s -k raw_graph_exec test/test_cuda.py
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161294
Approved by: https://github.com/ngimel, https://github.com/BoyuanFeng, https://github.com/eellison, https://github.com/eqy
## Problem
Fixing a parameter mismatch issue during torch.export with strict mode (see the "How to reproduce the issue" section below):
When there are two attributes mapping to the same tensor, strict mode will
1. Build a standard param buffer table to standardize the names (the bug happens [here](f861dc1826/torch/export/_trace.py (L356)): when two parameters share the same id(param), the latter name overwrites the previous one)
2. [Update](f861dc1826/torch/export/_trace.py (L1481)) the exported signature with the updated standard FQN (problematic)
3. When getting exported_program.module(), call [_unlift_exported_program_lifted_states](f861dc1826/torch/export/exported_program.py (L1297)) to recover attributes from the exported signature, where the parameter names are defined and standardized
As a result, named_parameters of this module will report the overwritten name instead of the original name.
## How to reproduce the issue?
Repro shared by @taotaohuang001
torch version: 2.8.0
```python
import torch
from torch import nn
# ---- Toy model with embedding weight sharing (aliasing) ----
class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding_layers = nn.ModuleDict()
        tbl = nn.Embedding(100, 8)
        self.embedding_layers["ActorId"] = tbl
        # Alias: reuse the SAME module instance for another feature
        self.embedding_layers["RootActorId"] = self.embedding_layers["ActorId"]
        self.proj = nn.Linear(16, 1)

    def forward(self, feats: dict[str, torch.Tensor]):
        e1 = self.embedding_layers["ActorId"](feats["ActorId"])
        e2 = self.embedding_layers["RootActorId"](feats["RootActorId"])
        return self.proj(torch.cat([e1, e2], dim=-1))
torch.manual_seed(0)
m = Toy().eval()
# Show pre-export parameter names (canonicalized; shared weight appears once)
print("PRE-EXPORT named_parameters:")
print([name for name, _ in m.named_parameters()])
# Sanity: the two feature names point to the same weight object
w1 = m.embedding_layers["ActorId"].weight
w2 = m.embedding_layers["RootActorId"].weight
print("PRE-EXPORT alias -> same object:", w1 is w2, "| same storage:", w1.data_ptr() == w2.data_ptr())
# Example inputs (dict structure will be captured by export)
ex_in = {
    "ActorId": torch.randint(0, 100, (4,)),
    "RootActorId": torch.randint(0, 100, (4,)),
}
# ---- Export (in memory) and materialize the runnable module ----
ep = torch.export.export(m, (ex_in,), strict=True)
gm = ep.module() # GraphModule with new (canonical) parameter names
print("\nPOST-EXPORT named_parameters (GraphModule):")
post_names = [name for name, _ in gm.named_parameters()]
print(post_names)
# Prove alias persists after export: run fwd/bwd and check a single grad tensor exists
out = gm(ex_in).sum()
out.backward()
# Find the embedding weight in the exported module by shape (100, 8)
emb_names = [name for name, p in gm.named_parameters() if p.shape == torch.Size([100, 8])]
print("\nEmbedding param (post-export) canonical name:", emb_names[0] if emb_names else "<not found>")
# Show that only one grad exists for the shared table
for name, p in gm.named_parameters():
    if p.grad is not None and p.shape == torch.Size([100, 8]):
        print("Grad present on shared embedding weight:", name, "| grad shape:", tuple(p.grad.shape))
        break
```
And you will see that the parameter names differ before and after export:
```
PRE-EXPORT named_parameters:
['embedding_layers.ActorId.weight', 'proj.weight', 'proj.bias']
PRE-EXPORT alias -> same object: True | same storage: True
POST-EXPORT named_parameters (GraphModule):
['embedding_layers.RootActorId.weight', 'proj.weight', 'proj.bias']
Embedding param (post-export) canonical name: embedding_layers.RootActorId.weight
Grad present on shared embedding weight: embedding_layers.RootActorId.weight | grad shape: (100, 8)
```
## Solution
Fix this issue by making sure a later named parameter does not overwrite the `param_buffer_table` entry when the original model's named parameter already maps to a parameter.
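A minimal sketch of the guard (illustrative; the real change is in torch/export/_trace.py): when two FQNs alias the same parameter object, keep the first mapping rather than letting a later alias overwrite it.
```python
from torch import nn

def dedup_param_names(mod: nn.Module):
    # id(param) -> first FQN seen; a later alias must not overwrite the earlier name
    table = {}
    for name, param in mod.named_parameters(remove_duplicate=False):
        table.setdefault(id(param), name)
    return table
```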
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160600
Approved by: https://github.com/angelayi
The performance cost of `dict` lookups keyed by `OpSchema` is a
significant minority of DTensor overhead. With this change we shave a
net ~1% off the total running time of the benchmark from #160580, as
measured by using cProfile and comparing cumulative time spent in
propagate + OpSchema's `__post_init__`. (`__post_init__` grew from
2.5% to 6.4% (+3.9%) and propagate shrank from 12.5% to 7.8% (-4.7%)).
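The general trick (a sketch of the pattern only, not the DTensor code itself) is to pay the hashing cost once in `__post_init__` and reuse it for every dict lookup keyed by the schema object:
```python
from dataclasses import dataclass, field
from typing import Any, Tuple

@dataclass
class CachedKey:
    op: str
    args_schema: Tuple[Any, ...]
    _hash: int = field(init=False, repr=False, compare=False)

    def __post_init__(self):
        self._hash = hash((self.op, self.args_schema))  # computed once, up front

    def __hash__(self):  # reused on every cache lookup keyed by this object
        return self._hash

cache = {}
cache[CachedKey("aten.mm.default", ((4, 4), (4, 4)))] = "sharding decision"
assert cache[CachedKey("aten.mm.default", ((4, 4), (4, 4)))] == "sharding decision"
```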
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161234
Approved by: https://github.com/wconstab
ghstack dependencies: #161231
Updates the inductor wrapper-fxir code to use kernel.op_overload when generating extern kernel calls, so we can keep the IR consistent by using ATen ops.
TODO: we're also inserting torch.empty_strided calls -- these need to be turned into aten ops too.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161195
Approved by: https://github.com/blaine-rister
Summary: This change updates `getattr_recursive` to handle qualnames with ModuleList that contain digit indices, for example, `op_instances.1.value_model.feature_weights`
Test Plan:
TBA
Rollback Plan:
Reviewed By: jiayisuse
Differential Revision: D80503985
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161204
Approved by: https://github.com/jiayisuse