Compare commits


3 Commits

SHA1        Message                   Date
a0801ef6be  add workflow to dispatch  2025-08-26 18:20:49 -07:00
735b375db4  echo variables            2025-08-26 17:10:56 -07:00
            ghstack-source-id: 3c8f54e83cad9760fb06b39366bea2f31a39342f
            Pull-Request: https://github.com/pytorch/pytorch/pull/161565
011155aea3  echo variables            2025-08-26 17:10:55 -07:00
            ghstack-source-id: bd39100f9f9c99a5c45b85a48020375ac5f95da6
            Pull-Request: https://github.com/pytorch/pytorch/pull/161537
745 changed files with 29025 additions and 46106 deletions

View File

@ -7,15 +7,6 @@ if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
fi
if [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0"
fi
# Compress the fatbin with -compress-mode=size for CUDA 13
if [[ "$DESIRED_CUDA" == *"13"* ]]; then
export TORCH_NVCC_FLAGS="-compress-mode=size"
fi
SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
source $SCRIPTPATH/aarch64_ci_setup.sh

View File

@ -77,23 +77,21 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
wheelname = os.path.basename(wheel_path)
os.mkdir(f"{folder}/tmp")
os.system(f"unzip {wheel_path} -d {folder}/tmp")
# Common libraries for all CUDA versions
common_libs = [
# Non-NVIDIA system libraries
"/lib64/libgomp.so.1",
"/usr/lib64/libgfortran.so.5",
"/acl/build/libarm_compute.so",
"/acl/build/libarm_compute_graph.so",
# Common CUDA libraries (same for all versions)
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_lapack_core.so.0",
"/usr/local/lib/libnvpl_blas_core.so.0",
libs_to_copy = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
"/usr/local/cuda/lib64/libcudnn.so.9",
"/usr/local/cuda/lib64/libcublas.so.12",
"/usr/local/cuda/lib64/libcublasLt.so.12",
"/usr/local/cuda/lib64/libcudart.so.12",
"/usr/local/cuda/lib64/libcufft.so.11",
"/usr/local/cuda/lib64/libcusparse.so.12",
"/usr/local/cuda/lib64/libcusparseLt.so.0",
"/usr/local/cuda/lib64/libcusolver.so.11",
"/usr/local/cuda/lib64/libcurand.so.10",
"/usr/local/cuda/lib64/libnccl.so.2",
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
@ -102,41 +100,22 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
"/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
"/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
"/usr/local/cuda/lib64/libcusparse.so.12",
"/lib64/libgomp.so.1",
"/usr/lib64/libgfortran.so.5",
"/acl/build/libarm_compute.so",
"/acl/build/libarm_compute_graph.so",
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_lapack_core.so.0",
"/usr/local/lib/libnvpl_blas_core.so.0",
]
# CUDA version-specific libraries
if "130" in desired_cuda:
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
"/usr/local/cuda/lib64/libcublas.so.13",
"/usr/local/cuda/lib64/libcublasLt.so.13",
"/usr/local/cuda/lib64/libcudart.so.13",
"/usr/local/cuda/lib64/libcufft.so.12",
"/usr/local/cuda/lib64/libcusolver.so.12",
"/usr/local/cuda/lib64/libnvJitLink.so.13",
"/usr/local/cuda/lib64/libnvrtc.so.13",
"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0",
if "129" in desired_cuda:
libs_to_copy += [
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9",
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
]
elif "12" in desired_cuda:
# Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
minor_version = desired_cuda[-1]
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
"/usr/local/cuda/lib64/libcublas.so.12",
"/usr/local/cuda/lib64/libcublasLt.so.12",
"/usr/local/cuda/lib64/libcudart.so.12",
"/usr/local/cuda/lib64/libcufft.so.11",
"/usr/local/cuda/lib64/libcusolver.so.11",
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
]
# Combine all libraries
libs_to_copy = common_libs + version_specific_libs
# Copy libraries to unzipped_folder/a/lib
for lib_path in libs_to_copy:
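The hunk ends at the start of the copy loop; as a rough, illustrative sketch of what such a loop typically does (the destination path and the repack step are assumptions, not the repository's exact code):

    for lib_path in libs_to_copy:
        lib_name = os.path.basename(lib_path)
        # copy each shared library next to the unpacked wheel's torch/lib (assumed layout)
        os.system(f"cp {lib_path} {folder}/tmp/torch/lib/{lib_name}")
    # repack the patched tree under the original wheel name (illustrative)
    os.system(f"cd {folder}/tmp && zip -r {folder}/{wheelname} *")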

View File

@ -81,8 +81,8 @@ elif [[ "$image" == *riscv* ]]; then
DOCKERFILE="ubuntu-cross-riscv/Dockerfile"
fi
_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152
_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
if [[ "$image" == *rocm* ]]; then
_UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
_UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
@ -114,19 +114,31 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
CUDA_VERSION=13.0.0
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
KATEX=yes
@ -161,8 +173,8 @@ case "$tag" in
VISION=yes
ONNX=yes
;;
pytorch-linux-jammy-py3.10-clang12)
ANACONDA_PYTHON_VERSION=3.10
pytorch-linux-jammy-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=12
VISION=yes
TRITON=yes
@ -197,24 +209,23 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
;;
pytorch-linux-jammy-xpu-n-1-py3)
ANACONDA_PYTHON_VERSION=3.10
pytorch-linux-jammy-xpu-2025.0-py3)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.0
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-xpu-2025.1-py3)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.1
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-xpu-n-py3)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.2
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
# TODO (huydhn): Upgrade this to Python >= 3.10
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
@ -223,8 +234,8 @@ case "$tag" in
DOCS=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12)
ANACONDA_PYTHON_VERSION=3.10
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
CLANG_VERSION=12
VISION=yes
@ -235,8 +246,8 @@ case "$tag" in
CLANG_VERSION=18
VISION=yes
;;
pytorch-linux-jammy-py3.10-gcc11)
ANACONDA_PYTHON_VERSION=3.10
pytorch-linux-jammy-py3.9-gcc11)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
KATEX=yes

View File

@ -1 +1 @@
74a23feff57432129df84d8099e622773cf77925
e03a63be43e33596f7f0a43b0f530353785e4a59

View File

@ -1 +1 @@
d0e80f39c562c70986fc548fa6e5852ad86e16e7
a6572fb0be5b9b0a19b0641a0ce05810fa04e44c

View File

@ -83,9 +83,9 @@ function build_cpython {
py_suffix=${py_ver::-1}
py_folder=$py_suffix
fi
# Update to rc2 due to https://github.com/python/cpython/commit/c72699086fe4
# Only b3 is available now
if [ "$py_suffix" == "3.14.0" ]; then
py_suffix="3.14.0rc2"
py_suffix="3.14.0b3"
fi
wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz
do_cpython_build $py_ver Python-$py_suffix

View File

@ -57,7 +57,7 @@ if [ ! -f setup.py ]; then
cd python
fi
pip_install pybind11==3.0.1
pip_install pybind11==2.13.6
# TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527
as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py

View File

@ -44,12 +44,8 @@ function install_ucc() {
./autogen.sh
if [[ -n "$CUDA_VERSION" && $CUDA_VERSION == 13* ]]; then
NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86"
else
# We only run distributed tests on Tesla M60 and A10G
NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
fi
# We only run distributed tests on Tesla M60 and A10G
NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
if [[ -n "$ROCM_VERSION" ]]; then
if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then

View File

@ -65,14 +65,10 @@ function install_ubuntu() {
function install_rhel() {
. /etc/os-release
if [[ "${ID}" == "rhel" ]]; then
if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
echo "RHEL version ${VERSION_ID} not supported"
exit
fi
elif [[ "${ID}" == "almalinux" ]]; then
# Workaround for almalinux8 which is used by quay.io/pypa/manylinux_2_28_x86_64
VERSION_ID="8.8"
if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
echo "RHEL version ${VERSION_ID} not supported"
exit
fi
dnf install -y 'dnf-command(config-manager)'
@ -150,11 +146,11 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
XPU_DRIVER_VERSION="/lts/2350"
fi
# Default use Intel® oneAPI Deep Learning Essentials 2025.1
if [[ "$XPU_VERSION" == "2025.2" ]]; then
XPU_PACKAGES="intel-deep-learning-essentials-2025.2"
else
# Default use Intel® oneAPI Deep Learning Essentials 2025.0
if [[ "$XPU_VERSION" == "2025.1" ]]; then
XPU_PACKAGES="intel-deep-learning-essentials-2025.1"
else
XPU_PACKAGES="intel-deep-learning-essentials-2025.0"
fi
# The installation depends on the base OS

View File

@ -175,6 +175,6 @@ ENV XPU_DRIVER_TYPE ROLLING
RUN python3 -m pip install --upgrade pip && \
python3 -mpip install cmake==3.28.4
ADD ./common/install_xpu.sh install_xpu.sh
ENV XPU_VERSION 2025.2
ENV XPU_VERSION 2025.1
RUN bash ./install_xpu.sh && rm install_xpu.sh
RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd

View File

@ -379,7 +379,7 @@ dataclasses_json==0.6.7
cmake==4.0.0
#Description: required for building
tlparse==0.4.0
tlparse==0.3.30
#Description: required for log parsing
cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"

View File

@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably

View File

@ -66,7 +66,6 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"
# (optional) Install UCC
ARG UCX_COMMIT
ARG UCC_COMMIT
ARG CUDA_VERSION
ENV UCX_COMMIT $UCX_COMMIT
ENV UCC_COMMIT $UCC_COMMIT
ENV UCX_HOME /usr

View File

@ -1,56 +1,14 @@
from __future__ import annotations
import logging
import os
import textwrap
from pathlib import Path
from typing import TYPE_CHECKING
from cli.lib.common.utils import get_wheels
from jinja2 import Template
if TYPE_CHECKING:
from collections.abc import Iterable, Mapping
from typing import Iterable, Mapping, Optional
import logging
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Iterable, Tuple
logger = logging.getLogger(__name__)
_TPL_CONTENT = Template(
textwrap.dedent("""\
## {{ title }}
```{{ lang }}
{{ content }}
```
""")
)
_TPL_LIST_ITEMS = Template(
textwrap.dedent("""\
## {{ title }}
{% for it in items %}
- {{ it.pkg }}: {{ it.relpath }}
{% else %}
_(no item found)_
{% endfor %}
""")
)
_TPL_TABLE = Template(
textwrap.dedent("""\
{%- if rows %}
| {{ cols | join(' | ') }} |
|{%- for _ in cols %} --- |{%- endfor %}
{%- for r in rows %}
| {%- for c in cols %} {{ r.get(c, "") }} |{%- endfor %}
{%- endfor %}
{%- else %}
_(no data)_
{%- endif %}
""")
)
def gh_summary_path() -> Path | None:
"""Return the Path to the GitHub step summary file, or None if not set."""
@ -69,14 +27,13 @@ def write_gh_step_summary(md: str, *, append_content: bool = True) -> bool:
"""
sp = gh_summary_path()
if not sp:
# When running locally, just log to console instead of failing.
logger.info("[gh-summary] GITHUB_STEP_SUMMARY not set, skipping write.")
return False
md_clean = textwrap.dedent(md).strip() + "\n"
sp.parent.mkdir(parents=True, exist_ok=True)
mode = "a" if append_content else "w"
with sp.open(mode, encoding="utf-8") as f:
f.write(md_clean)
f.write(md.rstrip() + "\n")
return True
@ -85,59 +42,182 @@ def md_heading(text: str, level: int = 2) -> str:
return f"{'#' * max(1, min(level, 6))} {text}\n"
def md_kv_table(rows: Iterable[Mapping[str, str | int | float]]) -> str:
"""
Render a list of dictionaries as a Markdown table.
The first row (header) is derived from the union of all keys.
# Suppose you want to summarize benchmark results
rows = [
{"name": "transformer-small", "p50": 12.3, "p90(ms)": 18.4},
{"name": "transformer-large", "p50": 45.1, "p90(ms)": 60.7},
]
content = []
content.append(md_heading("Benchmark Results", level=2))
content.append(md_kv_table(rows))
content.append(md_details("Raw logs", "```\n[INFO] benchmark log ...\n```"))
# Join the pieces into one Markdown block
markdown = '\n'.join(content)
# Write to GitHub Actions summary (or log locally if not in CI)
write_gh_step_summary(markdown, append=True)
"""
rows = list(rows)
if not rows:
return "_(no data)_\n"
# Collect all columns across all rows
cols = list({k for r in rows for k in r.keys()})
header = "| " + " | ".join(cols) + " |\n"
sep = "|" + "|".join([" --- " for _ in cols]) + "|\n"
lines = []
for r in rows:
line = "| " + " | ".join(str(r.get(c, "")) for c in cols) + " |\n"
lines.append(line)
return header + sep + "".join(lines) + "\n"
def md_details(summary: str, content: str) -> str:
"""Generate a collapsible <details> block with a summary and inner content."""
return f"<details>\n<summary>{summary}</summary>\n\n{content}\n\n</details>\n"
# ---- helper test to generate a summary for list of pytest failures ------#
def summarize_failures_by_test_command(
xml_and_labels: Iterable[Tuple[str | Path, str]],
*,
title: str = "Pytest Failures by Test Command",
dedupe_within_command: bool = True,
):
"""
Args:
xml_and_labels: list of (xml_path, label) pairs.
Each XML corresponds to one pytest subprocess (one test command).
Behavior:
- Writes a section per test command if it has failures.
- Each failed test is listed as 'path/to/test.py:test_name'.
Example:
xml = [
("reports/junit_cmd0.xml", "pytest -v -s tests/unit"),
("reports/junit_cmd1.xml", "pytest -v -s tests/integration"),
("reports/junit_cmd2.xml", "pytest -v -s tests/entrypoints"),
]
summarize_failures_by_test_command(
xmls,
title="Consolidated Pytest Failures",
)
"""
write_gh_step_summary(md_heading(title, level=2))
for xml_path, label in xml_and_labels:
xmlp = Path(xml_path)
failed = _parse_failed_simple(xmlp)
if dedupe_within_command:
failed = sorted(set(failed))
if not failed:
continue # skip commands with no failures
write_gh_step_summary(md_heading(f"Test Command: {label}", level=3))
lines = "\n".join(f"- {item}" for item in failed)
write_gh_step_summary(lines + "\n")
def _to_simple_name_from_testcase(tc: ET.Element) -> str:
"""
Convert a <testcase> into 'path/to/test.py:test_name' format.
Prefer the 'file' attribute if available, else fall back to classname.
"""
name = tc.attrib.get("name", "")
file_attr = tc.attrib.get("file")
if file_attr:
return f"{file_attr}:{name}"
classname = tc.attrib.get("classname", "")
parts = classname.split(".") if classname else []
if len(parts) >= 1:
# drop last part if it's a class, treat rest as module path
mod_parts = parts[:-1] if len(parts) >= 2 else parts
mod_path = "/".join(mod_parts) + ".py" if mod_parts else "unknown.py"
return f"{mod_path}:{name}"
return f"unknown.py:{name or 'unknown_test'}"
def _parse_failed_simple(xml_path: Path) -> list[str]:
"""
Parse one XML, return failures as ['tests/a_test.py:test_x', ...].
Only include <failure> and <error>.
"""
if not xml_path.exists():
return []
tree = ET.parse(xml_path)
root = tree.getroot()
failed = []
for tc in root.iter("testcase"):
if any(x.tag in {"failure", "error"} for x in tc):
failed.append(_to_simple_name_from_testcase(tc))
return failed
def summarize_content_from_file(
output_dir: Path,
freeze_file: str,
title: str = "Content from file",
title: str = "Wheels (pip freeze)",
code_lang: str = "", # e.g. "text" or "ini"
) -> bool:
"""
Read a text file from output_dir/freeze_file and append it to
the GitHub Step Summary as a Markdown code block.
Returns True if something was written, False otherwise.
"""
f = Path(output_dir) / freeze_file
if not f.exists():
return False
content = f.read_text(encoding="utf-8").strip()
md = render_content(content, title=title, lang=code_lang)
return write_gh_step_summary(md)
def summarize_wheels(path: Path, title: str = "Wheels", max_depth: int = 3):
items = get_wheels(path, max_depth=max_depth)
if not items:
if not content:
return False
md = render_list(items, title=title)
return write_gh_step_summary(md)
md = []
md.append(md_heading(title, 2))
md.append(f"```{code_lang}".rstrip())
md.append(content)
md.append("```")
return write_gh_step_summary("\n".join(md) + "\n")
def md_kv_table(rows: Iterable[Mapping[str, str | int | float]]) -> str:
def summarize_wheels(
output_dir: Path,
title: str = "Wheels",
max_depth: Optional[int] = None, # None = unlimited
):
"""
Render a list of dicts as a Markdown table using Jinja template.
Walk output_dir up to max_depth and list all *.whl files.
Grouped as 'package: filename.whl'.
Args:
output_dir: base directory to search
title: section title in GH summary
max_depth: maximum folder depth relative to output_dir (0 = only top-level)
"""
rows = list(rows)
cols = list({k for r in rows for k in r.keys()})
md = _TPL_TABLE.render(cols=cols, rows=rows).strip() + "\n"
return md
if not output_dir.exists():
return False
root = Path(output_dir)
lines = [md_heading(title, 2)]
for dirpath, _, filenames in os.walk(root):
depth = Path(dirpath).relative_to(root).parts
if max_depth is not None and len(depth) > max_depth:
# skip going deeper
continue
def render_list(
items: Iterable[str],
*,
title: str = "List",
) -> str:
tpl = _TPL_LIST_ITEMS
md = tpl.render(title=title, items=items)
return md
for fname in sorted(filenames):
if not fname.endswith(".whl"):
continue
pkg = fname.split("-")[0]
relpath = str(Path(dirpath) / fname).replace(str(root) + os.sep, "")
lines.append(f"- {pkg}: {relpath}")
def render_content(
content: str,
*,
title: str = "Content",
lang: str = "text",
) -> str:
tpl = _TPL_CONTENT
md = tpl.render(title=title, content=content, lang=lang)
return md
if len(lines) > 1:
write_gh_step_summary("\n".join(lines) + "\n")
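For orientation, a minimal sketch of how these helpers compose in a CI step (the directory name and row values are illustrative, not taken from the diff):

    from pathlib import Path

    # render a small table plus the wheel listing into the GitHub step summary
    rows = [{"wheel": "torch", "size(MB)": 185}]
    write_gh_step_summary(md_heading("Artifacts", level=2))
    write_gh_step_summary(md_kv_table(rows))
    summarize_wheels(Path("artifacts"), title="Wheels", max_depth=2)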

View File

@ -4,7 +4,7 @@ import shlex
import shutil
import sys
from collections.abc import Iterable
from importlib.metadata import PackageNotFoundError, version # noqa: UP035
from importlib.metadata import PackageNotFoundError, version
from typing import Optional, Union
from cli.lib.common.utils import run_command

View File

@ -8,7 +8,6 @@ import shlex
import subprocess
import sys
from contextlib import contextmanager
from pathlib import Path
from typing import Optional
@ -116,24 +115,3 @@ def working_directory(path: str):
yield
finally:
os.chdir(prev_cwd)
def get_wheels(
output_dir: Path,
max_depth: Optional[int] = None,
) -> list[str]:
"""Return a list of wheels found in the given output directory."""
root = Path(output_dir)
if not root.exists():
return []
items = []
for dirpath, _, filenames in os.walk(root):
depth = Path(dirpath).relative_to(root).parts
if max_depth is not None and len(depth) > max_depth:
continue
for fname in sorted(filenames):
if fname.endswith(".whl"):
pkg = fname.split("-")[0]
relpath = str((Path(dirpath) / fname).relative_to(root))
items.append({"pkg": pkg, "relpath": relpath})
return items
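A quick usage sketch of the helper above (the directory layout and wheel filename are hypothetical):

    wheels = get_wheels(Path("dist"), max_depth=2)
    # e.g. [{"pkg": "torch", "relpath": "cu129/torch-2.9.0-cp310-cp310-linux_aarch64.whl"}]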

View File

@ -1,27 +1,15 @@
import logging
import os
import textwrap
from pathlib import Path
import re
from typing import Any
from cli.lib.common.gh_summary import write_gh_step_summary
from cli.lib.common.git_helper import clone_external_repo
from cli.lib.common.pip_helper import pip_install_packages
from cli.lib.common.utils import run_command, temp_environ, working_directory
from jinja2 import Template
from cli.lib.common.gh_summary import md_heading, write_gh_step_summary
logger = logging.getLogger(__name__)
_TPL_VLLM_INFO = Template(
textwrap.dedent("""\
## Vllm against Pytorch CI Test Summary
**Vllm Commit**: [{{ vllm_commit }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})
{%- if torch_sha %}
**Pytorch Commit**: [{{ torch_sha }}](https://github.com/pytorch/pytorch/commit/{{ torch_sha }})
{%- endif %}
""")
)
def sample_vllm_test_library():
"""
@ -245,12 +233,3 @@ def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) ->
for k in sorted(mapping, key=len, reverse=True):
step = step.replace(k, mapping[k])
return step
def summarize_build_info(vllm_commit: str) -> bool:
torch_sha = os.getenv("GITHUB_SHA")
md = (
_TPL_VLLM_INFO.render(vllm_commit=vllm_commit, torch_sha=torch_sha).strip()
+ "\n"
)
return write_gh_step_summary(md)

View File

@ -4,7 +4,6 @@ import textwrap
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
from cli.lib.common.cli_helper import BaseRunner
from cli.lib.common.docker_helper import local_image_exists
from cli.lib.common.envs_helper import (
@ -13,11 +12,6 @@ from cli.lib.common.envs_helper import (
env_str_field,
with_params_help,
)
from cli.lib.common.gh_summary import (
gh_summary_path,
summarize_content_from_file,
summarize_wheels,
)
from cli.lib.common.path_helper import (
copy,
ensure_dir_exists,
@ -26,7 +20,14 @@ from cli.lib.common.path_helper import (
is_path_exist,
)
from cli.lib.common.utils import run_command
from cli.lib.core.vllm.lib import clone_vllm, summarize_build_info
from cli.lib.core.vllm.lib import clone_vllm, write_gh_step_summary
from cli.lib.common.gh_summary import (
summarize_content_from_file,
summarize_wheels,
gh_summary_path,
)
import torch
from torch import torch_version
logger = logging.getLogger(__name__)
@ -160,7 +161,17 @@ class VllmBuildRunner(BaseRunner):
logger.info("Running vllm build with inputs: %s", inputs)
vllm_commit = clone_vllm()
vllm_sha_url = f"${vllm_commit}](https://github.com/vllm-project/vllm/commit/${vllm_commit})"
write_gh_step_summary(
f"""
## Commit Info
- **Vllm Commit**: `{vllm_sha_url}`
- **Torch Version**: `{torch_version}`
"""
)
self.cp_dockerfile_if_exist(inputs)
# cp torch wheels from root direct to vllm workspace if exist
self.cp_torch_whls_if_exist(inputs)
@ -181,19 +192,26 @@ class VllmBuildRunner(BaseRunner):
if not gh_summary_path():
return logger.info("Skipping, not detect GH Summary env var....")
logger.info("Generate GH Summary ...")
# summarize vllm build info
summarize_build_info(vllm_commit)
# summarize vllm build artifacts
vllm_sha_url = f"[{vllm_commit}](https://github.com/vllm-project/vllm/commit/{vllm_commit})"
write_gh_step_summary(
f"""
## Build vllm against Pytorch CI
**Vllm Commit**: `{vllm_sha_url}`
"""
)
torch_sha = os.getenv("GITHUB_SHA")
if torch_sha: # only can grab this in github action
torch_sha_url = (
f"[{torch_sha}](https://github.com/pytorch/pytorch/commit/{torch_sha})]"
)
write_gh_step_summary(
f"""
**Pytorch Commit**: `{torch_sha_url}`
"""
)
vllm_artifact_dir = inputs.output_dir / "wheels"
summarize_content_from_file(
vllm_artifact_dir,
"build_summary.txt",
title="Vllm build env pip package summary",
)
summarize_wheels(
inputs.torch_whls_path, max_depth=3, title="Torch Wheels Artifacts"
)
summarize_content_from_file(vllm_artifact_dir, "build_summary.txt", title="Vllm build package summary")
summarize_wheels(inputs.torch_whls_path, max_depth=3, title="Torch Wheels Artifacts")
summarize_wheels(vllm_artifact_dir, max_depth=3, title="Vllm Wheels Artifacts")
def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str:

View File

@ -215,6 +215,7 @@ def preprocess_test_in(
"torchaudio",
"xformers",
"mamba_ssm",
"pybind11",
] + additional_package_to_move
# Read current requirements
target_path = Path(target_file)

View File

@ -300,3 +300,24 @@ except RuntimeError as e:
exit 1
fi
fi
###############################################################################
# Check for C++ ABI compatibility to GCC-11 - GCC 13
###############################################################################
if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then
pushd /tmp
# Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html
# gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19
# gcc 11 - CUDA 11.8, xpu, rocm
# gcc 13 - CUDA 12.6, 12.8 and cpu
# Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426
if [[ "$(uname -m)" == "s390x" ]]; then
cxx_abi="19"
elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then
cxx_abi="18"
else
cxx_abi="16"
fi
python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)"
popd
fi
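For reference, the same ABI tag the script asserts on can be inspected interactively; a minimal sketch (the printed value is illustrative):

    import torch
    # e.g. "_cxxabi1018" for a GCC-13 build, "_cxxabi1016" for GCC-11 (per the mapping above)
    print(torch._C._PYBIND11_BUILD_ABI)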

View File

@ -35,10 +35,11 @@ fi
print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
# Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
# backends (specifically the gloo backend), so test that this case works too
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then

View File

@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd
python -mpip install -r requirements.txt
# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1
python -mpip install --no-input -r requirements.txt
setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
# This environment variable makes ProcessGroupGloo default to
@ -306,47 +302,6 @@ test_torchbench_smoketest() {
fi
done
echo "Pytorch benchmark on mps device completed"
}
test_aoti_torchbench_smoketest() {
print_cmake_info
echo "Launching AOTInductor torchbench setup"
pip_benchmark_deps
# shellcheck disable=SC2119,SC2120
torchbench_setup_macos
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
local device=mps
local dtypes=(undefined float16 bfloat16 notset)
local dtype=${dtypes[$1]}
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}"
local dtype_arg="--${dtype}"
if [ "$dtype" == notset ]; then
dtype_arg="--float32"
fi
touch "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv"
for model in "${models[@]}"; do
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
--performance --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv" || true
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
--accuracy --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_accuracy.csv" || true
done
echo "Launching HuggingFace inference performance run for AOT Inductor and dtype ${dtype}"
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
--performance --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_performance.csv" || true
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
--accuracy --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_accuracy.csv" || true
echo "Pytorch benchmark on mps device completed"
}
@ -395,8 +350,6 @@ elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then
test_timm_perf
elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then
test_torchbench_smoketest "${SHARD_NUMBER}"
elif [[ $TEST_CONFIG == *"aot_inductor_perf_smoketest"* ]]; then
test_aoti_torchbench_smoketest "${SHARD_NUMBER}"
elif [[ $TEST_CONFIG == *"mps"* ]]; then
test_python_mps
elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then

View File

@ -496,14 +496,6 @@ test_inductor_cpp_wrapper_shard() {
-k 'take' \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
python test/run_test.py \
--include inductor/test_mkldnn_pattern_matcher \
-k 'xpu' \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
fi
}
# "Global" flags for inductor benchmarking controlled by TEST_CONFIG

View File

@ -44,7 +44,7 @@ python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==
python -m pip install z3-solver==4.15.1.0
# Install tlparse for test\dynamo\test_structured_trace.py UTs.
python -m pip install tlparse==0.4.0
python -m pip install tlparse==0.3.30
# Install parameterized
python -m pip install parameterized==0.8.1

View File

@ -13,9 +13,9 @@ if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build"
:xpu_bundle_install_start
set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe
set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product
set XPU_BUNDLE_VERSION=2025.1.3+5
set XPU_BUNDLE_VERSION=2025.0.1+20
set XPU_BUNDLE_INSTALLED=0
set XPU_BUNDLE_UNINSTALL=0
set XPU_EXTRA_URL=NULL
@ -24,9 +24,9 @@ set XPU_EXTRA_VERSION=2025.0.1+1226
set XPU_EXTRA_INSTALLED=0
set XPU_EXTRA_UNINSTALL=0
if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.2] (
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
set XPU_BUNDLE_VERSION=2025.2.1+20
if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.1] (
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
set XPU_BUNDLE_VERSION=2025.1.3+5
)
:: Check if XPU bundle is target version or already installed
@ -90,3 +90,14 @@ if errorlevel 1 exit /b 1
del xpu_extra.exe
:xpu_install_end
if not "%XPU_ENABLE_KINETO%"=="1" goto install_end
:: Install Level Zero SDK
set XPU_EXTRA_LZ_URL=https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip
curl -k -L %XPU_EXTRA_LZ_URL% --output "%SRC_DIR%\temp_build\level_zero_sdk.zip"
echo "Installing level zero SDK..."
7z x "%SRC_DIR%\temp_build\level_zero_sdk.zip" -o"%SRC_DIR%\temp_build\level_zero"
set "INCLUDE=%SRC_DIR%\temp_build\level_zero\include;%INCLUDE%"
del "%SRC_DIR%\temp_build\level_zero_sdk.zip"
:install_end

View File

@ -213,8 +213,7 @@ pip install requests ninja typing-extensions
retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
retry brew install libomp
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is built as part of the tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is built as part of the tensorpipe submodule
export USE_DISTRIBUTED=1
export USE_MKLDNN=OFF

View File

@ -75,8 +75,8 @@ TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries.
if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then
# CUDA 12.9 builds have triton for Linux and Linux aarch64 binaries.
if [[ "$DESIRED_CUDA" == "cu129" ]]; then
TRITON_CONSTRAINT="platform_system == 'Linux'"
fi

View File

@ -15,7 +15,8 @@ fi
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
export VC_YEAR=2022
export USE_SCCACHE=0
export XPU_VERSION=2025.2
export XPU_VERSION=2025.1
export XPU_ENABLE_KINETO=1
fi
echo "Free space on filesystem before build:"

View File

@ -8,7 +8,7 @@ export VC_YEAR=2022
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
export VC_YEAR=2022
export XPU_VERSION=2025.2
export XPU_VERSION=2025.1
fi
pushd "$PYTORCH_ROOT/.ci/pytorch/"

View File

@ -62,6 +62,8 @@ runs:
MAX_JOBS="$(nproc --ignore=6)"
export MAX_JOBS
echo "$GITHUB_STEP_SUMMARY"
# Split the comma-separated list and build each target
IFS=',' read -ra TARGETS <<< "$BUILD_TARGETS"
for target in "${TARGETS[@]}"; do

View File

@ -57,21 +57,6 @@ runs:
submodules: ${{ inputs.submodules }}
show-progress: false
- name: Clean submodules post checkout
id: clean-submodules
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
shell: bash
env:
NO_SUDO: ${{ inputs.no-sudo }}
run: |
cd "${GITHUB_WORKSPACE}"
# Clean stale submodule dirs
if [ -z "${NO_SUDO}" ]; then
sudo git submodule foreach --recursive git clean -ffdx
else
git submodule foreach --recursive git clean -ffdx
fi
- name: Clean workspace (try again)
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' &&
(steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }}

View File

@ -1 +1 @@
0757bbb660855272f7dd8d31cc84e7c631522805
10a5002c6195bd95e34df8fe28ff8a2d55a2a922

View File

@ -1 +1 @@
862f2ef893d9751db0a92bd2d4ae0e3d9677872f
add1adfec742dfb13e614dab3372b5aafd1ff046

View File

@ -1 +1 @@
763e5b78d4fcd74a9e812256656c075f99d9a781
a1c6ee92c85e8b0955c20892ed68f032a6015c09

View File

@ -359,7 +359,7 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Build flashinfer for torch nightly from source around 10 mins
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
ARG FLASHINFER_GIT_REF="v0.2.14.post1"
ARG FLASHINFER_GIT_REF="v0.2.14.post1"
RUN --mount=type=cache,target=/root/.cache/uv \
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \

View File

@ -28,7 +28,7 @@ pyyaml==6.0.2
scipy==1.12.0
setuptools==72.1.0
sympy==1.13.3
tlparse==0.4.0
tlparse==0.3.30
tensorboard==2.13.0
typing-extensions==4.12.2
unittest-xml-reporting<=3.2.0,>=2.0.0

View File

@ -40,7 +40,7 @@ CPU_AARCH64_ARCH = ["cpu-aarch64"]
CPU_S390X_ARCH = ["cpu-s390x"]
CUDA_AARCH64_ARCHES = ["12.9-aarch64", "13.0-aarch64"]
CUDA_AARCH64_ARCHES = ["12.9-aarch64"]
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
@ -113,26 +113,26 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"xpu": (
"intel-cmplr-lib-rt==2025.2.1 | "
"intel-cmplr-lib-ur==2025.2.1 | "
"intel-cmplr-lic-rt==2025.2.1 | "
"intel-sycl-rt==2025.2.1 | "
"oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"onemkl-sycl-blas==2025.2.0 | "
"onemkl-sycl-dft==2025.2.0 | "
"onemkl-sycl-lapack==2025.2.0 | "
"onemkl-sycl-rng==2025.2.0 | "
"onemkl-sycl-sparse==2025.2.0 | "
"dpcpp-cpp-rt==2025.2.1 | "
"intel-opencl-rt==2025.2.1 | "
"mkl==2025.2.0 | "
"intel-openmp==2025.2.1 | "
"tbb==2022.2.0 | "
"tcmlib==1.4.0 | "
"umf==0.11.0 | "
"intel-pti==0.13.1"
"intel-cmplr-lib-rt==2025.1.1 | "
"intel-cmplr-lib-ur==2025.1.1 | "
"intel-cmplr-lic-rt==2025.1.1 | "
"intel-sycl-rt==2025.1.1 | "
"oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"onemkl-sycl-blas==2025.1.0 | "
"onemkl-sycl-dft==2025.1.0 | "
"onemkl-sycl-lapack==2025.1.0 | "
"onemkl-sycl-rng==2025.1.0 | "
"onemkl-sycl-sparse==2025.1.0 | "
"dpcpp-cpp-rt==2025.1.1 | "
"intel-opencl-rt==2025.1.1 | "
"mkl==2025.1.0 | "
"intel-openmp==2025.1.1 | "
"tbb==2022.1.0 | "
"tcmlib==1.3.0 | "
"umf==0.10.0 | "
"intel-pti==0.12.3"
),
}
@ -244,6 +244,8 @@ def generate_libtorch_matrix(
arches.remove("13.0")
elif os == "windows":
arches += CUDA_ARCHES
if "13.0" in arches:
arches.remove("13.0")
if libtorch_variants is None:
libtorch_variants = [
"shared-with-deps",
@ -308,6 +310,8 @@ def generate_wheels_matrix(
arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
elif os == "windows":
arches += CUDA_ARCHES + XPU_ARCHES
if "13.0" in arches:
arches.remove("13.0")
elif os == "linux-aarch64":
# Separate new if as the CPU type is different and
# uses different build/test scripts
@ -330,14 +334,13 @@ def generate_wheels_matrix(
else arch_version
)
# TODO: Enable python 3.13t on cpu-s390x
if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
continue
# TODO: Enable python 3.14 for rest
if os not in [
"linux",
"linux-aarch64",
"linux-s390x",
"macos-arm64",
"windows",
] and (python_version == "3.14" or python_version == "3.14t"):
if os not in ["linux", "linux-aarch64", "macos-arm64", "windows"] and (
python_version == "3.14" or python_version == "3.14t"
):
continue
# cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install

View File

@ -22,7 +22,7 @@ LABEL_CIFLOW_BINARIES = "ciflow/binaries"
LABEL_CIFLOW_PERIODIC = "ciflow/periodic"
LABEL_CIFLOW_BINARIES_LIBTORCH = "ciflow/binaries_libtorch"
LABEL_CIFLOW_BINARIES_WHEEL = "ciflow/binaries_wheel"
LABEL_CIFLOW_ROCM = "ciflow/rocm-mi300"
LABEL_CIFLOW_ROCM = "ciflow/rocm"
@dataclass
@ -139,6 +139,8 @@ ROCM_SMOKE_WORKFLOWS = [
),
ciflow_config=CIFlowConfig(
labels={
LABEL_CIFLOW_BINARIES,
LABEL_CIFLOW_BINARIES_WHEEL,
LABEL_CIFLOW_ROCM,
},
isolated_workflow=True,

View File

@ -27,7 +27,6 @@ from trymerge import (
get_drci_classifications,
gh_get_team_members,
GitHubPR,
iter_issue_timeline_until_comment,
JobCheckState,
main as trymerge_main,
MandatoryChecksMissingError,
@ -35,8 +34,6 @@ from trymerge import (
RE_GHSTACK_DESC,
read_merge_rules,
remove_job_name_suffix,
sha_from_committed_event,
sha_from_force_push_after,
validate_revert,
)
@ -127,7 +124,7 @@ def mock_parse_args(revert: bool = False, force: bool = False) -> Any:
self.force = force
self.pr_num = 76123
self.dry_run = True
self.comment_id = 12345 # Set to non-zero value
self.comment_id = 0
self.reason = "this is for testing"
self.ignore_current = False
self.check_mergeability = False
@ -155,9 +152,9 @@ def mock_revert(
def mock_merge(
pr: GitHubPR,
repo: GitRepo,
comment_id: int,
dry_run: bool = False,
skip_mandatory_checks: bool = False,
comment_id: Optional[int] = None,
timeout_minutes: int = 400,
stale_pr_days: int = 3,
ignore_current: bool = False,
@ -473,9 +470,9 @@ class TestTryMerge(TestCase):
mock_merge.assert_called_once_with(
mock.ANY,
mock.ANY,
comment_id=mock.ANY,
dry_run=mock.ANY,
skip_mandatory_checks=True,
comment_id=mock.ANY,
ignore_current=False,
)
@ -488,9 +485,9 @@ class TestTryMerge(TestCase):
mock_merge.assert_called_once_with(
mock.ANY,
mock.ANY,
comment_id=mock.ANY,
dry_run=mock.ANY,
skip_mandatory_checks=False,
comment_id=mock.ANY,
ignore_current=False,
)
@ -1141,176 +1138,5 @@ Pull Request resolved: https://github.com/pytorch/pytorch/pull/154394"""
)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch(
"trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
)
class TestTimelineFunctions(TestCase):
"""Tests for the new timeline-related functions"""
def test_sha_from_committed_event(self, *args: Any) -> None:
"""Test extracting SHA from committed event"""
# Based on actual GitHub API format - committed events have "sha" at top level
event = {
"event": "committed",
"sha": "fb21ce932ded6670c918804a0d9151b773770a7c",
}
self.assertEqual(
sha_from_committed_event(event), "fb21ce932ded6670c918804a0d9151b773770a7c"
)
# Test with missing SHA
event_no_sha = {"event": "committed"}
self.assertIsNone(sha_from_committed_event(event_no_sha))
def test_sha_from_force_push_after(self, *args: Any) -> None:
"""Test extracting SHA from force push event"""
# NOTE: The current function doesn't handle the actual GitHub API format
# Real force push events have "commit_id" at top level, but this function
# looks for "after", "after_commit", "after_sha", or "head_sha" fields
# Test with the legacy format the current function handles
event_legacy = {
"event": "head_ref_force_pushed",
"after": {"sha": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e"},
}
self.assertEqual(
sha_from_force_push_after(event_legacy),
"ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
)
# Test with current GitHub API format (should return None with current implementation)
event_real_api = {
"event": "head_ref_force_pushed",
"commit_id": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
}
self.assertEqual(
sha_from_force_push_after(event_real_api),
"ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
) # Current function doesn't handle commit_id
# Test with missing SHA
event_no_sha = {"event": "head_ref_force_pushed"}
self.assertIsNone(sha_from_force_push_after(event_no_sha))
@mock.patch("trymerge.gh_fetch_json_list")
def test_iter_issue_timeline_until_comment(
self, mock_gh_fetch_json_list: Any, *args: Any
) -> None:
"""Test timeline iteration until target comment"""
# Mock timeline data based on actual GitHub API format
timeline_data = [
{"event": "commented", "id": 100, "body": "first comment"},
{"event": "committed", "sha": "fb21ce932ded6670c918804a0d9151b773770a7c"},
{"event": "commented", "id": 200, "body": "target comment"},
{"event": "commented", "id": 300, "body": "after target"},
]
mock_gh_fetch_json_list.return_value = timeline_data
# Test iteration stops at target comment
events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 200))
self.assertEqual(len(events), 3) # Should stop at target comment
self.assertEqual(events[0]["event"], "commented")
self.assertEqual(events[0]["id"], 100)
self.assertEqual(events[1]["event"], "committed")
self.assertEqual(events[1]["sha"], "fb21ce932ded6670c918804a0d9151b773770a7c")
self.assertEqual(events[2]["event"], "commented")
self.assertEqual(events[2]["id"], 200)
@mock.patch("trymerge.gh_fetch_json_list")
def test_iter_issue_timeline_until_comment_not_found(
self, mock_gh_fetch_json_list: Any, *args: Any
) -> None:
"""Test timeline iteration when target comment is not found"""
# Mock empty timeline
mock_gh_fetch_json_list.return_value = []
events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 999))
self.assertEqual(len(events), 0)
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_commit_after_comment(
self, mock_iter_timeline: Any, *args: Any
) -> None:
"""Test get_commit_sha_at_comment returns correct SHA after comment"""
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "committed", "sha": "commit2"},
{"event": "commented", "id": 100},
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertEqual(sha, "commit2")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_force_push_before_comment(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "committed", "sha": "commit2"},
{"event": "head_ref_force_pushed", "commit_id": "commit3"},
{"event": "commented", "id": 100},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertEqual(sha, "commit3")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_force_push_before_comment_legacy_mode(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "committed", "sha": "commit2"},
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
{"event": "commented", "id": 100},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertEqual(sha, "commit3")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_multiple_comments(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "commented", "id": 100},
{"event": "committed", "sha": "commit2"},
{"event": "commented", "id": 200},
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
{"event": "commented", "id": 300},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(200)
self.assertEqual(sha, "commit2")
sha = pr.get_commit_sha_at_comment(300)
self.assertEqual(sha, "commit3")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_no_events(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "commented", "id": 100},
{"event": "labeled", "label": {"name": "test"}},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertIsNone(sha)
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_exception(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.side_effect = Exception("API error")
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertIsNone(sha)
if __name__ == "__main__":
main()

View File

@ -450,63 +450,6 @@ HAS_NO_CONNECTED_DIFF_TITLE = (
IGNORABLE_FAILED_CHECKS_THESHOLD = 10
def iter_issue_timeline_until_comment(
org: str, repo: str, issue_number: int, target_comment_id: int, max_pages: int = 200
) -> Any:
"""
Yield timeline entries in order until (and including) the entry whose id == target_comment_id
for a 'commented' event. Stops once the target comment is encountered.
"""
page = 1
while page <= max_pages:
url = (
f"https://api.github.com/repos/{org}/{repo}/issues/{issue_number}/timeline"
)
params = {"per_page": 100, "page": page}
batch = gh_fetch_json_list(url, params)
if not batch:
return
for ev in batch:
# The target is the issue comment row with event == "commented" and id == issue_comment_id
if ev.get("event") == "commented" and ev.get("id") == target_comment_id:
yield ev # nothing in the timeline after this matters, so stop early
return
yield ev
if len(batch) < 100:
return
page += 1
# If we got here without finding the comment, then we either hit a bug or some github PR
# has a _really_ long timeline.
# The max # of pages found on any pytorch/pytorch PR at the time of this change was 41
raise RuntimeError(
f"Could not find a merge commit in the first {max_pages} pages of the timeline at url {url}."
f"This is most likely a bug, please report it to the @pytorch/pytorch-dev-infra team."
)
def sha_from_committed_event(ev: dict[str, Any]) -> Optional[str]:
"""Extract SHA from committed event in timeline"""
return ev.get("sha")
def sha_from_force_push_after(ev: dict[str, Any]) -> Optional[str]:
"""Extract SHA from force push event in timeline"""
# The current GitHub API format
commit_id = ev.get("commit_id")
if commit_id:
return str(commit_id)
# Legacy format
after = ev.get("after") or ev.get("after_commit") or {}
if isinstance(after, dict):
return after.get("sha") or after.get("oid")
return ev.get("after_sha") or ev.get("head_sha")
def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any:
rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no)
return rc["data"]["repository"]["pullRequest"]
@ -794,24 +737,16 @@ class GitHubPR:
def last_commit(self) -> Any:
return self.info["commits"]["nodes"][-1]["commit"]
def last_commit_sha(self, default: Optional[str] = None) -> str:
# for commits, the oid is the sha
if default is None:
return str(self.last_commit()["oid"])
return str(self.last_commit().get("oid", default))
def get_merge_base(self) -> str:
if self.merge_base:
return self.merge_base
last_commit_sha = self.last_commit_sha()
last_commit_oid = self.last_commit()["oid"]
# NB: We could use self.base_ref() here for regular PR, however, that doesn't
# work for ghstack where the base is the custom branch, i.e. gh/USER/ID/base,
# so let's just use main instead
self.merge_base = gh_fetch_merge_base(
self.org, self.project, last_commit_sha, self.default_branch()
self.org, self.project, last_commit_oid, self.default_branch()
)
# Fallback to baseRefOid if the API call fails, i.e. rate limit. Note that baseRefOid
@ -900,44 +835,6 @@ class GitHubPR:
def get_commit_count(self) -> int:
return int(self.info["commits_with_authors"]["totalCount"])
def get_commit_sha_at_comment(self, comment_id: int) -> Optional[str]:
"""
Get the PR head commit SHA that was present when a specific comment was posted.
This ensures we only merge the state of the PR at the time the merge command was issued,
not any subsequent commits that may have been pushed after.
Returns None if no head-changing events found before the comment or if the comment was not found.
"""
head = None
try:
for event in iter_issue_timeline_until_comment(
self.org, self.project, self.pr_num, comment_id
):
etype = event.get("event")
if etype == "committed":
sha = sha_from_committed_event(event)
if sha:
head = sha
print(f"Timeline: Found commit event for SHA {sha}")
elif etype == "head_ref_force_pushed":
sha = sha_from_force_push_after(event)
if sha:
head = sha
print(f"Timeline: Found force push event for SHA {sha}")
elif etype == "commented":
if event.get("id") == comment_id:
print(f"Timeline: Found final comment with sha {sha}")
return head
except Exception as e:
print(
f"Warning: Failed to reconstruct timeline for comment {comment_id}: {e}"
)
return None
print(f"Did not find comment with id {comment_id} in the PR timeline")
return None
def get_pr_creator_login(self) -> str:
return cast(str, self.info["author"]["login"])
@ -1254,7 +1151,7 @@ class GitHubPR:
*,
skip_mandatory_checks: bool = False,
dry_run: bool = False,
comment_id: int,
comment_id: Optional[int] = None,
ignore_current_checks: Optional[list[str]] = None,
) -> None:
# Raises exception if matching rule is not found
@ -1270,7 +1167,7 @@ class GitHubPR:
skip_internal_checks=can_skip_internal_checks(self, comment_id),
ignore_current_checks=ignore_current_checks,
)
additional_merged_prs = self.merge_changes_locally(
additional_merged_prs = self.merge_changes(
repo, skip_mandatory_checks, comment_id
)
@ -1299,7 +1196,7 @@ class GitHubPR:
broken_trunk_checks=ignorable_checks.get("BROKEN_TRUNK", []),
flaky_checks=ignorable_checks.get("FLAKY", []),
unstable_checks=ignorable_checks.get("UNSTABLE", []),
last_commit_sha=self.last_commit_sha(default=""),
last_commit_sha=self.last_commit().get("oid", ""),
merge_base_sha=self.get_merge_base(),
merge_commit_sha=merge_commit_sha,
is_failed=False,
@ -1320,7 +1217,7 @@ class GitHubPR:
dry_run=dry_run,
)
def merge_changes_locally(
def merge_changes(
self,
repo: GitRepo,
skip_mandatory_checks: bool = False,
@ -1329,15 +1226,27 @@ class GitHubPR:
skip_all_rule_checks: bool = False,
) -> list["GitHubPR"]:
"""
:param skip_all_rule_checks: If true, skips all rule checks on ghstack PRs, useful for dry-running merge locally
:param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally
"""
branch_to_merge_into = self.default_branch() if branch is None else branch
if repo.current_branch() != branch_to_merge_into:
repo.checkout(branch_to_merge_into)
if not self.is_ghstack_pr():
msg = self.gen_commit_message()
pr_branch_name = f"__pull-request-{self.pr_num}__init__"
repo.fetch(self.last_commit()["oid"], pr_branch_name)
repo._run_git("merge", "--squash", pr_branch_name)
repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
# It's okay to skip the commit SHA check for ghstack PRs since
# authoring requires write access to the repo.
if self.is_ghstack_pr():
# Did the PR change since we started the merge?
pulled_sha = repo.show_ref(pr_branch_name)
latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
if pulled_sha != latest_pr_status.last_commit()["oid"]:
raise RuntimeError(
"PR has been updated since CI checks last passed. Please rerun the merge command."
)
return []
else:
return self.merge_ghstack_into(
repo,
skip_mandatory_checks,
@ -1345,48 +1254,6 @@ class GitHubPR:
skip_all_rule_checks=skip_all_rule_checks,
)
msg = self.gen_commit_message()
pr_branch_name = f"__pull-request-{self.pr_num}__init__"
# Determine which commit SHA to merge
commit_to_merge = None
if not comment_id:
raise ValueError("Must provide --comment-id when merging regular PRs")
# Get the commit SHA that was present when the comment was made
commit_to_merge = self.get_commit_sha_at_comment(comment_id)
if not commit_to_merge:
raise RuntimeError(
f"Could not find commit that was pushed before comment {comment_id}"
)
# Validate that this commit is the latest commit on the PR
latest_commit = self.last_commit_sha()
if commit_to_merge != latest_commit:
raise RuntimeError(
f"Commit {commit_to_merge} was HEAD when comment {comment_id} was posted "
f"but now the latest commit on the PR is {latest_commit}. "
f"Please re-issue the merge command to merge the latest commit."
)
print(f"Merging commit {commit_to_merge} locally")
repo.fetch(commit_to_merge, pr_branch_name)
repo._run_git("merge", "--squash", pr_branch_name)
repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
# Did the PR change since we started the merge?
pulled_sha = repo.show_ref(pr_branch_name)
latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
if (
pulled_sha != latest_pr_status.last_commit_sha()
or pulled_sha != commit_to_merge
):
raise RuntimeError(
"PR has been updated since CI checks last passed. Please rerun the merge command."
)
return []
class MergeRuleFailedError(RuntimeError):
def __init__(self, message: str, rule: Optional["MergeRule"] = None) -> None:
@ -1591,7 +1458,7 @@ def find_matching_merge_rule(
pending_checks = []
failed_checks = []
hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit_sha()}"
hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit()['oid']}"
if len(failed_checks) > 0:
if reject_reason_score < 30000:
reject_reason_score = 30000
@ -2289,14 +2156,14 @@ def categorize_checks(
def merge(
pr: GitHubPR,
repo: GitRepo,
comment_id: int,
dry_run: bool = False,
skip_mandatory_checks: bool = False,
comment_id: Optional[int] = None,
timeout_minutes: int = 400,
stale_pr_days: int = 3,
ignore_current: bool = False,
) -> None:
initial_commit_sha = pr.last_commit_sha()
initial_commit_sha = pr.last_commit()["oid"]
pr_link = f"https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num}"
print(f"Attempting merge of {initial_commit_sha} ({pr_link})")
@ -2367,7 +2234,7 @@ def merge(
f"Attempting merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} ({elapsed_time / 60} minutes elapsed)"
)
pr = GitHubPR(pr.org, pr.project, pr.pr_num)
if initial_commit_sha != pr.last_commit_sha():
if initial_commit_sha != pr.last_commit()["oid"]:
raise RuntimeError(
"New commits were pushed while merging. Please rerun the merge command."
)
@ -2534,7 +2401,7 @@ def main() -> None:
if args.check_mergeability:
if pr.is_ghstack_pr():
get_ghstack_prs(repo, pr) # raises error if out of sync
pr.merge_changes_locally(
pr.merge_changes(
repo,
skip_mandatory_checks=True,
skip_all_rule_checks=True,
@ -2549,18 +2416,12 @@ def main() -> None:
gh_post_pr_comment(org, project, args.pr_num, message, dry_run=args.dry_run)
return
try:
# Ensure comment id is set, else fail
if not args.comment_id:
raise ValueError(
"Comment ID is required for merging PRs, please provide it using --comment-id"
)
merge(
pr,
repo,
comment_id=args.comment_id,
dry_run=args.dry_run,
skip_mandatory_checks=args.force,
comment_id=args.comment_id,
ignore_current=args.ignore_current,
)
except Exception as e:
@ -2582,7 +2443,7 @@ def main() -> None:
broken_trunk_checks=[],
flaky_checks=[],
unstable_checks=[],
last_commit_sha=pr.last_commit_sha(default=""),
last_commit_sha=pr.last_commit().get("oid", ""),
merge_base_sha=pr.get_merge_base(),
is_failed=True,
skip_mandatory_checks=args.force,

View File

@ -4,7 +4,7 @@
{%- set download_artifact_action = "actions/download-artifact@v4.1.7" -%}
{%- set timeout_minutes = 240 -%}
{%- set timeout_minutes_windows_binary = 360 -%}
{%- set timeout_minutes_windows_binary = 300 -%}
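This template constant drives the timeout-minutes values in the generated Windows binary workflows later in this diff, which is why those workflows flip between 300 and 360 in the same pattern. A rough sketch of how the generator could substitute it, assuming a Jinja environment configured for the !{{ ... }} delimiters used in these templates (the actual generator options may differ):

from jinja2 import Environment

env = Environment(variable_start_string="!{{", variable_end_string="}}")
template = env.from_string("timeout-minutes: !{{ timeout_minutes_windows_binary }}")
print(template.render(timeout_minutes_windows_binary=360))  # -> timeout-minutes: 360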
{%- macro concurrency(build_environment) -%}
concurrency:

View File

@ -135,7 +135,7 @@ jobs:
contents: read
steps:
- name: Setup XPU
uses: pytorch/pytorch/.github/actions/setup-xpu@main
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v4
@ -171,7 +171,7 @@ jobs:
- name: Teardown XPU
uses: ./.github/actions/teardown-xpu
{%- else %}
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config) }}
steps:

View File

@ -77,7 +77,6 @@ jobs:
run: |
git config --global core.longpaths true
git config --global core.symlinks true
git config --global core.ignorecase false
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported

View File

@ -70,7 +70,6 @@ jobs:
run: |
git config --global core.longpaths true
git config --global core.symlinks true
git config --global core.ignorecase false
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported

View File

@ -275,7 +275,7 @@ jobs:
- name: Change permissions
if: ${{ always() && steps.test.conclusion }}
run: |
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1000:1000 test"
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"
- name: Print remaining test logs
shell: bash

View File

@ -145,7 +145,7 @@ jobs:
fi
docker exec -t "${container_name}" yum install -y zlib-devel zip
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==3.0.1 auditwheel wheel
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==2.13.1 auditwheel wheel
set +e
docker exec -t "${container_name}" command -v pip
has_pip=$?
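The pip command above pins the build-time Python tooling inside the manylinux container; the paired lines move pybind11 between the 2.13 and 3.0 lines while keeping setuptools at 78.1.0. A quick sanity check that could be run inside the container after installation (a sketch; the CI scripts do not necessarily do this):

import pybind11
import setuptools

print("pybind11", pybind11.__version__)
print("setuptools", setuptools.__version__)
assert pybind11.__version__.startswith("3."), pybind11.__version__  # expecting the 3.0.1 pin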

View File

@ -50,23 +50,24 @@ jobs:
runner: [linux.12xlarge]
docker-image-name: [
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,
pytorch-linux-jammy-py3.9-clang12,
pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-rocm-n-py3,
pytorch-linux-noble-rocm-n-py3,
pytorch-linux-noble-rocm-alpha-py3,
pytorch-linux-jammy-rocm-n-py3-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12,
pytorch-linux-jammy-py3.10-gcc11,
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
pytorch-linux-jammy-py3.9-gcc11,
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,
pytorch-linux-jammy-py3.12-halide,
pytorch-linux-jammy-xpu-n-1-py3,
pytorch-linux-jammy-xpu-n-py3,
pytorch-linux-jammy-xpu-2025.0-py3,
pytorch-linux-jammy-xpu-2025.1-py3,
pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
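The docker-image-name matrix above gains a CUDA 13.0 image and the xpu-n images while several py3.9 entries move to py3.10 (the paired old/new lines show both states); each entry becomes one image-build job on the listed runner. A small illustrative sketch of reading the CUDA version back out of these names for grouping or reporting:

import re

images = [
    "pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11",
    "pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11",
    "pytorch-linux-jammy-py3.10-clang12",
]
for name in images:
    match = re.search(r"cuda(\d+\.\d+)", name)
    print(name, "->", match.group(1) if match else "cpu-only")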

View File

@ -158,52 +158,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
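The PYTORCH_EXTRA_INSTALL_REQUIREMENTS value in each cuda-aarch64-13_0 job shown here is a single string of PEP 508 requirements joined with " | ", each gated on an x86_64 Linux environment marker. A hedged sketch of splitting such a string and evaluating its markers with the packaging library (the CI tooling may parse it differently):

from packaging.requirements import Requirement

extra = (
    "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64'"
)
for spec in extra.split(" | "):
    req = Requirement(spec)
    applies = req.marker.evaluate() if req.marker else True
    print(f"{req.name}{req.specifier} applies={applies}")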
manywheel-py3_11-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -315,52 +269,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -472,52 +380,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -629,52 +491,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -786,52 +602,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -943,52 +713,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -1099,49 +823,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@ -342,7 +342,7 @@ jobs:
needs:
- libtorch-rocm6_3-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -456,7 +456,7 @@ jobs:
needs:
- libtorch-rocm6_4-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch

View File

@ -398,7 +398,7 @@ jobs:
needs:
- manywheel-py3_10-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -509,7 +509,7 @@ jobs:
needs:
- manywheel-py3_10-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -612,7 +612,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-xpu-test: # Testing
@ -638,7 +638,7 @@ jobs:
contents: read
steps:
- name: Setup XPU
uses: pytorch/pytorch/.github/actions/setup-xpu@main
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v4
@ -1056,7 +1056,7 @@ jobs:
needs:
- manywheel-py3_11-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -1167,7 +1167,7 @@ jobs:
needs:
- manywheel-py3_11-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -1270,7 +1270,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-xpu-test: # Testing
@ -1296,7 +1296,7 @@ jobs:
contents: read
steps:
- name: Setup XPU
uses: pytorch/pytorch/.github/actions/setup-xpu@main
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v4
@ -1714,7 +1714,7 @@ jobs:
needs:
- manywheel-py3_12-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -1825,7 +1825,7 @@ jobs:
needs:
- manywheel-py3_12-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -1928,7 +1928,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-xpu-test: # Testing
@ -1954,7 +1954,7 @@ jobs:
contents: read
steps:
- name: Setup XPU
uses: pytorch/pytorch/.github/actions/setup-xpu@main
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v4
@ -2372,7 +2372,7 @@ jobs:
needs:
- manywheel-py3_13-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -2483,7 +2483,7 @@ jobs:
needs:
- manywheel-py3_13-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -2586,7 +2586,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-xpu-test: # Testing
@ -2612,7 +2612,7 @@ jobs:
contents: read
steps:
- name: Setup XPU
uses: pytorch/pytorch/.github/actions/setup-xpu@main
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v4
@ -3030,7 +3030,7 @@ jobs:
needs:
- manywheel-py3_13t-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -3141,7 +3141,7 @@ jobs:
needs:
- manywheel-py3_13t-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -3244,7 +3244,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-xpu-test: # Testing
@ -3270,7 +3270,7 @@ jobs:
contents: read
steps:
- name: Setup XPU
uses: pytorch/pytorch/.github/actions/setup-xpu@main
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v4
@ -3688,7 +3688,7 @@ jobs:
needs:
- manywheel-py3_14-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -3799,7 +3799,7 @@ jobs:
needs:
- manywheel-py3_14-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -3902,7 +3902,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-xpu-test: # Testing
@ -3928,7 +3928,7 @@ jobs:
contents: read
steps:
- name: Setup XPU
uses: pytorch/pytorch/.github/actions/setup-xpu@main
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v4
@ -4346,7 +4346,7 @@ jobs:
needs:
- manywheel-py3_14t-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -4457,7 +4457,7 @@ jobs:
needs:
- manywheel-py3_14t-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -4560,7 +4560,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-xpu-test: # Testing
@ -4586,7 +4586,7 @@ jobs:
contents: read
steps:
- name: Setup XPU
uses: pytorch/pytorch/.github/actions/setup-xpu@main
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v4
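Three edits repeat throughout this workflow: the ROCm test jobs flip between the linux.rocm.gpu.mi250 and linux.rocm.gpu.gfx942.1 runner labels, the XPU test jobs flip between the in-repo ./.github/actions/setup-xpu action and the pinned pytorch/pytorch/.github/actions/setup-xpu@main reference, and the XPU requirement pins move between the oneAPI 2025.1 and 2025.2 lines. A small sketch that diffs a few of those pins, abbreviated from the full lists above (illustrative only):

old = {"intel-sycl-rt": "2025.1.1", "mkl": "2025.1.0", "intel-pti": "0.12.3", "umf": "0.10.0"}
new = {"intel-sycl-rt": "2025.2.1", "mkl": "2025.2.0", "intel-pti": "0.13.1", "umf": "0.11.0"}
for pkg in sorted(old):
    if old[pkg] != new[pkg]:
        print(f"{pkg}: {old[pkg]} -> {new[pkg]}")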

View File

@ -10,7 +10,9 @@ on:
branches:
- main
tags:
- 'ciflow/rocm-mi300/*'
- 'ciflow/binaries/*'
- 'ciflow/binaries_wheel/*'
- 'ciflow/rocm/*'
workflow_dispatch:
permissions:
@ -67,7 +69,7 @@ jobs:
needs:
- manywheel-py3_9-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch

View File

@ -302,195 +302,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cpu-s390x-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
DESIRED_PYTHON: "3.13t"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
timeout-minutes: 420
build_name: manywheel-py3_13t-cpu-s390x
build_environment: linux-s390x-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cpu-s390x-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_13t-cpu-s390x-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cpu-s390x
build_environment: linux-s390x-binary-manywheel
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cpu-s390x-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cpu-s390x-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cpu-s390x
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cpu-s390x-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
DESIRED_PYTHON: "3.14"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
timeout-minutes: 420
build_name: manywheel-py3_14-cpu-s390x
build_environment: linux-s390x-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cpu-s390x-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_14-cpu-s390x-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cpu-s390x
build_environment: linux-s390x-binary-manywheel
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cpu-s390x-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cpu-s390x-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cpu-s390x
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cpu-s390x-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
DESIRED_PYTHON: "3.14t"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
timeout-minutes: 420
build_name: manywheel-py3_14t-cpu-s390x
build_environment: linux-s390x-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cpu-s390x-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_14t-cpu-s390x-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cpu-s390x
build_environment: linux-s390x-binary-manywheel
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cpu-s390x-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cpu-s390x-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cpu-s390x
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
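Each s390x Python version above follows the same three-job chain: a -build job, a -test job that needs the build, and an -upload job that needs the test, all sharing the manywheel-py<version>-cpu-s390x name stem. A compact sketch of that naming and dependency pattern (illustrative only):

def s390x_chain(python_version: str) -> dict[str, list[str]]:
    base = f"manywheel-py{python_version.replace('.', '_')}-cpu-s390x"
    return {
        f"{base}-build": [],
        f"{base}-test": [f"{base}-build"],
        f"{base}-upload": [f"{base}-test"],
    }

print(s390x_chain("3.14t"))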

View File

@ -51,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -128,7 +128,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -51,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -128,7 +128,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -51,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -124,7 +124,7 @@ jobs:
- wheel-py3_11-cpu-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -198,7 +198,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -271,7 +271,7 @@ jobs:
- wheel-py3_12-cpu-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -345,7 +345,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -418,7 +418,7 @@ jobs:
- wheel-py3_13-cpu-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel

View File

@ -38,7 +38,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -153,7 +153,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -45,7 +45,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -160,7 +160,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -292,7 +292,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -408,7 +408,7 @@ jobs:
- libtorch-cuda12_6-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -542,7 +542,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -658,7 +658,7 @@ jobs:
- libtorch-cuda12_8-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -792,7 +792,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -908,7 +908,7 @@ jobs:
- libtorch-cuda12_9-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -1038,253 +1038,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda13_0-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even if the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda13_0-shared-with-deps-debug
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-debug-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda13_0-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even if the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda13_0-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-debug-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda13_0-shared-with-deps-debug-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda13_0-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
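For reference, the build and test jobs above hand the compiled binary off through a named artifact: actions/upload-artifact in the build job and actions/download-artifact in the test job must use the same name for the files to be found. A minimal sketch of that handoff, using a hypothetical artifact name rather than the real one:

jobs:
  build:
    runs-on: windows-2022
    steps:
      - name: Produce a file to ship
        shell: bash
        run: echo "binary" > placeholder.txt
      - uses: actions/upload-artifact@v4.4.0
        with:
          name: libtorch-example            # hypothetical name; must match the download step
          path: placeholder.txt
  test:
    needs: build
    runs-on: windows-2022
    steps:
      - uses: actions/download-artifact@v4.1.7
        with:
          name: libtorch-example            # same name retrieves what the build job uploaded
          path: downloaded
      - name: Use the artifact
        shell: bash
        run: ls downloaded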

View File

@ -38,7 +38,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -153,7 +153,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -45,7 +45,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -160,7 +160,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -292,7 +292,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -408,7 +408,7 @@ jobs:
- libtorch-cuda12_6-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -542,7 +542,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -658,7 +658,7 @@ jobs:
- libtorch-cuda12_8-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -792,7 +792,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -908,7 +908,7 @@ jobs:
- libtorch-cuda12_9-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -1038,253 +1038,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda13_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even if the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda13_0-shared-with-deps-release
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda13_0-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even if the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda13_0-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda13_0-shared-with-deps-release-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda13_0-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
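The "Populate binary env" steps in the jobs above rely on the GITHUB_ENV file: appending KEY=value lines to the file that the GITHUB_ENV variable points at makes those values available as environment variables in every later step of the same job. A small sketch of the mechanism, with illustrative variable names:

jobs:
  demo:
    runs-on: ubuntu-latest
    steps:
      - name: Export a variable for later steps
        shell: bash
        run: |
          # Each appended line becomes an environment variable in subsequent steps
          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
      - name: Read it back
        shell: bash
        run: |
          # Available here, but not in the step that wrote it
          echo "env file path is ${BINARY_ENV_FILE}"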

File diff suppressed because it is too large

View File

@ -18,13 +18,13 @@ permissions:
contents: read
jobs:
inductor-build:
linux-jammy-cpu-py3_9-gcc11-inductor-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: inductor-build
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
# Use metal host for benchmark jobs
test-matrix: |
{ include: [
@ -32,13 +32,13 @@ jobs:
]}
secrets: inherit
inductor-micro-benchmark-test:
name: inductor-micro-benchmark-test
linux-jammy-cpu-py3_9-gcc11-inductor-micro-benchmark-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit

View File

@ -32,13 +32,13 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
nightly-dynamo-benchmarks-build:
name: nightly-dynamo-benchmarks-build
linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build:
name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
{ include: [
@ -51,13 +51,13 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
nightly-dynamo-benchmarks-test:
name: nightly-dynamo-benchmarks-test
linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-test:
name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: nightly-dynamo-benchmarks-build
needs: linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit

View File

@ -84,8 +84,9 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
# NB: Keep this in sync with trunk.yml
build:
name: build
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -127,7 +128,7 @@ jobs:
secrets: inherit
test-periodically:
name: test-periodically
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '15 0,12 * * 1-6'
@ -144,7 +145,7 @@ jobs:
secrets: inherit
test-weekly:
name: test-weekly
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
@ -161,12 +162,9 @@ jobs:
secrets: inherit
test:
name: test
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
# The pull_request trigger is used in PRs that bump the transformers pin, which always
# need one round of benchmarks
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}

View File

@ -48,9 +48,6 @@ jobs:
{ config: "perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" },
{ config: "perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" },
{ config: "perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" },
{ config: "aot_inductor_perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" },
{ config: "aot_inductor_perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" },
{ config: "aot_inductor_perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" },
]}
secrets: inherit

View File

@ -69,14 +69,14 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-zen-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cpu_x86_zen", shard: 1, num_shards: 3, runner: "linux.24xlarge.amd" },
@ -95,16 +95,16 @@ jobs:
selected-test-configs: ${{ inputs.benchmark_configs }}
secrets: inherit
inductor-test-nightly:
name: inductor-test-nightly
linux-jammy-zen-cpu-py3_9-gcc11-inductor-test-nightly:
name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false
@ -112,16 +112,17 @@ jobs:
monitor-data-collect-interval: 4
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-zen-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false

View File

@ -74,14 +74,14 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xl.spr-metal" },
@ -101,16 +101,16 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-test-nightly-freezing:
name: inductor-test-nightly-freezing
linux-jammy-cpu-py3_9-gcc11-inductor-test-nightly-freezing:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false
@ -118,16 +118,16 @@ jobs:
monitor-data-collect-interval: 4
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false

View File

@ -79,6 +79,7 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
# NB: Keep this in sync with trunk.yml
build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml

View File

@ -31,8 +31,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
periodic-dynamo-benchmarks-build:
name: periodic-dynamo-benchmarks-build
linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build:
name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
@ -57,33 +57,23 @@ jobs:
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
periodic-dynamo-benchmarks-test:
name: periodic-dynamo-benchmarks-test
linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-test:
name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
rocm-periodic-dynamo-benchmarks-build:
linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build:
if: github.repository_owner == 'pytorch'
name: rocm-periodic-dynamo-benchmarks-build
name: rocm-py3_10-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
@ -109,21 +99,21 @@ jobs:
]}
secrets: inherit
rocm-periodic-dynamo-benchmarks-test:
linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-test:
permissions:
id-token: write
contents: read
name: rocm-periodic-dynamo-benchmarks-test
name: rocm-py3_10-periodic-dynamo-benchmarks
uses: ./.github/workflows/_rocm-test.yml
needs: rocm-periodic-dynamo-benchmarks-build
needs: linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-rocm-py3_10
docker-image: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
inductor-smoke-build:
name: inductor-smoke-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
@ -139,23 +129,23 @@ jobs:
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
inductor-smoke-test:
name: inductor-smoke-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-test:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: inductor-smoke-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.inductor-smoke-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.test-matrix }}
secrets: inherit
periodic-dynamo-benchmarks-cpu-build:
name: periodic-dynamo-benchmarks-cpu-build
linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build:
name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
{ include: [
@ -170,6 +160,68 @@ jobs:
{ config: "cpu_inductor_freezing_avx2_torchbench", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" },
{ config: "cpu_inductor_freezing_avx2_timm", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" },
{ config: "cpu_inductor_freezing_avx2_timm", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-test:
name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
test-matrix: |
{ include: [
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build
test-matrix: |
{ include: [
{ config: "cpu_inductor_freezing_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" },
{ config: "cpu_inductor_freezing_timm", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
{ config: "cpu_inductor_freezing_timm", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
@ -195,12 +247,12 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
periodic-dynamo-benchmarks-cpu-test:
name: periodic-dynamo-benchmarks-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit
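Several of the renamed jobs above consume values such as ${{ needs.<build-job>.outputs.docker-image }}. That works because a reusable workflow can declare outputs on its workflow_call trigger, mapped from one of its jobs. The sketch below shows the likely shape of that contract with illustrative file, job, and output names; the real _linux-build.yml may differ:

# Callee, e.g. a hypothetical _example-build.yml
on:
  workflow_call:
    outputs:
      docker-image:
        description: "Image computed by the build job"
        value: ${{ jobs.build.outputs.docker-image }}
jobs:
  build:
    runs-on: ubuntu-latest
    outputs:
      docker-image: ${{ steps.calc.outputs.image }}
    steps:
      - id: calc
        run: echo "image=ci-image:example" >> "${GITHUB_OUTPUT}"

# Caller job, following the pattern used in the workflows above:
#   example-test:
#     needs: example-build
#     uses: ./.github/workflows/_example-test.yml
#     with:
#       docker-image: ${{ needs.example-build.outputs.docker-image }}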

View File

@ -3,18 +3,10 @@ name: inductor-rocm
on:
push:
branches:
#- main
- main
- release/*
tags:
- ciflow/inductor-rocm/*
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 45 0,8,16 * * 1-5
- cron: 45 4 * * 0,6
- cron: 45 4,12,20 * * 1-5
- cron: 45 12 * * 0,6
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
workflow_dispatch:
concurrency:
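The schedule comment removed in the hunk above describes a pattern still used by other workflows in this diff: when a workflow defines several cron entries, each triggered run exposes the matching cron string in github.event.schedule, so individual jobs can gate themselves on a subset of the schedules (as the '0 7 * * *' and '15 0,12 * * 1-6' checks further down do). A minimal sketch of that gating, with illustrative cron values:

on:
  schedule:
    - cron: 45 0,8,16 * * 1-5   # weekdays, three times a day
    - cron: 45 4 * * 0,6        # weekends, once a day
jobs:
  weekday-only:
    # Runs only when the weekday cron fired; skipped for the weekend cron
    if: github.event.schedule == '45 0,8,16 * * 1-5'
    runs-on: ubuntu-latest
    steps:
      - run: echo "triggered by the weekday schedule"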

View File

@ -28,8 +28,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -47,18 +47,44 @@ jobs:
]}
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
inductor-halide-build:
name: inductor-halide-build
linux-jammy-cuda12_8-py3_12-gcc9-inductor-build:
name: cuda12.8-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_12-gcc9-inductor-test:
name: cuda12.8-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_12-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cpu-py3_12-inductor-halide-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -71,18 +97,18 @@ jobs:
]}
secrets: inherit
inductor-halide-test:
name: inductor-halide-test
linux-jammy-cpu-py3_12-inductor-halide-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-test.yml
needs: inductor-halide-build
needs: linux-jammy-cpu-py3_12-inductor-halide-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.inductor-halide-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }}
secrets: inherit
inductor-triton-cpu-build:
name: inductor-triton-cpu-build
linux-jammy-cpu-py3_12-inductor-triton-cpu-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -95,23 +121,23 @@ jobs:
]}
secrets: inherit
inductor-triton-cpu-test:
linux-jammy-cpu-py3_12-inductor-triton-cpu-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu
uses: ./.github/workflows/_linux-test.yml
needs: inductor-triton-cpu-build
needs: linux-jammy-cpu-py3_12-inductor-triton-cpu-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.inductor-triton-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-triton-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.test-matrix }}
secrets: inherit
inductor-cpu-build:
name: inductor-cpu-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
@ -122,12 +148,37 @@ jobs:
]}
secrets: inherit
inductor-cpu-test:
name: inductor-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_13-gcc9-inductor-build:
name: cuda12.8-py3.13-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_13-gcc9-inductor-test:
name: cuda12.8-py3.13-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_13-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
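The test-matrix values above are passed to the reusable test workflows as JSON strings. A common way for a called workflow to expand such an input into concrete shards is fromJSON on the strategy matrix; the sketch below is an assumption about how _linux-test.yml likely consumes it, not a copy of that file:

on:
  workflow_call:
    inputs:
      test-matrix:
        required: true
        type: string
jobs:
  test:
    strategy:
      fail-fast: false
      matrix: ${{ fromJSON(inputs.test-matrix) }}
    runs-on: ${{ matrix.runner }}
    steps:
      - run: echo "running ${{ matrix.config }} shard ${{ matrix.shard }} of ${{ matrix.num_shards }}"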

View File

@ -44,8 +44,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -53,6 +53,7 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
test-matrix: |
{ include: [
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
@ -64,24 +65,25 @@ jobs:
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
inductor-cpu-build:
name: inductor-cpu-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build
test-matrix: |
{ include: [
{ config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
@ -96,12 +98,12 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-cpu-test:
name: inductor-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -42,8 +42,8 @@ jobs:
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-jammy-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
secrets: inherit
docs-push:

View File

@ -24,38 +24,38 @@ permissions:
contents: read
jobs:
opbenchmark-build:
linux-jammy-cpu-py3_9-gcc11-opbenchmark-build:
if: github.repository_owner == 'pytorch'
name: opbenchmark-build
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets: inherit
opbenchmark-on-demand-build:
linux-jammy-cpu-py3_9-gcc11-opbenchmark-on-demand-build:
if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }}
name: opbenchmark-on-demand-build
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets: inherit
opbenchmark-test:
name: opbenchmark-test
linux-jammy-cpu-py3_9-gcc11-opbenchmark-test:
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-test.yml
needs: opbenchmark-build
needs: linux-jammy-cpu-py3_9-gcc11-opbenchmark-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.test-matrix }}
secrets: inherit
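Two secret-forwarding styles appear in this diff: most reusable-workflow calls use secrets: inherit, while the binary upload jobs pass github-token explicitly. A short sketch of the difference, using hypothetical callee paths and inputs rather than the real ones:

jobs:
  call-with-inherit:
    uses: ./.github/workflows/_example-test.yml
    with:
      build-environment: example                  # illustrative input
    secrets: inherit                               # all of the caller's secrets are forwarded
  call-with-explicit:
    uses: ./.github/workflows/_example-upload.yml
    with:
      build_name: example                          # illustrative input
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}    # only this named secret is forwarded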

View File

@ -170,38 +170,6 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc11-build:
name: linux-jammy-cuda13.0-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
cuda-arch-list: 7.5
build-environment: linux-jammy-cuda13.0-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc11-test:
name: linux-jammy-cuda13.0-py3.10-gcc11
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda13_0-py3_10-gcc11-build
- target-determination
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml

View File

@ -49,14 +49,14 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
linux-jammy-py3_10-gcc11-build:
name: linux-jammy-py3.10-gcc11
linux-jammy-py3_9-gcc11-build:
name: linux-jammy-py3.9-gcc11
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
@ -73,49 +73,49 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_10-gcc11-test:
name: linux-jammy-py3.10-gcc11
linux-jammy-py3_9-gcc11-test:
name: linux-jammy-py3.9-gcc11
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_10-gcc11-build
- linux-jammy-py3_9-gcc11-build
- target-determination
with:
build-environment: linux-jammy-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-docs:
name: linux-docs
uses: ./.github/workflows/_docs.yml
needs: linux-jammy-py3_10-gcc11-build
needs: linux-jammy-py3_9-gcc11-build
with:
build-environment: linux-jammy-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }}
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }}
secrets: inherit
linux-jammy-py3_10-gcc11-no-ops:
name: linux-jammy-py3.10-gcc11-no-ops
linux-jammy-py3_9-gcc11-no-ops:
name: linux-jammy-py3.9-gcc11-no-ops
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11-no-ops
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11-no-ops
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
]}
secrets: inherit
linux-jammy-py3_10-gcc11-pch:
name: linux-jammy-py3.10-gcc11-pch
linux-jammy-py3_9-gcc11-pch:
name: linux-jammy-py3.9-gcc11-pch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11-pch
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11-pch
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
@ -132,17 +132,17 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 4, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 2, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 3, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 4, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 5, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
]}
sync-tag: asan-build
secrets: inherit
linux-jammy-py3_10-clang18-asan-test:
name: linux-jammy-py3.10-clang18-asan
uses: ./.github/workflows/_linux-test.yml
@ -183,14 +183,14 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_10-clang12-build:
name: linux-jammy-py3.10-clang12
linux-jammy-py3_9-clang12-build:
name: linux-jammy-py3.9-clang12
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12
build-environment: linux-jammy-py3.9-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
@ -207,16 +207,16 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_10-clang12-test:
name: linux-jammy-py3.10-clang12
linux-jammy-py3_9-clang12-test:
name: linux-jammy-py3.9-clang12
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_10-clang12-build
- linux-jammy-py3_9-clang12-build
- target-determination
with:
build-environment: linux-jammy-py3.10-clang12
docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.9-clang12
docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_13-clang12-build:
@ -253,14 +253,14 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-cudnn9-py3_10-clang12-build:
name: linux-jammy-cuda12.8-cudnn9-py3.10-clang12
linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build:
name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-cudnn9-py3.10-clang12
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12
build-environment: linux-jammy-cuda12.8-cudnn9-py3.9-clang12
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
@ -282,14 +282,14 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_10-gcc11-mobile-lightweight-dispatch-build:
name: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build
linux-jammy-py3_9-gcc11-mobile-lightweight-dispatch-build:
name: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
build-generates-artifacts: false
test-matrix: |
{ include: [
@ -342,40 +342,15 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
linux-jammy-xpu-2025_1-py3_9-build:
name: linux-jammy-xpu-2025.1-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-xpu-n-py3_9-build:
name: linux-jammy-xpu-n-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-n-build
sync-tag: linux-xpu-2025-1-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
build-environment: linux-jammy-xpu-2025.1-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" },

View File

@ -3,19 +3,13 @@ name: rocm
on:
push:
branches:
# - main
- main
- release/*
tags:
- ciflow/rocm/*
workflow_dispatch:
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 45 0,8,16 * * 1-5
- cron: 45 4 * * 0,6
- cron: 45 4,12,20 * * 1-5
- cron: 45 12 * * 0,6
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
- cron: 29 8 * * * # about 1:29am PDT
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}

View File

@ -78,14 +78,14 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_10-clang12-build:
name: linux-jammy-py3.10-clang12
linux-jammy-py3_9-clang12-build:
name: linux-jammy-py3.9-clang12
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12
build-environment: linux-jammy-py3.9-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
@ -93,16 +93,16 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_10-clang12-test:
name: linux-jammy-py3.10-clang12
linux-jammy-py3_9-clang12-test:
name: linux-jammy-py3.9-clang12
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_10-clang12-build
- linux-jammy-py3_9-clang12-build
- target-determination
with:
build-environment: linux-jammy-py3.10-clang12
docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.9-clang12
docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-build:

View File

@ -30,7 +30,7 @@ jobs:
name: Test check_binary.sh for Linux CUDA
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.g4dn.4xlarge.nvidia.gpu
runner: linux.4xlarge.nvidia.gpu
docker-image: python:3.11
docker-build-dir: "skip-docker-build"
script: |

View File

@ -224,12 +224,13 @@ jobs:
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
secrets: inherit
inductor-build:
name: inductor-build
# NB: Keep this in sync with inductor-perf-test-nightly.yml
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
secrets: inherit
@ -241,7 +242,7 @@ jobs:
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },

View File

@ -59,19 +59,22 @@ jobs:
# on the PR appear in chronological order (timing issues can shuffle them around)
sleep 60
fi
# Require a comment id for merge operations
if [ -z "${COMMENT_ID}" ]; then
echo "Error: merge requires COMMENT_ID to be specified"
exit 1
fi
if [ -n "${FORCE}" ]; then
python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}"
if [ -n "${COMMENT_ID}" ]; then
python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
python3 .github/scripts/trymerge.py --force "${PR_NUM}"
fi
elif [ -n "${IGNORE_CURRENT}" ]; then
python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
if [ -n "${COMMENT_ID}" ]; then
python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
python3 .github/scripts/trymerge.py --ignore-current "${PR_NUM}"
fi
elif [ -n "${COMMENT_ID}" ]; then
python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
python3 .github/scripts/trymerge.py "${PR_NUM}"
fi
- name: Comment on Canceled
if: ${{ cancelled() && steps.checkout.outcome == 'success' }}

View File

@ -23,7 +23,7 @@ jobs:
with:
repository: pytorch/pytorch
stable-branch: viable/strict
requires: '[\"pull\", \"trunk\", \"lint\", \"^linux-binary-manywheel$\", \"^linux-binary-libtorch-release$\", \"linux-aarch64\"]'
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\", \"linux-aarch64\"]'
secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }}
clickhouse-url: ${{ secrets.CLICKHOUSE_URL }}
clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }}

View File

@ -6,7 +6,8 @@ on:
- ciflow/vllm/*
workflow_dispatch:
schedule:
- cron: '0 */8 * * *' # every 8 hours at minute 0 (UTC)
# Every 12 hours starting at 00:00 UTC (00:00 and 12:00)
- cron: '0 0,12 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}

View File

@ -4,9 +4,6 @@ on:
push:
tags:
- ciflow/win-arm64/*
schedule:
# Every 4 hours starting at 00:00 UTC
- cron: '0 */4 * * *'
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

View File

@ -26,15 +26,15 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-xpu-n-1-py3_10-build:
name: linux-jammy-xpu-n-1-py3.10
linux-jammy-xpu-2025_0-py3_9-build:
name: linux-jammy-xpu-2025.0-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-n-1-build
sync-tag: linux-xpu-2025-0-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-1-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3
build-environment: linux-jammy-xpu-2025.0-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.0-py3
runner: linux.12xlarge
test-matrix: |
{ include: [
@ -47,62 +47,60 @@ jobs:
]}
secrets: inherit
linux-jammy-xpu-n-py3_10-build:
name: linux-jammy-xpu-n-py3.10
linux-jammy-xpu-2025_1-py3_9-build:
name: linux-jammy-xpu-2025.1-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-n-build
sync-tag: linux-xpu-2025-1-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
build-environment: linux-jammy-xpu-2025.1-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3
runner: linux.12xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
]}
secrets: inherit
linux-jammy-xpu-n-py3_10-test:
name: linux-jammy-xpu-n-py3.10
linux-jammy-xpu-2025_1-py3_9-test:
name: linux-jammy-xpu-2025.1-py3.9
uses: ./.github/workflows/_xpu-test.yml
needs: linux-jammy-xpu-n-py3_10-build
needs: linux-jammy-xpu-2025_1-py3_9-build
permissions:
id-token: write
contents: read
with:
build-environment: linux-jammy-xpu-n-py3.10
docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }}
build-environment: linux-jammy-xpu-2025.1-py3.9
docker-image: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.test-matrix }}
secrets: inherit
windows-xpu-n-1-build:
windows-xpu-2025_0-build:
if: github.repository_owner == 'pytorch'
name: win-vs2022-xpu-n-1-py3
name: win-vs2022-xpu-2025_0-py3
uses: ./.github/workflows/_win-build.yml
with:
build-environment: win-vs2022-xpu-n-1-py3
build-environment: win-vs2022-xpu-py3
cuda-version: cpu
use-xpu: true
xpu-version: '2025.0'
vc-year: '2022'
secrets: inherit
windows-xpu-2025_1-build:
if: github.repository_owner == 'pytorch'
name: win-vs2022-xpu-2025_1-py3
uses: ./.github/workflows/_win-build.yml
with:
build-environment: win-vs2022-xpu-py3
cuda-version: cpu
use-xpu: true
xpu-version: '2025.1'
vc-year: '2022'
secrets: inherit
windows-xpu-n-build:
if: github.repository_owner == 'pytorch'
name: win-vs2022-xpu-n-py3
uses: ./.github/workflows/_win-build.yml
with:
build-environment: win-vs2022-xpu-n-py3
cuda-version: cpu
use-xpu: true
xpu-version: '2025.2'
vc-year: '2022'
secrets: inherit

View File

@ -583,7 +583,7 @@ exclude_patterns = [
command = [
'python3',
'tools/linter/adapters/grep_linter.py',
'--pattern=#include <pybind11\/(^|[^(gil_simple\.h)])',
'--pattern=#include <pybind11\/(^|[^(gil\.h)])',
'--allowlist-pattern=#include <torch\/csrc\/utils\/pybind.h>',
'--linter-name=PYBIND11_INCLUDE',
'--match-first-only',

View File

@ -22,6 +22,7 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-DUSE_DISTRIBUTED",
"-DAT_PER_OPERATOR_HEADERS",
"-DATEN_THREADING=NATIVE",
"-DNO_CUDNN_DESTROY_HANDLE",
@ -746,7 +747,6 @@ cc_library(
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp",
"torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp",
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
],
)) + torch_sources,

View File

@ -181,9 +181,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
set(CPU_POWER ON)
endif()
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
if(NOT LINUX AND NOT WIN32)
set(USE_DISTRIBUTED
OFF
@ -262,18 +261,18 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL"
OFF)
cmake_dependent_option(USE_NVSHMEM "Use NVSHMEM" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
"USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
option(USE_NNAPI "Use NNAPI" OFF)
option(USE_NNPACK "Use NNPACK" ON)
cmake_dependent_option(USE_NUMA "Use NUMA. Only available on Linux." ON "LINUX"
@ -431,10 +430,11 @@ if(WIN32)
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
if(NOT libuv_tmp_LIBRARY)
set(USE_DISTRIBUTED OFF)
set(USE_GLOO OFF)
message(
WARNING
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
)
else()

View File

@ -216,7 +216,7 @@ file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention
if(USE_CUDA AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
add_library(flash_attention OBJECT EXCLUDE_FROM_ALL ${flash_attention_cuda_kernels_cu} ${flash_attention_cuda_cpp})
target_include_directories(flash_attention SYSTEM PUBLIC
target_include_directories(flash_attention PUBLIC
${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc
${PROJECT_SOURCE_DIR}/third_party/flash-attention/include
${PROJECT_SOURCE_DIR}/third_party/cutlass/include

View File

@ -1,18 +1,5 @@
#pragma once
// See https://github.com/pytorch/pytorch/issues/161660
// This compile flag is intended to be passed in to CppExtensions that rely on
// the stable ABI via the `extra_compile_args` argument. This is a stopgap
// solution to ensure that non-stable libtorch APIs are not used in the extension.
// The long term solution is to have a torch_stable target that excludes headers
// that are not in torch/stable or torch/headeronly.
// See test/cpp_extensions/torch_stable_test_extension/setup.py for an example
// of how this is used.
#ifdef TORCH_STABLE_ONLY
#error \
"TensorBase.h should not be included when TORCH_STABLE_ONLY compile flag is passed"
#endif
#include <c10/core/Device.h>
#include <c10/core/Layout.h>
#include <c10/core/MemoryFormat.h>

View File

@ -15,7 +15,7 @@ std::enable_if_t<
std::is_base_of_v<Base, Child>,
std::unique_ptr<Base>>
make_unique_base(Args&&... args) {
return std::make_unique<Child>(std::forward<Args>(args)...);
return std::unique_ptr<Base>(new Child(std::forward<Args>(args)...));
}
} // namespace detail
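
Both forms shown in the hunk hand ownership back as std::unique_ptr<Base>: a std::unique_ptr<Child> produced by make_unique converts implicitly to the base pointer type, which is all make_unique_base relies on. A minimal standalone sketch of that, with illustrative names (not the actual PyTorch types):

#include <iostream>
#include <memory>
#include <type_traits>
#include <utility>

struct Base {
  virtual ~Base() = default;                        // virtual dtor: deleting via Base* is safe
  virtual const char* name() const { return "Base"; }
};

struct Child : Base {
  explicit Child(int v) : value(v) {}
  const char* name() const override { return "Child"; }
  int value;
};

// Same shape as the helper in the hunk: construct a Child, hand back ownership as Base.
template <class B, class C, class... Args,
          std::enable_if_t<std::is_base_of_v<B, C>, int> = 0>
std::unique_ptr<B> make_unique_base(Args&&... args) {
  // unique_ptr<Child> converts implicitly to unique_ptr<Base> (converting move ctor),
  // so make_unique<Child> and unique_ptr<Base>(new Child(...)) are interchangeable here.
  return std::make_unique<C>(std::forward<Args>(args)...);
}

int main() {
  std::unique_ptr<Base> p = make_unique_base<Base, Child>(42);
  std::cout << p->name() << "\n";                   // prints "Child"
}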

View File

@ -996,6 +996,9 @@ void bgemm<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
template <>
void bgemm<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) {
#ifdef USE_ROCM
TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported for ROCm");
#endif
// TODO: Support tuning for Half inputs and FP32 output
bgemm_internal<at::Half, float>(CUDABLAS_BGEMM_ARGS(at::Half));
}
@ -1003,7 +1006,9 @@ void bgemm<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)
template <>
void bgemm<at::BFloat16, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) {
#ifndef USE_ROCM
#ifdef USE_ROCM
TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is not supported for ROCm");
#else
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
if (prop->major < 8)
@ -1508,6 +1513,9 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
template <>
void gemm<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) {
#ifdef USE_ROCM
TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm");
#endif
// TODO: Support Tuning for fp16-fp32 gemm
gemm_internal<at::Half, float>(CUDABLAS_GEMM_ARGS(at::Half));
}
@ -1515,7 +1523,9 @@ void gemm<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float))
template <>
void gemm<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) {
#ifndef USE_ROCM
#ifdef USE_ROCM
TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm");
#else
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
if (prop->major < 8)
@ -2573,6 +2583,8 @@ void vdot<c10::complex<double>>(CUDABLAS_DOT_ARGTYPES(c10::complex<double>)) {
reinterpret_cast<cuDoubleComplex*>(result)));
}
// HIP on Windows does not support
#if !(defined(USE_ROCM) && defined(_MSC_VER))
template <>
void getrsBatched<float>(CUDABLAS_GETRS_ARGTYPES(float)) {
TORCH_CUDABLAS_CHECK(cublasSgetrsBatched(
@ -2771,5 +2783,6 @@ void gelsBatched<c10::complex<float>>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::comple
devInfoArray,
batchSize));
}
#endif // !(defined(USE_ROCM) && defined(_MSC_VER))
} // namespace at::cuda::blas

View File

@ -343,6 +343,9 @@ void vdot<c10::complex<double>>(CUDABLAS_DOT_ARGTYPES(c10::complex<double>));
int m, int n, int nrhs, Dtype** dA_array, int ldda, \
Dtype** dC_array, int lddc, int* info, int *devInfoArray, int batchSize
// HIP on Windows does not support getrs, geqrf, getrf, gels
#if !(defined(USE_ROCM) && defined(_MSC_VER))
template<class Dtype>
void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) {
static_assert(false&&sizeof(Dtype),"at::cuda::blas::getrsBatched: not implemented");
@ -397,4 +400,28 @@ TORCH_CUDA_CU_API void gelsBatched<c10::complex<double>>(CUDABLAS_GELS_BATCHED_A
template<>
TORCH_CUDA_CU_API void gelsBatched<c10::complex<float>>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::complex<float>));
#else // !(defined(USE_ROCM) && defined(_MSC_VER))
template<class Dtype>
void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) {
TORCH_CHECK(false, "at::cuda::blas::getrsBatched: not supported for HIP on Windows");
}
template <class Dtype>
void geqrfBatched(CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype)) {
TORCH_CHECK(false, "at::cuda::blas::geqrfBatched: not supported for HIP on Windows");
}
template<class Dtype>
void getrfBatched(CUDABLAS_GETRF_ARGTYPES(Dtype)) {
TORCH_CHECK(false, "at::cuda::blas::getrfBatched: not supported for HIP on Windows");
}
template <class Dtype>
void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype)) {
TORCH_CHECK(false, "at::cuda::blas::gelsBatched: not supported for HIP on Windows");
}
#endif // !(defined(USE_ROCM) && defined(_MSC_VER))
} // namespace at::cuda::blas
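
The stubs above keep one template API across platforms and turn an unsupported configuration into a clear runtime error rather than a link failure. A generic standalone sketch of that pattern (the macro name and helper below are illustrative stand-ins, not the actual TORCH_CHECK machinery):

#include <stdexcept>
#include <string>

// Keep the same template API everywhere, but fail loudly at call time where unsupported.
[[noreturn]] inline void not_supported(const std::string& fn) {
  throw std::runtime_error(fn + ": not supported for HIP on Windows");
}

// #define HIP_ON_WINDOWS 1   // the guard condition from the hunk, spelled generically

template <class Dtype>
void getrsBatchedLike(Dtype** A, Dtype** B, int n) {
#if defined(HIP_ON_WINDOWS)
  not_supported("getrsBatchedLike");
#else
  (void)A; (void)B; (void)n;  // real batched-solve call would go here
#endif
}

int main() {
  float* dummy[1] = {nullptr};
  getrsBatchedLike<float>(dummy, dummy, 0);  // compiles (and is a no-op) when supported
  return 0;
}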

View File

@ -220,17 +220,19 @@ TuningResultsValidator::TuningResultsValidator() {
[]() { return GetPyTorchVersion(); },
[this](auto&& k) { return ValidatePyTorchVersion(std::forward<decltype(k)>(k)); });
#ifdef USE_ROCM
// hip
// rocm
{
// HIP version is more accurate than ROCm version. User's environment could be a stock
// ROCm install but with a mix of newer components, making ROCm version meaningless.
std::string hip_version = c10::str(TORCH_HIP_VERSION);
#ifdef _WIN32
std::string rocm_version = HIP_VERSION_BUILD_NAME;
#else
std::string rocm_version = ROCM_BUILD_INFO;
#endif
RegisterValidator(
"HIP_VERSION",
[hip_version]() { return hip_version; },
[hip_version](auto&& k) {
TUNABLE_LOG1("HIP_VERSION validation: expect ", k, " to match ", hip_version);
return hip_version == k ? OK : FAIL;
"ROCM_VERSION",
[rocm_version]() { return rocm_version; },
[rocm_version](auto&& k) {
TUNABLE_LOG1("ROCM_VERSION validation: expect ", k, " to match ", rocm_version);
return rocm_version == k ? OK : FAIL;
});
}
// gfx arch
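
The registration above pairs a getter (reports the current value, e.g. the ROCm build string) with a check (compares a previously stored value against it and returns OK or FAIL). A simplified standalone sketch of that registry shape, using illustrative names rather than the actual TunableOp classes:

#include <functional>
#include <iostream>
#include <map>
#include <string>

enum class Status { OK, FAIL };

class Validator {
 public:
  void Register(const std::string& key,
                std::function<std::string()> getter,
                std::function<Status(const std::string&)> check) {
    getters_[key] = std::move(getter);
    checks_[key] = std::move(check);
  }
  Status Validate(const std::string& key, const std::string& stored) const {
    auto it = checks_.find(key);
    return it == checks_.end() ? Status::FAIL : it->second(stored);
  }
  std::string Current(const std::string& key) const {
    auto it = getters_.find(key);
    return it == getters_.end() ? "" : it->second();
  }

 private:
  std::map<std::string, std::function<std::string()>> getters_;
  std::map<std::string, std::function<Status(const std::string&)>> checks_;
};

int main() {
  Validator v;
  const std::string rocm_version = "6.4.1";  // placeholder value, not a real build string
  v.Register("ROCM_VERSION",
             [rocm_version] { return rocm_version; },
             [rocm_version](const std::string& k) {
               return k == rocm_version ? Status::OK : Status::FAIL;
             });
  std::cout << (v.Validate("ROCM_VERSION", "6.4.1") == Status::OK) << "\n";  // 1
}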

View File

@ -38,7 +38,6 @@ inline int dataSize(cudnnDataType_t dataType)
}
}
// NOTE [ cudnn fixSizeOneDimStride ]
// The stride for a size-1 dimensions is not uniquely determined; in
// fact, it can be anything you want, because the fact that the
// tensor is size 1 at this dimension means that you will never actually

View File

@ -19,37 +19,31 @@ inline miopenDataType_t getDataType(const at::Tensor& t) {
} else {
TORCH_CHECK(
false,
"TensorDescriptor does not support ", scalar_type);
"TensorDescriptor only supports float, half and bfloat16 tensors");
}
}
} // anonymous namespace
constexpr size_t MIOPEN_DIM_MAX = 5;
void TensorDescriptor::set(const at::Tensor &t, at::MemoryFormat memory_format, size_t pad) {
set(getDataType(t), t.sizes(), t.strides(), pad,
memory_format == at::MemoryFormat::ChannelsLast ||
memory_format == at::MemoryFormat::ChannelsLast3d);
}
void TensorDescriptor::set(const at::Tensor &t, size_t pad) {
auto memory_format = t.suggest_memory_format();
set(getDataType(t), t.sizes(), t.strides(), pad,
memory_format == at::MemoryFormat::ChannelsLast ||
memory_format == at::MemoryFormat::ChannelsLast3d);
set(getDataType(t), t.sizes(), t.strides(), pad);
}
constexpr size_t MIOPEN_DIM_MAX = 5;
void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntArrayRef t_strides, size_t pad) {
set(datatype, t_sizes, t_strides, pad,
is_channels_last_strides_2d(t_sizes, t_strides) ||
is_channels_last_strides_3d(t_sizes, t_strides));
}
void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntArrayRef t_strides, size_t pad, bool nhwc) {
size_t dim = t_sizes.size();
if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX)
TORCH_CHECK(false, "MIOpen supports only up to ", MIOPEN_DIM_MAX, " dimensions");
#define _STR(X) #X
#define STR(X) _STR(X)
TORCH_CHECK(
false,
"MIOpen supports only up to ",
STR(MIOPEN_DIM_MAX),
" dimensions");
#undef _STR
#undef STR
int size[MIOPEN_DIM_MAX];
int stride[MIOPEN_DIM_MAX];
for (const auto i : c10::irange(dim)) {
@ -60,7 +54,7 @@ void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntAr
size[i] = 1;
stride[i] = 1;
}
set(datatype, static_cast<int>(std::max(dim, pad)), size, stride, nhwc);
set(datatype, static_cast<int>(std::max(dim, pad)), size, stride);
}
std::string miopenTypeToString(miopenDataType_t dtype) {
@ -80,11 +74,10 @@ std::string miopenTypeToString(miopenDataType_t dtype) {
std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) {
out << "TensorDescriptor " << static_cast<void*>(d.desc()) << "\n";
int nbDims = 0;
int nbDims = 4;
int dimA[MIOPEN_DIM_MAX];
int strideA[MIOPEN_DIM_MAX];
miopenDataType_t dtype;
miopenGetTensorDescriptorSize(d.desc(), &nbDims);
miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA);
out << " type = " << miopenTypeToString(dtype) << "\n";
out << " nbDims = " << nbDims << "\n";
@ -106,17 +99,19 @@ void TensorDescriptor::print() { std::cout << *this; }
void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad) {
auto dim = t.ndimension();
if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX)
TORCH_CHECK(false, "MIOpen supports only up to ", MIOPEN_DIM_MAX, " dimensions");
// NB: It is possible for this test to be insufficient, because the
// Tensor passed in to set the filter descriptor may not be the actual
// Tensor whose data pointer is passed to cuDNN. Nevertheless,
// that is the common case, so we can catch most client errors with this test.
if (dim > static_cast<int64_t>(MIOPEN_DIM_MAX) || pad > static_cast<int64_t>(MIOPEN_DIM_MAX)) {
#define _STR(X) #X
#define STR(X) _STR(X)
TORCH_CHECK(
false,
"MIOpen supports only up to ",
STR(MIOPEN_DIM_MAX),
" dimensions");
#undef _STR
#undef STR
}
TORCH_CHECK(t.is_contiguous(memory_format),
"MIOpen filters (a.k.a. weights) must be contiguous in desired memory_format\n",
"Weight sizes: ", t.sizes(), "\n",
"Weight strides: ", t.strides(), "\n",
"cuDNN suggested memory_format: ", memory_format);
"MIOpen filters (a.k.a. weights) must be contiguous");
int size[MIOPEN_DIM_MAX];
int stride[MIOPEN_DIM_MAX];
@ -136,9 +131,7 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo
}
dim = std::max<int64_t>(dim, pad);
set(getDataType(t), static_cast<int>(dim), size, stride,
memory_format == at::MemoryFormat::ChannelsLast ||
memory_format == at::MemoryFormat::ChannelsLast3d);
set(getDataType(t), (int) dim, size, stride);
}
}}
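
The _STR/STR pair above is the standard two-level stringification idiom: the outer macro forces its argument to be macro-expanded before '#' turns it into a string literal (for a non-macro constant such as a constexpr variable, it simply yields the identifier name). A standalone illustration with a made-up macro constant:

#include <iostream>

#define _STR(X) #X
#define STR(X) _STR(X)

#define DIM_MAX 5   // illustrative macro constant

int main() {
  std::cout << _STR(DIM_MAX) << "\n";  // prints "DIM_MAX" (stringified before expansion)
  std::cout << STR(DIM_MAX)  << "\n";  // prints "5" (expanded, then stringified)
}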

View File

@ -9,8 +9,6 @@
namespace at { namespace native {
std::string miopenTypeToString(miopenDataType_t dtype);
inline int dataSize(miopenDataType_t dataType)
{
switch (dataType) {
@ -21,32 +19,6 @@ inline int dataSize(miopenDataType_t dataType)
}
}
// See NOTE [ cudnn fixSizeOneDimStride ] in aten/src/ATen/cudnn/Descriptors.h
template <typename T>
static inline void fixSizeOneDimStride(int dim, const T *size, T *stride, bool nhwc) {
int64_t z = 1;
int index = 0;
std::vector<int> permutation(dim);
if (nhwc) {
permutation[index++] = 1;
}
for (int d = dim-1; d > 1; d--) {
permutation[index++] = d;
}
if (!nhwc) {
permutation[index++] = 1;
}
permutation[index++] = 0;
for (int d : permutation) {
if (size[d] == 1) {
stride[d] = z;
} else {
z *= size[d];
}
}
}
template <typename T, miopenStatus_t (*dtor)(T*)>
struct DescriptorDeleter {
void operator()(T* x) {
@ -103,20 +75,14 @@ class TORCH_HIP_CPP_API TensorDescriptor : public Descriptor<
set(t, pad);
}
// See Note [CuDNN broadcast padding]
void set(const at::Tensor &t, size_t pad = 0);
void set(const at::Tensor &t, at::MemoryFormat memory_format, size_t pad = 0);
void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad = 0);
void print();
private:
void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad, bool nhwc);
void set(miopenDataType_t dataType, int dim, int* size, int* stride, bool nhwc) {
std::vector<int> strides_copy(stride, stride + dim);
fixSizeOneDimStride<int>(dim, size, strides_copy.data(), nhwc);
MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, strides_copy.data()));
void set(miopenDataType_t dataType, int dim, int* size, int* stride) {
MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride));
}
};
@ -134,10 +100,8 @@ class TORCH_HIP_CPP_API FilterDescriptor : public Descriptor<
void set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad = 0);
private:
void set(miopenDataType_t dataType, int dim, int* size, int* stride, bool nhwc) {
std::vector<int> strides_copy(stride, stride + dim);
fixSizeOneDimStride<int>(dim, size, strides_copy.data(), nhwc);
MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, strides_copy.data()));
void set(miopenDataType_t dataType, int dim, int* size, int* stride) {
MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride));
}
};
@ -202,4 +166,4 @@ union Constant
}
};
}} // namespace
}} // namespace
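
The removed fixSizeOneDimStride helper relates to the cuDNN note referenced above: a dimension of size 1 is only ever indexed at 0, so its stride never contributes to an element offset and can be rewritten to whatever the backend expects. A standalone check of that fact:

#include <array>
#include <cstdint>
#include <iostream>

// Offset of an element given strides; a size-1 dim's stride never affects the result.
int64_t offset(const std::array<int64_t, 3>& idx,
               const std::array<int64_t, 3>& stride) {
  return idx[0] * stride[0] + idx[1] * stride[1] + idx[2] * stride[2];
}

int main() {
  // Shape (2, 1, 3): the middle dimension has size 1, so its index is always 0.
  std::array<int64_t, 3> strideA{3, 3, 1};    // "contiguous" choice of stride for dim 1
  std::array<int64_t, 3> strideB{3, 999, 1};  // arbitrary stride for the size-1 dim
  for (int64_t i = 0; i < 2; ++i)
    for (int64_t k = 0; k < 3; ++k) {
      std::array<int64_t, 3> idx{i, 0, k};
      std::cout << (offset(idx, strideA) == offset(idx, strideB)) << " ";  // all 1s
    }
  std::cout << "\n";
}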

View File

@ -457,9 +457,24 @@ void gemm(
return;
}
#endif
// for the fallback path, first compute gemm with beta = 0,
// and then add c in full precision.
int64_t c_size = n * m;
std::vector<float> float_c(c_size, 0.f);
gemm_no_downcast_stub(
at::kCPU, at::kBFloat16,
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m);
for (const auto j : c10::irange(n)) {
for (const auto i : c10::irange(m)) {
auto offset = j * ldc + i;
// beta == 0 won't propagate NaN from C
if (beta == 0.f) {
c[offset] = float_c[j * m + i];
} else {
c[offset] = beta * c[offset] + float_c[j * m + i];
}
}
}
}
void gemm(
@ -478,9 +493,24 @@ void gemm(
return;
}
#endif
gemm_no_downcast_stub(
// for the fallback path, first compute gemm with beta = 0,
// and then add c in full precision.
int64_t c_size = n * m;
std::vector<at::Half> float16_c(c_size, 0.f);
gemm_stub(
at::kCPU, at::kHalf,
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float16_c.data(), m);
for (const auto j : c10::irange(n)) {
for (const auto i : c10::irange(m)) {
auto offset = j * ldc + i;
// beta == 0 won't propagate NaN from C
if (beta == 0.f) {
c[offset] = c10::convert<float>(float16_c[j * m + i]);
} else {
c[offset] = beta * c[offset] + c10::convert<float>(float16_c[j * m + i]);
}
}
}
}
void gemm(

View File

@ -353,21 +353,19 @@ TORCH_API void _cudnn_set_conv_benchmark_empty_cache(bool enable);
TORCH_API bool _cudnn_get_conv_benchmark_empty_cache();
inline at::MemoryFormat miopen_conv_suggest_memory_format(const at::Tensor& input, const at::Tensor& weight) {
inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
// disable NHWC for float64 input.
if (!at::detail::getCUDAHooks().compiledWithMIOpen() ||
input.scalar_type() == at::kDouble ||
weight.scalar_type() == at::kDouble) {
return at::MemoryFormat::Contiguous;
return false;
}
// TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
// See https://github.com/pytorch/pytorch/issues/64427.
// non static variable is used to be able to change environment variable in runtime for testing
// enabled by default for ROCm >= 7.0.0 with miopen 3.5
int miopen_version = detail::getCUDAHooks().compiledWithMIOpen() ? detail::getCUDAHooks().versionMIOpen() : 0;
bool is_miopen_3_5 = miopen_version >= 30500; // ROCm 7.0
bool suggest_nhwc = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC").value_or(is_miopen_3_5);
// See #64427
static std::optional<bool> PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC");
static bool suggest_nhwc = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC;
auto input_memory_format = input.suggest_memory_format();
auto weight_memory_format = weight.suggest_memory_format();
@ -377,24 +375,13 @@ inline at::MemoryFormat miopen_conv_suggest_memory_format(const at::Tensor& inpu
(input_memory_format == at::MemoryFormat::ChannelsLast) ||
(weight_memory_format == at::MemoryFormat::ChannelsLast)
);
if (can_use_miopen_channels_last_2d) {
return at::MemoryFormat::ChannelsLast;
}
bool can_use_miopen_channels_last_3d = suggest_nhwc && (weight_ndim == 5) && (
(input_memory_format == at::MemoryFormat::ChannelsLast3d) ||
(weight_memory_format == at::MemoryFormat::ChannelsLast3d)
);
if (can_use_miopen_channels_last_3d) {
return at::MemoryFormat::ChannelsLast3d;
}
return at::MemoryFormat::Contiguous;
}
// deprecated, but to remove would be BC-breaking
inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
return miopen_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous;
return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d;
}
inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {

View File

@ -459,9 +459,6 @@ struct ConvParams {
// Use cudnn for FP16 depthwise convolutions
bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const {
if (!detail::getCUDAHooks().compiledWithCuDNN()) {
return false;
}
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) {
// always use cudnn_depthwise for channels_last format
return true;
@ -1422,8 +1419,10 @@ static inline at::MemoryFormat determine_backend_memory_format(
case ConvBackend::Miopen:
case ConvBackend::MiopenDepthwise:
case ConvBackend::MiopenTranspose:
if (detail::getCUDAHooks().compiledWithMIOpen()) {
backend_memory_format = miopen_conv_suggest_memory_format(input, weight);
if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) {
TORCH_INTERNAL_ASSERT((k == 4 || k == 5),
"Expected 4D or 5D input for miopen memory format selection in determine_backend_memory_format()");
backend_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
}
break;
case ConvBackend::Mkldnn:
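
The MIOpen branch above asserts a 4-D or 5-D weight and maps that to channels-last 2-D or channels-last 3-D respectively. A simplified standalone sketch of that selection (the enum below is a stand-in, not at::MemoryFormat itself):

#include <cassert>
#include <iostream>

enum class MemoryFormat { Contiguous, ChannelsLast, ChannelsLast3d };

MemoryFormat pick_channels_last(int weight_ndim, bool channels_last_ok) {
  if (!channels_last_ok) return MemoryFormat::Contiguous;
  assert(weight_ndim == 4 || weight_ndim == 5);  // mirrors the internal assert in the hunk
  return weight_ndim == 5 ? MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast;
}

int main() {
  std::cout << (pick_channels_last(4, true) == MemoryFormat::ChannelsLast) << "\n";    // 1
  std::cout << (pick_channels_last(5, true) == MemoryFormat::ChannelsLast3d) << "\n";  // 1
}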

View File

@ -185,17 +185,6 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
// right: "lro, summed, ro" permuted with rpermutation and the three flattened
// then the permuted output is a view of bmm(left, right)
// finally, opermutation reverts the permutation to the original order of dimensions
// By default the output is "lro, lo, 1-for-summed-dims, ro" with original shape dimensions.
// However, if all dimensions from the right operand appear before those from the left
// operand in the final output, we can swap the operands so that bmm directly produces
// the result in the correct memory order.
bool swap_lo_ro = !lo.empty() && !ro.empty() && ro.back() < lo.front();
if (swap_lo_ro) {
std::swap(left, right);
std::swap(lo, ro);
std::swap(lo_size, ro_size);
}
auto out_num_dim = lro.size() + lo.size() + sum_dims_.size() + ro.size();
std::vector<SymInt> out_size;
out_size.reserve(out_num_dim);
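
The swap test above fires when every output position owned by the right operand precedes every position owned by the left operand (ro.back() < lo.front()), so the batched matmul can emit the result directly in the requested order without a final permute. A standalone check of the condition on a small example:

#include <iostream>
#include <vector>

// lo/ro hold the output positions of dims that appear only in the left / only in the
// right operand; swap operands when all right-only dims come first in the output.
bool should_swap(const std::vector<int>& lo, const std::vector<int>& ro) {
  return !lo.empty() && !ro.empty() && ro.back() < lo.front();
}

int main() {
  // e.g. an einsum like "bij,bjk->bki": the right-only dim k (output position 1)
  // precedes the left-only dim i (output position 2), so swapping pays off.
  std::vector<int> lo{2};
  std::vector<int> ro{1};
  std::cout << should_swap(lo, ro) << "\n";  // 1
}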

Some files were not shown because too many files have changed in this diff.