Compare commits

..

3 Commits

SHA1        Message                    Date
a0801ef6be  add workflow to dispatch   2025-08-26 18:20:49 -07:00
735b375db4  echo variables             2025-08-26 17:10:56 -07:00
            ghstack-source-id: 3c8f54e83cad9760fb06b39366bea2f31a39342f
            Pull-Request: https://github.com/pytorch/pytorch/pull/161565
011155aea3  echo variables             2025-08-26 17:10:55 -07:00
            ghstack-source-id: bd39100f9f9c99a5c45b85a48020375ac5f95da6
            Pull-Request: https://github.com/pytorch/pytorch/pull/161537
515 changed files with 5327 additions and 19687 deletions

View File

@ -7,15 +7,6 @@ if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
fi
if [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0"
fi
# Compress the fatbin with -compress-mode=size for CUDA 13
if [[ "$DESIRED_CUDA" == *"13"* ]]; then
export TORCH_NVCC_FLAGS="-compress-mode=size"
fi
SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
source $SCRIPTPATH/aarch64_ci_setup.sh

View File

@ -77,23 +77,21 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
wheelname = os.path.basename(wheel_path)
os.mkdir(f"{folder}/tmp")
os.system(f"unzip {wheel_path} -d {folder}/tmp")
# Common libraries for all CUDA versions
common_libs = [
# Non-NVIDIA system libraries
"/lib64/libgomp.so.1",
"/usr/lib64/libgfortran.so.5",
"/acl/build/libarm_compute.so",
"/acl/build/libarm_compute_graph.so",
# Common CUDA libraries (same for all versions)
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_lapack_core.so.0",
"/usr/local/lib/libnvpl_blas_core.so.0",
libs_to_copy = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
"/usr/local/cuda/lib64/libcudnn.so.9",
"/usr/local/cuda/lib64/libcublas.so.12",
"/usr/local/cuda/lib64/libcublasLt.so.12",
"/usr/local/cuda/lib64/libcudart.so.12",
"/usr/local/cuda/lib64/libcufft.so.11",
"/usr/local/cuda/lib64/libcusparse.so.12",
"/usr/local/cuda/lib64/libcusparseLt.so.0",
"/usr/local/cuda/lib64/libcusolver.so.11",
"/usr/local/cuda/lib64/libcurand.so.10",
"/usr/local/cuda/lib64/libnccl.so.2",
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
@ -102,41 +100,22 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
"/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
"/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
"/usr/local/cuda/lib64/libcusparse.so.12",
"/lib64/libgomp.so.1",
"/usr/lib64/libgfortran.so.5",
"/acl/build/libarm_compute.so",
"/acl/build/libarm_compute_graph.so",
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_lapack_core.so.0",
"/usr/local/lib/libnvpl_blas_core.so.0",
]
# CUDA version-specific libraries
if "130" in desired_cuda:
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
"/usr/local/cuda/lib64/libcublas.so.13",
"/usr/local/cuda/lib64/libcublasLt.so.13",
"/usr/local/cuda/lib64/libcudart.so.13",
"/usr/local/cuda/lib64/libcufft.so.12",
"/usr/local/cuda/lib64/libcusolver.so.12",
"/usr/local/cuda/lib64/libnvJitLink.so.13",
"/usr/local/cuda/lib64/libnvrtc.so.13",
"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0",
if "129" in desired_cuda:
libs_to_copy += [
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9",
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
]
elif "12" in desired_cuda:
# Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
minor_version = desired_cuda[-1]
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
"/usr/local/cuda/lib64/libcublas.so.12",
"/usr/local/cuda/lib64/libcublasLt.so.12",
"/usr/local/cuda/lib64/libcudart.so.12",
"/usr/local/cuda/lib64/libcufft.so.11",
"/usr/local/cuda/lib64/libcusolver.so.11",
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
]
# Combine all libraries
libs_to_copy = common_libs + version_specific_libs
# Copy libraries to unzipped_folder/a/lib
for lib_path in libs_to_copy:

View File

@ -81,8 +81,8 @@ elif [[ "$image" == *riscv* ]]; then
DOCKERFILE="ubuntu-cross-riscv/Dockerfile"
fi
_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152
_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
if [[ "$image" == *rocm* ]]; then
_UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
_UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
@ -114,19 +114,31 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
CUDA_VERSION=13.0.0
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
KATEX=yes
@ -161,8 +173,8 @@ case "$tag" in
VISION=yes
ONNX=yes
;;
pytorch-linux-jammy-py3.10-clang12)
ANACONDA_PYTHON_VERSION=3.10
pytorch-linux-jammy-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=12
VISION=yes
TRITON=yes
@ -197,24 +209,23 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
;;
pytorch-linux-jammy-xpu-n-1-py3)
ANACONDA_PYTHON_VERSION=3.10
pytorch-linux-jammy-xpu-2025.0-py3)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.0
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-xpu-2025.1-py3)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.1
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-xpu-n-py3)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.2
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
# TODO (huydhn): Upgrade this to Python >= 3.10
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
@ -223,8 +234,8 @@ case "$tag" in
DOCS=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12)
ANACONDA_PYTHON_VERSION=3.10
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
CLANG_VERSION=12
VISION=yes
@ -235,8 +246,8 @@ case "$tag" in
CLANG_VERSION=18
VISION=yes
;;
pytorch-linux-jammy-py3.10-gcc11)
ANACONDA_PYTHON_VERSION=3.10
pytorch-linux-jammy-py3.9-gcc11)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
KATEX=yes

View File

@ -1 +1 @@
74a23feff57432129df84d8099e622773cf77925
e03a63be43e33596f7f0a43b0f530353785e4a59

View File

@ -1 +1 @@
d0e80f39c562c70986fc548fa6e5852ad86e16e7
a6572fb0be5b9b0a19b0641a0ce05810fa04e44c

View File

@ -57,7 +57,7 @@ if [ ! -f setup.py ]; then
cd python
fi
pip_install pybind11==3.0.1
pip_install pybind11==2.13.6
# TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527
as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py

View File

@ -44,12 +44,8 @@ function install_ucc() {
./autogen.sh
if [[ -n "$CUDA_VERSION" && $CUDA_VERSION == 13* ]]; then
NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86"
else
# We only run distributed tests on Tesla M60 and A10G
NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
fi
# We only run distributed tests on Tesla M60 and A10G
NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
if [[ -n "$ROCM_VERSION" ]]; then
if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then

View File

@ -146,11 +146,11 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
XPU_DRIVER_VERSION="/lts/2350"
fi
# Default use Intel® oneAPI Deep Learning Essentials 2025.1
if [[ "$XPU_VERSION" == "2025.2" ]]; then
XPU_PACKAGES="intel-deep-learning-essentials-2025.2"
else
# Default use Intel® oneAPI Deep Learning Essentials 2025.0
if [[ "$XPU_VERSION" == "2025.1" ]]; then
XPU_PACKAGES="intel-deep-learning-essentials-2025.1"
else
XPU_PACKAGES="intel-deep-learning-essentials-2025.0"
fi
# The installation depends on the base OS

View File

@ -175,6 +175,6 @@ ENV XPU_DRIVER_TYPE ROLLING
RUN python3 -m pip install --upgrade pip && \
python3 -mpip install cmake==3.28.4
ADD ./common/install_xpu.sh install_xpu.sh
ENV XPU_VERSION 2025.2
ENV XPU_VERSION 2025.1
RUN bash ./install_xpu.sh && rm install_xpu.sh
RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd

View File

@ -379,7 +379,7 @@ dataclasses_json==0.6.7
cmake==4.0.0
#Description: required for building
tlparse==0.4.0
tlparse==0.3.30
#Description: required for log parsing
cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"

View File

@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably

View File

@ -66,7 +66,6 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"
# (optional) Install UCC
ARG UCX_COMMIT
ARG UCC_COMMIT
ARG CUDA_VERSION
ENV UCX_COMMIT $UCX_COMMIT
ENV UCC_COMMIT $UCC_COMMIT
ENV UCX_HOME /usr

View File

@ -1,56 +1,14 @@
from __future__ import annotations
import logging
import os
import textwrap
from pathlib import Path
from typing import TYPE_CHECKING
from cli.lib.common.utils import get_wheels
from jinja2 import Template
if TYPE_CHECKING:
from collections.abc import Iterable, Mapping
from typing import Iterable, Mapping, Optional
import logging
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Iterable, Tuple
logger = logging.getLogger(__name__)
_TPL_CONTENT = Template(
textwrap.dedent("""\
## {{ title }}
```{{ lang }}
{{ content }}
```
""")
)
_TPL_LIST_ITEMS = Template(
textwrap.dedent("""\
## {{ title }}
{% for it in items %}
- {{ it.pkg }}: {{ it.relpath }}
{% else %}
_(no item found)_
{% endfor %}
""")
)
_TPL_TABLE = Template(
textwrap.dedent("""\
{%- if rows %}
| {{ cols | join(' | ') }} |
|{%- for _ in cols %} --- |{%- endfor %}
{%- for r in rows %}
| {%- for c in cols %} {{ r.get(c, "") }} |{%- endfor %}
{%- endfor %}
{%- else %}
_(no data)_
{%- endif %}
""")
)
def gh_summary_path() -> Path | None:
"""Return the Path to the GitHub step summary file, or None if not set."""
@ -69,14 +27,13 @@ def write_gh_step_summary(md: str, *, append_content: bool = True) -> bool:
"""
sp = gh_summary_path()
if not sp:
# When running locally, just log to console instead of failing.
logger.info("[gh-summary] GITHUB_STEP_SUMMARY not set, skipping write.")
return False
md_clean = textwrap.dedent(md).strip() + "\n"
sp.parent.mkdir(parents=True, exist_ok=True)
mode = "a" if append_content else "w"
with sp.open(mode, encoding="utf-8") as f:
f.write(md_clean)
f.write(md.rstrip() + "\n")
return True
@ -85,59 +42,182 @@ def md_heading(text: str, level: int = 2) -> str:
return f"{'#' * max(1, min(level, 6))} {text}\n"
def md_kv_table(rows: Iterable[Mapping[str, str | int | float]]) -> str:
"""
Render a list of dictionaries as a Markdown table.
The first row (header) is derived from the union of all keys.
# Suppose you want to summarize benchmark results
rows = [
{"name": "transformer-small", "p50": 12.3, "p90(ms)": 18.4},
{"name": "transformer-large", "p50": 45.1, "p90(ms)": 60.7},
]
content = []
content.append(md_heading("Benchmark Results", level=2))
content.append(md_kv_table(rows))
content.append(md_details("Raw logs", "```\n[INFO] benchmark log ...\n```"))
# Join the pieces into one Markdown block
markdown = '\n'.join(content)
# Write to GitHub Actions summary (or log locally if not in CI)
write_gh_step_summary(markdown, append=True)
"""
rows = list(rows)
if not rows:
return "_(no data)_\n"
# Collect all columns across all rows
cols = list({k for r in rows for k in r.keys()})
header = "| " + " | ".join(cols) + " |\n"
sep = "|" + "|".join([" --- " for _ in cols]) + "|\n"
lines = []
for r in rows:
line = "| " + " | ".join(str(r.get(c, "")) for c in cols) + " |\n"
lines.append(line)
return header + sep + "".join(lines) + "\n"
def md_details(summary: str, content: str) -> str:
"""Generate a collapsible <details> block with a summary and inner content."""
return f"<details>\n<summary>{summary}</summary>\n\n{content}\n\n</details>\n"
# ---- helper to generate a summary for a list of pytest failures ------#
def summarize_failures_by_test_command(
xml_and_labels: Iterable[Tuple[str | Path, str]],
*,
title: str = "Pytest Failures by Test Command",
dedupe_within_command: bool = True,
):
"""
Args:
xml_and_labels: list of (xml_path, label) pairs.
Each XML corresponds to one pytest subprocess (one test command).
Behavior:
- Writes a section per test command if it has failures.
- Each failed test is listed as 'path/to/test.py:test_name'.
Example:
xmls = [
("reports/junit_cmd0.xml", "pytest -v -s tests/unit"),
("reports/junit_cmd1.xml", "pytest -v -s tests/integration"),
("reports/junit_cmd2.xml", "pytest -v -s tests/entrypoints"),
]
summarize_failures_by_test_command(
xmls,
title="Consolidated Pytest Failures",
)
"""
write_gh_step_summary(md_heading(title, level=2))
for xml_path, label in xml_and_labels:
xmlp = Path(xml_path)
failed = _parse_failed_simple(xmlp)
if dedupe_within_command:
failed = sorted(set(failed))
if not failed:
continue # skip commands with no failures
write_gh_step_summary(md_heading(f"Test Command: {label}", level=3))
lines = "\n".join(f"- {item}" for item in failed)
write_gh_step_summary(lines + "\n")
def _to_simple_name_from_testcase(tc: ET.Element) -> str:
"""
Convert a <testcase> into 'path/to/test.py:test_name' format.
Prefer the 'file' attribute if available, else fall back to classname.
"""
name = tc.attrib.get("name", "")
file_attr = tc.attrib.get("file")
if file_attr:
return f"{file_attr}:{name}"
classname = tc.attrib.get("classname", "")
parts = classname.split(".") if classname else []
if len(parts) >= 1:
# drop last part if it's a class, treat rest as module path
mod_parts = parts[:-1] if len(parts) >= 2 else parts
mod_path = "/".join(mod_parts) + ".py" if mod_parts else "unknown.py"
return f"{mod_path}:{name}"
return f"unknown.py:{name or 'unknown_test'}"
def _parse_failed_simple(xml_path: Path) -> list[str]:
"""
Parse one XML, return failures as ['tests/a_test.py:test_x', ...].
Only include <failure> and <error>.
"""
if not xml_path.exists():
return []
tree = ET.parse(xml_path)
root = tree.getroot()
failed = []
for tc in root.iter("testcase"):
if any(x.tag in {"failure", "error"} for x in tc):
failed.append(_to_simple_name_from_testcase(tc))
return failed
def summarize_content_from_file(
output_dir: Path,
freeze_file: str,
title: str = "Content from file",
title: str = "Wheels (pip freeze)",
code_lang: str = "", # e.g. "text" or "ini"
) -> bool:
"""
Read a text file from output_dir/freeze_file and append it to
the GitHub Step Summary as a Markdown code block.
Returns True if something was written, False otherwise.
"""
f = Path(output_dir) / freeze_file
if not f.exists():
return False
content = f.read_text(encoding="utf-8").strip()
md = render_content(content, title=title, lang=code_lang)
return write_gh_step_summary(md)
def summarize_wheels(path: Path, title: str = "Wheels", max_depth: int = 3):
items = get_wheels(path, max_depth=max_depth)
if not items:
if not content:
return False
md = render_list(items, title=title)
return write_gh_step_summary(md)
md = []
md.append(md_heading(title, 2))
md.append(f"```{code_lang}".rstrip())
md.append(content)
md.append("```")
return write_gh_step_summary("\n".join(md) + "\n")
def md_kv_table(rows: Iterable[Mapping[str, str | int | float]]) -> str:
def summarize_wheels(
output_dir: Path,
title: str = "Wheels",
max_depth: Optional[int] = None, # None = unlimited
):
"""
Render a list of dicts as a Markdown table using Jinja template.
Walk output_dir up to max_depth and list all *.whl files.
Grouped as 'package: filename.whl'.
Args:
output_dir: base directory to search
title: section title in GH summary
max_depth: maximum folder depth relative to output_dir (0 = only top-level)
"""
rows = list(rows)
cols = list({k for r in rows for k in r.keys()})
md = _TPL_TABLE.render(cols=cols, rows=rows).strip() + "\n"
return md
if not output_dir.exists():
return False
root = Path(output_dir)
lines = [md_heading(title, 2)]
for dirpath, _, filenames in os.walk(root):
depth = Path(dirpath).relative_to(root).parts
if max_depth is not None and len(depth) > max_depth:
# skip going deeper
continue
def render_list(
items: Iterable[str],
*,
title: str = "List",
) -> str:
tpl = _TPL_LIST_ITEMS
md = tpl.render(title=title, items=items)
return md
for fname in sorted(filenames):
if not fname.endswith(".whl"):
continue
pkg = fname.split("-")[0]
relpath = str(Path(dirpath) / fname).replace(str(root) + os.sep, "")
lines.append(f"- {pkg}: {relpath}")
def render_content(
content: str,
*,
title: str = "Content",
lang: str = "text",
) -> str:
tpl = _TPL_CONTENT
md = tpl.render(title=title, content=content, lang=lang)
return md
if len(lines) > 1:
write_gh_step_summary("\n".join(lines) + "\n")

View File

@ -4,7 +4,7 @@ import shlex
import shutil
import sys
from collections.abc import Iterable
from importlib.metadata import PackageNotFoundError, version # noqa: UP035
from importlib.metadata import PackageNotFoundError, version
from typing import Optional, Union
from cli.lib.common.utils import run_command

View File

@ -8,7 +8,6 @@ import shlex
import subprocess
import sys
from contextlib import contextmanager
from pathlib import Path
from typing import Optional
@ -116,24 +115,3 @@ def working_directory(path: str):
yield
finally:
os.chdir(prev_cwd)
def get_wheels(
output_dir: Path,
max_depth: Optional[int] = None,
) -> list[str]:
"""Return a list of wheels found in the given output directory."""
root = Path(output_dir)
if not root.exists():
return []
items = []
for dirpath, _, filenames in os.walk(root):
depth = Path(dirpath).relative_to(root).parts
if max_depth is not None and len(depth) > max_depth:
continue
for fname in sorted(filenames):
if fname.endswith(".whl"):
pkg = fname.split("-")[0]
relpath = str((Path(dirpath) / fname).relative_to(root))
items.append({"pkg": pkg, "relpath": relpath})
return items
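
A short, hedged example of what the get_wheels helper above produces; the dist/ layout in the comments is hypothetical:

```python
from pathlib import Path

from cli.lib.common.utils import get_wheels

# Hypothetical artifact layout:
#   dist/torch-2.9.0.dev20250826-cp310-cp310-linux_aarch64.whl
#   dist/vision/torchvision-0.24.0-cp310-cp310-linux_aarch64.whl
for item in get_wheels(Path("dist"), max_depth=3):
    print(f"{item['pkg']}: {item['relpath']}")
# torch: torch-2.9.0.dev20250826-cp310-cp310-linux_aarch64.whl
# torchvision: vision/torchvision-0.24.0-cp310-cp310-linux_aarch64.whl
```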

View File

@ -1,27 +1,15 @@
import logging
import os
import textwrap
from pathlib import Path
import re
from typing import Any
from cli.lib.common.gh_summary import write_gh_step_summary
from cli.lib.common.git_helper import clone_external_repo
from cli.lib.common.pip_helper import pip_install_packages
from cli.lib.common.utils import run_command, temp_environ, working_directory
from jinja2 import Template
from cli.lib.common.gh_summary import md_heading, write_gh_step_summary
logger = logging.getLogger(__name__)
_TPL_VLLM_INFO = Template(
textwrap.dedent("""\
## Vllm against Pytorch CI Test Summary
**Vllm Commit**: [{{ vllm_commit }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})
{%- if torch_sha %}
**Pytorch Commit**: [{{ torch_sha }}](https://github.com/pytorch/pytorch/commit/{{ torch_sha }})
{%- endif %}
""")
)
def sample_vllm_test_library():
"""
@ -245,12 +233,3 @@ def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) ->
for k in sorted(mapping, key=len, reverse=True):
step = step.replace(k, mapping[k])
return step
def summarize_build_info(vllm_commit: str) -> bool:
torch_sha = os.getenv("GITHUB_SHA")
md = (
_TPL_VLLM_INFO.render(vllm_commit=vllm_commit, torch_sha=torch_sha).strip()
+ "\n"
)
return write_gh_step_summary(md)

View File

@ -4,7 +4,6 @@ import textwrap
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
from cli.lib.common.cli_helper import BaseRunner
from cli.lib.common.docker_helper import local_image_exists
from cli.lib.common.envs_helper import (
@ -13,11 +12,6 @@ from cli.lib.common.envs_helper import (
env_str_field,
with_params_help,
)
from cli.lib.common.gh_summary import (
gh_summary_path,
summarize_content_from_file,
summarize_wheels,
)
from cli.lib.common.path_helper import (
copy,
ensure_dir_exists,
@ -26,7 +20,14 @@ from cli.lib.common.path_helper import (
is_path_exist,
)
from cli.lib.common.utils import run_command
from cli.lib.core.vllm.lib import clone_vllm, summarize_build_info
from cli.lib.core.vllm.lib import clone_vllm, write_gh_step_summary
from cli.lib.common.gh_summary import (
summarize_content_from_file,
summarize_wheels,
gh_summary_path,
)
import torch
from torch import torch_version
logger = logging.getLogger(__name__)
@ -160,7 +161,17 @@ class VllmBuildRunner(BaseRunner):
logger.info("Running vllm build with inputs: %s", inputs)
vllm_commit = clone_vllm()
vllm_sha_url = f"${vllm_commit}](https://github.com/vllm-project/vllm/commit/${vllm_commit})"
write_gh_step_summary(
f"""
## Commit Info
- **Vllm Commit**: `{vllm_sha_url}`
- **Torch Version**: `{torch_version}`
"""
)
self.cp_dockerfile_if_exist(inputs)
# cp torch wheels from root direct to vllm workspace if exist
self.cp_torch_whls_if_exist(inputs)
@ -181,19 +192,26 @@ class VllmBuildRunner(BaseRunner):
if not gh_summary_path():
return logger.info("Skipping, not detect GH Summary env var....")
logger.info("Generate GH Summary ...")
# summarize vllm build info
summarize_build_info(vllm_commit)
# summarize vllm build artifacts
vllm_sha_url = f"[{vllm_commit}](https://github.com/vllm-project/vllm/commit/{vllm_commit})"
write_gh_step_summary(
f"""
## Build vllm against Pytorch CI
**Vllm Commit**: `{vllm_sha_url}`
"""
)
torch_sha = os.getenv("GITHUB_SHA")
if torch_sha: # only can grab this in github action
torch_sha_url = (
f"[{torch_sha}](https://github.com/pytorch/pytorch/commit/{torch_sha})]"
)
write_gh_step_summary(
f"""
**Pytorch Commit**: `{torch_sha_url}`
"""
)
vllm_artifact_dir = inputs.output_dir / "wheels"
summarize_content_from_file(
vllm_artifact_dir,
"build_summary.txt",
title="Vllm build env pip package summary",
)
summarize_wheels(
inputs.torch_whls_path, max_depth=3, title="Torch Wheels Artifacts"
)
summarize_content_from_file(vllm_artifact_dir, "build_summary.txt", title="Vllm build package summary")
summarize_wheels(inputs.torch_whls_path, max_depth=3, title="Torch Wheels Artifacts")
summarize_wheels(vllm_artifact_dir, max_depth=3, title="Vllm Wheels Artifacts")
def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str:

View File

@ -215,6 +215,7 @@ def preprocess_test_in(
"torchaudio",
"xformers",
"mamba_ssm",
"pybind11",
] + additional_package_to_move
# Read current requirements
target_path = Path(target_file)

View File

@ -300,3 +300,24 @@ except RuntimeError as e:
exit 1
fi
fi
###############################################################################
# Check for C++ ABI compatibility to GCC-11 - GCC 13
###############################################################################
if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then
pushd /tmp
# Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html
# gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19
# gcc 11 - CUDA 11.8, xpu, rocm
# gcc 13 - CUDA 12.6, 12.8 and cpu
# Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426
if [[ "$(uname -m)" == "s390x" ]]; then
cxx_abi="19"
elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then
cxx_abi="18"
else
cxx_abi="16"
fi
python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)"
popd
fi
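
For reference, a minimal sketch of inspecting the same attribute interactively, assuming a locally installed torch manywheel:

```python
import torch

# The check above asserts this string equals "_cxxabi10" + cxx_abi,
# e.g. "_cxxabi1018" for a gcc-13 (CUDA 12.x / CPU) build.
print(torch._C._PYBIND11_BUILD_ABI)
```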

View File

@ -35,10 +35,11 @@ fi
print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
# Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
# backends (specifically the gloo backend), so test that this case works too
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then

View File

@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd
python -mpip install -r requirements.txt
# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1
python -mpip install --no-input -r requirements.txt
setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
# This environment variable makes ProcessGroupGloo default to
@ -306,47 +302,6 @@ test_torchbench_smoketest() {
fi
done
echo "Pytorch benchmark on mps device completed"
}
test_aoti_torchbench_smoketest() {
print_cmake_info
echo "Launching AOTInductor torchbench setup"
pip_benchmark_deps
# shellcheck disable=SC2119,SC2120
torchbench_setup_macos
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
local device=mps
local dtypes=(undefined float16 bfloat16 notset)
local dtype=${dtypes[$1]}
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}"
local dtype_arg="--${dtype}"
if [ "$dtype" == notset ]; then
dtype_arg="--float32"
fi
touch "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv"
for model in "${models[@]}"; do
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
--performance --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv" || true
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
--accuracy --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_accuracy.csv" || true
done
echo "Launching HuggingFace inference performance run for AOT Inductor and dtype ${dtype}"
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
--performance --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_performance.csv" || true
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
--accuracy --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_accuracy.csv" || true
echo "Pytorch benchmark on mps device completed"
}
@ -395,8 +350,6 @@ elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then
test_timm_perf
elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then
test_torchbench_smoketest "${SHARD_NUMBER}"
elif [[ $TEST_CONFIG == *"aot_inductor_perf_smoketest"* ]]; then
test_aoti_torchbench_smoketest "${SHARD_NUMBER}"
elif [[ $TEST_CONFIG == *"mps"* ]]; then
test_python_mps
elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then

View File

@ -496,14 +496,6 @@ test_inductor_cpp_wrapper_shard() {
-k 'take' \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
python test/run_test.py \
--include inductor/test_mkldnn_pattern_matcher \
-k 'xpu' \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
fi
}
# "Global" flags for inductor benchmarking controlled by TEST_CONFIG

View File

@ -44,7 +44,7 @@ python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==
python -m pip install z3-solver==4.15.1.0
# Install tlparse for test\dynamo\test_structured_trace.py UTs.
python -m pip install tlparse==0.4.0
python -m pip install tlparse==0.3.30
# Install parameterized
python -m pip install parameterized==0.8.1

View File

@ -13,9 +13,9 @@ if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build"
:xpu_bundle_install_start
set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe
set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product
set XPU_BUNDLE_VERSION=2025.1.3+5
set XPU_BUNDLE_VERSION=2025.0.1+20
set XPU_BUNDLE_INSTALLED=0
set XPU_BUNDLE_UNINSTALL=0
set XPU_EXTRA_URL=NULL
@ -24,9 +24,9 @@ set XPU_EXTRA_VERSION=2025.0.1+1226
set XPU_EXTRA_INSTALLED=0
set XPU_EXTRA_UNINSTALL=0
if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.2] (
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
set XPU_BUNDLE_VERSION=2025.2.1+20
if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.1] (
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
set XPU_BUNDLE_VERSION=2025.1.3+5
)
:: Check if XPU bundle is target version or already installed
@ -90,3 +90,14 @@ if errorlevel 1 exit /b 1
del xpu_extra.exe
:xpu_install_end
if not "%XPU_ENABLE_KINETO%"=="1" goto install_end
:: Install Level Zero SDK
set XPU_EXTRA_LZ_URL=https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip
curl -k -L %XPU_EXTRA_LZ_URL% --output "%SRC_DIR%\temp_build\level_zero_sdk.zip"
echo "Installing level zero SDK..."
7z x "%SRC_DIR%\temp_build\level_zero_sdk.zip" -o"%SRC_DIR%\temp_build\level_zero"
set "INCLUDE=%SRC_DIR%\temp_build\level_zero\include;%INCLUDE%"
del "%SRC_DIR%\temp_build\level_zero_sdk.zip"
:install_end

View File

@ -213,8 +213,7 @@ pip install requests ninja typing-extensions
retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
retry brew install libomp
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is built as part of tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is built as part of tensorpipe submodule
export USE_DISTRIBUTED=1
export USE_MKLDNN=OFF

View File

@ -15,7 +15,8 @@ fi
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
export VC_YEAR=2022
export USE_SCCACHE=0
export XPU_VERSION=2025.2
export XPU_VERSION=2025.1
export XPU_ENABLE_KINETO=1
fi
echo "Free space on filesystem before build:"

View File

@ -8,7 +8,7 @@ export VC_YEAR=2022
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
export VC_YEAR=2022
export XPU_VERSION=2025.2
export XPU_VERSION=2025.1
fi
pushd "$PYTORCH_ROOT/.ci/pytorch/"

View File

@ -62,6 +62,8 @@ runs:
MAX_JOBS="$(nproc --ignore=6)"
export MAX_JOBS
echo "$GITHUB_STEP_SUMMARY"
# Split the comma-separated list and build each target
IFS=',' read -ra TARGETS <<< "$BUILD_TARGETS"
for target in "${TARGETS[@]}"; do

View File

@ -57,21 +57,6 @@ runs:
submodules: ${{ inputs.submodules }}
show-progress: false
- name: Clean submodules post checkout
id: clean-submodules
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
shell: bash
env:
NO_SUDO: ${{ inputs.no-sudo }}
run: |
cd "${GITHUB_WORKSPACE}"
# Clean stale submodule dirs
if [ -z "${NO_SUDO}" ]; then
sudo git submodule foreach --recursive git clean -ffdx
else
git submodule foreach --recursive git clean -ffdx
fi
- name: Clean workspace (try again)
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' &&
(steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }}

View File

@ -1 +1 @@
1c66402d0fa47ea74d365dcaa468d397da481918
10a5002c6195bd95e34df8fe28ff8a2d55a2a922

View File

@ -1 +1 @@
752d2e1c364e4195093e4f3f2fc33e3ae1840707
add1adfec742dfb13e614dab3372b5aafd1ff046

View File

@ -1 +1 @@
763e5b78d4fcd74a9e812256656c075f99d9a781
a1c6ee92c85e8b0955c20892ed68f032a6015c09

View File

@ -359,7 +359,7 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Build flashinfer for torch nightly from source around 10 mins
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
ARG FLASHINFER_GIT_REF="v0.2.14.post1"
ARG ="v0.2.14.post1"
RUN --mount=type=cache,target=/root/.cache/uv \
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \

View File

@ -28,7 +28,7 @@ pyyaml==6.0.2
scipy==1.12.0
setuptools==72.1.0
sympy==1.13.3
tlparse==0.4.0
tlparse==0.3.30
tensorboard==2.13.0
typing-extensions==4.12.2
unittest-xml-reporting<=3.2.0,>=2.0.0

View File

@ -40,7 +40,7 @@ CPU_AARCH64_ARCH = ["cpu-aarch64"]
CPU_S390X_ARCH = ["cpu-s390x"]
CUDA_AARCH64_ARCHES = ["12.9-aarch64", "13.0-aarch64"]
CUDA_AARCH64_ARCHES = ["12.9-aarch64"]
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
@ -113,26 +113,26 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"xpu": (
"intel-cmplr-lib-rt==2025.2.1 | "
"intel-cmplr-lib-ur==2025.2.1 | "
"intel-cmplr-lic-rt==2025.2.1 | "
"intel-sycl-rt==2025.2.1 | "
"oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"onemkl-sycl-blas==2025.2.0 | "
"onemkl-sycl-dft==2025.2.0 | "
"onemkl-sycl-lapack==2025.2.0 | "
"onemkl-sycl-rng==2025.2.0 | "
"onemkl-sycl-sparse==2025.2.0 | "
"dpcpp-cpp-rt==2025.2.1 | "
"intel-opencl-rt==2025.2.1 | "
"mkl==2025.2.0 | "
"intel-openmp==2025.2.1 | "
"tbb==2022.2.0 | "
"tcmlib==1.4.0 | "
"umf==0.11.0 | "
"intel-pti==0.13.1"
"intel-cmplr-lib-rt==2025.1.1 | "
"intel-cmplr-lib-ur==2025.1.1 | "
"intel-cmplr-lic-rt==2025.1.1 | "
"intel-sycl-rt==2025.1.1 | "
"oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"onemkl-sycl-blas==2025.1.0 | "
"onemkl-sycl-dft==2025.1.0 | "
"onemkl-sycl-lapack==2025.1.0 | "
"onemkl-sycl-rng==2025.1.0 | "
"onemkl-sycl-sparse==2025.1.0 | "
"dpcpp-cpp-rt==2025.1.1 | "
"intel-opencl-rt==2025.1.1 | "
"mkl==2025.1.0 | "
"intel-openmp==2025.1.1 | "
"tbb==2022.1.0 | "
"tcmlib==1.3.0 | "
"umf==0.10.0 | "
"intel-pti==0.12.3"
),
}
@ -244,6 +244,8 @@ def generate_libtorch_matrix(
arches.remove("13.0")
elif os == "windows":
arches += CUDA_ARCHES
if "13.0" in arches:
arches.remove("13.0")
if libtorch_variants is None:
libtorch_variants = [
"shared-with-deps",
@ -308,6 +310,8 @@ def generate_wheels_matrix(
arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
elif os == "windows":
arches += CUDA_ARCHES + XPU_ARCHES
if "13.0" in arches:
arches.remove("13.0")
elif os == "linux-aarch64":
# Separate new if as the CPU type is different and
# uses different build/test scripts

View File

@ -27,7 +27,6 @@ from trymerge import (
get_drci_classifications,
gh_get_team_members,
GitHubPR,
iter_issue_timeline_until_comment,
JobCheckState,
main as trymerge_main,
MandatoryChecksMissingError,
@ -35,8 +34,6 @@ from trymerge import (
RE_GHSTACK_DESC,
read_merge_rules,
remove_job_name_suffix,
sha_from_committed_event,
sha_from_force_push_after,
validate_revert,
)
@ -127,7 +124,7 @@ def mock_parse_args(revert: bool = False, force: bool = False) -> Any:
self.force = force
self.pr_num = 76123
self.dry_run = True
self.comment_id = 12345 # Set to non-zero value
self.comment_id = 0
self.reason = "this is for testing"
self.ignore_current = False
self.check_mergeability = False
@ -155,9 +152,9 @@ def mock_revert(
def mock_merge(
pr: GitHubPR,
repo: GitRepo,
comment_id: int,
dry_run: bool = False,
skip_mandatory_checks: bool = False,
comment_id: Optional[int] = None,
timeout_minutes: int = 400,
stale_pr_days: int = 3,
ignore_current: bool = False,
@ -473,9 +470,9 @@ class TestTryMerge(TestCase):
mock_merge.assert_called_once_with(
mock.ANY,
mock.ANY,
comment_id=mock.ANY,
dry_run=mock.ANY,
skip_mandatory_checks=True,
comment_id=mock.ANY,
ignore_current=False,
)
@ -488,9 +485,9 @@ class TestTryMerge(TestCase):
mock_merge.assert_called_once_with(
mock.ANY,
mock.ANY,
comment_id=mock.ANY,
dry_run=mock.ANY,
skip_mandatory_checks=False,
comment_id=mock.ANY,
ignore_current=False,
)
@ -1141,176 +1138,5 @@ Pull Request resolved: https://github.com/pytorch/pytorch/pull/154394"""
)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch(
"trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
)
class TestTimelineFunctions(TestCase):
"""Tests for the new timeline-related functions"""
def test_sha_from_committed_event(self, *args: Any) -> None:
"""Test extracting SHA from committed event"""
# Based on actual GitHub API format - committed events have "sha" at top level
event = {
"event": "committed",
"sha": "fb21ce932ded6670c918804a0d9151b773770a7c",
}
self.assertEqual(
sha_from_committed_event(event), "fb21ce932ded6670c918804a0d9151b773770a7c"
)
# Test with missing SHA
event_no_sha = {"event": "committed"}
self.assertIsNone(sha_from_committed_event(event_no_sha))
def test_sha_from_force_push_after(self, *args: Any) -> None:
"""Test extracting SHA from force push event"""
# NOTE: The current function doesn't handle the actual GitHub API format
# Real force push events have "commit_id" at top level, but this function
# looks for "after", "after_commit", "after_sha", or "head_sha" fields
# Test with the legacy format the current function handles
event_legacy = {
"event": "head_ref_force_pushed",
"after": {"sha": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e"},
}
self.assertEqual(
sha_from_force_push_after(event_legacy),
"ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
)
# Test with current GitHub API format (should return None with current implementation)
event_real_api = {
"event": "head_ref_force_pushed",
"commit_id": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
}
self.assertEqual(
sha_from_force_push_after(event_real_api),
"ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
) # Current function doesn't handle commit_id
# Test with missing SHA
event_no_sha = {"event": "head_ref_force_pushed"}
self.assertIsNone(sha_from_force_push_after(event_no_sha))
@mock.patch("trymerge.gh_fetch_json_list")
def test_iter_issue_timeline_until_comment(
self, mock_gh_fetch_json_list: Any, *args: Any
) -> None:
"""Test timeline iteration until target comment"""
# Mock timeline data based on actual GitHub API format
timeline_data = [
{"event": "commented", "id": 100, "body": "first comment"},
{"event": "committed", "sha": "fb21ce932ded6670c918804a0d9151b773770a7c"},
{"event": "commented", "id": 200, "body": "target comment"},
{"event": "commented", "id": 300, "body": "after target"},
]
mock_gh_fetch_json_list.return_value = timeline_data
# Test iteration stops at target comment
events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 200))
self.assertEqual(len(events), 3) # Should stop at target comment
self.assertEqual(events[0]["event"], "commented")
self.assertEqual(events[0]["id"], 100)
self.assertEqual(events[1]["event"], "committed")
self.assertEqual(events[1]["sha"], "fb21ce932ded6670c918804a0d9151b773770a7c")
self.assertEqual(events[2]["event"], "commented")
self.assertEqual(events[2]["id"], 200)
@mock.patch("trymerge.gh_fetch_json_list")
def test_iter_issue_timeline_until_comment_not_found(
self, mock_gh_fetch_json_list: Any, *args: Any
) -> None:
"""Test timeline iteration when target comment is not found"""
# Mock empty timeline
mock_gh_fetch_json_list.return_value = []
events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 999))
self.assertEqual(len(events), 0)
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_commit_after_comment(
self, mock_iter_timeline: Any, *args: Any
) -> None:
"""Test get_commit_sha_at_comment returns correct SHA after comment"""
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "committed", "sha": "commit2"},
{"event": "commented", "id": 100},
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertEqual(sha, "commit2")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_force_push_before_comment(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "committed", "sha": "commit2"},
{"event": "head_ref_force_pushed", "commit_id": "commit3"},
{"event": "commented", "id": 100},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertEqual(sha, "commit3")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_force_push_before_comment_legacy_mode(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "committed", "sha": "commit2"},
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
{"event": "commented", "id": 100},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertEqual(sha, "commit3")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_multiple_comments(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "commented", "id": 100},
{"event": "committed", "sha": "commit2"},
{"event": "commented", "id": 200},
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
{"event": "commented", "id": 300},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(200)
self.assertEqual(sha, "commit2")
sha = pr.get_commit_sha_at_comment(300)
self.assertEqual(sha, "commit3")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_no_events(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "commented", "id": 100},
{"event": "labeled", "label": {"name": "test"}},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertIsNone(sha)
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_exception(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.side_effect = Exception("API error")
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertIsNone(sha)
if __name__ == "__main__":
main()

View File

@ -450,63 +450,6 @@ HAS_NO_CONNECTED_DIFF_TITLE = (
IGNORABLE_FAILED_CHECKS_THESHOLD = 10
def iter_issue_timeline_until_comment(
org: str, repo: str, issue_number: int, target_comment_id: int, max_pages: int = 200
) -> Any:
"""
Yield timeline entries in order until (and including) the entry whose id == target_comment_id
for a 'commented' event. Stops once the target comment is encountered.
"""
page = 1
while page <= max_pages:
url = (
f"https://api.github.com/repos/{org}/{repo}/issues/{issue_number}/timeline"
)
params = {"per_page": 100, "page": page}
batch = gh_fetch_json_list(url, params)
if not batch:
return
for ev in batch:
# The target is the issue comment row with event == "commented" and id == issue_comment_id
if ev.get("event") == "commented" and ev.get("id") == target_comment_id:
yield ev # nothing in the timeline after this matters, so stop early
return
yield ev
if len(batch) < 100:
return
page += 1
# If we got here without finding the comment, then we either hit a bug or some github PR
# has a _really_ long timeline.
# The max # of pages found on any pytorch/pytorch PR at the time of this change was 41
raise RuntimeError(
f"Could not find a merge commit in the first {max_pages} pages of the timeline at url {url}."
f"This is most likely a bug, please report it to the @pytorch/pytorch-dev-infra team."
)
def sha_from_committed_event(ev: dict[str, Any]) -> Optional[str]:
"""Extract SHA from committed event in timeline"""
return ev.get("sha")
def sha_from_force_push_after(ev: dict[str, Any]) -> Optional[str]:
"""Extract SHA from force push event in timeline"""
# The current GitHub API format
commit_id = ev.get("commit_id")
if commit_id:
return str(commit_id)
# Legacy format
after = ev.get("after") or ev.get("after_commit") or {}
if isinstance(after, dict):
return after.get("sha") or after.get("oid")
return ev.get("after_sha") or ev.get("head_sha")
def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any:
rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no)
return rc["data"]["repository"]["pullRequest"]
@ -794,24 +737,16 @@ class GitHubPR:
def last_commit(self) -> Any:
return self.info["commits"]["nodes"][-1]["commit"]
def last_commit_sha(self, default: Optional[str] = None) -> str:
# for commits, the oid is the sha
if default is None:
return str(self.last_commit()["oid"])
return str(self.last_commit().get("oid", default))
def get_merge_base(self) -> str:
if self.merge_base:
return self.merge_base
last_commit_sha = self.last_commit_sha()
last_commit_oid = self.last_commit()["oid"]
# NB: We could use self.base_ref() here for regular PR, however, that doesn't
# work for ghstack where the base is the custom branch, i.e. gh/USER/ID/base,
# so let's just use main instead
self.merge_base = gh_fetch_merge_base(
self.org, self.project, last_commit_sha, self.default_branch()
self.org, self.project, last_commit_oid, self.default_branch()
)
# Fallback to baseRefOid if the API call fails, i.e. rate limit. Note that baseRefOid
@ -900,44 +835,6 @@ class GitHubPR:
def get_commit_count(self) -> int:
return int(self.info["commits_with_authors"]["totalCount"])
def get_commit_sha_at_comment(self, comment_id: int) -> Optional[str]:
"""
Get the PR head commit SHA that was present when a specific comment was posted.
This ensures we only merge the state of the PR at the time the merge command was issued,
not any subsequent commits that may have been pushed after.
Returns None if no head-changing events found before the comment or if the comment was not found.
"""
head = None
try:
for event in iter_issue_timeline_until_comment(
self.org, self.project, self.pr_num, comment_id
):
etype = event.get("event")
if etype == "committed":
sha = sha_from_committed_event(event)
if sha:
head = sha
print(f"Timeline: Found commit event for SHA {sha}")
elif etype == "head_ref_force_pushed":
sha = sha_from_force_push_after(event)
if sha:
head = sha
print(f"Timeline: Found force push event for SHA {sha}")
elif etype == "commented":
if event.get("id") == comment_id:
print(f"Timeline: Found final comment with sha {sha}")
return head
except Exception as e:
print(
f"Warning: Failed to reconstruct timeline for comment {comment_id}: {e}"
)
return None
print(f"Did not find comment with id {comment_id} in the PR timeline")
return None
def get_pr_creator_login(self) -> str:
return cast(str, self.info["author"]["login"])
@ -1254,7 +1151,7 @@ class GitHubPR:
*,
skip_mandatory_checks: bool = False,
dry_run: bool = False,
comment_id: int,
comment_id: Optional[int] = None,
ignore_current_checks: Optional[list[str]] = None,
) -> None:
# Raises exception if matching rule is not found
@ -1270,7 +1167,7 @@ class GitHubPR:
skip_internal_checks=can_skip_internal_checks(self, comment_id),
ignore_current_checks=ignore_current_checks,
)
additional_merged_prs = self.merge_changes_locally(
additional_merged_prs = self.merge_changes(
repo, skip_mandatory_checks, comment_id
)
@ -1299,7 +1196,7 @@ class GitHubPR:
broken_trunk_checks=ignorable_checks.get("BROKEN_TRUNK", []),
flaky_checks=ignorable_checks.get("FLAKY", []),
unstable_checks=ignorable_checks.get("UNSTABLE", []),
last_commit_sha=self.last_commit_sha(default=""),
last_commit_sha=self.last_commit().get("oid", ""),
merge_base_sha=self.get_merge_base(),
merge_commit_sha=merge_commit_sha,
is_failed=False,
@ -1320,7 +1217,7 @@ class GitHubPR:
dry_run=dry_run,
)
def merge_changes_locally(
def merge_changes(
self,
repo: GitRepo,
skip_mandatory_checks: bool = False,
@ -1329,15 +1226,27 @@ class GitHubPR:
skip_all_rule_checks: bool = False,
) -> list["GitHubPR"]:
"""
:param skip_all_rule_checks: If true, skips all rule checks on ghstack PRs, useful for dry-running merge locally
:param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally
"""
branch_to_merge_into = self.default_branch() if branch is None else branch
if repo.current_branch() != branch_to_merge_into:
repo.checkout(branch_to_merge_into)
if not self.is_ghstack_pr():
msg = self.gen_commit_message()
pr_branch_name = f"__pull-request-{self.pr_num}__init__"
repo.fetch(self.last_commit()["oid"], pr_branch_name)
repo._run_git("merge", "--squash", pr_branch_name)
repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
# It's okay to skip the commit SHA check for ghstack PRs since
# authoring requires write access to the repo.
if self.is_ghstack_pr():
# Did the PR change since we started the merge?
pulled_sha = repo.show_ref(pr_branch_name)
latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
if pulled_sha != latest_pr_status.last_commit()["oid"]:
raise RuntimeError(
"PR has been updated since CI checks last passed. Please rerun the merge command."
)
return []
else:
return self.merge_ghstack_into(
repo,
skip_mandatory_checks,
@ -1345,48 +1254,6 @@ class GitHubPR:
skip_all_rule_checks=skip_all_rule_checks,
)
msg = self.gen_commit_message()
pr_branch_name = f"__pull-request-{self.pr_num}__init__"
# Determine which commit SHA to merge
commit_to_merge = None
if not comment_id:
raise ValueError("Must provide --comment-id when merging regular PRs")
# Get the commit SHA that was present when the comment was made
commit_to_merge = self.get_commit_sha_at_comment(comment_id)
if not commit_to_merge:
raise RuntimeError(
f"Could not find commit that was pushed before comment {comment_id}"
)
# Validate that this commit is the latest commit on the PR
latest_commit = self.last_commit_sha()
if commit_to_merge != latest_commit:
raise RuntimeError(
f"Commit {commit_to_merge} was HEAD when comment {comment_id} was posted "
f"but now the latest commit on the PR is {latest_commit}. "
f"Please re-issue the merge command to merge the latest commit."
)
print(f"Merging commit {commit_to_merge} locally")
repo.fetch(commit_to_merge, pr_branch_name)
repo._run_git("merge", "--squash", pr_branch_name)
repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
# Did the PR change since we started the merge?
pulled_sha = repo.show_ref(pr_branch_name)
latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
if (
pulled_sha != latest_pr_status.last_commit_sha()
or pulled_sha != commit_to_merge
):
raise RuntimeError(
"PR has been updated since CI checks last passed. Please rerun the merge command."
)
return []
class MergeRuleFailedError(RuntimeError):
def __init__(self, message: str, rule: Optional["MergeRule"] = None) -> None:
@ -1591,7 +1458,7 @@ def find_matching_merge_rule(
pending_checks = []
failed_checks = []
hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit_sha()}"
hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit()['oid']}"
if len(failed_checks) > 0:
if reject_reason_score < 30000:
reject_reason_score = 30000
@ -2289,14 +2156,14 @@ def categorize_checks(
def merge(
pr: GitHubPR,
repo: GitRepo,
comment_id: int,
dry_run: bool = False,
skip_mandatory_checks: bool = False,
comment_id: Optional[int] = None,
timeout_minutes: int = 400,
stale_pr_days: int = 3,
ignore_current: bool = False,
) -> None:
initial_commit_sha = pr.last_commit_sha()
initial_commit_sha = pr.last_commit()["oid"]
pr_link = f"https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num}"
print(f"Attempting merge of {initial_commit_sha} ({pr_link})")
@ -2367,7 +2234,7 @@ def merge(
f"Attempting merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} ({elapsed_time / 60} minutes elapsed)"
)
pr = GitHubPR(pr.org, pr.project, pr.pr_num)
if initial_commit_sha != pr.last_commit_sha():
if initial_commit_sha != pr.last_commit()["oid"]:
raise RuntimeError(
"New commits were pushed while merging. Please rerun the merge command."
)
@ -2534,7 +2401,7 @@ def main() -> None:
if args.check_mergeability:
if pr.is_ghstack_pr():
get_ghstack_prs(repo, pr) # raises error if out of sync
pr.merge_changes_locally(
pr.merge_changes(
repo,
skip_mandatory_checks=True,
skip_all_rule_checks=True,
@ -2549,18 +2416,12 @@ def main() -> None:
gh_post_pr_comment(org, project, args.pr_num, message, dry_run=args.dry_run)
return
try:
# Ensure comment id is set, else fail
if not args.comment_id:
raise ValueError(
"Comment ID is required for merging PRs, please provide it using --comment-id"
)
merge(
pr,
repo,
comment_id=args.comment_id,
dry_run=args.dry_run,
skip_mandatory_checks=args.force,
comment_id=args.comment_id,
ignore_current=args.ignore_current,
)
except Exception as e:
@ -2582,7 +2443,7 @@ def main() -> None:
broken_trunk_checks=[],
flaky_checks=[],
unstable_checks=[],
last_commit_sha=pr.last_commit_sha(default=""),
last_commit_sha=pr.last_commit().get("oid", ""),
merge_base_sha=pr.get_merge_base(),
is_failed=True,
skip_mandatory_checks=args.force,
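
To make the comment-pinned flow above easier to follow, a condensed sketch of the validation it performs, assuming a GitHubPR `pr` and a GitRepo `repo` as defined in trymerge.py; this is illustrative, not a drop-in replacement:

```python
def merge_exact_comment_state(pr, repo, comment_id: int) -> None:
    # Reconstruct which commit was the PR HEAD when the merge comment was posted.
    commit_to_merge = pr.get_commit_sha_at_comment(comment_id)
    if not commit_to_merge:
        raise RuntimeError(f"Could not find commit that was pushed before comment {comment_id}")
    # Refuse to merge anything pushed after the comment.
    if commit_to_merge != pr.last_commit_sha():
        raise RuntimeError("PR was updated after the merge command; please re-issue the merge command.")
    pr_branch_name = f"__pull-request-{pr.pr_num}__init__"
    repo.fetch(commit_to_merge, pr_branch_name)
    repo._run_git("merge", "--squash", pr_branch_name)
    repo._run_git("commit", f'--author="{pr.get_author()}"', "-m", pr.gen_commit_message())
```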

View File

@ -4,7 +4,7 @@
{%- set download_artifact_action = "actions/download-artifact@v4.1.7" -%}
{%- set timeout_minutes = 240 -%}
{%- set timeout_minutes_windows_binary = 360 -%}
{%- set timeout_minutes_windows_binary = 300 -%}
{%- macro concurrency(build_environment) -%}
concurrency:

View File

@ -77,7 +77,6 @@ jobs:
run: |
git config --global core.longpaths true
git config --global core.symlinks true
git config --global core.ignorecase false
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported

View File

@ -70,7 +70,6 @@ jobs:
run: |
git config --global core.longpaths true
git config --global core.symlinks true
git config --global core.ignorecase false
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported

View File

@ -275,7 +275,7 @@ jobs:
- name: Change permissions
if: ${{ always() && steps.test.conclusion }}
run: |
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1000:1000 test"
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"
- name: Print remaining test logs
shell: bash

View File

@ -145,7 +145,7 @@ jobs:
fi
docker exec -t "${container_name}" yum install -y zlib-devel zip
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==3.0.1 auditwheel wheel
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==2.13.1 auditwheel wheel
set +e
docker exec -t "${container_name}" command -v pip
has_pip=$?

View File

@ -50,23 +50,24 @@ jobs:
runner: [linux.12xlarge]
docker-image-name: [
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,
pytorch-linux-jammy-py3.9-clang12,
pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-rocm-n-py3,
pytorch-linux-noble-rocm-n-py3,
pytorch-linux-noble-rocm-alpha-py3,
pytorch-linux-jammy-rocm-n-py3-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12,
pytorch-linux-jammy-py3.10-gcc11,
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
pytorch-linux-jammy-py3.9-gcc11,
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,
pytorch-linux-jammy-py3.12-halide,
pytorch-linux-jammy-xpu-n-1-py3,
pytorch-linux-jammy-xpu-n-py3,
pytorch-linux-jammy-xpu-2025.0-py3,
pytorch-linux-jammy-xpu-2025.1-py3,
pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
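
The docker-image-name matrix above follows a regular naming scheme (distro, accelerator toolchain, Python version, compiler). A small sketch that buckets a sample of the entries visible above by accelerator, the way a reader might when comparing the two sides of this hunk:

from collections import defaultdict

IMAGES = [
    "pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11",
    "pytorch-linux-jammy-rocm-n-py3",
    "pytorch-linux-jammy-xpu-n-py3",
    "pytorch-linux-jammy-py3.10-clang12",
    "pytorch-linux-jammy-py3-clang18-asan",
    "pytorch-linux-jammy-linter",
]

def accelerator(name: str) -> str:
    for key in ("cuda", "rocm", "xpu"):
        if key in name:
            return key
    return "cpu"

buckets: dict[str, list[str]] = defaultdict(list)
for image in IMAGES:
    buckets[accelerator(image)].append(image)

for kind, names in sorted(buckets.items()):
    print(f"{kind}: {len(names)} image(s)")
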

View File

@ -158,52 +158,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
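
PYTORCH_EXTRA_INSTALL_REQUIREMENTS in the jobs above packs many PEP 508 requirement strings into a single value, separated by " | ", each carrying an environment marker. A short sketch of how such a value can be split and evaluated with the packaging library; this is an assumption about one reasonable consumer, the actual code that reads this variable is not shown in the diff:

from packaging.requirements import Requirement

EXTRA = (
    "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64'"
    " | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64'"
)

# Environment of the machine the wheel will be installed on.
env = {"platform_system": "Linux", "platform_machine": "x86_64"}

for chunk in EXTRA.split(" | "):
    req = Requirement(chunk)
    applies = req.marker.evaluate(env) if req.marker else True
    print(f"{req.name}{req.specifier}  applies={applies}")
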
manywheel-py3_11-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -315,52 +269,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -472,52 +380,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -629,52 +491,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -786,52 +602,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -943,52 +713,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -1099,49 +823,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@ -612,7 +612,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-xpu-test: # Testing
@ -1270,7 +1270,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-xpu-test: # Testing
@ -1928,7 +1928,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-xpu-test: # Testing
@ -2586,7 +2586,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-xpu-test: # Testing
@ -3244,7 +3244,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-xpu-test: # Testing
@ -3902,7 +3902,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-xpu-test: # Testing
@ -4560,7 +4560,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-xpu-test: # Testing
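
Each XPU hunk above swaps one pinned Intel runtime stack for another (2025.1.x/2021.15.x on one side, 2025.2.x/2021.16.x on the other). A small sketch that diffs two such pin strings package by package, using the same " | " separator and dropping the environment markers; OLD and NEW are labels for the sketch only, since the rendered diff does not mark which side is which:

def pins(spec: str) -> dict[str, str]:
    """Map package name -> pinned version, ignoring any environment marker."""
    out: dict[str, str] = {}
    for chunk in spec.split(" | "):
        requirement = chunk.split(";")[0].strip()
        name, _, version = requirement.partition("==")
        out[name.strip()] = version.strip()
    return out

OLD = "intel-cmplr-lib-rt==2025.1.1 | mkl==2025.1.0 | tbb==2022.1.0 | umf==0.10.0 | intel-pti==0.12.3"
NEW = "intel-cmplr-lib-rt==2025.2.1 | mkl==2025.2.0 | tbb==2022.2.0 | umf==0.11.0 | intel-pti==0.13.1"

old, new = pins(OLD), pins(NEW)
for name in sorted(old.keys() | new.keys()):
    if old.get(name) != new.get(name):
        print(f"{name}: {old.get(name)} -> {new.get(name)}")
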

View File

@ -51,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -128,7 +128,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -51,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -128,7 +128,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -51,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -124,7 +124,7 @@ jobs:
- wheel-py3_11-cpu-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -198,7 +198,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -271,7 +271,7 @@ jobs:
- wheel-py3_12-cpu-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -345,7 +345,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -418,7 +418,7 @@ jobs:
- wheel-py3_13-cpu-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel

View File

@ -38,7 +38,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -153,7 +153,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -45,7 +45,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -160,7 +160,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -292,7 +292,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -408,7 +408,7 @@ jobs:
- libtorch-cuda12_6-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -542,7 +542,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -658,7 +658,7 @@ jobs:
- libtorch-cuda12_8-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -792,7 +792,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -908,7 +908,7 @@ jobs:
- libtorch-cuda12_9-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -1038,253 +1038,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda13_0-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda13_0-shared-with-deps-debug
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-debug-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda13_0-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda13_0-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-debug-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda13_0-shared-with-deps-debug-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda13_0-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
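
The "Display EC2 information" steps in the jobs above implement IMDSv2: a PUT request mints a short-lived session token, then metadata reads pass that token. A Python sketch of the same two calls, using only the standard library and the fixed 169.254.169.254 endpoint from the script; it only returns data when run on an EC2 instance:

import urllib.request

IMDS = "http://169.254.169.254/latest"

def get_ec2_metadata(category: str) -> str:
    """Fetch one EC2 metadata value using an IMDSv2 session token."""
    token_req = urllib.request.Request(
        f"{IMDS}/api/token",
        method="PUT",
        headers={"X-aws-ec2-metadata-token-ttl-seconds": "30"},
    )
    with urllib.request.urlopen(token_req, timeout=5) as resp:
        token = resp.read().decode()
    meta_req = urllib.request.Request(
        f"{IMDS}/meta-data/{category}",
        headers={"X-aws-ec2-metadata-token": token},
    )
    with urllib.request.urlopen(meta_req, timeout=5) as resp:
        return resp.read().decode()

if __name__ == "__main__":
    for category in ("ami-id", "instance-id", "instance-type"):
        print(f"{category}: {get_ec2_metadata(category)}")
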

View File

@ -38,7 +38,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -153,7 +153,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -45,7 +45,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -160,7 +160,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -292,7 +292,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -408,7 +408,7 @@ jobs:
- libtorch-cuda12_6-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -542,7 +542,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -658,7 +658,7 @@ jobs:
- libtorch-cuda12_8-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -792,7 +792,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -908,7 +908,7 @@ jobs:
- libtorch-cuda12_9-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -1038,253 +1038,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda13_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda13_0-shared-with-deps-release
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda13_0-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts;
# without this value, pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even if the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda13_0-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda13_0-shared-with-deps-release-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts;
# without this value, pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda13_0-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

File diff suppressed because it is too large

View File

@ -18,13 +18,13 @@ permissions:
contents: read
jobs:
inductor-build:
linux-jammy-cpu-py3_9-gcc11-inductor-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: inductor-build
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
# Use metal host for benchmark jobs
test-matrix: |
{ include: [
@ -32,13 +32,13 @@ jobs:
]}
secrets: inherit
inductor-micro-benchmark-test:
name: inductor-micro-benchmark-test
linux-jammy-cpu-py3_9-gcc11-inductor-micro-benchmark-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit
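The renaming pattern above recurs throughout these workflow diffs: the human-readable name keeps dots (linux-jammy-cpu-py3.9-gcc11-inductor) while the job id, which has to be usable as a key in needs: and in ${{ needs.<id>.outputs.* }} expressions, swaps dots for underscores and carries a -build or -test suffix. A tiny illustrative Python helper capturing that assumed convention (not a script that exists in the repo):

def job_id_from_display_name(display_name, suffix):
    # Assumed convention: dots become underscores, role suffix is appended.
    return f"{display_name.replace('.', '_')}-{suffix}"

build_id = job_id_from_display_name("linux-jammy-cpu-py3.9-gcc11-inductor", "build")
print(build_id)  # linux-jammy-cpu-py3_9-gcc11-inductor-build
print(f"docker-image: ${{{{ needs.{build_id}.outputs.docker-image }}}}")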

View File

@ -32,13 +32,13 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
nightly-dynamo-benchmarks-build:
name: nightly-dynamo-benchmarks-build
linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build:
name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
{ include: [
@ -51,13 +51,13 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
nightly-dynamo-benchmarks-test:
name: nightly-dynamo-benchmarks-test
linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-test:
name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: nightly-dynamo-benchmarks-build
needs: linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit

View File

@ -84,8 +84,9 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
# NB: Keep this in sync with trunk.yml
build:
name: build
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -127,7 +128,7 @@ jobs:
secrets: inherit
test-periodically:
name: test-periodically
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '15 0,12 * * 1-6'
@ -144,7 +145,7 @@ jobs:
secrets: inherit
test-weekly:
name: test-weekly
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
@ -161,12 +162,9 @@ jobs:
secrets: inherit
test:
name: test
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
# The pull_request trigger is used in PR to bump transformers pin which always
# needs one round of benchmark
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}
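The dashboard-tag above is assembled from workflow inputs, with || 'true' or || 'false' supplying a default when an input is empty. A hedged Python sketch of how most of that tag composes (field names and defaults mirror the expression above; the composition itself is done by the GitHub expression, not by any repo script):

def build_dashboard_tag(inputs):
    # (field, default) pairs mirroring most of the expression above.
    fields = [
        ("training", "true"), ("inference", "true"), ("default", "true"),
        ("dynamic", "true"), ("cudagraphs", "true"), ("cppwrapper", "false"),
        ("aotinductor", "false"), ("maxautotune", "false"),
        ("freezing_cudagraphs", "false"),
    ]
    # '||' in the GitHub expression falls back to the default when the input is empty.
    return "-".join(f"{name}-{inputs.get(name) or default}" for name, default in fields)

print(build_dashboard_tag({}))                      # all defaults
print(build_dashboard_tag({"cppwrapper": "true"}))  # one override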

View File

@ -48,9 +48,6 @@ jobs:
{ config: "perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" },
{ config: "perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" },
{ config: "perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" },
{ config: "aot_inductor_perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" },
{ config: "aot_inductor_perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" },
{ config: "aot_inductor_perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" },
]}
secrets: inherit

View File

@ -69,14 +69,14 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-zen-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cpu_x86_zen", shard: 1, num_shards: 3, runner: "linux.24xlarge.amd" },
@ -95,16 +95,16 @@ jobs:
selected-test-configs: ${{ inputs.benchmark_configs }}
secrets: inherit
inductor-test-nightly:
name: inductor-test-nightly
linux-jammy-zen-cpu-py3_9-gcc11-inductor-test-nightly:
name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false
@ -112,16 +112,17 @@ jobs:
monitor-data-collect-interval: 4
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-zen-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false

View File

@ -74,14 +74,14 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xl.spr-metal" },
@ -101,16 +101,16 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-test-nightly-freezing:
name: inductor-test-nightly-freezing
linux-jammy-cpu-py3_9-gcc11-inductor-test-nightly-freezing:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false
@ -118,16 +118,16 @@ jobs:
monitor-data-collect-interval: 4
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false

View File

@ -79,6 +79,7 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
# NB: Keep this in sync with trunk.yml
build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml

View File

@ -31,8 +31,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
periodic-dynamo-benchmarks-build:
name: periodic-dynamo-benchmarks-build
linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build:
name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
@ -57,33 +57,23 @@ jobs:
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
periodic-dynamo-benchmarks-test:
name: periodic-dynamo-benchmarks-test
linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-test:
name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
rocm-periodic-dynamo-benchmarks-build:
linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build:
if: github.repository_owner == 'pytorch'
name: rocm-periodic-dynamo-benchmarks-build
name: rocm-py3_10-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
@ -109,21 +99,21 @@ jobs:
]}
secrets: inherit
rocm-periodic-dynamo-benchmarks-test:
linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-test:
permissions:
id-token: write
contents: read
name: rocm-periodic-dynamo-benchmarks-test
name: rocm-py3_10-periodic-dynamo-benchmarks
uses: ./.github/workflows/_rocm-test.yml
needs: rocm-periodic-dynamo-benchmarks-build
needs: linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-rocm-py3_10
docker-image: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
inductor-smoke-build:
name: inductor-smoke-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
@ -139,23 +129,23 @@ jobs:
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
inductor-smoke-test:
name: inductor-smoke-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-test:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: inductor-smoke-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.inductor-smoke-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.test-matrix }}
secrets: inherit
periodic-dynamo-benchmarks-cpu-build:
name: periodic-dynamo-benchmarks-cpu-build
linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build:
name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
{ include: [
@ -170,6 +160,68 @@ jobs:
{ config: "cpu_inductor_freezing_avx2_torchbench", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" },
{ config: "cpu_inductor_freezing_avx2_timm", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" },
{ config: "cpu_inductor_freezing_avx2_timm", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-test:
name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
test-matrix: |
{ include: [
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
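Each test-matrix in these jobs is a JSON include list where a config split across num_shards appears once per shard, with every entry pinned to a runner. A small illustrative Python sketch of reading such a matrix (the real consumer is the reusable _linux-test.yml workflow; this is only for inspection):

import json
from collections import defaultdict

test_matrix = json.loads("""
{ "include": [
  { "config": "dynamic_inductor_timm", "shard": 1, "num_shards": 2, "runner": "linux.g5.4xlarge.nvidia.gpu" },
  { "config": "dynamic_inductor_timm", "shard": 2, "num_shards": 2, "runner": "linux.g5.4xlarge.nvidia.gpu" },
  { "config": "aot_inductor_huggingface", "shard": 1, "num_shards": 1, "runner": "linux.g5.4xlarge.nvidia.gpu" }
]}
""")

shards_per_config = defaultdict(list)
for entry in test_matrix["include"]:
    shards_per_config[entry["config"]].append(entry["shard"])

for config, shards in sorted(shards_per_config.items()):
    print(f"{config}: shards {sorted(shards)}")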
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build
test-matrix: |
{ include: [
{ config: "cpu_inductor_freezing_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" },
{ config: "cpu_inductor_freezing_timm", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
{ config: "cpu_inductor_freezing_timm", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
@ -195,12 +247,12 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
periodic-dynamo-benchmarks-cpu-test:
name: periodic-dynamo-benchmarks-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -28,8 +28,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -47,18 +47,44 @@ jobs:
]}
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
inductor-halide-build:
name: inductor-halide-build
linux-jammy-cuda12_8-py3_12-gcc9-inductor-build:
name: cuda12.8-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_12-gcc9-inductor-test:
name: cuda12.8-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_12-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cpu-py3_12-inductor-halide-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -71,18 +97,18 @@ jobs:
]}
secrets: inherit
inductor-halide-test:
name: inductor-halide-test
linux-jammy-cpu-py3_12-inductor-halide-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-test.yml
needs: inductor-halide-build
needs: linux-jammy-cpu-py3_12-inductor-halide-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.inductor-halide-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }}
secrets: inherit
inductor-triton-cpu-build:
name: inductor-triton-cpu-build
linux-jammy-cpu-py3_12-inductor-triton-cpu-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -95,23 +121,23 @@ jobs:
]}
secrets: inherit
inductor-triton-cpu-test:
linux-jammy-cpu-py3_12-inductor-triton-cpu-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu
uses: ./.github/workflows/_linux-test.yml
needs: inductor-triton-cpu-build
needs: linux-jammy-cpu-py3_12-inductor-triton-cpu-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.inductor-triton-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-triton-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.test-matrix }}
secrets: inherit
inductor-cpu-build:
name: inductor-cpu-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
@ -122,12 +148,37 @@ jobs:
]}
secrets: inherit
inductor-cpu-test:
name: inductor-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_13-gcc9-inductor-build:
name: cuda12.8-py3.13-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_13-gcc9-inductor-test:
name: cuda12.8-py3.13-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_13-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -44,8 +44,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -53,6 +53,7 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
test-matrix: |
{ include: [
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
@ -64,24 +65,25 @@ jobs:
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
inductor-cpu-build:
name: inductor-cpu-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build
test-matrix: |
{ include: [
{ config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
@ -96,12 +98,12 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-cpu-test:
name: inductor-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -42,8 +42,8 @@ jobs:
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-jammy-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
secrets: inherit
docs-push:

View File

@ -24,38 +24,38 @@ permissions:
contents: read
jobs:
opbenchmark-build:
linux-jammy-cpu-py3_9-gcc11-opbenchmark-build:
if: github.repository_owner == 'pytorch'
name: opbenchmark-build
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets: inherit
opbenchmark-on-demand-build:
linux-jammy-cpu-py3_9-gcc11-opbenchmark-on-demand-build:
if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }}
name: opbenchmark-on-demand-build
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets: inherit
opbenchmark-test:
name: opbenchmark-test
linux-jammy-cpu-py3_9-gcc11-opbenchmark-test:
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-test.yml
needs: opbenchmark-build
needs: linux-jammy-cpu-py3_9-gcc11-opbenchmark-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.test-matrix }}
secrets: inherit

View File

@ -170,38 +170,6 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc11-build:
name: linux-jammy-cuda13.0-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
cuda-arch-list: 7.5
build-environment: linux-jammy-cuda13.0-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc11-test:
name: linux-jammy-cuda13.0-py3.10-gcc11
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda13_0-py3_10-gcc11-build
- target-determination
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml

View File

@ -49,14 +49,14 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
linux-jammy-py3_10-gcc11-build:
name: linux-jammy-py3.10-gcc11
linux-jammy-py3_9-gcc11-build:
name: linux-jammy-py3.9-gcc11
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
@ -73,49 +73,49 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_10-gcc11-test:
name: linux-jammy-py3.10-gcc11
linux-jammy-py3_9-gcc11-test:
name: linux-jammy-py3.9-gcc11
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_10-gcc11-build
- linux-jammy-py3_9-gcc11-build
- target-determination
with:
build-environment: linux-jammy-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-docs:
name: linux-docs
uses: ./.github/workflows/_docs.yml
needs: linux-jammy-py3_10-gcc11-build
needs: linux-jammy-py3_9-gcc11-build
with:
build-environment: linux-jammy-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }}
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }}
secrets: inherit
linux-jammy-py3_10-gcc11-no-ops:
name: linux-jammy-py3.10-gcc11-no-ops
linux-jammy-py3_9-gcc11-no-ops:
name: linux-jammy-py3.9-gcc11-no-ops
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11-no-ops
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11-no-ops
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
]}
secrets: inherit
linux-jammy-py3_10-gcc11-pch:
name: linux-jammy-py3.10-gcc11-pch
linux-jammy-py3_9-gcc11-pch:
name: linux-jammy-py3.9-gcc11-pch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11-pch
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11-pch
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
@ -183,14 +183,14 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_10-clang12-build:
name: linux-jammy-py3.10-clang12
linux-jammy-py3_9-clang12-build:
name: linux-jammy-py3.9-clang12
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12
build-environment: linux-jammy-py3.9-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
@ -207,16 +207,16 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_10-clang12-test:
name: linux-jammy-py3.10-clang12
linux-jammy-py3_9-clang12-test:
name: linux-jammy-py3.9-clang12
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_10-clang12-build
- linux-jammy-py3_9-clang12-build
- target-determination
with:
build-environment: linux-jammy-py3.10-clang12
docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.9-clang12
docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_13-clang12-build:
@ -253,14 +253,14 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-cudnn9-py3_10-clang12-build:
name: linux-jammy-cuda12.8-cudnn9-py3.10-clang12
linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build:
name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-cudnn9-py3.10-clang12
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12
build-environment: linux-jammy-cuda12.8-cudnn9-py3.9-clang12
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
@ -282,14 +282,14 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_10-gcc11-mobile-lightweight-dispatch-build:
name: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build
linux-jammy-py3_9-gcc11-mobile-lightweight-dispatch-build:
name: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
build-generates-artifacts: false
test-matrix: |
{ include: [
@ -342,40 +342,15 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
linux-jammy-xpu-2025_1-py3_9-build:
name: linux-jammy-xpu-2025.1-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-xpu-n-py3_9-build:
name: linux-jammy-xpu-n-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-n-build
sync-tag: linux-xpu-2025-1-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
build-environment: linux-jammy-xpu-2025.1-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" },

View File

@ -78,14 +78,14 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_10-clang12-build:
name: linux-jammy-py3.10-clang12
linux-jammy-py3_9-clang12-build:
name: linux-jammy-py3.9-clang12
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12
build-environment: linux-jammy-py3.9-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
@ -93,16 +93,16 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_10-clang12-test:
name: linux-jammy-py3.10-clang12
linux-jammy-py3_9-clang12-test:
name: linux-jammy-py3.9-clang12
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_10-clang12-build
- linux-jammy-py3_9-clang12-build
- target-determination
with:
build-environment: linux-jammy-py3.10-clang12
docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.9-clang12
docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-build:

View File

@ -30,7 +30,7 @@ jobs:
name: Test check_binary.sh for Linux CUDA
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.g4dn.4xlarge.nvidia.gpu
runner: linux.4xlarge.nvidia.gpu
docker-image: python:3.11
docker-build-dir: "skip-docker-build"
script: |

View File

@ -224,12 +224,13 @@ jobs:
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
secrets: inherit
inductor-build:
name: inductor-build
# NB: Keep this in sync with inductor-perf-test-nightly.yml
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
secrets: inherit
@ -241,7 +242,7 @@ jobs:
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },

View File

@ -59,19 +59,22 @@ jobs:
# on the PR appear in chronological order (timing issues can shuffle them around)
sleep 60
fi
# Require a comment id for merge operations
if [ -z "${COMMENT_ID}" ]; then
echo "Error: merge requires COMMENT_ID to be specified"
exit 1
fi
if [ -n "${FORCE}" ]; then
python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}"
if [ -n "${COMMENT_ID}" ]; then
python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
python3 .github/scripts/trymerge.py --force "${PR_NUM}"
fi
elif [ -n "${IGNORE_CURRENT}" ]; then
python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
if [ -n "${COMMENT_ID}" ]; then
python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
python3 .github/scripts/trymerge.py --ignore-current "${PR_NUM}"
fi
elif [ -n "${COMMENT_ID}" ]; then
python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
python3 .github/scripts/trymerge.py "${PR_NUM}"
fi
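Both variants of the step above end up invoking .github/scripts/trymerge.py with some combination of --force, --ignore-current, and --comment-id. A hedged Python sketch of the argument construction for the variant in which COMMENT_ID is optional (illustrative; the authoritative logic is the bash above, and the PR number below is a placeholder):

def trymerge_args(pr_num, force=False, ignore_current=False, comment_id=""):
    args = ["python3", ".github/scripts/trymerge.py"]
    if force:
        args.append("--force")
    elif ignore_current:
        args.append("--ignore-current")
    if comment_id:
        args += ["--comment-id", comment_id]
    args.append(pr_num)
    return args

print(trymerge_args("12345", force=True, comment_id="678"))  # force-merge via a comment
print(trymerge_args("12345", ignore_current=True))           # ignore currently failing checks
print(trymerge_args("12345"))                                # plain merge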
- name: Comment on Canceled
if: ${{ cancelled() && steps.checkout.outcome == 'success' }}

View File

@ -6,7 +6,8 @@ on:
- ciflow/vllm/*
workflow_dispatch:
schedule:
- cron: '0 */8 * * *' # every 8 hours at minute 0 (UTC)
# Every 12 hours starting at 00:00 UTC (00:00 and 12:00)
- cron: '0 0,12 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
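The new cron expression '0 0,12 * * *' fires at minute 0 of hours 0 and 12 UTC, i.e. twice a day, whereas '0 */8 * * *' fired every 8 hours. A minimal Python check of which UTC hours each hour field matches (a toy parser for illustration, not a full cron implementation):

def matching_hours(hour_field):
    # Expand a cron hour field such as '0,12' or '*/8' into matching hours 0-23.
    hours = set()
    for part in hour_field.split(","):
        if part.startswith("*/"):            # step values, e.g. */8
            hours.update(range(0, 24, int(part[2:])))
        elif part == "*":
            hours.update(range(24))
        else:
            hours.add(int(part))
    return sorted(hours)

print(matching_hours("*/8"))   # [0, 8, 16]  -> every 8 hours
print(matching_hours("0,12"))  # [0, 12]     -> twice a day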

View File

@ -4,9 +4,6 @@ on:
push:
tags:
- ciflow/win-arm64/*
schedule:
# Every 4 hours starting at 00:00 UTC
- cron: '0 */4 * * *'
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

View File

@ -26,15 +26,15 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-xpu-n-1-py3_10-build:
name: linux-jammy-xpu-n-1-py3.10
linux-jammy-xpu-2025_0-py3_9-build:
name: linux-jammy-xpu-2025.0-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-n-1-build
sync-tag: linux-xpu-2025-0-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-1-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3
build-environment: linux-jammy-xpu-2025.0-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.0-py3
runner: linux.12xlarge
test-matrix: |
{ include: [
@ -47,62 +47,60 @@ jobs:
]}
secrets: inherit
linux-jammy-xpu-n-py3_10-build:
name: linux-jammy-xpu-n-py3.10
linux-jammy-xpu-2025_1-py3_9-build:
name: linux-jammy-xpu-2025.1-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-n-build
sync-tag: linux-xpu-2025-1-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
build-environment: linux-jammy-xpu-2025.1-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3
runner: linux.12xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
]}
secrets: inherit
linux-jammy-xpu-n-py3_10-test:
name: linux-jammy-xpu-n-py3.10
linux-jammy-xpu-2025_1-py3_9-test:
name: linux-jammy-xpu-2025.1-py3.9
uses: ./.github/workflows/_xpu-test.yml
needs: linux-jammy-xpu-n-py3_10-build
needs: linux-jammy-xpu-2025_1-py3_9-build
permissions:
id-token: write
contents: read
with:
build-environment: linux-jammy-xpu-n-py3.10
docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }}
build-environment: linux-jammy-xpu-2025.1-py3.9
docker-image: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.test-matrix }}
secrets: inherit
windows-xpu-n-1-build:
windows-xpu-2025_0-build:
if: github.repository_owner == 'pytorch'
name: win-vs2022-xpu-n-1-py3
name: win-vs2022-xpu-2025_0-py3
uses: ./.github/workflows/_win-build.yml
with:
build-environment: win-vs2022-xpu-n-1-py3
build-environment: win-vs2022-xpu-py3
cuda-version: cpu
use-xpu: true
xpu-version: '2025.0'
vc-year: '2022'
secrets: inherit
windows-xpu-2025_1-build:
if: github.repository_owner == 'pytorch'
name: win-vs2022-xpu-2025_1-py3
uses: ./.github/workflows/_win-build.yml
with:
build-environment: win-vs2022-xpu-py3
cuda-version: cpu
use-xpu: true
xpu-version: '2025.1'
vc-year: '2022'
secrets: inherit
windows-xpu-n-build:
if: github.repository_owner == 'pytorch'
name: win-vs2022-xpu-n-py3
uses: ./.github/workflows/_win-build.yml
with:
build-environment: win-vs2022-xpu-n-py3
cuda-version: cpu
use-xpu: true
xpu-version: '2025.2'
vc-year: '2022'
secrets: inherit

View File

@ -583,7 +583,7 @@ exclude_patterns = [
command = [
'python3',
'tools/linter/adapters/grep_linter.py',
'--pattern=#include <pybind11\/(^|[^(gil_simple\.h)])',
'--pattern=#include <pybind11\/(^|[^(gil\.h)])',
'--allowlist-pattern=#include <torch\/csrc\/utils\/pybind.h>',
'--linter-name=PYBIND11_INCLUDE',
'--match-first-only',
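The PYBIND11_INCLUDE linter above is a grep-style check: the --pattern regex flags direct pybind11 includes, the character class after pybind11\/ is meant to exempt the single allowed header (gil.h on one side of this diff, gil_simple.h on the other, though any header whose first character falls in that class would also slip through), and --allowlist-pattern skips files that already include torch/csrc/utils/pybind.h. A rough Python illustration of how the gil.h form of the pattern behaves (an approximation only; the real check lives in tools/linter/adapters/grep_linter.py):

import re

pattern = re.compile(r"#include <pybind11\/(^|[^(gil\.h)])")

lines = [
    "#include <pybind11/pybind11.h>",        # flagged: direct pybind11 include
    "#include <pybind11/stl.h>",             # flagged
    "#include <pybind11/gil.h>",             # not flagged: 'g' is in the excluded class
    "#include <torch/csrc/utils/pybind.h>",  # not flagged (and also allowlisted)
]

for line in lines:
    print("MATCH   " if pattern.search(line) else "no match", line)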
@ -1801,26 +1801,3 @@ command = [
"python3",
"tools/linter/adapters/gb_registry_linter.py",
]
[[linter]]
code = 'DISTRIBUTED_C10D_DIRECT_ACCESS'
include_patterns = ['**/*.py']
exclude_patterns = [
'torch/distributed/_distributed_c10d.py',
'fb/**',
'**/fb/**',
]
command = [
'python3',
'tools/linter/adapters/grep_linter.py',
'--pattern=torch\._C\._distributed_c10d',
'--linter-name=DISTRIBUTED_C10D_DIRECT_ACCESS',
'--error-name=direct access to torch._C._distributed_c10d',
"""--error-description=\
Never access torch._C._distributed_c10d directly in code. Always \
import from and use torch.distributed._distributed_c10d which is \
guaranteed to have all functions available\
""",
'--',
'@{{PATHSFILE}}'
]

View File

@ -22,6 +22,7 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-DUSE_DISTRIBUTED",
"-DAT_PER_OPERATOR_HEADERS",
"-DATEN_THREADING=NATIVE",
"-DNO_CUDNN_DESTROY_HANDLE",
@ -746,7 +747,6 @@ cc_library(
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp",
"torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp",
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
],
)) + torch_sources,

View File

@ -181,9 +181,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
set(CPU_POWER ON)
endif()
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
if(NOT LINUX AND NOT WIN32)
set(USE_DISTRIBUTED
OFF
@ -262,18 +261,18 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL"
OFF)
cmake_dependent_option(USE_NVSHMEM "Use NVSHMEM" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
"USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
option(USE_NNAPI "Use NNAPI" OFF)
option(USE_NNPACK "Use NNPACK" ON)
cmake_dependent_option(USE_NUMA "Use NUMA. Only available on Linux." ON "LINUX"
@ -431,10 +430,11 @@ if(WIN32)
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
if(NOT libuv_tmp_LIBRARY)
set(USE_DISTRIBUTED OFF)
set(USE_GLOO OFF)
message(
WARNING
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
)
else()

View File

@ -216,7 +216,7 @@ file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention
if(USE_CUDA AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
add_library(flash_attention OBJECT EXCLUDE_FROM_ALL ${flash_attention_cuda_kernels_cu} ${flash_attention_cuda_cpp})
target_include_directories(flash_attention SYSTEM PUBLIC
target_include_directories(flash_attention PUBLIC
${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc
${PROJECT_SOURCE_DIR}/third_party/flash-attention/include
${PROJECT_SOURCE_DIR}/third_party/cutlass/include

View File

@ -1,18 +1,5 @@
#pragma once
// See https://github.com/pytorch/pytorch/issues/161660
// This compile flag is intended to be passed in to CppExtensions that rely on
// the stable ABI via the `extra_compile_args` argument. This is a stopgap
// solution to ensure that non-stable libtorch APIs are not used in the extension.
// The long term solution is to have a torch_stable target that excludes headers
// that are not in torch/stable or torch/headeronly.
// See test/cpp_extensions/torch_stable_test_extension/setup.py for an example
// of how this is used.
#ifdef TORCH_STABLE_ONLY
#error \
"TensorBase.h should not be included when TORCH_STABLE_ONLY compile flag is passed"
#endif
#include <c10/core/Device.h>
#include <c10/core/Layout.h>
#include <c10/core/MemoryFormat.h>

View File

@ -15,7 +15,7 @@ std::enable_if_t<
std::is_base_of_v<Base, Child>,
std::unique_ptr<Base>>
make_unique_base(Args&&... args) {
return std::make_unique<Child>(std::forward<Args>(args)...);
return std::unique_ptr<Base>(new Child(std::forward<Args>(args)...));
}
} // namespace detail
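For reference, both spellings toggled in the hunk above do the same thing: construct a `Child` and hand back ownership typed as the `Base` class. A self-contained sketch (the toy `Base`/`Child` types here are assumptions, not PyTorch code):

```cpp
#include <memory>
#include <type_traits>
#include <utility>

struct Base { virtual ~Base() = default; };
struct Child : Base { explicit Child(int v) : value(v) {} int value; };

// Same shape as the helper in the hunk above: construct Child, return as Base.
template <class B, class C, class... Args>
std::enable_if_t<std::is_base_of_v<B, C>, std::unique_ptr<B>>
make_unique_base(Args&&... args) {
  // std::make_unique<C> and std::unique_ptr<B>(new C(...)) are equivalent here;
  // the unique_ptr<C> produced by make_unique converts implicitly to unique_ptr<B>.
  return std::make_unique<C>(std::forward<Args>(args)...);
}

int main() {
  std::unique_ptr<Base> b = make_unique_base<Base, Child>(42);
  return 0;
}
```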

View File

@ -185,17 +185,6 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
// right: "lro, summed, ro" permuted with rpermutation and the three flattened
// then the permuted output is a view of bmm(left, right)
// finally, opermutation reverts the permutation to the original order of dimensions
// By default the output is "lro, lo, 1-for-summed-dims, ro" with original shape dimensions.
// However, if all dimensions from the right operand appear before those from the left
// operand in the final output, we can swap the operands so that bmm directly produces
// the result in the correct memory order.
bool swap_lo_ro = !lo.empty() && !ro.empty() && ro.back() < lo.front();
if (swap_lo_ro) {
std::swap(left, right);
std::swap(lo, ro);
std::swap(lo_size, ro_size);
}
auto out_num_dim = lro.size() + lo.size() + sum_dims_.size() + ro.size();
std::vector<SymInt> out_size;
out_size.reserve(out_num_dim);
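For context on the operand-swap heuristic removed in the hunk above, here is a minimal standalone sketch of the condition it tests. The names `lo`/`ro` follow the comment's convention (output positions of left-only and right-only dimensions); everything else is illustrative:

```cpp
#include <cassert>
#include <vector>

int main() {
  // Output positions of dims that appear only in the left (lo) or only in the
  // right (ro) operand. If every right-only dim precedes every left-only dim,
  // swapping the operands lets bmm emit the result already in output order.
  std::vector<int> lo{3, 4};
  std::vector<int> ro{1, 2};
  bool swap_lo_ro = !lo.empty() && !ro.empty() && ro.back() < lo.front();
  assert(swap_lo_ro);
  return 0;
}
```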

View File

@ -89,7 +89,7 @@ execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t
using result_type = typename traits::result_type;
for (; i < n; i++) {
result_type* out_ptr = (result_type*)(data[0] + i * strides[0]);
*out_ptr = std::apply(op, dereference<traits>(
*out_ptr = c10::guts::apply(op, dereference<traits>(
&data[1],
&strides[1],
i));
@ -102,7 +102,7 @@ inline void
execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
using traits = function_traits<func_t>;
for (; i < n; i++) {
std::apply(op, dereference<traits>(
c10::guts::apply(op, dereference<traits>(
&data[0],
&strides[0],
i));
@ -162,7 +162,7 @@ void handle_tuple_outputs(char* C10_RESTRICT data[],
}
// Loop operation for `cpu_kernel_multiple_outputs`.
// 1. Use `std::apply` to make dynamic method invocation
// 1. Use `c10::guts::apply` to make dynamic method invocation
// for the lambda passed in `cpu_kernel_multiple_outputs`.
// 2. Iterate over the members of the returned tuple, set the corresponding
// output tensor by the tuple member in `handle_tuple_outputs` function.
@ -183,7 +183,7 @@ multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_
}
for (; i < n; i++) {
auto output = std::apply(op, dereference<traits>(
auto output = c10::guts::apply(op, dereference<traits>(
&data[num_outputs],
&strides[num_outputs],
i));
@ -213,8 +213,8 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve
for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
auto args1 = dereference_vec<traits>(&data[1], opt_scalar, S, i);
auto args2 = dereference_vec<traits>(&data[1], opt_scalar, S, i + Vec::size());
auto out1 = std::apply(vop, std::move(args1));
auto out2 = std::apply(vop, std::move(args2));
auto out1 = c10::guts::apply(vop, std::move(args1));
auto out2 = c10::guts::apply(vop, std::move(args2));
out1.store(data[0] + i * sizeof(scalar_t));
out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t));
}
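The hunks above only swap the tuple-invocation helper; the underlying pattern is plain C++17 `std::apply` over a tuple of unpacked operands. A standalone sketch (toy lambda and tuple, not the real `dereference<traits>` machinery):

```cpp
#include <cstdio>
#include <tuple>

int main() {
  auto op = [](float a, float b) { return a + b; };
  std::tuple<float, float> args{1.5f, 2.5f};
  // Equivalent to op(1.5f, 2.5f); c10::guts::apply plays the same role in the
  // variant above that avoids std::apply.
  float out = std::apply(op, args);
  std::printf("%f\n", out);
  return 0;
}
```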

View File

@ -156,7 +156,7 @@ void cpu_padding(
int64_t offset_h = ndim >= 2 ? p.offsets[ndim - 2] : 0;
int64_t offset_w = p.offsets[ndim - 1];
// do vectorized copy when output is overlapped with input on W,
// do vectorized copy whe output is overlapped with input on W,
// only applies to positive padding
auto loop = [=](scalar_t* out, const scalar_t* in, bool positive_padding) {
if (positive_padding) {

View File

@ -318,7 +318,7 @@ batch_norm_cpu_collect_stats_channels_last_impl(
//
// The optimal THRESHOLD to tile was found empirically.
// When C > THRESHOLD, C is large enough that the benefit from tiling and vectorization outweigh the synchronization overhead.
// When C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization.
// Wehn C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization.
//
// When num_threads == 1, always use Method 2 as there is no synchronization overhead.
//
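A hedged sketch of the dispatch heuristic described in the comment above; `kTileSize` and the function name are placeholders, and the real THRESHOLD is tuned empirically upstream:

```cpp
#include <cstdint>
#include <cstdio>

constexpr int64_t kTileSize = 16;  // placeholder; the real threshold is tuned empirically

// True when a single vectorized thread is expected to beat launching C threads
// without vectorization, per the comment above.
bool prefer_single_vectorized_thread(int64_t C, int64_t NHW, int64_t max_threads) {
  if (max_threads == 1) {
    return true;  // no synchronization overhead to avoid, so always take this path
  }
  return C <= kTileSize && NHW <= max_threads;
}

int main() {
  std::printf("%d\n", prefer_single_vectorized_thread(8, 4, 8));  // prints 1
  return 0;
}
```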

View File

@ -1665,7 +1665,7 @@ const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum) {
bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/false);
bool allowed_device = _scaled_mm_allowed_device();
TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0, or ROCm MI300+");
TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed");

View File

@ -20,7 +20,7 @@
// SegmentReduce compilation with CUDA-12.9 causes NVCC crash on Windows
// See https://github.com/pytorch/pytorch/issues/156181
#if !(defined(_WIN32) && CUDART_VERSION == 12090)
#if !defined(_WIN32) || CUDART_VERSION < 12090
namespace at::native {
@ -606,4 +606,4 @@ REGISTER_DISPATCH(
} // namespace at::native
#endif
#endif
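Note that the two preprocessor guards toggled above are not equivalent: one skips only the exact Windows + CUDA 12.9.0 combination, while the other skips any Windows build whose CUDART_VERSION is 12090 or newer. A tiny runtime analogue of the two predicates:

```cpp
#include <cstdio>

// Mirrors #if !(defined(_WIN32) && CUDART_VERSION == 12090)
static bool compile_guard_eq(bool win, long cudart) { return !(win && cudart == 12090); }
// Mirrors #if !defined(_WIN32) || CUDART_VERSION < 12090
static bool compile_guard_lt(bool win, long cudart) { return !win || cudart < 12090; }

int main() {
  // Windows + CUDA 13.0: included by the first guard, excluded by the second.
  std::printf("%d %d\n", compile_guard_eq(true, 13000), compile_guard_lt(true, 13000));
  return 0;
}
```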

View File

@ -226,38 +226,6 @@ __global__ void CatArrayBatchedCopy_contig(
}
}
template <typename T, typename IndexType, int Dims, int batch_size, int stride_size, int alignment, int elems_per_vec>
__global__ void CatArrayBatchedCopy_vectorized(
char* output,
CatArrInputTensorMetadata<T, IndexType, batch_size, stride_size> inputs,
TensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os,
const int concatDim,
IndexType trailingSize) {
IndexType tid = blockIdx.x * blockDim.x + threadIdx.x;
IndexType nElements = inputs.nElements[blockIdx.y] / elems_per_vec;
if(tid >= nElements) return;
const char * data = (char*)inputs.input[blockIdx.y];
IndexType offset = inputs.offset[blockIdx.y] * trailingSize / elems_per_vec;
IndexType dimSize = inputs.dimSize[blockIdx.y] * trailingSize / elems_per_vec;
IndexType dataOffset = offset * alignment; // in bytes
IndexType stride = gridDim.x * blockDim.x;
while( tid < nElements){
IndexType elementOffset = CatArrIndexToOffset<IndexType, Dims>::compute(
os.tensorSize, os.tensorStride, dimSize, concatDim, tid) * alignment; // in bytes
auto vec = at::native::memory::ld_vec<alignment>(data + alignment * tid);
at::native::memory::st_vec<alignment>(output + dataOffset + elementOffset, vec);
tid += stride;
}
}
/*
Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads
to improve memory bandwidth throughput.
@ -328,27 +296,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
scalar_t *data = (scalar_t *)(out.mutable_data_ptr());
CatArrInputTensorMetadata<scalar_t, unsigned int, batch_size, stride_size> catMetaData;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> outputParam;
// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.
constexpr bool isContig = stride_size == 1;
bool isAligned = true;
constexpr int alignment = 16;
// Next, let's initialize the size, stride arrays for the output Tensor.
// for contig case, we'll canonicalize output strides, so that
// we don't have arbitrary strides for dims of size 0
size_t stride0 = 1;
if (memory_format == c10::MemoryFormat::Contiguous) {
for (int i = nDims - 1; i >= 0; --i) {
for (int i = 0; i < nDims; ++i) {
outputParam.tensorSize[i] = out.size(i);
if (isContig) {
outputParam.tensorStride[i] = stride0;
stride0 *= out.size(i);
} else {
outputParam.tensorStride[i] = out.stride(i);
}
outputParam.tensorStride[i] = out.stride(i);
}
} else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) {
// permute the semantics of dims from NCHW to NHWC so that the input
@ -367,15 +320,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.
// for channels last computing slice size correctly is much more involved, so we never send it
// on the fully vectorized path
// we need output stride in cat dimension to be multiple of alignment,
// if we ever use it to compute offsets
// for catting in 0th dimension it doesn't matter
bool isInOutAligned = isContig && at::native::memory::get_alignment(data) >= alignment &&
memory_format == c10::MemoryFormat::Contiguous && (dimension == 0 ||
outputParam.tensorStride[dimension - 1] * sizeof(scalar_t) % alignment == 0);
bool isContig = true;
bool isAligned = true;
unsigned int max_elements_per_tensor = 0;
// Now we loop
@ -391,16 +341,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
// high-dimensional tensor
if (inputs[i+batchCounter].get().numel() > 0) {
dimSize = inputs[i+batchCounter].get().size(dimension);
if (isInOutAligned) {
auto t = inputs[i+batchCounter].get();
// similarly to output stride, we cannot trust stride value to
// determine slice size if the corresponding dimension is 1
// we have to multiply all the subsequent sizes
int64_t slice_size = dimension == 0 ? t.numel() : t.sizes()[dimension - 1] != 1 ?
t.strides()[dimension - 1] : c10::multiply_integers(t.sizes().begin() + dimension, t.sizes().end());
slice_size *= sizeof(scalar_t);
isInOutAligned &= (slice_size % alignment == 0);
}
}
catMetaData.input[batchCounter] = (scalar_t*)(inputs[i+batchCounter].get().const_data_ptr());
@ -411,12 +351,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
#ifdef USE_ROCM
// On ROCm, CatArrayBatchedCopy_contig is faster
isAligned = false;
isInOutAligned = false;
#else
// If at least one of the inputs is not aligned, we can't call the
// CatArrayBatchedCopy_alignedK_contig
isAligned &= is_aligned_vec4(catMetaData.input[batchCounter]);
isInOutAligned &= at::native::memory::get_alignment(catMetaData.input[batchCounter]) >= alignment;
#endif
if (stride_size > 1) {
@ -427,6 +365,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j];
}
catMetaData.isContiguous[batchCounter] = false;
isContig = false;
} else {
catMetaData.isContiguous[batchCounter] = true;
}
@ -449,13 +388,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
max_elements_per_tensor, batchCounter);
#else
dim3 applyBlock, catGrid;
if (isInOutAligned) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, alignment>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) > 2) {
if (isContig && sizeof(scalar_t) > 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_16>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) == 2) {
} else if (isContig && sizeof(scalar_t) == 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_8>(
max_elements_per_tensor, batchCounter);
} else {
@ -463,30 +399,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
getCatGrid(batchCounter, catGrid);
}
#endif
int32_t trailingSize;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam;
if (isInOutAligned) {
// in this case we can and should flatten the tensors after the cat dim
// we want to view the tensors as if consisting of `alignment`-sized elements
// however, we might not be able to cleanly divide just the last dim -
// it might not be the multiple of alignment.
// however, we know that the full concatted slice is multiple of alignment,
// so if we flatten all the dims after and including concat dim,
// it will be divisible by alignment
// then we need to divide last out size by elems_per_vec,
// and divide all strides except last by elems_per_vec (last stride is 1 always)
// for input, we will fix up the sizes and strides in the kernel directly
kernelOutputParam = outputParam;
nDims = dimension + 1;
constexpr auto elems_per_vec = alignment / sizeof(scalar_t);
auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1];
kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec;
trailingSize = outputParam.tensorStride[dimension];
kernelOutputParam.tensorStride[dimension] = 1;
for (int i = 0; i < dimension; ++i) {
kernelOutputParam.tensorStride[i] /= elems_per_vec;
}
}
if (memory_format != c10::MemoryFormat::Contiguous) {
switch (dimension) {
@ -501,12 +413,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
}
// Template Declarations for dim = 1, 2, 3, 4
#define HANDLE_CASE(DIMS) \
if (isInOutAligned) {\
constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \
CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
(char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\
} else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\
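A minimal host-side sketch of the alignment requirements the removed vectorized cat path checks for, assuming a 16-byte vector width; the names are illustrative and none of this is the actual kernel plumbing:

```cpp
#include <cstdint>

constexpr std::int64_t kAlignment = 16;  // bytes per vectorized load/store

// The data pointer must sit on a 16-byte boundary.
bool pointer_aligned(const void* p) {
  return reinterpret_cast<std::uintptr_t>(p) % kAlignment == 0;
}

// The per-input slice along the concat dimension, in bytes, must also be a
// multiple of the vector width so offsets computed in vectors stay exact.
bool slice_aligned(std::int64_t slice_elems, std::int64_t elem_size_bytes) {
  return (slice_elems * elem_size_bytes) % kAlignment == 0;
}

int main() {
  alignas(16) static float buf[64];
  return pointer_aligned(buf) && slice_aligned(32, sizeof(float)) ? 0 : 1;
}
```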

View File

@ -1,7 +1,5 @@
#include <ATen/core/op_registration/op_registration.h>
#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
#include <ATen/native/mkldnn/xpu/qconv.h>
#include <c10/core/MemoryFormat.h>
#include <c10/core/ScalarType.h>
#include <torch/library.h>
@ -9,7 +7,7 @@
using namespace at::native::onednn;
namespace at::native::xpu {
inline c10::ScalarType QConvoneDNNXPU::qconv_decide_out_dtype(
static inline c10::ScalarType qconv_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype) {
bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat);
@ -21,7 +19,7 @@ inline c10::ScalarType QConvoneDNNXPU::qconv_decide_out_dtype(
return dst_dtype;
}
at::Tensor QConvoneDNNXPU::qconv_prepack_xpu(
static at::Tensor qconv_prepack_xpu(
at::Tensor weight,
at::Tensor weight_scales,
double input_scale,
@ -35,265 +33,222 @@ at::Tensor QConvoneDNNXPU::qconv_prepack_xpu(
return weight;
}
at::Tensor QConvoneDNNXPU::run_pointwise(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
if (act.dim() == 3 || act.dim() == 5) {
TORCH_CHECK(
attr == "none",
"quantized pointwise conv",
act.dim() - 2,
"d doesn't support unary_post_op fusion. Got unary_post_op:",
attr,
".");
} else {
TORCH_CHECK(
attr == "none" || attr == "relu" || attr == "hardtanh" ||
attr == "hardswish" || attr == "swish",
"We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:",
attr,
".");
class QConvoneDNNXPU final {
public:
static at::Tensor run_pointwise(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
if (act.dim() == 3 || act.dim() == 5) {
TORCH_CHECK(
attr == "none",
"quantized pointwise conv",
act.dim() - 2,
"d doesn't support unary_post_op fusion. Got unary_post_op:",
attr,
".");
} else {
TORCH_CHECK(
attr == "none" || attr == "relu" || attr == "hardtanh" ||
attr == "hardswish" || attr == "swish",
"We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:",
attr,
".");
}
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested
? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
Tensor output =
at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
return quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
inv_output_scale,
output_zero_point,
/*accum*/ std::nullopt,
/*accum_scale*/ 0.0,
/*accum_zero_point*/ 0,
/*output_dtype*/ output_dtype,
/*binary_attr*/ std::nullopt,
/*binary_alpha*/ std::nullopt,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested ? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
Tensor output =
at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
return quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
inv_output_scale,
output_zero_point,
/*accum*/ std::nullopt,
/*accum_scale*/ 0.0,
/*accum_zero_point*/ 0,
/*output_dtype*/ output_dtype,
/*binary_attr*/ std::nullopt,
/*binary_alpha*/ std::nullopt,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
at::Tensor QConvoneDNNXPU::run_pointwise_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
return run_pointwise(
act,
act_scale.item().toDouble(),
act_zero_point.item().toLong(),
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
groups,
output_scale,
output_zero_point,
output_dtype,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
at::Tensor QConvoneDNNXPU::run_pointwise_binary(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm) {
TORCH_CHECK(
act.dim() == 4 && binary_attr == "sum" &&
(!unary_attr.has_value() ||
(unary_attr.has_value() &&
(unary_attr.value() == "none" || unary_attr.value() == "relu"))),
"post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ",
binary_attr,
" unary_post_op: ",
unary_attr.has_value() ? unary_attr.value() : "none",
".")
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested ? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
bool has_accum_postop_sum = binary_attr == "sum";
Tensor output = has_accum_postop_sum
? accum
: at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
output = quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
output_scale,
output_zero_point,
/*accum*/ accum,
/*accum_scale*/ accum_scale,
/*accum_zero_point*/ accum_zero_point,
/*output_dtype*/ output_dtype,
/*binary_attr*/ binary_attr,
/*binary_alpha*/ alpha,
/*unary_attr*/ unary_attr,
/*unary_scalars*/ unary_scalars,
/*unary_algorithm*/ unary_algorithm);
if (!has_accum_postop_sum) {
return output;
} else {
return accum;
static at::Tensor run_pointwise_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
return run_pointwise(
act,
act_scale.item().toDouble(),
act_zero_point.item().toLong(),
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
groups,
output_scale,
output_zero_point,
output_dtype,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
}
at::Tensor QConvoneDNNXPU::run_pointwise_binary_tensor(
at::Tensor act, // contains quantized values but not QTensor
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight, // contains quantized values but not QTensor
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum, // contains quantized values but not QTensor
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm) {
return run_pointwise_binary(
act,
act_scale.item().toDouble(),
act_zero_point.item().toLong(),
weight,
weight_scales,
weight_zero_points,
accum,
bias,
stride,
padding,
dilation,
groups,
output_scale,
output_zero_point,
output_dtype,
accum_scale,
accum_zero_point,
binary_attr,
alpha,
unary_attr,
unary_scalars,
unary_algorithm);
}
static at::Tensor run_pointwise_binary(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm) {
TORCH_CHECK(
act.dim() == 4 && binary_attr == "sum" &&
(!unary_attr.has_value() ||
(unary_attr.has_value() &&
(unary_attr.value() == "none" || unary_attr.value() == "relu"))),
"post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ",
binary_attr,
" unary_post_op: ",
unary_attr.has_value() ? unary_attr.value() : "none",
".")
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested
? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
bool has_accum_postop_sum = binary_attr == "sum";
Tensor output = has_accum_postop_sum
? accum
: at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
output = quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
output_scale,
output_zero_point,
/*accum*/ accum,
/*accum_scale*/ accum_scale,
/*accum_zero_point*/ accum_zero_point,
/*output_dtype*/ output_dtype,
/*binary_attr*/ binary_attr,
/*binary_alpha*/ alpha,
/*unary_attr*/ unary_attr,
/*unary_scalars*/ unary_scalars,
/*unary_algorithm*/ unary_algorithm);
if (!has_accum_postop_sum) {
return output;
} else {
return accum;
}
}
};
TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv_prepack"),
TORCH_FN(QConvoneDNNXPU::qconv_prepack_xpu));
TORCH_FN(xpu::qconv_prepack_xpu));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv1d_pointwise"),
QConvoneDNNXPU::run_pointwise);
@ -312,9 +267,6 @@ TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv_pointwise.tensor"),
QConvoneDNNXPU::run_pointwise_tensor);
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv2d_pointwise.binary_tensor"),
QConvoneDNNXPU::run_pointwise_binary_tensor);
}
} // namespace at::native::xpu

View File

@ -1,111 +0,0 @@
#pragma once
#include <ATen/Config.h>
#include <ATen/Tensor.h>
namespace at::native::xpu {
class QConvoneDNNXPU final {
public:
C10_API static at::Tensor run_pointwise(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm);
C10_API static at::Tensor run_pointwise_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm);
C10_API static at::Tensor run_pointwise_binary(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm);
C10_API static at::Tensor run_pointwise_binary_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm);
static inline c10::ScalarType qconv_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype);
static at::Tensor qconv_prepack_xpu(
at::Tensor weight,
at::Tensor weight_scales,
double input_scale,
int64_t input_zero_point,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
std::optional<torch::List<int64_t>> input_shape);
};
} // namespace at::native::xpu

View File

@ -1,14 +1,13 @@
#include <torch/library.h>
#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
#include <ATen/native/mkldnn/xpu/qlinear.h>
#include <c10/core/ScalarType.h>
using namespace at::native::onednn;
namespace at::native::xpu {
inline c10::ScalarType QLinearOnednnXPU::qlinear_decide_out_dtype(
static inline c10::ScalarType qlinear_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype) {
bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat);
@ -20,7 +19,7 @@ inline c10::ScalarType QLinearOnednnXPU::qlinear_decide_out_dtype(
return dst_dtype;
}
Tensor QLinearOnednnXPU::q_linear_pointwise(
static Tensor q_linear_pointwise(
Tensor act,
double act_scale,
int64_t act_zero_point,
@ -79,7 +78,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise(
return qout;
}
Tensor QLinearOnednnXPU::q_linear_pointwise_tensor(
static Tensor q_linear_pointwise_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
@ -138,7 +137,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise_tensor(
return qout;
}
Tensor QLinearOnednnXPU::q_linear_pointwise_binary(
static Tensor q_linear_pointwise_binary(
Tensor act,
double act_scale,
int64_t act_zero_point,
@ -209,7 +208,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise_binary(
return dim == 3 ? qout.reshape({act.size(0), -1, N}) : qout;
}
Tensor QLinearOnednnXPU::q_linear_pointwise_binary_tensor(
static Tensor q_linear_pointwise_binary_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
@ -249,7 +248,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise_binary_tensor(
unary_post_op_algorithm);
}
Tensor QLinearOnednnXPU::q_linear_prepack_onednn(
static at::Tensor q_linear_prepack_onednn(
at::Tensor weight,
std::optional<torch::List<int64_t>> input_shape) {
at::Tensor weight_transposed = weight.transpose(0, 1);
@ -259,19 +258,19 @@ Tensor QLinearOnednnXPU::q_linear_prepack_onednn(
TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise));
TORCH_FN(q_linear_pointwise));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.tensor"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_tensor));
TORCH_FN(q_linear_pointwise_tensor));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_prepack"),
TORCH_FN(QLinearOnednnXPU::q_linear_prepack_onednn));
TORCH_FN(q_linear_prepack_onednn));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary));
TORCH_FN(q_linear_pointwise_binary));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary_tensor"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary_tensor));
TORCH_FN(q_linear_pointwise_binary_tensor));
}
} // namespace at::native::xpu

View File

@ -1,91 +0,0 @@
#pragma once
#include <ATen/Config.h>
#include <ATen/Tensor.h>
#include <ATen/core/List.h>
namespace at::native::xpu {
class QLinearOnednnXPU final {
public:
C10_API static Tensor q_linear_pointwise(
Tensor act,
double act_scale,
int64_t act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view post_op_name,
torch::List<std::optional<at::Scalar>> post_op_args,
std::string_view post_op_algorithm);
C10_API static Tensor q_linear_pointwise_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view post_op_name,
torch::List<std::optional<at::Scalar>> post_op_args,
std::string_view post_op_algorithm);
C10_API static Tensor q_linear_pointwise_binary(
Tensor act,
double act_scale,
int64_t act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<at::Tensor> other,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double other_scale,
int64_t other_zero_point,
std::string_view binary_post_op,
double binary_alpha,
std::string_view unary_post_op,
torch::List<std::optional<at::Scalar>> unary_post_op_args,
std::string_view unary_post_op_algorithm);
C10_API static Tensor q_linear_pointwise_binary_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<at::Tensor> other,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double other_scale,
int64_t other_zero_point,
std::string_view binary_post_op,
double binary_alpha,
std::string_view unary_post_op,
torch::List<std::optional<at::Scalar>> unary_post_op_args,
std::string_view unary_post_op_algorithm);
C10_API static Tensor q_linear_prepack_onednn(
at::Tensor weight,
std::optional<torch::List<int64_t>> input_shape);
static inline c10::ScalarType qlinear_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype);
}; // class QLinearOnednnXPU
} // namespace at::native::xpu

View File

@ -503,17 +503,6 @@ struct round_decimals_functor {
}
};
struct round_functor {
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
inline T operator()(const T x) {
return static_cast<T>(rint(float(x)));
}
template <typename T, enable_if_t<is_scalar_integral_v<T>, bool> = true>
inline T operator()(const T x) {
return x;
}
};
DEFINE_UNARY_FLOATING_FUNCTOR(erf);
DEFINE_UNARY_FLOATING_FUNCTOR(erfc);
DEFINE_UNARY_FLOATING_FUNCTOR(erfinv);
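A C++ analogue of the Metal round_functor removed above, for reference: floating-point inputs round to nearest via rint (which honours the current rounding mode, nearest-even by default), while integral inputs pass through unchanged. Plain C++ rather than Metal, so treat it as a sketch:

```cpp
#include <cmath>
#include <cstdio>
#include <type_traits>

template <typename T>
T round_like_metal(T x) {
  if constexpr (std::is_floating_point_v<T>) {
    // The Metal functor casts to float and applies rint before casting back.
    return static_cast<T>(std::rint(static_cast<float>(x)));
  } else {
    return x;  // integral types are already "rounded"
  }
}

int main() {
  std::printf("%f %d\n", round_like_metal(2.5), round_like_metal(7));  // 2.000000 7
  return 0;
}
```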
@ -526,13 +515,6 @@ REGISTER_UNARY_OP(neg, char, char);
REGISTER_UNARY_OP(neg, uchar, uchar);
REGISTER_UNARY_OP(neg, float, float);
REGISTER_UNARY_OP(neg, half, half);
REGISTER_UNARY_OP(round, int, int);
REGISTER_UNARY_OP(round, long, long);
REGISTER_UNARY_OP(round, short, short);
REGISTER_UNARY_OP(round, char, char);
REGISTER_UNARY_OP(round, uchar, uchar);
REGISTER_UNARY_OP(round, float, float);
REGISTER_UNARY_OP(round, half, half);
REGISTER_UNARY_OP(bitwise_not, int, int);
REGISTER_UNARY_OP(bitwise_not, long, long);
@ -576,7 +558,6 @@ REGISTER_UNARY_OP(abs, half, half);
INSTANTIATE_UNARY_KERNELS2(bfloat, bfloat);
REGISTER_UNARY_OP(neg, bfloat, bfloat);
REGISTER_UNARY_OP(round, bfloat, bfloat);
REGISTER_UNARY_OP(abs, bfloat, bfloat);
INSTANTIATE_UNARY_KERNELS2(half, half);
INSTANTIATE_UNARY_KERNELS2(float, float);

View File

@ -115,10 +115,7 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const std::opt
return output;
}
// No-graph execution causes nonsense if these are non-contiguous.
const bool is_contiguous = input.is_contiguous() && weight.is_contiguous() && bias.is_contiguous();
if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_contiguous) {
if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS)) {
_mps_linear_nograph(input, weight, bias, output);
// Squeeze last dim of 1D linear
return weight_arg.dim() != 1 ? output : output.squeeze(-1);

View File

@ -2,7 +2,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/MemoryOverlap.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/native/SortingUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
@ -12,85 +11,10 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/kthvalue_native.h>
#include <ATen/ops/sort.h>
#include <ATen/ops/sort_native.h>
#endif
namespace at::native {
namespace {
void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& values, Tensor& indices) {
using namespace mps;
if (self.dim() == 0 && self.numel() == 1) {
values.copy_(self);
indices.zero_();
return;
}
// Handle empty tensors
if (self.numel() == 0) {
values.copy_(self);
indices.copy_(values.toType(at::ScalarType::Long));
return;
}
// issue #154890, raising error to prevent crash within MPSGraph until
// workaround is implemented.
TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890");
auto stream = getCurrentMPSStream();
struct CachedGraph : public MPSCachedGraph {
CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
MPSGraphTensor *selfTensor = nil, *valuesTensor = nil, *indicesTensor = nil;
};
// MPSGraph kthvalue is always sorted.
@autoreleasepool {
// Input as placeholders
MPSShape* input_shape = getMPSShape(self);
NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","];
std::string key = std::string("kthvalue:") + [ns_shape_key UTF8String] + ":" + getMPSTypeString(self) + ":k" +
std::to_string(k) + ":dim" + std::to_string(dim);
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self), input_shape);
MPSGraphTensor* castInputTensor = newCachedGraph->selfTensor;
MPSDataType dataType = getMPSDataType(self);
// #issue 104398441 sortWithTensor and argsortWithTensor
if (dataType != MPSDataTypeInt32 && dataType != MPSDataTypeFloat32 && dataType != MPSDataTypeFloat16) {
dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32;
castInputTensor = [mpsGraph castTensor:newCachedGraph->selfTensor toType:dataType name:@"castInputTensor"];
}
MPSGraphTensor* sortedTensor = [mpsGraph sortWithTensor:castInputTensor
axis:(NSUInteger)dim
descending:false
name:nil];
sortedTensor = [mpsGraph sliceTensor:sortedTensor
dimension:(NSUInteger)dim
start:((NSUInteger)k - 1)
length:1
name:nil];
MPSGraphTensor* argSortedTensor = [mpsGraph argSortWithTensor:castInputTensor
axis:(NSInteger)dim
descending:false
name:@"kthvalue_out"];
argSortedTensor = [mpsGraph sliceTensor:argSortedTensor
dimension:dim
start:((NSUInteger)k - 1)
length:1
name:nil];
newCachedGraph->valuesTensor = sortedTensor;
newCachedGraph->indicesTensor = argSortedTensor;
});
Placeholder inputPlaceholder = Placeholder(cachedGraph->selfTensor, self);
// Outputs as placeholders
Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values);
Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices);
// Create dictionary of inputs and outputs
auto feeds = dictionaryFromPlaceholders(inputPlaceholder);
auto results = dictionaryFromPlaceholders(valuesPlaceholder, indicesPlaceholder);
runMPSGraph(stream, cachedGraph->graph(), feeds, results);
}
}
} // anonymous namespace
// sort
TORCH_IMPL_FUNC(sort_stable_out_mps)
@ -157,31 +81,4 @@ TORCH_IMPL_FUNC(sort_stable_out_mps)
runMPSGraph(stream, cachedGraph->graph(), feeds, results);
}
}
std::tuple<Tensor&, Tensor&> kthvalue_out_mps(const Tensor& self,
int64_t k,
int64_t dim_,
bool keepdim,
Tensor& values,
Tensor& indices) {
// See note [Writing Nondeterministic Operations]
// If there are duplicate elements of the kth value, the procedure for choosing which
// of the duplicates to use for the indices output is nondeterministic.
at::globalContext().alertNotDeterministic("kthvalue MPS");
int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true);
int64_t slicesize = self.dim() == 0 ? 1 : self.size(dim);
TORCH_CHECK(k >= 1 && k <= slicesize, "kthvalue(): selected number k out of range for dimension ", dim);
at::assert_no_overlap(self, values);
_reduction_with_indices_allocate_or_resize_output(values, indices, self, dim, keepdim);
kthvalue_out_mps_impl(self, k, dim, values, indices);
if (!keepdim) {
values.squeeze_(dim);
indices.squeeze_(dim);
}
return std::forward_as_tuple(values, indices);
}
} // namespace at::native

View File

@ -335,9 +335,6 @@ static void isin_Tensor_Tensor_out_mps(const Tensor& elements,
}
static void is_posneginf_helper(TensorIteratorBase& iter, bool is_neg) {
if (iter.numel() == 0) {
return;
}
const auto& self = iter.input(0);
auto& out = iter.output(0);
@autoreleasepool {

View File

@ -50,7 +50,6 @@ REGISTER_UNARY_TI_DISPATCH(log2);
REGISTER_UNARY_TI_DISPATCH(log);
REGISTER_UNARY_TI_DISPATCH(log1p);
REGISTER_UNARY_TI_DISPATCH(bitwise_not);
REGISTER_UNARY_TI_DISPATCH(round);
REGISTER_UNARY_TI_DISPATCH(sigmoid);
REGISTER_DISPATCH(round_decimals_stub, round_decimals_kernel);
} // namespace at::native

View File

@ -184,6 +184,7 @@ TORCH_IMPL_FUNC(sign_out_mps)(const Tensor& self, const Tensor& output) {
REGISTER_MPS_UNARY_STUB(ceil, ceil);
REGISTER_MPS_UNARY_STUB(floor, floor);
REGISTER_MPS_UNARY_STUB(round, round);
REGISTER_MPS_UNARY_STUB(trunc, truncate);
#define CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(func_out, func_stub) \
@ -417,7 +418,6 @@ TORCH_IMPL_FUNC(sgn_out_mps)(const Tensor& self, const Tensor& output) {
Tensor& conj_physical_out_mps(const Tensor& self, Tensor& result) {
TORCH_CHECK(self.is_complex());
TORCH_CHECK(self.dtype() != at::kComplexDouble);
mps::unary_op(self, result, "conj", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
return [mpsGraph conjugateWithTensor:inputTensor name:nil];
});

View File

@ -340,8 +340,8 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: abs
SparseCPU, SparseCUDA, SparseMPS: abs_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr
SparseCPU, SparseCUDA: abs_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs
tags: [core, pointwise]
@ -350,16 +350,16 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: abs_
SparseCPU, SparseCUDA, SparseMPS: abs_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_
SparseCPU, SparseCUDA: abs_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs_
- func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS, MTIA: abs_out
SparseCPU, SparseCUDA, SparseMPS: abs_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_out
SparseCPU, SparseCUDA: abs_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_out
tags: pointwise
# Note [Adding an alias]
@ -428,7 +428,7 @@
variants: function, method
structured_delegate: sgn.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sgn_sparse
SparseCPU, SparseCUDA: sgn_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn
tags: pointwise
@ -437,7 +437,7 @@
variants: method
structured_delegate: sgn.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_
SparseCPU, SparseCUDA: sgn_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn_
tags: pointwise
@ -448,7 +448,7 @@
dispatch:
CPU, CUDA: sgn_out
MPS: sgn_out_mps
SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_out
SparseCPU, SparseCUDA: sgn_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_out
tags: pointwise
@ -476,7 +476,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: _conj_physical
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr
autogen: _conj_physical.out
- func: conj_physical(Tensor self) -> Tensor
@ -487,8 +487,8 @@
dispatch:
CPU, CUDA: conj_physical_out
MPS: conj_physical_out_mps
SparseCPU, SparseCUDA, SparseMPS: conj_physical_out_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr_out
SparseCPU, SparseCUDA: conj_physical_out_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr_out
tags: pointwise
- func: conj_physical_(Tensor(a!) self) -> Tensor(a!)
@ -554,7 +554,7 @@
structured_delegate: add.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse
SparseCPU, SparseCUDA, SparseMeta: add_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr
MkldnnCPU: mkldnn_add
ZeroTensor: add_zerotensor
@ -566,7 +566,7 @@
variants: method
structured_delegate: add.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse_
SparseCPU, SparseCUDA, SparseMeta: add_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_
MkldnnCPU: mkldnn_add_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add__Tensor
@ -582,7 +582,6 @@
dispatch:
SparseCPU, SparseMeta: add_out_sparse_cpu
SparseCUDA: add_out_sparse_cuda
SparseMPS: add_out_sparse_mps
SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu
SparseCsrCUDA: add_out_sparse_compressed_cuda
MkldnnCPU: mkldnn_add_out
@ -875,7 +874,7 @@
variants: function, method
structured_delegate: asinh.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: asinh_sparse
SparseCPU, SparseCUDA: asinh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr
tags: [core, pointwise]
@ -883,7 +882,7 @@
variants: function, method
structured_delegate: asinh.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_
SparseCPU, SparseCUDA: asinh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_
tags: pointwise
@ -893,7 +892,7 @@
dispatch:
CPU, CUDA: asinh_out
MPS: asinh_out_mps
SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_out
SparseCPU, SparseCUDA: asinh_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_out
tags: pointwise
@ -910,7 +909,7 @@
structured_delegate: atanh.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: atanh_sparse
SparseCPU, SparseCUDA: atanh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr
tags: [core, pointwise]
@ -918,7 +917,7 @@
structured_delegate: atanh.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_
SparseCPU, SparseCUDA: atanh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_
tags: pointwise
@ -928,7 +927,7 @@
dispatch:
CPU, CUDA: atanh_out
MPS: atanh_out_mps
SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_out
SparseCPU, SparseCUDA: atanh_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_out
tags: pointwise
# arctanh, alias for atanh
@ -965,7 +964,7 @@
variants: function, method
structured_delegate: asin.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: asin_sparse
SparseCPU, SparseCUDA: asin_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr
tags: [core, pointwise]
@ -974,7 +973,7 @@
variants: function, method
structured_delegate: asin.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: asin_sparse_
SparseCPU, SparseCUDA: asin_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_
tags: pointwise
@ -984,7 +983,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: asin_out
SparseCPU, SparseCUDA, SparseMPS: asin_sparse_out
SparseCPU, SparseCUDA: asin_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_out
tags: pointwise
@ -1002,7 +1001,7 @@
structured_delegate: atan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: atan_sparse
SparseCPU, SparseCUDA: atan_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr
tags: [core, pointwise]
@ -1011,7 +1010,7 @@
structured_delegate: atan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: atan_sparse_
SparseCPU, SparseCUDA: atan_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_
tags: pointwise
@ -1021,7 +1020,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: atan_out
SparseCPU, SparseCUDA, SparseMPS: atan_sparse_out
SparseCPU, SparseCUDA: atan_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_out
tags: pointwise
@ -1460,7 +1459,7 @@
structured_delegate: ceil.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: ceil_sparse
SparseCPU, SparseCUDA: ceil_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr
tags: [core, pointwise]
@ -1469,7 +1468,7 @@
structured_delegate: ceil.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_
SparseCPU, SparseCUDA: ceil_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_
tags: pointwise
@ -1479,7 +1478,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: ceil_out
SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_out
SparseCPU, SparseCUDA: ceil_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_out
tags: pointwise
@ -2407,7 +2406,7 @@
MPS: empty_mps
Meta: empty_meta_symint
MkldnnCPU: empty_mkldnn
SparseCPU, SparseCUDA, SparseMPS: empty_sparse
SparseCPU, SparseCUDA: empty_sparse
SparseMeta: empty_sparse_symint
SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
SparseCsrMeta: empty_sparse_compressed_symint
@ -2535,7 +2534,7 @@
structured_delegate: erf.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: erf_sparse
SparseCPU, SparseCUDA: erf_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr
tags: [core, pointwise]
@ -2544,7 +2543,7 @@
structured_delegate: erf.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: erf_sparse_
SparseCPU, SparseCUDA: erf_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_
tags: pointwise
@ -2554,7 +2553,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: erf_out
SparseCPU, SparseCUDA, SparseMPS: erf_sparse_out
SparseCPU, SparseCUDA: erf_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_out
tags: pointwise
@ -2620,7 +2619,7 @@
structured_delegate: expm1.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: expm1_sparse
SparseCPU, SparseCUDA: expm1_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr
tags: [core, pointwise]
@ -2629,7 +2628,7 @@
structured_delegate: expm1.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_
SparseCPU, SparseCUDA: expm1_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_
tags: pointwise
@ -2639,7 +2638,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: expm1_out
SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_out
SparseCPU, SparseCUDA: expm1_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_out
tags: pointwise
@ -2738,7 +2737,7 @@
structured_delegate: floor.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: floor_sparse
SparseCPU, SparseCUDA: floor_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr
tags: [core, pointwise]
@ -2747,7 +2746,7 @@
structured_delegate: floor.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: floor_sparse_
SparseCPU, SparseCUDA: floor_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_
tags: pointwise
@ -2757,7 +2756,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: floor_out
SparseCPU, SparseCUDA, SparseMPS: floor_sparse_out
SparseCPU, SparseCUDA: floor_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_out
tags: pointwise
@ -2765,7 +2764,7 @@
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CPU, CUDA, MPS, MTIA: floor_divide
CPU, CUDA, MPS: floor_divide
SparseCPU, SparseCUDA: floor_divide_sparse
- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@ -2799,7 +2798,7 @@
structured_delegate: frac.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: frac_sparse
SparseCPU, SparseCUDA: frac_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr
tags: pointwise
@ -2808,7 +2807,7 @@
structured_delegate: frac.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: frac_sparse_
SparseCPU, SparseCUDA: frac_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_
tags: pointwise
@ -2819,7 +2818,7 @@
dispatch:
CPU, CUDA: frac_out
MPS: frac_out_mps
SparseCPU, SparseCUDA, SparseMPS: frac_sparse_out
SparseCPU, SparseCUDA: frac_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_out
tags: pointwise
@ -3209,7 +3208,7 @@
dispatch:
CPU, CUDA, MPS, MTIA: isnan
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isnan
SparseCPU, SparseCUDA, SparseMPS: isnan_sparse
SparseCPU, SparseCUDA: isnan_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isnan_sparse_csr
autogen: isnan.out
tags: [core, pointwise]
@ -3290,7 +3289,6 @@
dispatch:
CPU: kthvalue_out_cpu
CUDA: kthvalue_out_cuda
MPS: kthvalue_out_mps
- func: kthvalue.dimname(Tensor self, SymInt k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
variants: function, method
@ -3338,21 +3336,21 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: nan_to_num
SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse
SparseCPU, SparseCUDA: nan_to_num_sparse
tags: pointwise
- func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!)
variants: function, method
dispatch:
CompositeExplicitAutograd: nan_to_num_
SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_
SparseCPU, SparseCUDA: nan_to_num_sparse_
tags: pointwise
- func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA, MTIA: nan_to_num_out
MPS: nan_to_num_out_mps
SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_out
SparseCPU, SparseCUDA: nan_to_num_sparse_out
tags: pointwise
- func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
@ -3555,7 +3553,7 @@
structured_delegate: log1p.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: log1p_sparse
SparseCPU, SparseCUDA: log1p_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr
tags: [core, pointwise]
@ -3564,7 +3562,7 @@
structured_delegate: log1p.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_
SparseCPU, SparseCUDA: log1p_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_
tags: pointwise
@ -3574,7 +3572,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: log1p_out
SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_out
SparseCPU, SparseCUDA: log1p_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_out
tags: pointwise
@ -4666,7 +4664,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: rad2deg
SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse
SparseCPU, SparseCUDA: rad2deg_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr
tags: pointwise
@ -4674,14 +4672,14 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: rad2deg_
SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_
SparseCPU, SparseCUDA: rad2deg_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_
tags: pointwise
- func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CompositeExplicitAutograd: rad2deg_out
SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_out
SparseCPU, SparseCUDA: rad2deg_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_out
tags: pointwise
@ -4689,7 +4687,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: deg2rad
SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse
SparseCPU, SparseCUDA: deg2rad_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr
tags: pointwise
@ -4697,14 +4695,14 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: deg2rad_
SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_
SparseCPU, SparseCUDA: deg2rad_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_
tags: pointwise
- func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CompositeExplicitAutograd: deg2rad_out
SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_out
SparseCPU, SparseCUDA: deg2rad_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_out
tags: pointwise
@ -4930,7 +4928,7 @@
structured_delegate: neg.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: neg_sparse
SparseCPU, SparseCUDA: neg_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg
tags: [core, pointwise]
@ -4940,7 +4938,7 @@
structured_delegate: neg.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: neg_sparse_
SparseCPU, SparseCUDA: neg_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg_
tags: pointwise
@ -4951,7 +4949,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: neg_out
SparseCPU, SparseCUDA, SparseMPS: neg_out_sparse
SparseCPU, SparseCUDA: neg_out_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_out
tags: pointwise
# Alias for neg
@ -5035,7 +5033,7 @@
structured_delegate: round.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: round_sparse
SparseCPU, SparseCUDA: round_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr
tags: [core, pointwise]
@ -5044,7 +5042,7 @@
structured_delegate: round.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: round_sparse_
SparseCPU, SparseCUDA: round_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_
tags: pointwise
@ -5054,7 +5052,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: round_out
SparseCPU, SparseCUDA, SparseMPS: round_sparse_out
SparseCPU, SparseCUDA: round_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_out
tags: pointwise
@ -5097,7 +5095,7 @@
QuantizedCPU: relu_quantized_cpu
QuantizedCUDA: relu_quantized_cuda
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu
SparseCPU, SparseCUDA, SparseMPS: relu_sparse
SparseCPU, SparseCUDA: relu_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr
tags: [core, pointwise]
@ -5112,7 +5110,7 @@
QuantizedCPU: relu_quantized_cpu_
QuantizedCUDA: relu_quantized_cuda_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu_
SparseCPU, SparseCUDA, SparseMPS: relu_sparse_
SparseCPU, SparseCUDA: relu_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr_
autogen: relu.out
tags: pointwise
@ -5399,7 +5397,7 @@
variants: function, method
dispatch:
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr
SparseCPU, SparseCUDA, SparseMPS: sin_sparse
SparseCPU, SparseCUDA: sin_sparse
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sin
tags: [core, pointwise]
@ -5409,7 +5407,7 @@
variants: function, method
dispatch:
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_
SparseCPU, SparseCUDA, SparseMPS: sin_sparse_
SparseCPU, SparseCUDA: sin_sparse_
tags: pointwise
- func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -5419,7 +5417,7 @@
dispatch:
CPU, CUDA, MPS, MTIA: sin_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_out
SparseCPU, SparseCUDA, SparseMPS: sin_sparse_out
SparseCPU, SparseCUDA: sin_sparse_out
tags: pointwise
- func: sinc(Tensor self) -> Tensor
@ -5444,7 +5442,7 @@
structured_delegate: sinh.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sinh_sparse
SparseCPU, SparseCUDA: sinh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr
tags: [core, pointwise]
@ -5453,7 +5451,7 @@
structured_delegate: sinh.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_
SparseCPU, SparseCUDA: sinh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_
tags: pointwise
@ -5463,7 +5461,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: sinh_out
SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_out
SparseCPU, SparseCUDA: sinh_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_out
# Returns a copy of this `Variable` that is detached from its autograd graph.
@ -5906,7 +5904,7 @@
variants: function, method
dispatch:
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sqrt
SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse
SparseCPU, SparseCUDA: sqrt_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr
tags: [core, pointwise]
@ -5915,7 +5913,7 @@
structured_delegate: sqrt.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_
SparseCPU, SparseCUDA: sqrt_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_
tags: pointwise
@ -5925,7 +5923,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: sqrt_out
SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_out
SparseCPU, SparseCUDA: sqrt_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_out
tags: pointwise
@ -6063,7 +6061,7 @@
structured_delegate: tan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: tan_sparse
SparseCPU, SparseCUDA: tan_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr
tags: [core, pointwise]
@ -6072,7 +6070,7 @@
structured_delegate: tan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: tan_sparse_
SparseCPU, SparseCUDA: tan_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_
tags: pointwise
@ -6082,7 +6080,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: tan_out
SparseCPU, SparseCUDA, SparseMPS: tan_sparse_out
SparseCPU, SparseCUDA: tan_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_out
tags: pointwise
@ -6093,7 +6091,7 @@
dispatch:
QuantizedCPU: tanh_quantized_cpu
MkldnnCPU: mkldnn_tanh
SparseCPU, SparseCUDA, SparseMPS: tanh_sparse
SparseCPU, SparseCUDA: tanh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh
tags: [core, pointwise]
@ -6104,7 +6102,7 @@
variants: function, method
dispatch:
MkldnnCPU: mkldnn_tanh_
SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_
SparseCPU, SparseCUDA: tanh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh_
tags: pointwise
@ -6115,7 +6113,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: tanh_out
SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_out
SparseCPU, SparseCUDA: tanh_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_out
tags: pointwise
@ -6387,8 +6385,8 @@
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: trunc_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr
SparseCPU, SparseCUDA: trunc_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr
tags: [core, pointwise]
- func: trunc_(Tensor(a!) self) -> Tensor(a!)
@ -6396,8 +6394,8 @@
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_
SparseCPU, SparseCUDA: trunc_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_
tags: pointwise
- func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -6406,8 +6404,8 @@
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS: trunc_out
SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_out
SparseCPU, SparseCUDA: trunc_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_out
tags: pointwise
# Alias for trunc
@ -7369,8 +7367,8 @@
- func: _to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor
variants: method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sparse_to_dense
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: sparse_compressed_to_dense
SparseCPU, SparseCUDA: sparse_to_dense
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_compressed_to_dense
MkldnnCPU: mkldnn_to_dense
autogen: _to_dense.out
@ -7396,8 +7394,8 @@
- func: dense_dim(Tensor self) -> int
variants: method
dispatch:
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: dense_dim_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: dense_dim_sparse_csr
SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr
CompositeExplicitAutograd: dense_dim_default
device_check: NoCheck
device_guard: False
@ -7530,7 +7528,7 @@
device_check: NoCheck # Allows copy into different device
variants: function
dispatch:
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: copy_sparse_
SparseCPU, SparseCUDA, SparseMeta: copy_sparse_
autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out
# By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors
@ -9721,7 +9719,7 @@
structured_delegate: sign.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sign_sparse
SparseCPU, SparseCUDA: sign_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr
tags: [core, pointwise]
@ -9730,7 +9728,7 @@
structured_delegate: sign.out
variants: method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sign_sparse_
SparseCPU, SparseCUDA: sign_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_
tags: pointwise
@ -9741,7 +9739,7 @@
dispatch:
CPU, CUDA: sign_out
MPS: sign_out_mps
SparseCPU, SparseCUDA, SparseMPS: sign_sparse_out
SparseCPU, SparseCUDA: sign_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_out
tags: pointwise
@ -9749,7 +9747,7 @@
variants: function, method
structured_delegate: signbit.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: signbit_sparse
SparseCPU, SparseCUDA: signbit_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr
tags: pointwise
@ -9760,7 +9758,7 @@
CPU: signbit_out
CUDA: signbit_out
MPS: signbit_out_mps
SparseCPU, SparseCUDA, SparseMPS: signbit_sparse_out
SparseCPU, SparseCUDA: signbit_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr_out
tags: pointwise
@ -13264,7 +13262,7 @@
dispatch:
CompositeExplicitAutograd: isinf
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isinf
SparseCPU, SparseCUDA, SparseMPS: isinf_sparse
SparseCPU, SparseCUDA: isinf_sparse
SparseMeta: isinf_sparse_meta
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isinf_sparse_csr
autogen: isinf.out
@ -13280,7 +13278,7 @@
structured_delegate: isposinf.out
dispatch:
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isposinf
SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse
SparseCPU, SparseCUDA: isposinf_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr
tags: pointwise
@ -13289,7 +13287,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: isposinf_out
SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse_out
SparseCPU, SparseCUDA: isposinf_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr_out
tags: pointwise
@ -13298,7 +13296,7 @@
structured_delegate: isneginf.out
dispatch:
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isneginf
SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse
SparseCPU, SparseCUDA: isneginf_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr
tags: pointwise
@ -13307,7 +13305,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: isneginf_out
SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse_out
SparseCPU, SparseCUDA: isneginf_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr_out
tags: pointwise
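
For illustration only (not part of the diff): the native_functions.yaml lines above only control which backend kernels a pointwise op is registered against. A minimal CPU-only sketch of what a SparseCPU dispatch entry such as sign_sparse means in practice, assuming any recent PyTorch build:

import torch

# Two nonzeros in a 2x3 COO tensor; sign() routes to the sparse kernel
# registered in the table above (sign_sparse on CPU) and keeps the result sparse.
i = torch.tensor([[0, 1], [2, 0]])
v = torch.tensor([-3.0, 5.0])
s = torch.sparse_coo_tensor(i, v, (2, 3)).coalesce()

out = torch.sign(s)
print(out.is_sparse)   # True
print(out.to_dense())  # tensor([[ 0.,  0., -1.], [ 1.,  0.,  0.]])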

View File

@ -1,73 +0,0 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/SparseTensorUtils.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/sparse/SparseStubs.h>
#include <ATen/native/sparse/FlattenIndicesCommon.h>
#include <ATen/ExpandUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_coalesce_native.h>
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
#include <ATen/ops/empty_native.h>
#include <ATen/ops/zeros_native.h>
#endif
namespace at::native {
namespace {
using namespace mps;
using namespace at::sparse;
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/FlattenIndices_metallib.h>
#endif
Tensor flatten_indices_mps(const Tensor& indices, IntArrayRef size) {
TORCH_CHECK(indices.dim() == 2, "flatten_indices: indices must be 2D");
TORCH_CHECK(static_cast<size_t>(indices.size(0)) == size.size(),
"flatten_indices: indices.size(0) must equal size.size()");
const int64_t sparse_dim = indices.size(0);
const int64_t nnz = indices.size(1);
if (nnz == 0) {
return at::empty({0}, indices.options().dtype(kLong));
}
// Row-major multipliers for flattening: mul[d] = prod_{j>d}(size[j])
std::vector<int64_t> row_muls(sparse_dim);
row_muls[sparse_dim - 1] = 1;
for (int64_t i = sparse_dim - 2; i >= 0; --i) {
row_muls[i] = row_muls[i + 1] * size[i + 1];
}
auto flat_indices = at::empty({nnz}, indices.options().dtype(kLong));
auto stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pipeline = lib.getPipelineStateForFunc("flatten_indices_kernel");
auto encoder = stream->commandEncoder();
[encoder setComputePipelineState:pipeline];
mtl_setArgs(encoder,
indices,
row_muls,
flat_indices,
static_cast<uint>(sparse_dim),
indices.strides()
);
mtl_dispatch1DJob(encoder, pipeline, nnz);
}
});
return flat_indices;
}
} // namespace
REGISTER_MPS_DISPATCH(flatten_indices_stub, &flatten_indices_mps)
} // namespace at::native
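
For illustration only (not part of the diff): the deleted kernel flattens COO indices with the row-major rule mul[d] = prod_{j>d} size[j], the same computation performed by row_muls above. The equivalent arithmetic in plain Python:

# Row-major flattening of COO indices, mirroring row_muls in the deleted kernel.
# flat[n] = sum_d indices[d][n] * prod(size[d+1:])
def flatten_indices(indices, size):
    muls = [1] * len(size)
    for d in range(len(size) - 2, -1, -1):
        muls[d] = muls[d + 1] * size[d + 1]
    return [sum(idx[n] * m for idx, m in zip(indices, muls))
            for n in range(len(indices[0]))]

# Indices for a (2, 3) tensor: entries at (0, 2) and (1, 0) -> flat ids 2 and 3.
print(flatten_indices([[0, 1], [2, 0]], (2, 3)))  # [2, 3]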

View File

@ -20,9 +20,46 @@ using namespace at::sparse;
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/Coalesce_metallib.h>
#include <ATen/native/mps/Sparse_metallib.h>
#endif
static Tensor flatten_indices(const Tensor& indices, IntArrayRef size) {
TORCH_CHECK(indices.dim() == 2, "flatten_indices: indices must be 2D");
TORCH_CHECK(static_cast<size_t>(indices.size(0)) == size.size(),
"flatten_indices: indices.size(0) must equal size.size()");
int64_t sparse_dim = indices.size(0);
int64_t nnz = indices.size(1);
if (nnz == 0) {
return at::empty({0}, indices.options().dtype(kLong));
}
std::vector<int64_t> strides(sparse_dim);
strides[sparse_dim - 1] = 1;
for (int64_t i = sparse_dim - 2; i >= 0; i--) {
strides[i] = strides[i + 1] * size[i + 1];
}
Tensor flat_indices = at::empty({nnz}, indices.options().dtype(kLong));
auto stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pipeline = lib.getPipelineStateForFunc("flatten_indices_kernel");
auto encoder = stream->commandEncoder();
[encoder setComputePipelineState:pipeline];
mtl_setArgs(encoder, indices, strides, flat_indices, sparse_dim, nnz);
mtl_dispatch1DJob(encoder, pipeline, nnz);
}
});
return flat_indices;
}
static Tensor compute_output_positions(const Tensor& is_unique) {
int64_t nnz = is_unique.size(0);

Some files were not shown because too many files have changed in this diff.
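
For context (not part of the diff): the flatten-indices and output-position helpers in the sparse MPS files above appear to back the COO coalesce path, and coalesce() itself orders entries by flattened index and sums duplicates. A small CPU-only sketch, assuming any recent PyTorch build:

import torch

# coalesce() merges duplicate COO entries: indices are ordered by their
# flattened position and values at the same index are summed.
i = torch.tensor([[0, 0, 1], [2, 2, 0]])   # (0, 2) appears twice
v = torch.tensor([1.0, 3.0, 5.0])
s = torch.sparse_coo_tensor(i, v, (2, 3))

c = s.coalesce()
print(c.indices().tolist())  # [[0, 1], [2, 0]]
print(c.values().tolist())   # [4.0, 5.0]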