Compare commits

..

3 Commits

SHA1        Message                    Date
a0801ef6be  add workflow to dispatch   2025-08-26 18:20:49 -07:00
735b375db4  echo variables             2025-08-26 17:10:56 -07:00
            ghstack-source-id: 3c8f54e83cad9760fb06b39366bea2f31a39342f
            Pull-Request: https://github.com/pytorch/pytorch/pull/161565
011155aea3  echo variables             2025-08-26 17:10:55 -07:00
            ghstack-source-id: bd39100f9f9c99a5c45b85a48020375ac5f95da6
            Pull-Request: https://github.com/pytorch/pytorch/pull/161537
515 changed files with 5327 additions and 19687 deletions

View File

@ -7,15 +7,6 @@ if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
fi
if [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0"
fi
# Compress the fatbin with -compress-mode=size for CUDA 13
if [[ "$DESIRED_CUDA" == *"13"* ]]; then
export TORCH_NVCC_FLAGS="-compress-mode=size"
fi
SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
source $SCRIPTPATH/aarch64_ci_setup.sh

View File

@ -77,23 +77,21 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
wheelname = os.path.basename(wheel_path)
os.mkdir(f"{folder}/tmp")
os.system(f"unzip {wheel_path} -d {folder}/tmp")
# Common libraries for all CUDA versions
common_libs = [
# Non-NVIDIA system libraries
"/lib64/libgomp.so.1",
"/usr/lib64/libgfortran.so.5",
"/acl/build/libarm_compute.so",
"/acl/build/libarm_compute_graph.so",
# Common CUDA libraries (same for all versions)
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_lapack_core.so.0",
"/usr/local/lib/libnvpl_blas_core.so.0",
libs_to_copy = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
"/usr/local/cuda/lib64/libcudnn.so.9",
"/usr/local/cuda/lib64/libcublas.so.12",
"/usr/local/cuda/lib64/libcublasLt.so.12",
"/usr/local/cuda/lib64/libcudart.so.12",
"/usr/local/cuda/lib64/libcufft.so.11",
"/usr/local/cuda/lib64/libcusparse.so.12",
"/usr/local/cuda/lib64/libcusparseLt.so.0",
"/usr/local/cuda/lib64/libcusolver.so.11",
"/usr/local/cuda/lib64/libcurand.so.10",
"/usr/local/cuda/lib64/libnccl.so.2",
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
@ -102,41 +100,22 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
"/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
"/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
"/usr/local/cuda/lib64/libcusparse.so.12",
"/lib64/libgomp.so.1",
"/usr/lib64/libgfortran.so.5",
"/acl/build/libarm_compute.so",
"/acl/build/libarm_compute_graph.so",
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_lapack_core.so.0",
"/usr/local/lib/libnvpl_blas_core.so.0",
]
# CUDA version-specific libraries
if "130" in desired_cuda:
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
"/usr/local/cuda/lib64/libcublas.so.13",
"/usr/local/cuda/lib64/libcublasLt.so.13",
"/usr/local/cuda/lib64/libcudart.so.13",
"/usr/local/cuda/lib64/libcufft.so.12",
"/usr/local/cuda/lib64/libcusolver.so.12",
"/usr/local/cuda/lib64/libnvJitLink.so.13",
"/usr/local/cuda/lib64/libnvrtc.so.13",
"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0",
if "129" in desired_cuda:
libs_to_copy += [
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9",
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
]
elif "12" in desired_cuda:
# Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
minor_version = desired_cuda[-1]
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
"/usr/local/cuda/lib64/libcublas.so.12",
"/usr/local/cuda/lib64/libcublasLt.so.12",
"/usr/local/cuda/lib64/libcudart.so.12",
"/usr/local/cuda/lib64/libcufft.so.11",
"/usr/local/cuda/lib64/libcusolver.so.11",
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
]
# Combine all libraries
libs_to_copy = common_libs + version_specific_libs
# Copy libraries to unzipped_folder/a/lib
for lib_path in libs_to_copy:

View File

@ -81,8 +81,8 @@ elif [[ "$image" == *riscv* ]]; then
DOCKERFILE="ubuntu-cross-riscv/Dockerfile"
fi
_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152
_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
if [[ "$image" == *rocm* ]]; then
_UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
_UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
@ -114,19 +114,31 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
CUDA_VERSION=13.0.0
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
KATEX=yes
@ -161,8 +173,8 @@ case "$tag" in
VISION=yes
ONNX=yes
;;
pytorch-linux-jammy-py3.10-clang12)
ANACONDA_PYTHON_VERSION=3.10
pytorch-linux-jammy-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=12
VISION=yes
TRITON=yes
@ -197,24 +209,23 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
;;
pytorch-linux-jammy-xpu-n-1-py3)
ANACONDA_PYTHON_VERSION=3.10
pytorch-linux-jammy-xpu-2025.0-py3)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.0
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-xpu-2025.1-py3)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.1
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-xpu-n-py3)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.2
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
# TODO (huydhn): Upgrade this to Python >= 3.10
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
@ -223,8 +234,8 @@ case "$tag" in
DOCS=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12)
ANACONDA_PYTHON_VERSION=3.10
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
CLANG_VERSION=12
VISION=yes
@ -235,8 +246,8 @@ case "$tag" in
CLANG_VERSION=18
VISION=yes
;;
pytorch-linux-jammy-py3.10-gcc11)
ANACONDA_PYTHON_VERSION=3.10
pytorch-linux-jammy-py3.9-gcc11)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
KATEX=yes

View File

@ -1 +1 @@
74a23feff57432129df84d8099e622773cf77925
e03a63be43e33596f7f0a43b0f530353785e4a59

View File

@ -1 +1 @@
d0e80f39c562c70986fc548fa6e5852ad86e16e7
a6572fb0be5b9b0a19b0641a0ce05810fa04e44c

View File

@ -57,7 +57,7 @@ if [ ! -f setup.py ]; then
cd python
fi
pip_install pybind11==3.0.1
pip_install pybind11==2.13.6
# TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527
as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py

View File

@ -44,12 +44,8 @@ function install_ucc() {
./autogen.sh
if [[ -n "$CUDA_VERSION" && $CUDA_VERSION == 13* ]]; then
NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86"
else
# We only run distributed tests on Tesla M60 and A10G
NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
fi
# We only run distributed tests on Tesla M60 and A10G
NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
if [[ -n "$ROCM_VERSION" ]]; then
if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then

View File

@ -146,11 +146,11 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
XPU_DRIVER_VERSION="/lts/2350"
fi
# Default use Intel® oneAPI Deep Learning Essentials 2025.1
if [[ "$XPU_VERSION" == "2025.2" ]]; then
XPU_PACKAGES="intel-deep-learning-essentials-2025.2"
else
# Default use Intel® oneAPI Deep Learning Essentials 2025.0
if [[ "$XPU_VERSION" == "2025.1" ]]; then
XPU_PACKAGES="intel-deep-learning-essentials-2025.1"
else
XPU_PACKAGES="intel-deep-learning-essentials-2025.0"
fi
# The installation depends on the base OS

View File

@ -175,6 +175,6 @@ ENV XPU_DRIVER_TYPE ROLLING
RUN python3 -m pip install --upgrade pip && \
python3 -mpip install cmake==3.28.4
ADD ./common/install_xpu.sh install_xpu.sh
ENV XPU_VERSION 2025.2
ENV XPU_VERSION 2025.1
RUN bash ./install_xpu.sh && rm install_xpu.sh
RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd

View File

@ -379,7 +379,7 @@ dataclasses_json==0.6.7
cmake==4.0.0
#Description: required for building
tlparse==0.4.0
tlparse==0.3.30
#Description: required for log parsing
cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"

View File

@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably

View File

@ -66,7 +66,6 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"
# (optional) Install UCC
ARG UCX_COMMIT
ARG UCC_COMMIT
ARG CUDA_VERSION
ENV UCX_COMMIT $UCX_COMMIT
ENV UCC_COMMIT $UCC_COMMIT
ENV UCX_HOME /usr

View File

@ -1,56 +1,14 @@
from __future__ import annotations
import logging
import os
import textwrap
from pathlib import Path
from typing import TYPE_CHECKING
from cli.lib.common.utils import get_wheels
from jinja2 import Template
if TYPE_CHECKING:
from collections.abc import Iterable, Mapping
from typing import Iterable, Mapping, Optional
import logging
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Iterable, Tuple
logger = logging.getLogger(__name__)
_TPL_CONTENT = Template(
textwrap.dedent("""\
## {{ title }}
```{{ lang }}
{{ content }}
```
""")
)
_TPL_LIST_ITEMS = Template(
textwrap.dedent("""\
## {{ title }}
{% for it in items %}
- {{ it.pkg }}: {{ it.relpath }}
{% else %}
_(no item found)_
{% endfor %}
""")
)
_TPL_TABLE = Template(
textwrap.dedent("""\
{%- if rows %}
| {{ cols | join(' | ') }} |
|{%- for _ in cols %} --- |{%- endfor %}
{%- for r in rows %}
| {%- for c in cols %} {{ r.get(c, "") }} |{%- endfor %}
{%- endfor %}
{%- else %}
_(no data)_
{%- endif %}
""")
)
def gh_summary_path() -> Path | None:
"""Return the Path to the GitHub step summary file, or None if not set."""
@ -69,14 +27,13 @@ def write_gh_step_summary(md: str, *, append_content: bool = True) -> bool:
"""
sp = gh_summary_path()
if not sp:
# When running locally, just log to console instead of failing.
logger.info("[gh-summary] GITHUB_STEP_SUMMARY not set, skipping write.")
return False
md_clean = textwrap.dedent(md).strip() + "\n"
sp.parent.mkdir(parents=True, exist_ok=True)
mode = "a" if append_content else "w"
with sp.open(mode, encoding="utf-8") as f:
f.write(md_clean)
f.write(md.rstrip() + "\n")
return True
@ -85,59 +42,182 @@ def md_heading(text: str, level: int = 2) -> str:
return f"{'#' * max(1, min(level, 6))} {text}\n"
def md_kv_table(rows: Iterable[Mapping[str, str | int | float]]) -> str:
"""
Render a list of dictionaries as a Markdown table.
The first row (header) is derived from the union of all keys.
# Suppose you want to summarize benchmark results
rows = [
{"name": "transformer-small", "p50": 12.3, "p90(ms)": 18.4},
{"name": "transformer-large", "p50": 45.1, "p90(ms)": 60.7},
]
content = []
content.append(md_heading("Benchmark Results", level=2))
content.append(md_kv_table(rows))
content.append(md_details("Raw logs", "```\n[INFO] benchmark log ...\n```"))
# Join the pieces into one Markdown block
markdown = '\n'.join(content)
# Write to GitHub Actions summary (or log locally if not in CI)
write_gh_step_summary(markdown, append=True)
"""
rows = list(rows)
if not rows:
return "_(no data)_\n"
# Collect all columns across all rows
cols = list({k for r in rows for k in r.keys()})
header = "| " + " | ".join(cols) + " |\n"
sep = "|" + "|".join([" --- " for _ in cols]) + "|\n"
lines = []
for r in rows:
line = "| " + " | ".join(str(r.get(c, "")) for c in cols) + " |\n"
lines.append(line)
return header + sep + "".join(lines) + "\n"
def md_details(summary: str, content: str) -> str:
"""Generate a collapsible <details> block with a summary and inner content."""
return f"<details>\n<summary>{summary}</summary>\n\n{content}\n\n</details>\n"
# ---- helper to generate a summary for a list of pytest failures ------#
def summarize_failures_by_test_command(
xml_and_labels: Iterable[Tuple[str | Path, str]],
*,
title: str = "Pytest Failures by Test Command",
dedupe_within_command: bool = True,
):
"""
Args:
xml_and_labels: list of (xml_path, label) pairs.
Each XML corresponds to one pytest subprocess (one test command).
Behavior:
- Writes a section per test command if it has failures.
- Each failed test is listed as 'path/to/test.py:test_name'.
Example:
xmls = [
("reports/junit_cmd0.xml", "pytest -v -s tests/unit"),
("reports/junit_cmd1.xml", "pytest -v -s tests/integration"),
("reports/junit_cmd2.xml", "pytest -v -s tests/entrypoints"),
]
summarize_failures_by_test_command(
xmls,
title="Consolidated Pytest Failures",
)
"""
write_gh_step_summary(md_heading(title, level=2))
for xml_path, label in xml_and_labels:
xmlp = Path(xml_path)
failed = _parse_failed_simple(xmlp)
if dedupe_within_command:
failed = sorted(set(failed))
if not failed:
continue # skip commands with no failures
write_gh_step_summary(md_heading(f"Test Command: {label}", level=3))
lines = "\n".join(f"- {item}" for item in failed)
write_gh_step_summary(lines + "\n")
def _to_simple_name_from_testcase(tc: ET.Element) -> str:
"""
Convert a <testcase> into 'path/to/test.py:test_name' format.
Prefer the 'file' attribute if available, else fall back to classname.
"""
name = tc.attrib.get("name", "")
file_attr = tc.attrib.get("file")
if file_attr:
return f"{file_attr}:{name}"
classname = tc.attrib.get("classname", "")
parts = classname.split(".") if classname else []
if len(parts) >= 1:
# drop last part if it's a class, treat rest as module path
mod_parts = parts[:-1] if len(parts) >= 2 else parts
mod_path = "/".join(mod_parts) + ".py" if mod_parts else "unknown.py"
return f"{mod_path}:{name}"
return f"unknown.py:{name or 'unknown_test'}"
def _parse_failed_simple(xml_path: Path) -> list[str]:
"""
Parse one XML, return failures as ['tests/a_test.py:test_x', ...].
Only include <failure> and <error>.
"""
if not xml_path.exists():
return []
tree = ET.parse(xml_path)
root = tree.getroot()
failed = []
for tc in root.iter("testcase"):
if any(x.tag in {"failure", "error"} for x in tc):
failed.append(_to_simple_name_from_testcase(tc))
return failed
def summarize_content_from_file(
output_dir: Path,
freeze_file: str,
title: str = "Content from file",
title: str = "Wheels (pip freeze)",
code_lang: str = "", # e.g. "text" or "ini"
) -> bool:
"""
Read a text file from output_dir/freeze_file and append it to
the GitHub Step Summary as a Markdown code block.
Returns True if something was written, False otherwise.
"""
f = Path(output_dir) / freeze_file
if not f.exists():
return False
content = f.read_text(encoding="utf-8").strip()
md = render_content(content, title=title, lang=code_lang)
return write_gh_step_summary(md)
def summarize_wheels(path: Path, title: str = "Wheels", max_depth: int = 3):
items = get_wheels(path, max_depth=max_depth)
if not items:
if not content:
return False
md = render_list(items, title=title)
return write_gh_step_summary(md)
md = []
md.append(md_heading(title, 2))
md.append(f"```{code_lang}".rstrip())
md.append(content)
md.append("```")
return write_gh_step_summary("\n".join(md) + "\n")
def md_kv_table(rows: Iterable[Mapping[str, str | int | float]]) -> str:
def summarize_wheels(
output_dir: Path,
title: str = "Wheels",
max_depth: Optional[int] = None, # None = unlimited
):
"""
Render a list of dicts as a Markdown table using Jinja template.
Walk output_dir up to max_depth and list all *.whl files.
Grouped as 'package: filename.whl'.
Args:
output_dir: base directory to search
title: section title in GH summary
max_depth: maximum folder depth relative to output_dir (0 = only top-level)
"""
rows = list(rows)
cols = list({k for r in rows for k in r.keys()})
md = _TPL_TABLE.render(cols=cols, rows=rows).strip() + "\n"
return md
if not output_dir.exists():
return False
root = Path(output_dir)
lines = [md_heading(title, 2)]
for dirpath, _, filenames in os.walk(root):
depth = Path(dirpath).relative_to(root).parts
if max_depth is not None and len(depth) > max_depth:
# skip going deeper
continue
def render_list(
items: Iterable[str],
*,
title: str = "List",
) -> str:
tpl = _TPL_LIST_ITEMS
md = tpl.render(title=title, items=items)
return md
for fname in sorted(filenames):
if not fname.endswith(".whl"):
continue
pkg = fname.split("-")[0]
relpath = str(Path(dirpath) / fname).replace(str(root) + os.sep, "")
lines.append(f"- {pkg}: {relpath}")
def render_content(
content: str,
*,
title: str = "Content",
lang: str = "text",
) -> str:
tpl = _TPL_CONTENT
md = tpl.render(title=title, content=content, lang=lang)
return md
if len(lines) > 1:
write_gh_step_summary("\n".join(lines) + "\n")

View File

@ -4,7 +4,7 @@ import shlex
import shutil
import sys
from collections.abc import Iterable
from importlib.metadata import PackageNotFoundError, version # noqa: UP035
from importlib.metadata import PackageNotFoundError, version
from typing import Optional, Union
from cli.lib.common.utils import run_command

View File

@ -8,7 +8,6 @@ import shlex
import subprocess
import sys
from contextlib import contextmanager
from pathlib import Path
from typing import Optional
@ -116,24 +115,3 @@ def working_directory(path: str):
yield
finally:
os.chdir(prev_cwd)
def get_wheels(
output_dir: Path,
max_depth: Optional[int] = None,
) -> list[str]:
"""Return a list of wheels found in the given output directory."""
root = Path(output_dir)
if not root.exists():
return []
items = []
for dirpath, _, filenames in os.walk(root):
depth = Path(dirpath).relative_to(root).parts
if max_depth is not None and len(depth) > max_depth:
continue
for fname in sorted(filenames):
if fname.endswith(".whl"):
pkg = fname.split("-")[0]
relpath = str((Path(dirpath) / fname).relative_to(root))
items.append({"pkg": pkg, "relpath": relpath})
return items
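
A short, hedged example of what the get_wheels helper above produces; the dist/ layout in the comments is hypothetical:

```python
from pathlib import Path

from cli.lib.common.utils import get_wheels

# Hypothetical artifact layout:
#   dist/torch-2.9.0.dev20250826-cp310-cp310-linux_aarch64.whl
#   dist/vision/torchvision-0.24.0-cp310-cp310-linux_aarch64.whl
for item in get_wheels(Path("dist"), max_depth=3):
    print(f"{item['pkg']}: {item['relpath']}")
# torch: torch-2.9.0.dev20250826-cp310-cp310-linux_aarch64.whl
# torchvision: vision/torchvision-0.24.0-cp310-cp310-linux_aarch64.whl
```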

View File

@ -1,27 +1,15 @@
import logging
import os
import textwrap
from pathlib import Path
import re
from typing import Any
from cli.lib.common.gh_summary import write_gh_step_summary
from cli.lib.common.git_helper import clone_external_repo
from cli.lib.common.pip_helper import pip_install_packages
from cli.lib.common.utils import run_command, temp_environ, working_directory
from jinja2 import Template
from cli.lib.common.gh_summary import md_heading, write_gh_step_summary
logger = logging.getLogger(__name__)
_TPL_VLLM_INFO = Template(
textwrap.dedent("""\
## Vllm against Pytorch CI Test Summary
**Vllm Commit**: [{{ vllm_commit }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})
{%- if torch_sha %}
**Pytorch Commit**: [{{ torch_sha }}](https://github.com/pytorch/pytorch/commit/{{ torch_sha }})
{%- endif %}
""")
)
def sample_vllm_test_library():
"""
@ -245,12 +233,3 @@ def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) ->
for k in sorted(mapping, key=len, reverse=True):
step = step.replace(k, mapping[k])
return step
def summarize_build_info(vllm_commit: str) -> bool:
torch_sha = os.getenv("GITHUB_SHA")
md = (
_TPL_VLLM_INFO.render(vllm_commit=vllm_commit, torch_sha=torch_sha).strip()
+ "\n"
)
return write_gh_step_summary(md)

View File

@ -4,7 +4,6 @@ import textwrap
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
from cli.lib.common.cli_helper import BaseRunner
from cli.lib.common.docker_helper import local_image_exists
from cli.lib.common.envs_helper import (
@ -13,11 +12,6 @@ from cli.lib.common.envs_helper import (
env_str_field,
with_params_help,
)
from cli.lib.common.gh_summary import (
gh_summary_path,
summarize_content_from_file,
summarize_wheels,
)
from cli.lib.common.path_helper import (
copy,
ensure_dir_exists,
@ -26,7 +20,14 @@ from cli.lib.common.path_helper import (
is_path_exist,
)
from cli.lib.common.utils import run_command
from cli.lib.core.vllm.lib import clone_vllm, summarize_build_info
from cli.lib.core.vllm.lib import clone_vllm, write_gh_step_summary
from cli.lib.common.gh_summary import (
summarize_content_from_file,
summarize_wheels,
gh_summary_path,
)
import torch
from torch import torch_version
logger = logging.getLogger(__name__)
@ -160,7 +161,17 @@ class VllmBuildRunner(BaseRunner):
logger.info("Running vllm build with inputs: %s", inputs)
vllm_commit = clone_vllm()
vllm_sha_url = f"${vllm_commit}](https://github.com/vllm-project/vllm/commit/${vllm_commit})"
write_gh_step_summary(
f"""
## Commit Info
- **Vllm Commit**: `{vllm_sha_url}`
- **Torch Version**: `{torch_version}`
"""
)
self.cp_dockerfile_if_exist(inputs)
# cp torch wheels from root direct to vllm workspace if exist
self.cp_torch_whls_if_exist(inputs)
@ -181,19 +192,26 @@ class VllmBuildRunner(BaseRunner):
if not gh_summary_path():
return logger.info("Skipping, not detect GH Summary env var....")
logger.info("Generate GH Summary ...")
# summarize vllm build info
summarize_build_info(vllm_commit)
# summarize vllm build artifacts
vllm_sha_url = f"[{vllm_commit}](https://github.com/vllm-project/vllm/commit/{vllm_commit})"
write_gh_step_summary(
f"""
## Build vllm against Pytorch CI
**Vllm Commit**: `{vllm_sha_url}`
"""
)
torch_sha = os.getenv("GITHUB_SHA")
if torch_sha: # only can grab this in github action
torch_sha_url = (
f"[{torch_sha}](https://github.com/pytorch/pytorch/commit/{torch_sha})]"
)
write_gh_step_summary(
f"""
**Pytorch Commit**: `{torch_sha_url}`
"""
)
vllm_artifact_dir = inputs.output_dir / "wheels"
summarize_content_from_file(
vllm_artifact_dir,
"build_summary.txt",
title="Vllm build env pip package summary",
)
summarize_wheels(
inputs.torch_whls_path, max_depth=3, title="Torch Wheels Artifacts"
)
summarize_content_from_file(vllm_artifact_dir, "build_summary.txt", title="Vllm build package summary")
summarize_wheels(inputs.torch_whls_path, max_depth=3, title="Torch Wheels Artifacts")
summarize_wheels(vllm_artifact_dir, max_depth=3, title="Vllm Wheels Artifacts")
def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str:

View File

@ -215,6 +215,7 @@ def preprocess_test_in(
"torchaudio",
"xformers",
"mamba_ssm",
"pybind11",
] + additional_package_to_move
# Read current requirements
target_path = Path(target_file)

View File

@ -300,3 +300,24 @@ except RuntimeError as e:
exit 1
fi
fi
###############################################################################
# Check for C++ ABI compatibility to GCC-11 - GCC 13
###############################################################################
if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then
pushd /tmp
# Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html
# gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19
# gcc 11 - CUDA 11.8, xpu, rocm
# gcc 13 - CUDA 12.6, 12.8 and cpu
# Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426
if [[ "$(uname -m)" == "s390x" ]]; then
cxx_abi="19"
elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then
cxx_abi="18"
else
cxx_abi="16"
fi
python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)"
popd
fi
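
For reference, a minimal sketch of inspecting the same attribute interactively, assuming a locally installed torch manywheel:

```python
import torch

# The check above asserts this string equals "_cxxabi10" + cxx_abi,
# e.g. "_cxxabi1018" for a gcc-13 (CUDA 12.x / CPU) build.
print(torch._C._PYBIND11_BUILD_ABI)
```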

View File

@ -35,10 +35,11 @@ fi
print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
# Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
# backends (specifically the gloo backend), so test that this case works too
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then

View File

@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd
python -mpip install -r requirements.txt
# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1
python -mpip install --no-input -r requirements.txt
setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
# This environment variable makes ProcessGroupGloo default to
@ -306,47 +302,6 @@ test_torchbench_smoketest() {
fi
done
echo "Pytorch benchmark on mps device completed"
}
test_aoti_torchbench_smoketest() {
print_cmake_info
echo "Launching AOTInductor torchbench setup"
pip_benchmark_deps
# shellcheck disable=SC2119,SC2120
torchbench_setup_macos
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
local device=mps
local dtypes=(undefined float16 bfloat16 notset)
local dtype=${dtypes[$1]}
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}"
local dtype_arg="--${dtype}"
if [ "$dtype" == notset ]; then
dtype_arg="--float32"
fi
touch "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv"
for model in "${models[@]}"; do
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
--performance --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv" || true
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
--accuracy --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_accuracy.csv" || true
done
echo "Launching HuggingFace inference performance run for AOT Inductor and dtype ${dtype}"
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
--performance --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_performance.csv" || true
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
--accuracy --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_accuracy.csv" || true
echo "Pytorch benchmark on mps device completed"
}
@ -395,8 +350,6 @@ elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then
test_timm_perf
elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then
test_torchbench_smoketest "${SHARD_NUMBER}"
elif [[ $TEST_CONFIG == *"aot_inductor_perf_smoketest"* ]]; then
test_aoti_torchbench_smoketest "${SHARD_NUMBER}"
elif [[ $TEST_CONFIG == *"mps"* ]]; then
test_python_mps
elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then

View File

@ -496,14 +496,6 @@ test_inductor_cpp_wrapper_shard() {
-k 'take' \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
python test/run_test.py \
--include inductor/test_mkldnn_pattern_matcher \
-k 'xpu' \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
fi
}
# "Global" flags for inductor benchmarking controlled by TEST_CONFIG

View File

@ -44,7 +44,7 @@ python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==
python -m pip install z3-solver==4.15.1.0
# Install tlparse for test\dynamo\test_structured_trace.py UTs.
python -m pip install tlparse==0.4.0
python -m pip install tlparse==0.3.30
# Install parameterized
python -m pip install parameterized==0.8.1

View File

@ -13,9 +13,9 @@ if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build"
:xpu_bundle_install_start
set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe
set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product
set XPU_BUNDLE_VERSION=2025.1.3+5
set XPU_BUNDLE_VERSION=2025.0.1+20
set XPU_BUNDLE_INSTALLED=0
set XPU_BUNDLE_UNINSTALL=0
set XPU_EXTRA_URL=NULL
@ -24,9 +24,9 @@ set XPU_EXTRA_VERSION=2025.0.1+1226
set XPU_EXTRA_INSTALLED=0
set XPU_EXTRA_UNINSTALL=0
if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.2] (
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
set XPU_BUNDLE_VERSION=2025.2.1+20
if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.1] (
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
set XPU_BUNDLE_VERSION=2025.1.3+5
)
:: Check if XPU bundle is target version or already installed
@ -90,3 +90,14 @@ if errorlevel 1 exit /b 1
del xpu_extra.exe
:xpu_install_end
if not "%XPU_ENABLE_KINETO%"=="1" goto install_end
:: Install Level Zero SDK
set XPU_EXTRA_LZ_URL=https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip
curl -k -L %XPU_EXTRA_LZ_URL% --output "%SRC_DIR%\temp_build\level_zero_sdk.zip"
echo "Installing level zero SDK..."
7z x "%SRC_DIR%\temp_build\level_zero_sdk.zip" -o"%SRC_DIR%\temp_build\level_zero"
set "INCLUDE=%SRC_DIR%\temp_build\level_zero\include;%INCLUDE%"
del "%SRC_DIR%\temp_build\level_zero_sdk.zip"
:install_end

View File

@ -213,8 +213,7 @@ pip install requests ninja typing-extensions
retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
retry brew install libomp
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is built as part of tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is built as part of tensorpipe submodule
export USE_DISTRIBUTED=1
export USE_MKLDNN=OFF

View File

@ -15,7 +15,8 @@ fi
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
export VC_YEAR=2022
export USE_SCCACHE=0
export XPU_VERSION=2025.2
export XPU_VERSION=2025.1
export XPU_ENABLE_KINETO=1
fi
echo "Free space on filesystem before build:"

View File

@ -8,7 +8,7 @@ export VC_YEAR=2022
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
export VC_YEAR=2022
export XPU_VERSION=2025.2
export XPU_VERSION=2025.1
fi
pushd "$PYTORCH_ROOT/.ci/pytorch/"

View File

@ -62,6 +62,8 @@ runs:
MAX_JOBS="$(nproc --ignore=6)"
export MAX_JOBS
echo "$GITHUB_STEP_SUMMARY"
# Split the comma-separated list and build each target
IFS=',' read -ra TARGETS <<< "$BUILD_TARGETS"
for target in "${TARGETS[@]}"; do

View File

@ -57,21 +57,6 @@ runs:
submodules: ${{ inputs.submodules }}
show-progress: false
- name: Clean submodules post checkout
id: clean-submodules
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
shell: bash
env:
NO_SUDO: ${{ inputs.no-sudo }}
run: |
cd "${GITHUB_WORKSPACE}"
# Clean stale submodule dirs
if [ -z "${NO_SUDO}" ]; then
sudo git submodule foreach --recursive git clean -ffdx
else
git submodule foreach --recursive git clean -ffdx
fi
- name: Clean workspace (try again)
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' &&
(steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }}

View File

@ -1 +1 @@
1c66402d0fa47ea74d365dcaa468d397da481918
10a5002c6195bd95e34df8fe28ff8a2d55a2a922

View File

@ -1 +1 @@
752d2e1c364e4195093e4f3f2fc33e3ae1840707
add1adfec742dfb13e614dab3372b5aafd1ff046

View File

@ -1 +1 @@
763e5b78d4fcd74a9e812256656c075f99d9a781
a1c6ee92c85e8b0955c20892ed68f032a6015c09

View File

@ -359,7 +359,7 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Build flashinfer for torch nightly from source around 10 mins
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
ARG FLASHINFER_GIT_REF="v0.2.14.post1"
ARG ="v0.2.14.post1"
RUN --mount=type=cache,target=/root/.cache/uv \
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \

View File

@ -28,7 +28,7 @@ pyyaml==6.0.2
scipy==1.12.0
setuptools==72.1.0
sympy==1.13.3
tlparse==0.4.0
tlparse==0.3.30
tensorboard==2.13.0
typing-extensions==4.12.2
unittest-xml-reporting<=3.2.0,>=2.0.0

View File

@ -40,7 +40,7 @@ CPU_AARCH64_ARCH = ["cpu-aarch64"]
CPU_S390X_ARCH = ["cpu-s390x"]
CUDA_AARCH64_ARCHES = ["12.9-aarch64", "13.0-aarch64"]
CUDA_AARCH64_ARCHES = ["12.9-aarch64"]
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
@ -113,26 +113,26 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"xpu": (
"intel-cmplr-lib-rt==2025.2.1 | "
"intel-cmplr-lib-ur==2025.2.1 | "
"intel-cmplr-lic-rt==2025.2.1 | "
"intel-sycl-rt==2025.2.1 | "
"oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"onemkl-sycl-blas==2025.2.0 | "
"onemkl-sycl-dft==2025.2.0 | "
"onemkl-sycl-lapack==2025.2.0 | "
"onemkl-sycl-rng==2025.2.0 | "
"onemkl-sycl-sparse==2025.2.0 | "
"dpcpp-cpp-rt==2025.2.1 | "
"intel-opencl-rt==2025.2.1 | "
"mkl==2025.2.0 | "
"intel-openmp==2025.2.1 | "
"tbb==2022.2.0 | "
"tcmlib==1.4.0 | "
"umf==0.11.0 | "
"intel-pti==0.13.1"
"intel-cmplr-lib-rt==2025.1.1 | "
"intel-cmplr-lib-ur==2025.1.1 | "
"intel-cmplr-lic-rt==2025.1.1 | "
"intel-sycl-rt==2025.1.1 | "
"oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"onemkl-sycl-blas==2025.1.0 | "
"onemkl-sycl-dft==2025.1.0 | "
"onemkl-sycl-lapack==2025.1.0 | "
"onemkl-sycl-rng==2025.1.0 | "
"onemkl-sycl-sparse==2025.1.0 | "
"dpcpp-cpp-rt==2025.1.1 | "
"intel-opencl-rt==2025.1.1 | "
"mkl==2025.1.0 | "
"intel-openmp==2025.1.1 | "
"tbb==2022.1.0 | "
"tcmlib==1.3.0 | "
"umf==0.10.0 | "
"intel-pti==0.12.3"
),
}
@ -244,6 +244,8 @@ def generate_libtorch_matrix(
arches.remove("13.0")
elif os == "windows":
arches += CUDA_ARCHES
if "13.0" in arches:
arches.remove("13.0")
if libtorch_variants is None:
libtorch_variants = [
"shared-with-deps",
@ -308,6 +310,8 @@ def generate_wheels_matrix(
arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
elif os == "windows":
arches += CUDA_ARCHES + XPU_ARCHES
if "13.0" in arches:
arches.remove("13.0")
elif os == "linux-aarch64":
# Separate new if as the CPU type is different and
# uses different build/test scripts

View File

@ -27,7 +27,6 @@ from trymerge import (
get_drci_classifications,
gh_get_team_members,
GitHubPR,
iter_issue_timeline_until_comment,
JobCheckState,
main as trymerge_main,
MandatoryChecksMissingError,
@ -35,8 +34,6 @@ from trymerge import (
RE_GHSTACK_DESC,
read_merge_rules,
remove_job_name_suffix,
sha_from_committed_event,
sha_from_force_push_after,
validate_revert,
)
@ -127,7 +124,7 @@ def mock_parse_args(revert: bool = False, force: bool = False) -> Any:
self.force = force
self.pr_num = 76123
self.dry_run = True
self.comment_id = 12345 # Set to non-zero value
self.comment_id = 0
self.reason = "this is for testing"
self.ignore_current = False
self.check_mergeability = False
@ -155,9 +152,9 @@ def mock_revert(
def mock_merge(
pr: GitHubPR,
repo: GitRepo,
comment_id: int,
dry_run: bool = False,
skip_mandatory_checks: bool = False,
comment_id: Optional[int] = None,
timeout_minutes: int = 400,
stale_pr_days: int = 3,
ignore_current: bool = False,
@ -473,9 +470,9 @@ class TestTryMerge(TestCase):
mock_merge.assert_called_once_with(
mock.ANY,
mock.ANY,
comment_id=mock.ANY,
dry_run=mock.ANY,
skip_mandatory_checks=True,
comment_id=mock.ANY,
ignore_current=False,
)
@ -488,9 +485,9 @@ class TestTryMerge(TestCase):
mock_merge.assert_called_once_with(
mock.ANY,
mock.ANY,
comment_id=mock.ANY,
dry_run=mock.ANY,
skip_mandatory_checks=False,
comment_id=mock.ANY,
ignore_current=False,
)
@ -1141,176 +1138,5 @@ Pull Request resolved: https://github.com/pytorch/pytorch/pull/154394"""
)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch(
"trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
)
class TestTimelineFunctions(TestCase):
"""Tests for the new timeline-related functions"""
def test_sha_from_committed_event(self, *args: Any) -> None:
"""Test extracting SHA from committed event"""
# Based on actual GitHub API format - committed events have "sha" at top level
event = {
"event": "committed",
"sha": "fb21ce932ded6670c918804a0d9151b773770a7c",
}
self.assertEqual(
sha_from_committed_event(event), "fb21ce932ded6670c918804a0d9151b773770a7c"
)
# Test with missing SHA
event_no_sha = {"event": "committed"}
self.assertIsNone(sha_from_committed_event(event_no_sha))
def test_sha_from_force_push_after(self, *args: Any) -> None:
"""Test extracting SHA from force push event"""
# NOTE: The current function doesn't handle the actual GitHub API format
# Real force push events have "commit_id" at top level, but this function
# looks for "after", "after_commit", "after_sha", or "head_sha" fields
# Test with the legacy format the current function handles
event_legacy = {
"event": "head_ref_force_pushed",
"after": {"sha": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e"},
}
self.assertEqual(
sha_from_force_push_after(event_legacy),
"ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
)
# Test with current GitHub API format (should return None with current implementation)
event_real_api = {
"event": "head_ref_force_pushed",
"commit_id": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
}
self.assertEqual(
sha_from_force_push_after(event_real_api),
"ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
) # Current function doesn't handle commit_id
# Test with missing SHA
event_no_sha = {"event": "head_ref_force_pushed"}
self.assertIsNone(sha_from_force_push_after(event_no_sha))
@mock.patch("trymerge.gh_fetch_json_list")
def test_iter_issue_timeline_until_comment(
self, mock_gh_fetch_json_list: Any, *args: Any
) -> None:
"""Test timeline iteration until target comment"""
# Mock timeline data based on actual GitHub API format
timeline_data = [
{"event": "commented", "id": 100, "body": "first comment"},
{"event": "committed", "sha": "fb21ce932ded6670c918804a0d9151b773770a7c"},
{"event": "commented", "id": 200, "body": "target comment"},
{"event": "commented", "id": 300, "body": "after target"},
]
mock_gh_fetch_json_list.return_value = timeline_data
# Test iteration stops at target comment
events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 200))
self.assertEqual(len(events), 3) # Should stop at target comment
self.assertEqual(events[0]["event"], "commented")
self.assertEqual(events[0]["id"], 100)
self.assertEqual(events[1]["event"], "committed")
self.assertEqual(events[1]["sha"], "fb21ce932ded6670c918804a0d9151b773770a7c")
self.assertEqual(events[2]["event"], "commented")
self.assertEqual(events[2]["id"], 200)
@mock.patch("trymerge.gh_fetch_json_list")
def test_iter_issue_timeline_until_comment_not_found(
self, mock_gh_fetch_json_list: Any, *args: Any
) -> None:
"""Test timeline iteration when target comment is not found"""
# Mock empty timeline
mock_gh_fetch_json_list.return_value = []
events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 999))
self.assertEqual(len(events), 0)
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_commit_after_comment(
self, mock_iter_timeline: Any, *args: Any
) -> None:
"""Test get_commit_sha_at_comment returns correct SHA after comment"""
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "committed", "sha": "commit2"},
{"event": "commented", "id": 100},
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertEqual(sha, "commit2")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_force_push_before_comment(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "committed", "sha": "commit2"},
{"event": "head_ref_force_pushed", "commit_id": "commit3"},
{"event": "commented", "id": 100},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertEqual(sha, "commit3")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_force_push_before_comment_legacy_mode(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "committed", "sha": "commit2"},
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
{"event": "commented", "id": 100},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertEqual(sha, "commit3")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_multiple_comments(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "commented", "id": 100},
{"event": "committed", "sha": "commit2"},
{"event": "commented", "id": 200},
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
{"event": "commented", "id": 300},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(200)
self.assertEqual(sha, "commit2")
sha = pr.get_commit_sha_at_comment(300)
self.assertEqual(sha, "commit3")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_no_events(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "commented", "id": 100},
{"event": "labeled", "label": {"name": "test"}},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertIsNone(sha)
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_exception(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.side_effect = Exception("API error")
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertIsNone(sha)
if __name__ == "__main__":
main()

View File

@ -450,63 +450,6 @@ HAS_NO_CONNECTED_DIFF_TITLE = (
IGNORABLE_FAILED_CHECKS_THESHOLD = 10
def iter_issue_timeline_until_comment(
org: str, repo: str, issue_number: int, target_comment_id: int, max_pages: int = 200
) -> Any:
"""
Yield timeline entries in order until (and including) the entry whose id == target_comment_id
for a 'commented' event. Stops once the target comment is encountered.
"""
page = 1
while page <= max_pages:
url = (
f"https://api.github.com/repos/{org}/{repo}/issues/{issue_number}/timeline"
)
params = {"per_page": 100, "page": page}
batch = gh_fetch_json_list(url, params)
if not batch:
return
for ev in batch:
# The target is the issue comment row with event == "commented" and id == issue_comment_id
if ev.get("event") == "commented" and ev.get("id") == target_comment_id:
yield ev # nothing in the timeline after this matters, so stop early
return
yield ev
if len(batch) < 100:
return
page += 1
# If we got here without finding the comment, then we either hit a bug or some github PR
# has a _really_ long timeline.
# The max # of pages found on any pytorch/pytorch PR at the time of this change was 41
raise RuntimeError(
f"Could not find a merge commit in the first {max_pages} pages of the timeline at url {url}."
f"This is most likely a bug, please report it to the @pytorch/pytorch-dev-infra team."
)
def sha_from_committed_event(ev: dict[str, Any]) -> Optional[str]:
"""Extract SHA from committed event in timeline"""
return ev.get("sha")
def sha_from_force_push_after(ev: dict[str, Any]) -> Optional[str]:
"""Extract SHA from force push event in timeline"""
# The current GitHub API format
commit_id = ev.get("commit_id")
if commit_id:
return str(commit_id)
# Legacy format
after = ev.get("after") or ev.get("after_commit") or {}
if isinstance(after, dict):
return after.get("sha") or after.get("oid")
return ev.get("after_sha") or ev.get("head_sha")
def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any:
rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no)
return rc["data"]["repository"]["pullRequest"]
@ -794,24 +737,16 @@ class GitHubPR:
def last_commit(self) -> Any:
return self.info["commits"]["nodes"][-1]["commit"]
def last_commit_sha(self, default: Optional[str] = None) -> str:
# for commits, the oid is the sha
if default is None:
return str(self.last_commit()["oid"])
return str(self.last_commit().get("oid", default))
def get_merge_base(self) -> str:
if self.merge_base:
return self.merge_base
last_commit_sha = self.last_commit_sha()
last_commit_oid = self.last_commit()["oid"]
# NB: We could use self.base_ref() here for regular PR, however, that doesn't
# work for ghstack where the base is the custom branch, i.e. gh/USER/ID/base,
# so let's just use main instead
self.merge_base = gh_fetch_merge_base(
self.org, self.project, last_commit_sha, self.default_branch()
self.org, self.project, last_commit_oid, self.default_branch()
)
# Fallback to baseRefOid if the API call fails, i.e. rate limit. Note that baseRefOid
@ -900,44 +835,6 @@ class GitHubPR:
def get_commit_count(self) -> int:
return int(self.info["commits_with_authors"]["totalCount"])
def get_commit_sha_at_comment(self, comment_id: int) -> Optional[str]:
"""
Get the PR head commit SHA that was present when a specific comment was posted.
This ensures we only merge the state of the PR at the time the merge command was issued,
not any subsequent commits that may have been pushed after.
Returns None if no head-changing events found before the comment or if the comment was not found.
"""
head = None
try:
for event in iter_issue_timeline_until_comment(
self.org, self.project, self.pr_num, comment_id
):
etype = event.get("event")
if etype == "committed":
sha = sha_from_committed_event(event)
if sha:
head = sha
print(f"Timeline: Found commit event for SHA {sha}")
elif etype == "head_ref_force_pushed":
sha = sha_from_force_push_after(event)
if sha:
head = sha
print(f"Timeline: Found force push event for SHA {sha}")
elif etype == "commented":
if event.get("id") == comment_id:
print(f"Timeline: Found final comment with sha {sha}")
return head
except Exception as e:
print(
f"Warning: Failed to reconstruct timeline for comment {comment_id}: {e}"
)
return None
print(f"Did not find comment with id {comment_id} in the PR timeline")
return None
def get_pr_creator_login(self) -> str:
return cast(str, self.info["author"]["login"])
@ -1254,7 +1151,7 @@ class GitHubPR:
*,
skip_mandatory_checks: bool = False,
dry_run: bool = False,
comment_id: int,
comment_id: Optional[int] = None,
ignore_current_checks: Optional[list[str]] = None,
) -> None:
# Raises exception if matching rule is not found
@ -1270,7 +1167,7 @@ class GitHubPR:
skip_internal_checks=can_skip_internal_checks(self, comment_id),
ignore_current_checks=ignore_current_checks,
)
additional_merged_prs = self.merge_changes_locally(
additional_merged_prs = self.merge_changes(
repo, skip_mandatory_checks, comment_id
)
@ -1299,7 +1196,7 @@ class GitHubPR:
broken_trunk_checks=ignorable_checks.get("BROKEN_TRUNK", []),
flaky_checks=ignorable_checks.get("FLAKY", []),
unstable_checks=ignorable_checks.get("UNSTABLE", []),
last_commit_sha=self.last_commit_sha(default=""),
last_commit_sha=self.last_commit().get("oid", ""),
merge_base_sha=self.get_merge_base(),
merge_commit_sha=merge_commit_sha,
is_failed=False,
@ -1320,7 +1217,7 @@ class GitHubPR:
dry_run=dry_run,
)
def merge_changes_locally(
def merge_changes(
self,
repo: GitRepo,
skip_mandatory_checks: bool = False,
@ -1329,15 +1226,27 @@ class GitHubPR:
skip_all_rule_checks: bool = False,
) -> list["GitHubPR"]:
"""
:param skip_all_rule_checks: If true, skips all rule checks on ghstack PRs, useful for dry-running merge locally
:param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally
"""
branch_to_merge_into = self.default_branch() if branch is None else branch
if repo.current_branch() != branch_to_merge_into:
repo.checkout(branch_to_merge_into)
if not self.is_ghstack_pr():
msg = self.gen_commit_message()
pr_branch_name = f"__pull-request-{self.pr_num}__init__"
repo.fetch(self.last_commit()["oid"], pr_branch_name)
repo._run_git("merge", "--squash", pr_branch_name)
repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
# It's okay to skip the commit SHA check for ghstack PRs since
# authoring requires write access to the repo.
if self.is_ghstack_pr():
# Did the PR change since we started the merge?
pulled_sha = repo.show_ref(pr_branch_name)
latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
if pulled_sha != latest_pr_status.last_commit()["oid"]:
raise RuntimeError(
"PR has been updated since CI checks last passed. Please rerun the merge command."
)
return []
else:
return self.merge_ghstack_into(
repo,
skip_mandatory_checks,
@ -1345,48 +1254,6 @@ class GitHubPR:
skip_all_rule_checks=skip_all_rule_checks,
)
msg = self.gen_commit_message()
pr_branch_name = f"__pull-request-{self.pr_num}__init__"
# Determine which commit SHA to merge
commit_to_merge = None
if not comment_id:
raise ValueError("Must provide --comment-id when merging regular PRs")
# Get the commit SHA that was present when the comment was made
commit_to_merge = self.get_commit_sha_at_comment(comment_id)
if not commit_to_merge:
raise RuntimeError(
f"Could not find commit that was pushed before comment {comment_id}"
)
# Validate that this commit is the latest commit on the PR
latest_commit = self.last_commit_sha()
if commit_to_merge != latest_commit:
raise RuntimeError(
f"Commit {commit_to_merge} was HEAD when comment {comment_id} was posted "
f"but now the latest commit on the PR is {latest_commit}. "
f"Please re-issue the merge command to merge the latest commit."
)
print(f"Merging commit {commit_to_merge} locally")
repo.fetch(commit_to_merge, pr_branch_name)
repo._run_git("merge", "--squash", pr_branch_name)
repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
# Did the PR change since we started the merge?
pulled_sha = repo.show_ref(pr_branch_name)
latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
if (
pulled_sha != latest_pr_status.last_commit_sha()
or pulled_sha != commit_to_merge
):
raise RuntimeError(
"PR has been updated since CI checks last passed. Please rerun the merge command."
)
return []
class MergeRuleFailedError(RuntimeError):
def __init__(self, message: str, rule: Optional["MergeRule"] = None) -> None:
@ -1591,7 +1458,7 @@ def find_matching_merge_rule(
pending_checks = []
failed_checks = []
hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit_sha()}"
hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit()['oid']}"
if len(failed_checks) > 0:
if reject_reason_score < 30000:
reject_reason_score = 30000
@ -2289,14 +2156,14 @@ def categorize_checks(
def merge(
pr: GitHubPR,
repo: GitRepo,
comment_id: int,
dry_run: bool = False,
skip_mandatory_checks: bool = False,
comment_id: Optional[int] = None,
timeout_minutes: int = 400,
stale_pr_days: int = 3,
ignore_current: bool = False,
) -> None:
initial_commit_sha = pr.last_commit_sha()
initial_commit_sha = pr.last_commit()["oid"]
pr_link = f"https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num}"
print(f"Attempting merge of {initial_commit_sha} ({pr_link})")
@ -2367,7 +2234,7 @@ def merge(
f"Attempting merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} ({elapsed_time / 60} minutes elapsed)"
)
pr = GitHubPR(pr.org, pr.project, pr.pr_num)
if initial_commit_sha != pr.last_commit_sha():
if initial_commit_sha != pr.last_commit()["oid"]:
raise RuntimeError(
"New commits were pushed while merging. Please rerun the merge command."
)
@ -2534,7 +2401,7 @@ def main() -> None:
if args.check_mergeability:
if pr.is_ghstack_pr():
get_ghstack_prs(repo, pr) # raises error if out of sync
pr.merge_changes_locally(
pr.merge_changes(
repo,
skip_mandatory_checks=True,
skip_all_rule_checks=True,
@ -2549,18 +2416,12 @@ def main() -> None:
gh_post_pr_comment(org, project, args.pr_num, message, dry_run=args.dry_run)
return
try:
# Ensure comment id is set, else fail
if not args.comment_id:
raise ValueError(
"Comment ID is required for merging PRs, please provide it using --comment-id"
)
merge(
pr,
repo,
comment_id=args.comment_id,
dry_run=args.dry_run,
skip_mandatory_checks=args.force,
comment_id=args.comment_id,
ignore_current=args.ignore_current,
)
except Exception as e:
@ -2582,7 +2443,7 @@ def main() -> None:
broken_trunk_checks=[],
flaky_checks=[],
unstable_checks=[],
last_commit_sha=pr.last_commit_sha(default=""),
last_commit_sha=pr.last_commit().get("oid", ""),
merge_base_sha=pr.get_merge_base(),
is_failed=True,
skip_mandatory_checks=args.force,
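
To make the comment-pinned flow above easier to follow, a condensed sketch of the validation it performs, assuming a GitHubPR `pr` and a GitRepo `repo` as defined in trymerge.py; this is illustrative, not a drop-in replacement:

```python
def merge_exact_comment_state(pr, repo, comment_id: int) -> None:
    # Reconstruct which commit was the PR HEAD when the merge comment was posted.
    commit_to_merge = pr.get_commit_sha_at_comment(comment_id)
    if not commit_to_merge:
        raise RuntimeError(f"Could not find commit that was pushed before comment {comment_id}")
    # Refuse to merge anything pushed after the comment.
    if commit_to_merge != pr.last_commit_sha():
        raise RuntimeError("PR was updated after the merge command; please re-issue the merge command.")
    pr_branch_name = f"__pull-request-{pr.pr_num}__init__"
    repo.fetch(commit_to_merge, pr_branch_name)
    repo._run_git("merge", "--squash", pr_branch_name)
    repo._run_git("commit", f'--author="{pr.get_author()}"', "-m", pr.gen_commit_message())
```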

View File

@ -4,7 +4,7 @@
{%- set download_artifact_action = "actions/download-artifact@v4.1.7" -%}
{%- set timeout_minutes = 240 -%}
{%- set timeout_minutes_windows_binary = 360 -%}
{%- set timeout_minutes_windows_binary = 300 -%}
{%- macro concurrency(build_environment) -%}
concurrency:

View File

@ -77,7 +77,6 @@ jobs:
run: |
git config --global core.longpaths true
git config --global core.symlinks true
git config --global core.ignorecase false
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported

View File

@ -70,7 +70,6 @@ jobs:
run: |
git config --global core.longpaths true
git config --global core.symlinks true
git config --global core.ignorecase false
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported

View File

@ -275,7 +275,7 @@ jobs:
- name: Change permissions
if: ${{ always() && steps.test.conclusion }}
run: |
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1000:1000 test"
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"
- name: Print remaining test logs
shell: bash

View File

@ -145,7 +145,7 @@ jobs:
fi
docker exec -t "${container_name}" yum install -y zlib-devel zip
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==3.0.1 auditwheel wheel
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==2.13.1 auditwheel wheel
set +e
docker exec -t "${container_name}" command -v pip
has_pip=$?

View File

@ -50,23 +50,24 @@ jobs:
runner: [linux.12xlarge]
docker-image-name: [
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,
pytorch-linux-jammy-py3.9-clang12,
pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-rocm-n-py3,
pytorch-linux-noble-rocm-n-py3,
pytorch-linux-noble-rocm-alpha-py3,
pytorch-linux-jammy-rocm-n-py3-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12,
pytorch-linux-jammy-py3.10-gcc11,
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
pytorch-linux-jammy-py3.9-gcc11,
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,
pytorch-linux-jammy-py3.12-halide,
pytorch-linux-jammy-xpu-n-1-py3,
pytorch-linux-jammy-xpu-n-py3,
pytorch-linux-jammy-xpu-2025.0-py3,
pytorch-linux-jammy-xpu-2025.1-py3,
pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
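
The docker-image-name matrix above follows a regular naming scheme (distro, accelerator toolchain, Python version, compiler). A small sketch that buckets a sample of the entries visible above by accelerator, the way a reader might when comparing the two sides of this hunk:

from collections import defaultdict

IMAGES = [
    "pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11",
    "pytorch-linux-jammy-rocm-n-py3",
    "pytorch-linux-jammy-xpu-n-py3",
    "pytorch-linux-jammy-py3.10-clang12",
    "pytorch-linux-jammy-py3-clang18-asan",
    "pytorch-linux-jammy-linter",
]

def accelerator(name: str) -> str:
    for key in ("cuda", "rocm", "xpu"):
        if key in name:
            return key
    return "cpu"

buckets: dict[str, list[str]] = defaultdict(list)
for image in IMAGES:
    buckets[accelerator(image)].append(image)

for kind, names in sorted(buckets.items()):
    print(f"{kind}: {len(names)} image(s)")
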

View File

@ -158,52 +158,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
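
PYTORCH_EXTRA_INSTALL_REQUIREMENTS in the jobs above packs many PEP 508 requirement strings into a single value, separated by " | ", each carrying an environment marker. A short sketch of how such a value can be split and evaluated with the packaging library; this is an assumption about one reasonable consumer, the actual code that reads this variable is not shown in the diff:

from packaging.requirements import Requirement

EXTRA = (
    "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64'"
    " | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64'"
)

# Environment of the machine the wheel will be installed on.
env = {"platform_system": "Linux", "platform_machine": "x86_64"}

for chunk in EXTRA.split(" | "):
    req = Requirement(chunk)
    applies = req.marker.evaluate(env) if req.marker else True
    print(f"{req.name}{req.specifier}  applies={applies}")
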
manywheel-py3_11-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -315,52 +269,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -472,52 +380,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -629,52 +491,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -786,52 +602,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -943,52 +713,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -1099,49 +823,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@ -612,7 +612,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-xpu-test: # Testing
@ -1270,7 +1270,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-xpu-test: # Testing
@ -1928,7 +1928,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-xpu-test: # Testing
@ -2586,7 +2586,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-xpu-test: # Testing
@ -3244,7 +3244,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-xpu-test: # Testing
@ -3902,7 +3902,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-xpu-test: # Testing
@ -4560,7 +4560,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-xpu-test: # Testing
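
Each XPU hunk above swaps one pinned Intel runtime stack for another (2025.1.x/2021.15.x on one side, 2025.2.x/2021.16.x on the other). A small sketch that diffs two such pin strings package by package, using the same " | " separator and dropping the environment markers; OLD and NEW are labels for the sketch only, since the rendered diff does not mark which side is which:

def pins(spec: str) -> dict[str, str]:
    """Map package name -> pinned version, ignoring any environment marker."""
    out: dict[str, str] = {}
    for chunk in spec.split(" | "):
        requirement = chunk.split(";")[0].strip()
        name, _, version = requirement.partition("==")
        out[name.strip()] = version.strip()
    return out

OLD = "intel-cmplr-lib-rt==2025.1.1 | mkl==2025.1.0 | tbb==2022.1.0 | umf==0.10.0 | intel-pti==0.12.3"
NEW = "intel-cmplr-lib-rt==2025.2.1 | mkl==2025.2.0 | tbb==2022.2.0 | umf==0.11.0 | intel-pti==0.13.1"

old, new = pins(OLD), pins(NEW)
for name in sorted(old.keys() | new.keys()):
    if old.get(name) != new.get(name):
        print(f"{name}: {old.get(name)} -> {new.get(name)}")
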

View File

@ -51,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -128,7 +128,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -51,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -128,7 +128,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -51,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -124,7 +124,7 @@ jobs:
- wheel-py3_11-cpu-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -198,7 +198,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -271,7 +271,7 @@ jobs:
- wheel-py3_12-cpu-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -345,7 +345,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -418,7 +418,7 @@ jobs:
- wheel-py3_13-cpu-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel

View File

@ -38,7 +38,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -153,7 +153,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -45,7 +45,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -160,7 +160,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -292,7 +292,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -408,7 +408,7 @@ jobs:
- libtorch-cuda12_6-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -542,7 +542,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -658,7 +658,7 @@ jobs:
- libtorch-cuda12_8-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -792,7 +792,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -908,7 +908,7 @@ jobs:
- libtorch-cuda12_9-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -1038,253 +1038,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda13_0-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda13_0-shared-with-deps-debug
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-debug-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda13_0-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda13_0-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-debug-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda13_0-shared-with-deps-debug-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda13_0-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
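
The "Display EC2 information" steps in the jobs above implement IMDSv2: a PUT request mints a short-lived session token, then metadata reads pass that token. A Python sketch of the same two calls, using only the standard library and the fixed 169.254.169.254 endpoint from the script; it only returns data when run on an EC2 instance:

import urllib.request

IMDS = "http://169.254.169.254/latest"

def get_ec2_metadata(category: str) -> str:
    """Fetch one EC2 metadata value using an IMDSv2 session token."""
    token_req = urllib.request.Request(
        f"{IMDS}/api/token",
        method="PUT",
        headers={"X-aws-ec2-metadata-token-ttl-seconds": "30"},
    )
    with urllib.request.urlopen(token_req, timeout=5) as resp:
        token = resp.read().decode()
    meta_req = urllib.request.Request(
        f"{IMDS}/meta-data/{category}",
        headers={"X-aws-ec2-metadata-token": token},
    )
    with urllib.request.urlopen(meta_req, timeout=5) as resp:
        return resp.read().decode()

if __name__ == "__main__":
    for category in ("ami-id", "instance-id", "instance-type"):
        print(f"{category}: {get_ec2_metadata(category)}")
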

View File

@ -38,7 +38,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -153,7 +153,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -45,7 +45,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -160,7 +160,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -292,7 +292,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -408,7 +408,7 @@ jobs:
- libtorch-cuda12_6-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -542,7 +542,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -658,7 +658,7 @@ jobs:
- libtorch-cuda12_8-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -792,7 +792,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -908,7 +908,7 @@ jobs:
- libtorch-cuda12_9-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -1038,253 +1038,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda13_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda13_0-shared-with-deps-release
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda13_0-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts;
# without this value, pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even if the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda13_0-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda13_0-shared-with-deps-release-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts;
# without this value, pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda13_0-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

File diff suppressed because it is too large

View File

@ -18,13 +18,13 @@ permissions:
contents: read
jobs:
inductor-build:
linux-jammy-cpu-py3_9-gcc11-inductor-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: inductor-build
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
# Use metal host for benchmark jobs
test-matrix: |
{ include: [
@ -32,13 +32,13 @@ jobs:
]}
secrets: inherit
inductor-micro-benchmark-test:
name: inductor-micro-benchmark-test
linux-jammy-cpu-py3_9-gcc11-inductor-micro-benchmark-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit
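The renaming pattern above recurs throughout these workflow diffs: the human-readable name keeps dots (linux-jammy-cpu-py3.9-gcc11-inductor) while the job id, which has to be usable as a key in needs: and in ${{ needs.<id>.outputs.* }} expressions, swaps dots for underscores and carries a -build or -test suffix. A tiny illustrative Python helper capturing that assumed convention (not a script that exists in the repo):

def job_id_from_display_name(display_name, suffix):
    # Assumed convention: dots become underscores, role suffix is appended.
    return f"{display_name.replace('.', '_')}-{suffix}"

build_id = job_id_from_display_name("linux-jammy-cpu-py3.9-gcc11-inductor", "build")
print(build_id)  # linux-jammy-cpu-py3_9-gcc11-inductor-build
print(f"docker-image: ${{{{ needs.{build_id}.outputs.docker-image }}}}")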

View File

@ -32,13 +32,13 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
nightly-dynamo-benchmarks-build:
name: nightly-dynamo-benchmarks-build
linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build:
name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
{ include: [
@ -51,13 +51,13 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
nightly-dynamo-benchmarks-test:
name: nightly-dynamo-benchmarks-test
linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-test:
name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: nightly-dynamo-benchmarks-build
needs: linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit

View File

@ -84,8 +84,9 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
# NB: Keep this in sync with trunk.yml
build:
name: build
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -127,7 +128,7 @@ jobs:
secrets: inherit
test-periodically:
name: test-periodically
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '15 0,12 * * 1-6'
@ -144,7 +145,7 @@ jobs:
secrets: inherit
test-weekly:
name: test-weekly
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
@ -161,12 +162,9 @@ jobs:
secrets: inherit
test:
name: test
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
# The pull_request trigger is used in PR to bump transformers pin which always
# needs one round of benchmark
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}
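The dashboard-tag above is assembled from workflow inputs, with || 'true' or || 'false' supplying a default when an input is empty. A hedged Python sketch of how most of that tag composes (field names and defaults mirror the expression above; the composition itself is done by the GitHub expression, not by any repo script):

def build_dashboard_tag(inputs):
    # (field, default) pairs mirroring most of the expression above.
    fields = [
        ("training", "true"), ("inference", "true"), ("default", "true"),
        ("dynamic", "true"), ("cudagraphs", "true"), ("cppwrapper", "false"),
        ("aotinductor", "false"), ("maxautotune", "false"),
        ("freezing_cudagraphs", "false"),
    ]
    # '||' in the GitHub expression falls back to the default when the input is empty.
    return "-".join(f"{name}-{inputs.get(name) or default}" for name, default in fields)

print(build_dashboard_tag({}))                      # all defaults
print(build_dashboard_tag({"cppwrapper": "true"}))  # one override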

View File

@ -48,9 +48,6 @@ jobs:
{ config: "perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" },
{ config: "perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" },
{ config: "perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" },
{ config: "aot_inductor_perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" },
{ config: "aot_inductor_perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" },
{ config: "aot_inductor_perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" },
]}
secrets: inherit

View File

@ -69,14 +69,14 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-zen-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cpu_x86_zen", shard: 1, num_shards: 3, runner: "linux.24xlarge.amd" },
@ -95,16 +95,16 @@ jobs:
selected-test-configs: ${{ inputs.benchmark_configs }}
secrets: inherit
inductor-test-nightly:
name: inductor-test-nightly
linux-jammy-zen-cpu-py3_9-gcc11-inductor-test-nightly:
name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false
@ -112,16 +112,17 @@ jobs:
monitor-data-collect-interval: 4
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-zen-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false

View File

@ -74,14 +74,14 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xl.spr-metal" },
@ -101,16 +101,16 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-test-nightly-freezing:
name: inductor-test-nightly-freezing
linux-jammy-cpu-py3_9-gcc11-inductor-test-nightly-freezing:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false
@ -118,16 +118,16 @@ jobs:
monitor-data-collect-interval: 4
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false

View File

@ -79,6 +79,7 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
# NB: Keep this in sync with trunk.yml
build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml

View File

@ -31,8 +31,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
periodic-dynamo-benchmarks-build:
name: periodic-dynamo-benchmarks-build
linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build:
name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
@ -57,33 +57,23 @@ jobs:
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
periodic-dynamo-benchmarks-test:
name: periodic-dynamo-benchmarks-test
linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-test:
name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
rocm-periodic-dynamo-benchmarks-build:
linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build:
if: github.repository_owner == 'pytorch'
name: rocm-periodic-dynamo-benchmarks-build
name: rocm-py3_10-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
@ -109,21 +99,21 @@ jobs:
]}
secrets: inherit
rocm-periodic-dynamo-benchmarks-test:
linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-test:
permissions:
id-token: write
contents: read
name: rocm-periodic-dynamo-benchmarks-test
name: rocm-py3_10-periodic-dynamo-benchmarks
uses: ./.github/workflows/_rocm-test.yml
needs: rocm-periodic-dynamo-benchmarks-build
needs: linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-rocm-py3_10
docker-image: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
inductor-smoke-build:
name: inductor-smoke-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
@ -139,23 +129,23 @@ jobs:
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
inductor-smoke-test:
name: inductor-smoke-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-test:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: inductor-smoke-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.inductor-smoke-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.test-matrix }}
secrets: inherit
periodic-dynamo-benchmarks-cpu-build:
name: periodic-dynamo-benchmarks-cpu-build
linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build:
name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
{ include: [
@ -170,6 +160,68 @@ jobs:
{ config: "cpu_inductor_freezing_avx2_torchbench", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" },
{ config: "cpu_inductor_freezing_avx2_timm", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" },
{ config: "cpu_inductor_freezing_avx2_timm", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-test:
name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
test-matrix: |
{ include: [
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
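Each test-matrix in these jobs is a JSON include list where a config split across num_shards appears once per shard, with every entry pinned to a runner. A small illustrative Python sketch of reading such a matrix (the real consumer is the reusable _linux-test.yml workflow; this is only for inspection):

import json
from collections import defaultdict

test_matrix = json.loads("""
{ "include": [
  { "config": "dynamic_inductor_timm", "shard": 1, "num_shards": 2, "runner": "linux.g5.4xlarge.nvidia.gpu" },
  { "config": "dynamic_inductor_timm", "shard": 2, "num_shards": 2, "runner": "linux.g5.4xlarge.nvidia.gpu" },
  { "config": "aot_inductor_huggingface", "shard": 1, "num_shards": 1, "runner": "linux.g5.4xlarge.nvidia.gpu" }
]}
""")

shards_per_config = defaultdict(list)
for entry in test_matrix["include"]:
    shards_per_config[entry["config"]].append(entry["shard"])

for config, shards in sorted(shards_per_config.items()):
    print(f"{config}: shards {sorted(shards)}")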
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build
test-matrix: |
{ include: [
{ config: "cpu_inductor_freezing_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" },
{ config: "cpu_inductor_freezing_timm", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
{ config: "cpu_inductor_freezing_timm", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
@ -195,12 +247,12 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
periodic-dynamo-benchmarks-cpu-test:
name: periodic-dynamo-benchmarks-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -28,8 +28,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -47,18 +47,44 @@ jobs:
]}
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
inductor-halide-build:
name: inductor-halide-build
linux-jammy-cuda12_8-py3_12-gcc9-inductor-build:
name: cuda12.8-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_12-gcc9-inductor-test:
name: cuda12.8-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_12-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cpu-py3_12-inductor-halide-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -71,18 +97,18 @@ jobs:
]}
secrets: inherit
inductor-halide-test:
name: inductor-halide-test
linux-jammy-cpu-py3_12-inductor-halide-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-test.yml
needs: inductor-halide-build
needs: linux-jammy-cpu-py3_12-inductor-halide-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.inductor-halide-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }}
secrets: inherit
inductor-triton-cpu-build:
name: inductor-triton-cpu-build
linux-jammy-cpu-py3_12-inductor-triton-cpu-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -95,23 +121,23 @@ jobs:
]}
secrets: inherit
inductor-triton-cpu-test:
linux-jammy-cpu-py3_12-inductor-triton-cpu-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu
uses: ./.github/workflows/_linux-test.yml
needs: inductor-triton-cpu-build
needs: linux-jammy-cpu-py3_12-inductor-triton-cpu-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.inductor-triton-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-triton-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.test-matrix }}
secrets: inherit
inductor-cpu-build:
name: inductor-cpu-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
@ -122,12 +148,37 @@ jobs:
]}
secrets: inherit
inductor-cpu-test:
name: inductor-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_13-gcc9-inductor-build:
name: cuda12.8-py3.13-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_13-gcc9-inductor-test:
name: cuda12.8-py3.13-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_13-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -44,8 +44,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -53,6 +53,7 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
test-matrix: |
{ include: [
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
@ -64,24 +65,25 @@ jobs:
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
inductor-cpu-build:
name: inductor-cpu-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build
test-matrix: |
{ include: [
{ config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
@ -96,12 +98,12 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-cpu-test:
name: inductor-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -42,8 +42,8 @@ jobs:
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-jammy-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
secrets: inherit
docs-push:

View File

@ -24,38 +24,38 @@ permissions:
contents: read
jobs:
opbenchmark-build:
linux-jammy-cpu-py3_9-gcc11-opbenchmark-build:
if: github.repository_owner == 'pytorch'
name: opbenchmark-build
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets: inherit
opbenchmark-on-demand-build:
linux-jammy-cpu-py3_9-gcc11-opbenchmark-on-demand-build:
if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }}
name: opbenchmark-on-demand-build
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets: inherit
opbenchmark-test:
name: opbenchmark-test
linux-jammy-cpu-py3_9-gcc11-opbenchmark-test:
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-test.yml
needs: opbenchmark-build
needs: linux-jammy-cpu-py3_9-gcc11-opbenchmark-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.test-matrix }}
secrets: inherit

View File

@ -170,38 +170,6 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc11-build:
name: linux-jammy-cuda13.0-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
cuda-arch-list: 7.5
build-environment: linux-jammy-cuda13.0-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc11-test:
name: linux-jammy-cuda13.0-py3.10-gcc11
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda13_0-py3_10-gcc11-build
- target-determination
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml

View File

@ -49,14 +49,14 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
linux-jammy-py3_10-gcc11-build:
name: linux-jammy-py3.10-gcc11
linux-jammy-py3_9-gcc11-build:
name: linux-jammy-py3.9-gcc11
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
@ -73,49 +73,49 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_10-gcc11-test:
name: linux-jammy-py3.10-gcc11
linux-jammy-py3_9-gcc11-test:
name: linux-jammy-py3.9-gcc11
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_10-gcc11-build
- linux-jammy-py3_9-gcc11-build
- target-determination
with:
build-environment: linux-jammy-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-docs:
name: linux-docs
uses: ./.github/workflows/_docs.yml
needs: linux-jammy-py3_10-gcc11-build
needs: linux-jammy-py3_9-gcc11-build
with:
build-environment: linux-jammy-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }}
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }}
secrets: inherit
linux-jammy-py3_10-gcc11-no-ops:
name: linux-jammy-py3.10-gcc11-no-ops
linux-jammy-py3_9-gcc11-no-ops:
name: linux-jammy-py3.9-gcc11-no-ops
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11-no-ops
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11-no-ops
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
]}
secrets: inherit
linux-jammy-py3_10-gcc11-pch:
name: linux-jammy-py3.10-gcc11-pch
linux-jammy-py3_9-gcc11-pch:
name: linux-jammy-py3.9-gcc11-pch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11-pch
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11-pch
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
@ -183,14 +183,14 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_10-clang12-build:
name: linux-jammy-py3.10-clang12
linux-jammy-py3_9-clang12-build:
name: linux-jammy-py3.9-clang12
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12
build-environment: linux-jammy-py3.9-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
@ -207,16 +207,16 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_10-clang12-test:
name: linux-jammy-py3.10-clang12
linux-jammy-py3_9-clang12-test:
name: linux-jammy-py3.9-clang12
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_10-clang12-build
- linux-jammy-py3_9-clang12-build
- target-determination
with:
build-environment: linux-jammy-py3.10-clang12
docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.9-clang12
docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_13-clang12-build:
@ -253,14 +253,14 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-cudnn9-py3_10-clang12-build:
name: linux-jammy-cuda12.8-cudnn9-py3.10-clang12
linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build:
name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-cudnn9-py3.10-clang12
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12
build-environment: linux-jammy-cuda12.8-cudnn9-py3.9-clang12
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
@ -282,14 +282,14 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_10-gcc11-mobile-lightweight-dispatch-build:
name: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build
linux-jammy-py3_9-gcc11-mobile-lightweight-dispatch-build:
name: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
build-generates-artifacts: false
test-matrix: |
{ include: [
@ -342,40 +342,15 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
linux-jammy-xpu-2025_1-py3_9-build:
name: linux-jammy-xpu-2025.1-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-xpu-n-py3_9-build:
name: linux-jammy-xpu-n-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-n-build
sync-tag: linux-xpu-2025-1-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
build-environment: linux-jammy-xpu-2025.1-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" },

View File

@ -78,14 +78,14 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_10-clang12-build:
name: linux-jammy-py3.10-clang12
linux-jammy-py3_9-clang12-build:
name: linux-jammy-py3.9-clang12
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12
build-environment: linux-jammy-py3.9-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
@ -93,16 +93,16 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_10-clang12-test:
name: linux-jammy-py3.10-clang12
linux-jammy-py3_9-clang12-test:
name: linux-jammy-py3.9-clang12
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_10-clang12-build
- linux-jammy-py3_9-clang12-build
- target-determination
with:
build-environment: linux-jammy-py3.10-clang12
docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.9-clang12
docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-build:

View File

@ -30,7 +30,7 @@ jobs:
name: Test check_binary.sh for Linux CUDA
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.g4dn.4xlarge.nvidia.gpu
runner: linux.4xlarge.nvidia.gpu
docker-image: python:3.11
docker-build-dir: "skip-docker-build"
script: |

View File

@ -224,12 +224,13 @@ jobs:
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
secrets: inherit
inductor-build:
name: inductor-build
# NB: Keep this in sync with inductor-perf-test-nightly.yml
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
secrets: inherit
@ -241,7 +242,7 @@ jobs:
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },

View File

@ -59,19 +59,22 @@ jobs:
# on the PR appear in chronological order (timing issues can shuffle them around)
sleep 60
fi
# Require a comment id for merge operations
if [ -z "${COMMENT_ID}" ]; then
echo "Error: merge requires COMMENT_ID to be specified"
exit 1
fi
if [ -n "${FORCE}" ]; then
python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}"
if [ -n "${COMMENT_ID}" ]; then
python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
python3 .github/scripts/trymerge.py --force "${PR_NUM}"
fi
elif [ -n "${IGNORE_CURRENT}" ]; then
python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
if [ -n "${COMMENT_ID}" ]; then
python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
python3 .github/scripts/trymerge.py --ignore-current "${PR_NUM}"
fi
elif [ -n "${COMMENT_ID}" ]; then
python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
python3 .github/scripts/trymerge.py "${PR_NUM}"
fi
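Both variants of the step above end up invoking .github/scripts/trymerge.py with some combination of --force, --ignore-current, and --comment-id. A hedged Python sketch of the argument construction for the variant in which COMMENT_ID is optional (illustrative; the authoritative logic is the bash above, and the PR number below is a placeholder):

def trymerge_args(pr_num, force=False, ignore_current=False, comment_id=""):
    args = ["python3", ".github/scripts/trymerge.py"]
    if force:
        args.append("--force")
    elif ignore_current:
        args.append("--ignore-current")
    if comment_id:
        args += ["--comment-id", comment_id]
    args.append(pr_num)
    return args

print(trymerge_args("12345", force=True, comment_id="678"))  # force-merge via a comment
print(trymerge_args("12345", ignore_current=True))           # ignore currently failing checks
print(trymerge_args("12345"))                                # plain merge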
- name: Comment on Canceled
if: ${{ cancelled() && steps.checkout.outcome == 'success' }}

View File

@ -6,7 +6,8 @@ on:
- ciflow/vllm/*
workflow_dispatch:
schedule:
- cron: '0 */8 * * *' # every 8 hours at minute 0 (UTC)
# Every 12 hours starting at 00:00 UTC (00:00 and 12:00)
- cron: '0 0,12 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
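The new cron expression '0 0,12 * * *' fires at minute 0 of hours 0 and 12 UTC, i.e. twice a day, whereas '0 */8 * * *' fired every 8 hours. A minimal Python check of which UTC hours each hour field matches (a toy parser for illustration, not a full cron implementation):

def matching_hours(hour_field):
    # Expand a cron hour field such as '0,12' or '*/8' into matching hours 0-23.
    hours = set()
    for part in hour_field.split(","):
        if part.startswith("*/"):            # step values, e.g. */8
            hours.update(range(0, 24, int(part[2:])))
        elif part == "*":
            hours.update(range(24))
        else:
            hours.add(int(part))
    return sorted(hours)

print(matching_hours("*/8"))   # [0, 8, 16]  -> every 8 hours
print(matching_hours("0,12"))  # [0, 12]     -> twice a day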

View File

@ -4,9 +4,6 @@ on:
push:
tags:
- ciflow/win-arm64/*
schedule:
# Every 4 hours starting at 00:00 UTC
- cron: '0 */4 * * *'
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

View File

@ -26,15 +26,15 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-xpu-n-1-py3_10-build:
name: linux-jammy-xpu-n-1-py3.10
linux-jammy-xpu-2025_0-py3_9-build:
name: linux-jammy-xpu-2025.0-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-n-1-build
sync-tag: linux-xpu-2025-0-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-1-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3
build-environment: linux-jammy-xpu-2025.0-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.0-py3
runner: linux.12xlarge
test-matrix: |
{ include: [
@ -47,62 +47,60 @@ jobs:
]}
secrets: inherit
linux-jammy-xpu-n-py3_10-build:
name: linux-jammy-xpu-n-py3.10
linux-jammy-xpu-2025_1-py3_9-build:
name: linux-jammy-xpu-2025.1-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-n-build
sync-tag: linux-xpu-2025-1-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
build-environment: linux-jammy-xpu-2025.1-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3
runner: linux.12xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
]}
secrets: inherit
linux-jammy-xpu-n-py3_10-test:
name: linux-jammy-xpu-n-py3.10
linux-jammy-xpu-2025_1-py3_9-test:
name: linux-jammy-xpu-2025.1-py3.9
uses: ./.github/workflows/_xpu-test.yml
needs: linux-jammy-xpu-n-py3_10-build
needs: linux-jammy-xpu-2025_1-py3_9-build
permissions:
id-token: write
contents: read
with:
build-environment: linux-jammy-xpu-n-py3.10
docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }}
build-environment: linux-jammy-xpu-2025.1-py3.9
docker-image: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.test-matrix }}
secrets: inherit
windows-xpu-n-1-build:
windows-xpu-2025_0-build:
if: github.repository_owner == 'pytorch'
name: win-vs2022-xpu-n-1-py3
name: win-vs2022-xpu-2025_0-py3
uses: ./.github/workflows/_win-build.yml
with:
build-environment: win-vs2022-xpu-n-1-py3
build-environment: win-vs2022-xpu-py3
cuda-version: cpu
use-xpu: true
xpu-version: '2025.0'
vc-year: '2022'
secrets: inherit
windows-xpu-2025_1-build:
if: github.repository_owner == 'pytorch'
name: win-vs2022-xpu-2025_1-py3
uses: ./.github/workflows/_win-build.yml
with:
build-environment: win-vs2022-xpu-py3
cuda-version: cpu
use-xpu: true
xpu-version: '2025.1'
vc-year: '2022'
secrets: inherit
windows-xpu-n-build:
if: github.repository_owner == 'pytorch'
name: win-vs2022-xpu-n-py3
uses: ./.github/workflows/_win-build.yml
with:
build-environment: win-vs2022-xpu-n-py3
cuda-version: cpu
use-xpu: true
xpu-version: '2025.2'
vc-year: '2022'
secrets: inherit

View File

@ -583,7 +583,7 @@ exclude_patterns = [
command = [
'python3',
'tools/linter/adapters/grep_linter.py',
'--pattern=#include <pybind11\/(^|[^(gil_simple\.h)])',
'--pattern=#include <pybind11\/(^|[^(gil\.h)])',
'--allowlist-pattern=#include <torch\/csrc\/utils\/pybind.h>',
'--linter-name=PYBIND11_INCLUDE',
'--match-first-only',
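The PYBIND11_INCLUDE linter above is a grep-style check: the --pattern regex flags direct pybind11 includes, the character class after pybind11\/ is meant to exempt the single allowed header (gil.h on one side of this diff, gil_simple.h on the other, though any header whose first character falls in that class would also slip through), and --allowlist-pattern skips files that already include torch/csrc/utils/pybind.h. A rough Python illustration of how the gil.h form of the pattern behaves (an approximation only; the real check lives in tools/linter/adapters/grep_linter.py):

import re

pattern = re.compile(r"#include <pybind11\/(^|[^(gil\.h)])")

lines = [
    "#include <pybind11/pybind11.h>",        # flagged: direct pybind11 include
    "#include <pybind11/stl.h>",             # flagged
    "#include <pybind11/gil.h>",             # not flagged: 'g' is in the excluded class
    "#include <torch/csrc/utils/pybind.h>",  # not flagged (and also allowlisted)
]

for line in lines:
    print("MATCH   " if pattern.search(line) else "no match", line)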
@ -1801,26 +1801,3 @@ command = [
"python3",
"tools/linter/adapters/gb_registry_linter.py",
]
[[linter]]
code = 'DISTRIBUTED_C10D_DIRECT_ACCESS'
include_patterns = ['**/*.py']
exclude_patterns = [
'torch/distributed/_distributed_c10d.py',
'fb/**',
'**/fb/**',
]
command = [
'python3',
'tools/linter/adapters/grep_linter.py',
'--pattern=torch\._C\._distributed_c10d',
'--linter-name=DISTRIBUTED_C10D_DIRECT_ACCESS',
'--error-name=direct access to torch._C._distributed_c10d',
"""--error-description=\
Never access torch._C._distributed_c10d directly in code. Always \
import from and use torch.distributed._distributed_c10d which is \
guaranteed to have all functions available\
""",
'--',
'@{{PATHSFILE}}'
]

View File

@ -22,6 +22,7 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-DUSE_DISTRIBUTED",
"-DAT_PER_OPERATOR_HEADERS",
"-DATEN_THREADING=NATIVE",
"-DNO_CUDNN_DESTROY_HANDLE",
@ -746,7 +747,6 @@ cc_library(
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp",
"torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp",
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
],
)) + torch_sources,

View File

@ -181,9 +181,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
set(CPU_POWER ON)
endif()
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
if(NOT LINUX AND NOT WIN32)
set(USE_DISTRIBUTED
OFF
@ -262,18 +261,18 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL"
OFF)
cmake_dependent_option(USE_NVSHMEM "Use NVSHMEM" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
"USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
option(USE_NNAPI "Use NNAPI" OFF)
option(USE_NNPACK "Use NNPACK" ON)
cmake_dependent_option(USE_NUMA "Use NUMA. Only available on Linux." ON "LINUX"
@ -431,10 +430,11 @@ if(WIN32)
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
if(NOT libuv_tmp_LIBRARY)
set(USE_DISTRIBUTED OFF)
set(USE_GLOO OFF)
message(
WARNING
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
)
else()

View File

@ -216,7 +216,7 @@ file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention
if(USE_CUDA AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
add_library(flash_attention OBJECT EXCLUDE_FROM_ALL ${flash_attention_cuda_kernels_cu} ${flash_attention_cuda_cpp})
target_include_directories(flash_attention SYSTEM PUBLIC
target_include_directories(flash_attention PUBLIC
${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc
${PROJECT_SOURCE_DIR}/third_party/flash-attention/include
${PROJECT_SOURCE_DIR}/third_party/cutlass/include

View File

@ -1,18 +1,5 @@
#pragma once
// See https://github.com/pytorch/pytorch/issues/161660
// This compile flag is intended to be passed in to CppExtensions that rely on
// the stable ABI via the `extra_compile_args` argument. This is a stopgap
// solution to ensure that non-stable libtorch APIs are not used in the extension.
// The long term solution is to have a torch_stable target that excludes headers
// that are not in torch/stable or torch/headeronly.
// See test/cpp_extensions/torch_stable_test_extension/setup.py for an example
// of how this is used.
#ifdef TORCH_STABLE_ONLY
#error \
"TensorBase.h should not be included when TORCH_STABLE_ONLY compile flag is passed"
#endif
#include <c10/core/Device.h>
#include <c10/core/Layout.h>
#include <c10/core/MemoryFormat.h>

View File

@ -15,7 +15,7 @@ std::enable_if_t<
std::is_base_of_v<Base, Child>,
std::unique_ptr<Base>>
make_unique_base(Args&&... args) {
return std::make_unique<Child>(std::forward<Args>(args)...);
return std::unique_ptr<Base>(new Child(std::forward<Args>(args)...));
}
} // namespace detail
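For reference, both spellings toggled in the hunk above do the same thing: construct a `Child` and hand back ownership typed as the `Base` class. A self-contained sketch (the toy `Base`/`Child` types here are assumptions, not PyTorch code):

```cpp
#include <memory>
#include <type_traits>
#include <utility>

struct Base { virtual ~Base() = default; };
struct Child : Base { explicit Child(int v) : value(v) {} int value; };

// Same shape as the helper in the hunk above: construct Child, return as Base.
template <class B, class C, class... Args>
std::enable_if_t<std::is_base_of_v<B, C>, std::unique_ptr<B>>
make_unique_base(Args&&... args) {
  // std::make_unique<C> and std::unique_ptr<B>(new C(...)) are equivalent here;
  // the unique_ptr<C> produced by make_unique converts implicitly to unique_ptr<B>.
  return std::make_unique<C>(std::forward<Args>(args)...);
}

int main() {
  std::unique_ptr<Base> b = make_unique_base<Base, Child>(42);
  return 0;
}
```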

View File

@ -185,17 +185,6 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
// right: "lro, summed, ro" permuted with rpermutation and the three flattened
// then the permuted output is a view of bmm(left, right)
// finally, opermutation reverts the permutation to the original order of dimensions
// By default the output is "lro, lo, 1-for-summed-dims, ro" with original shape dimensions.
// However, if all dimensions from the right operand appear before those from the left
// operand in the final output, we can swap the operands so that bmm directly produces
// the result in the correct memory order.
bool swap_lo_ro = !lo.empty() && !ro.empty() && ro.back() < lo.front();
if (swap_lo_ro) {
std::swap(left, right);
std::swap(lo, ro);
std::swap(lo_size, ro_size);
}
auto out_num_dim = lro.size() + lo.size() + sum_dims_.size() + ro.size();
std::vector<SymInt> out_size;
out_size.reserve(out_num_dim);
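For context on the operand-swap heuristic removed in the hunk above, here is a minimal standalone sketch of the condition it tests. The names `lo`/`ro` follow the comment's convention (output positions of left-only and right-only dimensions); everything else is illustrative:

```cpp
#include <cassert>
#include <vector>

int main() {
  // Output positions of dims that appear only in the left (lo) or only in the
  // right (ro) operand. If every right-only dim precedes every left-only dim,
  // swapping the operands lets bmm emit the result already in output order.
  std::vector<int> lo{3, 4};
  std::vector<int> ro{1, 2};
  bool swap_lo_ro = !lo.empty() && !ro.empty() && ro.back() < lo.front();
  assert(swap_lo_ro);
  return 0;
}
```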

View File

@ -89,7 +89,7 @@ execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t
using result_type = typename traits::result_type;
for (; i < n; i++) {
result_type* out_ptr = (result_type*)(data[0] + i * strides[0]);
*out_ptr = std::apply(op, dereference<traits>(
*out_ptr = c10::guts::apply(op, dereference<traits>(
&data[1],
&strides[1],
i));
@ -102,7 +102,7 @@ inline void
execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
using traits = function_traits<func_t>;
for (; i < n; i++) {
std::apply(op, dereference<traits>(
c10::guts::apply(op, dereference<traits>(
&data[0],
&strides[0],
i));
@ -162,7 +162,7 @@ void handle_tuple_outputs(char* C10_RESTRICT data[],
}
// Loop operation for `cpu_kernel_multiple_outputs`.
// 1. Use `std::apply` to make dynamic method invocation
// 1. Use `c10::guts::apply` to make dynamic method invocation
// for the lambda passed in `cpu_kernel_multiple_outputs`.
// 2. Iterate over the members of the returned tuple, set the corresponding
// output tensor by the tuple member in `handle_tuple_outputs` function.
@ -183,7 +183,7 @@ multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_
}
for (; i < n; i++) {
auto output = std::apply(op, dereference<traits>(
auto output = c10::guts::apply(op, dereference<traits>(
&data[num_outputs],
&strides[num_outputs],
i));
@ -213,8 +213,8 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve
for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
auto args1 = dereference_vec<traits>(&data[1], opt_scalar, S, i);
auto args2 = dereference_vec<traits>(&data[1], opt_scalar, S, i + Vec::size());
auto out1 = std::apply(vop, std::move(args1));
auto out2 = std::apply(vop, std::move(args2));
auto out1 = c10::guts::apply(vop, std::move(args1));
auto out2 = c10::guts::apply(vop, std::move(args2));
out1.store(data[0] + i * sizeof(scalar_t));
out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t));
}
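The hunks above only swap the tuple-invocation helper; the underlying pattern is plain C++17 `std::apply` over a tuple of unpacked operands. A standalone sketch (toy lambda and tuple, not the real `dereference<traits>` machinery):

```cpp
#include <cstdio>
#include <tuple>

int main() {
  auto op = [](float a, float b) { return a + b; };
  std::tuple<float, float> args{1.5f, 2.5f};
  // Equivalent to op(1.5f, 2.5f); c10::guts::apply plays the same role in the
  // variant above that avoids std::apply.
  float out = std::apply(op, args);
  std::printf("%f\n", out);
  return 0;
}
```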

View File

@ -156,7 +156,7 @@ void cpu_padding(
int64_t offset_h = ndim >= 2 ? p.offsets[ndim - 2] : 0;
int64_t offset_w = p.offsets[ndim - 1];
// do vectorized copy when output is overlapped with input on W,
// do vectorized copy whe output is overlapped with input on W,
// only applies to positive padding
auto loop = [=](scalar_t* out, const scalar_t* in, bool positive_padding) {
if (positive_padding) {

View File

@ -318,7 +318,7 @@ batch_norm_cpu_collect_stats_channels_last_impl(
//
// The optimal THRESHOLD to tile was found empirically.
// When C > THRESHOLD, C is large enough that the benefit from tiling and vectorization outweigh the synchronization overhead.
// When C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization.
// Wehn C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization.
//
// When num_threads == 1, always use Method 2 as there is no synchronization overhead.
//
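A hedged sketch of the dispatch heuristic described in the comment above; `kTileSize` and the function name are placeholders, and the real THRESHOLD is tuned empirically upstream:

```cpp
#include <cstdint>
#include <cstdio>

constexpr int64_t kTileSize = 16;  // placeholder; the real threshold is tuned empirically

// True when a single vectorized thread is expected to beat launching C threads
// without vectorization, per the comment above.
bool prefer_single_vectorized_thread(int64_t C, int64_t NHW, int64_t max_threads) {
  if (max_threads == 1) {
    return true;  // no synchronization overhead to avoid, so always take this path
  }
  return C <= kTileSize && NHW <= max_threads;
}

int main() {
  std::printf("%d\n", prefer_single_vectorized_thread(8, 4, 8));  // prints 1
  return 0;
}
```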

View File

@ -1665,7 +1665,7 @@ const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum) {
bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/false);
bool allowed_device = _scaled_mm_allowed_device();
TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0, or ROCm MI300+");
TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed");

View File

@ -20,7 +20,7 @@
// SegmentReduce compilation with CUDA-12.9 causes NVCC crash on Windows
// See https://github.com/pytorch/pytorch/issues/156181
#if !(defined(_WIN32) && CUDART_VERSION == 12090)
#if !defined(_WIN32) || CUDART_VERSION < 12090
namespace at::native {
@ -606,4 +606,4 @@ REGISTER_DISPATCH(
} // namespace at::native
#endif
#endif
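Note that the two preprocessor guards toggled above are not equivalent: one skips only the exact Windows + CUDA 12.9.0 combination, while the other skips any Windows build whose CUDART_VERSION is 12090 or newer. A tiny runtime analogue of the two predicates:

```cpp
#include <cstdio>

// Mirrors #if !(defined(_WIN32) && CUDART_VERSION == 12090)
static bool compile_guard_eq(bool win, long cudart) { return !(win && cudart == 12090); }
// Mirrors #if !defined(_WIN32) || CUDART_VERSION < 12090
static bool compile_guard_lt(bool win, long cudart) { return !win || cudart < 12090; }

int main() {
  // Windows + CUDA 13.0: included by the first guard, excluded by the second.
  std::printf("%d %d\n", compile_guard_eq(true, 13000), compile_guard_lt(true, 13000));
  return 0;
}
```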

View File

@ -226,38 +226,6 @@ __global__ void CatArrayBatchedCopy_contig(
}
}
template <typename T, typename IndexType, int Dims, int batch_size, int stride_size, int alignment, int elems_per_vec>
__global__ void CatArrayBatchedCopy_vectorized(
char* output,
CatArrInputTensorMetadata<T, IndexType, batch_size, stride_size> inputs,
TensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os,
const int concatDim,
IndexType trailingSize) {
IndexType tid = blockIdx.x * blockDim.x + threadIdx.x;
IndexType nElements = inputs.nElements[blockIdx.y] / elems_per_vec;
if(tid >= nElements) return;
const char * data = (char*)inputs.input[blockIdx.y];
IndexType offset = inputs.offset[blockIdx.y] * trailingSize / elems_per_vec;
IndexType dimSize = inputs.dimSize[blockIdx.y] * trailingSize / elems_per_vec;
IndexType dataOffset = offset * alignment; // in bytes
IndexType stride = gridDim.x * blockDim.x;
while( tid < nElements){
IndexType elementOffset = CatArrIndexToOffset<IndexType, Dims>::compute(
os.tensorSize, os.tensorStride, dimSize, concatDim, tid) * alignment; // in bytes
auto vec = at::native::memory::ld_vec<alignment>(data + alignment * tid);
at::native::memory::st_vec<alignment>(output + dataOffset + elementOffset, vec);
tid += stride;
}
}
/*
Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads
to improve memory bandwidth throughput.
@ -328,27 +296,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
scalar_t *data = (scalar_t *)(out.mutable_data_ptr());
CatArrInputTensorMetadata<scalar_t, unsigned int, batch_size, stride_size> catMetaData;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> outputParam;
// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.
constexpr bool isContig = stride_size == 1;
bool isAligned = true;
constexpr int alignment = 16;
// Next, let's initialize the size, stride arrays for the output Tensor.
// for contig case, we'll canonicalize output strides, so that
// we don't have arbitrary strides for dims of size 0
size_t stride0 = 1;
if (memory_format == c10::MemoryFormat::Contiguous) {
for (int i = nDims - 1; i >= 0; --i) {
for (int i = 0; i < nDims; ++i) {
outputParam.tensorSize[i] = out.size(i);
if (isContig) {
outputParam.tensorStride[i] = stride0;
stride0 *= out.size(i);
} else {
outputParam.tensorStride[i] = out.stride(i);
}
outputParam.tensorStride[i] = out.stride(i);
}
} else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) {
// permute the semantics of dims from NCHW to NHWC so that the input
@ -367,15 +320,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.
// for channels last computing slice size correctly is much more involved, so we never send it
// on the fully vectorized path
// we need output stride in cat dimension to be multiple of alignment,
// if we ever use it to compute offsets
// for catting in 0th dimension it doesn't matter
bool isInOutAligned = isContig && at::native::memory::get_alignment(data) >= alignment &&
memory_format == c10::MemoryFormat::Contiguous && (dimension == 0 ||
outputParam.tensorStride[dimension - 1] * sizeof(scalar_t) % alignment == 0);
bool isContig = true;
bool isAligned = true;
unsigned int max_elements_per_tensor = 0;
// Now we loop
@ -391,16 +341,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
// high-dimensional tensor
if (inputs[i+batchCounter].get().numel() > 0) {
dimSize = inputs[i+batchCounter].get().size(dimension);
if (isInOutAligned) {
auto t = inputs[i+batchCounter].get();
// similarly to output stride, we cannot trust stride value to
// determine slice size if the corresponding dimension is 1
// we have to multiply all the subsequent sizes
int64_t slice_size = dimension == 0 ? t.numel() : t.sizes()[dimension - 1] != 1 ?
t.strides()[dimension - 1] : c10::multiply_integers(t.sizes().begin() + dimension, t.sizes().end());
slice_size *= sizeof(scalar_t);
isInOutAligned &= (slice_size % alignment == 0);
}
}
catMetaData.input[batchCounter] = (scalar_t*)(inputs[i+batchCounter].get().const_data_ptr());
@ -411,12 +351,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
#ifdef USE_ROCM
// On ROCm, CatArrayBatchedCopy_contig is faster
isAligned = false;
isInOutAligned = false;
#else
// If at least one of the inputs is not aligned, we can't call the
// CatArrayBatchedCopy_alignedK_contig
isAligned &= is_aligned_vec4(catMetaData.input[batchCounter]);
isInOutAligned &= at::native::memory::get_alignment(catMetaData.input[batchCounter]) >= alignment;
#endif
if (stride_size > 1) {
@ -427,6 +365,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j];
}
catMetaData.isContiguous[batchCounter] = false;
isContig = false;
} else {
catMetaData.isContiguous[batchCounter] = true;
}
@ -449,13 +388,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
max_elements_per_tensor, batchCounter);
#else
dim3 applyBlock, catGrid;
if (isInOutAligned) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, alignment>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) > 2) {
if (isContig && sizeof(scalar_t) > 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_16>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) == 2) {
} else if (isContig && sizeof(scalar_t) == 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_8>(
max_elements_per_tensor, batchCounter);
} else {
@ -463,30 +399,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
getCatGrid(batchCounter, catGrid);
}
#endif
int32_t trailingSize;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam;
if (isInOutAligned) {
// in this case we can and should flatten the tensors after the cat dim
// we want to view the tensors as if consisting of `alignment`-sized elements
// however, we might not be able to cleanly divide just the last dim -
// it might not be the multiple of alignment.
// however, we know that the full concatted slice is multiple of alignment,
// so if we flatten all the dims after and including concat dim,
// it will be divisible by alignment
// then we need to divide last out size by elems_per_vec,
// and divide all strides except last by elems_per_vec (last stride is 1 always)
// for input, we will fix up the sizes and strides in the kernel directly
kernelOutputParam = outputParam;
nDims = dimension + 1;
constexpr auto elems_per_vec = alignment / sizeof(scalar_t);
auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1];
kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec;
trailingSize = outputParam.tensorStride[dimension];
kernelOutputParam.tensorStride[dimension] = 1;
for (int i = 0; i < dimension; ++i) {
kernelOutputParam.tensorStride[i] /= elems_per_vec;
}
}
if (memory_format != c10::MemoryFormat::Contiguous) {
switch (dimension) {
@ -501,12 +413,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
}
// Template Declarations for dim = 1, 2, 3, 4
#define HANDLE_CASE(DIMS) \
if (isInOutAligned) {\
constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \
CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
(char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\
} else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\
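A minimal host-side sketch of the alignment requirements the removed vectorized cat path checks for, assuming a 16-byte vector width; the names are illustrative and none of this is the actual kernel plumbing:

```cpp
#include <cstdint>

constexpr std::int64_t kAlignment = 16;  // bytes per vectorized load/store

// The data pointer must sit on a 16-byte boundary.
bool pointer_aligned(const void* p) {
  return reinterpret_cast<std::uintptr_t>(p) % kAlignment == 0;
}

// The per-input slice along the concat dimension, in bytes, must also be a
// multiple of the vector width so offsets computed in vectors stay exact.
bool slice_aligned(std::int64_t slice_elems, std::int64_t elem_size_bytes) {
  return (slice_elems * elem_size_bytes) % kAlignment == 0;
}

int main() {
  alignas(16) static float buf[64];
  return pointer_aligned(buf) && slice_aligned(32, sizeof(float)) ? 0 : 1;
}
```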

View File

@ -1,7 +1,5 @@
#include <ATen/core/op_registration/op_registration.h>
#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
#include <ATen/native/mkldnn/xpu/qconv.h>
#include <c10/core/MemoryFormat.h>
#include <c10/core/ScalarType.h>
#include <torch/library.h>
@ -9,7 +7,7 @@
using namespace at::native::onednn;
namespace at::native::xpu {
inline c10::ScalarType QConvoneDNNXPU::qconv_decide_out_dtype(
static inline c10::ScalarType qconv_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype) {
bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat);
@ -21,7 +19,7 @@ inline c10::ScalarType QConvoneDNNXPU::qconv_decide_out_dtype(
return dst_dtype;
}
at::Tensor QConvoneDNNXPU::qconv_prepack_xpu(
static at::Tensor qconv_prepack_xpu(
at::Tensor weight,
at::Tensor weight_scales,
double input_scale,
@ -35,265 +33,222 @@ at::Tensor QConvoneDNNXPU::qconv_prepack_xpu(
return weight;
}
at::Tensor QConvoneDNNXPU::run_pointwise(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
if (act.dim() == 3 || act.dim() == 5) {
TORCH_CHECK(
attr == "none",
"quantized pointwise conv",
act.dim() - 2,
"d doesn't support unary_post_op fusion. Got unary_post_op:",
attr,
".");
} else {
TORCH_CHECK(
attr == "none" || attr == "relu" || attr == "hardtanh" ||
attr == "hardswish" || attr == "swish",
"We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:",
attr,
".");
class QConvoneDNNXPU final {
public:
static at::Tensor run_pointwise(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
if (act.dim() == 3 || act.dim() == 5) {
TORCH_CHECK(
attr == "none",
"quantized pointwise conv",
act.dim() - 2,
"d doesn't support unary_post_op fusion. Got unary_post_op:",
attr,
".");
} else {
TORCH_CHECK(
attr == "none" || attr == "relu" || attr == "hardtanh" ||
attr == "hardswish" || attr == "swish",
"We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:",
attr,
".");
}
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested
? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
Tensor output =
at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
return quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
inv_output_scale,
output_zero_point,
/*accum*/ std::nullopt,
/*accum_scale*/ 0.0,
/*accum_zero_point*/ 0,
/*output_dtype*/ output_dtype,
/*binary_attr*/ std::nullopt,
/*binary_alpha*/ std::nullopt,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested ? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
Tensor output =
at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
return quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
inv_output_scale,
output_zero_point,
/*accum*/ std::nullopt,
/*accum_scale*/ 0.0,
/*accum_zero_point*/ 0,
/*output_dtype*/ output_dtype,
/*binary_attr*/ std::nullopt,
/*binary_alpha*/ std::nullopt,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
at::Tensor QConvoneDNNXPU::run_pointwise_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
return run_pointwise(
act,
act_scale.item().toDouble(),
act_zero_point.item().toLong(),
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
groups,
output_scale,
output_zero_point,
output_dtype,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
at::Tensor QConvoneDNNXPU::run_pointwise_binary(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm) {
TORCH_CHECK(
act.dim() == 4 && binary_attr == "sum" &&
(!unary_attr.has_value() ||
(unary_attr.has_value() &&
(unary_attr.value() == "none" || unary_attr.value() == "relu"))),
"post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ",
binary_attr,
" unary_post_op: ",
unary_attr.has_value() ? unary_attr.value() : "none",
".")
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested ? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
bool has_accum_postop_sum = binary_attr == "sum";
Tensor output = has_accum_postop_sum
? accum
: at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
output = quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
output_scale,
output_zero_point,
/*accum*/ accum,
/*accum_scale*/ accum_scale,
/*accum_zero_point*/ accum_zero_point,
/*output_dtype*/ output_dtype,
/*binary_attr*/ binary_attr,
/*binary_alpha*/ alpha,
/*unary_attr*/ unary_attr,
/*unary_scalars*/ unary_scalars,
/*unary_algorithm*/ unary_algorithm);
if (!has_accum_postop_sum) {
return output;
} else {
return accum;
static at::Tensor run_pointwise_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
return run_pointwise(
act,
act_scale.item().toDouble(),
act_zero_point.item().toLong(),
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
groups,
output_scale,
output_zero_point,
output_dtype,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
}
at::Tensor QConvoneDNNXPU::run_pointwise_binary_tensor(
at::Tensor act, // contains quantized values but not QTensor
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight, // contains quantized values but not QTensor
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum, // contains quantized values but not QTensor
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm) {
return run_pointwise_binary(
act,
act_scale.item().toDouble(),
act_zero_point.item().toLong(),
weight,
weight_scales,
weight_zero_points,
accum,
bias,
stride,
padding,
dilation,
groups,
output_scale,
output_zero_point,
output_dtype,
accum_scale,
accum_zero_point,
binary_attr,
alpha,
unary_attr,
unary_scalars,
unary_algorithm);
}
static at::Tensor run_pointwise_binary(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm) {
TORCH_CHECK(
act.dim() == 4 && binary_attr == "sum" &&
(!unary_attr.has_value() ||
(unary_attr.has_value() &&
(unary_attr.value() == "none" || unary_attr.value() == "relu"))),
"post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ",
binary_attr,
" unary_post_op: ",
unary_attr.has_value() ? unary_attr.value() : "none",
".")
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested
? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
bool has_accum_postop_sum = binary_attr == "sum";
Tensor output = has_accum_postop_sum
? accum
: at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
output = quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
output_scale,
output_zero_point,
/*accum*/ accum,
/*accum_scale*/ accum_scale,
/*accum_zero_point*/ accum_zero_point,
/*output_dtype*/ output_dtype,
/*binary_attr*/ binary_attr,
/*binary_alpha*/ alpha,
/*unary_attr*/ unary_attr,
/*unary_scalars*/ unary_scalars,
/*unary_algorithm*/ unary_algorithm);
if (!has_accum_postop_sum) {
return output;
} else {
return accum;
}
}
};
TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv_prepack"),
TORCH_FN(QConvoneDNNXPU::qconv_prepack_xpu));
TORCH_FN(xpu::qconv_prepack_xpu));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv1d_pointwise"),
QConvoneDNNXPU::run_pointwise);
@ -312,9 +267,6 @@ TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv_pointwise.tensor"),
QConvoneDNNXPU::run_pointwise_tensor);
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv2d_pointwise.binary_tensor"),
QConvoneDNNXPU::run_pointwise_binary_tensor);
}
} // namespace at::native::xpu

View File

@ -1,111 +0,0 @@
#pragma once
#include <ATen/Config.h>
#include <ATen/Tensor.h>
namespace at::native::xpu {
class QConvoneDNNXPU final {
public:
C10_API static at::Tensor run_pointwise(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm);
C10_API static at::Tensor run_pointwise_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm);
C10_API static at::Tensor run_pointwise_binary(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm);
C10_API static at::Tensor run_pointwise_binary_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm);
static inline c10::ScalarType qconv_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype);
static at::Tensor qconv_prepack_xpu(
at::Tensor weight,
at::Tensor weight_scales,
double input_scale,
int64_t input_zero_point,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
std::optional<torch::List<int64_t>> input_shape);
};
} // namespace at::native::xpu

View File

@ -1,14 +1,13 @@
#include <torch/library.h>
#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
#include <ATen/native/mkldnn/xpu/qlinear.h>
#include <c10/core/ScalarType.h>
using namespace at::native::onednn;
namespace at::native::xpu {
inline c10::ScalarType QLinearOnednnXPU::qlinear_decide_out_dtype(
static inline c10::ScalarType qlinear_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype) {
bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat);
@ -20,7 +19,7 @@ inline c10::ScalarType QLinearOnednnXPU::qlinear_decide_out_dtype(
return dst_dtype;
}
Tensor QLinearOnednnXPU::q_linear_pointwise(
static Tensor q_linear_pointwise(
Tensor act,
double act_scale,
int64_t act_zero_point,
@ -79,7 +78,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise(
return qout;
}
Tensor QLinearOnednnXPU::q_linear_pointwise_tensor(
static Tensor q_linear_pointwise_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
@ -138,7 +137,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise_tensor(
return qout;
}
Tensor QLinearOnednnXPU::q_linear_pointwise_binary(
static Tensor q_linear_pointwise_binary(
Tensor act,
double act_scale,
int64_t act_zero_point,
@ -209,7 +208,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise_binary(
return dim == 3 ? qout.reshape({act.size(0), -1, N}) : qout;
}
Tensor QLinearOnednnXPU::q_linear_pointwise_binary_tensor(
static Tensor q_linear_pointwise_binary_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
@ -249,7 +248,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise_binary_tensor(
unary_post_op_algorithm);
}
Tensor QLinearOnednnXPU::q_linear_prepack_onednn(
static at::Tensor q_linear_prepack_onednn(
at::Tensor weight,
std::optional<torch::List<int64_t>> input_shape) {
at::Tensor weight_transposed = weight.transpose(0, 1);
@ -259,19 +258,19 @@ Tensor QLinearOnednnXPU::q_linear_prepack_onednn(
TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise));
TORCH_FN(q_linear_pointwise));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.tensor"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_tensor));
TORCH_FN(q_linear_pointwise_tensor));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_prepack"),
TORCH_FN(QLinearOnednnXPU::q_linear_prepack_onednn));
TORCH_FN(q_linear_prepack_onednn));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary));
TORCH_FN(q_linear_pointwise_binary));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary_tensor"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary_tensor));
TORCH_FN(q_linear_pointwise_binary_tensor));
}
} // namespace at::native::xpu

View File

@ -1,91 +0,0 @@
#pragma once
#include <ATen/Config.h>
#include <ATen/Tensor.h>
#include <ATen/core/List.h>
namespace at::native::xpu {
class QLinearOnednnXPU final {
public:
C10_API static Tensor q_linear_pointwise(
Tensor act,
double act_scale,
int64_t act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view post_op_name,
torch::List<std::optional<at::Scalar>> post_op_args,
std::string_view post_op_algorithm);
C10_API static Tensor q_linear_pointwise_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view post_op_name,
torch::List<std::optional<at::Scalar>> post_op_args,
std::string_view post_op_algorithm);
C10_API static Tensor q_linear_pointwise_binary(
Tensor act,
double act_scale,
int64_t act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<at::Tensor> other,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double other_scale,
int64_t other_zero_point,
std::string_view binary_post_op,
double binary_alpha,
std::string_view unary_post_op,
torch::List<std::optional<at::Scalar>> unary_post_op_args,
std::string_view unary_post_op_algorithm);
C10_API static Tensor q_linear_pointwise_binary_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<at::Tensor> other,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double other_scale,
int64_t other_zero_point,
std::string_view binary_post_op,
double binary_alpha,
std::string_view unary_post_op,
torch::List<std::optional<at::Scalar>> unary_post_op_args,
std::string_view unary_post_op_algorithm);
C10_API static Tensor q_linear_prepack_onednn(
at::Tensor weight,
std::optional<torch::List<int64_t>> input_shape);
static inline c10::ScalarType qlinear_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype);
}; // class QLinearOnednnXPU
} // namespace at::native::xpu

View File

@ -503,17 +503,6 @@ struct round_decimals_functor {
}
};
struct round_functor {
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
inline T operator()(const T x) {
return static_cast<T>(rint(float(x)));
}
template <typename T, enable_if_t<is_scalar_integral_v<T>, bool> = true>
inline T operator()(const T x) {
return x;
}
};
DEFINE_UNARY_FLOATING_FUNCTOR(erf);
DEFINE_UNARY_FLOATING_FUNCTOR(erfc);
DEFINE_UNARY_FLOATING_FUNCTOR(erfinv);
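A C++ analogue of the Metal round_functor removed above, for reference: floating-point inputs round to nearest via rint (which honours the current rounding mode, nearest-even by default), while integral inputs pass through unchanged. Plain C++ rather than Metal, so treat it as a sketch:

```cpp
#include <cmath>
#include <cstdio>
#include <type_traits>

template <typename T>
T round_like_metal(T x) {
  if constexpr (std::is_floating_point_v<T>) {
    // The Metal functor casts to float and applies rint before casting back.
    return static_cast<T>(std::rint(static_cast<float>(x)));
  } else {
    return x;  // integral types are already "rounded"
  }
}

int main() {
  std::printf("%f %d\n", round_like_metal(2.5), round_like_metal(7));  // 2.000000 7
  return 0;
}
```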
@ -526,13 +515,6 @@ REGISTER_UNARY_OP(neg, char, char);
REGISTER_UNARY_OP(neg, uchar, uchar);
REGISTER_UNARY_OP(neg, float, float);
REGISTER_UNARY_OP(neg, half, half);
REGISTER_UNARY_OP(round, int, int);
REGISTER_UNARY_OP(round, long, long);
REGISTER_UNARY_OP(round, short, short);
REGISTER_UNARY_OP(round, char, char);
REGISTER_UNARY_OP(round, uchar, uchar);
REGISTER_UNARY_OP(round, float, float);
REGISTER_UNARY_OP(round, half, half);
REGISTER_UNARY_OP(bitwise_not, int, int);
REGISTER_UNARY_OP(bitwise_not, long, long);
@ -576,7 +558,6 @@ REGISTER_UNARY_OP(abs, half, half);
INSTANTIATE_UNARY_KERNELS2(bfloat, bfloat);
REGISTER_UNARY_OP(neg, bfloat, bfloat);
REGISTER_UNARY_OP(round, bfloat, bfloat);
REGISTER_UNARY_OP(abs, bfloat, bfloat);
INSTANTIATE_UNARY_KERNELS2(half, half);
INSTANTIATE_UNARY_KERNELS2(float, float);

View File

@ -115,10 +115,7 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const std::opt
return output;
}
// No-graph execution causes nonsense if these are non-contiguous.
const bool is_contiguous = input.is_contiguous() && weight.is_contiguous() && bias.is_contiguous();
if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_contiguous) {
if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS)) {
_mps_linear_nograph(input, weight, bias, output);
// Squeeze last dim of 1D linear
return weight_arg.dim() != 1 ? output : output.squeeze(-1);

View File

@ -2,7 +2,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/MemoryOverlap.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/native/SortingUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
@ -12,85 +11,10 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/kthvalue_native.h>
#include <ATen/ops/sort.h>
#include <ATen/ops/sort_native.h>
#endif
namespace at::native {
namespace {
void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& values, Tensor& indices) {
using namespace mps;
if (self.dim() == 0 && self.numel() == 1) {
values.copy_(self);
indices.zero_();
return;
}
// Handle empty tensors
if (self.numel() == 0) {
values.copy_(self);
indices.copy_(values.toType(at::ScalarType::Long));
return;
}
// issue #154890, raising error to prevent crash within MPSGraph until
// workaround is implemented.
TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890");
auto stream = getCurrentMPSStream();
struct CachedGraph : public MPSCachedGraph {
CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
MPSGraphTensor *selfTensor = nil, *valuesTensor = nil, *indicesTensor = nil;
};
// MPSGraph kthvalue is always sorted.
@autoreleasepool {
// Input as placeholders
MPSShape* input_shape = getMPSShape(self);
NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","];
std::string key = std::string("kthvalue:") + [ns_shape_key UTF8String] + ":" + getMPSTypeString(self) + ":k" +
std::to_string(k) + ":dim" + std::to_string(dim);
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self), input_shape);
MPSGraphTensor* castInputTensor = newCachedGraph->selfTensor;
MPSDataType dataType = getMPSDataType(self);
// #issue 104398441 sortWithTensor and argsortWithTensor
if (dataType != MPSDataTypeInt32 && dataType != MPSDataTypeFloat32 && dataType != MPSDataTypeFloat16) {
dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32;
castInputTensor = [mpsGraph castTensor:newCachedGraph->selfTensor toType:dataType name:@"castInputTensor"];
}
MPSGraphTensor* sortedTensor = [mpsGraph sortWithTensor:castInputTensor
axis:(NSUInteger)dim
descending:false
name:nil];
sortedTensor = [mpsGraph sliceTensor:sortedTensor
dimension:(NSUInteger)dim
start:((NSUInteger)k - 1)
length:1
name:nil];
MPSGraphTensor* argSortedTensor = [mpsGraph argSortWithTensor:castInputTensor
axis:(NSInteger)dim
descending:false
name:@"kthvalue_out"];
argSortedTensor = [mpsGraph sliceTensor:argSortedTensor
dimension:dim
start:((NSUInteger)k - 1)
length:1
name:nil];
newCachedGraph->valuesTensor = sortedTensor;
newCachedGraph->indicesTensor = argSortedTensor;
});
Placeholder inputPlaceholder = Placeholder(cachedGraph->selfTensor, self);
// Outputs as placeholders
Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values);
Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices);
// Create dictionary of inputs and outputs
auto feeds = dictionaryFromPlaceholders(inputPlaceholder);
auto results = dictionaryFromPlaceholders(valuesPlaceholder, indicesPlaceholder);
runMPSGraph(stream, cachedGraph->graph(), feeds, results);
}
}
} // anonymous namespace
// sort
TORCH_IMPL_FUNC(sort_stable_out_mps)
@ -157,31 +81,4 @@ TORCH_IMPL_FUNC(sort_stable_out_mps)
runMPSGraph(stream, cachedGraph->graph(), feeds, results);
}
}
std::tuple<Tensor&, Tensor&> kthvalue_out_mps(const Tensor& self,
int64_t k,
int64_t dim_,
bool keepdim,
Tensor& values,
Tensor& indices) {
// See note [Writing Nondeterministic Operations]
// If there are duplicate elements of the kth value, the procedure for choosing which
// of the duplicates to use for the indices output is nondeterministic.
at::globalContext().alertNotDeterministic("kthvalue MPS");
int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true);
int64_t slicesize = self.dim() == 0 ? 1 : self.size(dim);
TORCH_CHECK(k >= 1 && k <= slicesize, "kthvalue(): selected number k out of range for dimension ", dim);
at::assert_no_overlap(self, values);
_reduction_with_indices_allocate_or_resize_output(values, indices, self, dim, keepdim);
kthvalue_out_mps_impl(self, k, dim, values, indices);
if (!keepdim) {
values.squeeze_(dim);
indices.squeeze_(dim);
}
return std::forward_as_tuple(values, indices);
}
} // namespace at::native

View File

@ -335,9 +335,6 @@ static void isin_Tensor_Tensor_out_mps(const Tensor& elements,
}
static void is_posneginf_helper(TensorIteratorBase& iter, bool is_neg) {
if (iter.numel() == 0) {
return;
}
const auto& self = iter.input(0);
auto& out = iter.output(0);
@autoreleasepool {

View File

@ -50,7 +50,6 @@ REGISTER_UNARY_TI_DISPATCH(log2);
REGISTER_UNARY_TI_DISPATCH(log);
REGISTER_UNARY_TI_DISPATCH(log1p);
REGISTER_UNARY_TI_DISPATCH(bitwise_not);
REGISTER_UNARY_TI_DISPATCH(round);
REGISTER_UNARY_TI_DISPATCH(sigmoid);
REGISTER_DISPATCH(round_decimals_stub, round_decimals_kernel);
} // namespace at::native

View File

@ -184,6 +184,7 @@ TORCH_IMPL_FUNC(sign_out_mps)(const Tensor& self, const Tensor& output) {
REGISTER_MPS_UNARY_STUB(ceil, ceil);
REGISTER_MPS_UNARY_STUB(floor, floor);
REGISTER_MPS_UNARY_STUB(round, round);
REGISTER_MPS_UNARY_STUB(trunc, truncate);
#define CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(func_out, func_stub) \
@ -417,7 +418,6 @@ TORCH_IMPL_FUNC(sgn_out_mps)(const Tensor& self, const Tensor& output) {
Tensor& conj_physical_out_mps(const Tensor& self, Tensor& result) {
TORCH_CHECK(self.is_complex());
TORCH_CHECK(self.dtype() != at::kComplexDouble);
mps::unary_op(self, result, "conj", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
return [mpsGraph conjugateWithTensor:inputTensor name:nil];
});

View File

@ -340,8 +340,8 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: abs
SparseCPU, SparseCUDA, SparseMPS: abs_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr
SparseCPU, SparseCUDA: abs_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs
tags: [core, pointwise]
@ -350,16 +350,16 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: abs_
SparseCPU, SparseCUDA, SparseMPS: abs_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_
SparseCPU, SparseCUDA: abs_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs_
- func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS, MTIA: abs_out
SparseCPU, SparseCUDA, SparseMPS: abs_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_out
SparseCPU, SparseCUDA: abs_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_out
tags: pointwise
# Note [Adding an alias]
@ -428,7 +428,7 @@
variants: function, method
structured_delegate: sgn.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sgn_sparse
SparseCPU, SparseCUDA: sgn_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn
tags: pointwise
@ -437,7 +437,7 @@
variants: method
structured_delegate: sgn.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_
SparseCPU, SparseCUDA: sgn_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn_
tags: pointwise
@ -448,7 +448,7 @@
dispatch:
CPU, CUDA: sgn_out
MPS: sgn_out_mps
SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_out
SparseCPU, SparseCUDA: sgn_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_out
tags: pointwise
@ -476,7 +476,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: _conj_physical
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr
autogen: _conj_physical.out
- func: conj_physical(Tensor self) -> Tensor
@ -487,8 +487,8 @@
dispatch:
CPU, CUDA: conj_physical_out
MPS: conj_physical_out_mps
SparseCPU, SparseCUDA, SparseMPS: conj_physical_out_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr_out
SparseCPU, SparseCUDA: conj_physical_out_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr_out
tags: pointwise
- func: conj_physical_(Tensor(a!) self) -> Tensor(a!)
@ -554,7 +554,7 @@
structured_delegate: add.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse
SparseCPU, SparseCUDA, SparseMeta: add_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr
MkldnnCPU: mkldnn_add
ZeroTensor: add_zerotensor
@ -566,7 +566,7 @@
variants: method
structured_delegate: add.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse_
SparseCPU, SparseCUDA, SparseMeta: add_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_
MkldnnCPU: mkldnn_add_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add__Tensor
@ -582,7 +582,6 @@
dispatch:
SparseCPU, SparseMeta: add_out_sparse_cpu
SparseCUDA: add_out_sparse_cuda
SparseMPS: add_out_sparse_mps
SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu
SparseCsrCUDA: add_out_sparse_compressed_cuda
MkldnnCPU: mkldnn_add_out
@ -875,7 +874,7 @@
variants: function, method
structured_delegate: asinh.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: asinh_sparse
SparseCPU, SparseCUDA: asinh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr
tags: [core, pointwise]
@ -883,7 +882,7 @@
variants: function, method
structured_delegate: asinh.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_
SparseCPU, SparseCUDA: asinh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_
tags: pointwise
@ -893,7 +892,7 @@
dispatch:
CPU, CUDA: asinh_out
MPS: asinh_out_mps
SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_out
SparseCPU, SparseCUDA: asinh_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_out
tags: pointwise
@ -910,7 +909,7 @@
structured_delegate: atanh.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: atanh_sparse
SparseCPU, SparseCUDA: atanh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr
tags: [core, pointwise]
@ -918,7 +917,7 @@
structured_delegate: atanh.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_
SparseCPU, SparseCUDA: atanh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_
tags: pointwise
@ -928,7 +927,7 @@
dispatch:
CPU, CUDA: atanh_out
MPS: atanh_out_mps
SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_out
SparseCPU, SparseCUDA: atanh_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_out
tags: pointwise
# arctanh, alias for atanh
@ -965,7 +964,7 @@
variants: function, method
structured_delegate: asin.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: asin_sparse
SparseCPU, SparseCUDA: asin_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr
tags: [core, pointwise]
@ -974,7 +973,7 @@
variants: function, method
structured_delegate: asin.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: asin_sparse_
SparseCPU, SparseCUDA: asin_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_
tags: pointwise
@ -984,7 +983,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: asin_out
SparseCPU, SparseCUDA, SparseMPS: asin_sparse_out
SparseCPU, SparseCUDA: asin_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_out
tags: pointwise
@ -1002,7 +1001,7 @@
structured_delegate: atan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: atan_sparse
SparseCPU, SparseCUDA: atan_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr
tags: [core, pointwise]
@ -1011,7 +1010,7 @@
structured_delegate: atan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: atan_sparse_
SparseCPU, SparseCUDA: atan_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_
tags: pointwise
@ -1021,7 +1020,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: atan_out
SparseCPU, SparseCUDA, SparseMPS: atan_sparse_out
SparseCPU, SparseCUDA: atan_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_out
tags: pointwise
@ -1460,7 +1459,7 @@
structured_delegate: ceil.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: ceil_sparse
SparseCPU, SparseCUDA: ceil_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr
tags: [core, pointwise]
@ -1469,7 +1468,7 @@
structured_delegate: ceil.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_
SparseCPU, SparseCUDA: ceil_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_
tags: pointwise
@ -1479,7 +1478,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: ceil_out
SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_out
SparseCPU, SparseCUDA: ceil_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_out
tags: pointwise
@ -2407,7 +2406,7 @@
MPS: empty_mps
Meta: empty_meta_symint
MkldnnCPU: empty_mkldnn
SparseCPU, SparseCUDA, SparseMPS: empty_sparse
SparseCPU, SparseCUDA: empty_sparse
SparseMeta: empty_sparse_symint
SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
SparseCsrMeta: empty_sparse_compressed_symint
@ -2535,7 +2534,7 @@
structured_delegate: erf.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: erf_sparse
SparseCPU, SparseCUDA: erf_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr
tags: [core, pointwise]
@ -2544,7 +2543,7 @@
structured_delegate: erf.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: erf_sparse_
SparseCPU, SparseCUDA: erf_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_
tags: pointwise
@ -2554,7 +2553,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: erf_out
SparseCPU, SparseCUDA, SparseMPS: erf_sparse_out
SparseCPU, SparseCUDA: erf_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_out
tags: pointwise
@ -2620,7 +2619,7 @@
structured_delegate: expm1.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: expm1_sparse
SparseCPU, SparseCUDA: expm1_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr
tags: [core, pointwise]
@ -2629,7 +2628,7 @@
structured_delegate: expm1.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_
SparseCPU, SparseCUDA: expm1_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_
tags: pointwise
@ -2639,7 +2638,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: expm1_out
SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_out
SparseCPU, SparseCUDA: expm1_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_out
tags: pointwise
@ -2738,7 +2737,7 @@
structured_delegate: floor.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: floor_sparse
SparseCPU, SparseCUDA: floor_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr
tags: [core, pointwise]
@ -2747,7 +2746,7 @@
structured_delegate: floor.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: floor_sparse_
SparseCPU, SparseCUDA: floor_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_
tags: pointwise
@ -2757,7 +2756,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: floor_out
SparseCPU, SparseCUDA, SparseMPS: floor_sparse_out
SparseCPU, SparseCUDA: floor_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_out
tags: pointwise
@ -2765,7 +2764,7 @@
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CPU, CUDA, MPS, MTIA: floor_divide
CPU, CUDA, MPS: floor_divide
SparseCPU, SparseCUDA: floor_divide_sparse
- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@ -2799,7 +2798,7 @@
structured_delegate: frac.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: frac_sparse
SparseCPU, SparseCUDA: frac_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr
tags: pointwise
@ -2808,7 +2807,7 @@
structured_delegate: frac.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: frac_sparse_
SparseCPU, SparseCUDA: frac_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_
tags: pointwise
@ -2819,7 +2818,7 @@
dispatch:
CPU, CUDA: frac_out
MPS: frac_out_mps
SparseCPU, SparseCUDA, SparseMPS: frac_sparse_out
SparseCPU, SparseCUDA: frac_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_out
tags: pointwise
@ -3209,7 +3208,7 @@
dispatch:
CPU, CUDA, MPS, MTIA: isnan
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isnan
SparseCPU, SparseCUDA, SparseMPS: isnan_sparse
SparseCPU, SparseCUDA: isnan_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isnan_sparse_csr
autogen: isnan.out
tags: [core, pointwise]
@ -3290,7 +3289,6 @@
dispatch:
CPU: kthvalue_out_cpu
CUDA: kthvalue_out_cuda
MPS: kthvalue_out_mps
- func: kthvalue.dimname(Tensor self, SymInt k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
variants: function, method
@ -3338,21 +3336,21 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: nan_to_num
SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse
SparseCPU, SparseCUDA: nan_to_num_sparse
tags: pointwise
- func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!)
variants: function, method
dispatch:
CompositeExplicitAutograd: nan_to_num_
SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_
SparseCPU, SparseCUDA: nan_to_num_sparse_
tags: pointwise
- func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA, MTIA: nan_to_num_out
MPS: nan_to_num_out_mps
SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_out
SparseCPU, SparseCUDA: nan_to_num_sparse_out
tags: pointwise
- func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
@ -3555,7 +3553,7 @@
structured_delegate: log1p.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: log1p_sparse
SparseCPU, SparseCUDA: log1p_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr
tags: [core, pointwise]
@ -3564,7 +3562,7 @@
structured_delegate: log1p.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_
SparseCPU, SparseCUDA: log1p_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_
tags: pointwise
@ -3574,7 +3572,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: log1p_out
SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_out
SparseCPU, SparseCUDA: log1p_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_out
tags: pointwise
@ -4666,7 +4664,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: rad2deg
SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse
SparseCPU, SparseCUDA: rad2deg_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr
tags: pointwise
@ -4674,14 +4672,14 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: rad2deg_
SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_
SparseCPU, SparseCUDA: rad2deg_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_
tags: pointwise
- func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CompositeExplicitAutograd: rad2deg_out
SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_out
SparseCPU, SparseCUDA: rad2deg_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_out
tags: pointwise
@ -4689,7 +4687,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: deg2rad
SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse
SparseCPU, SparseCUDA: deg2rad_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr
tags: pointwise
@ -4697,14 +4695,14 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: deg2rad_
SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_
SparseCPU, SparseCUDA: deg2rad_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_
tags: pointwise
- func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CompositeExplicitAutograd: deg2rad_out
SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_out
SparseCPU, SparseCUDA: deg2rad_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_out
tags: pointwise
@ -4930,7 +4928,7 @@
structured_delegate: neg.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: neg_sparse
SparseCPU, SparseCUDA: neg_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg
tags: [core, pointwise]
@ -4940,7 +4938,7 @@
structured_delegate: neg.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: neg_sparse_
SparseCPU, SparseCUDA: neg_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg_
tags: pointwise
@ -4951,7 +4949,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: neg_out
SparseCPU, SparseCUDA, SparseMPS: neg_out_sparse
SparseCPU, SparseCUDA: neg_out_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_out
tags: pointwise
# Alias for neg
@ -5035,7 +5033,7 @@
structured_delegate: round.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: round_sparse
SparseCPU, SparseCUDA: round_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr
tags: [core, pointwise]
@ -5044,7 +5042,7 @@
structured_delegate: round.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: round_sparse_
SparseCPU, SparseCUDA: round_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_
tags: pointwise
@ -5054,7 +5052,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: round_out
SparseCPU, SparseCUDA, SparseMPS: round_sparse_out
SparseCPU, SparseCUDA: round_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_out
tags: pointwise
@ -5097,7 +5095,7 @@
QuantizedCPU: relu_quantized_cpu
QuantizedCUDA: relu_quantized_cuda
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu
SparseCPU, SparseCUDA, SparseMPS: relu_sparse
SparseCPU, SparseCUDA: relu_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr
tags: [core, pointwise]
@ -5112,7 +5110,7 @@
QuantizedCPU: relu_quantized_cpu_
QuantizedCUDA: relu_quantized_cuda_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu_
SparseCPU, SparseCUDA, SparseMPS: relu_sparse_
SparseCPU, SparseCUDA: relu_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr_
autogen: relu.out
tags: pointwise
@ -5399,7 +5397,7 @@
variants: function, method
dispatch:
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr
SparseCPU, SparseCUDA, SparseMPS: sin_sparse
SparseCPU, SparseCUDA: sin_sparse
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sin
tags: [core, pointwise]
@ -5409,7 +5407,7 @@
variants: function, method
dispatch:
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_
SparseCPU, SparseCUDA, SparseMPS: sin_sparse_
SparseCPU, SparseCUDA: sin_sparse_
tags: pointwise
- func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -5419,7 +5417,7 @@
dispatch:
CPU, CUDA, MPS, MTIA: sin_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_out
SparseCPU, SparseCUDA, SparseMPS: sin_sparse_out
SparseCPU, SparseCUDA: sin_sparse_out
tags: pointwise
- func: sinc(Tensor self) -> Tensor
@ -5444,7 +5442,7 @@
structured_delegate: sinh.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sinh_sparse
SparseCPU, SparseCUDA: sinh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr
tags: [core, pointwise]
@ -5453,7 +5451,7 @@
structured_delegate: sinh.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_
SparseCPU, SparseCUDA: sinh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_
tags: pointwise
@ -5463,7 +5461,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: sinh_out
SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_out
SparseCPU, SparseCUDA: sinh_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_out
# Returns a copy of this `Variable` that is detached from its autograd graph.
@ -5906,7 +5904,7 @@
variants: function, method
dispatch:
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sqrt
SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse
SparseCPU, SparseCUDA: sqrt_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr
tags: [core, pointwise]
@ -5915,7 +5913,7 @@
structured_delegate: sqrt.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_
SparseCPU, SparseCUDA: sqrt_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_
tags: pointwise
@ -5925,7 +5923,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: sqrt_out
SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_out
SparseCPU, SparseCUDA: sqrt_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_out
tags: pointwise
@ -6063,7 +6061,7 @@
structured_delegate: tan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: tan_sparse
SparseCPU, SparseCUDA: tan_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr
tags: [core, pointwise]
@ -6072,7 +6070,7 @@
structured_delegate: tan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: tan_sparse_
SparseCPU, SparseCUDA: tan_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_
tags: pointwise
@ -6082,7 +6080,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: tan_out
SparseCPU, SparseCUDA, SparseMPS: tan_sparse_out
SparseCPU, SparseCUDA: tan_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_out
tags: pointwise
@ -6093,7 +6091,7 @@
dispatch:
QuantizedCPU: tanh_quantized_cpu
MkldnnCPU: mkldnn_tanh
SparseCPU, SparseCUDA, SparseMPS: tanh_sparse
SparseCPU, SparseCUDA: tanh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh
tags: [core, pointwise]
@ -6104,7 +6102,7 @@
variants: function, method
dispatch:
MkldnnCPU: mkldnn_tanh_
SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_
SparseCPU, SparseCUDA: tanh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh_
tags: pointwise
@ -6115,7 +6113,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: tanh_out
SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_out
SparseCPU, SparseCUDA: tanh_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_out
tags: pointwise
@ -6387,8 +6385,8 @@
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: trunc_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr
SparseCPU, SparseCUDA: trunc_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr
tags: [core, pointwise]
- func: trunc_(Tensor(a!) self) -> Tensor(a!)
@ -6396,8 +6394,8 @@
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_
SparseCPU, SparseCUDA: trunc_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_
tags: pointwise
- func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -6406,8 +6404,8 @@
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS: trunc_out
SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_out
SparseCPU, SparseCUDA: trunc_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_out
tags: pointwise
# Alias for trunc
@ -7369,8 +7367,8 @@
- func: _to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor
variants: method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sparse_to_dense
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: sparse_compressed_to_dense
SparseCPU, SparseCUDA: sparse_to_dense
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_compressed_to_dense
MkldnnCPU: mkldnn_to_dense
autogen: _to_dense.out
@ -7396,8 +7394,8 @@
- func: dense_dim(Tensor self) -> int
variants: method
dispatch:
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: dense_dim_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: dense_dim_sparse_csr
SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr
CompositeExplicitAutograd: dense_dim_default
device_check: NoCheck
device_guard: False
@ -7530,7 +7528,7 @@
device_check: NoCheck # Allows copy into different device
variants: function
dispatch:
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: copy_sparse_
SparseCPU, SparseCUDA, SparseMeta: copy_sparse_
autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out
# By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors
@ -9721,7 +9719,7 @@
structured_delegate: sign.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sign_sparse
SparseCPU, SparseCUDA: sign_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr
tags: [core, pointwise]
@ -9730,7 +9728,7 @@
structured_delegate: sign.out
variants: method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sign_sparse_
SparseCPU, SparseCUDA: sign_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_
tags: pointwise
@ -9741,7 +9739,7 @@
dispatch:
CPU, CUDA: sign_out
MPS: sign_out_mps
SparseCPU, SparseCUDA, SparseMPS: sign_sparse_out
SparseCPU, SparseCUDA: sign_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_out
tags: pointwise
@ -9749,7 +9747,7 @@
variants: function, method
structured_delegate: signbit.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: signbit_sparse
SparseCPU, SparseCUDA: signbit_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr
tags: pointwise
@ -9760,7 +9758,7 @@
CPU: signbit_out
CUDA: signbit_out
MPS: signbit_out_mps
SparseCPU, SparseCUDA, SparseMPS: signbit_sparse_out
SparseCPU, SparseCUDA: signbit_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr_out
tags: pointwise
@ -13264,7 +13262,7 @@
dispatch:
CompositeExplicitAutograd: isinf
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isinf
SparseCPU, SparseCUDA, SparseMPS: isinf_sparse
SparseCPU, SparseCUDA: isinf_sparse
SparseMeta: isinf_sparse_meta
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isinf_sparse_csr
autogen: isinf.out
@ -13280,7 +13278,7 @@
structured_delegate: isposinf.out
dispatch:
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isposinf
SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse
SparseCPU, SparseCUDA: isposinf_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr
tags: pointwise
@ -13289,7 +13287,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: isposinf_out
SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse_out
SparseCPU, SparseCUDA: isposinf_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr_out
tags: pointwise
@ -13298,7 +13296,7 @@
structured_delegate: isneginf.out
dispatch:
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isneginf
SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse
SparseCPU, SparseCUDA: isneginf_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr
tags: pointwise
@ -13307,7 +13305,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: isneginf_out
SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse_out
SparseCPU, SparseCUDA: isneginf_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr_out
tags: pointwise
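
For illustration only (not part of the diff): the native_functions.yaml lines above only control which backend kernels a pointwise op is registered against. A minimal CPU-only sketch of what a SparseCPU dispatch entry such as sign_sparse means in practice, assuming any recent PyTorch build:

import torch

# Two nonzeros in a 2x3 COO tensor; sign() routes to the sparse kernel
# registered in the table above (sign_sparse on CPU) and keeps the result sparse.
i = torch.tensor([[0, 1], [2, 0]])
v = torch.tensor([-3.0, 5.0])
s = torch.sparse_coo_tensor(i, v, (2, 3)).coalesce()

out = torch.sign(s)
print(out.is_sparse)   # True
print(out.to_dense())  # tensor([[ 0.,  0., -1.], [ 1.,  0.,  0.]])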

View File

@ -1,73 +0,0 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/SparseTensorUtils.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/sparse/SparseStubs.h>
#include <ATen/native/sparse/FlattenIndicesCommon.h>
#include <ATen/ExpandUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_coalesce_native.h>
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
#include <ATen/ops/empty_native.h>
#include <ATen/ops/zeros_native.h>
#endif
namespace at::native {
namespace {
using namespace mps;
using namespace at::sparse;
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/FlattenIndices_metallib.h>
#endif
Tensor flatten_indices_mps(const Tensor& indices, IntArrayRef size) {
TORCH_CHECK(indices.dim() == 2, "flatten_indices: indices must be 2D");
TORCH_CHECK(static_cast<size_t>(indices.size(0)) == size.size(),
"flatten_indices: indices.size(0) must equal size.size()");
const int64_t sparse_dim = indices.size(0);
const int64_t nnz = indices.size(1);
if (nnz == 0) {
return at::empty({0}, indices.options().dtype(kLong));
}
// Row-major multipliers for flattening: mul[d] = prod_{j>d}(size[j])
std::vector<int64_t> row_muls(sparse_dim);
row_muls[sparse_dim - 1] = 1;
for (int64_t i = sparse_dim - 2; i >= 0; --i) {
row_muls[i] = row_muls[i + 1] * size[i + 1];
}
auto flat_indices = at::empty({nnz}, indices.options().dtype(kLong));
auto stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pipeline = lib.getPipelineStateForFunc("flatten_indices_kernel");
auto encoder = stream->commandEncoder();
[encoder setComputePipelineState:pipeline];
mtl_setArgs(encoder,
indices,
row_muls,
flat_indices,
static_cast<uint>(sparse_dim),
indices.strides()
);
mtl_dispatch1DJob(encoder, pipeline, nnz);
}
});
return flat_indices;
}
} // namespace
REGISTER_MPS_DISPATCH(flatten_indices_stub, &flatten_indices_mps)
} // namespace at::native
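
For illustration only (not part of the diff): the deleted kernel flattens COO indices with the row-major rule mul[d] = prod_{j>d} size[j], the same computation performed by row_muls above. The equivalent arithmetic in plain Python:

# Row-major flattening of COO indices, mirroring row_muls in the deleted kernel.
# flat[n] = sum_d indices[d][n] * prod(size[d+1:])
def flatten_indices(indices, size):
    muls = [1] * len(size)
    for d in range(len(size) - 2, -1, -1):
        muls[d] = muls[d + 1] * size[d + 1]
    return [sum(idx[n] * m for idx, m in zip(indices, muls))
            for n in range(len(indices[0]))]

# Indices for a (2, 3) tensor: entries at (0, 2) and (1, 0) -> flat ids 2 and 3.
print(flatten_indices([[0, 1], [2, 0]], (2, 3)))  # [2, 3]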

View File

@ -20,9 +20,46 @@ using namespace at::sparse;
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/Coalesce_metallib.h>
#include <ATen/native/mps/Sparse_metallib.h>
#endif
static Tensor flatten_indices(const Tensor& indices, IntArrayRef size) {
TORCH_CHECK(indices.dim() == 2, "flatten_indices: indices must be 2D");
TORCH_CHECK(static_cast<size_t>(indices.size(0)) == size.size(),
"flatten_indices: indices.size(0) must equal size.size()");
int64_t sparse_dim = indices.size(0);
int64_t nnz = indices.size(1);
if (nnz == 0) {
return at::empty({0}, indices.options().dtype(kLong));
}
std::vector<int64_t> strides(sparse_dim);
strides[sparse_dim - 1] = 1;
for (int64_t i = sparse_dim - 2; i >= 0; i--) {
strides[i] = strides[i + 1] * size[i + 1];
}
Tensor flat_indices = at::empty({nnz}, indices.options().dtype(kLong));
auto stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pipeline = lib.getPipelineStateForFunc("flatten_indices_kernel");
auto encoder = stream->commandEncoder();
[encoder setComputePipelineState:pipeline];
mtl_setArgs(encoder, indices, strides, flat_indices, sparse_dim, nnz);
mtl_dispatch1DJob(encoder, pipeline, nnz);
}
});
return flat_indices;
}
static Tensor compute_output_positions(const Tensor& is_unique) {
int64_t nnz = is_unique.size(0);

Some files were not shown because too many files have changed in this diff.
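
For context (not part of the diff): the flatten-indices and output-position helpers in the sparse MPS files above appear to back the COO coalesce path, and coalesce() itself orders entries by flattened index and sums duplicates. A small CPU-only sketch, assuming any recent PyTorch build:

import torch

# coalesce() merges duplicate COO entries: indices are ordered by their
# flattened position and values at the same index are summed.
i = torch.tensor([[0, 0, 1], [2, 2, 0]])   # (0, 2) appears twice
v = torch.tensor([1.0, 3.0, 5.0])
s = torch.sparse_coo_tensor(i, v, (2, 3))

c = s.coalesce()
print(c.indices().tolist())  # [[0, 1], [2, 0]]
print(c.values().tolist())   # [4.0, 5.0]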