Update on "PythonArgParser::symintlist{,Optional} should use SymDimVector"

They're usually (if not always) used for sizes and strides, which is exactly what SymDimVector is for: it saves heap-allocating the list itself. I had to patch OptionalArray because a bunch of generated code wants to convert these to SymIntArrayRef and the generator isn't easy to patch to wrap them manually. There's a clear but small perf improvement on the "detach DTensor in a loop" benchmark: we no longer heap-allocate symdimlists, though destroying them still has some cost because each SymInt needs cleanup.

[ghstack-poisoned]
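A minimal sketch of the idea behind the change, using simplified stand-ins (the real c10::SymInt and c10::SymDimVector differ in their details): a vector type with inline storage for the typical dimension count avoids the heap allocation that a plain std::vector-backed list performs for every parsed sizes/strides argument.

```cpp
// Illustrative sketch only: simplified stand-ins for SymInt and SymDimVector,
// showing why an inline-capacity "dim vector" avoids the per-call heap
// allocation that a plain std::vector incurs.
#include <array>
#include <cstddef>
#include <cstdint>
#include <vector>

// Stand-in for SymInt: usually a plain integer, sometimes an owning handle to
// a symbolic expression that must be released on destruction (the residual
// destruction cost mentioned above).
struct FakeSymInt {
  int64_t value = 0;
};

// Stand-in for SymDimVector: inline storage for the common case (tensors with
// few dimensions), falling back to the heap only for unusually high ranks.
template <typename T, std::size_t InlineCapacity>
class SmallDimVector {
 public:
  void push_back(const T& v) {
    if (size_ < InlineCapacity) {
      inline_[size_++] = v;          // no allocation in the common case
    } else {
      if (heap_.empty()) {           // spill: copy inline elements once
        heap_.assign(inline_.begin(), inline_.begin() + size_);
      }
      heap_.push_back(v);
      ++size_;
    }
  }
  std::size_t size() const { return size_; }

 private:
  std::array<T, InlineCapacity> inline_{};
  std::vector<T> heap_;              // only used past InlineCapacity elements
  std::size_t size_ = 0;
};

// Parsing a typical sizes/strides argument (rank well under the inline
// capacity) never touches the allocator, unlike std::vector<FakeSymInt>.
int main() {
  SmallDimVector<FakeSymInt, 5> dims;
  for (int64_t d : {2, 3, 4}) {
    dims.push_back(FakeSymInt{d});
  }
  return dims.size() == 3 ? 0 : 1;
}
```

The real types carry more machinery (a SymInt destructor may need to release a symbolic node, which is the remaining destruction cost noted above), but the allocation-avoidance story is the same.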
Author: Scott Wolchok
Date: 2025-08-27 23:59:20 -07:00
377 changed files with 11768 additions and 5796 deletions

View File

@ -7,6 +7,15 @@ if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
fi
if [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0"
fi
# Compress the fatbin with -compress-mode=size for CUDA 13
if [[ "$DESIRED_CUDA" == *"13"* ]]; then
export TORCH_NVCC_FLAGS="-compress-mode=size"
fi
SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
source $SCRIPTPATH/aarch64_ci_setup.sh

View File

@ -77,21 +77,23 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
wheelname = os.path.basename(wheel_path)
os.mkdir(f"{folder}/tmp")
os.system(f"unzip {wheel_path} -d {folder}/tmp")
libs_to_copy = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
# Common libraries for all CUDA versions
common_libs = [
# Non-NVIDIA system libraries
"/lib64/libgomp.so.1",
"/usr/lib64/libgfortran.so.5",
"/acl/build/libarm_compute.so",
"/acl/build/libarm_compute_graph.so",
# Common CUDA libraries (same for all versions)
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_lapack_core.so.0",
"/usr/local/lib/libnvpl_blas_core.so.0",
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
"/usr/local/cuda/lib64/libcudnn.so.9",
"/usr/local/cuda/lib64/libcublas.so.12",
"/usr/local/cuda/lib64/libcublasLt.so.12",
"/usr/local/cuda/lib64/libcudart.so.12",
"/usr/local/cuda/lib64/libcufft.so.11",
"/usr/local/cuda/lib64/libcusparse.so.12",
"/usr/local/cuda/lib64/libcusparseLt.so.0",
"/usr/local/cuda/lib64/libcusolver.so.11",
"/usr/local/cuda/lib64/libcurand.so.10",
"/usr/local/cuda/lib64/libnccl.so.2",
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
@ -100,22 +102,41 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
"/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
"/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
"/lib64/libgomp.so.1",
"/usr/lib64/libgfortran.so.5",
"/acl/build/libarm_compute.so",
"/acl/build/libarm_compute_graph.so",
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_lapack_core.so.0",
"/usr/local/lib/libnvpl_blas_core.so.0",
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
"/usr/local/cuda/lib64/libcusparse.so.12",
]
if "129" in desired_cuda:
libs_to_copy += [
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9",
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
# CUDA version-specific libraries
if "130" in desired_cuda:
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
"/usr/local/cuda/lib64/libcublas.so.13",
"/usr/local/cuda/lib64/libcublasLt.so.13",
"/usr/local/cuda/lib64/libcudart.so.13",
"/usr/local/cuda/lib64/libcufft.so.12",
"/usr/local/cuda/lib64/libcusolver.so.12",
"/usr/local/cuda/lib64/libnvJitLink.so.13",
"/usr/local/cuda/lib64/libnvrtc.so.13",
"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0",
]
elif "12" in desired_cuda:
# Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
minor_version = desired_cuda[-1]
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
"/usr/local/cuda/lib64/libcublas.so.12",
"/usr/local/cuda/lib64/libcublasLt.so.12",
"/usr/local/cuda/lib64/libcudart.so.12",
"/usr/local/cuda/lib64/libcufft.so.11",
"/usr/local/cuda/lib64/libcusolver.so.11",
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
]
# Combine all libraries
libs_to_copy = common_libs + version_specific_libs
# Copy libraries to unzipped_folder/a/lib
for lib_path in libs_to_copy:

View File

@ -120,8 +120,8 @@ If your new Docker image needs a library installed from a specific pinned commit
If you're introducing a new argument to the Docker build, make sure to add it in the Docker build step in `.ci/docker/build.sh`:
```bash
docker build \
....
--build-arg "NEW_ARG_1=${NEW_ARG_1}"
....
--build-arg "NEW_ARG_1=${NEW_ARG_1}"
```
3. **Update Dockerfile logic**:

View File

@ -173,8 +173,8 @@ case "$tag" in
VISION=yes
ONNX=yes
;;
pytorch-linux-jammy-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
pytorch-linux-jammy-py3.10-clang12)
ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=12
VISION=yes
TRITON=yes
@ -209,15 +209,7 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
;;
pytorch-linux-jammy-xpu-2025.0-py3)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.0
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-xpu-2025.1-py3)
pytorch-linux-jammy-xpu-n-1-py3)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
@ -225,6 +217,14 @@ case "$tag" in
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-xpu-n-py3)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.2
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
@ -234,8 +234,8 @@ case "$tag" in
DOCS=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12)
ANACONDA_PYTHON_VERSION=3.10
CUDA_VERSION=12.8.1
CLANG_VERSION=12
VISION=yes
@ -246,8 +246,8 @@ case "$tag" in
CLANG_VERSION=18
VISION=yes
;;
pytorch-linux-jammy-py3.9-gcc11)
ANACONDA_PYTHON_VERSION=3.9
pytorch-linux-jammy-py3.10-gcc11)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
KATEX=yes

View File

@ -10,7 +10,7 @@ else
arch_path='sbsa'
fi
NVSHMEM_VERSION=3.3.20
NVSHMEM_VERSION=3.3.24
function install_cuda {
version=$1
@ -65,7 +65,7 @@ function install_nvshmem {
# This pattern is a lie as it is not consistent across versions, for 3.3.9 it was cuda_ver-arch-nvshhem-ver
filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive"
suffix=".tar.xz"
url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}${suffix}"
url="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/linux-${arch_path}/${filename}${suffix}"
# download, unpack, install
wget -q "${url}"
@ -148,7 +148,6 @@ function install_128 {
function install_130 {
CUDNN_VERSION=9.12.0.46
NVSHMEM_VERSION=3.3.20
echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
# install CUDA 13.0 in the same container
install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux

View File

@ -57,7 +57,7 @@ if [ ! -f setup.py ]; then
cd python
fi
pip_install pybind11==2.13.6
pip_install pybind11==3.0.1
# TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527
as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py

View File

@ -146,11 +146,11 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
XPU_DRIVER_VERSION="/lts/2350"
fi
# Default use Intel® oneAPI Deep Learning Essentials 2025.0
if [[ "$XPU_VERSION" == "2025.1" ]]; then
XPU_PACKAGES="intel-deep-learning-essentials-2025.1"
# Default use Intel® oneAPI Deep Learning Essentials 2025.1
if [[ "$XPU_VERSION" == "2025.2" ]]; then
XPU_PACKAGES="intel-deep-learning-essentials-2025.2"
else
XPU_PACKAGES="intel-deep-learning-essentials-2025.0"
XPU_PACKAGES="intel-deep-learning-essentials-2025.1"
fi
# The installation depends on the base OS

View File

@ -175,6 +175,6 @@ ENV XPU_DRIVER_TYPE ROLLING
RUN python3 -m pip install --upgrade pip && \
python3 -mpip install cmake==3.28.4
ADD ./common/install_xpu.sh install_xpu.sh
ENV XPU_VERSION 2025.1
ENV XPU_VERSION 2025.2
RUN bash ./install_xpu.sh && rm install_xpu.sh
RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd

View File

@ -379,7 +379,7 @@ dataclasses_json==0.6.7
cmake==4.0.0
#Description: required for building
tlparse==0.3.30
tlparse==0.4.0
#Description: required for log parsing
cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"

View File

@ -0,0 +1,143 @@
from __future__ import annotations
import logging
import os
import textwrap
from pathlib import Path
from typing import TYPE_CHECKING
from cli.lib.common.utils import get_wheels
from jinja2 import Template
if TYPE_CHECKING:
from collections.abc import Iterable, Mapping
logger = logging.getLogger(__name__)
_TPL_CONTENT = Template(
textwrap.dedent("""\
## {{ title }}
```{{ lang }}
{{ content }}
```
""")
)
_TPL_LIST_ITEMS = Template(
textwrap.dedent("""\
## {{ title }}
{% for it in items %}
- {{ it.pkg }}: {{ it.relpath }}
{% else %}
_(no item found)_
{% endfor %}
""")
)
_TPL_TABLE = Template(
textwrap.dedent("""\
{%- if rows %}
| {{ cols | join(' | ') }} |
|{%- for _ in cols %} --- |{%- endfor %}
{%- for r in rows %}
| {%- for c in cols %} {{ r.get(c, "") }} |{%- endfor %}
{%- endfor %}
{%- else %}
_(no data)_
{%- endif %}
""")
)
def gh_summary_path() -> Path | None:
"""Return the Path to the GitHub step summary file, or None if not set."""
p = os.environ.get("GITHUB_STEP_SUMMARY")
return Path(p) if p else None
def write_gh_step_summary(md: str, *, append_content: bool = True) -> bool:
"""
Write Markdown content to the GitHub Step Summary file if GITHUB_STEP_SUMMARY is set.
append_content: default true, if True, append to the end of the file, else overwrite the whole file
Returns:
True if written successfully (in GitHub Actions environment),
False if skipped (e.g., running locally where the variable is not set).
"""
sp = gh_summary_path()
if not sp:
logger.info("[gh-summary] GITHUB_STEP_SUMMARY not set, skipping write.")
return False
md_clean = textwrap.dedent(md).strip() + "\n"
mode = "a" if append_content else "w"
with sp.open(mode, encoding="utf-8") as f:
f.write(md_clean)
return True
def md_heading(text: str, level: int = 2) -> str:
"""Generate a Markdown heading string with the given level (1-6)."""
return f"{'#' * max(1, min(level, 6))} {text}\n"
def md_details(summary: str, content: str) -> str:
"""Generate a collapsible <details> block with a summary and inner content."""
return f"<details>\n<summary>{summary}</summary>\n\n{content}\n\n</details>\n"
def summarize_content_from_file(
output_dir: Path,
freeze_file: str,
title: str = "Content from file",
code_lang: str = "", # e.g. "text" or "ini"
) -> bool:
f = Path(output_dir) / freeze_file
if not f.exists():
return False
content = f.read_text(encoding="utf-8").strip()
md = render_content(content, title=title, lang=code_lang)
return write_gh_step_summary(md)
def summarize_wheels(path: Path, title: str = "Wheels", max_depth: int = 3):
items = get_wheels(path, max_depth=max_depth)
if not items:
return False
md = render_list(items, title=title)
return write_gh_step_summary(md)
def md_kv_table(rows: Iterable[Mapping[str, str | int | float]]) -> str:
"""
Render a list of dicts as a Markdown table using Jinja template.
"""
rows = list(rows)
cols = list({k for r in rows for k in r.keys()})
md = _TPL_TABLE.render(cols=cols, rows=rows).strip() + "\n"
return md
def render_list(
items: Iterable[str],
*,
title: str = "List",
) -> str:
tpl = _TPL_LIST_ITEMS
md = tpl.render(title=title, items=items)
return md
def render_content(
content: str,
*,
title: str = "Content",
lang: str = "text",
) -> str:
tpl = _TPL_CONTENT
md = tpl.render(title=title, content=content, lang=lang)
return md

View File

@ -45,7 +45,7 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules
# Checkout pinned commit
commit = get_post_build_pinned_commit(target)
logger.info("Checking out pinned commit %s", commit)
logger.info("Checking out pinned %s commit %s", target, commit)
r.git.checkout(commit)
# Update submodules if requested
@ -55,7 +55,7 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules
sm.update(init=True, recursive=True, progress=PrintProgress())
logger.info("Successfully cloned %s", target)
return r
return r, commit
except GitCommandError as e:
logger.error("Git operation failed: %s", e)

View File

@ -4,7 +4,7 @@ import shlex
import shutil
import sys
from collections.abc import Iterable
from importlib.metadata import PackageNotFoundError, version
from importlib.metadata import PackageNotFoundError, version # noqa: UP035
from typing import Optional, Union
from cli.lib.common.utils import run_command

View File

@ -8,6 +8,7 @@ import shlex
import subprocess
import sys
from contextlib import contextmanager
from pathlib import Path
from typing import Optional
@ -115,3 +116,24 @@ def working_directory(path: str):
yield
finally:
os.chdir(prev_cwd)
def get_wheels(
output_dir: Path,
max_depth: Optional[int] = None,
) -> list[str]:
"""Return a list of wheels found in the given output directory."""
root = Path(output_dir)
if not root.exists():
return []
items = []
for dirpath, _, filenames in os.walk(root):
depth = Path(dirpath).relative_to(root).parts
if max_depth is not None and len(depth) > max_depth:
continue
for fname in sorted(filenames):
if fname.endswith(".whl"):
pkg = fname.split("-")[0]
relpath = str((Path(dirpath) / fname).relative_to(root))
items.append({"pkg": pkg, "relpath": relpath})
return items

View File

@ -1,13 +1,27 @@
import logging
import os
import textwrap
from typing import Any
from cli.lib.common.gh_summary import write_gh_step_summary
from cli.lib.common.git_helper import clone_external_repo
from cli.lib.common.pip_helper import pip_install_packages
from cli.lib.common.utils import run_command, temp_environ, working_directory
from jinja2 import Template
logger = logging.getLogger(__name__)
_TPL_VLLM_INFO = Template(
textwrap.dedent("""\
## Vllm against Pytorch CI Test Summary
**Vllm Commit**: [{{ vllm_commit }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})
{%- if torch_sha %}
**Pytorch Commit**: [{{ torch_sha }}](https://github.com/pytorch/pytorch/commit/{{ torch_sha }})
{%- endif %}
""")
)
def sample_vllm_test_library():
"""
@ -214,12 +228,13 @@ def run_test_plan(
def clone_vllm(dst: str = "vllm"):
clone_external_repo(
_, commit = clone_external_repo(
target="vllm",
repo="https://github.com/vllm-project/vllm.git",
dst=dst,
update_submodules=True,
)
return commit
def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> str:
@ -230,3 +245,12 @@ def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) ->
for k in sorted(mapping, key=len, reverse=True):
step = step.replace(k, mapping[k])
return step
def summarize_build_info(vllm_commit: str) -> bool:
torch_sha = os.getenv("GITHUB_SHA")
md = (
_TPL_VLLM_INFO.render(vllm_commit=vllm_commit, torch_sha=torch_sha).strip()
+ "\n"
)
return write_gh_step_summary(md)

View File

@ -13,6 +13,11 @@ from cli.lib.common.envs_helper import (
env_str_field,
with_params_help,
)
from cli.lib.common.gh_summary import (
gh_summary_path,
summarize_content_from_file,
summarize_wheels,
)
from cli.lib.common.path_helper import (
copy,
ensure_dir_exists,
@ -21,7 +26,7 @@ from cli.lib.common.path_helper import (
is_path_exist,
)
from cli.lib.common.utils import run_command
from cli.lib.core.vllm.lib import clone_vllm
from cli.lib.core.vllm.lib import clone_vllm, summarize_build_info
logger = logging.getLogger(__name__)
@ -153,18 +158,43 @@ class VllmBuildRunner(BaseRunner):
"""
inputs = VllmBuildParameters()
logger.info("Running vllm build with inputs: %s", inputs)
clone_vllm()
vllm_commit = clone_vllm()
self.cp_dockerfile_if_exist(inputs)
# cp torch wheels from root direct to vllm workspace if exist
self.cp_torch_whls_if_exist(inputs)
ensure_dir_exists(inputs.output_dir)
# make sure the output dir to store the build artifacts exist
ensure_dir_exists(Path(inputs.output_dir))
cmd = self._generate_docker_build_cmd(inputs)
logger.info("Running docker build: \n %s", cmd)
run_command(cmd, cwd="vllm", env=os.environ.copy())
try:
run_command(cmd, cwd="vllm", env=os.environ.copy())
finally:
self.genearte_vllm_build_summary(vllm_commit, inputs)
def genearte_vllm_build_summary(
self, vllm_commit: str, inputs: VllmBuildParameters
):
if not gh_summary_path():
return logger.info("Skipping, not detect GH Summary env var....")
logger.info("Generate GH Summary ...")
# summarize vllm build info
summarize_build_info(vllm_commit)
# summarize vllm build artifacts
vllm_artifact_dir = inputs.output_dir / "wheels"
summarize_content_from_file(
vllm_artifact_dir,
"build_summary.txt",
title="Vllm build env pip package summary",
)
summarize_wheels(
inputs.torch_whls_path, max_depth=3, title="Torch Wheels Artifacts"
)
summarize_wheels(vllm_artifact_dir, max_depth=3, title="Vllm Wheels Artifacts")
def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str:
if not inputs.use_torch_whl:

View File

@ -220,6 +220,8 @@ def preprocess_test_in(
target_path = Path(target_file)
lines = target_path.read_text().splitlines()
pkgs_to_add = []
# Remove lines starting with the package names (==, @, >=) — case-insensitive
pattern = re.compile(rf"^({'|'.join(pkgs_to_remove)})\s*(==|@|>=)", re.IGNORECASE)
kept_lines = [line for line in lines if not pattern.match(line)]
@ -236,7 +238,11 @@ def preprocess_test_in(
]
# Write back: header_lines + blank + kept_lines
out = "\n".join(header_lines + [""] + kept_lines) + "\n"
out_lines = header_lines + [""] + kept_lines
if pkgs_to_add:
out_lines += [""] + pkgs_to_add
out = "\n".join(out_lines) + "\n"
target_path.write_text(out)
logger.info("[INFO] Updated %s", target_file)

View File

@ -300,24 +300,3 @@ except RuntimeError as e:
exit 1
fi
fi
###############################################################################
# Check for C++ ABI compatibility to GCC-11 - GCC 13
###############################################################################
if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then
pushd /tmp
# Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html
# gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19
# gcc 11 - CUDA 11.8, xpu, rocm
# gcc 13 - CUDA 12.6, 12.8 and cpu
# Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426
if [[ "$(uname -m)" == "s390x" ]]; then
cxx_abi="19"
elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then
cxx_abi="18"
else
cxx_abi="16"
fi
python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)"
popd
fi

View File

@ -149,13 +149,22 @@ function get_pinned_commit() {
cat .github/ci_commit_pins/"${1}".txt
}
function detect_cuda_arch() {
if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
if command -v nvidia-smi; then
TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1)
elif [[ "${TEST_CONFIG}" == *nogpu* ]]; then
# There won't be nvidia-smi in nogpu tests, so just set TORCH_CUDA_ARCH_LIST to the default
# minimum supported value here
TORCH_CUDA_ARCH_LIST=8.0
fi
export TORCH_CUDA_ARCH_LIST
fi
}
function install_torchaudio() {
local commit
commit=$(get_pinned_commit audio)
if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]] && command -v nvidia-smi; then
TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1)
export TORCH_CUDA_ARCH_LIST
fi
pip_build_and_install "git+https://github.com/pytorch/audio.git@${commit}" dist/audio
}

View File

@ -45,6 +45,7 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then
# DTensor tests
time python test/run_test.py --verbose -i distributed/tensor/test_random_ops
time python test/run_test.py --verbose -i distributed/tensor/test_dtensor_compile
time python test/run_test.py --verbose -i distributed/tensor/test_utils.py
# DeviceMesh test
time python test/run_test.py --verbose -i distributed/test_device_mesh

View File

@ -91,6 +91,7 @@ if [[ "$BUILD_ENVIRONMENT" == *clang9* || "$BUILD_ENVIRONMENT" == *xpu* ]]; then
export VALGRIND=OFF
fi
detect_cuda_arch
if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then
# There are additional warnings on s390x, maybe due to newer gcc.
@ -1630,11 +1631,7 @@ elif [[ "${TEST_CONFIG}" == *xla* ]]; then
build_xla
test_xla
elif [[ "$TEST_CONFIG" == *vllm* ]]; then
if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1)
export TORCH_CUDA_ARCH_LIST
fi
echo "VLLM CI TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST"
echo "vLLM CI uses TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST"
(cd .ci/lumen_cli && python -m pip install -e .)
python -m cli.run test external vllm --test-plan "$TEST_CONFIG" --shard-id "$SHARD_NUMBER" --num-shards "$NUM_TEST_SHARDS"
elif [[ "${TEST_CONFIG}" == *executorch* ]]; then

View File

@ -44,7 +44,7 @@ python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==
python -m pip install z3-solver==4.15.1.0
# Install tlparse for test\dynamo\test_structured_trace.py UTs.
python -m pip install tlparse==0.3.30
python -m pip install tlparse==0.4.0
# Install parameterized
python -m pip install parameterized==0.8.1

View File

@ -13,9 +13,9 @@ if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build"
:xpu_bundle_install_start
set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product
set XPU_BUNDLE_VERSION=2025.0.1+20
set XPU_BUNDLE_VERSION=2025.1.3+5
set XPU_BUNDLE_INSTALLED=0
set XPU_BUNDLE_UNINSTALL=0
set XPU_EXTRA_URL=NULL
@ -24,9 +24,9 @@ set XPU_EXTRA_VERSION=2025.0.1+1226
set XPU_EXTRA_INSTALLED=0
set XPU_EXTRA_UNINSTALL=0
if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.1] (
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
set XPU_BUNDLE_VERSION=2025.1.3+5
if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.2] (
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
set XPU_BUNDLE_VERSION=2025.2.1+20
)
:: Check if XPU bundle is target version or already installed
@ -90,14 +90,3 @@ if errorlevel 1 exit /b 1
del xpu_extra.exe
:xpu_install_end
if not "%XPU_ENABLE_KINETO%"=="1" goto install_end
:: Install Level Zero SDK
set XPU_EXTRA_LZ_URL=https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip
curl -k -L %XPU_EXTRA_LZ_URL% --output "%SRC_DIR%\temp_build\level_zero_sdk.zip"
echo "Installing level zero SDK..."
7z x "%SRC_DIR%\temp_build\level_zero_sdk.zip" -o"%SRC_DIR%\temp_build\level_zero"
set "INCLUDE=%SRC_DIR%\temp_build\level_zero\include;%INCLUDE%"
del "%SRC_DIR%\temp_build\level_zero_sdk.zip"
:install_end

View File

@ -15,8 +15,7 @@ fi
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
export VC_YEAR=2022
export USE_SCCACHE=0
export XPU_VERSION=2025.1
export XPU_ENABLE_KINETO=1
export XPU_VERSION=2025.2
fi
echo "Free space on filesystem before build:"

View File

@ -8,7 +8,7 @@ export VC_YEAR=2022
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
export VC_YEAR=2022
export XPU_VERSION=2025.1
export XPU_VERSION=2025.2
fi
pushd "$PYTORCH_ROOT/.ci/pytorch/"

View File

@ -48,6 +48,7 @@ runs:
BASE_IMAGE: ${{ inputs.docker-image }}
BUILD_TARGETS: ${{ inputs.build-targets }}
PARENT_OUTPUT_DIR: ${{ inputs.output-dir}}
shell: bash
run: |
set -euo pipefail

View File

@ -1 +1 @@
add1adfec742dfb13e614dab3372b5aafd1ff046
321938e9ac4000e0cb37e328359a7fd3026bc672

View File

@ -176,6 +176,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Build can take a long time, and the torch nightly version fetched from url can be different in next docker stage.
# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
RUN cat torch_build_versions.txt
RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio'
@ -358,7 +359,7 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Build flashinfer for torch nightly from source around 10 mins
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
ARG FLASHINFER_GIT_REF="v0.2.9rc2"
ARG FLASHINFER_GIT_REF="v0.2.14.post1"
RUN --mount=type=cache,target=/root/.cache/uv \
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \
@ -376,6 +377,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Logging to confirm the torch versions
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio\|^xformers\|^vllm\|^flashinfer' > build_summary.txt
################### VLLM INSTALLED IMAGE ####################
@ -433,4 +435,5 @@ FROM scratch as export-wheels
# Just copy the wheels we prepared in previous stages
COPY --from=base /workspace/xformers-dist /wheels/xformers
COPY --from=build /workspace/vllm-dist /wheels/vllm
COPY --from=vllm-base /workspace/build_summary.txt /wheels/build_summary.txt
COPY --from=vllm-base /workspace/wheels/flashinfer /wheels/flashinfer-python

.github/labeler.yml
View File

@ -41,6 +41,9 @@
- test/inductor/**
- test/dynamo/**
"ciflow/vllm":
- .github/ci_commit_pins/vllm.txt
"module: cpu":
- aten/src/ATen/cpu/**
- aten/src/ATen/native/cpu/**

View File

@ -28,7 +28,7 @@ pyyaml==6.0.2
scipy==1.12.0
setuptools==72.1.0
sympy==1.13.3
tlparse==0.3.30
tlparse==0.4.0
tensorboard==2.13.0
typing-extensions==4.12.2
unittest-xml-reporting<=3.2.0,>=2.0.0

View File

@ -40,7 +40,7 @@ CPU_AARCH64_ARCH = ["cpu-aarch64"]
CPU_S390X_ARCH = ["cpu-s390x"]
CUDA_AARCH64_ARCHES = ["12.9-aarch64"]
CUDA_AARCH64_ARCHES = ["12.9-aarch64", "13.0-aarch64"]
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
@ -107,32 +107,32 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"xpu": (
"intel-cmplr-lib-rt==2025.1.1 | "
"intel-cmplr-lib-ur==2025.1.1 | "
"intel-cmplr-lic-rt==2025.1.1 | "
"intel-sycl-rt==2025.1.1 | "
"oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"onemkl-sycl-blas==2025.1.0 | "
"onemkl-sycl-dft==2025.1.0 | "
"onemkl-sycl-lapack==2025.1.0 | "
"onemkl-sycl-rng==2025.1.0 | "
"onemkl-sycl-sparse==2025.1.0 | "
"dpcpp-cpp-rt==2025.1.1 | "
"intel-opencl-rt==2025.1.1 | "
"mkl==2025.1.0 | "
"intel-openmp==2025.1.1 | "
"tbb==2022.1.0 | "
"tcmlib==1.3.0 | "
"umf==0.10.0 | "
"intel-pti==0.12.3"
"intel-cmplr-lib-rt==2025.2.1 | "
"intel-cmplr-lib-ur==2025.2.1 | "
"intel-cmplr-lic-rt==2025.2.1 | "
"intel-sycl-rt==2025.2.1 | "
"oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"onemkl-sycl-blas==2025.2.0 | "
"onemkl-sycl-dft==2025.2.0 | "
"onemkl-sycl-lapack==2025.2.0 | "
"onemkl-sycl-rng==2025.2.0 | "
"onemkl-sycl-sparse==2025.2.0 | "
"dpcpp-cpp-rt==2025.2.1 | "
"intel-opencl-rt==2025.2.1 | "
"mkl==2025.2.0 | "
"intel-openmp==2025.2.1 | "
"tbb==2022.2.0 | "
"tcmlib==1.4.0 | "
"umf==0.11.0 | "
"intel-pti==0.13.1"
),
}
@ -210,7 +210,7 @@ LIBTORCH_CONTAINER_IMAGES: dict[str, str] = {
"cpu": "libtorch-cxx11-builder:cpu",
}
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
FULL_PYTHON_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:

View File

@ -124,7 +124,7 @@ def mock_parse_args(revert: bool = False, force: bool = False) -> Any:
self.force = force
self.pr_num = 76123
self.dry_run = True
self.comment_id = 0
self.comment_id = 12345 # Set to non-zero value
self.reason = "this is for testing"
self.ignore_current = False
self.check_mergeability = False
@ -152,9 +152,9 @@ def mock_revert(
def mock_merge(
pr: GitHubPR,
repo: GitRepo,
comment_id: int,
dry_run: bool = False,
skip_mandatory_checks: bool = False,
comment_id: Optional[int] = None,
timeout_minutes: int = 400,
stale_pr_days: int = 3,
ignore_current: bool = False,
@ -470,9 +470,9 @@ class TestTryMerge(TestCase):
mock_merge.assert_called_once_with(
mock.ANY,
mock.ANY,
comment_id=mock.ANY,
dry_run=mock.ANY,
skip_mandatory_checks=True,
comment_id=mock.ANY,
ignore_current=False,
)
@ -485,9 +485,9 @@ class TestTryMerge(TestCase):
mock_merge.assert_called_once_with(
mock.ANY,
mock.ANY,
comment_id=mock.ANY,
dry_run=mock.ANY,
skip_mandatory_checks=False,
comment_id=mock.ANY,
ignore_current=False,
)

View File

@ -737,16 +737,24 @@ class GitHubPR:
def last_commit(self) -> Any:
return self.info["commits"]["nodes"][-1]["commit"]
def last_commit_sha(self, default: Optional[str] = None) -> str:
# for commits, the oid is the sha
if default is None:
return str(self.last_commit()["oid"])
return str(self.last_commit().get("oid", default))
def get_merge_base(self) -> str:
if self.merge_base:
return self.merge_base
last_commit_oid = self.last_commit()["oid"]
last_commit_sha = self.last_commit_sha()
# NB: We could use self.base_ref() here for regular PR, however, that doesn't
# work for ghstack where the base is the custom branch, i.e. gh/USER/ID/base,
# so let's just use main instead
self.merge_base = gh_fetch_merge_base(
self.org, self.project, last_commit_oid, self.default_branch()
self.org, self.project, last_commit_sha, self.default_branch()
)
# Fallback to baseRefOid if the API call fails, i.e. rate limit. Note that baseRefOid
@ -1151,7 +1159,7 @@ class GitHubPR:
*,
skip_mandatory_checks: bool = False,
dry_run: bool = False,
comment_id: Optional[int] = None,
comment_id: int,
ignore_current_checks: Optional[list[str]] = None,
) -> None:
# Raises exception if matching rule is not found
@ -1167,7 +1175,7 @@ class GitHubPR:
skip_internal_checks=can_skip_internal_checks(self, comment_id),
ignore_current_checks=ignore_current_checks,
)
additional_merged_prs = self.merge_changes(
additional_merged_prs = self.merge_changes_locally(
repo, skip_mandatory_checks, comment_id
)
@ -1196,7 +1204,7 @@ class GitHubPR:
broken_trunk_checks=ignorable_checks.get("BROKEN_TRUNK", []),
flaky_checks=ignorable_checks.get("FLAKY", []),
unstable_checks=ignorable_checks.get("UNSTABLE", []),
last_commit_sha=self.last_commit().get("oid", ""),
last_commit_sha=self.last_commit_sha(default=""),
merge_base_sha=self.get_merge_base(),
merge_commit_sha=merge_commit_sha,
is_failed=False,
@ -1217,7 +1225,7 @@ class GitHubPR:
dry_run=dry_run,
)
def merge_changes(
def merge_changes_locally(
self,
repo: GitRepo,
skip_mandatory_checks: bool = False,
@ -1231,22 +1239,7 @@ class GitHubPR:
branch_to_merge_into = self.default_branch() if branch is None else branch
if repo.current_branch() != branch_to_merge_into:
repo.checkout(branch_to_merge_into)
if not self.is_ghstack_pr():
msg = self.gen_commit_message()
pr_branch_name = f"__pull-request-{self.pr_num}__init__"
repo.fetch(self.last_commit()["oid"], pr_branch_name)
repo._run_git("merge", "--squash", pr_branch_name)
repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
# Did the PR change since we started the merge?
pulled_sha = repo.show_ref(pr_branch_name)
latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
if pulled_sha != latest_pr_status.last_commit()["oid"]:
raise RuntimeError(
"PR has been updated since CI checks last passed. Please rerun the merge command."
)
return []
else:
if self.is_ghstack_pr():
return self.merge_ghstack_into(
repo,
skip_mandatory_checks,
@ -1254,6 +1247,21 @@ class GitHubPR:
skip_all_rule_checks=skip_all_rule_checks,
)
msg = self.gen_commit_message()
pr_branch_name = f"__pull-request-{self.pr_num}__init__"
repo.fetch(self.last_commit_sha(), pr_branch_name)
repo._run_git("merge", "--squash", pr_branch_name)
repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
# Did the PR change since we started the merge?
pulled_sha = repo.show_ref(pr_branch_name)
latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
if pulled_sha != latest_pr_status.last_commit_sha():
raise RuntimeError(
"PR has been updated since CI checks last passed. Please rerun the merge command."
)
return []
class MergeRuleFailedError(RuntimeError):
def __init__(self, message: str, rule: Optional["MergeRule"] = None) -> None:
@ -1458,7 +1466,7 @@ def find_matching_merge_rule(
pending_checks = []
failed_checks = []
hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit()['oid']}"
hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit_sha()}"
if len(failed_checks) > 0:
if reject_reason_score < 30000:
reject_reason_score = 30000
@ -2156,14 +2164,14 @@ def categorize_checks(
def merge(
pr: GitHubPR,
repo: GitRepo,
comment_id: int,
dry_run: bool = False,
skip_mandatory_checks: bool = False,
comment_id: Optional[int] = None,
timeout_minutes: int = 400,
stale_pr_days: int = 3,
ignore_current: bool = False,
) -> None:
initial_commit_sha = pr.last_commit()["oid"]
initial_commit_sha = pr.last_commit_sha()
pr_link = f"https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num}"
print(f"Attempting merge of {initial_commit_sha} ({pr_link})")
@ -2234,7 +2242,7 @@ def merge(
f"Attempting merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} ({elapsed_time / 60} minutes elapsed)"
)
pr = GitHubPR(pr.org, pr.project, pr.pr_num)
if initial_commit_sha != pr.last_commit()["oid"]:
if initial_commit_sha != pr.last_commit_sha():
raise RuntimeError(
"New commits were pushed while merging. Please rerun the merge command."
)
@ -2401,7 +2409,7 @@ def main() -> None:
if args.check_mergeability:
if pr.is_ghstack_pr():
get_ghstack_prs(repo, pr) # raises error if out of sync
pr.merge_changes(
pr.merge_changes_locally(
repo,
skip_mandatory_checks=True,
skip_all_rule_checks=True,
@ -2416,12 +2424,18 @@ def main() -> None:
gh_post_pr_comment(org, project, args.pr_num, message, dry_run=args.dry_run)
return
try:
# Ensure comment id is set, else fail
if not args.comment_id:
raise ValueError(
"Comment ID is required for merging PRs, please provide it using --comment-id"
)
merge(
pr,
repo,
comment_id=args.comment_id,
dry_run=args.dry_run,
skip_mandatory_checks=args.force,
comment_id=args.comment_id,
ignore_current=args.ignore_current,
)
except Exception as e:
@ -2443,7 +2457,7 @@ def main() -> None:
broken_trunk_checks=[],
flaky_checks=[],
unstable_checks=[],
last_commit_sha=pr.last_commit().get("oid", ""),
last_commit_sha=pr.last_commit_sha(default=""),
merge_base_sha=pr.get_merge_base(),
is_failed=True,
skip_mandatory_checks=args.force,

View File

@ -4,7 +4,7 @@
{%- set download_artifact_action = "actions/download-artifact@v4.1.7" -%}
{%- set timeout_minutes = 240 -%}
{%- set timeout_minutes_windows_binary = 300 -%}
{%- set timeout_minutes_windows_binary = 360 -%}
{%- macro concurrency(build_environment) -%}
concurrency:

View File

@ -13,6 +13,7 @@ jobs:
if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-url-lint') }}
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
job-name: lint-urls
timeout: 120
runner: ${{ inputs.runner }}linux.2xlarge
docker-image: ci-image:pytorch-linux-jammy-linter
@ -38,6 +39,7 @@ jobs:
if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-xref-lint') }}
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
job-name: lint-xrefs
timeout: 60
runner: ${{ inputs.runner }}linux.2xlarge
docker-image: ci-image:pytorch-linux-jammy-linter

View File

@ -409,7 +409,7 @@ jobs:
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
- name: Authenticate with AWS
if: ${{ contains(matrix.runner, 'b200') }}
if: ${{ always() && contains(matrix.runner, 'b200') }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results

View File

@ -145,7 +145,7 @@ jobs:
fi
docker exec -t "${container_name}" yum install -y zlib-devel zip
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==2.13.1 auditwheel wheel
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==3.0.1 auditwheel wheel
set +e
docker exec -t "${container_name}" command -v pip
has_pip=$?

View File

@ -56,18 +56,18 @@ jobs:
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.9-clang12,
pytorch-linux-jammy-py3.10-clang12,
pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-rocm-n-py3,
pytorch-linux-noble-rocm-n-py3,
pytorch-linux-noble-rocm-alpha-py3,
pytorch-linux-jammy-rocm-n-py3-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
pytorch-linux-jammy-py3.9-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12,
pytorch-linux-jammy-py3.10-gcc11,
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,
pytorch-linux-jammy-py3.12-halide,
pytorch-linux-jammy-xpu-2025.0-py3,
pytorch-linux-jammy-xpu-2025.1-py3,
pytorch-linux-jammy-xpu-n-1-py3,
pytorch-linux-jammy-xpu-n-py3,
pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
@ -124,7 +124,7 @@ jobs:
GHCR_PAT: ${{ secrets.GHCR_PAT }}
with:
shell: bash
timeout_minutes: 30
timeout_minutes: 60
max_attempts: 5
retry_wait_seconds: 90
command: |

View File

@ -47,117 +47,6 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_9-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-aarch64-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cpu-aarch64-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-aarch64-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_9-cpu-aarch64-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-aarch64
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_9-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_9-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -269,6 +158,52 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -380,6 +315,52 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -491,6 +472,52 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -602,6 +629,52 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -713,6 +786,52 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -824,6 +943,52 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -934,3 +1099,49 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@@ -47,664 +47,6 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_9-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cpu
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cpu
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cpu-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cpu
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_9-cpu-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cpu
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_9-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: "12.6"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_6-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: "12.6"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_9-cuda12_6-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: "12.6"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_6
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_9-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: "12.8"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_8-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: "12.8"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_9-cuda12_8-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: "12.8"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_8
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_9-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_9-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_9-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda13_0-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda13_0-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda13_0
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_9-cuda13_0-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_9-rocm6_3-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.3
GPU_ARCH_VERSION: "6.3"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-rocm6_3
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-rocm6_3-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.3
GPU_ARCH_VERSION: "6.3"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
DESIRED_PYTHON: "3.9"
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: manywheel-py3_9-rocm6_3
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: manylinux2_28-builder
custom-tag-prefix: rocm6.3
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
manywheel-py3_9-rocm6_3-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_9-rocm6_3-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.3
GPU_ARCH_VERSION: "6.3"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-rocm6_3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_9-rocm6_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: "6.4"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-rocm6_4
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-rocm6_4-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: "6.4"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.9"
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: manywheel-py3_9-rocm6_4
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: manylinux2_28-builder
custom-tag-prefix: rocm6.4
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
manywheel-py3_9-rocm6_4-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_9-rocm6_4-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: "6.4"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-rocm6_4
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_9-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: xpu
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-xpu-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-xpu-build
- get-label-type
runs-on: linux.idc.xpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: xpu
DESIRED_PYTHON: "3.9"
permissions:
id-token: write
contents: read
steps:
- name: Setup XPU
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: manywheel-py3_9-xpu
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: manylinux2_28-builder
custom-tag-prefix: xpu
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown XPU
uses: ./.github/actions/teardown-xpu
manywheel-py3_9-xpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_9-xpu-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: xpu
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-xpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -983,7 +325,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda13_0-test: # Testing
@@ -1270,7 +612,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-xpu-test: # Testing
@@ -1641,7 +983,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda13_0-test: # Testing
@@ -1928,7 +1270,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-xpu-test: # Testing
@@ -2299,7 +1641,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda13_0-test: # Testing
@@ -2586,7 +1928,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-xpu-test: # Testing
@@ -2957,7 +2299,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda13_0-test: # Testing
@@ -3244,7 +2586,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-xpu-test: # Testing
@@ -3615,7 +2957,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda13_0-test: # Testing
@ -3902,7 +3244,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-xpu-test: # Testing
@ -4273,7 +3615,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda13_0-test: # Testing
@ -4560,7 +3902,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-xpu-test: # Testing
@ -4931,7 +4273,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda13_0-test: # Testing
@ -5218,7 +4560,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-xpu-test: # Testing

View File

@ -47,70 +47,6 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_9-cpu-s390x-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
DESIRED_PYTHON: "3.9"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
timeout-minutes: 420
build_name: manywheel-py3_9-cpu-s390x
build_environment: linux-s390x-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-s390x-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cpu-s390x-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
build_environment: linux-s390x-binary-manywheel
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-s390x-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_9-cpu-s390x-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cpu-s390x-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml

View File

@ -30,151 +30,6 @@ concurrency:
cancel-in-progress: true
jobs:
wheel-py3_9-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
# shellcheck disable=SC2129
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
# Build
USE_PYTORCH_METAL_EXPORT=1
USE_COREML_DELEGATE=1
TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}"
export USE_PYTORCH_METAL_EXPORT
export USE_COREML_DELEGATE
export TORCH_PACKAGE_NAME
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
python "${PYTORCH_ROOT}/.ci/pytorch/smoke_test/smoke_test.py" --package torchonly ${SMOKE_TEST_PARAMS}
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: wheel-py3_9-cpu
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
wheel-py3_9-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: wheel-py3_9-cpu-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cpu
DESIRED_PYTHON: "3.9"
build_name: wheel-py3_9-cpu
use_s3: False
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
wheel-py3_10-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge

View File

@ -51,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -128,7 +128,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -51,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -128,7 +128,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -51,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -124,7 +124,7 @@ jobs:
- wheel-py3_11-cpu-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -198,7 +198,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -271,7 +271,7 @@ jobs:
- wheel-py3_12-cpu-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -345,7 +345,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -418,7 +418,7 @@ jobs:
- wheel-py3_13-cpu-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel

View File

@ -38,7 +38,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -153,7 +153,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -45,7 +45,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -160,7 +160,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -292,7 +292,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -408,7 +408,7 @@ jobs:
- libtorch-cuda12_6-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -542,7 +542,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -658,7 +658,7 @@ jobs:
- libtorch-cuda12_8-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -792,7 +792,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -908,7 +908,7 @@ jobs:
- libtorch-cuda12_9-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -38,7 +38,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -153,7 +153,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -45,7 +45,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -160,7 +160,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -292,7 +292,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -408,7 +408,7 @@ jobs:
- libtorch-cuda12_6-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -542,7 +542,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -658,7 +658,7 @@ jobs:
- libtorch-cuda12_8-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -792,7 +792,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -908,7 +908,7 @@ jobs:
- libtorch-cuda12_9-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

File diff suppressed because it is too large

View File

@ -165,6 +165,9 @@ jobs:
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
# The pull_request trigger is used on PRs that bump the transformers pin; those always
# need one round of benchmarks
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}

View File

@ -42,8 +42,8 @@ jobs:
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
build-environment: linux-jammy-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
secrets: inherit
docs-push:

View File

@ -49,14 +49,14 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
linux-jammy-py3_9-gcc11-build:
name: linux-jammy-py3.9-gcc11
linux-jammy-py3_10-gcc11-build:
name: linux-jammy-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
build-environment: linux-jammy-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
@ -73,49 +73,49 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_9-gcc11-test:
name: linux-jammy-py3.9-gcc11
linux-jammy-py3_10-gcc11-test:
name: linux-jammy-py3.10-gcc11
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_9-gcc11-build
- linux-jammy-py3_10-gcc11-build
- target-determination
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-docs:
name: linux-docs
uses: ./.github/workflows/_docs.yml
needs: linux-jammy-py3_9-gcc11-build
needs: linux-jammy-py3_10-gcc11-build
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }}
build-environment: linux-jammy-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }}
secrets: inherit
linux-jammy-py3_9-gcc11-no-ops:
name: linux-jammy-py3.9-gcc11-no-ops
linux-jammy-py3_10-gcc11-no-ops:
name: linux-jammy-py3.10-gcc11-no-ops
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-no-ops
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
build-environment: linux-jammy-py3.10-gcc11-no-ops
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
]}
secrets: inherit
linux-jammy-py3_9-gcc11-pch:
name: linux-jammy-py3.9-gcc11-pch
linux-jammy-py3_10-gcc11-pch:
name: linux-jammy-py3.10-gcc11-pch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-pch
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
build-environment: linux-jammy-py3.10-gcc11-pch
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
@ -183,14 +183,14 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_9-clang12-build:
name: linux-jammy-py3.9-clang12
linux-jammy-py3_10-clang12-build:
name: linux-jammy-py3.10-clang12
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12
build-environment: linux-jammy-py3.10-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
@ -207,16 +207,16 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_9-clang12-test:
name: linux-jammy-py3.9-clang12
linux-jammy-py3_10-clang12-test:
name: linux-jammy-py3.10-clang12
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_9-clang12-build
- linux-jammy-py3_10-clang12-build
- target-determination
with:
build-environment: linux-jammy-py3.9-clang12
docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.10-clang12
docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_13-clang12-build:
@ -253,14 +253,14 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build:
name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12
linux-jammy-cuda12_8-cudnn9-py3_10-clang12-build:
name: linux-jammy-cuda12.8-cudnn9-py3.10-clang12
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-cudnn9-py3.9-clang12
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12
build-environment: linux-jammy-cuda12.8-cudnn9-py3.10-clang12
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
@ -282,14 +282,14 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_9-gcc11-mobile-lightweight-dispatch-build:
name: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build
linux-jammy-py3_10-gcc11-mobile-lightweight-dispatch-build:
name: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
build-environment: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-generates-artifacts: false
test-matrix: |
{ include: [
@ -342,15 +342,40 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-xpu-2025_1-py3_9-build:
name: linux-jammy-xpu-2025.1-py3.9
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-2025-1-build
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-xpu-n-py3_9-build:
name: linux-jammy-xpu-n-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-n-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-2025.1-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3
build-environment: linux-jammy-xpu-n-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" },

View File

@ -78,14 +78,14 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_9-clang12-build:
name: linux-jammy-py3.9-clang12
linux-jammy-py3_10-clang12-build:
name: linux-jammy-py3.10-clang12
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12
build-environment: linux-jammy-py3.10-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
@ -93,16 +93,16 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_9-clang12-test:
name: linux-jammy-py3.9-clang12
linux-jammy-py3_10-clang12-test:
name: linux-jammy-py3.10-clang12
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_9-clang12-build
- linux-jammy-py3_10-clang12-build
- target-determination
with:
build-environment: linux-jammy-py3.9-clang12
docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.10-clang12
docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-build:

View File

@ -30,7 +30,7 @@ jobs:
name: Test check_binary.sh for Linux CUDA
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.4xlarge.nvidia.gpu
runner: linux.g4dn.4xlarge.nvidia.gpu
docker-image: python:3.11
docker-build-dir: "skip-docker-build"
script: |

View File

@ -59,22 +59,19 @@ jobs:
# on the PR appear in chronological order (timing issues can shuffle them around)
sleep 60
fi
# Require a comment id for merge operations
if [ -z "${COMMENT_ID}" ]; then
echo "Error: merge requires COMMENT_ID to be specified"
exit 1
fi
if [ -n "${FORCE}" ]; then
if [ -n "${COMMENT_ID}" ]; then
python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
python3 .github/scripts/trymerge.py --force "${PR_NUM}"
fi
python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}"
elif [ -n "${IGNORE_CURRENT}" ]; then
if [ -n "${COMMENT_ID}" ]; then
python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
python3 .github/scripts/trymerge.py --ignore-current "${PR_NUM}"
fi
elif [ -n "${COMMENT_ID}" ]; then
python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}"
python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
python3 .github/scripts/trymerge.py "${PR_NUM}"
python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}"
fi
- name: Comment on Canceled
if: ${{ cancelled() && steps.checkout.outcome == 'success' }}

View File

@ -6,8 +6,7 @@ on:
- ciflow/vllm/*
workflow_dispatch:
schedule:
# Every 12 hours starting at 00:00 UTC (00:00 and 12:00)
- cron: '0 0,12 * * *'
- cron: '0 */8 * * *' # every 8 hours at minute 0 (UTC)
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}

View File

@ -4,6 +4,9 @@ on:
push:
tags:
- ciflow/win-arm64/*
schedule:
# Every 4 hours starting at 00:00 UTC
- cron: '0 */4 * * *'
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

View File

@ -26,15 +26,15 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-xpu-2025_0-py3_9-build:
name: linux-jammy-xpu-2025.0-py3.9
linux-jammy-xpu-n-1-py3_9-build:
name: linux-jammy-xpu-n-1-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-2025-0-build
sync-tag: linux-xpu-n-1-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-2025.0-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.0-py3
build-environment: linux-jammy-xpu-n-1-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3
runner: linux.12xlarge
test-matrix: |
{ include: [
@ -47,60 +47,62 @@ jobs:
]}
secrets: inherit
linux-jammy-xpu-2025_1-py3_9-build:
name: linux-jammy-xpu-2025.1-py3.9
linux-jammy-xpu-n-py3_9-build:
name: linux-jammy-xpu-n-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-2025-1-build
sync-tag: linux-xpu-n-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-2025.1-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3
build-environment: linux-jammy-xpu-n-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
runner: linux.12xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
]}
secrets: inherit
linux-jammy-xpu-2025_1-py3_9-test:
name: linux-jammy-xpu-2025.1-py3.9
linux-jammy-xpu-n-py3_9-test:
name: linux-jammy-xpu-n-py3.9
uses: ./.github/workflows/_xpu-test.yml
needs: linux-jammy-xpu-2025_1-py3_9-build
needs: linux-jammy-xpu-n-py3_9-build
permissions:
id-token: write
contents: read
with:
build-environment: linux-jammy-xpu-2025.1-py3.9
docker-image: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.test-matrix }}
build-environment: linux-jammy-xpu-n-py3.9
docker-image: ${{ needs.linux-jammy-xpu-n-py3_9-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-xpu-n-py3_9-build.outputs.test-matrix }}
secrets: inherit
windows-xpu-2025_0-build:
windows-xpu-n-1-build:
if: github.repository_owner == 'pytorch'
name: win-vs2022-xpu-2025_0-py3
name: win-vs2022-xpu-n-1-py3
uses: ./.github/workflows/_win-build.yml
with:
build-environment: win-vs2022-xpu-py3
cuda-version: cpu
use-xpu: true
xpu-version: '2025.0'
vc-year: '2022'
secrets: inherit
windows-xpu-2025_1-build:
if: github.repository_owner == 'pytorch'
name: win-vs2022-xpu-2025_1-py3
uses: ./.github/workflows/_win-build.yml
with:
build-environment: win-vs2022-xpu-py3
build-environment: win-vs2022-xpu-n-1-py3
cuda-version: cpu
use-xpu: true
xpu-version: '2025.1'
vc-year: '2022'
secrets: inherit
windows-xpu-n-build:
if: github.repository_owner == 'pytorch'
name: win-vs2022-xpu-n-py3
uses: ./.github/workflows/_win-build.yml
with:
build-environment: win-vs2022-xpu-n-py3
cuda-version: cpu
use-xpu: true
xpu-version: '2025.2'
vc-year: '2022'
secrets: inherit

View File

@ -583,7 +583,7 @@ exclude_patterns = [
command = [
'python3',
'tools/linter/adapters/grep_linter.py',
'--pattern=#include <pybind11\/(^|[^(gil\.h)])',
'--pattern=#include <pybind11\/(^|[^(gil_simple\.h)])',
'--allowlist-pattern=#include <torch\/csrc\/utils\/pybind.h>',
'--linter-name=PYBIND11_INCLUDE',
'--match-first-only',

View File

@ -747,6 +747,7 @@ cc_library(
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp",
"torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp",
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
],
)) + torch_sources,

View File

@ -272,7 +272,7 @@ cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL"
OFF)
cmake_dependent_option(USE_NVSHMEM "Use NVSHMEM" ON
"USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
option(USE_NNAPI "Use NNAPI" OFF)
option(USE_NNPACK "Use NNPACK" ON)
cmake_dependent_option(USE_NUMA "Use NUMA. Only available on Linux." ON "LINUX"

View File

@ -1,5 +1,18 @@
#pragma once
// See https://github.com/pytorch/pytorch/issues/161660
// This compile flag is intended to be passed in to CppExtensions that rely on
// the stable ABI via the `extra_compile_args` argument. This is a stopgap
// solution to ensure that non-stable libtorch APIs are not used in the extension.
// The long term solution is to have a torch_stable target that excludes headers
// that are not in torch/stable or torch/headeronly.
// See test/cpp_extensions/torch_stable_test_extension/setup.py for an example
// of how this is used.
#ifdef TORCH_STABLE_ONLY
#error \
"TensorBase.h should not be included when TORCH_STABLE_ONLY compile flag is passed"
#endif
#include <c10/core/Device.h>
#include <c10/core/Layout.h>
#include <c10/core/MemoryFormat.h>
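A minimal sketch of how the new guard is meant to fire, assuming an out-of-tree extension translation unit built with the define the comment above describes (the file name is made up; only the define and the #error come from this diff):

```cpp
// my_stable_ext.cpp -- hypothetical extension TU, built with
//   extra_compile_args=["-DTORCH_STABLE_ONLY"]  (as the comment above suggests)

#include <ATen/core/TensorBase.h>   // trips the guard: the #error added in this diff
                                    // stops the build before any non-stable API is used

// Restricting includes to headers under torch/stable or torch/headeronly
// (per the comment above) is the intended way to compile cleanly with this flag.
```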

View File

@ -15,7 +15,7 @@ std::enable_if_t<
std::is_base_of_v<Base, Child>,
std::unique_ptr<Base>>
make_unique_base(Args&&... args) {
return std::unique_ptr<Base>(new Child(std::forward<Args>(args)...));
return std::make_unique<Child>(std::forward<Args>(args)...);
}
} // namespace detail
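For context on the one-line change above: `std::make_unique<Child>(...)` yields a `std::unique_ptr<Child>` that converts implicitly to `std::unique_ptr<Base>`, so the raw `new` is unnecessary and the allocation is exception-safe. A standalone sketch with toy types (not the real call sites):

```cpp
#include <memory>
#include <utility>

struct Base { virtual ~Base() = default; };            // virtual dtor: deleting through Base* is safe
struct Child : Base { explicit Child(int /*x*/) {} };

template <typename B, typename C, typename... Args>
std::unique_ptr<B> make_unique_base(Args&&... args) {
  // unique_ptr<C> converts implicitly to unique_ptr<B> when C derives from B,
  // which is what lets the new return statement drop the explicit unique_ptr<Base>(new ...).
  return std::make_unique<C>(std::forward<Args>(args)...);
}

int main() {
  std::unique_ptr<Base> p = make_unique_base<Base, Child>(42);
  return p ? 0 : 1;
}
```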

View File

@ -252,6 +252,13 @@ cudaGraph_t CUDAGraph::raw_cuda_graph() {
return graph_;
}
cudaGraphExec_t CUDAGraph::raw_cuda_graph_exec() {
TORCH_CHECK(
has_graph_exec_,
"You cannot access the raw cudaGraphExec_t instance until instantiate() has been called");
return graph_exec_;
}
void CUDAGraph::reset() {
// I'd prefer these checks throw exceptions, not print warnings,
// but the destructor calls reset(), and at least one CI build

View File

@ -37,6 +37,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
void enable_debug_mode();
void debug_dump(const std::string& debug_path);
cudaGraph_t raw_cuda_graph();
cudaGraphExec_t raw_cuda_graph_exec();
protected:
cudaGraph_t graph_ = nullptr;

View File

@ -103,15 +103,13 @@ inline bool _check_tensors_share_device_and_dtype(
tensor.is_non_overlapping_and_dense();
};
for (const auto& tensorList : tensorLists) {
for (const auto& tensor : tensorList) {
if (!is_tensor_okay(tensor)) {
return false;
}
}
}
return true;
return std::all_of(
tensorLists.cbegin(),
tensorLists.cend(),
[&](const TensorList& tensorList) {
return std::all_of(
tensorList.cbegin(), tensorList.cend(), is_tensor_okay);
});
}
// Helper function called in check_fast_path_restrictions to check if
@ -157,11 +155,9 @@ inline bool _check_tensors_do_type_promotion_with_scalars(
bool does_op_promote_integer_inputs_to_float = false) {
for (const auto i : c10::irange(tensorList.size())) {
// For division, integer inputs will result in float.
if (does_op_promote_integer_inputs_to_float) {
if (at::isIntegralType(
tensorList[i].scalar_type(), /*includeBool*/ true)) {
return false;
}
if (does_op_promote_integer_inputs_to_float &&
at::isIntegralType(tensorList[i].scalar_type(), /*includeBool*/ true)) {
return false;
}
if (!scalarList.empty()) {
const auto& scalar =
@ -338,36 +334,34 @@ inline FlatMap _group_tensors_by_first_tensors_device_and_dtype(
}
}),
"Tensors of the same index must be on the same device and the same dtype except `step` tensors that can be CPU and float32/64 notwithstanding");
if (!grouped_tensors_with_indices.count(key)) {
grouped_tensors_with_indices.insert(
{key,
TensorsAndIndicesT{
[&]() -> nested_optional_tensorvec_t {
nested_optional_tensorvec_t nested_tensorvec;
nested_tensorvec.reserve(num_lists);
for (const auto& i : c10::irange(num_lists)) {
std::vector<std::optional<at::Tensor>> tensors;
if (!nested_tensorlist[i].empty()) {
// NB: num_tensors is the max possible length for any of
// the inner lists of tensor references. Reserving the max
// trades memory for perf. This should not have significant
// impact.
tensors.reserve(num_tensors);
}
nested_tensorvec.emplace_back(tensors);
}
return nested_tensorvec;
}(),
[&]() -> IndicesT {
if (!with_indices) {
return {};
} else {
IndicesT indices;
indices.reserve(num_tensors);
return indices;
}
}()}});
}
grouped_tensors_with_indices.try_emplace(
key,
TensorsAndIndicesT{
[&]() -> nested_optional_tensorvec_t {
nested_optional_tensorvec_t nested_tensorvec;
nested_tensorvec.reserve(num_lists);
for (const auto& i : c10::irange(num_lists)) {
std::vector<std::optional<at::Tensor>> tensors;
if (!nested_tensorlist[i].empty()) {
// NB: num_tensors is the max possible length for any of
// the inner lists of tensor references. Reserving the max
// trades memory for perf. This should not have significant
// impact.
tensors.reserve(num_tensors);
}
nested_tensorvec.emplace_back(std::move(tensors));
}
return nested_tensorvec;
}(),
[&]() -> IndicesT {
if (!with_indices) {
return {};
} else {
IndicesT indices;
indices.reserve(num_tensors);
return indices;
}
}()});
for (const auto& list_index : c10::irange(num_lists)) {
if (!nested_tensorlist[list_index].empty()) {
grouped_tensors_with_indices[key].first[list_index].emplace_back(

View File

@ -89,7 +89,7 @@ execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t
using result_type = typename traits::result_type;
for (; i < n; i++) {
result_type* out_ptr = (result_type*)(data[0] + i * strides[0]);
*out_ptr = c10::guts::apply(op, dereference<traits>(
*out_ptr = std::apply(op, dereference<traits>(
&data[1],
&strides[1],
i));
@ -102,7 +102,7 @@ inline void
execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
using traits = function_traits<func_t>;
for (; i < n; i++) {
c10::guts::apply(op, dereference<traits>(
std::apply(op, dereference<traits>(
&data[0],
&strides[0],
i));
@ -162,7 +162,7 @@ void handle_tuple_outputs(char* C10_RESTRICT data[],
}
// Loop operation for `cpu_kernel_multiple_outputs`.
// 1. Use `c10::guts::apply` to make dynamic method invocation
// 1. Use `std::apply` to make dynamic method invocation
// for the lambda passed in `cpu_kernel_multiple_outputs`.
// 2. Iterate over the members of the returned tuple, set the corresponding
// output tensor by the tuple member in `handle_tuple_outputs` function.
@ -183,7 +183,7 @@ multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_
}
for (; i < n; i++) {
auto output = c10::guts::apply(op, dereference<traits>(
auto output = std::apply(op, dereference<traits>(
&data[num_outputs],
&strides[num_outputs],
i));
@ -213,8 +213,8 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve
for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
auto args1 = dereference_vec<traits>(&data[1], opt_scalar, S, i);
auto args2 = dereference_vec<traits>(&data[1], opt_scalar, S, i + Vec::size());
auto out1 = c10::guts::apply(vop, std::move(args1));
auto out2 = c10::guts::apply(vop, std::move(args2));
auto out1 = std::apply(vop, std::move(args1));
auto out2 = std::apply(vop, std::move(args2));
out1.store(data[0] + i * sizeof(scalar_t));
out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t));
}
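The c10::guts::apply to std::apply swaps above rely on the C++17 standard facility behaving identically: it expands a tuple into a call's argument list. A toy standalone example (the lambda and tuple stand in for the op and the dereferenced operands):

```cpp
#include <cstdio>
#include <tuple>

int main() {
  auto op = [](float a, float b) { return a + b; };
  std::tuple<float, float> args{1.5f, 2.5f};
  float out = std::apply(op, args);   // invokes op(1.5f, 2.5f), as c10::guts::apply did
  std::printf("%.1f\n", out);         // prints 4.0
  return 0;
}
```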

View File

@ -1349,7 +1349,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
if (scaling_choice_a == ScalingType::RowWise && scaling_choice_b == ScalingType::RowWise
&& ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
// cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
|| (dprops->major == 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
|| (dprops->major >= 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
at::cuda::detail::f8f8bf16_rowwise(
mat1,

View File

@ -223,6 +223,41 @@ __device__ __forceinline__ void fastAtomicAdd(
}
}
#ifdef USE_ROCM
// This function implements a committed store.
// Upon returning, the store is committed to global memory.
// This is useful in avoiding the need for fences.
template <typename T>
__device__ inline void cmtdStore(void* address, T value) {
int constexpr num_long_per_val = sizeof(value)/sizeof(long);
int constexpr num_int_per_val = sizeof(value)/sizeof(int);
int constexpr num_short_per_val = sizeof(value)/sizeof(short);
int constexpr num_char_per_val = sizeof(value)/sizeof(char);
union pnr { T v;
long l[num_long_per_val];
int i[num_int_per_val];
short s[num_short_per_val];
char c[num_char_per_val]; }
_pnr = {.v = value };
if constexpr (num_long_per_val*sizeof(long) == sizeof(value))
for (int i=0; i<num_long_per_val; i++)
__hip_atomic_store(reinterpret_cast<long *>(address)+i, _pnr.l[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
else if constexpr (num_int_per_val*sizeof(int) == sizeof(value))
for (int i=0; i<num_int_per_val; i++)
__hip_atomic_store(reinterpret_cast<int *>(address)+i, _pnr.i[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
else if constexpr (num_short_per_val*sizeof(short) == sizeof(value))
for (int i=0; i<num_short_per_val; i++)
__hip_atomic_store(reinterpret_cast<short *>(address)+i, _pnr.s[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
else if constexpr (num_char_per_val*sizeof(char) == sizeof(value))
for (int i=0; i<num_char_per_val; i++)
__hip_atomic_store(reinterpret_cast<char *>(address)+i, _pnr.c[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
__atomic_signal_fence(__ATOMIC_SEQ_CST);
asm volatile("s_waitcnt vmcnt(0)" ::: "memory");
__atomic_signal_fence(__ATOMIC_SEQ_CST);
}
#endif
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__))
// This function implements warp-level opportunistic fastatomics
// To reduce contention on an atomicAdd, this replaces per-thread atomicAdd with a per-warp atomicAdd.
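A hedged device-side sketch of the pattern the ReduceOp hunk that follows adopts: on ROCm the per-block partial result is published with cmtdStore so the global fence can be skipped, while other targets keep the plain store plus __threadfence(). The helper name publish_partial and the staging/slot parameters are illustrative only.

```cpp
// Assumes this lives in a .cu/.hip TU that already includes
// ATen/native/cuda/KernelUtils.cuh (where cmtdStore above is defined).
template <typename T>
__device__ void publish_partial(T* staging, int slot, T value) {
#ifdef USE_ROCM
  // Committed store: globally visible on return, so no fence is needed. [CMTSTRS]
  cmtdStore(&staging[slot], value);
#else
  staging[slot] = value;
  __threadfence();  // make the write globally visible before signalling completion
#endif
}
```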

View File

@ -18,6 +18,7 @@
#include <thrust/pair.h>
#include <ATen/native/cuda/jit_utils.h>
#include <ATen/native/cuda/KernelUtils.cuh>
namespace at::native {
@ -796,15 +797,25 @@ struct ReduceOp {
bool should_store = config.should_store(output_idx);
if (should_store) {
index_t offset = config.staging_memory_offset(blockIdx.y);
#ifndef USE_ROCM
reduce_buffer[offset] = value;
#else // [CMTSTRS]
// In architectures with split caches, global fences are costly.
// Here we preempt the need for fences by committing stores to global memory.
cmtdStore(&reduce_buffer[offset], value);
#endif
}
#ifndef USE_ROCM // skip fence if stores are committed [CMTSTRS]
__threadfence(); // make sure writes are globally visible
#endif
__syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
bool is_last_block_done = mark_block_finished();
if (is_last_block_done) {
#ifndef USE_ROCM // skip fence if stores are committed [CMTSTRS]
__threadfence(); // complete the acquire pattern after atomic
#endif
for (auto &v : value) {
v = ident;
}

View File

@ -115,7 +115,10 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const std::opt
return output;
}
if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS)) {
// No-graph execution causes nonsense if these are non-contiguous.
const bool is_contiguous = input.is_contiguous() && weight.is_contiguous() && bias.is_contiguous();
if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_contiguous) {
_mps_linear_nograph(input, weight, bias, output);
// Squeeze last dim of 1D linear
return weight_arg.dim() != 1 ? output : output.squeeze(-1);

View File

@ -388,16 +388,11 @@ mha_bwd_ck(const at::Tensor &dout, // batch_size x seqlen_q x
dv_expanded = dv;
}
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
std::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
uint64_t drop_seed = 1, drop_offset = 0;
drop_seed = *philox_seed.data_ptr<int64_t>();
drop_offset = *philox_offset.data_ptr<int64_t>();
auto drop_seed_offset = std::make_pair(&drop_seed, &drop_offset);
uint64_t* drop_seed, drop_offset;
int64_t counter_offset = batch_size * num_heads * ck_tile::get_warp_size();
std::pair<uint64_t*, uint64_t*> drop_seed_offset = {nullptr,nullptr};
if(is_dropout) {
drop_seed_offset.first = philox_seed[0].data_ptr<uint64_t>();
drop_seed_offset.second = philox_seed[1].data_ptr<uint64_t>();
}
if (seqlen_q > 0) {
ck_tile::stream_config stream_config{stream};

View File

@ -177,6 +177,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x
TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
const auto sizes = q.sizes();
const int batch_size = sizes[0];
int seqlen_q = sizes[1];
int num_heads = sizes[2];
@ -225,6 +226,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x
CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size);
CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size);
at::Tensor q_padded, k_padded, v_padded;
if (head_size % 8 != 0) {
q_padded = at::pad(temp_q, {0, 8 - head_size % 8});
@ -237,6 +239,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x
v_padded = v;
}
at::Tensor out;
if (out_.has_value()) {
out = out_.value();
@ -263,6 +266,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x
auto opts = q.options();
bool has_lse = true;
bool has_dropout = p_dropout > 0.0f;
at::Tensor softmax_lse;
// TODO - check gradient, only training require lse
softmax_lse = at::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat));
@ -273,41 +277,46 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x
p = at::empty({batch_size, num_heads, seqlen_q, seqlen_k}, opts.dtype(at::kByte));
}
else {
p = at::empty({ 0 }, opts.dtype(at::kByte));
p = at::empty({ 0 }, opts);
}
uint64_t drop_seed = 1, drop_offset = 0;
int64_t counter_offset = batch_size * num_heads * ck_tile::get_warp_size();
auto rng_state = at::empty({2}, opts.dtype(at::kLong));
auto rng_state_ptr = reinterpret_cast<uint64_t*>(rng_state.data_ptr());
auto rng_state_options = at::TensorOptions().dtype(at::kUInt64).device(at::kCUDA);
auto rng_state = at::zeros({2}, rng_state_options.dtype(at::kUInt64));
auto _unused = at::empty({}, at::dtype(c10::kUInt64).device(at::kCUDA));
at::Tensor seed_t, offset_t;
if (p_dropout > 0.0) {
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
gen_, at::cuda::detail::getDefaultCUDAGenerator());
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
auto philox_args = gen->philox_cuda_state(counter_offset);
std::tie(drop_seed, drop_offset) = at::cuda::philox::unpack(philox_args);
hipLaunchKernelGGL(
flash::ParsePhiloxCudaState, dim3(1), dim3(64), 0, at::hip::getCurrentHIPStreamMasqueradingAsCUDA(), philox_args, rng_state_ptr);
seed_t = at::scalar_tensor(at::Scalar(static_cast<uint64_t>(rng_state_ptr[0])), at::dtype(at::kLong));
offset_t = at::scalar_tensor(at::Scalar(static_cast<uint64_t>(rng_state_ptr[1])), at::dtype(at::kLong));
}
else
{
seed_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
offset_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
}
rng_state[0] = *(reinterpret_cast<int64_t*>(&drop_seed));
rng_state[1] = *(reinterpret_cast<int64_t*>(&drop_offset));
auto drop_options = at::TensorOptions().dtype(at::kLong).device(at::kCUDA);
std::optional<at::Tensor> attn_bias;
if( attn_bias_.has_value())
{
attn_bias = attn_bias_;
}
if (seqlen_k > 0) {
auto drop_seed_offset = std::make_pair(rng_state[0].data_ptr<uint64_t>(),
rng_state[1].data_ptr<uint64_t>());
auto drop_seed_offset = std::make_pair(rng_state_ptr, rng_state_ptr + 1);
auto stream = at::cuda::getCurrentHIPStream().stream();
ck_tile::stream_config stream_config{stream};
@ -323,7 +332,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x
auto args =
get_ck_fmha_fwd_args(
has_lse,
has_dropout,
return_dropout_randval,
mask,
batch_size,
seqlen_q,
@ -349,11 +358,12 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x
out.zero_();
softmax_lse.fill_(std::numeric_limits<float>::infinity());
}
if (seqlenq_ngroups_swapped) {
out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size});
q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size});
softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1});
}
return {out, q_padded, k_padded, v_padded, softmax_lse, rng_state, _unused, p};
return {out, q_padded, k_padded, v_padded, softmax_lse, seed_t, offset_t, p};
}
} //namespace pytorch_flash

View File

@ -15,6 +15,8 @@ flaky_models = {
"timm_efficientnet", # see https://github.com/pytorch/pytorch/issues/148699
"XGLMForCausalLM", # discovered in https://github.com/pytorch/pytorch/pull/128148
"moondream", # discovered in https://github.com/pytorch/pytorch/pull/159291
# discovered in https://github.com/pytorch/pytorch/issues/161419. It's not flaky but really hard to reproduce, so skipping it
"mobilenetv3_large_100",
}

View File

@ -2,7 +2,7 @@ name,accuracy,graph_breaks
torchrec_dlrm,fail_to_run,3
torchrec_dlrm,pass,6
@ -94,7 +94,7 @@ hf_Bert_large,pass,6
hf_BigBird,fail_to_run,3
hf_BigBird,pass,6
@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,fail_to_run,21
hf_Reformer,pass,25


View File

@ -2,7 +2,7 @@ name,accuracy,graph_breaks
torchrec_dlrm,fail_to_run,3
torchrec_dlrm,pass,6
@ -46,7 +46,7 @@ dcgan,pass,6
demucs,fail_to_run,4
demucs,pass,9
@ -94,7 +94,7 @@ hf_Bert_large,pass,6
hf_BigBird,fail_to_run,3
hf_BigBird,pass,6
@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,fail_to_run,21
hf_Reformer,pass,25


View File

@ -755,6 +755,7 @@ libtorch_cuda_distributed_extra_sources = [
"torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu",
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp",
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
"torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp",
"torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
]

View File

@ -45,7 +45,7 @@ size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) {
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart);
const size_t interval_end =
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd);
TORCH_CHECK_VALUE(
TORCH_CHECK(
interval_end - interval_start == kRoundUpPowerOfTwoIntervals,
"kRoundUpPowerOfTwoIntervals mismatch");
@ -64,7 +64,7 @@ size_t AcceleratorAllocatorConfig::parseMaxSplitSize(
std::numeric_limits<size_t>::max() / kMB;
size_t val_env = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
val_env >= min_allowed_split_size_mb,
"CachingAllocator option max_split_size_mb too small, must be >= ",
min_allowed_split_size_mb);
@ -83,7 +83,7 @@ size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize(
std::numeric_limits<size_t>::max() / kMB;
size_t val_env = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
val_env >= min_allowed_split_size_mb,
"CachingAllocator option max_non_split_rounding_mb too small, must be >= ",
min_allowed_split_size_mb);
@ -98,7 +98,7 @@ size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold(
size_t i) {
tokenizer.checkToken(++i, ":");
double val_env = tokenizer.toDouble(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
val_env > 0 && val_env < 1.0,
"garbage_collect_threshold is invalid, set it in (0.0, 1.0)");
garbage_collection_threshold_ = val_env;
@ -119,7 +119,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
size_t value_index = i;
tokenizer.checkToken(++i, ":");
size_t value = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
value == 0 || llvm::isPowerOf2_64(value),
"For roundups, the divisions has to be power of 2 or 0 to disable roundup ");
@ -133,7 +133,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
value);
} else {
size_t boundary = tokenizer.toSizeT(value_index);
TORCH_CHECK_VALUE(
TORCH_CHECK(
llvm::isPowerOf2_64(boundary),
"For roundups, the intervals have to be power of 2 ");
@ -163,7 +163,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
"Expected closing bracket ']' in ConfigTokenizer but reached end of config");
} else { // Keep this for backwards compatibility
size_t value = tokenizer.toSizeT(i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
llvm::isPowerOf2_64(value),
"For roundups, the divisions has to be power of 2 ");
std::fill(

View File

@ -76,7 +76,7 @@ class ConfigTokenizer {
} else if (token == "False") {
return false;
} else {
TORCH_CHECK_VALUE(
TORCH_CHECK(
false,
"Expected 'True' or 'False' at index ",
i,

View File

@ -1,119 +1,389 @@
#include <c10/cuda/CUDAAllocatorConfig.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/util/llvmMathExtras.h>
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/driver_api.h>
#endif
#include <cuda_runtime_api.h>
namespace c10::cuda::CUDACachingAllocator {
size_t CUDAAllocatorConfig::parseAllocatorConfig(
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
CUDAAllocatorConfig::CUDAAllocatorConfig()
: m_max_split_size(std::numeric_limits<size_t>::max()),
m_max_non_split_rounding_size(kLargeBuffer),
m_garbage_collection_threshold(0),
m_pinned_num_register_threads(1),
m_expandable_segments(false),
#if CUDA_VERSION >= 12030
m_expandable_segments_handle_type(
Expandable_Segments_Handle_Type::UNSPECIFIED),
#else
m_expandable_segments_handle_type(
Expandable_Segments_Handle_Type::POSIX_FD),
#endif
m_release_lock_on_cudamalloc(false),
m_pinned_use_cuda_host_register(false),
m_pinned_use_background_threads(false) {
m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0);
}
size_t CUDAAllocatorConfig::roundup_power2_divisions(size_t size) {
size_t log_size = (63 - llvm::countLeadingZeros(size));
// Our intervals start at 1MB and end at 64GB
const size_t interval_start =
63 - llvm::countLeadingZeros(static_cast<size_t>(1048576));
const size_t interval_end =
63 - llvm::countLeadingZeros(static_cast<size_t>(68719476736));
TORCH_CHECK(
(interval_end - interval_start == kRoundUpPowerOfTwoIntervals),
"kRoundUpPowerOfTwoIntervals mismatch");
int index = static_cast<int>(log_size) - static_cast<int>(interval_start);
index = std::max(0, index);
index = std::min(index, static_cast<int>(kRoundUpPowerOfTwoIntervals) - 1);
return instance().m_roundup_power2_divisions[index];
}
void CUDAAllocatorConfig::lexArgs(
const std::string& env,
std::vector<std::string>& config) {
std::vector<char> buf;
for (char ch : env) {
if (ch == ',' || ch == ':' || ch == '[' || ch == ']') {
if (!buf.empty()) {
config.emplace_back(buf.begin(), buf.end());
buf.clear();
}
config.emplace_back(1, ch);
} else if (ch != ' ') {
buf.emplace_back(ch);
}
}
if (!buf.empty()) {
config.emplace_back(buf.begin(), buf.end());
}
}
void CUDAAllocatorConfig::consumeToken(
const std::vector<std::string>& config,
size_t i,
const char c) {
TORCH_CHECK(
i < config.size() && config[i] == std::string(1, c),
"Error parsing CachingAllocator settings, expected ",
c,
"");
}
size_t CUDAAllocatorConfig::parseMaxSplitSize(
const std::vector<std::string>& config,
size_t i) {
consumeToken(config, ++i, ':');
constexpr int mb = 1024 * 1024;
if (++i < config.size()) {
size_t val1 = stoi(config[i]);
TORCH_CHECK(
val1 > kLargeBuffer / mb,
"CachingAllocator option max_split_size_mb too small, must be > ",
kLargeBuffer / mb,
"");
val1 = std::max(val1, kLargeBuffer / mb);
val1 = std::min(val1, (std::numeric_limits<size_t>::max() / mb));
m_max_split_size = val1 * 1024 * 1024;
} else {
TORCH_CHECK(false, "Error, expecting max_split_size_mb value", "");
}
return i;
}
size_t CUDAAllocatorConfig::parseMaxNonSplitRoundingSize(
const std::vector<std::string>& config,
size_t i) {
consumeToken(config, ++i, ':');
constexpr int mb = 1024 * 1024;
if (++i < config.size()) {
size_t val1 = stoi(config[i]);
TORCH_CHECK(
val1 > kLargeBuffer / mb,
"CachingAllocator option max_non_split_rounding_mb too small, must be > ",
kLargeBuffer / mb,
"");
val1 = std::max(val1, kLargeBuffer / mb);
val1 = std::min(val1, (std::numeric_limits<size_t>::max() / mb));
m_max_non_split_rounding_size = val1 * 1024 * 1024;
} else {
TORCH_CHECK(false, "Error, expecting max_non_split_rounding_mb value", "");
}
return i;
}
size_t CUDAAllocatorConfig::parseGarbageCollectionThreshold(
const std::vector<std::string>& config,
size_t i) {
consumeToken(config, ++i, ':');
if (++i < config.size()) {
double val1 = stod(config[i]);
TORCH_CHECK(
val1 > 0, "garbage_collect_threshold too small, set it 0.0~1.0", "");
TORCH_CHECK(
val1 < 1.0, "garbage_collect_threshold too big, set it 0.0~1.0", "");
m_garbage_collection_threshold = val1;
} else {
TORCH_CHECK(
false, "Error, expecting garbage_collection_threshold value", "");
}
return i;
}
size_t CUDAAllocatorConfig::parseRoundUpPower2Divisions(
const std::vector<std::string>& config,
size_t i) {
consumeToken(config, ++i, ':');
bool first_value = true;
if (++i < config.size()) {
if (std::string_view(config[i]) == "[") {
size_t last_index = 0;
// NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
while (++i < config.size() && std::string_view(config[i]) != "]") {
const std::string& val1 = config[i];
size_t val2 = 0;
consumeToken(config, ++i, ':');
if (++i < config.size()) {
val2 = stoi(config[i]);
} else {
TORCH_CHECK(
false, "Error parsing roundup_power2_divisions value", "");
}
TORCH_CHECK(
val2 == 0 || llvm::isPowerOf2_64(val2),
"For roundups, the divisions has to be power of 2 or 0 to disable roundup ",
"");
if (std::string_view(val1) == ">") {
std::fill(
std::next(
m_roundup_power2_divisions.begin(),
static_cast<std::vector<unsigned long>::difference_type>(
last_index)),
m_roundup_power2_divisions.end(),
val2);
} else {
size_t val1_long = stoul(val1);
TORCH_CHECK(
llvm::isPowerOf2_64(val1_long),
"For roundups, the intervals have to be power of 2 ",
"");
size_t index = 63 - llvm::countLeadingZeros(val1_long);
index = std::max((size_t)0, index);
index = std::min(index, m_roundup_power2_divisions.size() - 1);
if (first_value) {
std::fill(
m_roundup_power2_divisions.begin(),
std::next(
m_roundup_power2_divisions.begin(),
static_cast<std::vector<unsigned long>::difference_type>(
index)),
val2);
first_value = false;
}
if (index < m_roundup_power2_divisions.size()) {
m_roundup_power2_divisions[index] = val2;
}
last_index = index;
}
if (std::string_view(config[i + 1]) != "]") {
consumeToken(config, ++i, ',');
}
}
} else { // Keep this for backwards compatibility
size_t val1 = stoi(config[i]);
TORCH_CHECK(
llvm::isPowerOf2_64(val1),
"For roundups, the divisions has to be power of 2 ",
"");
std::fill(
m_roundup_power2_divisions.begin(),
m_roundup_power2_divisions.end(),
val1);
}
} else {
TORCH_CHECK(false, "Error, expecting roundup_power2_divisions value", "");
}
return i;
}
size_t CUDAAllocatorConfig::parseAllocatorConfig(
const std::vector<std::string>& config,
size_t i,
bool& used_cudaMallocAsync) {
// For ease of maintenance and understanding, the CUDA and ROCm
// implementations of this function are separated. This avoids having many
// #ifdef's throughout.
#ifdef USE_ROCM
// Ease burden on ROCm users by allowing either cuda or hip tokens.
// cuda token is broken up to prevent hipify matching it.
#define PYTORCH_TOKEN1 \
"cud" \
"aMallocAsync"
#define PYTORCH_TOKEN2 "hipMallocAsync"
tokenizer.checkToken(++i, ":");
i++; // Move to the value after the colon
TORCH_CHECK_VALUE(
((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1) ||
(tokenizer[i] == PYTORCH_TOKEN2)),
"Unknown allocator backend, "
"options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2);
if (m_is_allocator_loaded) {
bool aync_allocator_at_runtime = (tokenizer[i] != "native");
consumeToken(config, ++i, ':');
if (++i < config.size()) {
TORCH_CHECK(
aync_allocator_at_runtime == m_use_async_allocator,
"Allocator async backend parsed at runtime != allocator async backend parsed at load time, ",
aync_allocator_at_runtime,
((config[i] == "native") || (config[i] == PYTORCH_TOKEN1) ||
(config[i] == PYTORCH_TOKEN2)),
"Unknown allocator backend, "
"options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2);
used_cudaMallocAsync =
(config[i] == PYTORCH_TOKEN1 || config[i] == PYTORCH_TOKEN2);
TORCH_INTERNAL_ASSERT(
config[i] == get()->name() ||
(config[i] == PYTORCH_TOKEN1 && get()->name() == PYTORCH_TOKEN2),
"Allocator backend parsed at runtime != "
"allocator backend parsed at load time, ",
config[i],
" != ",
m_use_async_allocator);
get()->name());
} else {
TORCH_CHECK(false, "Error parsing backend value", "");
}
m_use_async_allocator =
(tokenizer[i] == PYTORCH_TOKEN1 || tokenizer[i] == PYTORCH_TOKEN2);
// CUDA allocator is always loaded at the start of the program
m_is_allocator_loaded = true;
#if defined(CUDA_VERSION)
if (m_use_async_allocator) {
#if CUDA_VERSION >= 11040
int version = 0;
C10_CUDA_CHECK(cudaDriverGetVersion(&version));
TORCH_CHECK(
version >= 11040,
"backend:cudaMallocAsync requires CUDA runtime "
"11.4 or newer, but cudaDriverGetVersion returned ",
version);
#else
TORCH_CHECK(
false,
"backend:cudaMallocAsync requires PyTorch to be built with "
"CUDA 11.4 or newer, but CUDA_VERSION is ",
CUDA_VERSION);
#endif
}
#endif
return i;
#undef PYTORCH_TOKEN1
#undef PYTORCH_TOKEN2
#else // USE_ROCM
consumeToken(config, ++i, ':');
if (++i < config.size()) {
TORCH_CHECK(
((config[i] == "native") || (config[i] == "cudaMallocAsync")),
"Unknown allocator backend, "
"options are native and cudaMallocAsync");
used_cudaMallocAsync = (config[i] == "cudaMallocAsync");
if (used_cudaMallocAsync) {
#if CUDA_VERSION >= 11040
int version = 0;
C10_CUDA_CHECK(cudaDriverGetVersion(&version));
TORCH_CHECK(
version >= 11040,
"backend:cudaMallocAsync requires CUDA runtime "
"11.4 or newer, but cudaDriverGetVersion returned ",
version);
#else
TORCH_CHECK(
false,
"backend:cudaMallocAsync requires PyTorch to be built with "
"CUDA 11.4 or newer, but CUDA_VERSION is ",
CUDA_VERSION);
#endif
}
TORCH_INTERNAL_ASSERT(
config[i] == get()->name(),
"Allocator backend parsed at runtime != "
"allocator backend parsed at load time");
} else {
TORCH_CHECK(false, "Error parsing backend value", "");
}
return i;
#endif // USE_ROCM
}
void CUDAAllocatorConfig::parseArgs(const std::string& env) {
void CUDAAllocatorConfig::parseArgs(const std::optional<std::string>& env) {
// If empty, set the default values
m_max_split_size = std::numeric_limits<size_t>::max();
m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0);
m_garbage_collection_threshold = 0;
bool used_cudaMallocAsync = false;
bool used_native_specific_option = false;
c10::CachingAllocator::ConfigTokenizer tokenizer(env);
for (size_t i = 0; i < tokenizer.size(); i++) {
const auto& key = tokenizer[i];
if (key == "backend") {
i = parseAllocatorConfig(tokenizer, i);
if (!env.has_value()) {
return;
}
{
std::lock_guard<std::mutex> lock(m_last_allocator_settings_mutex);
m_last_allocator_settings = env.value();
}
std::vector<std::string> config;
lexArgs(env.value(), config);
for (size_t i = 0; i < config.size(); i++) {
std::string_view config_item_view(config[i]);
if (config_item_view == "max_split_size_mb") {
i = parseMaxSplitSize(config, i);
used_native_specific_option = true;
} else if (config_item_view == "max_non_split_rounding_mb") {
i = parseMaxNonSplitRoundingSize(config, i);
used_native_specific_option = true;
} else if (config_item_view == "garbage_collection_threshold") {
i = parseGarbageCollectionThreshold(config, i);
used_native_specific_option = true;
} else if (config_item_view == "roundup_power2_divisions") {
i = parseRoundUpPower2Divisions(config, i);
used_native_specific_option = true;
} else if (config_item_view == "backend") {
i = parseAllocatorConfig(config, i, used_cudaMallocAsync);
} else if (config_item_view == "expandable_segments") {
used_native_specific_option = true;
consumeToken(config, ++i, ':');
++i;
TORCH_CHECK(
i < config.size() &&
(std::string_view(config[i]) == "True" ||
std::string_view(config[i]) == "False"),
"Expected a single True/False argument for expandable_segments");
config_item_view = config[i];
m_expandable_segments = (config_item_view == "True");
} else if (
// ROCm build's hipify step will change "cuda" to "hip", but for ease of
// use, accept both. We must break up the string to prevent hipify here.
key == "release_lock_on_hipmalloc" ||
key ==
config_item_view == "release_lock_on_hipmalloc" ||
config_item_view ==
"release_lock_on_c"
"udamalloc") {
used_native_specific_option = true;
tokenizer.checkToken(++i, ":");
m_release_lock_on_cudamalloc = tokenizer.toBool(++i);
consumeToken(config, ++i, ':');
++i;
TORCH_CHECK(
i < config.size() &&
(std::string_view(config[i]) == "True" ||
std::string_view(config[i]) == "False"),
"Expected a single True/False argument for release_lock_on_cudamalloc");
config_item_view = config[i];
m_release_lock_on_cudamalloc = (config_item_view == "True");
} else if (
// ROCm build's hipify step will change "cuda" to "hip", but for ease of
// use, accept both. We must break up the string to prevent hipify here.
key == "pinned_use_hip_host_register" ||
key ==
config_item_view == "pinned_use_hip_host_register" ||
config_item_view ==
"pinned_use_c"
"uda_host_register") {
i = parsePinnedUseCudaHostRegister(tokenizer, i);
i = parsePinnedUseCudaHostRegister(config, i);
used_native_specific_option = true;
} else if (key == "pinned_num_register_threads") {
i = parsePinnedNumRegisterThreads(tokenizer, i);
} else if (config_item_view == "pinned_num_register_threads") {
i = parsePinnedNumRegisterThreads(config, i);
used_native_specific_option = true;
} else if (config_item_view == "pinned_use_background_threads") {
i = parsePinnedUseBackgroundThreads(config, i);
used_native_specific_option = true;
} else {
const auto& keys =
c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
TORCH_CHECK(
keys.find(key) != keys.end(),
"Unrecognized key '",
key,
"' in Accelerator allocator config.");
i = tokenizer.skipKey(i);
false, "Unrecognized CachingAllocator option: ", config_item_view);
}
if (i + 1 < tokenizer.size()) {
tokenizer.checkToken(++i, ",");
if (i + 1 < config.size()) {
consumeToken(config, ++i, ',');
}
}
if (m_use_async_allocator && used_native_specific_option) {
if (used_cudaMallocAsync && used_native_specific_option) {
TORCH_WARN(
"backend:cudaMallocAsync ignores max_split_size_mb,"
"roundup_power2_divisions, and garbage_collect_threshold.");
@ -121,33 +391,64 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
}
size_t CUDAAllocatorConfig::parsePinnedUseCudaHostRegister(
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
const std::vector<std::string>& config,
size_t i) {
tokenizer.checkToken(++i, ":");
m_pinned_use_cuda_host_register = tokenizer.toBool(++i);
consumeToken(config, ++i, ':');
if (++i < config.size()) {
TORCH_CHECK(
(config[i] == "True" || config[i] == "False"),
"Expected a single True/False argument for pinned_use_cuda_host_register");
m_pinned_use_cuda_host_register = (config[i] == "True");
} else {
TORCH_CHECK(
false, "Error, expecting pinned_use_cuda_host_register value", "");
}
return i;
}
size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
const std::vector<std::string>& config,
size_t i) {
tokenizer.checkToken(++i, ":");
size_t val2 = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
llvm::isPowerOf2_64(val2),
"Number of register threads has to be power of 2 ",
"");
auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
TORCH_CHECK_VALUE(
val2 <= maxThreads,
"Number of register threads should be less than or equal to " +
std::to_string(maxThreads),
"");
m_pinned_num_register_threads = val2;
consumeToken(config, ++i, ':');
if (++i < config.size()) {
size_t val2 = stoi(config[i]);
TORCH_CHECK(
llvm::isPowerOf2_64(val2),
"Number of register threads has to be power of 2 ",
"");
auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
TORCH_CHECK(
val2 <= maxThreads,
"Number of register threads should be less than or equal to " +
std::to_string(maxThreads),
"");
m_pinned_num_register_threads = val2;
} else {
TORCH_CHECK(
false, "Error, expecting pinned_num_register_threads value", "");
}
return i;
}
REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(CUDAAllocatorConfig)
size_t CUDAAllocatorConfig::parsePinnedUseBackgroundThreads(
const std::vector<std::string>& config,
size_t i) {
consumeToken(config, ++i, ':');
if (++i < config.size()) {
TORCH_CHECK(
(config[i] == "True" || config[i] == "False"),
"Expected a single True/False argument for pinned_use_background_threads");
m_pinned_use_background_threads = (config[i] == "True");
} else {
TORCH_CHECK(
false, "Error, expecting pinned_use_background_threads value", "");
}
return i;
}
// General caching allocator utilities
void setAllocatorSettings(const std::string& env) {
CUDACachingAllocator::CUDAAllocatorConfig::instance().parseArgs(env.c_str());
}
} // namespace c10::cuda::CUDACachingAllocator

View File

@ -1,11 +1,16 @@
#pragma once
#include <c10/core/AllocatorConfig.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/util/Exception.h>
#include <c10/util/env.h>
#include <atomic>
#include <cstddef>
#include <cstdlib>
#include <mutex>
#include <string>
#include <vector>
namespace c10::cuda::CUDACachingAllocator {
enum class Expandable_Segments_Handle_Type : int {
@ -18,23 +23,20 @@ enum class Expandable_Segments_Handle_Type : int {
class C10_CUDA_API CUDAAllocatorConfig {
public:
static size_t max_split_size() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size();
return instance().m_max_split_size;
}
static double garbage_collection_threshold() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
garbage_collection_threshold();
return instance().m_garbage_collection_threshold;
}
static bool expandable_segments() {
bool enabled = c10::CachingAllocator::AcceleratorAllocatorConfig::
use_expandable_segments();
#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED
if (enabled) {
if (instance().m_expandable_segments) {
TORCH_WARN_ONCE("expandable_segments not supported on this platform")
}
return false;
#else
return enabled;
return instance().m_expandable_segments;
#endif
}
@ -61,8 +63,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
}
static bool pinned_use_background_threads() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
pinned_use_background_threads();
return instance().m_pinned_use_background_threads;
}
static size_t pinned_max_register_threads() {
@ -76,99 +77,88 @@ class C10_CUDA_API CUDAAllocatorConfig {
// More description below in function roundup_power2_next_division
// As an example, if we want 4 divisions between 2's power, this can be done
// using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
static size_t roundup_power2_divisions(size_t size) {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
roundup_power2_divisions(size);
}
static size_t roundup_power2_divisions(size_t size);
static std::vector<size_t> roundup_power2_divisions() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
roundup_power2_divisions();
return instance().m_roundup_power2_divisions;
}
static size_t max_non_split_rounding_size() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
max_non_split_rounding_size();
return instance().m_max_non_split_rounding_size;
}
static std::string last_allocator_settings() {
return c10::CachingAllocator::getAllocatorSettings();
}
static bool use_async_allocator() {
return instance().m_use_async_allocator;
}
// Use `Construct On First Use Idiom` to avoid `Static Initialization Order`
// issue.
static const std::unordered_set<std::string>& getKeys() {
static std::unordered_set<std::string> keys{
"backend",
// keep BC for Rocm: `cuda` -> `cud` `a`, to avoid hipify issues
// NOLINTBEGIN(bugprone-suspicious-missing-comma,-warnings-as-errors)
"release_lock_on_cud"
"amalloc",
"pinned_use_cud"
"a_host_register",
// NOLINTEND(bugprone-suspicious-missing-comma,-warnings-as-errors)
"release_lock_on_hipmalloc",
"pinned_use_hip_host_register",
"pinned_num_register_threads"};
return keys;
std::lock_guard<std::mutex> lock(
instance().m_last_allocator_settings_mutex);
return instance().m_last_allocator_settings;
}
static CUDAAllocatorConfig& instance() {
static CUDAAllocatorConfig* s_instance = ([]() {
auto inst = new CUDAAllocatorConfig();
auto env = c10::utils::get_env("PYTORCH_ALLOC_CONF");
if (!env.has_value()) {
// For backward compatibility, check for the old environment variable
// PYTORCH_CUDA_ALLOC_CONF.
env = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
}
auto env = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
#ifdef USE_ROCM
// convenience for ROCm users, allow alternative HIP token
if (!env.has_value()) {
env = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF");
}
#endif
if (env.has_value()) {
inst->parseArgs(env.value());
}
inst->parseArgs(env);
return inst;
})();
return *s_instance;
}
void parseArgs(const std::string& env);
void parseArgs(const std::optional<std::string>& env);
private:
CUDAAllocatorConfig() = default;
CUDAAllocatorConfig();
size_t parseAllocatorConfig(
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
static void lexArgs(const std::string& env, std::vector<std::string>& config);
static void consumeToken(
const std::vector<std::string>& config,
size_t i,
const char c);
size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
size_t parseMaxNonSplitRoundingSize(
const std::vector<std::string>& config,
size_t i);
size_t parseGarbageCollectionThreshold(
const std::vector<std::string>& config,
size_t i);
size_t parseRoundUpPower2Divisions(
const std::vector<std::string>& config,
size_t i);
size_t parseAllocatorConfig(
const std::vector<std::string>& config,
size_t i,
bool& used_cudaMallocAsync);
size_t parsePinnedUseCudaHostRegister(
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
const std::vector<std::string>& config,
size_t i);
size_t parsePinnedNumRegisterThreads(
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
const std::vector<std::string>& config,
size_t i);
size_t parsePinnedUseBackgroundThreads(
const std::vector<std::string>& config,
size_t i);
std::atomic<size_t> m_pinned_num_register_threads{1};
std::atomic<Expandable_Segments_Handle_Type> m_expandable_segments_handle_type
#if CUDA_VERSION >= 12030
{Expandable_Segments_Handle_Type::UNSPECIFIED};
#else
{Expandable_Segments_Handle_Type::POSIX_FD};
#endif
std::atomic<bool> m_release_lock_on_cudamalloc{false};
std::atomic<bool> m_pinned_use_cuda_host_register{false};
std::atomic<bool> m_use_async_allocator{false};
std::atomic<bool> m_is_allocator_loaded{false};
std::atomic<size_t> m_max_split_size;
std::atomic<size_t> m_max_non_split_rounding_size;
std::vector<size_t> m_roundup_power2_divisions;
std::atomic<double> m_garbage_collection_threshold;
std::atomic<size_t> m_pinned_num_register_threads;
std::atomic<bool> m_expandable_segments;
std::atomic<Expandable_Segments_Handle_Type>
m_expandable_segments_handle_type;
std::atomic<bool> m_release_lock_on_cudamalloc;
std::atomic<bool> m_pinned_use_cuda_host_register;
std::atomic<bool> m_pinned_use_background_threads;
std::string m_last_allocator_settings;
std::mutex m_last_allocator_settings_mutex;
};
// Keep this for backwards compatibility
using c10::CachingAllocator::setAllocatorSettings;
// General caching allocator utilities
C10_CUDA_API void setAllocatorSettings(const std::string& env);
} // namespace c10::cuda::CUDACachingAllocator

View File

@ -1,6 +1,7 @@
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/core/impl/GPUTrace.h>
#include <c10/cuda/CUDAAllocatorConfig.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/cuda/CUDAGuard.h>
@ -63,6 +64,10 @@ namespace cuda::CUDACachingAllocator {
using namespace c10::CachingAllocator;
using namespace c10::CachingDeviceAllocator;
// Included here as this is externally used in CUDAAllocatorConfig
const size_t kLargeBuffer =
20971520; // "large" allocations may be packed in 20 MiB blocks
namespace Native {
//
@ -4105,10 +4110,49 @@ CUDAAllocator* allocator();
} // namespace CudaMallocAsync
struct BackendStaticInitializer {
// Parses env for backend at load time, duplicating some logic from
// CUDAAllocatorConfig. CUDAAllocatorConfig double-checks it later (at
// runtime). Defers verbose exceptions and error checks, including Cuda
// version checks, to CUDAAllocatorConfig's runtime doublecheck. If this
// works, maybe we should move all of CUDAAllocatorConfig here?
CUDAAllocator* parseEnvForBackend() {
// If the environment variable is set, we use the CudaMallocAsync allocator.
if (CUDAAllocatorConfig::use_async_allocator()) {
return CudaMallocAsync::allocator();
auto val = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
#ifdef USE_ROCM
// convenience for ROCm users to allow either CUDA or HIP env var
if (!val.has_value()) {
val = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF");
}
#endif
if (val.has_value()) {
const std::string& config = val.value();
std::regex exp("[\\s,]+");
std::sregex_token_iterator it(config.begin(), config.end(), exp, -1);
std::sregex_token_iterator end;
std::vector<std::string> options(it, end);
for (auto option : options) {
std::regex exp2("[:]+");
std::sregex_token_iterator it2(option.begin(), option.end(), exp2, -1);
std::sregex_token_iterator end2;
std::vector<std::string> kv(it2, end2);
if (kv.size() >= 2) {
if (kv[0] == "backend") {
#ifdef USE_ROCM
// convenience for ROCm users to allow either CUDA or HIP env var
if (kv[1] ==
"cud"
"aMallocAsync" ||
kv[1] == "hipMallocAsync")
#else
if (kv[1] == "cudaMallocAsync")
#endif
return CudaMallocAsync::allocator();
if (kv[1] == "native")
return &Native::allocator;
}
}
}
}
return &Native::allocator;
}

View File

@ -1,7 +1,6 @@
#pragma once
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/cuda/CUDAAllocatorConfig.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/cuda/CUDAStream.h>
@ -50,9 +49,10 @@ namespace c10::cuda::CUDACachingAllocator {
// Preserved only for BC reasons
// NOLINTNEXTLINE(misc-unused-using-decls)
using c10::CachingAllocator::kLargeBuffer;
using c10::CachingDeviceAllocator::DeviceStats;
extern const size_t kLargeBuffer;
typedef std::shared_ptr<GatheredContext> (*CreateContextFn)();
// Struct containing info of an allocation block (i.e. a fractional part of a

View File

@ -581,6 +581,7 @@ if(USE_CUDA)
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
)
endif()

View File

@ -169,10 +169,6 @@ These backends include:
.. autofunction:: torch.backends.cuda.sdp_kernel
```
```{eval-rst}
.. autofunction:: torch.backends.cuda.is_ck_sdpa_available
```
## torch.backends.cudnn
```{eval-rst}

View File

@ -1221,9 +1221,6 @@ coverage_ignore_functions = [
"reduce_typed_storage_child",
"storage_from_cache",
# torch.multiprocessing.spawn
# Added docstring for this but I think we need to go through
# and add the entire torch.multiprocessing.spawn module to a .rst...
"should_use_parallel_start",
"start_processes",
# torch.nn.functional
"adaptive_max_pool1d_with_indices", # documented as adaptive_max_pool1d

View File

@ -0,0 +1,439 @@
# Extending PyTorch with New Accelerators
## Background
Since PyTorch 2.1, the community has made significant progress in simplifying the integration of new accelerators into the PyTorch ecosystem. These improvements include, but are not limited to: refinement of the `PrivateUse1` Dispatch Key, introduction and improvement of core subsystem extension mechanisms, and device-agnostic refactoring of key modules (e.g., `torch.accelerator`, `memory management`). Taken together, these improvements lay the foundation for a **robust**, **flexible** and developer-friendly accelerator integration path.
### Why Does This Matter?
This integration path has several key advantages:
* **Speed**: Extensibility is built-in for all core PyTorch modules. Developers can integrate new accelerators into their downstream codebase independently without modifying upstream code and without being constrained by community review bandwidth.
* **Future-proofing**: This integration path is the default for all future PyTorch features, which means that new modules and features will automatically support scaling to new accelerators as long as this path is followed.
* **Autonomy**: Vendors have full control over their accelerator integration timelines, enabling agile iteration cycles and reducing reliance on upstream coordination.
### About This Document
This guide aims to provide a **comprehensive overview of the modern integration pathway** for new accelerators in PyTorch. It walks through the full integration surface, from low-level device primitives to higher-level domain modules like compilation and quantization. The structure follows a **modular and scenario-driven approach**, where each topic is paired with corresponding code examples from [torch_openreg][OpenReg URL], an official reference implementation.
The goal is to help developers:
* Understand the full scope of accelerator integration;
* Follow best practices to quickly launch new accelerators;
* Avoid common pitfalls through clear, targeted examples.
### Target Audience
This document is intended for:
* **Accelerator Developers** who are integrating accelerators into PyTorch;
* **Advanced PyTorch Users** interested in the inner workings of key modules;
Next, we will walk through the integration journey for a new PyTorch accelerator.
## Operators
For new accelerators, one of the most important and fundamental aspects of integration is supporting high-performance operators. To facilitate operator adaptation for users and accelerator developers, PyTorch provides multiple methods for developing and registering operators in both `Python` and `C++`. The following sections detail some of PyTorch's fundamental capabilities for operator registration.
```{note}
`Dispatch Key` is used to uniquely identify an accelerator within PyTorch, such as `CPU`, `CUDA`, `MPS`, and `PrivateUse1`. In theory, all subsequent new accelerators will share `PrivateUse1`, leveraging its comprehensive built-in scaffolding to complete their integration. Please refer to [Let's talk about the PyTorch dispatcher](https://blog.ezyang.com/2020/09/lets-talk-about-the-pytorch-dispatcher/) if you are interested in the dispatcher.
```
(operator-set)=
### Operator Set
PyTorch currently has over 3500 built-in operators (including related operator variants). This represents a significant workload from any perspective, and supporting this massive number of operators in a short period of time is no easy task. Therefore, as a first step in developing operators for a new backend, we should focus on the essential operators. For the remaining operators, we can initially rely on the community's fallback mechanism to keep functionality working, and then gradually implement them to improve the performance of the new backend.
The required operator set is listed below, primarily consisting of low-level operators required by factory functions and fallback operators:
| Operator Name | Dispatch Key | Description |
| :---: | :---: | :---: |
| empty.memory_format | PrivateUse1 | Create an uninitialized Tensor with the specified shape and memory layout (the stride is automatically calculated) |
| empty_strided | PrivateUse1 | Create an uninitialized Tensor of the specified shape and stride (more degrees of freedom) |
| as_strided | PrivateUse1 | Create a shared view of the input Tensor with new shape, stride, and offset (without allocating new memory) |
| view | PrivateUse1 | Create a shared view of the input Tensor with new shape, but the original Tensor must be memory-contiguous |
| _reshape_alias | PrivateUse1 | Creates a shared view without safety checks (internal version of reshape) |
| resize_ | PrivateUse1 | Modify the shape of the Tensor in place and reallocate memory if capacity is insufficient |
| _copy_from | PrivateUse1 | The underlying core function of Tensor.copy_ is responsible for the actual cross-device data copying |
| _copy_from_and_resize | PrivateUse1 | Combine `resize_` and `_copy_from` to resize first and then copy |
| _local_scalar_dense | PrivateUse1 | The underlying implementation of `.item()`, extracting values from Tensor to CPU scalars |
| set_.source_Tensor | PrivateUse1 | Set the current Tensor using the specified Tensor |
| set_.source_Storage | PrivateUse1 | Set the current Tensor using the specified Storage |
| set_.source_Storage_storage_offset | PrivateUse1 | Set the current Tensor using the specified Storage with the storage offset |
| fallback | PrivateUse1 | Fallback to CPU |
### Basics
Now that we have defined the initial scope of operator support, we can begin developing operator adaptations. This section will explain these implementations in `Python` and `C++` based on actual scenarios.
(step-one)=
#### Step 1
{ref}`The operators mentioned above <operator-set>` share a common characteristic: they are built-in PyTorch operators with defined `namespaces` and `Schemas`, and implementations for the built-in accelerators (`CPU`, `CUDA`, etc.) already exist. What we need to do next is implement these operators for the new accelerator.
::::{tab-set}
:::{tab-item} C++
```{eval-rst}
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/native/Minimal.cpp
:language: c++
:start-after: LITERALINCLUDE START: EMPTY.MEMORY_FORMAT IMPL
:end-before: LITERALINCLUDE END: EMPTY.MEMORY_FORMAT IMPL
:linenos:
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
:language: c++
:start-after: LITERALINCLUDE START: EMPTY.MEMORY_FORMAT WRAPPER
:end-before: LITERALINCLUDE END: EMPTY.MEMORY_FORMAT WRAPPER
:linenos:
```
:::
::::
Taking the `empty.memory_format` operator as an example, we first need to query the operator's `schema` information in `native_functions.yaml`, which contains detailed signature information. Then, we can implement the operator based on the capabilities of the new accelerator.
```Yaml
- func: empty.memory_format(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
dispatch:
CPU: empty_cpu
CUDA: empty_cuda
...
```
::::{tab-set-code}
```{eval-rst}
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
:language: c++
:start-after: LITERALINCLUDE START: TORCH_LIBRARY_IMPL DEFAULT
:end-before: LITERALINCLUDE END: TORCH_LIBRARY_IMPL DEFAULT
:emphasize-lines: 1,2
:linenos:
```
::::
After completing the `wrapper_empty_memory_format`, we can register `aten::empty.memory_format` for `PrivateUse1` through `TORCH_LIBRARY_IMPL`.
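As a rough orientation (not the actual OpenReg code), a minimal sketch of this wrapper-plus-registration pattern might look like the following; the wrapper body is only a placeholder, since a real backend would allocate memory on its own device:

```c++
#include <ATen/ATen.h>
#include <torch/library.h>

// Hypothetical wrapper matching the empty.memory_format schema; a real
// backend would allocate storage on the accelerator and build a Tensor on it.
at::Tensor wrapper_empty_memory_format(
    at::IntArrayRef size,
    std::optional<at::ScalarType> dtype,
    std::optional<at::Layout> layout,
    std::optional<at::Device> device,
    std::optional<bool> pin_memory,
    std::optional<at::MemoryFormat> memory_format) {
  // Placeholder only: allocate on the new accelerator here.
  TORCH_CHECK(false, "empty.memory_format not implemented in this sketch");
  return at::Tensor();
}

// Route aten::empty.memory_format to the wrapper for the PrivateUse1 key.
TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
  m.impl("empty.memory_format", wrapper_empty_memory_format);
}
```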
#### Step 2
By following {ref}`Step 1<step-one>`, we can complete the development and registration of all operators except `fallback`. Next, to support computational operators (such as mathematical and convolution operations), we need to register fallback semantics. This is a built-in capability of the PyTorch framework that falls back operations not yet supported by the new accelerator to the CPU for execution. For a new backend under development, this is an extremely effective way to ensure functionality at the expense of performance.
::::{tab-set}
:::{tab-item} C++
```{eval-rst}
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/native/Minimal.cpp
:language: c++
:start-after: LITERALINCLUDE START: FALLBACK IMPL
:end-before: LITERALINCLUDE END: FALLBACK IMPL
:emphasize-lines: 15
:linenos:
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
:language: c++
:start-after: LITERALINCLUDE START: FALLBACK WRAPPER
:end-before: LITERALINCLUDE END: FALLBACK WRAPPER
:linenos:
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
:language: c++
:start-after: LITERALINCLUDE START: FALLBACK GLOBAL
:end-before: LITERALINCLUDE END: FALLBACK GLOBAL
:linenos:
```
:::
::::
`wrapper_cpu_fallback` wraps the `at::native::cpu_fallback` method provided by PyTorch and is registered with `PrivateUse1` in PyTorch via `TORCH_LIBRARY_IMPL`. Subsequent operations not supported by the new backend will automatically fall back to the CPU for execution, and the results will be passed back to the new backend after execution.
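A minimal sketch of this global fallback registration (assuming names that mirror the OpenReg example) could look like this:

```c++
#include <ATen/native/CPUFallback.h>
#include <torch/library.h>

// Boxed kernel: forwards any op that reaches it to the CPU implementation
// and copies the results back for the calling device.
void wrapper_cpu_fallback(
    const c10::OperatorHandle& op,
    torch::jit::Stack* stack) {
  at::native::cpu_fallback(op, stack);
}

// The wildcard namespace `_` makes this the default kernel for every
// operator that has no PrivateUse1 registration of its own.
TORCH_LIBRARY_IMPL(_, PrivateUse1, m) {
  m.fallback(torch::CppFunction::makeFromBoxedFunction<&wrapper_cpu_fallback>());
}
```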
### Advanced
#### Selective Fallback
Enabling the fallback mechanism only for certain operators, while keeping PyTorch's default behavior for the rest (an error is raised when the accelerator has no corresponding operator implementation), is also a very reasonable scenario.
::::{tab-set}
:::{tab-item} C++
```{eval-rst}
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
:language: c++
:start-after: LITERALINCLUDE START: FALLBACK WRAPPER
:end-before: LITERALINCLUDE END: FALLBACK WRAPPER
:linenos:
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
:language: c++
:start-after: LITERALINCLUDE START: FALLBACK SINGLE
:end-before: LITERALINCLUDE END: FALLBACK SINGLE
:linenos:
```
:::
::::
Per-operator fallbacks are very similar to global fallbacks, the only difference being the registration method: calling `m.impl` registers an implementation for a specific operator, while `m.fallback` registers a default implementation for all operators.
::::{tab-set-code}
```{eval-rst}
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/native/Minimal.cpp
:language: c++
:start-after: LITERALINCLUDE START: FALLBACK IMPL
:end-before: LITERALINCLUDE END: FALLBACK IMPL
:emphasize-lines: 2-5
:linenos:
```
::::
Of course, a global fallback can also be combined with a blacklist of operators that must not fall back, which is a common approach, especially when only a few operators cannot fall back.
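For comparison with the global version, a per-operator fallback registration sketch (reusing the hypothetical `wrapper_cpu_fallback` from the sketch above) might look like this:

```c++
// Fall back only aten::sub.Tensor to the CPU; all other operators keep
// PyTorch's default behavior for the PrivateUse1 key.
TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
  m.impl(
      "sub.Tensor",
      torch::CppFunction::makeFromBoxedFunction<&wrapper_cpu_fallback>());
}
```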
#### PyTorch STUB
PyTorch also provides another approach for built-in operators: `STUB`. This method is essentially based on the {ref}`Step 1<step-one>` approach, but adds secondary scheduling capabilities (for example, scheduling based on CPU characteristics).
```{note}
The `STUB` method currently supports only a limited set of operators. For new accelerator devices, its advantage is that it significantly reduces development cost in exchange for a small performance overhead. PyTorch does not explicitly list the set of operators that can be registered through `STUB`; because there are so many of them, only a way to query the supported operator list is provided here.
```
```shell
pushd ${TORCH_ROOT}
find aten -type f -a -name "*.h" | xargs -I {} grep -wl "^DECLARE_DISPATCH" {}
popd
```
`DECLARE_DISPATCH` is a macro used to explicitly declare a `STUB`. Its declarations are spread across headers in the `aten` directory, and by searching for this macro you can find all operators that can be integrated using the `STUB` method.
```text
...
aten/src/ATen/native/Activation.h
aten/src/ATen/native/FusedSGD.h
aten/src/ATen/native/nested/NestedTensorBinaryOps.h
aten/src/ATen/native/TensorCompare.h
aten/src/ATen/native/Sorting.h
...
```
```c++
using unary_fn = void(*)(TensorIteratorBase&);
DECLARE_DISPATCH(unary_fn, abs_stub)
```
The listing above shows the files that declare `STUB` operators; in each one you can see the stub name and the associated function signature. Next, we will take `abs_stub` as an example to briefly introduce the path to supporting operators through `STUB`.
::::{tab-set}
:::{tab-item} C++
```{eval-rst}
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/native/Extra.cpp
:language: c++
:start-after: LITERALINCLUDE START: STUB ABS
:end-before: LITERALINCLUDE END: STUB ABS
:linenos:
```
```{eval-rst}
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegExtra.cpp
:language: c++
:start-after: LITERALINCLUDE START: STUB DEFAULT
:end-before: LITERALINCLUDE END: STUB DEFAULT
:emphasize-lines: 1
:linenos:
```
:::
::::
From the signature, we can see that the input of `abs_stub` is a `TensorIteratorBase`, a powerful helper class provided by PyTorch that holds all input and output tensors along with some auxiliary methods. Based on it, we can implement the `abs_kernel` and then call `REGISTER_PRIVATEUSE1_DISPATCH` with `abs_stub` to complete the registration.
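To make the shape of such a kernel concrete, here is a contiguous-only sketch that assumes (as torch_openreg does) that the backend's memory is host-accessible; a real kernel would respect strides and launch work on the device:

```c++
#include <ATen/Dispatch.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/UnaryOps.h>
#include <cmath>

namespace at::native {
namespace {

// Contiguous-only sketch: operand 0 is the output, operand 1 the input.
void abs_kernel(TensorIteratorBase& iter) {
  AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "abs_privateuse1", [&]() {
    auto* out = static_cast<scalar_t*>(iter.data_ptr(0));
    const auto* in = static_cast<const scalar_t*>(iter.data_ptr(1));
    for (int64_t i = 0; i < iter.numel(); ++i) {
      out[i] = std::abs(in[i]);
    }
  });
}

} // namespace

// Hook the kernel into the existing abs_stub for the PrivateUse1 key.
REGISTER_PRIVATEUSE1_DISPATCH(abs_stub, &abs_kernel);

} // namespace at::native
```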
#### Custom Operators
In addition to PyTorch's built-in operators, custom accelerator operators are commonly used to improve performance in specific scenarios. These can be categorized into three main approaches:
* Forward-only
* Forward and backward: Separate registration
* Forward and backward: Implemented using `torch.autograd.Function`
```{note}
There are more details in PyTorch tutorials, so refer to [PyTorch Custom Operators](https://docs.pytorch.org/tutorials/advanced/custom_ops_landing_page.html) if you are interested.
```
Here, we'll briefly introduce the implementation process of custom operators, focusing on the forward-only approach. The implementation can be summarized into the following three points:
1. **Define Schema:**
::::{tab-set}
:::{tab-item} C++
```{eval-rst}
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegExtra.cpp
:language: c++
:start-after: LITERALINCLUDE START: CUSTOM OPERATOR SCHEMA
:end-before: LITERALINCLUDE END: CUSTOM OPERATOR SCHEMA
:emphasize-lines: 2
:linenos:
```
:::
::::
* Namespace Name: `openreg`
* Function Name: `custom_abs`
* Input Parameters:
* Type: `Tensor`
* Name: `input`
* Output Type: `Tensor`
2. **Register Operator&Autograd Fallback:**
::::{tab-set}
:::{tab-item} C++
```{eval-rst}
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegExtra.cpp
:language: c++
:start-after: LITERALINCLUDE START: CUSTOM OPERATOR DEFAULT
:end-before: LITERALINCLUDE END: CUSTOM OPERATOR DEFAULT
:linenos:
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegExtra.cpp
:language: c++
:start-after: LITERALINCLUDE START: CUSTOM OPERATOR FALLBACK
:end-before: LITERALINCLUDE END: CUSTOM OPERATOR FALLBACK
:emphasize-lines: 2
:linenos:
```
:::
::::
Use `TORCH_LIBRARY_IMPL` to register the `wrapper_custom_abs` implementation for the `custom_abs` operator under `PrivateUse1`. However, because `Autograd` is always enabled in PyTorch, the dispatcher looks for and executes a corresponding backward implementation even when only forward computation is required (it falls through in the backward implementation). Therefore, we also need to register an implementation of the `custom_abs` operator for `AutogradPrivateUse1`. Fortunately, PyTorch provides a general `Autograd Fallback` mechanism named `torch::autograd::autogradNotImplementedFallback`: if only forward computation is involved, it is equivalent to a fallthrough operation that selects the next DispatchKey for computation; if backward computation is involved, it throws an error. A combined registration sketch is shown after this list.
3. **Register Metadata(optional, but required by the graph mode, etc.):**
::::{tab-set-code}
```{eval-rst}
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/torch_openreg/openreg/meta.py
:language: python
:start-after: LITERALINCLUDE START: CUSTOM OPERATOR META
:end-before: LITERALINCLUDE END: CUSTOM OPERATOR META
:linenos:
```
::::
PyTorch supports registering `Meta` in both C++ and Python. Since Python registration is simpler, Python is used as an example here. Similar to the `TORCH_LIBRARY_IMPL` function in C++, Python provides the more user-friendly `torch.library.impl` decorator.
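Pulling the three points together, a minimal C++ sketch of the schema definition, the `PrivateUse1` kernel, and the `AutogradPrivateUse1` fallback (with the meta registration left in Python as shown above) might look like this; the kernel body is a placeholder rather than the real OpenReg implementation:

```c++
#include <ATen/ATen.h>
#include <torch/library.h>
#include <torch/csrc/autograd/autograd_not_implemented_fallback.h>

// Hypothetical forward-only kernel for the custom operator.
at::Tensor wrapper_custom_abs(const at::Tensor& input) {
  return at::abs(input);  // placeholder body for the sketch
}

// 1. Define the schema in the custom namespace.
TORCH_LIBRARY(openreg, m) {
  m.def("custom_abs(Tensor input) -> Tensor");
}

// 2. Register the device kernel for PrivateUse1.
TORCH_LIBRARY_IMPL(openreg, PrivateUse1, m) {
  m.impl("custom_abs", &wrapper_custom_abs);
}

// 3. Register the generic autograd fallback: forward-only calls fall
//    through; backward calls raise a clear "not implemented" error.
TORCH_LIBRARY_IMPL(openreg, AutogradPrivateUse1, m) {
  m.fallback(torch::autograd::autogradNotImplementedFallback());
}
```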
### Tools
Operator registration in PyTorch is complex, with diverse registration methods and numerous scenarios. Therefore, the PyTorch community has provided a number of tools to help developers quickly understand the underlying principles and assist in troubleshooting. Here we briefly introduce several commonly used tools:
#### Commands
PyTorch provides a set of commands prefixed with `torch._C._dispatch_` around its Dispatch feature. You can query all related interfaces using the following command:
```Shell
python -c 'import torch; print("\n".join([x for x in dir(torch._C) if x.startswith("_dispatch_")]))'
...
_dispatch_dump
_dispatch_dump_table
_dispatch_has_kernel
_dispatch_has_kernel_for_any_dispatch_key
_dispatch_has_kernel_for_dispatch_key
_dispatch_isTensorSubclassLike
_dispatch_is_alias_key
_dispatch_is_included_in_alias
_dispatch_is_main_interpreter
_dispatch_kernel_for_dispatch_key_is_fallthrough
_dispatch_key_for_device
_dispatch_key_name
_dispatch_key_parse
_dispatch_key_set
...
```
Here are explanations for several commonly used commands:
* `torch._C._dispatch_key_set`:
Displays the DispatchKeySet of the current Tensor, with priority increasing from left to right.
```Python
>>> import torch
>>> a = torch.randn(3,3,device="cuda")
>>> torch._C._dispatch_key_set(a)
'DispatchKeySet(CUDA, ADInplaceOrView, AutogradCUDA, AutocastCUDA)'
```
* `torch._C._dispatch_dump_table`:
Queries the support status of a given operator across different Dispatch Keys, making it easy to locate the corresponding implementation code.
```Python
>>> import torch
>>> print(torch._C._dispatch_dump_table("aten::add.Tensor"))
>>> ...
CPU: registered at ./build/aten/src/ATen/RegisterCPU_0.cpp:1309 [kernel]
CUDA: registered at ./build/aten/src/ATen/RegisterCUDA_0.cpp:2420 [kernel]
HIP: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
MPS: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
IPU: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
XPU: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
HPU: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
VE: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
MTIA: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
MAIA: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
PrivateUse1: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
...
```
This makes it easy to find the corresponding implementation of the `aten::add.Tensor` operator on each backend, so you can trace the entire operator call path at the source-code level.
#### Environment Variables
PyTorch also provides some dispatcher-related environment variables that can help with learning and quickly locating issues.
* `TORCH_SHOW_DISPATCH_TRACE`:
Prints the dispatcher's internal dispatch-key scheduling in detail while PyTorch executes.
```Bash
export TORCH_SHOW_DISPATCH_TRACE=1
```
```Python
>>> import torch
>>> a = torch.randn(3,3)
[call] op=[aten::randn], key=[BackendSelect]
[redispatch] op=[aten::randn], key=[CPU]
[call] op=[aten::empty.memory_format], key=[BackendSelect]
[redispatch] op=[aten::empty.memory_format], key=[CPU]
[call] op=[aten::normal_], key=[CPU]
```
This shows every underlying operator invoked by a Python-level call inside PyTorch, including the operator name, the call hierarchy, and the dispatch key that handled each step.
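The trace can also be captured programmatically. Below is a minimal sketch, assuming your build honors `TORCH_SHOW_DISPATCH_TRACE`; it runs a small snippet in a subprocess with tracing enabled and collects both output streams, since the stream the trace is written to may differ across builds:
```Python
import os
import subprocess
import sys

# Run a small program with dispatch tracing enabled and collect its output.
env = dict(os.environ, TORCH_SHOW_DISPATCH_TRACE="1")
snippet = "import torch; torch.randn(3, 3).abs()"
result = subprocess.run(
    [sys.executable, "-c", snippet], env=env, capture_output=True, text=True
)

# [call]/[redispatch] lines are emitted by the dispatcher.
print(result.stdout)
print(result.stderr)
```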
[OpenReg URL]: https://github.com/pytorch/pytorch/tree/main/test/cpp_extensions/open_registration_extension/torch_openreg "OpenReg URL"

View File

@ -88,6 +88,7 @@ set(JIT_TEST_SRCS
${JIT_TEST_ROOT}/test_subgraph_matcher.cpp
${JIT_TEST_ROOT}/test_subgraph_rewriter.cpp
${JIT_TEST_ROOT}/test_subgraph_utils.cpp
${JIT_TEST_ROOT}/test_te.cpp
${JIT_TEST_ROOT}/test_union.cpp
${JIT_TEST_ROOT}/test_utils.cpp
${JIT_TEST_ROOT}/test_script_profile.cpp

41
test/cpp/jit/test_te.cpp Normal file
View File

@ -0,0 +1,41 @@
#include <gtest/gtest.h>
#include <test/cpp/jit/test_utils.h>
#include <torch/csrc/jit/ir/irparser.h>
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>
#include <iostream>
namespace torch {
namespace jit {
TEST(TETest, RemoveProfiling) {
auto g = std::make_shared<Graph>();
const auto graph_string = R"IR(
graph(%a : Tensor,
%b : bool):
%1 : None = prim::Constant()
%2 : Tensor? = prim::If(%b)
block0():
%3 : Tensor? = prim::profile[profiled_type=Tensor, seen_none=0](%1)
-> (%3)
block1():
%4 : Tensor = prim::profile[profiled_type=Tensor, seen_none=0](%a)
-> (%4)
return (%2))IR";
torch::jit::parseIR(graph_string, g.get());
g->lint();
RemoveProfileNodesAndSpecializeTypes(g);
g->lint();
testing::FileCheck()
.check("prim::Constant")
->check("prim::If")
->check("block")
->check("block")
->check("return")
->run(*g);
}
} // namespace jit
} // namespace torch

View File

@ -60,6 +60,7 @@ torch_openreg/
├── __init__.py
└── openreg
├── __init__.py
├── meta.py
└── random.py
```
@ -110,35 +111,18 @@ There are 4 DSOs in torch_openreg, and the dependencies between them are as foll
- Operator Implementation
- `TORCH_LIBRARY` form
- Registering a specific operator for an existing schema: See `empty.memory_format`
- Registering an operator with a custom schema
- Extending an existing namespace: (TODO)
- Custom namespace: See `custom_autograd_fn_returns_self`
- Autograd: See `custom_autograd_fn_returns_self`
- STUB form: See `abs_stub`
- Fallback
- Register for builtin PyTorch Operators
- `TORCH_LIBRARY_IMPL` form: See `empty.memory_format`
- `STUB` form: See `abs_stub`
- Register for custom operators
- Schema Registration: See `custom_abs`
- Kernel Registration: See `custom_abs`
- Fallback Registration for `AutogradPrivateUse1`: See `custom_abs`
- Meta Registration: See `custom_abs`
- `torch.autograd.Function`: See `custom_autograd_fn_aliasing`
- Register for fallback
- Per-operator Fallback: See `sub.Tensor`
- Global Fallback: See `wrapper_cpu_fallback`
- Per-operator Fallback: (TODO)
- AMP (TODO)
### Memory Management
- Device Memory Management (TODO)
- Host Memory Management (TODO)
### Custom Storage
- Adding custom device descriptions (TODO)
- Serialization support (TODO)
### Autoload
- (TODO)
...
## Installation and Usage
@ -177,7 +161,15 @@ print(f"Device of z: {z.device}")
## Future Plans
- **Enhance Features**: AMP, memory management, generators, distributed computing, etc. (to reiterate, the fundamental goal is to verify the integration mechanism).
- **Enhance Features**:
- Autoload
- AMP
- Device-agnostic APIs
- Memory Management
- Generator
- Distributed
- Custom Tensor & Storage
- ...
- **Improve Tests**: Add more test cases related to the integration mechanism.
- **Improve Documentation**: Add a new chapter on third-party device integration in the `Developer Notes` section of the PyTorch documentation.
- **Real-time Synchronization**: Keep the code and documentation updated iteratively and in sync.

View File

@ -3,16 +3,18 @@
#include <ATen/native/CPUFallback.h>
#include <ATen/native/DispatchStub.h>
#include <torch/csrc/autograd/autograd_not_implemented_fallback.h>
#include <torch/library.h>
namespace at::openreg {
namespace {
at::Tensor wrapper_quantize_per_tensor(
const at::Tensor& self,
double scale,
int64_t zero_point,
at::ScalarType dtype) {
return at::native::quantize_per_tensor_openreg(
return at::native::openreg::quantize_per_tensor(
self, scale, zero_point, dtype);
}
@ -25,10 +27,19 @@ int64_t wrapper__fused_sdp_choice(
bool is_causal,
std::optional<double> scale,
bool enable_gqa) {
return at::native::_fused_sdp_choice_openreg(
return at::native::openreg::_fused_sdp_choice(
query, key, value, attn_mask, dropout_p, is_causal, scale, enable_gqa);
}
void wrapper_quantize_tensor_per_tensor_affine_stub(
const at::Tensor& rtensor,
at::Tensor& qtensor,
double scale,
int64_t zero_point) {
at::native::openreg::quantize_tensor_per_tensor_affine_stub(
rtensor, qtensor, scale, zero_point);
}
std::tuple<
at::Tensor,
at::Tensor,
@ -48,7 +59,7 @@ wrapper__scaled_dot_product_fused_attention_overrideable(
bool is_causal,
bool return_debug_mask,
std::optional<double> scale) {
return at::native::_scaled_dot_product_fused_attention_overrideable_openreg(
return at::native::openreg::_scaled_dot_product_fused_attention_overrideable(
query,
key,
value,
@ -78,8 +89,8 @@ wrapper_scaled_dot_product_fused_attention_overrideable_backward(
const at::Tensor& philox_seed,
const at::Tensor& philox_offset,
std::optional<double> scale) {
return at::native::
_scaled_dot_product_fused_attention_overrideable_backward_openreg(
return at::native::openreg::
_scaled_dot_product_fused_attention_overrideable_backward(
grad_out,
query,
key,
@ -99,7 +110,66 @@ wrapper_scaled_dot_product_fused_attention_overrideable_backward(
scale);
}
at::Tensor wrapper_custom_autograd_fn_returns_self(at::Tensor x) {
return at::native::openreg::custom_autograd_fn_returns_self(x);
}
at::Tensor wrapper_custom_autograd_fn_aliasing(at::Tensor x) {
return at::native::openreg::custom_autograd_fn_aliasing(x);
}
at::Tensor& wrapper_abs_out(const at::Tensor& self, at::Tensor& out) {
return at::native::openreg::abs_out(self, out);
}
void wrapper_abs_stub(at::TensorIteratorBase& iter) {
at::native::openreg::abs_kernel(iter);
}
at::Tensor wrapper_custom_abs(at::Tensor x) {
return at::native::openreg::custom_abs(x);
}
} // namespace
using namespace at::native;
// Registration via STUB
// LITERALINCLUDE START: STUB DEFAULT
REGISTER_PRIVATEUSE1_DISPATCH(abs_stub, &wrapper_abs_stub);
REGISTER_PRIVATEUSE1_DISPATCH(
quantize_tensor_per_tensor_affine_stub,
&wrapper_quantize_tensor_per_tensor_affine_stub);
REGISTER_PRIVATEUSE1_DISPATCH(
_fused_sdp_choice_stub,
&wrapper__fused_sdp_choice);
// LITERALINCLUDE END: STUB DEFAULT
// Registration of custom operators
// LITERALINCLUDE START: CUSTOM OPERATOR SCHEMA
TORCH_LIBRARY(openreg, m) {
m.def("custom_abs(Tensor input)-> Tensor");
}
// LITERALINCLUDE END: CUSTOM OPERATOR SCHEMA
// LITERALINCLUDE START: CUSTOM OPERATOR DEFAULT
TORCH_LIBRARY_IMPL(openreg, PrivateUse1, m) {
m.impl("custom_abs", &wrapper_custom_abs);
}
// LITERALINCLUDE END: CUSTOM OPERATOR DEFAULT
// LITERALINCLUDE START: CUSTOM OPERATOR FALLBACK
TORCH_LIBRARY_IMPL(_, AutogradPrivateUse1, m) {
m.fallback(torch::autograd::autogradNotImplementedFallback());
}
// LITERALINCLUDE END: CUSTOM OPERATOR FALLBACK
// The rest is for testing purposes
TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
/*
abs_stub only works if abs.out is also registered with PrivateUse1, because
abs.default is designed to redirect directly to abs.out, which calls
abs_stub.
*/
m.impl("abs.out", &wrapper_abs_out);
m.impl("quantize_per_tensor", &wrapper_quantize_per_tensor);
m.impl("_fused_sdp_choice", &wrapper__fused_sdp_choice);
m.impl(
@ -110,10 +180,7 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
&wrapper_scaled_dot_product_fused_attention_overrideable_backward);
}
} // namespace at::openreg
namespace at::openreg {
TORCH_LIBRARY(openreg, m) {
TORCH_LIBRARY_FRAGMENT(openreg, m) {
m.def("custom_autograd_fn_returns_self(Tensor input)-> Tensor");
m.def("custom_autograd_fn_aliasing(Tensor(a) input)-> Tensor(a)");
}
@ -121,18 +188,8 @@ TORCH_LIBRARY(openreg, m) {
TORCH_LIBRARY_IMPL(openreg, AutogradPrivateUse1, m) {
m.impl(
"custom_autograd_fn_returns_self",
&at::native::custom_autograd_fn_returns_self);
m.impl(
"custom_autograd_fn_aliasing", &at::native::custom_autograd_fn_aliasing);
&wrapper_custom_autograd_fn_returns_self);
m.impl("custom_autograd_fn_aliasing", &wrapper_custom_autograd_fn_aliasing);
}
} // namespace at::openreg
namespace at::native {
REGISTER_PRIVATEUSE1_DISPATCH(abs_stub, &abs_kernel_openreg);
REGISTER_PRIVATEUSE1_DISPATCH(
quantize_tensor_per_tensor_affine_stub,
&quantize_tensor_per_tensor_affine_stub_openreg);
REGISTER_PRIVATEUSE1_DISPATCH(
_fused_sdp_choice_stub,
&_fused_sdp_choice_openreg);
} // namespace at::native
} // namespace at::openreg

View File

@ -7,6 +7,9 @@
namespace at::openreg {
namespace {
// LITERALINCLUDE START: EMPTY.MEMORY_FORMAT WRAPPER
at::Tensor wrapper_empty_memory_format(
c10::IntArrayRef size,
std::optional<c10::ScalarType> dtype_opt,
@ -14,7 +17,7 @@ at::Tensor wrapper_empty_memory_format(
std::optional<c10::Device> device_opt,
std::optional<bool> pin_memory_opt,
std::optional<c10::MemoryFormat> memory_format_opt) {
return at::native::empty_memory_format_openreg(
return at::native::openreg::empty_memory_format(
size,
dtype_opt,
layout_opt,
@ -22,6 +25,7 @@ at::Tensor wrapper_empty_memory_format(
pin_memory_opt,
memory_format_opt);
}
// LITERALINCLUDE END: EMPTY.MEMORY_FORMAT WRAPPER
at::Tensor wrapper_empty_strided(
c10::IntArrayRef size,
@ -30,7 +34,7 @@ at::Tensor wrapper_empty_strided(
std::optional<c10::Layout> layout_opt,
std::optional<c10::Device> device_opt,
std::optional<bool> pin_memory_opt) {
return at::native::empty_strided_openreg(
return at::native::openreg::empty_strided(
size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt);
}
@ -39,48 +43,48 @@ at::Tensor wrapper_as_strided(
c10::SymIntArrayRef size,
c10::SymIntArrayRef stride,
std::optional<c10::SymInt> storage_offset) {
return at::native::as_strided_openreg(self, size, stride, storage_offset);
return at::native::openreg::as_strided(self, size, stride, storage_offset);
}
const at::Tensor& wrapper_resize_(
const at::Tensor& self,
c10::SymIntArrayRef size,
::std::optional<at::MemoryFormat> memory_format) {
return at::native::resize_openreg_(self, size, memory_format);
return at::native::openreg::resize_(self, size, memory_format);
}
at::Tensor wrapper__reshape_alias(
const at::Tensor& self,
c10::SymIntArrayRef size,
c10::SymIntArrayRef stride) {
return at::native::_reshape_alias_openreg(self, size, stride);
return at::native::openreg::_reshape_alias(self, size, stride);
}
at::Tensor wrapper__copy_from(
const at::Tensor& self,
const at::Tensor& dst,
bool non_blocking) {
return at::native::_copy_from_openreg(self, dst, non_blocking);
return at::native::openreg::_copy_from(self, dst, non_blocking);
}
at::Tensor wrapper__copy_from_and_resize(
const at::Tensor& self,
const at::Tensor& dst) {
return at::native::_copy_from_and_resize_openreg(self, dst);
return at::native::openreg::_copy_from_and_resize(self, dst);
}
at::Scalar wrapper__local_scalar_densor(const at::Tensor& self) {
return at::native::_local_scalar_dense_openreg(self);
return at::native::openreg::_local_scalar_dense(self);
}
at::Tensor& wrapper_set_source_Tensor_(
at::Tensor& self,
const at::Tensor& source) {
return at::native::set_source_Tensor_openreg_(self, source);
return at::native::openreg::set_source_Tensor_(self, source);
}
at::Tensor& wrapper_set_source_Storage_(at::Tensor& self, at::Storage source) {
return at::native::set_source_Storage_openreg_(self, source);
return at::native::openreg::set_source_Storage_(self, source);
}
at::Tensor& wrapper_set_source_Storage_storage_offsetset_(
@ -89,14 +93,25 @@ at::Tensor& wrapper_set_source_Storage_storage_offsetset_(
int64_t storage_offset,
c10::IntArrayRef size,
c10::IntArrayRef stride) {
return at::native::set_source_Storage_storage_offset_openreg_(
return at::native::openreg::set_source_Storage_storage_offset_(
result, storage, storage_offset, size, stride);
}
at::Tensor wrapper_view(const at::Tensor& self, c10::SymIntArrayRef size) {
return at::native::view_openreg(self, size);
return at::native::openreg::view(self, size);
}
// LITERALINCLUDE START: FALLBACK WRAPPER
void wrapper_cpu_fallback(
const c10::OperatorHandle& op,
torch::jit::Stack* stack) {
at::native::openreg::cpu_fallback(op, stack);
}
// LITERALINCLUDE END: FALLBACK WRAPPER
} // namespace
// LITERALINCLUDE START: TORCH_LIBRARY_IMPL DEFAULT
TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
m.impl("empty.memory_format", wrapper_empty_memory_format);
m.impl("empty_strided", wrapper_empty_strided);
@ -113,16 +128,21 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
wrapper_set_source_Storage_storage_offsetset_);
m.impl("view", wrapper_view);
}
// LITERALINCLUDE END: TORCH_LIBRARY_IMPL DEFAULT
void wrapper_cpu_fallback(
const c10::OperatorHandle& op,
torch::jit::Stack* stack) {
at::native::cpu_fallback_openreg(op, stack);
}
// LITERALINCLUDE START: FALLBACK GLOBAL
TORCH_LIBRARY_IMPL(_, PrivateUse1, m) {
m.fallback(
torch::CppFunction::makeFromBoxedFunction<&wrapper_cpu_fallback>());
}
// LITERALINCLUDE END: FALLBACK GLOBAL
// LITERALINCLUDE START: FALLBACK SINGLE
TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
m.impl(
"sub.Tensor",
torch::CppFunction::makeFromBoxedFunction<&wrapper_cpu_fallback>());
}
// LITERALINCLUDE END: FALLBACK SINGLE
} // namespace at::openreg

View File

@ -10,6 +10,7 @@
#include <ATen/native/transformers/sdp_utils_cpp.h>
#include <ATen/ops/_local_scalar_dense_native.h>
#include <ATen/ops/_reshape_alias_native.h>
#include <ATen/ops/abs_native.h>
#include <ATen/ops/as_strided_cpu_dispatch.h>
#include <ATen/ops/copy_native.h>
#include <ATen/ops/quantize_per_tensor_native.h>
@ -24,26 +25,18 @@
#include <c10/core/Allocator.h>
#include <set>
#include <include/openreg.h>
namespace at::native {
namespace at::native::openreg {
class MemoryGuard {
public:
explicit MemoryGuard(const torch::jit::Stack& stack) {
for (const c10::IValue& ivalue : stack) {
find_and_unprotect_tensors(ivalue);
}
}
template <typename... Args>
explicit MemoryGuard(const Args&... args) {
(handler(args), ...);
(find_and_unprotect_tensors(args), ...);
}
~MemoryGuard() {
~MemoryGuard() noexcept {
for (void* ptr : unprotected_pointers_) {
orMemoryProtect(ptr);
}
@ -55,26 +48,31 @@ class MemoryGuard {
MemoryGuard& operator=(MemoryGuard&&) = delete;
private:
void find_and_unprotect_tensors(const c10::IValue& ivalue) {
if (ivalue.isTensor()) {
unprotect_if_needed(ivalue.toTensor());
} else if (ivalue.isTensorList()) {
for (const at::Tensor& tensor : ivalue.toTensorList()) {
unprotect_if_needed(tensor);
}
} else if (ivalue.isList()) {
for (const c10::IValue& element : ivalue.toListRef()) {
find_and_unprotect_tensors(element);
}
} else if (ivalue.isGenericDict()) {
for (const auto& pair : ivalue.toGenericDict()) {
find_and_unprotect_tensors(pair.key());
find_and_unprotect_tensors(pair.value());
template <typename T>
void find_and_unprotect_tensors(const T& item) {
if constexpr (std::is_base_of_v<at::TensorBase, T>) {
unprotect_if_needed(item);
} else if constexpr (std::is_same_v<T, c10::IValue>) {
if (item.isTensor()) {
unprotect_if_needed(item.toTensor());
} else if (item.isTensorList()) {
for (const at::Tensor& tensor : item.toTensorListRef()) {
unprotect_if_needed(tensor);
}
} else if (item.isList()) {
for (const c10::IValue& element : item.toListRef()) {
find_and_unprotect_tensors(element);
}
} else if (item.isGenericDict()) {
for (const auto& [key, value] : item.toGenericDict()) {
find_and_unprotect_tensors(key);
find_and_unprotect_tensors(value);
}
}
}
}
void unprotect_if_needed(const at::Tensor& tensor) {
void unprotect_if_needed(const at::TensorBase& tensor) {
if (!tensor.defined() || !tensor.has_storage()) {
return;
}
@ -82,25 +80,18 @@ class MemoryGuard {
void* ptr = tensor.data_ptr();
orPointerAttributes attr;
if (orPointerGetAttributes(&attr, ptr) == orSuccess) {
if (attr.type == orMemoryTypeDevice) {
if (unprotected_pointers_.find(attr.pointer) ==
unprotected_pointers_.end()) {
orMemoryUnprotect(attr.pointer);
unprotected_pointers_.insert(attr.pointer);
}
}
if (orPointerGetAttributes(&attr, ptr) != orSuccess ||
attr.type != orMemoryTypeDevice) {
return;
}
auto [it, inserted] = unprotected_pointers_.insert(attr.pointer);
if (inserted) {
orMemoryUnprotect(attr.pointer);
}
}
template <typename T>
void handler(const T& x) {
if constexpr (std::is_same_v<std::decay_t<T>, at::Tensor>) {
unprotect_if_needed(x);
}
}
std::set<void*> unprotected_pointers_;
std::unordered_set<void*> unprotected_pointers_;
};
} // namespace at::native
} // namespace at::native::openreg

View File

@ -1,8 +1,8 @@
#include "Extra.h"
namespace at::native {
namespace at::native::openreg {
at::Tensor quantize_per_tensor_openreg(
at::Tensor quantize_per_tensor(
const at::Tensor& self,
double scale,
int64_t zero_point,
@ -10,7 +10,7 @@ at::Tensor quantize_per_tensor_openreg(
return at::native::quantize_per_tensor(self, scale, zero_point, dtype);
}
int64_t _fused_sdp_choice_openreg(
int64_t _fused_sdp_choice(
const at::Tensor& query,
const at::Tensor& key,
const at::Tensor& value,
@ -23,6 +23,12 @@ int64_t _fused_sdp_choice_openreg(
return static_cast<int64_t>(backend);
}
void quantize_tensor_per_tensor_affine_stub(
const at::Tensor& rtensor,
at::Tensor& qtensor,
double scale,
int64_t zero_point) {}
std::tuple<
at::Tensor,
at::Tensor,
@ -33,7 +39,7 @@ std::tuple<
at::Tensor,
at::Tensor,
at::Tensor>
_scaled_dot_product_fused_attention_overrideable_openreg(
_scaled_dot_product_fused_attention_overrideable(
const at::Tensor& query,
const at::Tensor& key,
const at::Tensor& value,
@ -72,7 +78,7 @@ _scaled_dot_product_fused_attention_overrideable_openreg(
}
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
_scaled_dot_product_fused_attention_overrideable_backward_openreg(
_scaled_dot_product_fused_attention_overrideable_backward(
const at::Tensor& grad_out,
const at::Tensor& query,
const at::Tensor& key,
@ -97,104 +103,6 @@ _scaled_dot_product_fused_attention_overrideable_backward_openreg(
at::empty_like(attn_bias));
}
} // namespace at::native
namespace at::native {
void abs_kernel_openreg(at::TensorIteratorBase& iter) {
// Abs only have a input tensor and a output tensor.
auto& output_operand = iter.operand(0);
auto& input_operand = iter.operand(1);
auto& output_tensor_base = output_operand.tensor_base();
auto& input_tensor_base = input_operand.tensor_base();
TORCH_CHECK(
!input_operand.original_tensor_base().defined(),
"input original tensor is defined.");
TORCH_CHECK(
!output_operand.original_tensor_base().defined(),
"output original tensor is defined.");
// For easy test, only accept contiguous input tensor for calculate.
auto memory_format = input_tensor_base.suggest_memory_format();
TORCH_CHECK(
input_tensor_base.is_contiguous(memory_format),
"Input tensor need be contiguous.");
// Add necessary restrictions to ensure the security of the demo.
TORCH_CHECK(
input_tensor_base.sizes() == output_tensor_base.sizes(),
"Intput and output tensor size are not equal.");
// Common dtype is calculate in TensorIteratorBase.
TORCH_CHECK(
iter.common_dtype() == at::ScalarType::Float, "Only support float type.")
// Using for loop for abs calculate.
auto abs_function =
[](float* output_ptr, const float* input_ptr, const int64_t NUM) {
for (int64_t i = 0; i < NUM; ++i) {
*(output_ptr + i) = std::abs(*(input_ptr + i));
}
};
// To simplify the logic of the test demo code,
// we only use contiguous tensor to calculate on device side.
// And using input tensor memory format.
if (iter.is_contiguous()) {
// Add for will_resize flag check. You can convert to differernt
// tensor memory format when will_resize is True.
// If TensorIteratorConfig resize_outputs_ flag is true, and there are two
// situations:
// 1) Out tensor is undefined, and TensorIterator set will_resize to true;
// 2) Out tensor is defined and tensor size is not equal to input tensor
// size;
// TensorIterator set will_resize to true, and call
// set_output_raw_strided to resize output tensor.
// When output operand will_resize flag is ture, dummy
// device can convert tensor to dummy device preferred memory format.
// Here we don't convert tensor memory format, because it will become
// complex when dummy device want keep same memory format for training
// network.
TORCH_CHECK(
output_operand.will_resize,
"output operand will_resize flag need be True.");
abs_function(
(float*)iter.data_ptr(0), (float*)iter.data_ptr(1), iter.numel());
} else {
// Stride copy is not support for foo device, using cpu device instead.
// For abs op, the last situation is: output tensor is not contiguous with
// operand will_resize is False.
TORCH_CHECK(
!output_operand.will_resize, "output operand will_resize is True.");
// Get a contiguous tensor with input memory format.
at::Tensor output = at::empty(
output_tensor_base.sizes(),
input_tensor_base.options().memory_format(memory_format));
// For structured op which inheried from TensorIteratorBase, maybe you need
// to call set_output_raw_strided function to update output stored in op
// sturctured. abs op is no need to do this.
output_operand.exchange_tensor(
c10::MaybeOwned<at::TensorBase>::owned(std::in_place, output));
abs_function(
(float*)output_operand.tensor_base().mutable_data_ptr(),
(float*)iter.data_ptr(1),
iter.numel());
// Copy tensor base to original tensor base, and keep same scalar type and
// stride with cpu and gpu.
if (output_operand.original_tensor_base().defined() &&
!output_operand.original_tensor_base().is_same(
output_operand.tensor_base())) {
output_operand.original_tensor().copy_(output_operand.tensor());
output_operand.restore_original_tensor();
}
}
}
void quantize_tensor_per_tensor_affine_stub_openreg(
const at::Tensor& rtensor,
at::Tensor& qtensor,
double scale,
int64_t zero_point) {}
} // namespace at::native
namespace at::native {
namespace {
struct CustomAutogradFnReturnsSelf
: public torch::autograd::Function<CustomAutogradFnReturnsSelf> {
@ -235,4 +143,68 @@ at::Tensor custom_autograd_fn_aliasing(at::Tensor x) {
return CustomAutogradFnAliasing::apply(x);
}
} // namespace at::native
/*
This implementation is only used to test stub registration, so not all
capabilities are fully supported.
Current Limitations:
- dtype: Float only
- input tensor: must be contiguous layout
*/
// LITERALINCLUDE START: STUB ABS
void abs_kernel(at::TensorIteratorBase& iter) {
TORCH_CHECK(iter.ntensors() == 2, "Abs kernel expects 2 tensors");
TORCH_CHECK(
iter.common_dtype() == at::ScalarType::Float,
"Abs kernel only supports float type");
auto& output_tensor = iter.tensor(0);
auto& input_tensor = iter.tensor(1);
TORCH_CHECK(
input_tensor.sizes() == output_tensor.sizes(),
"Input and output tensor sizes must match.");
auto abs_loop = [](float* out_ptr, const float* in_ptr, int64_t n) {
for (int64_t i = 0; i < n; ++i) {
out_ptr[i] = std::abs(in_ptr[i]);
}
};
MemoryGuard guard(input_tensor, output_tensor);
if (iter.is_contiguous()) {
abs_loop(
static_cast<float*>(iter.data_ptr(0)),
static_cast<float*>(iter.data_ptr(1)),
iter.numel());
} else {
TORCH_CHECK(
input_tensor.is_contiguous(), "Input tensor must be contiguous.")
auto output = at::empty(
input_tensor.sizes(),
input_tensor.options().memory_format(
input_tensor.suggest_memory_format()));
MemoryGuard guard(output);
abs_loop(
static_cast<float*>(output.data_ptr()),
static_cast<float*>(iter.data_ptr(1)),
iter.numel());
output_tensor.copy_(output);
}
}
// LITERALINCLUDE END: STUB ABS
at::Tensor& abs_out(const at::Tensor& self, at::Tensor& out) {
return at::native::abs_out(self, out);
}
at::Tensor custom_abs(at::Tensor x) {
return at::abs(x);
}
} // namespace at::native::openreg

View File

@ -1,12 +1,13 @@
#include "Common.h"
namespace at::native {
at::Tensor quantize_per_tensor_openreg(
namespace at::native::openreg {
at::Tensor quantize_per_tensor(
const at::Tensor& self,
double scale,
int64_t zero_point,
at::ScalarType dtype);
int64_t _fused_sdp_choice_openreg(
int64_t _fused_sdp_choice(
const at::Tensor& query,
const at::Tensor& key,
const at::Tensor& value,
@ -15,6 +16,11 @@ int64_t _fused_sdp_choice_openreg(
bool is_causal,
std::optional<double> scale,
bool enable_gqa);
void quantize_tensor_per_tensor_affine_stub(
const at::Tensor& rtensor,
at::Tensor& qtensor,
double scale,
int64_t zero_point);
std::tuple<
at::Tensor,
at::Tensor,
@ -25,7 +31,7 @@ std::tuple<
at::Tensor,
at::Tensor,
at::Tensor>
_scaled_dot_product_fused_attention_overrideable_openreg(
_scaled_dot_product_fused_attention_overrideable(
const at::Tensor& query,
const at::Tensor& key,
const at::Tensor& value,
@ -35,7 +41,7 @@ _scaled_dot_product_fused_attention_overrideable_openreg(
bool return_debug_mask,
std::optional<double> scale);
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
_scaled_dot_product_fused_attention_overrideable_backward_openreg(
_scaled_dot_product_fused_attention_overrideable_backward(
const at::Tensor& grad_out,
const at::Tensor& query,
const at::Tensor& key,
@ -53,18 +59,11 @@ _scaled_dot_product_fused_attention_overrideable_backward_openreg(
const at::Tensor& philox_seed,
const at::Tensor& philox_offset,
std::optional<double> scale);
} // namespace at::native
namespace at::native {
void abs_kernel_openreg(at::TensorIteratorBase& iter);
void quantize_tensor_per_tensor_affine_stub_openreg(
const at::Tensor& rtensor,
at::Tensor& qtensor,
double scale,
int64_t zero_point);
} // namespace at::native
namespace at::native {
at::Tensor custom_autograd_fn_returns_self(at::Tensor x);
at::Tensor custom_autograd_fn_aliasing(at::Tensor x);
} // namespace at::native
at::Tensor& abs_out(const at::Tensor& self, at::Tensor& out);
void abs_kernel(at::TensorIteratorBase& iter);
at::Tensor custom_abs(at::Tensor x);
} // namespace at::native::openreg

View File

@ -1,8 +1,11 @@
#include "Minimal.h"
namespace at::native {
#include <unordered_set>
at::Tensor empty_memory_format_openreg(
namespace at::native::openreg {
// LITERALINCLUDE START: EMPTY.MEMORY_FORMAT IMPL
at::Tensor empty_memory_format(
c10::IntArrayRef size,
std::optional<c10::ScalarType> dtype_opt,
std::optional<c10::Layout> layout_opt,
@ -24,8 +27,9 @@ at::Tensor empty_memory_format_openreg(
return at::detail::empty_generic(
size, allocator, pu1_dks, dtype, memory_format_opt);
}
// LITERALINCLUDE END: EMPTY.MEMORY_FORMAT IMPL
at::Tensor empty_strided_openreg(
at::Tensor empty_strided(
c10::IntArrayRef size,
c10::IntArrayRef stride,
std::optional<c10::ScalarType> dtype_opt,
@ -48,7 +52,7 @@ at::Tensor empty_strided_openreg(
size, stride, allocator, pu1_dks, dtype);
}
at::Tensor as_strided_openreg(
at::Tensor as_strided(
const at::Tensor& self,
c10::SymIntArrayRef size,
c10::SymIntArrayRef stride,
@ -58,7 +62,7 @@ at::Tensor as_strided_openreg(
return at::cpu::as_strided_symint(self, size, stride, storage_offset);
}
const at::Tensor& resize_openreg_(
const at::Tensor& resize_(
const at::Tensor& self,
c10::SymIntArrayRef size,
::std::optional<at::MemoryFormat> memory_format) {
@ -66,7 +70,7 @@ const at::Tensor& resize_openreg_(
self, C10_AS_INTARRAYREF_SLOW(size), memory_format);
}
at::Tensor _reshape_alias_openreg(
at::Tensor _reshape_alias(
const at::Tensor& self,
c10::SymIntArrayRef size,
c10::SymIntArrayRef stride) {
@ -74,7 +78,7 @@ at::Tensor _reshape_alias_openreg(
self, C10_AS_INTARRAYREF_SLOW(size), C10_AS_INTARRAYREF_SLOW(stride));
}
at::Tensor _copy_from_openreg(
at::Tensor _copy_from(
const at::Tensor& self,
const at::Tensor& dst,
bool non_blocking) {
@ -124,50 +128,58 @@ at::Tensor _copy_from_openreg(
return dst;
}
at::Tensor _copy_from_and_resize_openreg(
at::Tensor _copy_from_and_resize(
const at::Tensor& self,
const at::Tensor& dst) {
at::native::resize_(dst, self.sizes(), std::nullopt);
MemoryGuard guard(self, dst);
return at::native::copy_(const_cast<at::Tensor&>(dst), self, false);
}
at::Scalar _local_scalar_dense_openreg(const at::Tensor& self) {
at::Scalar _local_scalar_dense(const at::Tensor& self) {
MemoryGuard guard(self);
return at::native::_local_scalar_dense_cpu(self);
}
at::Tensor& set_source_Tensor_openreg_(
at::Tensor& self,
const at::Tensor& source) {
at::Tensor& set_source_Tensor_(at::Tensor& self, const at::Tensor& source) {
return at::native::set_tensor_(self, source);
}
at::Tensor& set_source_Storage_openreg_(at::Tensor& self, at::Storage source) {
at::Tensor& set_source_Storage_(at::Tensor& self, at::Storage source) {
return at::native::set_(self, source);
}
at::Tensor& set_source_Storage_storage_offset_openreg_(
at::Tensor& set_source_Storage_storage_offset_(
at::Tensor& result,
at::Storage storage,
int64_t storage_offset,
c10::IntArrayRef size,
c10::IntArrayRef stride) {
// call native::
return at::cpu::set_(result, storage, storage_offset, size, stride);
}
at::Tensor view_openreg(const at::Tensor& self, c10::SymIntArrayRef size) {
at::Tensor view(const at::Tensor& self, c10::SymIntArrayRef size) {
MemoryGuard guard(self);
return at::native::view(self, C10_AS_INTARRAYREF_SLOW(size));
}
void cpu_fallback_openreg(
const c10::OperatorHandle& op,
torch::jit::Stack* stack) {
at::native::cpu_fallback(op, stack);
}
// LITERALINCLUDE START: FALLBACK IMPL
void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
static const std::unordered_set<c10::OperatorName> cpu_fallback_blacklist = {
c10::OperatorName("aten::abs", ""),
c10::OperatorName("aten::abs", "out"),
};
} // namespace at::native
const auto& op_name = op.schema().operator_name();
if (cpu_fallback_blacklist.count(op_name)) {
TORCH_CHECK(
false,
"Operator '",
op_name,
"' is not implemented for device openreg.");
} else {
at::native::cpu_fallback(op, stack);
}
}
// LITERALINCLUDE END: FALLBACK IMPL
} // namespace at::native::openreg

View File

@ -1,8 +1,8 @@
#include "Common.h"
namespace at::native {
namespace at::native::openreg {
at::Tensor empty_memory_format_openreg(
at::Tensor empty_memory_format(
c10::IntArrayRef size,
std::optional<c10::ScalarType> dtype_opt,
std::optional<c10::Layout> layout_opt,
@ -10,7 +10,7 @@ at::Tensor empty_memory_format_openreg(
std::optional<bool> pin_memory_opt,
std::optional<c10::MemoryFormat> memory_format_opt);
at::Tensor empty_strided_openreg(
at::Tensor empty_strided(
c10::IntArrayRef size,
c10::IntArrayRef stride,
std::optional<c10::ScalarType> dtype_opt,
@ -18,50 +18,44 @@ at::Tensor empty_strided_openreg(
std::optional<c10::Device> device_opt,
std::optional<bool> pin_memory_opt);
at::Tensor as_strided_openreg(
at::Tensor as_strided(
const at::Tensor& self,
c10::SymIntArrayRef size,
c10::SymIntArrayRef stride,
std::optional<c10::SymInt> storage_offset);
const at::Tensor& resize_openreg_(
const at::Tensor& resize_(
const at::Tensor& self,
c10::SymIntArrayRef size,
::std::optional<at::MemoryFormat> memory_format);
at::Tensor _reshape_alias_openreg(
at::Tensor _reshape_alias(
const at::Tensor& self,
c10::SymIntArrayRef size,
c10::SymIntArrayRef stride);
at::Tensor _copy_from_openreg(
at::Tensor _copy_from(
const at::Tensor& self,
const at::Tensor& dst,
bool non_blocking);
at::Tensor _copy_from_and_resize_openreg(
const at::Tensor& self,
const at::Tensor& dst);
at::Tensor _copy_from_and_resize(const at::Tensor& self, const at::Tensor& dst);
at::Scalar _local_scalar_dense_openreg(const at::Tensor& self);
at::Scalar _local_scalar_dense(const at::Tensor& self);
at::Tensor& set_source_Tensor_openreg_(
at::Tensor& self,
const at::Tensor& source);
at::Tensor& set_source_Tensor_(at::Tensor& self, const at::Tensor& source);
at::Tensor& set_source_Storage_openreg_(at::Tensor& self, at::Storage source);
at::Tensor& set_source_Storage_(at::Tensor& self, at::Storage source);
at::Tensor& set_source_Storage_storage_offset_openreg_(
at::Tensor& set_source_Storage_storage_offset_(
at::Tensor& result,
at::Storage storage,
int64_t storage_offset,
c10::IntArrayRef size,
c10::IntArrayRef stride);
at::Tensor view_openreg(const at::Tensor& self, c10::SymIntArrayRef size);
at::Tensor view(const at::Tensor& self, c10::SymIntArrayRef size);
void cpu_fallback_openreg(
const c10::OperatorHandle& op,
torch::jit::Stack* stack);
void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack);
} // namespace at::native
} // namespace at::native::openreg

View File

@ -103,6 +103,7 @@ def main():
"-Wno-unused-parameter",
"-Wno-missing-field-initializers",
"-Wno-unknown-pragmas",
"-fno-strict-aliasing",
]
ext_modules = [

View File

@ -2,6 +2,8 @@ import torch
import torch_openreg._C # type: ignore[misc]
from . import meta # noqa: F401
_initialized = False
@ -42,6 +44,10 @@ def set_device(device) -> None:
return torch_openreg._C._set_device(device)
def init():
_lazy_init()
def is_initialized():
return _initialized
@ -64,6 +70,7 @@ __all__ = [
"set_device",
"initial_seed",
"is_available",
"init",
"is_initialized",
"random",
"manual_seed",

View File

@ -0,0 +1,13 @@
import torch
# LITERALINCLUDE START: CUSTOM OPERATOR META
lib = torch.library.Library("openreg", "IMPL", "Meta") # noqa: TOR901
@torch.library.impl(lib, "custom_abs")
def custom_abs(self):
return torch.empty_like(self)
# LITERALINCLUDE END: CUSTOM OPERATOR META

Some files were not shown because too many files have changed in this diff.