Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-02 06:24:59 +08:00)
Update on "PythonArgParser::symintlist{,Optional} should use SymDimVector"
These lists are used (almost) exclusively for sizes and strides, which is exactly what SymDimVector is for: it avoids heap-allocating the list itself. OptionalArray had to be patched as well, because a lot of generated code converts these values to SymIntArrayRef and the generator is not easy to change to wrap them manually. The result is a small but clear perf improvement on the "detach DTensor in a loop" benchmark: symint lists are no longer heap-allocated, though destroying them still carries some cost because each SymInt needs cleanup. [ghstack-poisoned]
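For context, a minimal Python-side sketch of the kind of loop this parsing path sits on. It is an illustration only (the op, shapes, and iteration count are made up, and it is not the "detach DTensor in a loop" benchmark mentioned above); it simply hands a Python list of sizes to an operator so that PythonArgParser has to build a symint list on every call.

```python
# Hypothetical micro-benchmark, not the benchmark referenced above: time a loop
# that repeatedly passes a sizes list through the Python arg parser.
import time

import torch

t = torch.randn(8, 16)
sizes = [16, 8]  # parsed as a symintlist on every call

start = time.perf_counter()
for _ in range(100_000):
    t.view(sizes)  # reshaping forces the sizes list to be parsed each iteration
elapsed = time.perf_counter() - start
print(f"100k view(sizes) calls took {elapsed:.3f}s")
```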
@ -7,6 +7,15 @@ if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
fi

if [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0"
fi

# Compress the fatbin with -compress-mode=size for CUDA 13
if [[ "$DESIRED_CUDA" == *"13"* ]]; then
    export TORCH_NVCC_FLAGS="-compress-mode=size"
fi

SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
source $SCRIPTPATH/aarch64_ci_setup.sh

@ -77,21 +77,23 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
|
||||
wheelname = os.path.basename(wheel_path)
|
||||
os.mkdir(f"{folder}/tmp")
|
||||
os.system(f"unzip {wheel_path} -d {folder}/tmp")
|
||||
libs_to_copy = [
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
|
||||
# Common libraries for all CUDA versions
|
||||
common_libs = [
|
||||
# Non-NVIDIA system libraries
|
||||
"/lib64/libgomp.so.1",
|
||||
"/usr/lib64/libgfortran.so.5",
|
||||
"/acl/build/libarm_compute.so",
|
||||
"/acl/build/libarm_compute_graph.so",
|
||||
# Common CUDA libraries (same for all versions)
|
||||
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
|
||||
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
|
||||
"/usr/local/lib/libnvpl_lapack_core.so.0",
|
||||
"/usr/local/lib/libnvpl_blas_core.so.0",
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
|
||||
"/usr/local/cuda/lib64/libcudnn.so.9",
|
||||
"/usr/local/cuda/lib64/libcublas.so.12",
|
||||
"/usr/local/cuda/lib64/libcublasLt.so.12",
|
||||
"/usr/local/cuda/lib64/libcudart.so.12",
|
||||
"/usr/local/cuda/lib64/libcufft.so.11",
|
||||
"/usr/local/cuda/lib64/libcusparse.so.12",
|
||||
"/usr/local/cuda/lib64/libcusparseLt.so.0",
|
||||
"/usr/local/cuda/lib64/libcusolver.so.11",
|
||||
"/usr/local/cuda/lib64/libcurand.so.10",
|
||||
"/usr/local/cuda/lib64/libnccl.so.2",
|
||||
"/usr/local/cuda/lib64/libnvJitLink.so.12",
|
||||
"/usr/local/cuda/lib64/libnvrtc.so.12",
|
||||
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
|
||||
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
|
||||
@ -100,22 +102,41 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
|
||||
"/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
|
||||
"/lib64/libgomp.so.1",
|
||||
"/usr/lib64/libgfortran.so.5",
|
||||
"/acl/build/libarm_compute.so",
|
||||
"/acl/build/libarm_compute_graph.so",
|
||||
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
|
||||
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
|
||||
"/usr/local/lib/libnvpl_lapack_core.so.0",
|
||||
"/usr/local/lib/libnvpl_blas_core.so.0",
|
||||
"/usr/local/cuda/lib64/libcufile.so.0",
|
||||
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
|
||||
"/usr/local/cuda/lib64/libcusparse.so.12",
|
||||
]
|
||||
|
||||
if "129" in desired_cuda:
|
||||
libs_to_copy += [
|
||||
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9",
|
||||
"/usr/local/cuda/lib64/libcufile.so.0",
|
||||
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
|
||||
# CUDA version-specific libraries
|
||||
if "130" in desired_cuda:
|
||||
version_specific_libs = [
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
|
||||
"/usr/local/cuda/lib64/libcublas.so.13",
|
||||
"/usr/local/cuda/lib64/libcublasLt.so.13",
|
||||
"/usr/local/cuda/lib64/libcudart.so.13",
|
||||
"/usr/local/cuda/lib64/libcufft.so.12",
|
||||
"/usr/local/cuda/lib64/libcusolver.so.12",
|
||||
"/usr/local/cuda/lib64/libnvJitLink.so.13",
|
||||
"/usr/local/cuda/lib64/libnvrtc.so.13",
|
||||
"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0",
|
||||
]
|
||||
elif "12" in desired_cuda:
|
||||
# Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
|
||||
minor_version = desired_cuda[-1]
|
||||
version_specific_libs = [
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
|
||||
"/usr/local/cuda/lib64/libcublas.so.12",
|
||||
"/usr/local/cuda/lib64/libcublasLt.so.12",
|
||||
"/usr/local/cuda/lib64/libcudart.so.12",
|
||||
"/usr/local/cuda/lib64/libcufft.so.11",
|
||||
"/usr/local/cuda/lib64/libcusolver.so.11",
|
||||
"/usr/local/cuda/lib64/libnvJitLink.so.12",
|
||||
"/usr/local/cuda/lib64/libnvrtc.so.12",
|
||||
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
|
||||
]
|
||||
|
||||
# Combine all libraries
|
||||
libs_to_copy = common_libs + version_specific_libs
|
||||
|
||||
# Copy libraries to unzipped_folder/a/lib
|
||||
for lib_path in libs_to_copy:
|
||||
|
||||
@ -120,8 +120,8 @@ If your new Docker image needs a library installed from a specific pinned commit
If you're introducing a new argument to the Docker build, make sure to add it in the Docker build step in `.ci/docker/build.sh`:
```bash
docker build \
    ....
    --build-arg "NEW_ARG_1=${NEW_ARG_1}"
    ....
    --build-arg "NEW_ARG_1=${NEW_ARG_1}"
```

3. **Update Dockerfile logic**:

@ -173,8 +173,8 @@ case "$tag" in
|
||||
VISION=yes
|
||||
ONNX=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3.9-clang12)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
pytorch-linux-jammy-py3.10-clang12)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
CLANG_VERSION=12
|
||||
VISION=yes
|
||||
TRITON=yes
|
||||
@ -209,15 +209,7 @@ case "$tag" in
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
|
||||
;;
|
||||
pytorch-linux-jammy-xpu-2025.0-py3)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
XPU_VERSION=2025.0
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-xpu-2025.1-py3)
|
||||
pytorch-linux-jammy-xpu-n-1-py3)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
@ -225,6 +217,14 @@ case "$tag" in
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-xpu-n-py3)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
XPU_VERSION=2025.2
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
GCC_VERSION=11
|
||||
@ -234,8 +234,8 @@ case "$tag" in
|
||||
DOCS=yes
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
CUDA_VERSION=12.8.1
|
||||
CLANG_VERSION=12
|
||||
VISION=yes
|
||||
@ -246,8 +246,8 @@ case "$tag" in
|
||||
CLANG_VERSION=18
|
||||
VISION=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3.9-gcc11)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
pytorch-linux-jammy-py3.10-gcc11)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
|
||||
@ -10,7 +10,7 @@ else
  arch_path='sbsa'
fi

NVSHMEM_VERSION=3.3.20
NVSHMEM_VERSION=3.3.24

function install_cuda {
  version=$1
@ -65,7 +65,7 @@ function install_nvshmem {
  # This pattern is a lie as it is not consistent across versions, for 3.3.9 it was cuda_ver-arch-nvshhem-ver
  filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive"
  suffix=".tar.xz"
  url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}${suffix}"
  url="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/linux-${arch_path}/${filename}${suffix}"

  # download, unpack, install
  wget -q "${url}"
@ -148,7 +148,6 @@ function install_128 {

function install_130 {
  CUDNN_VERSION=9.12.0.46
  NVSHMEM_VERSION=3.3.20
  echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  # install CUDA 13.0 in the same container
  install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux

@ -57,7 +57,7 @@ if [ ! -f setup.py ]; then
  cd python
fi

pip_install pybind11==2.13.6
pip_install pybind11==3.0.1

# TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527
as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py

@ -146,11 +146,11 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
  XPU_DRIVER_VERSION="/lts/2350"
fi

# Default use Intel® oneAPI Deep Learning Essentials 2025.0
if [[ "$XPU_VERSION" == "2025.1" ]]; then
  XPU_PACKAGES="intel-deep-learning-essentials-2025.1"
# Default use Intel® oneAPI Deep Learning Essentials 2025.1
if [[ "$XPU_VERSION" == "2025.2" ]]; then
  XPU_PACKAGES="intel-deep-learning-essentials-2025.2"
else
  XPU_PACKAGES="intel-deep-learning-essentials-2025.0"
  XPU_PACKAGES="intel-deep-learning-essentials-2025.1"
fi

# The installation depends on the base OS

@ -175,6 +175,6 @@ ENV XPU_DRIVER_TYPE ROLLING
RUN python3 -m pip install --upgrade pip && \
    python3 -mpip install cmake==3.28.4
ADD ./common/install_xpu.sh install_xpu.sh
ENV XPU_VERSION 2025.1
ENV XPU_VERSION 2025.2
RUN bash ./install_xpu.sh && rm install_xpu.sh
RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd

@ -379,7 +379,7 @@ dataclasses_json==0.6.7
cmake==4.0.0
#Description: required for building

tlparse==0.3.30
tlparse==0.4.0
#Description: required for log parsing

cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"

143 .ci/lumen_cli/cli/lib/common/gh_summary.py (new file)
@ -0,0 +1,143 @@
from __future__ import annotations

import logging
import os
import textwrap
from pathlib import Path
from typing import TYPE_CHECKING

from cli.lib.common.utils import get_wheels
from jinja2 import Template


if TYPE_CHECKING:
    from collections.abc import Iterable, Mapping


logger = logging.getLogger(__name__)

_TPL_CONTENT = Template(
    textwrap.dedent("""\
    ## {{ title }}

    ```{{ lang }}
    {{ content }}
    ```
    """)
)

_TPL_LIST_ITEMS = Template(
    textwrap.dedent("""\
    ## {{ title }}
    {% for it in items %}
    - {{ it.pkg }}: {{ it.relpath }}
    {% else %}
    _(no item found)_
    {% endfor %}
    """)
)

_TPL_TABLE = Template(
    textwrap.dedent("""\
    {%- if rows %}
    | {{ cols | join(' | ') }} |
    |{%- for _ in cols %} --- |{%- endfor %}
    {%- for r in rows %}
    | {%- for c in cols %} {{ r.get(c, "") }} |{%- endfor %}
    {%- endfor %}
    {%- else %}
    _(no data)_
    {%- endif %}
    """)
)


def gh_summary_path() -> Path | None:
    """Return the Path to the GitHub step summary file, or None if not set."""
    p = os.environ.get("GITHUB_STEP_SUMMARY")
    return Path(p) if p else None


def write_gh_step_summary(md: str, *, append_content: bool = True) -> bool:
    """
    Write Markdown content to the GitHub Step Summary file if GITHUB_STEP_SUMMARY is set.
    append_content: default true, if True, append to the end of the file, else overwrite the whole file

    Returns:
        True if written successfully (in GitHub Actions environment),
        False if skipped (e.g., running locally where the variable is not set).
    """
    sp = gh_summary_path()
    if not sp:
        logger.info("[gh-summary] GITHUB_STEP_SUMMARY not set, skipping write.")
        return False

    md_clean = textwrap.dedent(md).strip() + "\n"

    mode = "a" if append_content else "w"
    with sp.open(mode, encoding="utf-8") as f:
        f.write(md_clean)
    return True


def md_heading(text: str, level: int = 2) -> str:
    """Generate a Markdown heading string with the given level (1-6)."""
    return f"{'#' * max(1, min(level, 6))} {text}\n"


def md_details(summary: str, content: str) -> str:
    """Generate a collapsible <details> block with a summary and inner content."""
    return f"<details>\n<summary>{summary}</summary>\n\n{content}\n\n</details>\n"


def summarize_content_from_file(
    output_dir: Path,
    freeze_file: str,
    title: str = "Content from file",
    code_lang: str = "",  # e.g. "text" or "ini"
) -> bool:
    f = Path(output_dir) / freeze_file
    if not f.exists():
        return False
    content = f.read_text(encoding="utf-8").strip()
    md = render_content(content, title=title, lang=code_lang)
    return write_gh_step_summary(md)


def summarize_wheels(path: Path, title: str = "Wheels", max_depth: int = 3):
    items = get_wheels(path, max_depth=max_depth)
    if not items:
        return False
    md = render_list(items, title=title)
    return write_gh_step_summary(md)


def md_kv_table(rows: Iterable[Mapping[str, str | int | float]]) -> str:
    """
    Render a list of dicts as a Markdown table using Jinja template.
    """
    rows = list(rows)
    cols = list({k for r in rows for k in r.keys()})
    md = _TPL_TABLE.render(cols=cols, rows=rows).strip() + "\n"
    return md


def render_list(
    items: Iterable[str],
    *,
    title: str = "List",
) -> str:
    tpl = _TPL_LIST_ITEMS
    md = tpl.render(title=title, items=items)
    return md


def render_content(
    content: str,
    *,
    title: str = "Content",
    lang: str = "text",
) -> str:
    tpl = _TPL_CONTENT
    md = tpl.render(title=title, content=content, lang=lang)
    return md

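To make the new helpers above concrete, here is a hedged usage sketch. It assumes the `cli` package from `.ci/lumen_cli` has been installed (for example with `pip install -e .ci/lumen_cli`); the package names and wheel paths in the rows are invented example values. When `GITHUB_STEP_SUMMARY` is not set, `write_gh_step_summary` logs and returns `False`, so the snippet is safe to run locally.

```python
# Illustrative only: compose a small Markdown summary with the helpers above.
from cli.lib.common.gh_summary import (
    md_heading,
    md_kv_table,
    render_content,
    write_gh_step_summary,
)

# Example rows; md_kv_table derives the table columns from the dict keys.
rows = [
    {"pkg": "torch", "relpath": "wheels/torch-2.9.0-cp310-linux_aarch64.whl"},
    {"pkg": "vllm", "relpath": "wheels/vllm-0.10.0-cp310-linux_aarch64.whl"},
]

md = md_heading("Example build summary")
md += md_kv_table(rows)
md += render_content("torch==2.9.0\nvllm==0.10.0", title="Pinned versions", lang="text")

written = write_gh_step_summary(md)  # False outside GitHub Actions
print("wrote step summary:", written)
```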
@ -45,7 +45,7 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules

        # Checkout pinned commit
        commit = get_post_build_pinned_commit(target)
        logger.info("Checking out pinned commit %s", commit)
        logger.info("Checking out pinned %s commit %s", target, commit)
        r.git.checkout(commit)

        # Update submodules if requested
@ -55,7 +55,7 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules
            sm.update(init=True, recursive=True, progress=PrintProgress())

        logger.info("Successfully cloned %s", target)
        return r
        return r, commit

    except GitCommandError as e:
        logger.error("Git operation failed: %s", e)

@ -4,7 +4,7 @@ import shlex
import shutil
import sys
from collections.abc import Iterable
from importlib.metadata import PackageNotFoundError, version
from importlib.metadata import PackageNotFoundError, version  # noqa: UP035
from typing import Optional, Union

from cli.lib.common.utils import run_command

@ -8,6 +8,7 @@ import shlex
import subprocess
import sys
from contextlib import contextmanager
from pathlib import Path
from typing import Optional


@ -115,3 +116,24 @@ def working_directory(path: str):
        yield
    finally:
        os.chdir(prev_cwd)


def get_wheels(
    output_dir: Path,
    max_depth: Optional[int] = None,
) -> list[str]:
    """Return a list of wheels found in the given output directory."""
    root = Path(output_dir)
    if not root.exists():
        return []
    items = []
    for dirpath, _, filenames in os.walk(root):
        depth = Path(dirpath).relative_to(root).parts
        if max_depth is not None and len(depth) > max_depth:
            continue
        for fname in sorted(filenames):
            if fname.endswith(".whl"):
                pkg = fname.split("-")[0]
                relpath = str((Path(dirpath) / fname).relative_to(root))
                items.append({"pkg": pkg, "relpath": relpath})
    return items

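A small sketch of how `get_wheels` behaves, under the same assumption that the `cli` package is importable; the directory layout and wheel names below are made up. Files nested more than `max_depth` directories below the root are skipped.

```python
# Illustrative only: get_wheels() returns {"pkg", "relpath"} dicts for *.whl files.
import tempfile
from pathlib import Path

from cli.lib.common.utils import get_wheels

with tempfile.TemporaryDirectory() as d:
    root = Path(d)
    (root / "wheels").mkdir()
    (root / "wheels" / "torch-2.9.0-cp310-linux_aarch64.whl").touch()
    (root / "wheels" / "deep" / "nested").mkdir(parents=True)
    (root / "wheels" / "deep" / "nested" / "extra-1.0-py3-none-any.whl").touch()

    # Only the wheel at depth 1 survives the max_depth filter.
    print(get_wheels(root, max_depth=1))
    # e.g. [{'pkg': 'torch', 'relpath': 'wheels/torch-2.9.0-cp310-linux_aarch64.whl'}]
```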
@ -1,13 +1,27 @@
import logging
import os
import textwrap
from typing import Any

from cli.lib.common.gh_summary import write_gh_step_summary
from cli.lib.common.git_helper import clone_external_repo
from cli.lib.common.pip_helper import pip_install_packages
from cli.lib.common.utils import run_command, temp_environ, working_directory
from jinja2 import Template


logger = logging.getLogger(__name__)

_TPL_VLLM_INFO = Template(
    textwrap.dedent("""\
    ## Vllm against Pytorch CI Test Summary
    **Vllm Commit**: [{{ vllm_commit }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})
    {%- if torch_sha %}
    **Pytorch Commit**: [{{ torch_sha }}](https://github.com/pytorch/pytorch/commit/{{ torch_sha }})
    {%- endif %}
    """)
)


def sample_vllm_test_library():
    """
@ -214,12 +228,13 @@ def run_test_plan(


def clone_vllm(dst: str = "vllm"):
    clone_external_repo(
    _, commit = clone_external_repo(
        target="vllm",
        repo="https://github.com/vllm-project/vllm.git",
        dst=dst,
        update_submodules=True,
    )
    return commit


def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> str:
@ -230,3 +245,12 @@ def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) ->
    for k in sorted(mapping, key=len, reverse=True):
        step = step.replace(k, mapping[k])
    return step


def summarize_build_info(vllm_commit: str) -> bool:
    torch_sha = os.getenv("GITHUB_SHA")
    md = (
        _TPL_VLLM_INFO.render(vllm_commit=vllm_commit, torch_sha=torch_sha).strip()
        + "\n"
    )
    return write_gh_step_summary(md)

@ -13,6 +13,11 @@ from cli.lib.common.envs_helper import (
    env_str_field,
    with_params_help,
)
from cli.lib.common.gh_summary import (
    gh_summary_path,
    summarize_content_from_file,
    summarize_wheels,
)
from cli.lib.common.path_helper import (
    copy,
    ensure_dir_exists,
@ -21,7 +26,7 @@ from cli.lib.common.path_helper import (
    is_path_exist,
)
from cli.lib.common.utils import run_command
from cli.lib.core.vllm.lib import clone_vllm
from cli.lib.core.vllm.lib import clone_vllm, summarize_build_info


logger = logging.getLogger(__name__)
@ -153,18 +158,43 @@ class VllmBuildRunner(BaseRunner):
        """
        inputs = VllmBuildParameters()
        logger.info("Running vllm build with inputs: %s", inputs)
        clone_vllm()
        vllm_commit = clone_vllm()

        self.cp_dockerfile_if_exist(inputs)

        # cp torch wheels from root direct to vllm workspace if exist
        self.cp_torch_whls_if_exist(inputs)

        ensure_dir_exists(inputs.output_dir)
        # make sure the output dir to store the build artifacts exist
        ensure_dir_exists(Path(inputs.output_dir))

        cmd = self._generate_docker_build_cmd(inputs)
        logger.info("Running docker build: \n %s", cmd)
        run_command(cmd, cwd="vllm", env=os.environ.copy())

        try:
            run_command(cmd, cwd="vllm", env=os.environ.copy())
        finally:
            self.genearte_vllm_build_summary(vllm_commit, inputs)

    def genearte_vllm_build_summary(
        self, vllm_commit: str, inputs: VllmBuildParameters
    ):
        if not gh_summary_path():
            return logger.info("Skipping, not detect GH Summary env var....")
        logger.info("Generate GH Summary ...")
        # summarize vllm build info
        summarize_build_info(vllm_commit)

        # summarize vllm build artifacts
        vllm_artifact_dir = inputs.output_dir / "wheels"
        summarize_content_from_file(
            vllm_artifact_dir,
            "build_summary.txt",
            title="Vllm build env pip package summary",
        )
        summarize_wheels(
            inputs.torch_whls_path, max_depth=3, title="Torch Wheels Artifacts"
        )
        summarize_wheels(vllm_artifact_dir, max_depth=3, title="Vllm Wheels Artifacts")

    def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str:
        if not inputs.use_torch_whl:

@ -220,6 +220,8 @@ def preprocess_test_in(
    target_path = Path(target_file)
    lines = target_path.read_text().splitlines()

    pkgs_to_add = []

    # Remove lines starting with the package names (==, @, >=) — case-insensitive
    pattern = re.compile(rf"^({'|'.join(pkgs_to_remove)})\s*(==|@|>=)", re.IGNORECASE)
    kept_lines = [line for line in lines if not pattern.match(line)]
@ -236,7 +238,11 @@ def preprocess_test_in(
    ]

    # Write back: header_lines + blank + kept_lines
    out = "\n".join(header_lines + [""] + kept_lines) + "\n"
    out_lines = header_lines + [""] + kept_lines
    if pkgs_to_add:
        out_lines += [""] + pkgs_to_add

    out = "\n".join(out_lines) + "\n"
    target_path.write_text(out)
    logger.info("[INFO] Updated %s", target_file)


@ -300,24 +300,3 @@ except RuntimeError as e:
    exit 1
  fi
fi

###############################################################################
# Check for C++ ABI compatibility to GCC-11 - GCC 13
###############################################################################
if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then
  pushd /tmp
  # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html
  # gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19
  # gcc 11 - CUDA 11.8, xpu, rocm
  # gcc 13 - CUDA 12.6, 12.8 and cpu
  # Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426
  if [[ "$(uname -m)" == "s390x" ]]; then
    cxx_abi="19"
  elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then
    cxx_abi="18"
  else
    cxx_abi="16"
  fi
  python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)"
  popd
fi

@ -149,13 +149,22 @@ function get_pinned_commit() {
  cat .github/ci_commit_pins/"${1}".txt
}

function detect_cuda_arch() {
  if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
    if command -v nvidia-smi; then
      TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1)
    elif [[ "${TEST_CONFIG}" == *nogpu* ]]; then
      # There won't be nvidia-smi in nogpu tests, so just set TORCH_CUDA_ARCH_LIST to the default
      # minimum supported value here
      TORCH_CUDA_ARCH_LIST=8.0
    fi
    export TORCH_CUDA_ARCH_LIST
  fi
}

function install_torchaudio() {
  local commit
  commit=$(get_pinned_commit audio)
  if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]] && command -v nvidia-smi; then
    TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1)
    export TORCH_CUDA_ARCH_LIST
  fi
  pip_build_and_install "git+https://github.com/pytorch/audio.git@${commit}" dist/audio
}

@ -45,6 +45,7 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then
  # DTensor tests
  time python test/run_test.py --verbose -i distributed/tensor/test_random_ops
  time python test/run_test.py --verbose -i distributed/tensor/test_dtensor_compile
  time python test/run_test.py --verbose -i distributed/tensor/test_utils.py

  # DeviceMesh test
  time python test/run_test.py --verbose -i distributed/test_device_mesh

@ -91,6 +91,7 @@ if [[ "$BUILD_ENVIRONMENT" == *clang9* || "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  export VALGRIND=OFF
fi

detect_cuda_arch

if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then
  # There are additional warnings on s390x, maybe due to newer gcc.
@ -1630,11 +1631,7 @@ elif [[ "${TEST_CONFIG}" == *xla* ]]; then
  build_xla
  test_xla
elif [[ "$TEST_CONFIG" == *vllm* ]]; then
  if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
    TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1)
    export TORCH_CUDA_ARCH_LIST
  fi
  echo "VLLM CI TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST"
  echo "vLLM CI uses TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST"
  (cd .ci/lumen_cli && python -m pip install -e .)
  python -m cli.run test external vllm --test-plan "$TEST_CONFIG" --shard-id "$SHARD_NUMBER" --num-shards "$NUM_TEST_SHARDS"
elif [[ "${TEST_CONFIG}" == *executorch* ]]; then

@ -44,7 +44,7 @@ python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==
python -m pip install z3-solver==4.15.1.0

# Install tlparse for test\dynamo\test_structured_trace.py UTs.
python -m pip install tlparse==0.3.30
python -m pip install tlparse==0.4.0

# Install parameterized
python -m pip install parameterized==0.8.1

@ -13,9 +13,9 @@ if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build"
|
||||
:xpu_bundle_install_start
|
||||
|
||||
set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI
|
||||
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe
|
||||
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
|
||||
set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product
|
||||
set XPU_BUNDLE_VERSION=2025.0.1+20
|
||||
set XPU_BUNDLE_VERSION=2025.1.3+5
|
||||
set XPU_BUNDLE_INSTALLED=0
|
||||
set XPU_BUNDLE_UNINSTALL=0
|
||||
set XPU_EXTRA_URL=NULL
|
||||
@ -24,9 +24,9 @@ set XPU_EXTRA_VERSION=2025.0.1+1226
|
||||
set XPU_EXTRA_INSTALLED=0
|
||||
set XPU_EXTRA_UNINSTALL=0
|
||||
|
||||
if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.1] (
|
||||
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
|
||||
set XPU_BUNDLE_VERSION=2025.1.3+5
|
||||
if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.2] (
|
||||
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
|
||||
set XPU_BUNDLE_VERSION=2025.2.1+20
|
||||
)
|
||||
|
||||
:: Check if XPU bundle is target version or already installed
|
||||
@ -90,14 +90,3 @@ if errorlevel 1 exit /b 1
|
||||
del xpu_extra.exe
|
||||
|
||||
:xpu_install_end
|
||||
|
||||
if not "%XPU_ENABLE_KINETO%"=="1" goto install_end
|
||||
:: Install Level Zero SDK
|
||||
set XPU_EXTRA_LZ_URL=https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip
|
||||
curl -k -L %XPU_EXTRA_LZ_URL% --output "%SRC_DIR%\temp_build\level_zero_sdk.zip"
|
||||
echo "Installing level zero SDK..."
|
||||
7z x "%SRC_DIR%\temp_build\level_zero_sdk.zip" -o"%SRC_DIR%\temp_build\level_zero"
|
||||
set "INCLUDE=%SRC_DIR%\temp_build\level_zero\include;%INCLUDE%"
|
||||
del "%SRC_DIR%\temp_build\level_zero_sdk.zip"
|
||||
|
||||
:install_end
|
||||
|
||||
@ -15,8 +15,7 @@ fi
|
||||
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
|
||||
export VC_YEAR=2022
|
||||
export USE_SCCACHE=0
|
||||
export XPU_VERSION=2025.1
|
||||
export XPU_ENABLE_KINETO=1
|
||||
export XPU_VERSION=2025.2
|
||||
fi
|
||||
|
||||
echo "Free space on filesystem before build:"
|
||||
|
||||
@ -8,7 +8,7 @@ export VC_YEAR=2022
|
||||
|
||||
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
|
||||
export VC_YEAR=2022
|
||||
export XPU_VERSION=2025.1
|
||||
export XPU_VERSION=2025.2
|
||||
fi
|
||||
|
||||
pushd "$PYTORCH_ROOT/.ci/pytorch/"
|
||||
|
||||
@ -48,6 +48,7 @@ runs:
|
||||
BASE_IMAGE: ${{ inputs.docker-image }}
|
||||
BUILD_TARGETS: ${{ inputs.build-targets }}
|
||||
PARENT_OUTPUT_DIR: ${{ inputs.output-dir}}
|
||||
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
2 .github/ci_commit_pins/vllm.txt (vendored)
@ -1 +1 @@
add1adfec742dfb13e614dab3372b5aafd1ff046
321938e9ac4000e0cb37e328359a7fd3026bc672

5 .github/ci_configs/vllm/Dockerfile.tmp_vllm (vendored)
@ -176,6 +176,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Build can take a long time, and the torch nightly version fetched from url can be different in next docker stage.
# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt

RUN cat torch_build_versions.txt
RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio'

@ -358,7 +359,7 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Build flashinfer for torch nightly from source around 10 mins
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
ARG FLASHINFER_GIT_REF="v0.2.9rc2"
ARG FLASHINFER_GIT_REF="v0.2.14.post1"
RUN --mount=type=cache,target=/root/.cache/uv \
    git clone --depth 1 --recursive --shallow-submodules \
        --branch ${FLASHINFER_GIT_REF} \
@ -376,6 +377,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \

# Logging to confirm the torch versions
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio\|^xformers\|^vllm\|^flashinfer' > build_summary.txt
################### VLLM INSTALLED IMAGE ####################


@ -433,4 +435,5 @@ FROM scratch as export-wheels
# Just copy the wheels we prepared in previous stages
COPY --from=base /workspace/xformers-dist /wheels/xformers
COPY --from=build /workspace/vllm-dist /wheels/vllm
COPY --from=vllm-base /workspace/build_summary.txt /wheels/build_summary.txt
COPY --from=vllm-base /workspace/wheels/flashinfer /wheels/flashinfer-python

3 .github/labeler.yml (vendored)
@ -41,6 +41,9 @@
- test/inductor/**
- test/dynamo/**

"ciflow/vllm":
- .github/ci_commit_pins/vllm.txt

"module: cpu":
- aten/src/ATen/cpu/**
- aten/src/ATen/native/cpu/**

@ -28,7 +28,7 @@ pyyaml==6.0.2
scipy==1.12.0
setuptools==72.1.0
sympy==1.13.3
tlparse==0.3.30
tlparse==0.4.0
tensorboard==2.13.0
typing-extensions==4.12.2
unittest-xml-reporting<=3.2.0,>=2.0.0

46 .github/scripts/generate_binary_build_matrix.py (vendored)
@ -40,7 +40,7 @@ CPU_AARCH64_ARCH = ["cpu-aarch64"]
|
||||
|
||||
CPU_S390X_ARCH = ["cpu-s390x"]
|
||||
|
||||
CUDA_AARCH64_ARCHES = ["12.9-aarch64"]
|
||||
CUDA_AARCH64_ARCHES = ["12.9-aarch64", "13.0-aarch64"]
|
||||
|
||||
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
@ -107,32 +107,32 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'"
|
||||
),
|
||||
"xpu": (
|
||||
"intel-cmplr-lib-rt==2025.1.1 | "
|
||||
"intel-cmplr-lib-ur==2025.1.1 | "
|
||||
"intel-cmplr-lic-rt==2025.1.1 | "
|
||||
"intel-sycl-rt==2025.1.1 | "
|
||||
"oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"onemkl-sycl-blas==2025.1.0 | "
|
||||
"onemkl-sycl-dft==2025.1.0 | "
|
||||
"onemkl-sycl-lapack==2025.1.0 | "
|
||||
"onemkl-sycl-rng==2025.1.0 | "
|
||||
"onemkl-sycl-sparse==2025.1.0 | "
|
||||
"dpcpp-cpp-rt==2025.1.1 | "
|
||||
"intel-opencl-rt==2025.1.1 | "
|
||||
"mkl==2025.1.0 | "
|
||||
"intel-openmp==2025.1.1 | "
|
||||
"tbb==2022.1.0 | "
|
||||
"tcmlib==1.3.0 | "
|
||||
"umf==0.10.0 | "
|
||||
"intel-pti==0.12.3"
|
||||
"intel-cmplr-lib-rt==2025.2.1 | "
|
||||
"intel-cmplr-lib-ur==2025.2.1 | "
|
||||
"intel-cmplr-lic-rt==2025.2.1 | "
|
||||
"intel-sycl-rt==2025.2.1 | "
|
||||
"oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"onemkl-sycl-blas==2025.2.0 | "
|
||||
"onemkl-sycl-dft==2025.2.0 | "
|
||||
"onemkl-sycl-lapack==2025.2.0 | "
|
||||
"onemkl-sycl-rng==2025.2.0 | "
|
||||
"onemkl-sycl-sparse==2025.2.0 | "
|
||||
"dpcpp-cpp-rt==2025.2.1 | "
|
||||
"intel-opencl-rt==2025.2.1 | "
|
||||
"mkl==2025.2.0 | "
|
||||
"intel-openmp==2025.2.1 | "
|
||||
"tbb==2022.2.0 | "
|
||||
"tcmlib==1.4.0 | "
|
||||
"umf==0.11.0 | "
|
||||
"intel-pti==0.13.1"
|
||||
),
|
||||
}
|
||||
|
||||
@ -210,7 +210,7 @@ LIBTORCH_CONTAINER_IMAGES: dict[str, str] = {
|
||||
"cpu": "libtorch-cxx11-builder:cpu",
|
||||
}
|
||||
|
||||
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
|
||||
FULL_PYTHON_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
|
||||
|
||||
|
||||
def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
|
||||
|
||||
8 .github/scripts/test_trymerge.py (vendored)
@ -124,7 +124,7 @@ def mock_parse_args(revert: bool = False, force: bool = False) -> Any:
|
||||
self.force = force
|
||||
self.pr_num = 76123
|
||||
self.dry_run = True
|
||||
self.comment_id = 0
|
||||
self.comment_id = 12345 # Set to non-zero value
|
||||
self.reason = "this is for testing"
|
||||
self.ignore_current = False
|
||||
self.check_mergeability = False
|
||||
@ -152,9 +152,9 @@ def mock_revert(
|
||||
def mock_merge(
|
||||
pr: GitHubPR,
|
||||
repo: GitRepo,
|
||||
comment_id: int,
|
||||
dry_run: bool = False,
|
||||
skip_mandatory_checks: bool = False,
|
||||
comment_id: Optional[int] = None,
|
||||
timeout_minutes: int = 400,
|
||||
stale_pr_days: int = 3,
|
||||
ignore_current: bool = False,
|
||||
@ -470,9 +470,9 @@ class TestTryMerge(TestCase):
|
||||
mock_merge.assert_called_once_with(
|
||||
mock.ANY,
|
||||
mock.ANY,
|
||||
comment_id=mock.ANY,
|
||||
dry_run=mock.ANY,
|
||||
skip_mandatory_checks=True,
|
||||
comment_id=mock.ANY,
|
||||
ignore_current=False,
|
||||
)
|
||||
|
||||
@ -485,9 +485,9 @@ class TestTryMerge(TestCase):
|
||||
mock_merge.assert_called_once_with(
|
||||
mock.ANY,
|
||||
mock.ANY,
|
||||
comment_id=mock.ANY,
|
||||
dry_run=mock.ANY,
|
||||
skip_mandatory_checks=False,
|
||||
comment_id=mock.ANY,
|
||||
ignore_current=False,
|
||||
)
|
||||
|
||||
|
||||
72 .github/scripts/trymerge.py (vendored)
@ -737,16 +737,24 @@ class GitHubPR:
|
||||
def last_commit(self) -> Any:
|
||||
return self.info["commits"]["nodes"][-1]["commit"]
|
||||
|
||||
def last_commit_sha(self, default: Optional[str] = None) -> str:
|
||||
# for commits, the oid is the sha
|
||||
|
||||
if default is None:
|
||||
return str(self.last_commit()["oid"])
|
||||
|
||||
return str(self.last_commit().get("oid", default))
|
||||
|
||||
def get_merge_base(self) -> str:
|
||||
if self.merge_base:
|
||||
return self.merge_base
|
||||
|
||||
last_commit_oid = self.last_commit()["oid"]
|
||||
last_commit_sha = self.last_commit_sha()
|
||||
# NB: We could use self.base_ref() here for regular PR, however, that doesn't
|
||||
# work for ghstack where the base is the custom branch, i.e. gh/USER/ID/base,
|
||||
# so let's just use main instead
|
||||
self.merge_base = gh_fetch_merge_base(
|
||||
self.org, self.project, last_commit_oid, self.default_branch()
|
||||
self.org, self.project, last_commit_sha, self.default_branch()
|
||||
)
|
||||
|
||||
# Fallback to baseRefOid if the API call fails, i.e. rate limit. Note that baseRefOid
|
||||
@ -1151,7 +1159,7 @@ class GitHubPR:
|
||||
*,
|
||||
skip_mandatory_checks: bool = False,
|
||||
dry_run: bool = False,
|
||||
comment_id: Optional[int] = None,
|
||||
comment_id: int,
|
||||
ignore_current_checks: Optional[list[str]] = None,
|
||||
) -> None:
|
||||
# Raises exception if matching rule is not found
|
||||
@ -1167,7 +1175,7 @@ class GitHubPR:
|
||||
skip_internal_checks=can_skip_internal_checks(self, comment_id),
|
||||
ignore_current_checks=ignore_current_checks,
|
||||
)
|
||||
additional_merged_prs = self.merge_changes(
|
||||
additional_merged_prs = self.merge_changes_locally(
|
||||
repo, skip_mandatory_checks, comment_id
|
||||
)
|
||||
|
||||
@ -1196,7 +1204,7 @@ class GitHubPR:
|
||||
broken_trunk_checks=ignorable_checks.get("BROKEN_TRUNK", []),
|
||||
flaky_checks=ignorable_checks.get("FLAKY", []),
|
||||
unstable_checks=ignorable_checks.get("UNSTABLE", []),
|
||||
last_commit_sha=self.last_commit().get("oid", ""),
|
||||
last_commit_sha=self.last_commit_sha(default=""),
|
||||
merge_base_sha=self.get_merge_base(),
|
||||
merge_commit_sha=merge_commit_sha,
|
||||
is_failed=False,
|
||||
@ -1217,7 +1225,7 @@ class GitHubPR:
|
||||
dry_run=dry_run,
|
||||
)
|
||||
|
||||
def merge_changes(
|
||||
def merge_changes_locally(
|
||||
self,
|
||||
repo: GitRepo,
|
||||
skip_mandatory_checks: bool = False,
|
||||
@ -1231,22 +1239,7 @@ class GitHubPR:
|
||||
branch_to_merge_into = self.default_branch() if branch is None else branch
|
||||
if repo.current_branch() != branch_to_merge_into:
|
||||
repo.checkout(branch_to_merge_into)
|
||||
if not self.is_ghstack_pr():
|
||||
msg = self.gen_commit_message()
|
||||
pr_branch_name = f"__pull-request-{self.pr_num}__init__"
|
||||
repo.fetch(self.last_commit()["oid"], pr_branch_name)
|
||||
repo._run_git("merge", "--squash", pr_branch_name)
|
||||
repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
|
||||
|
||||
# Did the PR change since we started the merge?
|
||||
pulled_sha = repo.show_ref(pr_branch_name)
|
||||
latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
|
||||
if pulled_sha != latest_pr_status.last_commit()["oid"]:
|
||||
raise RuntimeError(
|
||||
"PR has been updated since CI checks last passed. Please rerun the merge command."
|
||||
)
|
||||
return []
|
||||
else:
|
||||
if self.is_ghstack_pr():
|
||||
return self.merge_ghstack_into(
|
||||
repo,
|
||||
skip_mandatory_checks,
|
||||
@ -1254,6 +1247,21 @@ class GitHubPR:
|
||||
skip_all_rule_checks=skip_all_rule_checks,
|
||||
)
|
||||
|
||||
msg = self.gen_commit_message()
|
||||
pr_branch_name = f"__pull-request-{self.pr_num}__init__"
|
||||
repo.fetch(self.last_commit_sha(), pr_branch_name)
|
||||
repo._run_git("merge", "--squash", pr_branch_name)
|
||||
repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
|
||||
|
||||
# Did the PR change since we started the merge?
|
||||
pulled_sha = repo.show_ref(pr_branch_name)
|
||||
latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
|
||||
if pulled_sha != latest_pr_status.last_commit_sha():
|
||||
raise RuntimeError(
|
||||
"PR has been updated since CI checks last passed. Please rerun the merge command."
|
||||
)
|
||||
return []
|
||||
|
||||
|
||||
class MergeRuleFailedError(RuntimeError):
|
||||
def __init__(self, message: str, rule: Optional["MergeRule"] = None) -> None:
|
||||
@ -1458,7 +1466,7 @@ def find_matching_merge_rule(
|
||||
pending_checks = []
|
||||
failed_checks = []
|
||||
|
||||
hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit()['oid']}"
|
||||
hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit_sha()}"
|
||||
if len(failed_checks) > 0:
|
||||
if reject_reason_score < 30000:
|
||||
reject_reason_score = 30000
|
||||
@ -2156,14 +2164,14 @@ def categorize_checks(
|
||||
def merge(
|
||||
pr: GitHubPR,
|
||||
repo: GitRepo,
|
||||
comment_id: int,
|
||||
dry_run: bool = False,
|
||||
skip_mandatory_checks: bool = False,
|
||||
comment_id: Optional[int] = None,
|
||||
timeout_minutes: int = 400,
|
||||
stale_pr_days: int = 3,
|
||||
ignore_current: bool = False,
|
||||
) -> None:
|
||||
initial_commit_sha = pr.last_commit()["oid"]
|
||||
initial_commit_sha = pr.last_commit_sha()
|
||||
pr_link = f"https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num}"
|
||||
print(f"Attempting merge of {initial_commit_sha} ({pr_link})")
|
||||
|
||||
@ -2234,7 +2242,7 @@ def merge(
|
||||
f"Attempting merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} ({elapsed_time / 60} minutes elapsed)"
|
||||
)
|
||||
pr = GitHubPR(pr.org, pr.project, pr.pr_num)
|
||||
if initial_commit_sha != pr.last_commit()["oid"]:
|
||||
if initial_commit_sha != pr.last_commit_sha():
|
||||
raise RuntimeError(
|
||||
"New commits were pushed while merging. Please rerun the merge command."
|
||||
)
|
||||
@ -2401,7 +2409,7 @@ def main() -> None:
|
||||
if args.check_mergeability:
|
||||
if pr.is_ghstack_pr():
|
||||
get_ghstack_prs(repo, pr) # raises error if out of sync
|
||||
pr.merge_changes(
|
||||
pr.merge_changes_locally(
|
||||
repo,
|
||||
skip_mandatory_checks=True,
|
||||
skip_all_rule_checks=True,
|
||||
@ -2416,12 +2424,18 @@ def main() -> None:
|
||||
gh_post_pr_comment(org, project, args.pr_num, message, dry_run=args.dry_run)
|
||||
return
|
||||
try:
|
||||
# Ensure comment id is set, else fail
|
||||
if not args.comment_id:
|
||||
raise ValueError(
|
||||
"Comment ID is required for merging PRs, please provide it using --comment-id"
|
||||
)
|
||||
|
||||
merge(
|
||||
pr,
|
||||
repo,
|
||||
comment_id=args.comment_id,
|
||||
dry_run=args.dry_run,
|
||||
skip_mandatory_checks=args.force,
|
||||
comment_id=args.comment_id,
|
||||
ignore_current=args.ignore_current,
|
||||
)
|
||||
except Exception as e:
|
||||
@ -2443,7 +2457,7 @@ def main() -> None:
|
||||
broken_trunk_checks=[],
|
||||
flaky_checks=[],
|
||||
unstable_checks=[],
|
||||
last_commit_sha=pr.last_commit().get("oid", ""),
|
||||
last_commit_sha=pr.last_commit_sha(default=""),
|
||||
merge_base_sha=pr.get_merge_base(),
|
||||
is_failed=True,
|
||||
skip_mandatory_checks=args.force,
|
||||
|
||||
2 .github/templates/common.yml.j2 (vendored)
@ -4,7 +4,7 @@
|
||||
{%- set download_artifact_action = "actions/download-artifact@v4.1.7" -%}
|
||||
|
||||
{%- set timeout_minutes = 240 -%}
|
||||
{%- set timeout_minutes_windows_binary = 300 -%}
|
||||
{%- set timeout_minutes_windows_binary = 360 -%}
|
||||
|
||||
{%- macro concurrency(build_environment) -%}
|
||||
concurrency:
|
||||
|
||||
2 .github/workflows/_link_check.yml (vendored)
@ -13,6 +13,7 @@ jobs:
|
||||
if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-url-lint') }}
|
||||
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
|
||||
with:
|
||||
job-name: lint-urls
|
||||
timeout: 120
|
||||
runner: ${{ inputs.runner }}linux.2xlarge
|
||||
docker-image: ci-image:pytorch-linux-jammy-linter
|
||||
@ -38,6 +39,7 @@ jobs:
|
||||
if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-xref-lint') }}
|
||||
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
|
||||
with:
|
||||
job-name: lint-xrefs
|
||||
timeout: 60
|
||||
runner: ${{ inputs.runner }}linux.2xlarge
|
||||
docker-image: ci-image:pytorch-linux-jammy-linter
|
||||
|
||||
2 .github/workflows/_linux-test.yml (vendored)
@ -409,7 +409,7 @@ jobs:
|
||||
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
|
||||
|
||||
- name: Authenticate with AWS
|
||||
if: ${{ contains(matrix.runner, 'b200') }}
|
||||
if: ${{ always() && contains(matrix.runner, 'b200') }}
|
||||
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
|
||||
|
||||
2 .github/workflows/build-triton-wheel.yml (vendored)
@ -145,7 +145,7 @@ jobs:
|
||||
fi
|
||||
|
||||
docker exec -t "${container_name}" yum install -y zlib-devel zip
|
||||
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==2.13.1 auditwheel wheel
|
||||
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==3.0.1 auditwheel wheel
|
||||
set +e
|
||||
docker exec -t "${container_name}" command -v pip
|
||||
has_pip=$?
|
||||
|
||||
12 .github/workflows/docker-builds.yml (vendored)
@ -56,18 +56,18 @@ jobs:
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks,
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
|
||||
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
|
||||
pytorch-linux-jammy-py3.9-clang12,
|
||||
pytorch-linux-jammy-py3.10-clang12,
|
||||
pytorch-linux-jammy-py3.13-clang12,
|
||||
pytorch-linux-jammy-rocm-n-py3,
|
||||
pytorch-linux-noble-rocm-n-py3,
|
||||
pytorch-linux-noble-rocm-alpha-py3,
|
||||
pytorch-linux-jammy-rocm-n-py3-benchmarks,
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
|
||||
pytorch-linux-jammy-py3.9-gcc11,
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12,
|
||||
pytorch-linux-jammy-py3.10-gcc11,
|
||||
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,
|
||||
pytorch-linux-jammy-py3.12-halide,
|
||||
pytorch-linux-jammy-xpu-2025.0-py3,
|
||||
pytorch-linux-jammy-xpu-2025.1-py3,
|
||||
pytorch-linux-jammy-xpu-n-1-py3,
|
||||
pytorch-linux-jammy-xpu-n-py3,
|
||||
pytorch-linux-jammy-py3-clang18-asan,
|
||||
pytorch-linux-jammy-py3-clang12-onnx,
|
||||
pytorch-linux-jammy-linter,
|
||||
@ -124,7 +124,7 @@ jobs:
|
||||
GHCR_PAT: ${{ secrets.GHCR_PAT }}
|
||||
with:
|
||||
shell: bash
|
||||
timeout_minutes: 30
|
||||
timeout_minutes: 60
|
||||
max_attempts: 5
|
||||
retry_wait_seconds: 90
|
||||
command: |
|
||||
|
||||
433 .github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml (generated, vendored)
@ -47,117 +47,6 @@ jobs:
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
manywheel-py3_9-cpu-aarch64-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: manylinux2_28_aarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_9-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cpu-aarch64-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_9-cpu-aarch64-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: manylinux2_28_aarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.2xlarge
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cpu-aarch64-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_9-cpu-aarch64-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: manylinux2_28_aarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cpu-aarch64
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_9-cuda-aarch64-12_9-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_9-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cuda-aarch64-12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_9-cuda-aarch64-12_9-build
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda-aarch64-12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_10-cpu-aarch64-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -269,6 +158,52 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_10-cuda-aarch64-13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda-aarch64-13_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda-aarch64-13_0-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda-aarch64-13_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_11-cpu-aarch64-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@@ -380,6 +315,52 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_11-cuda-aarch64-13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda-aarch64-13_0-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_11-cuda-aarch64-13_0-build
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-13_0
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_12-cpu-aarch64-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@@ -491,6 +472,52 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_12-cuda-aarch64-13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda-aarch64-13_0-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_12-cuda-aarch64-13_0-build
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-13_0
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13-cpu-aarch64-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@@ -602,6 +629,52 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13-cuda-aarch64-13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.13"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda-aarch64-13_0-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_13-cuda-aarch64-13_0-build
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-13_0
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13t-cpu-aarch64-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@@ -713,6 +786,52 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13t-cuda-aarch64-13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.13t"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13t-cuda-aarch64-13_0-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_13t-cuda-aarch64-13_0-build
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.13t"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-13_0
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_14-cpu-aarch64-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@@ -824,6 +943,52 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_14-cuda-aarch64-13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.14"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14-cuda-aarch64-13_0-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_14-cuda-aarch64-13_0-build
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.14"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-13_0
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_14t-cpu-aarch64-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@@ -934,3 +1099,49 @@ jobs:
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_14t-cuda-aarch64-13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.14t"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14t-cuda-aarch64-13_0-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_14t-cuda-aarch64-13_0-build
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.14t"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-13_0
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
686 .github/workflows/generated-linux-binary-manywheel-nightly.yml (generated, vendored)
@@ -47,664 +47,6 @@ jobs:
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
manywheel-py3_9-cpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cpu-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_9-cpu-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cpu-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_9-cpu-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_9-cuda12_6-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu126
|
||||
GPU_ARCH_VERSION: "12.6"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cuda12_6-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_9-cuda12_6-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu126
|
||||
GPU_ARCH_VERSION: "12.6"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cuda12_6-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_9-cuda12_6-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu126
|
||||
GPU_ARCH_VERSION: "12.6"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_6
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_9-cuda12_8-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu128
|
||||
GPU_ARCH_VERSION: "12.8"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cuda12_8-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_9-cuda12_8-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu128
|
||||
GPU_ARCH_VERSION: "12.8"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cuda12_8-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_9-cuda12_8-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu128
|
||||
GPU_ARCH_VERSION: "12.8"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_8
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_9-cuda12_9-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cuda12_9-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_9-cuda12_9-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cuda12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_9-cuda12_9-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_9-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cuda13_0-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_9-cuda13_0-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cuda13_0-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_9-cuda13_0-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda13_0
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_9-rocm6_3-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.3
|
||||
GPU_ARCH_VERSION: "6.3"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-rocm6_3
|
||||
build_environment: linux-binary-manywheel
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-rocm6_3-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_9-rocm6_3-build
|
||||
- get-label-type
|
||||
runs-on: linux.rocm.gpu.mi250
|
||||
timeout-minutes: 240
|
||||
env:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.3
|
||||
GPU_ARCH_VERSION: "6.3"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
SKIP_ALL_TESTS: 1
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
- name: Setup ROCm
|
||||
uses: ./.github/actions/setup-rocm
|
||||
- uses: actions/download-artifact@v4.1.7
|
||||
name: Download Build Artifacts
|
||||
with:
|
||||
name: manywheel-py3_9-rocm6_3
|
||||
path: "${{ runner.temp }}/artifacts/"
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
||||
submodules: recursive
|
||||
path: pytorch
|
||||
show-progress: false
|
||||
- name: Clean PyTorch checkout
|
||||
run: |
|
||||
# Remove any artifacts from the previous checkouts
|
||||
git clean -fxd
|
||||
working-directory: pytorch
|
||||
- name: ROCm set GPU_FLAG
|
||||
run: |
|
||||
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
|
||||
- name: configure aws credentials
|
||||
id: aws_creds
|
||||
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||
aws-region: us-east-1
|
||||
role-duration-seconds: 18000
|
||||
- name: Calculate docker image
|
||||
id: calculate-docker-image
|
||||
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
|
||||
with:
|
||||
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
|
||||
docker-image-name: manylinux2_28-builder
|
||||
custom-tag-prefix: rocm6.3
|
||||
docker-build-dir: .ci/docker
|
||||
working-directory: pytorch
|
||||
- name: Pull Docker image
|
||||
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
|
||||
with:
|
||||
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Test Pytorch binary
|
||||
uses: ./pytorch/.github/actions/test-pytorch-binary
|
||||
env:
|
||||
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Teardown ROCm
|
||||
uses: ./.github/actions/teardown-rocm
|
||||
manywheel-py3_9-rocm6_3-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_9-rocm6_3-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.3
|
||||
GPU_ARCH_VERSION: "6.3"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-rocm6_3
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_9-rocm6_4-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.4
|
||||
GPU_ARCH_VERSION: "6.4"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-rocm6_4
|
||||
build_environment: linux-binary-manywheel
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-rocm6_4-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_9-rocm6_4-build
|
||||
- get-label-type
|
||||
runs-on: linux.rocm.gpu.mi250
|
||||
timeout-minutes: 240
|
||||
env:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.4
|
||||
GPU_ARCH_VERSION: "6.4"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
SKIP_ALL_TESTS: 1
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
- name: Setup ROCm
|
||||
uses: ./.github/actions/setup-rocm
|
||||
- uses: actions/download-artifact@v4.1.7
|
||||
name: Download Build Artifacts
|
||||
with:
|
||||
name: manywheel-py3_9-rocm6_4
|
||||
path: "${{ runner.temp }}/artifacts/"
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
||||
submodules: recursive
|
||||
path: pytorch
|
||||
show-progress: false
|
||||
- name: Clean PyTorch checkout
|
||||
run: |
|
||||
# Remove any artifacts from the previous checkouts
|
||||
git clean -fxd
|
||||
working-directory: pytorch
|
||||
- name: ROCm set GPU_FLAG
|
||||
run: |
|
||||
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
|
||||
- name: configure aws credentials
|
||||
id: aws_creds
|
||||
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||
aws-region: us-east-1
|
||||
role-duration-seconds: 18000
|
||||
- name: Calculate docker image
|
||||
id: calculate-docker-image
|
||||
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
|
||||
with:
|
||||
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
|
||||
docker-image-name: manylinux2_28-builder
|
||||
custom-tag-prefix: rocm6.4
|
||||
docker-build-dir: .ci/docker
|
||||
working-directory: pytorch
|
||||
- name: Pull Docker image
|
||||
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
|
||||
with:
|
||||
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Test Pytorch binary
|
||||
uses: ./pytorch/.github/actions/test-pytorch-binary
|
||||
env:
|
||||
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Teardown ROCm
|
||||
uses: ./.github/actions/teardown-rocm
|
||||
manywheel-py3_9-rocm6_4-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_9-rocm6_4-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.4
|
||||
GPU_ARCH_VERSION: "6.4"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-rocm6_4
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_9-xpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: xpu
|
||||
GPU_ARCH_TYPE: xpu
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: xpu
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-xpu
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-xpu-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_9-xpu-build
|
||||
- get-label-type
|
||||
runs-on: linux.idc.xpu
|
||||
timeout-minutes: 240
|
||||
env:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: xpu
|
||||
GPU_ARCH_TYPE: xpu
|
||||
SKIP_ALL_TESTS: 1
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: xpu
|
||||
DESIRED_PYTHON: "3.9"
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
steps:
|
||||
- name: Setup XPU
|
||||
uses: ./.github/actions/setup-xpu
|
||||
- name: configure aws credentials
|
||||
id: aws_creds
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||
aws-region: us-east-1
|
||||
- name: Login to Amazon ECR
|
||||
id: login-ecr
|
||||
uses: aws-actions/amazon-ecr-login@v2
|
||||
- uses: actions/download-artifact@v4.1.7
|
||||
name: Download Build Artifacts
|
||||
with:
|
||||
name: manywheel-py3_9-xpu
|
||||
path: "${{ runner.temp }}/artifacts/"
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
||||
submodules: recursive
|
||||
path: pytorch
|
||||
show-progress: false
|
||||
- name: Clean PyTorch checkout
|
||||
run: |
|
||||
# Remove any artifacts from the previous checkouts
|
||||
git clean -fxd
|
||||
working-directory: pytorch
|
||||
- name: Calculate docker image
|
||||
id: calculate-docker-image
|
||||
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
|
||||
with:
|
||||
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
|
||||
docker-image-name: manylinux2_28-builder
|
||||
custom-tag-prefix: xpu
|
||||
docker-build-dir: .ci/docker
|
||||
working-directory: pytorch
|
||||
- name: Pull Docker image
|
||||
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
|
||||
with:
|
||||
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Test Pytorch binary
|
||||
uses: ./pytorch/.github/actions/test-pytorch-binary
|
||||
env:
|
||||
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Teardown XPU
|
||||
uses: ./.github/actions/teardown-xpu
|
||||
manywheel-py3_9-xpu-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_9-xpu-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: xpu
|
||||
GPU_ARCH_TYPE: xpu
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: xpu
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-xpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_10-cpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@@ -983,7 +325,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda13_0-test: # Testing
@ -1270,7 +612,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-xpu-test: # Testing
@ -1641,7 +983,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda13_0-test: # Testing
@ -1928,7 +1270,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-xpu-test: # Testing
@ -2299,7 +1641,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda13_0-test: # Testing
@ -2586,7 +1928,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-xpu-test: # Testing
@ -2957,7 +2299,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda13_0-test: # Testing
@ -3244,7 +2586,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-xpu-test: # Testing
@ -3615,7 +2957,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda13_0-test: # Testing
@ -3902,7 +3244,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-xpu-test: # Testing
@ -4273,7 +3615,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda13_0-test: # Testing
@ -4560,7 +3902,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-xpu-test: # Testing
@ -4931,7 +4273,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda13_0-test: # Testing
@ -5218,7 +4560,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-xpu
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-xpu-test: # Testing
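Note: the PYTORCH_EXTRA_INSTALL_REQUIREMENTS values above are a single string of pip requirement specifiers joined with " | ", most of them guarded by an environment marker. A minimal Python sketch (not part of the generated workflows; the separator and the field name are assumptions taken from the lines above) of how such a value splits back into individual specifiers:

# Minimal sketch, not part of the generated workflows: split a
# PYTORCH_EXTRA_INSTALL_REQUIREMENTS-style value into individual pip
# requirement specifiers; the " | " separator is assumed from the lines above.
extra_requirements = (
    "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64'"
)
specs = [spec.strip() for spec in extra_requirements.split("|") if spec.strip()]
for spec in specs:
    print(spec)  # each entry is name==version plus an optional environment marker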

.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml

@ -47,70 +47,6 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_9-cpu-s390x-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
DESIRED_PYTHON: "3.9"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
timeout-minutes: 420
build_name: manywheel-py3_9-cpu-s390x
build_environment: linux-s390x-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-s390x-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cpu-s390x-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
build_environment: linux-s390x-binary-manywheel
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-s390x-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_9-cpu-s390x-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_10-cpu-s390x-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml

.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml

@ -30,151 +30,6 @@ concurrency:
cancel-in-progress: true

jobs:
wheel-py3_9-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
# shellcheck disable=SC2129
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"

# Build
USE_PYTORCH_METAL_EXPORT=1
USE_COREML_DELEGATE=1
TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}"
export USE_PYTORCH_METAL_EXPORT
export USE_COREML_DELEGATE
export TORCH_PACKAGE_NAME
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
pip uninstall -y "$TORCH_PACKAGE_NAME" || true

# Create new "clean" conda environment for testing

SMOKE_TEST_PARAMS=""

EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac

# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

# shellcheck disable=SC2086
python "${PYTORCH_ROOT}/.ci/pytorch/smoke_test/smoke_test.py" --package torchonly ${SMOKE_TEST_PARAMS}
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: wheel-py3_9-cpu
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
wheel-py3_9-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: wheel-py3_9-cpu-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cpu
DESIRED_PYTHON: "3.9"
build_name: wheel-py3_9-cpu
use_s3: False
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
wheel-py3_10-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge

.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml

@ -51,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -128,7 +128,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml

@ -51,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -128,7 +128,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml

@ -51,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -124,7 +124,7 @@ jobs:
- wheel-py3_11-cpu-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -198,7 +198,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -271,7 +271,7 @@ jobs:
- wheel-py3_12-cpu-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -345,7 +345,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -418,7 +418,7 @@ jobs:
- wheel-py3_13-cpu-build
- get-label-type
runs-on: "windows-11-arm64-preview"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel

.github/workflows/generated-windows-binary-libtorch-debug-main.yml

@ -38,7 +38,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -153,7 +153,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml

@ -45,7 +45,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -160,7 +160,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -292,7 +292,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -408,7 +408,7 @@ jobs:
- libtorch-cuda12_6-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -542,7 +542,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -658,7 +658,7 @@ jobs:
- libtorch-cuda12_8-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -792,7 +792,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -908,7 +908,7 @@ jobs:
- libtorch-cuda12_9-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

.github/workflows/generated-windows-binary-libtorch-release-main.yml

@ -38,7 +38,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -153,7 +153,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

.github/workflows/generated-windows-binary-libtorch-release-nightly.yml

@ -45,7 +45,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -160,7 +160,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -292,7 +292,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -408,7 +408,7 @@ jobs:
- libtorch-cuda12_6-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -542,7 +542,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -658,7 +658,7 @@ jobs:
- libtorch-cuda12_8-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -792,7 +792,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -908,7 +908,7 @@ jobs:
- libtorch-cuda12_9-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 300
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

.github/workflows/generated-windows-binary-wheel-nightly.yml

File diff suppressed because it is too large

@ -165,6 +165,9 @@ jobs:
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
# The pull_request trigger is used in PR to bump transformers pin which always
# needs one round of benchmark
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}

.github/workflows/nightly.yml

@ -42,8 +42,8 @@ jobs:
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
build-environment: linux-jammy-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
secrets: inherit

docs-push:

.github/workflows/pull.yml

@ -49,14 +49,14 @@ jobs:
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
linux-jammy-py3_9-gcc11-build:
|
||||
name: linux-jammy-py3.9-gcc11
|
||||
linux-jammy-py3_10-gcc11-build:
|
||||
name: linux-jammy-py3.10-gcc11
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3.9-gcc11
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
|
||||
build-environment: linux-jammy-py3.10-gcc11
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
@ -73,49 +73,49 @@ jobs:
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-py3_9-gcc11-test:
|
||||
name: linux-jammy-py3.9-gcc11
|
||||
linux-jammy-py3_10-gcc11-test:
|
||||
name: linux-jammy-py3.10-gcc11
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs:
|
||||
- linux-jammy-py3_9-gcc11-build
|
||||
- linux-jammy-py3_10-gcc11-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11
|
||||
docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.test-matrix }}
|
||||
build-environment: linux-jammy-py3.10-gcc11
|
||||
docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-docs:
|
||||
name: linux-docs
|
||||
uses: ./.github/workflows/_docs.yml
|
||||
needs: linux-jammy-py3_9-gcc11-build
|
||||
needs: linux-jammy-py3_10-gcc11-build
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11
|
||||
docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }}
|
||||
build-environment: linux-jammy-py3.10-gcc11
|
||||
docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-py3_9-gcc11-no-ops:
|
||||
name: linux-jammy-py3.9-gcc11-no-ops
|
||||
linux-jammy-py3_10-gcc11-no-ops:
|
||||
name: linux-jammy-py3.10-gcc11-no-ops
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3.9-gcc11-no-ops
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
|
||||
build-environment: linux-jammy-py3.10-gcc11-no-ops
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 1 },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-py3_9-gcc11-pch:
|
||||
name: linux-jammy-py3.9-gcc11-pch
|
||||
linux-jammy-py3_10-gcc11-pch:
|
||||
name: linux-jammy-py3.10-gcc11-pch
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3.9-gcc11-pch
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
|
||||
build-environment: linux-jammy-py3.10-gcc11-pch
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 1 },
|
||||
@ -183,14 +183,14 @@ jobs:
|
||||
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-py3_9-clang12-build:
name: linux-jammy-py3.9-clang12
linux-jammy-py3_10-clang12-build:
name: linux-jammy-py3.10-clang12
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12
build-environment: linux-jammy-py3.10-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
@ -207,16 +207,16 @@ jobs:
]}
secrets: inherit

linux-jammy-py3_9-clang12-test:
name: linux-jammy-py3.9-clang12
linux-jammy-py3_10-clang12-test:
name: linux-jammy-py3.10-clang12
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_9-clang12-build
- linux-jammy-py3_10-clang12-build
- target-determination
with:
build-environment: linux-jammy-py3.9-clang12
docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.10-clang12
docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-py3_13-clang12-build:
@ -253,14 +253,14 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build:
name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12
linux-jammy-cuda12_8-cudnn9-py3_10-clang12-build:
name: linux-jammy-cuda12.8-cudnn9-py3.10-clang12
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-cudnn9-py3.9-clang12
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12
build-environment: linux-jammy-cuda12.8-cudnn9-py3.10-clang12
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
@ -282,14 +282,14 @@ jobs:
]}
secrets: inherit

linux-jammy-py3_9-gcc11-mobile-lightweight-dispatch-build:
name: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build
linux-jammy-py3_10-gcc11-mobile-lightweight-dispatch-build:
name: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
build-environment: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
build-generates-artifacts: false
test-matrix: |
{ include: [
@ -342,15 +342,40 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-xpu-2025_1-py3_9-build:
name: linux-jammy-xpu-2025.1-py3.9
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-2025-1-build
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-xpu-n-py3_9-build:
name: linux-jammy-xpu-n-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-n-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-2025.1-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3
build-environment: linux-jammy-xpu-n-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" },

20 .github/workflows/slow.yml vendored
@ -78,14 +78,14 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-py3_9-clang12-build:
name: linux-jammy-py3.9-clang12
linux-jammy-py3_10-clang12-build:
name: linux-jammy-py3.10-clang12
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12
build-environment: linux-jammy-py3.10-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
@ -93,16 +93,16 @@ jobs:
]}
secrets: inherit

linux-jammy-py3_9-clang12-test:
name: linux-jammy-py3.9-clang12
linux-jammy-py3_10-clang12-test:
name: linux-jammy-py3.10-clang12
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_9-clang12-build
- linux-jammy-py3_10-clang12-build
- target-determination
with:
build-environment: linux-jammy-py3.9-clang12
docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.10-clang12
docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-rocm-py3_10-build:

2 .github/workflows/test-check-binary.yml vendored
@ -30,7 +30,7 @@ jobs:
name: Test check_binary.sh for Linux CUDA
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.4xlarge.nvidia.gpu
runner: linux.g4dn.4xlarge.nvidia.gpu
docker-image: python:3.11
docker-build-dir: "skip-docker-build"
script: |

23 .github/workflows/trymerge.yml vendored
@ -59,22 +59,19 @@ jobs:
# on the PR appear in chronological order (timing issues can shuffle them around)
sleep 60
fi

# Require a comment id for merge operations
if [ -z "${COMMENT_ID}" ]; then
echo "Error: merge requires COMMENT_ID to be specified"
exit 1
fi

if [ -n "${FORCE}" ]; then
if [ -n "${COMMENT_ID}" ]; then
python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
python3 .github/scripts/trymerge.py --force "${PR_NUM}"
fi
python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}"
elif [ -n "${IGNORE_CURRENT}" ]; then
if [ -n "${COMMENT_ID}" ]; then
python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
python3 .github/scripts/trymerge.py --ignore-current "${PR_NUM}"
fi
elif [ -n "${COMMENT_ID}" ]; then
python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}"
python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}"
else
python3 .github/scripts/trymerge.py "${PR_NUM}"
python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}"
fi
- name: Comment on Canceled
if: ${{ cancelled() && steps.checkout.outcome == 'success' }}

3 .github/workflows/vllm.yml vendored
@ -6,8 +6,7 @@ on:
- ciflow/vllm/*
workflow_dispatch:
schedule:
# Every 12 hours starting at 00:00 UTC (00:00 and 12:00)
- cron: '0 0,12 * * *'
- cron: '0 */8 * * *' # every 8 hours at minute 0 (UTC)

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}

3 .github/workflows/win-arm64-build-test.yml vendored
@ -4,6 +4,9 @@ on:
push:
tags:
- ciflow/win-arm64/*
schedule:
# Every 4 hours starting at 00:00 UTC
- cron: '0 */4 * * *'

env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

76 .github/workflows/xpu.yml vendored
@ -26,15 +26,15 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-jammy-xpu-2025_0-py3_9-build:
name: linux-jammy-xpu-2025.0-py3.9
linux-jammy-xpu-n-1-py3_9-build:
name: linux-jammy-xpu-n-1-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-2025-0-build
sync-tag: linux-xpu-n-1-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-2025.0-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.0-py3
build-environment: linux-jammy-xpu-n-1-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3
runner: linux.12xlarge
test-matrix: |
{ include: [
@ -47,60 +47,62 @@ jobs:
]}
secrets: inherit

linux-jammy-xpu-2025_1-py3_9-build:
name: linux-jammy-xpu-2025.1-py3.9
linux-jammy-xpu-n-py3_9-build:
name: linux-jammy-xpu-n-py3.9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-2025-1-build
sync-tag: linux-xpu-n-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-2025.1-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3
build-environment: linux-jammy-xpu-n-py3.9
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
runner: linux.12xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
]}
secrets: inherit

linux-jammy-xpu-2025_1-py3_9-test:
name: linux-jammy-xpu-2025.1-py3.9
linux-jammy-xpu-n-py3_9-test:
name: linux-jammy-xpu-n-py3.9
uses: ./.github/workflows/_xpu-test.yml
needs: linux-jammy-xpu-2025_1-py3_9-build
needs: linux-jammy-xpu-n-py3_9-build
permissions:
id-token: write
contents: read
with:
build-environment: linux-jammy-xpu-2025.1-py3.9
docker-image: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.test-matrix }}
build-environment: linux-jammy-xpu-n-py3.9
docker-image: ${{ needs.linux-jammy-xpu-n-py3_9-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-xpu-n-py3_9-build.outputs.test-matrix }}
secrets: inherit

windows-xpu-2025_0-build:
windows-xpu-n-1-build:
if: github.repository_owner == 'pytorch'
name: win-vs2022-xpu-2025_0-py3
name: win-vs2022-xpu-n-1-py3
uses: ./.github/workflows/_win-build.yml
with:
build-environment: win-vs2022-xpu-py3
cuda-version: cpu
use-xpu: true
xpu-version: '2025.0'
vc-year: '2022'
secrets: inherit

windows-xpu-2025_1-build:
if: github.repository_owner == 'pytorch'
name: win-vs2022-xpu-2025_1-py3
uses: ./.github/workflows/_win-build.yml
with:
build-environment: win-vs2022-xpu-py3
build-environment: win-vs2022-xpu-n-1-py3
cuda-version: cpu
use-xpu: true
xpu-version: '2025.1'
vc-year: '2022'
secrets: inherit

windows-xpu-n-build:
if: github.repository_owner == 'pytorch'
name: win-vs2022-xpu-n-py3
uses: ./.github/workflows/_win-build.yml
with:
build-environment: win-vs2022-xpu-n-py3
cuda-version: cpu
use-xpu: true
xpu-version: '2025.2'
vc-year: '2022'
secrets: inherit

@ -583,7 +583,7 @@ exclude_patterns = [
command = [
'python3',
'tools/linter/adapters/grep_linter.py',
'--pattern=#include <pybind11\/(^|[^(gil\.h)])',
'--pattern=#include <pybind11\/(^|[^(gil_simple\.h)])',
'--allowlist-pattern=#include <torch\/csrc\/utils\/pybind.h>',
'--linter-name=PYBIND11_INCLUDE',
'--match-first-only',

@ -747,6 +747,7 @@ cc_library(
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp",
"torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp",
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
],
)) + torch_sources,

@ -272,7 +272,7 @@ cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL"
OFF)
cmake_dependent_option(USE_NVSHMEM "Use NVSHMEM" ON
"USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
option(USE_NNAPI "Use NNAPI" OFF)
option(USE_NNPACK "Use NNPACK" ON)
cmake_dependent_option(USE_NUMA "Use NUMA. Only available on Linux." ON "LINUX"

@ -1,5 +1,18 @@
#pragma once

// See https://github.com/pytorch/pytorch/issues/161660
// This compile flag is intended to be passed in to CppExtensions that rely on
// the stable ABI via the `extra_compile_args` argument. This is a stopgap
// solution to ensure that non-stable libtorch APIs are not used in the extension.
// The long term solution is to have a torch_stable target that excludes headers
// that are not in torch/stable or torch/headeronly.
// See test/cpp_extensions/torch_stable_test_extension/setup.py for an example
// of how this is used.
#ifdef TORCH_STABLE_ONLY
#error \
"TensorBase.h should not be included when TORCH_STABLE_ONLY compile flag is passed"
#endif

#include <c10/core/Device.h>
#include <c10/core/Layout.h>
#include <c10/core/MemoryFormat.h>

@ -15,7 +15,7 @@ std::enable_if_t<
std::is_base_of_v<Base, Child>,
std::unique_ptr<Base>>
make_unique_base(Args&&... args) {
return std::unique_ptr<Base>(new Child(std::forward<Args>(args)...));
return std::make_unique<Child>(std::forward<Args>(args)...);
}
} // namespace detail

@ -252,6 +252,13 @@ cudaGraph_t CUDAGraph::raw_cuda_graph() {
return graph_;
}

cudaGraphExec_t CUDAGraph::raw_cuda_graph_exec() {
TORCH_CHECK(
has_graph_exec_,
"You cannot access the raw cudaGraphExec_t instance until instantiate() has been called");
return graph_exec_;
}

void CUDAGraph::reset() {
// I'd prefer these checks throw exceptions, not print warnings,
// but the destructor calls reset(), and at least one CI build

@ -37,6 +37,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
void enable_debug_mode();
void debug_dump(const std::string& debug_path);
cudaGraph_t raw_cuda_graph();
cudaGraphExec_t raw_cuda_graph_exec();

protected:
cudaGraph_t graph_ = nullptr;

@ -103,15 +103,13 @@ inline bool _check_tensors_share_device_and_dtype(
|
||||
tensor.is_non_overlapping_and_dense();
|
||||
};
|
||||
|
||||
for (const auto& tensorList : tensorLists) {
|
||||
for (const auto& tensor : tensorList) {
|
||||
if (!is_tensor_okay(tensor)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
return std::all_of(
|
||||
tensorLists.cbegin(),
|
||||
tensorLists.cend(),
|
||||
[&](const TensorList& tensorList) {
|
||||
return std::all_of(
|
||||
tensorList.cbegin(), tensorList.cend(), is_tensor_okay);
|
||||
});
|
||||
}
|
||||
|
||||
// Helper function called in check_fast_path_restrictions to check if
|
||||
@ -157,11 +155,9 @@ inline bool _check_tensors_do_type_promotion_with_scalars(
|
||||
bool does_op_promote_integer_inputs_to_float = false) {
|
||||
for (const auto i : c10::irange(tensorList.size())) {
|
||||
// For division, integer inputs will result in float.
|
||||
if (does_op_promote_integer_inputs_to_float) {
|
||||
if (at::isIntegralType(
|
||||
tensorList[i].scalar_type(), /*includeBool*/ true)) {
|
||||
return false;
|
||||
}
|
||||
if (does_op_promote_integer_inputs_to_float &&
|
||||
at::isIntegralType(tensorList[i].scalar_type(), /*includeBool*/ true)) {
|
||||
return false;
|
||||
}
|
||||
if (!scalarList.empty()) {
|
||||
const auto& scalar =
|
||||
@ -338,36 +334,34 @@ inline FlatMap _group_tensors_by_first_tensors_device_and_dtype(
|
||||
}
|
||||
}),
|
||||
"Tensors of the same index must be on the same device and the same dtype except `step` tensors that can be CPU and float32/64 notwithstanding");
|
||||
if (!grouped_tensors_with_indices.count(key)) {
|
||||
grouped_tensors_with_indices.insert(
|
||||
{key,
|
||||
TensorsAndIndicesT{
|
||||
[&]() -> nested_optional_tensorvec_t {
|
||||
nested_optional_tensorvec_t nested_tensorvec;
|
||||
nested_tensorvec.reserve(num_lists);
|
||||
for (const auto& i : c10::irange(num_lists)) {
|
||||
std::vector<std::optional<at::Tensor>> tensors;
|
||||
if (!nested_tensorlist[i].empty()) {
|
||||
// NB: num_tensors is the max possible length for any of
|
||||
// the inner lists of tensor references. Reserving the max
|
||||
// trades memory for perf. This should not have significant
|
||||
// impact.
|
||||
tensors.reserve(num_tensors);
|
||||
}
|
||||
nested_tensorvec.emplace_back(tensors);
|
||||
}
|
||||
return nested_tensorvec;
|
||||
}(),
|
||||
[&]() -> IndicesT {
|
||||
if (!with_indices) {
|
||||
return {};
|
||||
} else {
|
||||
IndicesT indices;
|
||||
indices.reserve(num_tensors);
|
||||
return indices;
|
||||
}
|
||||
}()}});
|
||||
}
|
||||
grouped_tensors_with_indices.try_emplace(
|
||||
key,
|
||||
TensorsAndIndicesT{
|
||||
[&]() -> nested_optional_tensorvec_t {
|
||||
nested_optional_tensorvec_t nested_tensorvec;
|
||||
nested_tensorvec.reserve(num_lists);
|
||||
for (const auto& i : c10::irange(num_lists)) {
|
||||
std::vector<std::optional<at::Tensor>> tensors;
|
||||
if (!nested_tensorlist[i].empty()) {
|
||||
// NB: num_tensors is the max possible length for any of
|
||||
// the inner lists of tensor references. Reserving the max
|
||||
// trades memory for perf. This should not have significant
|
||||
// impact.
|
||||
tensors.reserve(num_tensors);
|
||||
}
|
||||
nested_tensorvec.emplace_back(std::move(tensors));
|
||||
}
|
||||
return nested_tensorvec;
|
||||
}(),
|
||||
[&]() -> IndicesT {
|
||||
if (!with_indices) {
|
||||
return {};
|
||||
} else {
|
||||
IndicesT indices;
|
||||
indices.reserve(num_tensors);
|
||||
return indices;
|
||||
}
|
||||
}()});
|
||||
for (const auto& list_index : c10::irange(num_lists)) {
|
||||
if (!nested_tensorlist[list_index].empty()) {
|
||||
grouped_tensors_with_indices[key].first[list_index].emplace_back(
|
||||
|
||||
@ -89,7 +89,7 @@ execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t
|
||||
using result_type = typename traits::result_type;
|
||||
for (; i < n; i++) {
|
||||
result_type* out_ptr = (result_type*)(data[0] + i * strides[0]);
|
||||
*out_ptr = c10::guts::apply(op, dereference<traits>(
|
||||
*out_ptr = std::apply(op, dereference<traits>(
|
||||
&data[1],
|
||||
&strides[1],
|
||||
i));
|
||||
@ -102,7 +102,7 @@ inline void
|
||||
execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
|
||||
using traits = function_traits<func_t>;
|
||||
for (; i < n; i++) {
|
||||
c10::guts::apply(op, dereference<traits>(
|
||||
std::apply(op, dereference<traits>(
|
||||
&data[0],
|
||||
&strides[0],
|
||||
i));
|
||||
@ -162,7 +162,7 @@ void handle_tuple_outputs(char* C10_RESTRICT data[],
|
||||
}
|
||||
|
||||
// Loop operation for `cpu_kernel_multiple_outputs`.
|
||||
// 1. Use `c10::guts::apply` to make dynamic method invocation
|
||||
// 1. Use `std::apply` to make dynamic method invocation
|
||||
// for the lambda passed in `cpu_kernel_multiple_outputs`.
|
||||
// 2. Iterate over the members of the returned tuple, set the corresponding
|
||||
// output tensor by the tuple member in `handle_tuple_outputs` function.
|
||||
@ -183,7 +183,7 @@ multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_
|
||||
}
|
||||
|
||||
for (; i < n; i++) {
|
||||
auto output = c10::guts::apply(op, dereference<traits>(
|
||||
auto output = std::apply(op, dereference<traits>(
|
||||
&data[num_outputs],
|
||||
&strides[num_outputs],
|
||||
i));
|
||||
@ -213,8 +213,8 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve
|
||||
for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
|
||||
auto args1 = dereference_vec<traits>(&data[1], opt_scalar, S, i);
|
||||
auto args2 = dereference_vec<traits>(&data[1], opt_scalar, S, i + Vec::size());
|
||||
auto out1 = c10::guts::apply(vop, std::move(args1));
|
||||
auto out2 = c10::guts::apply(vop, std::move(args2));
|
||||
auto out1 = std::apply(vop, std::move(args1));
|
||||
auto out2 = std::apply(vop, std::move(args2));
|
||||
out1.store(data[0] + i * sizeof(scalar_t));
|
||||
out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t));
|
||||
}
|
||||
|
||||
@ -1349,7 +1349,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
|
||||
if (scaling_choice_a == ScalingType::RowWise && scaling_choice_b == ScalingType::RowWise
|
||||
&& ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
|
||||
// cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
|
||||
|| (dprops->major == 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
|
||||
|| (dprops->major >= 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
|
||||
TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
|
||||
at::cuda::detail::f8f8bf16_rowwise(
|
||||
mat1,
|
||||
|
||||
@ -223,6 +223,41 @@ __device__ __forceinline__ void fastAtomicAdd(
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#ifdef USE_ROCM
|
||||
// This function implements a committed store.
|
||||
// Upon returning, the store is committed to global memory.
|
||||
// This is useful in avoiding the need for fences.
|
||||
template <typename T>
|
||||
__device__ inline void cmtdStore(void* address, T value) {
|
||||
int constexpr num_long_per_val = sizeof(value)/sizeof(long);
|
||||
int constexpr num_int_per_val = sizeof(value)/sizeof(int);
|
||||
int constexpr num_short_per_val = sizeof(value)/sizeof(short);
|
||||
int constexpr num_char_per_val = sizeof(value)/sizeof(char);
|
||||
union pnr { T v;
|
||||
long l[num_long_per_val];
|
||||
int i[num_int_per_val];
|
||||
short s[num_short_per_val];
|
||||
char c[num_char_per_val]; }
|
||||
_pnr = {.v = value };
|
||||
if constexpr (num_long_per_val*sizeof(long) == sizeof(value))
|
||||
for (int i=0; i<num_long_per_val; i++)
|
||||
__hip_atomic_store(reinterpret_cast<long *>(address)+i, _pnr.l[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
else if constexpr (num_int_per_val*sizeof(int) == sizeof(value))
|
||||
for (int i=0; i<num_int_per_val; i++)
|
||||
__hip_atomic_store(reinterpret_cast<int *>(address)+i, _pnr.i[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
else if constexpr (num_short_per_val*sizeof(short) == sizeof(value))
|
||||
for (int i=0; i<num_short_per_val; i++)
|
||||
__hip_atomic_store(reinterpret_cast<short *>(address)+i, _pnr.s[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
else if constexpr (num_char_per_val*sizeof(char) == sizeof(value))
|
||||
for (int i=0; i<num_char_per_val; i++)
|
||||
__hip_atomic_store(reinterpret_cast<char *>(address)+i, _pnr.c[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
__atomic_signal_fence(__ATOMIC_SEQ_CST);
|
||||
asm volatile("s_waitcnt vmcnt(0)" ::: "memory");
|
||||
__atomic_signal_fence(__ATOMIC_SEQ_CST);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__))
|
||||
// This function implements warp-level opportunistic fastatomics
|
||||
// To reduce contention on an atomicAdd, this replaces per-thread atomicAdd with a per-warp atomicAdd.
|
||||
|
||||
@ -18,6 +18,7 @@
|
||||
#include <thrust/pair.h>
|
||||
|
||||
#include <ATen/native/cuda/jit_utils.h>
|
||||
#include <ATen/native/cuda/KernelUtils.cuh>
|
||||
|
||||
namespace at::native {
|
||||
|
||||
@ -796,15 +797,25 @@ struct ReduceOp {
|
||||
bool should_store = config.should_store(output_idx);
|
||||
if (should_store) {
|
||||
index_t offset = config.staging_memory_offset(blockIdx.y);
|
||||
#ifndef USE_ROCM
|
||||
reduce_buffer[offset] = value;
|
||||
#else // [CMTSTRS]
|
||||
// In architectures with split caches, global fences are costly.
|
||||
// Here we preempt need for fences by committing stores to global memory.
|
||||
cmtdStore(&reduce_buffer[offset], value);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef USE_ROCM // skip fence if store are committed [CMTSTRS]
|
||||
__threadfence(); // make sure writes are globally visible
|
||||
#endif
|
||||
__syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
|
||||
bool is_last_block_done = mark_block_finished();
|
||||
|
||||
if (is_last_block_done) {
|
||||
#ifndef USE_ROCM // skip fence if store are committed [CMTSTRS]
|
||||
__threadfence(); // complete the acquire pattern after atomic
|
||||
#endif
|
||||
for (auto &v : value) {
|
||||
v = ident;
|
||||
}
|
||||
|
||||
@ -115,7 +115,10 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const std::opt
|
||||
return output;
|
||||
}
|
||||
|
||||
if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS)) {
|
||||
// No-graph execution causes nonsense if these are non-contiguous.
|
||||
const bool is_contiguous = input.is_contiguous() && weight.is_contiguous() && bias.is_contiguous();
|
||||
|
||||
if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_contiguous) {
|
||||
_mps_linear_nograph(input, weight, bias, output);
|
||||
// Squeeze last dim of 1D linear
|
||||
return weight_arg.dim() != 1 ? output : output.squeeze(-1);
|
||||
|
||||
@ -388,16 +388,11 @@ mha_bwd_ck(const at::Tensor &dout, // batch_size x seqlen_q x
|
||||
dv_expanded = dv;
|
||||
}
|
||||
|
||||
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
|
||||
std::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
|
||||
uint64_t drop_seed = 1, drop_offset = 0;
|
||||
drop_seed = *philox_seed.data_ptr<int64_t>();
|
||||
drop_offset = *philox_offset.data_ptr<int64_t>();
|
||||
auto drop_seed_offset = std::make_pair(&drop_seed, &drop_offset);
|
||||
|
||||
uint64_t* drop_seed, drop_offset;
|
||||
int64_t counter_offset = batch_size * num_heads * ck_tile::get_warp_size();
|
||||
std::pair<uint64_t*, uint64_t*> drop_seed_offset = {nullptr,nullptr};
|
||||
if(is_dropout) {
|
||||
drop_seed_offset.first = philox_seed[0].data_ptr<uint64_t>();
|
||||
drop_seed_offset.second = philox_seed[1].data_ptr<uint64_t>();
|
||||
}
|
||||
|
||||
if (seqlen_q > 0) {
|
||||
ck_tile::stream_config stream_config{stream};
|
||||
|
||||
@ -177,6 +177,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x
|
||||
TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
|
||||
|
||||
const auto sizes = q.sizes();
|
||||
|
||||
const int batch_size = sizes[0];
|
||||
int seqlen_q = sizes[1];
|
||||
int num_heads = sizes[2];
|
||||
@ -225,6 +226,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x
|
||||
CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size);
|
||||
CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size);
|
||||
|
||||
|
||||
at::Tensor q_padded, k_padded, v_padded;
|
||||
if (head_size % 8 != 0) {
|
||||
q_padded = at::pad(temp_q, {0, 8 - head_size % 8});
|
||||
@ -237,6 +239,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x
|
||||
v_padded = v;
|
||||
}
|
||||
|
||||
|
||||
at::Tensor out;
|
||||
if (out_.has_value()) {
|
||||
out = out_.value();
|
||||
@ -263,6 +266,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x
|
||||
auto opts = q.options();
|
||||
bool has_lse = true;
|
||||
bool has_dropout = p_dropout > 0.0f;
|
||||
|
||||
at::Tensor softmax_lse;
|
||||
// TODO - check gradient, only training require lse
|
||||
softmax_lse = at::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat));
|
||||
@ -273,41 +277,46 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x
|
||||
p = at::empty({batch_size, num_heads, seqlen_q, seqlen_k}, opts.dtype(at::kByte));
|
||||
}
|
||||
else {
|
||||
p = at::empty({ 0 }, opts.dtype(at::kByte));
|
||||
p = at::empty({ 0 }, opts);
|
||||
}
|
||||
|
||||
|
||||
uint64_t drop_seed = 1, drop_offset = 0;
|
||||
int64_t counter_offset = batch_size * num_heads * ck_tile::get_warp_size();
|
||||
auto rng_state = at::empty({2}, opts.dtype(at::kLong));
|
||||
auto rng_state_ptr = reinterpret_cast<uint64_t*>(rng_state.data_ptr());
|
||||
|
||||
auto rng_state_options = at::TensorOptions().dtype(at::kUInt64).device(at::kCUDA);
|
||||
auto rng_state = at::zeros({2}, rng_state_options.dtype(at::kUInt64));
|
||||
auto _unused = at::empty({}, at::dtype(c10::kUInt64).device(at::kCUDA));
|
||||
|
||||
|
||||
at::Tensor seed_t, offset_t;
|
||||
|
||||
if (p_dropout > 0.0) {
|
||||
|
||||
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
|
||||
gen_, at::cuda::detail::getDefaultCUDAGenerator());
|
||||
|
||||
// See Note [Acquire lock when using random generators]
|
||||
std::lock_guard<std::mutex> lock(gen->mutex_);
|
||||
|
||||
auto philox_args = gen->philox_cuda_state(counter_offset);
|
||||
|
||||
std::tie(drop_seed, drop_offset) = at::cuda::philox::unpack(philox_args);
|
||||
|
||||
|
||||
hipLaunchKernelGGL(
|
||||
flash::ParsePhiloxCudaState, dim3(1), dim3(64), 0, at::hip::getCurrentHIPStreamMasqueradingAsCUDA(), philox_args, rng_state_ptr);
|
||||
seed_t = at::scalar_tensor(at::Scalar(static_cast<uint64_t>(rng_state_ptr[0])), at::dtype(at::kLong));
|
||||
offset_t = at::scalar_tensor(at::Scalar(static_cast<uint64_t>(rng_state_ptr[1])), at::dtype(at::kLong));
|
||||
}
|
||||
else
|
||||
{
|
||||
seed_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
|
||||
offset_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
|
||||
}
|
||||
rng_state[0] = *(reinterpret_cast<int64_t*>(&drop_seed));
|
||||
rng_state[1] = *(reinterpret_cast<int64_t*>(&drop_offset));
|
||||
auto drop_options = at::TensorOptions().dtype(at::kLong).device(at::kCUDA);
|
||||
|
||||
std::optional<at::Tensor> attn_bias;
|
||||
if( attn_bias_.has_value())
|
||||
{
|
||||
attn_bias = attn_bias_;
|
||||
}
|
||||
|
||||
if (seqlen_k > 0) {
|
||||
auto drop_seed_offset = std::make_pair(rng_state[0].data_ptr<uint64_t>(),
|
||||
rng_state[1].data_ptr<uint64_t>());
|
||||
auto drop_seed_offset = std::make_pair(rng_state_ptr, rng_state_ptr + 1);
|
||||
auto stream = at::cuda::getCurrentHIPStream().stream();
|
||||
ck_tile::stream_config stream_config{stream};
|
||||
|
||||
@ -323,7 +332,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x
|
||||
auto args =
|
||||
get_ck_fmha_fwd_args(
|
||||
has_lse,
|
||||
has_dropout,
|
||||
return_dropout_randval,
|
||||
mask,
|
||||
batch_size,
|
||||
seqlen_q,
|
||||
@ -349,11 +358,12 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x
|
||||
out.zero_();
|
||||
softmax_lse.fill_(std::numeric_limits<float>::infinity());
|
||||
}
|
||||
|
||||
if (seqlenq_ngroups_swapped) {
|
||||
out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size});
|
||||
q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size});
|
||||
softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1});
|
||||
}
|
||||
return {out, q_padded, k_padded, v_padded, softmax_lse, rng_state, _unused, p};
|
||||
return {out, q_padded, k_padded, v_padded, softmax_lse, seed_t, offset_t, p};
|
||||
}
|
||||
} //namespace pytorch_flash
|
||||
|
||||
@ -15,6 +15,8 @@ flaky_models = {
|
||||
"timm_efficientnet", # see https://github.com/pytorch/pytorch/issues/148699
|
||||
"XGLMForCausalLM", # discovered in https://github.com/pytorch/pytorch/pull/128148
|
||||
"moondream", # discovered in https://github.com/pytorch/pytorch/pull/159291
|
||||
# discovered in https://github.com/pytorch/pytorch/issues/161419. Its not flaky but really hard to repro, so skipping it
|
||||
"mobilenetv3_large_100",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -2,7 +2,7 @@ name,accuracy,graph_breaks
|
||||
|
||||
|
||||
|
||||
torchrec_dlrm,fail_to_run,3
|
||||
torchrec_dlrm,pass,6
|
||||
|
||||
|
||||
|
||||
@ -94,7 +94,7 @@ hf_Bert_large,pass,6
|
||||
|
||||
|
||||
|
||||
hf_BigBird,fail_to_run,3
|
||||
hf_BigBird,pass,6
|
||||
|
||||
|
||||
|
||||
@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Reformer,fail_to_run,21
|
||||
hf_Reformer,pass,25
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -2,7 +2,7 @@ name,accuracy,graph_breaks
|
||||
|
||||
|
||||
|
||||
torchrec_dlrm,fail_to_run,3
|
||||
torchrec_dlrm,pass,6
|
||||
|
||||
|
||||
|
||||
@ -46,7 +46,7 @@ dcgan,pass,6
|
||||
|
||||
|
||||
|
||||
demucs,fail_to_run,4
|
||||
demucs,pass,9
|
||||
|
||||
|
||||
|
||||
@ -94,7 +94,7 @@ hf_Bert_large,pass,6
|
||||
|
||||
|
||||
|
||||
hf_BigBird,fail_to_run,3
|
||||
hf_BigBird,pass,6
|
||||
|
||||
|
||||
|
||||
@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Reformer,fail_to_run,21
|
||||
hf_Reformer,pass,25
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -755,6 +755,7 @@ libtorch_cuda_distributed_extra_sources = [
|
||||
"torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu",
|
||||
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp",
|
||||
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
|
||||
"torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp",
|
||||
"torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
|
||||
]
|
||||
|
||||
|
||||
@ -45,7 +45,7 @@ size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) {
|
||||
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart);
|
||||
const size_t interval_end =
|
||||
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd);
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
interval_end - interval_start == kRoundUpPowerOfTwoIntervals,
|
||||
"kRoundUpPowerOfTwoIntervals mismatch");
|
||||
|
||||
@ -64,7 +64,7 @@ size_t AcceleratorAllocatorConfig::parseMaxSplitSize(
|
||||
std::numeric_limits<size_t>::max() / kMB;
|
||||
|
||||
size_t val_env = tokenizer.toSizeT(++i);
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
val_env >= min_allowed_split_size_mb,
|
||||
"CachingAllocator option max_split_size_mb too small, must be >= ",
|
||||
min_allowed_split_size_mb);
|
||||
@ -83,7 +83,7 @@ size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize(
|
||||
std::numeric_limits<size_t>::max() / kMB;
|
||||
|
||||
size_t val_env = tokenizer.toSizeT(++i);
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
val_env >= min_allowed_split_size_mb,
|
||||
"CachingAllocator option max_non_split_rounding_mb too small, must be >= ",
|
||||
min_allowed_split_size_mb);
|
||||
@ -98,7 +98,7 @@ size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold(
|
||||
size_t i) {
|
||||
tokenizer.checkToken(++i, ":");
|
||||
double val_env = tokenizer.toDouble(++i);
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
val_env > 0 && val_env < 1.0,
|
||||
"garbage_collect_threshold is invalid, set it in (0.0, 1.0)");
|
||||
garbage_collection_threshold_ = val_env;
|
||||
@ -119,7 +119,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
|
||||
size_t value_index = i;
|
||||
tokenizer.checkToken(++i, ":");
|
||||
size_t value = tokenizer.toSizeT(++i);
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
value == 0 || llvm::isPowerOf2_64(value),
|
||||
"For roundups, the divisions has to be power of 2 or 0 to disable roundup ");
|
||||
|
||||
@ -133,7 +133,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
|
||||
value);
|
||||
} else {
|
||||
size_t boundary = tokenizer.toSizeT(value_index);
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
llvm::isPowerOf2_64(boundary),
|
||||
"For roundups, the intervals have to be power of 2 ");
|
||||
|
||||
@ -163,7 +163,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
|
||||
"Expected closing bracket ']' in ConfigTokenizer but reached end of config");
|
||||
} else { // Keep this for backwards compatibility
|
||||
size_t value = tokenizer.toSizeT(i);
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
llvm::isPowerOf2_64(value),
|
||||
"For roundups, the divisions has to be power of 2 ");
|
||||
std::fill(
|
||||
|
||||
@ -76,7 +76,7 @@ class ConfigTokenizer {
|
||||
} else if (token == "False") {
|
||||
return false;
|
||||
} else {
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Expected 'True' or 'False' at index ",
|
||||
i,
|
||||
|
||||
@ -1,119 +1,389 @@
|
||||
#include <c10/cuda/CUDAAllocatorConfig.h>
|
||||
#include <c10/cuda/CUDACachingAllocator.h>
|
||||
#include <c10/util/llvmMathExtras.h>
|
||||
|
||||
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
|
||||
#include <c10/cuda/driver_api.h>
|
||||
#endif
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
namespace c10::cuda::CUDACachingAllocator {
|
||||
|
||||
size_t CUDAAllocatorConfig::parseAllocatorConfig(
|
||||
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
|
||||
constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
|
||||
|
||||
CUDAAllocatorConfig::CUDAAllocatorConfig()
|
||||
: m_max_split_size(std::numeric_limits<size_t>::max()),
|
||||
m_max_non_split_rounding_size(kLargeBuffer),
|
||||
m_garbage_collection_threshold(0),
|
||||
m_pinned_num_register_threads(1),
|
||||
m_expandable_segments(false),
|
||||
#if CUDA_VERSION >= 12030
|
||||
m_expandable_segments_handle_type(
|
||||
Expandable_Segments_Handle_Type::UNSPECIFIED),
|
||||
#else
|
||||
m_expandable_segments_handle_type(
|
||||
Expandable_Segments_Handle_Type::POSIX_FD),
|
||||
#endif
|
||||
m_release_lock_on_cudamalloc(false),
|
||||
m_pinned_use_cuda_host_register(false),
|
||||
m_pinned_use_background_threads(false) {
|
||||
m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0);
|
||||
}
|
||||
|
||||
size_t CUDAAllocatorConfig::roundup_power2_divisions(size_t size) {
|
||||
size_t log_size = (63 - llvm::countLeadingZeros(size));
|
||||
|
||||
// Our intervals start at 1MB and end at 64GB
|
||||
const size_t interval_start =
|
||||
63 - llvm::countLeadingZeros(static_cast<size_t>(1048576));
|
||||
const size_t interval_end =
|
||||
63 - llvm::countLeadingZeros(static_cast<size_t>(68719476736));
|
||||
TORCH_CHECK(
|
||||
(interval_end - interval_start == kRoundUpPowerOfTwoIntervals),
|
||||
"kRoundUpPowerOfTwoIntervals mismatch");
|
||||
|
||||
int index = static_cast<int>(log_size) - static_cast<int>(interval_start);
|
||||
|
||||
index = std::max(0, index);
|
||||
index = std::min(index, static_cast<int>(kRoundUpPowerOfTwoIntervals) - 1);
|
||||
return instance().m_roundup_power2_divisions[index];
|
||||
}
|
||||
|
||||
void CUDAAllocatorConfig::lexArgs(
|
||||
const std::string& env,
|
||||
std::vector<std::string>& config) {
|
||||
std::vector<char> buf;
|
||||
|
||||
for (char ch : env) {
|
||||
if (ch == ',' || ch == ':' || ch == '[' || ch == ']') {
|
||||
if (!buf.empty()) {
|
||||
config.emplace_back(buf.begin(), buf.end());
|
||||
buf.clear();
|
||||
}
|
||||
config.emplace_back(1, ch);
|
||||
} else if (ch != ' ') {
|
||||
buf.emplace_back(ch);
|
||||
}
|
||||
}
|
||||
if (!buf.empty()) {
|
||||
config.emplace_back(buf.begin(), buf.end());
|
||||
}
|
||||
}
|
||||
|
||||
void CUDAAllocatorConfig::consumeToken(
|
||||
const std::vector<std::string>& config,
|
||||
size_t i,
|
||||
const char c) {
|
||||
TORCH_CHECK(
|
||||
i < config.size() && config[i] == std::string(1, c),
|
||||
"Error parsing CachingAllocator settings, expected ",
|
||||
c,
|
||||
"");
|
||||
}
|
||||
|
||||
size_t CUDAAllocatorConfig::parseMaxSplitSize(
|
||||
const std::vector<std::string>& config,
|
||||
size_t i) {
|
||||
consumeToken(config, ++i, ':');
|
||||
constexpr int mb = 1024 * 1024;
|
||||
if (++i < config.size()) {
|
||||
size_t val1 = stoi(config[i]);
|
||||
TORCH_CHECK(
|
||||
val1 > kLargeBuffer / mb,
|
||||
"CachingAllocator option max_split_size_mb too small, must be > ",
|
||||
kLargeBuffer / mb,
|
||||
"");
|
||||
val1 = std::max(val1, kLargeBuffer / mb);
|
||||
val1 = std::min(val1, (std::numeric_limits<size_t>::max() / mb));
|
||||
m_max_split_size = val1 * 1024 * 1024;
|
||||
} else {
|
||||
TORCH_CHECK(false, "Error, expecting max_split_size_mb value", "");
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
size_t CUDAAllocatorConfig::parseMaxNonSplitRoundingSize(
|
||||
const std::vector<std::string>& config,
|
||||
size_t i) {
|
||||
consumeToken(config, ++i, ':');
|
||||
constexpr int mb = 1024 * 1024;
|
||||
if (++i < config.size()) {
|
||||
size_t val1 = stoi(config[i]);
|
||||
TORCH_CHECK(
|
||||
val1 > kLargeBuffer / mb,
|
||||
"CachingAllocator option max_non_split_rounding_mb too small, must be > ",
|
||||
kLargeBuffer / mb,
|
||||
"");
|
||||
val1 = std::max(val1, kLargeBuffer / mb);
|
||||
val1 = std::min(val1, (std::numeric_limits<size_t>::max() / mb));
|
||||
m_max_non_split_rounding_size = val1 * 1024 * 1024;
|
||||
} else {
|
||||
TORCH_CHECK(false, "Error, expecting max_non_split_rounding_mb value", "");
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
size_t CUDAAllocatorConfig::parseGarbageCollectionThreshold(
|
||||
const std::vector<std::string>& config,
|
||||
size_t i) {
|
||||
consumeToken(config, ++i, ':');
|
||||
if (++i < config.size()) {
|
||||
double val1 = stod(config[i]);
|
||||
TORCH_CHECK(
|
||||
val1 > 0, "garbage_collect_threshold too small, set it 0.0~1.0", "");
|
||||
TORCH_CHECK(
|
||||
val1 < 1.0, "garbage_collect_threshold too big, set it 0.0~1.0", "");
|
||||
m_garbage_collection_threshold = val1;
|
||||
} else {
|
||||
TORCH_CHECK(
|
||||
false, "Error, expecting garbage_collection_threshold value", "");
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
size_t CUDAAllocatorConfig::parseRoundUpPower2Divisions(
|
||||
const std::vector<std::string>& config,
|
||||
size_t i) {
|
||||
consumeToken(config, ++i, ':');
|
||||
bool first_value = true;
|
||||
|
||||
if (++i < config.size()) {
|
||||
if (std::string_view(config[i]) == "[") {
|
||||
size_t last_index = 0;
|
||||
// NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
|
||||
while (++i < config.size() && std::string_view(config[i]) != "]") {
|
||||
const std::string& val1 = config[i];
|
||||
size_t val2 = 0;
|
||||
|
||||
consumeToken(config, ++i, ':');
|
||||
if (++i < config.size()) {
|
||||
val2 = stoi(config[i]);
|
||||
} else {
|
||||
TORCH_CHECK(
|
||||
false, "Error parsing roundup_power2_divisions value", "");
|
||||
}
|
||||
TORCH_CHECK(
|
||||
val2 == 0 || llvm::isPowerOf2_64(val2),
|
||||
"For roundups, the divisions has to be power of 2 or 0 to disable roundup ",
|
||||
"");
|
||||
|
||||
if (std::string_view(val1) == ">") {
|
||||
std::fill(
|
||||
std::next(
|
||||
m_roundup_power2_divisions.begin(),
|
||||
static_cast<std::vector<unsigned long>::difference_type>(
|
||||
last_index)),
|
||||
m_roundup_power2_divisions.end(),
|
||||
val2);
|
||||
} else {
|
||||
size_t val1_long = stoul(val1);
|
||||
TORCH_CHECK(
|
||||
llvm::isPowerOf2_64(val1_long),
|
||||
"For roundups, the intervals have to be power of 2 ",
|
||||
"");
|
||||
|
||||
size_t index = 63 - llvm::countLeadingZeros(val1_long);
|
||||
index = std::max((size_t)0, index);
|
||||
index = std::min(index, m_roundup_power2_divisions.size() - 1);
|
||||
|
||||
if (first_value) {
|
||||
std::fill(
|
||||
m_roundup_power2_divisions.begin(),
|
||||
std::next(
|
||||
m_roundup_power2_divisions.begin(),
|
||||
static_cast<std::vector<unsigned long>::difference_type>(
|
||||
index)),
|
||||
val2);
|
||||
first_value = false;
|
||||
}
|
||||
if (index < m_roundup_power2_divisions.size()) {
|
||||
m_roundup_power2_divisions[index] = val2;
|
||||
}
|
||||
last_index = index;
|
||||
}
|
||||
|
||||
if (std::string_view(config[i + 1]) != "]") {
|
||||
consumeToken(config, ++i, ',');
|
||||
}
|
||||
}
|
||||
} else { // Keep this for backwards compatibility
|
||||
size_t val1 = stoi(config[i]);
|
||||
TORCH_CHECK(
|
||||
llvm::isPowerOf2_64(val1),
|
||||
"For roundups, the divisions has to be power of 2 ",
|
||||
"");
|
||||
std::fill(
|
||||
m_roundup_power2_divisions.begin(),
|
||||
m_roundup_power2_divisions.end(),
|
||||
val1);
|
||||
}
|
||||
} else {
|
||||
TORCH_CHECK(false, "Error, expecting roundup_power2_divisions value", "");
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
size_t CUDAAllocatorConfig::parseAllocatorConfig(
|
||||
const std::vector<std::string>& config,
|
||||
size_t i,
|
||||
bool& used_cudaMallocAsync) {
|
||||
// For ease of maintenance and understanding, the CUDA and ROCm
|
||||
// implementations of this function are separated. This avoids having many
|
||||
// #ifdef's throughout.
|
||||
#ifdef USE_ROCM
|
||||
// Ease burden on ROCm users by allowing either cuda or hip tokens.
|
||||
// cuda token is broken up to prevent hipify matching it.
|
||||
#define PYTORCH_TOKEN1 \
|
||||
"cud" \
|
||||
"aMallocAsync"
|
||||
#define PYTORCH_TOKEN2 "hipMallocAsync"
|
||||
tokenizer.checkToken(++i, ":");
|
||||
i++; // Move to the value after the colon
|
||||
TORCH_CHECK_VALUE(
|
||||
((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1) ||
|
||||
(tokenizer[i] == PYTORCH_TOKEN2)),
|
||||
"Unknown allocator backend, "
|
||||
"options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2);
|
||||
if (m_is_allocator_loaded) {
|
||||
bool aync_allocator_at_runtime = (tokenizer[i] != "native");
|
||||
consumeToken(config, ++i, ':');
|
||||
if (++i < config.size()) {
|
||||
TORCH_CHECK(
|
||||
aync_allocator_at_runtime == m_use_async_allocator,
|
||||
"Allocator async backend parsed at runtime != allocator async backend parsed at load time, ",
|
||||
aync_allocator_at_runtime,
|
||||
((config[i] == "native") || (config[i] == PYTORCH_TOKEN1) ||
|
||||
(config[i] == PYTORCH_TOKEN2)),
|
||||
"Unknown allocator backend, "
|
||||
"options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2);
|
||||
used_cudaMallocAsync =
|
||||
(config[i] == PYTORCH_TOKEN1 || config[i] == PYTORCH_TOKEN2);
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
config[i] == get()->name() ||
|
||||
(config[i] == PYTORCH_TOKEN1 && get()->name() == PYTORCH_TOKEN2),
|
||||
"Allocator backend parsed at runtime != "
|
||||
"allocator backend parsed at load time, ",
|
||||
config[i],
|
||||
" != ",
|
||||
m_use_async_allocator);
|
||||
get()->name());
|
||||
} else {
|
||||
TORCH_CHECK(false, "Error parsing backend value", "");
|
||||
}
|
||||
m_use_async_allocator =
|
||||
(tokenizer[i] == PYTORCH_TOKEN1 || tokenizer[i] == PYTORCH_TOKEN2);
|
||||
// CUDA allocator is always loaded at the start of the program
|
||||
m_is_allocator_loaded = true;
|
||||
|
||||
#if defined(CUDA_VERSION)
|
||||
if (m_use_async_allocator) {
|
||||
#if CUDA_VERSION >= 11040
|
||||
int version = 0;
|
||||
C10_CUDA_CHECK(cudaDriverGetVersion(&version));
|
||||
TORCH_CHECK(
|
||||
version >= 11040,
|
||||
"backend:cudaMallocAsync requires CUDA runtime "
|
||||
"11.4 or newer, but cudaDriverGetVersion returned ",
|
||||
version);
|
||||
#else
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"backend:cudaMallocAsync requires PyTorch to be built with "
|
||||
"CUDA 11.4 or newer, but CUDA_VERSION is ",
|
||||
CUDA_VERSION);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
return i;
|
||||
#undef PYTORCH_TOKEN1
|
||||
#undef PYTORCH_TOKEN2
|
||||
#else // USE_ROCM
|
||||
consumeToken(config, ++i, ':');
|
||||
if (++i < config.size()) {
|
||||
TORCH_CHECK(
|
||||
((config[i] == "native") || (config[i] == "cudaMallocAsync")),
|
||||
"Unknown allocator backend, "
|
||||
"options are native and cudaMallocAsync");
|
||||
used_cudaMallocAsync = (config[i] == "cudaMallocAsync");
|
||||
if (used_cudaMallocAsync) {
|
||||
#if CUDA_VERSION >= 11040
|
||||
int version = 0;
|
||||
C10_CUDA_CHECK(cudaDriverGetVersion(&version));
|
||||
TORCH_CHECK(
|
||||
version >= 11040,
|
||||
"backend:cudaMallocAsync requires CUDA runtime "
|
||||
"11.4 or newer, but cudaDriverGetVersion returned ",
|
||||
version);
|
||||
#else
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"backend:cudaMallocAsync requires PyTorch to be built with "
|
||||
"CUDA 11.4 or newer, but CUDA_VERSION is ",
|
||||
CUDA_VERSION);
|
||||
#endif
|
||||
}
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
config[i] == get()->name(),
|
||||
"Allocator backend parsed at runtime != "
|
||||
"allocator backend parsed at load time");
|
||||
} else {
|
||||
TORCH_CHECK(false, "Error parsing backend value", "");
|
||||
}
|
||||
return i;
|
||||
#endif // USE_ROCM
|
||||
}
|
||||
|
||||
void CUDAAllocatorConfig::parseArgs(const std::string& env) {
|
||||
void CUDAAllocatorConfig::parseArgs(const std::optional<std::string>& env) {
|
||||
// If empty, set the default values
|
||||
m_max_split_size = std::numeric_limits<size_t>::max();
|
||||
m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0);
|
||||
m_garbage_collection_threshold = 0;
|
||||
bool used_cudaMallocAsync = false;
|
||||
bool used_native_specific_option = false;
|
||||
|
||||
c10::CachingAllocator::ConfigTokenizer tokenizer(env);
|
||||
for (size_t i = 0; i < tokenizer.size(); i++) {
|
||||
const auto& key = tokenizer[i];
|
||||
if (key == "backend") {
|
||||
i = parseAllocatorConfig(tokenizer, i);
|
||||
if (!env.has_value()) {
|
||||
return;
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_last_allocator_settings_mutex);
|
||||
m_last_allocator_settings = env.value();
|
||||
}
|
||||
|
||||
std::vector<std::string> config;
|
||||
lexArgs(env.value(), config);
|
||||
|
||||
for (size_t i = 0; i < config.size(); i++) {
|
||||
std::string_view config_item_view(config[i]);
|
||||
if (config_item_view == "max_split_size_mb") {
|
||||
i = parseMaxSplitSize(config, i);
|
||||
used_native_specific_option = true;
|
||||
} else if (config_item_view == "max_non_split_rounding_mb") {
|
||||
i = parseMaxNonSplitRoundingSize(config, i);
|
||||
used_native_specific_option = true;
|
||||
} else if (config_item_view == "garbage_collection_threshold") {
|
||||
i = parseGarbageCollectionThreshold(config, i);
|
||||
used_native_specific_option = true;
|
||||
} else if (config_item_view == "roundup_power2_divisions") {
|
||||
i = parseRoundUpPower2Divisions(config, i);
|
||||
used_native_specific_option = true;
|
||||
} else if (config_item_view == "backend") {
|
||||
i = parseAllocatorConfig(config, i, used_cudaMallocAsync);
|
||||
} else if (config_item_view == "expandable_segments") {
|
||||
used_native_specific_option = true;
|
||||
consumeToken(config, ++i, ':');
|
||||
++i;
|
||||
TORCH_CHECK(
|
||||
i < config.size() &&
|
||||
(std::string_view(config[i]) == "True" ||
|
||||
std::string_view(config[i]) == "False"),
|
||||
"Expected a single True/False argument for expandable_segments");
|
||||
config_item_view = config[i];
|
||||
m_expandable_segments = (config_item_view == "True");
|
||||
} else if (
|
||||
// ROCm build's hipify step will change "cuda" to "hip", but for ease of
|
||||
// use, accept both. We must break up the string to prevent hipify here.
|
||||
key == "release_lock_on_hipmalloc" ||
|
||||
key ==
|
||||
config_item_view == "release_lock_on_hipmalloc" ||
|
||||
config_item_view ==
|
||||
"release_lock_on_c"
|
||||
"udamalloc") {
|
||||
used_native_specific_option = true;
|
||||
tokenizer.checkToken(++i, ":");
|
||||
m_release_lock_on_cudamalloc = tokenizer.toBool(++i);
|
||||
consumeToken(config, ++i, ':');
|
||||
++i;
|
||||
TORCH_CHECK(
|
||||
i < config.size() &&
|
||||
(std::string_view(config[i]) == "True" ||
|
||||
std::string_view(config[i]) == "False"),
|
||||
"Expected a single True/False argument for release_lock_on_cudamalloc");
|
||||
config_item_view = config[i];
|
||||
m_release_lock_on_cudamalloc = (config_item_view == "True");
|
||||
} else if (
|
||||
// ROCm build's hipify step will change "cuda" to "hip", but for ease of
|
||||
// use, accept both. We must break up the string to prevent hipify here.
|
||||
key == "pinned_use_hip_host_register" ||
|
||||
key ==
|
||||
config_item_view == "pinned_use_hip_host_register" ||
|
||||
config_item_view ==
|
||||
"pinned_use_c"
|
||||
"uda_host_register") {
|
||||
i = parsePinnedUseCudaHostRegister(tokenizer, i);
|
||||
i = parsePinnedUseCudaHostRegister(config, i);
|
||||
used_native_specific_option = true;
|
||||
} else if (key == "pinned_num_register_threads") {
|
||||
i = parsePinnedNumRegisterThreads(tokenizer, i);
|
||||
} else if (config_item_view == "pinned_num_register_threads") {
|
||||
i = parsePinnedNumRegisterThreads(config, i);
|
||||
used_native_specific_option = true;
|
||||
} else if (config_item_view == "pinned_use_background_threads") {
|
||||
i = parsePinnedUseBackgroundThreads(config, i);
|
||||
used_native_specific_option = true;
|
||||
} else {
|
||||
const auto& keys =
|
||||
c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
|
||||
TORCH_CHECK(
|
||||
keys.find(key) != keys.end(),
|
||||
"Unrecognized key '",
|
||||
key,
|
||||
"' in Accelerator allocator config.");
|
||||
i = tokenizer.skipKey(i);
|
||||
false, "Unrecognized CachingAllocator option: ", config_item_view);
|
||||
}
|
||||
|
||||
if (i + 1 < tokenizer.size()) {
|
||||
tokenizer.checkToken(++i, ",");
|
||||
if (i + 1 < config.size()) {
|
||||
consumeToken(config, ++i, ',');
|
||||
}
|
||||
}
|
||||
|
||||
if (m_use_async_allocator && used_native_specific_option) {
|
||||
if (used_cudaMallocAsync && used_native_specific_option) {
|
||||
TORCH_WARN(
|
||||
"backend:cudaMallocAsync ignores max_split_size_mb,"
|
||||
"roundup_power2_divisions, and garbage_collect_threshold.");
|
||||
@ -121,33 +391,64 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
|
||||
}
|
||||
|
||||
size_t CUDAAllocatorConfig::parsePinnedUseCudaHostRegister(
|
||||
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
|
||||
const std::vector<std::string>& config,
|
||||
size_t i) {
|
||||
tokenizer.checkToken(++i, ":");
|
||||
m_pinned_use_cuda_host_register = tokenizer.toBool(++i);
|
||||
|
||||
consumeToken(config, ++i, ':');
|
||||
if (++i < config.size()) {
|
||||
TORCH_CHECK(
|
||||
(config[i] == "True" || config[i] == "False"),
|
||||
"Expected a single True/False argument for pinned_use_cuda_host_register");
|
||||
m_pinned_use_cuda_host_register = (config[i] == "True");
|
||||
} else {
|
||||
TORCH_CHECK(
|
||||
false, "Error, expecting pinned_use_cuda_host_register value", "");
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
|
||||
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
|
||||
const std::vector<std::string>& config,
|
||||
size_t i) {
|
||||
tokenizer.checkToken(++i, ":");
|
||||
size_t val2 = tokenizer.toSizeT(++i);
|
||||
TORCH_CHECK_VALUE(
|
||||
llvm::isPowerOf2_64(val2),
|
||||
"Number of register threads has to be power of 2 ",
|
||||
"");
|
||||
auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
|
||||
TORCH_CHECK_VALUE(
|
||||
val2 <= maxThreads,
|
||||
"Number of register threads should be less than or equal to " +
|
||||
std::to_string(maxThreads),
|
||||
"");
|
||||
m_pinned_num_register_threads = val2;
|
||||
consumeToken(config, ++i, ':');
|
||||
if (++i < config.size()) {
|
||||
size_t val2 = stoi(config[i]);
|
||||
TORCH_CHECK(
|
||||
llvm::isPowerOf2_64(val2),
|
||||
"Number of register threads has to be power of 2 ",
|
||||
"");
|
||||
auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
|
||||
TORCH_CHECK(
|
||||
val2 <= maxThreads,
|
||||
"Number of register threads should be less than or equal to " +
|
||||
std::to_string(maxThreads),
|
||||
"");
|
||||
m_pinned_num_register_threads = val2;
|
||||
} else {
|
||||
TORCH_CHECK(
|
||||
false, "Error, expecting pinned_num_register_threads value", "");
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(CUDAAllocatorConfig)
|
||||
size_t CUDAAllocatorConfig::parsePinnedUseBackgroundThreads(
|
||||
const std::vector<std::string>& config,
|
||||
size_t i) {
|
||||
consumeToken(config, ++i, ':');
|
||||
if (++i < config.size()) {
|
||||
TORCH_CHECK(
|
||||
(config[i] == "True" || config[i] == "False"),
|
||||
"Expected a single True/False argument for pinned_use_background_threads");
|
||||
m_pinned_use_background_threads = (config[i] == "True");
|
||||
} else {
|
||||
TORCH_CHECK(
|
||||
false, "Error, expecting pinned_use_background_threads value", "");
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
// General caching allocator utilities
|
||||
void setAllocatorSettings(const std::string& env) {
|
||||
CUDACachingAllocator::CUDAAllocatorConfig::instance().parseArgs(env.c_str());
|
||||
}
|
||||
|
||||
} // namespace c10::cuda::CUDACachingAllocator
|
||||
|
||||
@ -1,11 +1,16 @@
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/AllocatorConfig.h>
|
||||
#include <c10/cuda/CUDAException.h>
|
||||
#include <c10/cuda/CUDAMacros.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/env.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cstddef>
|
||||
#include <cstdlib>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace c10::cuda::CUDACachingAllocator {
|
||||
|
||||
enum class Expandable_Segments_Handle_Type : int {
|
||||
@ -18,23 +23,20 @@ enum class Expandable_Segments_Handle_Type : int {
|
||||
class C10_CUDA_API CUDAAllocatorConfig {
|
||||
public:
|
||||
static size_t max_split_size() {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size();
|
||||
return instance().m_max_split_size;
|
||||
}
|
||||
static double garbage_collection_threshold() {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::
|
||||
garbage_collection_threshold();
|
||||
return instance().m_garbage_collection_threshold;
|
||||
}
|
||||
|
||||
static bool expandable_segments() {
|
||||
bool enabled = c10::CachingAllocator::AcceleratorAllocatorConfig::
|
||||
use_expandable_segments();
|
||||
#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED
|
||||
if (enabled) {
|
||||
if (instance().m_expandable_segments) {
|
||||
TORCH_WARN_ONCE("expandable_segments not supported on this platform")
|
||||
}
|
||||
return false;
|
||||
#else
|
||||
return enabled;
|
||||
return instance().m_expandable_segments;
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -61,8 +63,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
|
||||
}
|
||||
|
||||
static bool pinned_use_background_threads() {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::
|
||||
pinned_use_background_threads();
|
||||
return instance().m_pinned_use_background_threads;
|
||||
}
|
||||
|
||||
static size_t pinned_max_register_threads() {
|
||||
@ -76,99 +77,88 @@ class C10_CUDA_API CUDAAllocatorConfig {
|
||||
// More description below in function roundup_power2_next_division
|
||||
// As an example, if we want 4 divisions between 2's power, this can be done
|
||||
// using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
|
||||
static size_t roundup_power2_divisions(size_t size) {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::
|
||||
roundup_power2_divisions(size);
|
||||
}
|
||||
static size_t roundup_power2_divisions(size_t size);
|
||||
|
||||
static std::vector<size_t> roundup_power2_divisions() {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::
|
||||
roundup_power2_divisions();
|
||||
return instance().m_roundup_power2_divisions;
|
||||
}
|
||||
|
||||
static size_t max_non_split_rounding_size() {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::
|
||||
max_non_split_rounding_size();
|
||||
return instance().m_max_non_split_rounding_size;
|
||||
}
|
||||
|
||||
static std::string last_allocator_settings() {
|
||||
return c10::CachingAllocator::getAllocatorSettings();
|
||||
}
|
||||
|
||||
static bool use_async_allocator() {
|
||||
return instance().m_use_async_allocator;
|
||||
}
|
||||
|
||||
// Use `Construct On First Use Idiom` to avoid `Static Initialization Order`
|
||||
// issue.
|
||||
static const std::unordered_set<std::string>& getKeys() {
|
||||
static std::unordered_set<std::string> keys{
|
||||
"backend",
|
||||
// keep BC for Rocm: `cuda` -> `cud` `a`, to avoid hipify issues
|
||||
// NOLINTBEGIN(bugprone-suspicious-missing-comma,-warnings-as-errors)
|
||||
"release_lock_on_cud"
|
||||
"amalloc",
|
||||
"pinned_use_cud"
|
||||
"a_host_register",
|
||||
// NOLINTEND(bugprone-suspicious-missing-comma,-warnings-as-errors)
|
||||
"release_lock_on_hipmalloc",
|
||||
"pinned_use_hip_host_register",
|
||||
"pinned_num_register_threads"};
|
||||
return keys;
|
||||
std::lock_guard<std::mutex> lock(
|
||||
instance().m_last_allocator_settings_mutex);
|
||||
return instance().m_last_allocator_settings;
|
||||
}
|
||||
|
||||
static CUDAAllocatorConfig& instance() {
|
||||
static CUDAAllocatorConfig* s_instance = ([]() {
|
||||
auto inst = new CUDAAllocatorConfig();
|
||||
auto env = c10::utils::get_env("PYTORCH_ALLOC_CONF");
|
||||
if (!env.has_value()) {
|
||||
// For backward compatibility, check for the old environment variable
|
||||
// PYTORCH_CUDA_ALLOC_CONF.
|
||||
env = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
|
||||
}
|
||||
auto env = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
|
||||
#ifdef USE_ROCM
|
||||
// convenience for ROCm users, allow alternative HIP token
|
||||
if (!env.has_value()) {
|
||||
env = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF");
|
||||
}
|
||||
#endif
|
||||
if (env.has_value()) {
|
||||
inst->parseArgs(env.value());
|
||||
}
|
||||
inst->parseArgs(env);
|
||||
return inst;
|
||||
})();
|
||||
return *s_instance;
|
||||
}
|
||||
|
||||
void parseArgs(const std::string& env);
|
||||
void parseArgs(const std::optional<std::string>& env);
|
||||
|
||||
private:
|
||||
CUDAAllocatorConfig() = default;
|
||||
CUDAAllocatorConfig();
|
||||
|
||||
size_t parseAllocatorConfig(
|
||||
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
|
||||
static void lexArgs(const std::string& env, std::vector<std::string>& config);
|
||||
static void consumeToken(
|
||||
const std::vector<std::string>& config,
|
||||
size_t i,
|
||||
const char c);
|
||||
size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
|
||||
size_t parseMaxNonSplitRoundingSize(
|
||||
const std::vector<std::string>& config,
|
||||
size_t i);
|
||||
size_t parseGarbageCollectionThreshold(
|
||||
const std::vector<std::string>& config,
|
||||
size_t i);
|
||||
size_t parseRoundUpPower2Divisions(
|
||||
const std::vector<std::string>& config,
|
||||
size_t i);
|
||||
size_t parseAllocatorConfig(
|
||||
const std::vector<std::string>& config,
|
||||
size_t i,
|
||||
bool& used_cudaMallocAsync);
|
||||
size_t parsePinnedUseCudaHostRegister(
|
||||
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
|
||||
const std::vector<std::string>& config,
|
||||
size_t i);
|
||||
size_t parsePinnedNumRegisterThreads(
|
||||
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
|
||||
const std::vector<std::string>& config,
|
||||
size_t i);
|
||||
size_t parsePinnedUseBackgroundThreads(
|
||||
const std::vector<std::string>& config,
|
||||
size_t i);
|
||||
|
||||
std::atomic<size_t> m_pinned_num_register_threads{1};
|
||||
std::atomic<Expandable_Segments_Handle_Type> m_expandable_segments_handle_type
|
||||
#if CUDA_VERSION >= 12030
|
||||
{Expandable_Segments_Handle_Type::UNSPECIFIED};
|
||||
#else
|
||||
{Expandable_Segments_Handle_Type::POSIX_FD};
|
||||
#endif
|
||||
std::atomic<bool> m_release_lock_on_cudamalloc{false};
|
||||
std::atomic<bool> m_pinned_use_cuda_host_register{false};
|
||||
std::atomic<bool> m_use_async_allocator{false};
|
||||
std::atomic<bool> m_is_allocator_loaded{false};
|
||||
std::atomic<size_t> m_max_split_size;
|
||||
std::atomic<size_t> m_max_non_split_rounding_size;
|
||||
std::vector<size_t> m_roundup_power2_divisions;
|
||||
std::atomic<double> m_garbage_collection_threshold;
|
||||
std::atomic<size_t> m_pinned_num_register_threads;
|
||||
std::atomic<bool> m_expandable_segments;
|
||||
std::atomic<Expandable_Segments_Handle_Type>
|
||||
m_expandable_segments_handle_type;
|
||||
std::atomic<bool> m_release_lock_on_cudamalloc;
|
||||
std::atomic<bool> m_pinned_use_cuda_host_register;
|
||||
std::atomic<bool> m_pinned_use_background_threads;
|
||||
std::string m_last_allocator_settings;
|
||||
std::mutex m_last_allocator_settings_mutex;
|
||||
};
|
||||
|
||||
// Keep this for backwards compatibility
|
||||
using c10::CachingAllocator::setAllocatorSettings;
|
||||
// General caching allocator utilities
|
||||
C10_CUDA_API void setAllocatorSettings(const std::string& env);
|
||||
|
||||
} // namespace c10::cuda::CUDACachingAllocator
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
#include <c10/cuda/CUDACachingAllocator.h>
|
||||
|
||||
#include <c10/core/impl/GPUTrace.h>
|
||||
#include <c10/cuda/CUDAAllocatorConfig.h>
|
||||
#include <c10/cuda/CUDAException.h>
|
||||
#include <c10/cuda/CUDAFunctions.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
@ -63,6 +64,10 @@ namespace cuda::CUDACachingAllocator {
|
||||
using namespace c10::CachingAllocator;
|
||||
using namespace c10::CachingDeviceAllocator;
|
||||
|
||||
// Included here as this is externally used in CUDAAllocatorConfig
|
||||
const size_t kLargeBuffer =
|
||||
20971520; // "large" allocations may be packed in 20 MiB blocks
|
||||
|
||||
namespace Native {
|
||||
|
||||
//
|
||||
@ -4105,10 +4110,49 @@ CUDAAllocator* allocator();
|
||||
} // namespace CudaMallocAsync
|
||||
|
||||
struct BackendStaticInitializer {
|
||||
// Parses env for backend at load time, duplicating some logic from
|
||||
// CUDAAllocatorConfig. CUDAAllocatorConfig double-checks it later (at
|
||||
// runtime). Defers verbose exceptions and error checks, including Cuda
|
||||
// version checks, to CUDAAllocatorConfig's runtime doublecheck. If this
|
||||
// works, maybe we should move all of CUDAAllocatorConfig here?
|
||||
CUDAAllocator* parseEnvForBackend() {
|
||||
// If the environment variable is set, we use the CudaMallocAsync allocator.
|
||||
if (CUDAAllocatorConfig::use_async_allocator()) {
|
||||
return CudaMallocAsync::allocator();
|
||||
auto val = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
|
||||
#ifdef USE_ROCM
|
||||
// convenience for ROCm users to allow either CUDA or HIP env var
|
||||
if (!val.has_value()) {
|
||||
val = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF");
|
||||
}
|
||||
#endif
|
||||
if (val.has_value()) {
|
||||
const std::string& config = val.value();
|
||||
|
||||
std::regex exp("[\\s,]+");
|
||||
std::sregex_token_iterator it(config.begin(), config.end(), exp, -1);
|
||||
std::sregex_token_iterator end;
|
||||
std::vector<std::string> options(it, end);
|
||||
|
||||
for (auto option : options) {
|
||||
std::regex exp2("[:]+");
|
||||
std::sregex_token_iterator it2(option.begin(), option.end(), exp2, -1);
|
||||
std::sregex_token_iterator end2;
|
||||
std::vector<std::string> kv(it2, end2);
|
||||
if (kv.size() >= 2) {
|
||||
if (kv[0] == "backend") {
|
||||
#ifdef USE_ROCM
|
||||
// convenience for ROCm users to allow either CUDA or HIP env var
|
||||
if (kv[1] ==
|
||||
"cud"
|
||||
"aMallocAsync" ||
|
||||
kv[1] == "hipMallocAsync")
|
||||
#else
|
||||
if (kv[1] == "cudaMallocAsync")
|
||||
#endif
|
||||
return CudaMallocAsync::allocator();
|
||||
if (kv[1] == "native")
|
||||
return &Native::allocator;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return &Native::allocator;
|
||||
}
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/CachingDeviceAllocator.h>
|
||||
#include <c10/cuda/CUDAAllocatorConfig.h>
|
||||
#include <c10/cuda/CUDAGraphsC10Utils.h>
|
||||
#include <c10/cuda/CUDAMacros.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
@ -50,9 +49,10 @@ namespace c10::cuda::CUDACachingAllocator {
|
||||
|
||||
// Preserved only for BC reasons
|
||||
// NOLINTNEXTLINE(misc-unused-using-decls)
|
||||
using c10::CachingAllocator::kLargeBuffer;
|
||||
using c10::CachingDeviceAllocator::DeviceStats;
|
||||
|
||||
extern const size_t kLargeBuffer;
|
||||
|
||||
typedef std::shared_ptr<GatheredContext> (*CreateContextFn)();
|
||||
|
||||
// Struct containing info of an allocation block (i.e. a fractional part of a
|
||||
|
||||
@ -581,6 +581,7 @@ if(USE_CUDA)
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
|
||||
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
|
||||
)
|
||||
endif()
|
||||
|
||||
@ -169,10 +169,6 @@ These backends include:
|
||||
.. autofunction:: torch.backends.cuda.sdp_kernel
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autofunction:: torch.backends.cuda.is_ck_sdpa_available
|
||||
```
|
||||
|
||||
## torch.backends.cudnn
|
||||
|
||||
```{eval-rst}
|
||||
|
||||
@ -1221,9 +1221,6 @@ coverage_ignore_functions = [
|
||||
"reduce_typed_storage_child",
|
||||
"storage_from_cache",
|
||||
# torch.multiprocessing.spawn
|
||||
# Added docstring for this but I think we need to go through
|
||||
# and add the entire torch.multiprocessing.spawn module to a .rst...
|
||||
"should_use_parallel_start",
|
||||
"start_processes",
|
||||
# torch.nn.functional
|
||||
"adaptive_max_pool1d_with_indices", # documented as adaptive_max_pool1d
|
||||
|
||||
439
docs/source/notes/extending_accelerator.md
Normal file
@ -0,0 +1,439 @@
|
||||
# Extending PyTorch with New Accelerators
|
||||
|
||||
## Background
|
||||
|
||||
Since PyTorch 2.1, the community has made significant progress in simplifying the integration of new accelerators into the PyTorch ecosystem. These improvements include, but are not limited to: refinement of the `PrivateUse1` Dispatch Key, introduction and improvement of core subsystem extension mechanisms, and device-agnostic refactoring of key modules (e.g., `torch.accelerator`, `memory management`). Taken together, these improvements lay the foundation for a **robust**, **flexible** and developer-friendly accelerator integration path.
|
||||
|
||||
### Why Does This Matter?
|
||||
|
||||
This integration path has several key advantages:
|
||||
|
||||
* **Speed**: Extensibility is built-in for all core PyTorch modules. Developers can integrate new accelerators into their downstream codebase independently without modifying upstream code and without being constrained by community review bandwidth.
|
||||
* **Future-proofing**: This integration path is the default for all future PyTorch features, which means that new modules and features will automatically support scaling to new accelerators as long as this path is followed.
|
||||
* **Autonomy**: Vendors have full control over their accelerator integration timelines, enabling agile iteration cycles and reducing reliance on upstream coordination.
|
||||
|
||||
### About This Document
|
||||
|
||||
This guide aims to provide a **comprehensive overview of the modern integration pathway** for new accelerators in PyTorch. It walks through the full integration surface, from low-level device primitives to higher-level domain modules like compilation and quantization. The structure follows a **modular and scenario-driven approach**, where each topic is paired with corresponding code examples from [torch_openreg][OpenReg URL], an official reference implementation.
|
||||
|
||||
The goal is to help developers:
|
||||
|
||||
* Understand the full scope of accelerator integration;
|
||||
* Follow best practices to quickly launch new accelerators;
|
||||
* Avoid common pitfalls through clear, targeted examples.
|
||||
|
||||
### Target Audience
|
||||
|
||||
This document is intended for:
|
||||
|
||||
* **Accelerator Developers** who are integrating a new accelerator into PyTorch;
* **Advanced PyTorch Users** interested in the inner workings of key modules.
|
||||
|
||||
With this context established, let's begin the journey of integrating a new accelerator into PyTorch.
|
||||
|
||||
## Operators
|
||||
|
||||
For new accelerators, one of the most important and fundamental aspects of integration is supporting high-performance operators. To facilitate operator adaptation for users and accelerator developers, PyTorch provides multiple methods for developing and registering operators in both `Python` and `C++`. The following sections detail some of PyTorch's fundamental capabilities for operator registration.
|
||||
|
||||
```{note}
|
||||
A `Dispatch Key` uniquely identifies a backend within PyTorch, such as `CPU`, `CUDA`, `MPS`, and `PrivateUse1`. In principle, all new accelerators share `PrivateUse1`, leveraging its comprehensive built-in scaffolding to complete their integration. Please refer to [Let's talk about the PyTorch dispatcher](https://blog.ezyang.com/2020/09/lets-talk-about-the-pytorch-dispatcher/) if you are interested in how the dispatcher works.
|
||||
```
|
||||
|
||||
(operator-set)=
|
||||
|
||||
### Operator Set
|
||||
|
||||
PyTorch currently has over 3500 built-in operators (including variants). Supporting this many operators in a short time frame is not realistic, so the first step in bringing up a new backend should be to focus on the essential operators. The remaining operators can initially be covered by the community's fallback mechanism so that functionality works end to end, and then be implemented natively over time to improve the performance of the new backend.
|
||||
|
||||
The required operator set is listed below, primarily consisting of low-level operators required by factory functions and fallback operators:
|
||||
|
||||
| Operator Name | Dispatch Key | Description |
|
||||
| :---: | :---: | :---: |
|
||||
| empty.memory_format | PrivateUse1 | Create an uninitialized Tensor with the specified shape and memory layout (the stride is automatically calculated) |
|
||||
| empty_strided | PrivateUse1 | Create an uninitialized Tensor of the specified shape and stride (more degrees of freedom) |
|
||||
| as_strided | PrivateUse1 | Create a shared view of the input Tensor with new shape, stride, and offset (without allocating new memory) |
|
||||
| view | PrivateUse1 | Create a shared view of the input Tensor with new shape, but the original Tensor must be memory-contiguous |
|
||||
| _reshape_alias | PrivateUse1 | Creates a shared view without safety checks (internal version of `reshape`) |
|
||||
| resize_ | PrivateUse1 | Modify the shape of the Tensor in place and reallocate memory if capacity is insufficient |
|
||||
| _copy_from | PrivateUse1 | The underlying core function of Tensor.copy_ is responsible for the actual cross-device data copying |
|
||||
| _copy_from_and_resize | PrivateUse1 | Combine `resize_` and `_copy_from` to resize first and then copy |
|
||||
| _local_scalar_dense | PrivateUse1 | The underlying implementation of `.item()`, extracting values from Tensor to CPU scalars |
|
||||
| set_.source_Tensor | PrivateUse1 | Set the current Tensor using the specified Tensor |
|
||||
| set_.source_Storage | PrivateUse1 | Set the current Tensor using the specified Storage |
|
||||
| set_.source_Storage_storage_offset | PrivateUse1 | Set the current Tensor using the specified Storage with the storage offset |
|
||||
| fallback | PrivateUse1 | Fallback to CPU |
|
||||
|
||||
### Basics
|
||||
|
||||
Now that we have defined the initial scope of operator support, we can begin developing operator adaptations. This section will explain these implementations in `Python` and `C++` based on actual scenarios.
|
||||
|
||||
(step-one)=
|
||||
|
||||
#### Step 1
|
||||
|
||||
{ref}`The operators mentioned above <operator-set>` share a common characteristic: they are built-in PyTorch operators with well-defined namespaces and schemas, and implementations already exist for the built-in backends (`CPU`, `CUDA`, etc.). What we need to do next is implement these operators for the new accelerator.
|
||||
|
||||
::::{tab-set}
|
||||
|
||||
:::{tab-item} C++
|
||||
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/native/Minimal.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: EMPTY.MEMORY_FORMAT IMPL
|
||||
:end-before: LITERALINCLUDE END: EMPTY.MEMORY_FORMAT IMPL
|
||||
:linenos:
|
||||
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: EMPTY.MEMORY_FORMAT WRAPPER
|
||||
:end-before: LITERALINCLUDE END: EMPTY.MEMORY_FORMAT WRAPPER
|
||||
:linenos:
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
Taking the `empty.memory_format` operator as an example, we first look up the operator's schema in `native_functions.yaml`, which contains the detailed signature. Then we can implement the operator based on the capabilities of the new accelerator.
|
||||
|
||||
```Yaml
|
||||
- func: empty.memory_format(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
|
||||
dispatch:
|
||||
CPU: empty_cpu
|
||||
CUDA: empty_cuda
|
||||
...
|
||||
```
|
||||
|
||||
::::{tab-set-code}
|
||||
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: TORCH_LIBRARY_IMPL DEFAULT
|
||||
:end-before: LITERALINCLUDE END: TORCH_LIBRARY_IMPL DEFAULT
|
||||
:emphasize-lines: 1,2
|
||||
:linenos:
|
||||
```
|
||||
|
||||
::::
|
||||
|
||||
After implementing `wrapper_empty_memory_format`, we can register it as the `PrivateUse1` kernel of `aten::empty.memory_format` through `TORCH_LIBRARY_IMPL`; a condensed sketch of that registration follows.
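
Since the `literalinclude` blocks above pull from the `torch_openreg` sources, here is a minimal, self-contained sketch of what that registration looks like. The wrapper body is defined in Step 1 and is elided here; only the signature and the `TORCH_LIBRARY_IMPL` block matter.

```c++
#include <ATen/ATen.h>
#include <torch/library.h>

// Defined in Step 1: allocates an uninitialized tensor on the new accelerator.
// The signature must match the aten::empty.memory_format schema exactly.
at::Tensor wrapper_empty_memory_format(
    c10::IntArrayRef size,
    std::optional<c10::ScalarType> dtype_opt,
    std::optional<c10::Layout> layout_opt,
    std::optional<c10::Device> device_opt,
    std::optional<bool> pin_memory_opt,
    std::optional<c10::MemoryFormat> memory_format_opt);

// Register the wrapper as the PrivateUse1 kernel for the existing schema.
TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
  m.impl("empty.memory_format", wrapper_empty_memory_format);
}
```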
|
||||
|
||||
#### Step 2
|
||||
|
||||
By following {ref}`Step 1<step-one>`, we can complete the development and registration of all operators except `fallback`. Next, to support computational operators (such as mathematical and convolution operators), we need to register fallback semantics. This is a built-in PyTorch capability that lets operations not yet supported by the new accelerator fall back to the CPU for execution. For a new backend still under development, this is an extremely effective way to guarantee functionality at the expense of performance.
|
||||
|
||||
::::{tab-set}
|
||||
|
||||
:::{tab-item} C++
|
||||
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/native/Minimal.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: FALLBACK IMPL
|
||||
:end-before: LITERALINCLUDE END: FALLBACK IMPL
|
||||
:emphasize-lines: 15
|
||||
:linenos:
|
||||
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: FALLBACK WRAPPER
|
||||
:end-before: LITERALINCLUDE END: FALLBACK WRAPPER
|
||||
:linenos:
|
||||
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: FALLBACK GLOBAL
|
||||
:end-before: LITERALINCLUDE END: FALLBACK GLOBAL
|
||||
:linenos:
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
`wrapper_cpu_fallback` wraps the `at::native::cpu_fallback` method provided by PyTorch and is registered with `PrivateUse1` in PyTorch via `TORCH_LIBRARY_IMPL`. Subsequent operations not supported by the new backend will automatically fall back to the CPU for execution, and the results will be passed back to the new backend after execution.
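
For readers without the `torch_openreg` sources at hand, the wrapper and its global registration boil down to roughly the following sketch. Here the wrapper calls PyTorch's generic `at::native::cpu_fallback` directly; the reference implementation adds extra bookkeeping around it.

```c++
#include <ATen/native/CPUFallback.h>
#include <torch/library.h>

// Boxed fallback: forwards any PrivateUse1 op without a native kernel to the
// CPU implementation and copies the results back.
void wrapper_cpu_fallback(
    const c10::OperatorHandle& op,
    torch::jit::Stack* stack) {
  at::native::cpu_fallback(op, stack);
}

// Install the boxed function as the default kernel for every operator on
// PrivateUse1 that has no explicit registration.
TORCH_LIBRARY_IMPL(_, PrivateUse1, m) {
  m.fallback(
      torch::CppFunction::makeFromBoxedFunction<&wrapper_cpu_fallback>());
}
```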
|
||||
|
||||
### Advanced
|
||||
|
||||
#### Selective Fallback
|
||||
|
||||
It is also a very reasonable scenario to enable the fallback mechanism only for certain operators, while keeping PyTorch's default behavior for the rest (an error is raised if the accelerator has no corresponding operator implementation).
|
||||
|
||||
::::{tab-set}
|
||||
|
||||
:::{tab-item} C++
|
||||
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: FALLBACK WRAPPER
|
||||
:end-before: LITERALINCLUDE END: FALLBACK WRAPPER
|
||||
:linenos:
|
||||
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: FALLBACK SINGLE
|
||||
:end-before: LITERALINCLUDE END: FALLBACK SINGLE
|
||||
:linenos:
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
Per-operator fallbacks are very similar to global fallbacks, the only difference being the registration method: calling `m.impl` registers an implementation for a specific operator, while `m.fallback` registers a default implementation for all operators.
|
||||
|
||||
::::{tab-set-code}
|
||||
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/native/Minimal.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: FALLBACK IMPL
|
||||
:end-before: LITERALINCLUDE END: FALLBACK IMPL
|
||||
:emphasize-lines: 2-5
|
||||
:linenos:
|
||||
```
|
||||
|
||||
::::
|
||||
|
||||
Of course, a global fallback can also be combined with a blacklist of operators that must not fall back; this is a common approach, especially when only a few operators lack fallback support. A condensed sketch of the per-operator registration described above follows.
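
As a sketch, the per-operator form reuses the same boxed wrapper but attaches it to a single schema via `m.impl`. The `sub.Tensor` choice mirrors the reference implementation; any operator name works.

```c++
#include <torch/library.h>

// wrapper_cpu_fallback is the same boxed function shown in the global
// fallback example above.
TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
  m.impl(
      "sub.Tensor",
      torch::CppFunction::makeFromBoxedFunction<&wrapper_cpu_fallback>());
}
```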
|
||||
|
||||
#### PyTorch STUB
|
||||
|
||||
PyTorch also provides another approach for built-in operators: `STUB`. This method is essentially based on the approach from {ref}`Step 1<step-one>`, but adds a second level of dispatch (for example, selecting a kernel based on CPU features).
|
||||
|
||||
```{note}
|
||||
The `STUB` method currently supports only a limited set of operators. For new accelerator devices, the advantage of the `STUB` method is that it significantly reduces the cost of development at the cost of a small performance overhead. PyTorch currently does not clearly list the set of operators that can be registered through `STUB`. Due to the large number of related operators, only the query method for the supported operator list is provided here.
|
||||
```
|
||||
|
||||
```shell
|
||||
pushd ${TORCH_ROOT}
|
||||
|
||||
find aten -type f -a -name "*.h" | xargs -I {} grep -wl "^DECLARE_DISPATCH" {}
|
||||
|
||||
popd
|
||||
```
|
||||
|
||||
`DECLARE_DISPATCH` is a macro used to explicitly declare a `STUB`. Its uses are scattered across the `aten` directory, and by searching for this macro you can find all operators that can be integrated using the `STUB` method.
|
||||
|
||||
```text
|
||||
...
|
||||
aten/src/ATen/native/Activation.h
|
||||
aten/src/ATen/native/FusedSGD.h
|
||||
aten/src/ATen/native/nested/NestedTensorBinaryOps.h
|
||||
aten/src/ATen/native/TensorCompare.h
|
||||
aten/src/ATen/native/Sorting.h
|
||||
...
|
||||
```
|
||||
|
||||
```c++
|
||||
using unary_fn = void(*)(TensorIteratorBase&);
|
||||
|
||||
DECLARE_DISPATCH(unary_fn, abs_stub)
|
||||
```
|
||||
|
||||
The listing above shows the files that declare `STUB` operators; in each header you can see the stub name and the associated function signature. Next, we take `abs_stub` as an example to briefly walk through supporting an operator via `STUB`.
|
||||
|
||||
::::{tab-set}
|
||||
|
||||
:::{tab-item} C++
|
||||
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/native/Extra.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: STUB ABS
|
||||
:end-before: LITERALINCLUDE END: STUB ABS
|
||||
:linenos:
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegExtra.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: STUB DEFAULT
|
||||
:end-before: LITERALINCLUDE END: STUB DEFAULT
|
||||
:emphasize-lines: 1
|
||||
:linenos:
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
From the signature, we can see that the input of `abs_stub` is a `TensorIteratorBase`, a powerful helper class provided by PyTorch that holds all input and output tensors along with a number of auxiliary methods. Based on it, we implement the `abs_kernel` and then call `REGISTER_PRIVATEUSE1_DISPATCH` with `abs_stub` to complete the registration, as sketched below.
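
A minimal sketch of that flow, assuming the backend provides its own element-wise kernel. The `my_backend` namespace is illustrative; note that the registration lives in `at::native` because that is where `abs_stub` is declared, and that `abs.out` must also be registered for `PrivateUse1` so dispatch actually reaches the stub.

```c++
#include <ATen/native/DispatchStub.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/UnaryOps.h> // declares abs_stub (unary_fn signature)

namespace my_backend {
// Device-specific kernel: a real backend would launch its own abs kernel
// over the iterator's operands here.
void abs_kernel(at::TensorIteratorBase& iter) {
  // ... accelerator-specific implementation ...
}
} // namespace my_backend

namespace at::native {
// Attach the kernel to the PrivateUse1 slot of the existing abs_stub.
REGISTER_PRIVATEUSE1_DISPATCH(abs_stub, &my_backend::abs_kernel);
} // namespace at::native
```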
|
||||
|
||||
#### Custom Operators
|
||||
|
||||
In addition to PyTorch's built-in operators, custom operators are commonly used to improve performance in accelerator-specific scenarios. They can be implemented in three main ways:
|
||||
|
||||
* Forward-only
|
||||
* Forward and backward: Separate registration
|
||||
* Forward and backward: Implemented using `torch.autograd.Function`
|
||||
|
||||
```{note}
|
||||
There are more details in PyTorch tutorials, so refer to [PyTorch Custom Operators](https://docs.pytorch.org/tutorials/advanced/custom_ops_landing_page.html) if you are interested.
|
||||
```
|
||||
|
||||
Here, we'll briefly introduce the implementation process of custom operators, focusing on the forward-only approach. The implementation can be summarized in the following three steps:
|
||||
|
||||
1. **Define Schema:**
|
||||
|
||||
::::{tab-set}
|
||||
|
||||
:::{tab-item} C++
|
||||
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegExtra.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: CUSTOM OPERATOR SCHEMA
|
||||
:end-before: LITERALINCLUDE END: CUSTOM OPERATOR SCHEMA
|
||||
:emphasize-lines: 2
|
||||
:linenos:
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
* Namespace Name: `openreg`
|
||||
* Function Name: `custom_abs`
|
||||
* Input Parameters:
|
||||
* Type: `Tensor`
|
||||
* Name: `input`
|
||||
* Output Type: `Tensor`
|
||||
|
||||
2. **Register Operator&Autograd Fallback:**
|
||||
|
||||
::::{tab-set}
|
||||
|
||||
:::{tab-item} C++
|
||||
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegExtra.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: CUSTOM OPERATOR DEFAULT
|
||||
:end-before: LITERALINCLUDE END: CUSTOM OPERATOR DEFAULT
|
||||
:linenos:
|
||||
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegExtra.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: CUSTOM OPERATOR FALLBACK
|
||||
:end-before: LITERALINCLUDE END: CUSTOM OPERATOR FALLBACK
|
||||
:emphasize-lines: 2
|
||||
:linenos:
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
Use `TORCH_LIBRARY_IMPL` to register the `wrapper_custom_abs` implementation for the `custom_abs` operator under `PrivateUse1`. However, because `Autograd` is always enabled in PyTorch, PyTorch looks for and executes a corresponding backward implementation even when only forward computation is required (falling through in the backward implementation). Therefore, we also need to register an implementation of `custom_abs` for `AutogradPrivateUse1`. Fortunately, PyTorch provides a general `Autograd Fallback` named `torch::autograd::autogradNotImplementedFallback`: if only forward computation is involved, it behaves like a fallthrough and selects the next DispatchKey; if a backward computation is requested, it throws an error. These registrations are sketched together below.
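
Condensed into one place, the schema definition, the `PrivateUse1` kernel registration, and the `AutogradPrivateUse1` fallback from the reference implementation look roughly like this (the kernel body itself is elided):

```c++
#include <torch/csrc/autograd/autograd_not_implemented_fallback.h>
#include <torch/library.h>

// Forward-only kernel for the custom operator, implemented by the backend.
at::Tensor wrapper_custom_abs(at::Tensor x);

// 1. Declare the schema in the custom namespace.
TORCH_LIBRARY(openreg, m) {
  m.def("custom_abs(Tensor input) -> Tensor");
}

// 2. Register the PrivateUse1 kernel for that schema.
TORCH_LIBRARY_IMPL(openreg, PrivateUse1, m) {
  m.impl("custom_abs", &wrapper_custom_abs);
}

// 3. Let Autograd fall through for forward-only calls, and raise a clear
//    error if a backward pass is actually requested.
TORCH_LIBRARY_IMPL(_, AutogradPrivateUse1, m) {
  m.fallback(torch::autograd::autogradNotImplementedFallback());
}
```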
|
||||
|
||||
3. **Register Metadata (optional, but required by graph mode, etc.):**
|
||||
|
||||
::::{tab-set-code}
|
||||
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/torch_openreg/openreg/meta.py
|
||||
:language: python
|
||||
:start-after: LITERALINCLUDE START: CUSTOM OPERATOR META
|
||||
:end-before: LITERALINCLUDE END: CUSTOM OPERATOR META
|
||||
:linenos:
|
||||
```
|
||||
|
||||
::::
|
||||
|
||||
PyTorch supports registering `Meta` in both C++ and Python. Since Python registration is simpler, Python is used as the example here. Similar to the `TORCH_LIBRARY_IMPL` macro in C++, Python provides the more user-friendly `torch.library.impl` decorator. A C++ sketch of the equivalent registration under the `Meta` key follows.
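
To keep the new code examples in this note in one language, here is a hedged C++ sketch of the same idea: a shape-inference kernel registered under the `Meta` dispatch key. The `custom_abs_meta` name is illustrative and not part of the reference implementation.

```c++
#include <ATen/ATen.h>
#include <torch/library.h>

// Meta kernel: computes output metadata (shape, dtype, layout) only, without
// touching real device memory. For abs, the output mirrors the input.
at::Tensor custom_abs_meta(const at::Tensor& input) {
  return at::empty_like(input);
}

TORCH_LIBRARY_IMPL(openreg, Meta, m) {
  m.impl("custom_abs", &custom_abs_meta);
}
```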
|
||||
|
||||
### Tools
|
||||
|
||||
Operator registration in PyTorch is complex, with diverse registration methods and numerous scenarios. Therefore, the PyTorch community has provided a number of tools to help developers quickly understand the underlying principles and assist in troubleshooting. Here we briefly introduce several commonly used tools:
|
||||
|
||||
#### Commands
|
||||
|
||||
PyTorch provides a set of helper functions prefixed with `torch._C._dispatch_` around its dispatch machinery. You can list all related interfaces with the following command:
|
||||
|
||||
```Shell
|
||||
python -c 'import torch; print("\n".join([x for x in dir(torch._C) if x.startswith("_dispatch_")]))'
|
||||
|
||||
...
|
||||
_dispatch_dump
|
||||
_dispatch_dump_table
|
||||
_dispatch_has_kernel
|
||||
_dispatch_has_kernel_for_any_dispatch_key
|
||||
_dispatch_has_kernel_for_dispatch_key
|
||||
_dispatch_isTensorSubclassLike
|
||||
_dispatch_is_alias_key
|
||||
_dispatch_is_included_in_alias
|
||||
_dispatch_is_main_interpreter
|
||||
_dispatch_kernel_for_dispatch_key_is_fallthrough
|
||||
_dispatch_key_for_device
|
||||
_dispatch_key_name
|
||||
_dispatch_key_parse
|
||||
_dispatch_key_set
|
||||
...
|
||||
```
|
||||
|
||||
Here are explanations for several commonly used commands:
|
||||
|
||||
* `torch._C._dispatch_key_set`:
|
||||
|
||||
Displays the DispatchKey of the current Tensor, with priority increasing from left to right.
|
||||
|
||||
```Python
|
||||
>>> import torch
|
||||
>>> a = torch.randn(3,3,device="cuda")
|
||||
>>> torch._C._dispatch_key_set(a)
|
||||
'DispatchKeySet(CUDA, ADInplaceOrView, AutogradCUDA, AutocastCUDA)'
|
||||
```
|
||||
|
||||
* `torch._C._dispatch_dump_table`:
|
||||
|
||||
Queries the support status of a given operator across different Dispatch Keys, making it easy to locate the corresponding implementation code.
|
||||
|
||||
```Python
|
||||
>>> import torch
|
||||
>>> print(torch._C._dispatch_dump_table("aten::add.Tensor"))
|
||||
>>> ...
|
||||
CPU: registered at ./build/aten/src/ATen/RegisterCPU_0.cpp:1309 [kernel]
|
||||
CUDA: registered at ./build/aten/src/ATen/RegisterCUDA_0.cpp:2420 [kernel]
|
||||
HIP: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
|
||||
MPS: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
|
||||
IPU: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
|
||||
XPU: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
|
||||
HPU: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
|
||||
VE: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
|
||||
MTIA: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
|
||||
MAIA: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
|
||||
PrivateUse1: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
|
||||
...
|
||||
```
|
||||
|
||||
You can easily query the corresponding implementation of the `aten::add.Tensor` operator on other platforms, so that you can track the entire operator calling process from the source code level.
|
||||
|
||||
#### Environment Variables
|
||||
|
||||
PyTorch also provides some dispatcher-related environment variables that can help with learning and quickly locating issues.
|
||||
|
||||
* TORCH_SHOW_DISPATCH_TRACE
|
||||
|
||||
Displays detailed internal dispatch key scheduling during PyTorch execution.
|
||||
|
||||
```Bash
|
||||
export TORCH_SHOW_DISPATCH_TRACE=1
|
||||
```
|
||||
|
||||
```Python
|
||||
>>> import torch
|
||||
>>> a = torch.randn(3,3)
|
||||
[call] op=[aten::randn], key=[BackendSelect]
|
||||
[redispatch] op=[aten::randn], key=[CPU]
|
||||
[call] op=[aten::empty.memory_format], key=[BackendSelect]
|
||||
[redispatch] op=[aten::empty.memory_format], key=[CPU]
|
||||
[call] op=[aten::normal_], key=[CPU]
|
||||
```
|
||||
|
||||
You can clearly see all the underlying operators called by Python-level operators within PyTorch: including the operator name, calling hierarchy, and corresponding `Dispatch Key`.
|
||||
|
||||
[OpenReg URL]: https://github.com/pytorch/pytorch/tree/main/test/cpp_extensions/open_registration_extension/torch_openreg "OpenReg URL"
|
||||
@ -88,6 +88,7 @@ set(JIT_TEST_SRCS
|
||||
${JIT_TEST_ROOT}/test_subgraph_matcher.cpp
|
||||
${JIT_TEST_ROOT}/test_subgraph_rewriter.cpp
|
||||
${JIT_TEST_ROOT}/test_subgraph_utils.cpp
|
||||
${JIT_TEST_ROOT}/test_te.cpp
|
||||
${JIT_TEST_ROOT}/test_union.cpp
|
||||
${JIT_TEST_ROOT}/test_utils.cpp
|
||||
${JIT_TEST_ROOT}/test_script_profile.cpp
|
||||
|
||||
41
test/cpp/jit/test_te.cpp
Normal file
@ -0,0 +1,41 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <test/cpp/jit/test_utils.h>
|
||||
#include <torch/csrc/jit/ir/irparser.h>
|
||||
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
namespace torch {
|
||||
namespace jit {
|
||||
|
||||
TEST(TETest, RemoveProfiling) {
|
||||
auto g = std::make_shared<Graph>();
|
||||
const auto graph_string = R"IR(
|
||||
graph(%a : Tensor,
|
||||
%b : bool):
|
||||
%1 : None = prim::Constant()
|
||||
%2 : Tensor? = prim::If(%b)
|
||||
block0():
|
||||
%3 : Tensor? = prim::profile[profiled_type=Tensor, seen_none=0](%1)
|
||||
-> (%3)
|
||||
block1():
|
||||
%4 : Tensor = prim::profile[profiled_type=Tensor, seen_none=0](%a)
|
||||
-> (%4)
|
||||
return (%2))IR";
|
||||
torch::jit::parseIR(graph_string, g.get());
|
||||
|
||||
g->lint();
|
||||
RemoveProfileNodesAndSpecializeTypes(g);
|
||||
g->lint();
|
||||
|
||||
testing::FileCheck()
|
||||
.check("prim::Constant")
|
||||
->check("prim::If")
|
||||
->check("block")
|
||||
->check("block")
|
||||
->check("return")
|
||||
->run(*g);
|
||||
}
|
||||
} // namespace jit
|
||||
} // namespace torch
|
||||
@ -60,6 +60,7 @@ torch_openreg/
|
||||
├── __init__.py
|
||||
└── openreg
|
||||
├── __init__.py
|
||||
├── meta.py
|
||||
└── random.py
|
||||
```
|
||||
|
||||
@ -110,35 +111,18 @@ There are 4 DSOs in torch_openreg, and the dependencies between them are as foll
|
||||
|
||||
- Operator Implementation
|
||||
|
||||
- `TORCH_LIBRARY` form
|
||||
- Registering a specific operator for an existing schema: See `empty.memory_format`
|
||||
- Registering an operator with a custom schema
|
||||
- Extending an existing namespace: (TODO)
|
||||
- Custom namespace: See `custom_autograd_fn_returns_self`
|
||||
- Autograd: See `custom_autograd_fn_returns_self`
|
||||
- STUB form: See `abs_stub`
|
||||
|
||||
- Fallback
|
||||
- Register for builtin PyTorch Operators
|
||||
- `TORCH_LIBRARY_IMPL` form: See `empty.memory_format`
|
||||
- `STUB` form: See `abs_stub`
|
||||
- Register for custom operators
|
||||
- Schema Registration: See `custom_abs`
|
||||
- Kernel Registration: See `custom_abs`
|
||||
- Fallback Registration for `AutogradPrivateUse1`: See `custom_abs`
|
||||
- Meta Registration: See `custom_abs`
|
||||
- `torch.autograd.Function`: See `custom_autograd_fn_aliasing`
|
||||
- Register for fallback
|
||||
- Per-operator Fallback: See `sub.Tensor`
|
||||
- Global Fallback: See `wrapper_cpu_fallback`
|
||||
- Per-operator Fallback: (TODO)
|
||||
|
||||
- AMP (TODO)
|
||||
|
||||
### Memory Management
|
||||
|
||||
- Device Memory Management (TODO)
|
||||
- Host Memory Management (TODO)
|
||||
|
||||
### Custom Storage
|
||||
|
||||
- Adding custom device descriptions (TODO)
|
||||
- Serialization support (TODO)
|
||||
|
||||
### Autoload
|
||||
|
||||
- (TODO)
|
||||
|
||||
...
|
||||
|
||||
## Installation and Usage
|
||||
|
||||
@ -177,7 +161,15 @@ print(f"Device of z: {z.device}")
|
||||
|
||||
## Future Plans
|
||||
|
||||
- **Enhance Features**: AMP, memory management, generators, distributed computing, etc. (to reiterate, the fundamental goal is to verify the integration mechanism).
|
||||
- **Enhance Features**:
|
||||
- Autoload
|
||||
- AMP
|
||||
- Device-agnostic APIs
|
||||
- Memory Management
|
||||
- Generator
|
||||
- Distributed
|
||||
- Custom Tensor&Storage
|
||||
- ...
|
||||
- **Improve Tests**: Add more test cases related to the integration mechanism.
|
||||
- **Improve Documentation**: Add a new chapter on third-party device integration in the `Developer Notes` section of the PyTorch documentation.
|
||||
- **Real-time Synchronization**: Keep the code and documentation updated iteratively and in sync.
|
||||
|
||||
@ -3,16 +3,18 @@
|
||||
#include <ATen/native/CPUFallback.h>
|
||||
#include <ATen/native/DispatchStub.h>
|
||||
|
||||
#include <torch/csrc/autograd/autograd_not_implemented_fallback.h>
|
||||
#include <torch/library.h>
|
||||
|
||||
namespace at::openreg {
|
||||
|
||||
namespace {
|
||||
at::Tensor wrapper_quantize_per_tensor(
|
||||
const at::Tensor& self,
|
||||
double scale,
|
||||
int64_t zero_point,
|
||||
at::ScalarType dtype) {
|
||||
return at::native::quantize_per_tensor_openreg(
|
||||
return at::native::openreg::quantize_per_tensor(
|
||||
self, scale, zero_point, dtype);
|
||||
}
|
||||
|
||||
@ -25,10 +27,19 @@ int64_t wrapper__fused_sdp_choice(
|
||||
bool is_causal,
|
||||
std::optional<double> scale,
|
||||
bool enable_gqa) {
|
||||
return at::native::_fused_sdp_choice_openreg(
|
||||
return at::native::openreg::_fused_sdp_choice(
|
||||
query, key, value, attn_mask, dropout_p, is_causal, scale, enable_gqa);
|
||||
}
|
||||
|
||||
void wrapper_quantize_tensor_per_tensor_affine_stub(
|
||||
const at::Tensor& rtensor,
|
||||
at::Tensor& qtensor,
|
||||
double scale,
|
||||
int64_t zero_point) {
|
||||
at::native::openreg::quantize_tensor_per_tensor_affine_stub(
|
||||
rtensor, qtensor, scale, zero_point);
|
||||
}
|
||||
|
||||
std::tuple<
|
||||
at::Tensor,
|
||||
at::Tensor,
|
||||
@ -48,7 +59,7 @@ wrapper__scaled_dot_product_fused_attention_overrideable(
|
||||
bool is_causal,
|
||||
bool return_debug_mask,
|
||||
std::optional<double> scale) {
|
||||
return at::native::_scaled_dot_product_fused_attention_overrideable_openreg(
|
||||
return at::native::openreg::_scaled_dot_product_fused_attention_overrideable(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
@ -78,8 +89,8 @@ wrapper_scaled_dot_product_fused_attention_overrideable_backward(
|
||||
const at::Tensor& philox_seed,
|
||||
const at::Tensor& philox_offset,
|
||||
std::optional<double> scale) {
|
||||
return at::native::
|
||||
_scaled_dot_product_fused_attention_overrideable_backward_openreg(
|
||||
return at::native::openreg::
|
||||
_scaled_dot_product_fused_attention_overrideable_backward(
|
||||
grad_out,
|
||||
query,
|
||||
key,
|
||||
@ -99,7 +110,66 @@ wrapper_scaled_dot_product_fused_attention_overrideable_backward(
|
||||
scale);
|
||||
}
|
||||
|
||||
at::Tensor wrapper_custom_autograd_fn_returns_self(at::Tensor x) {
|
||||
return at::native::openreg::custom_autograd_fn_returns_self(x);
|
||||
}
|
||||
|
||||
at::Tensor wrapper_custom_autograd_fn_aliasing(at::Tensor x) {
|
||||
return at::native::openreg::custom_autograd_fn_aliasing(x);
|
||||
}
|
||||
|
||||
at::Tensor& wrapper_abs_out(const at::Tensor& self, at::Tensor& out) {
|
||||
return at::native::openreg::abs_out(self, out);
|
||||
}
|
||||
|
||||
void wrapper_abs_stub(at::TensorIteratorBase& iter) {
|
||||
at::native::openreg::abs_kernel(iter);
|
||||
}
|
||||
|
||||
at::Tensor wrapper_custom_abs(at::Tensor x) {
|
||||
return at::native::openreg::custom_abs(x);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
using namespace at::native;
|
||||
// Registration via STUB
|
||||
// LITERALINCLUDE START: STUB DEFAULT
|
||||
REGISTER_PRIVATEUSE1_DISPATCH(abs_stub, &wrapper_abs_stub);
|
||||
REGISTER_PRIVATEUSE1_DISPATCH(
|
||||
quantize_tensor_per_tensor_affine_stub,
|
||||
&wrapper_quantize_tensor_per_tensor_affine_stub);
|
||||
REGISTER_PRIVATEUSE1_DISPATCH(
|
||||
_fused_sdp_choice_stub,
|
||||
&wrapper__fused_sdp_choice);
|
||||
// LITERALINCLUDE END: STUB DEFAULT
|
||||
|
||||
// Registration of custom operators
|
||||
// LITERALINCLUDE START: CUSTOM OPERATOR SCHEMA
|
||||
TORCH_LIBRARY(openreg, m) {
|
||||
m.def("custom_abs(Tensor input)-> Tensor");
|
||||
}
|
||||
// LITERALINCLUDE END: CUSTOM OPERATOR SCHEMA
|
||||
|
||||
// LITERALINCLUDE START: CUSTOM OPERATOR DEFAULT
|
||||
TORCH_LIBRARY_IMPL(openreg, PrivateUse1, m) {
|
||||
m.impl("custom_abs", &wrapper_custom_abs);
|
||||
}
|
||||
// LITERALINCLUDE END: CUSTOM OPERATOR DEFAULT
|
||||
|
||||
// LITERALINCLUDE START: CUSTOM OPERATOR FALLBACK
|
||||
TORCH_LIBRARY_IMPL(_, AutogradPrivateUse1, m) {
|
||||
m.fallback(torch::autograd::autogradNotImplementedFallback());
|
||||
}
|
||||
// LITERALINCLUDE END: CUSTOM OPERATOR FALLBACK
|
||||
|
||||
// The rest is for testing purposes
|
||||
TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
|
||||
/*
|
||||
abs_stub only works if abs.out is also registered with PrivateUse1, because
|
||||
abs.default is designed to redirect directly to abs.out, which calls
|
||||
abs_stub.
|
||||
*/
|
||||
m.impl("abs.out", &wrapper_abs_out);
|
||||
m.impl("quantize_per_tensor", &wrapper_quantize_per_tensor);
|
||||
m.impl("_fused_sdp_choice", &wrapper__fused_sdp_choice);
|
||||
m.impl(
|
||||
@ -110,10 +180,7 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
|
||||
&wrapper_scaled_dot_product_fused_attention_overrideable_backward);
|
||||
}
|
||||
|
||||
} // namespace at::openreg
|
||||
|
||||
namespace at::openreg {
|
||||
TORCH_LIBRARY(openreg, m) {
|
||||
TORCH_LIBRARY_FRAGMENT(openreg, m) {
|
||||
m.def("custom_autograd_fn_returns_self(Tensor input)-> Tensor");
|
||||
m.def("custom_autograd_fn_aliasing(Tensor(a) input)-> Tensor(a)");
|
||||
}
|
||||
@ -121,18 +188,8 @@ TORCH_LIBRARY(openreg, m) {
|
||||
TORCH_LIBRARY_IMPL(openreg, AutogradPrivateUse1, m) {
|
||||
m.impl(
|
||||
"custom_autograd_fn_returns_self",
|
||||
&at::native::custom_autograd_fn_returns_self);
|
||||
m.impl(
|
||||
"custom_autograd_fn_aliasing", &at::native::custom_autograd_fn_aliasing);
|
||||
&wrapper_custom_autograd_fn_returns_self);
|
||||
m.impl("custom_autograd_fn_aliasing", &wrapper_custom_autograd_fn_aliasing);
|
||||
}
|
||||
} // namespace at::openreg
|
||||
|
||||
namespace at::native {
|
||||
REGISTER_PRIVATEUSE1_DISPATCH(abs_stub, &abs_kernel_openreg);
|
||||
REGISTER_PRIVATEUSE1_DISPATCH(
|
||||
quantize_tensor_per_tensor_affine_stub,
|
||||
&quantize_tensor_per_tensor_affine_stub_openreg);
|
||||
REGISTER_PRIVATEUSE1_DISPATCH(
|
||||
_fused_sdp_choice_stub,
|
||||
&_fused_sdp_choice_openreg);
|
||||
} // namespace at::native
|
||||
} // namespace at::openreg
|
||||
|
||||
@ -7,6 +7,9 @@
|
||||
|
||||
namespace at::openreg {
|
||||
|
||||
namespace {
|
||||
|
||||
// LITERALINCLUDE START: EMPTY.MEMORY_FORMAT WRAPPER
|
||||
at::Tensor wrapper_empty_memory_format(
|
||||
c10::IntArrayRef size,
|
||||
std::optional<c10::ScalarType> dtype_opt,
|
||||
@ -14,7 +17,7 @@ at::Tensor wrapper_empty_memory_format(
|
||||
std::optional<c10::Device> device_opt,
|
||||
std::optional<bool> pin_memory_opt,
|
||||
std::optional<c10::MemoryFormat> memory_format_opt) {
|
||||
return at::native::empty_memory_format_openreg(
|
||||
return at::native::openreg::empty_memory_format(
|
||||
size,
|
||||
dtype_opt,
|
||||
layout_opt,
|
||||
@ -22,6 +25,7 @@ at::Tensor wrapper_empty_memory_format(
|
||||
pin_memory_opt,
|
||||
memory_format_opt);
|
||||
}
|
||||
// LITERALINCLUDE END: EMPTY.MEMORY_FORMAT WRAPPER
|
||||
|
||||
at::Tensor wrapper_empty_strided(
|
||||
c10::IntArrayRef size,
|
||||
@ -30,7 +34,7 @@ at::Tensor wrapper_empty_strided(
|
||||
std::optional<c10::Layout> layout_opt,
|
||||
std::optional<c10::Device> device_opt,
|
||||
std::optional<bool> pin_memory_opt) {
|
||||
return at::native::empty_strided_openreg(
|
||||
return at::native::openreg::empty_strided(
|
||||
size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt);
|
||||
}
|
||||
|
||||
@ -39,48 +43,48 @@ at::Tensor wrapper_as_strided(
|
||||
c10::SymIntArrayRef size,
|
||||
c10::SymIntArrayRef stride,
|
||||
std::optional<c10::SymInt> storage_offset) {
|
||||
return at::native::as_strided_openreg(self, size, stride, storage_offset);
|
||||
return at::native::openreg::as_strided(self, size, stride, storage_offset);
|
||||
}
|
||||
|
||||
const at::Tensor& wrapper_resize_(
|
||||
const at::Tensor& self,
|
||||
c10::SymIntArrayRef size,
|
||||
::std::optional<at::MemoryFormat> memory_format) {
|
||||
return at::native::resize_openreg_(self, size, memory_format);
|
||||
return at::native::openreg::resize_(self, size, memory_format);
|
||||
}
|
||||
|
||||
at::Tensor wrapper__reshape_alias(
|
||||
const at::Tensor& self,
|
||||
c10::SymIntArrayRef size,
|
||||
c10::SymIntArrayRef stride) {
|
||||
return at::native::_reshape_alias_openreg(self, size, stride);
|
||||
return at::native::openreg::_reshape_alias(self, size, stride);
|
||||
}
|
||||
|
||||
at::Tensor wrapper__copy_from(
|
||||
const at::Tensor& self,
|
||||
const at::Tensor& dst,
|
||||
bool non_blocking) {
|
||||
return at::native::_copy_from_openreg(self, dst, non_blocking);
|
||||
return at::native::openreg::_copy_from(self, dst, non_blocking);
|
||||
}
|
||||
|
||||
at::Tensor wrapper__copy_from_and_resize(
|
||||
const at::Tensor& self,
|
||||
const at::Tensor& dst) {
|
||||
return at::native::_copy_from_and_resize_openreg(self, dst);
|
||||
return at::native::openreg::_copy_from_and_resize(self, dst);
|
||||
}
|
||||
|
||||
at::Scalar wrapper__local_scalar_densor(const at::Tensor& self) {
|
||||
return at::native::_local_scalar_dense_openreg(self);
|
||||
return at::native::openreg::_local_scalar_dense(self);
|
||||
}
|
||||
|
||||
at::Tensor& wrapper_set_source_Tensor_(
|
||||
at::Tensor& self,
|
||||
const at::Tensor& source) {
|
||||
return at::native::set_source_Tensor_openreg_(self, source);
|
||||
return at::native::openreg::set_source_Tensor_(self, source);
|
||||
}
|
||||
|
||||
at::Tensor& wrapper_set_source_Storage_(at::Tensor& self, at::Storage source) {
|
||||
return at::native::set_source_Storage_openreg_(self, source);
|
||||
return at::native::openreg::set_source_Storage_(self, source);
|
||||
}
|
||||
|
||||
at::Tensor& wrapper_set_source_Storage_storage_offsetset_(
|
||||
@ -89,14 +93,25 @@ at::Tensor& wrapper_set_source_Storage_storage_offsetset_(
|
||||
int64_t storage_offset,
|
||||
c10::IntArrayRef size,
|
||||
c10::IntArrayRef stride) {
|
||||
return at::native::set_source_Storage_storage_offset_openreg_(
|
||||
return at::native::openreg::set_source_Storage_storage_offset_(
|
||||
result, storage, storage_offset, size, stride);
|
||||
}
|
||||
|
||||
at::Tensor wrapper_view(const at::Tensor& self, c10::SymIntArrayRef size) {
|
||||
return at::native::view_openreg(self, size);
|
||||
return at::native::openreg::view(self, size);
|
||||
}
|
||||
|
||||
// LITERALINCLUDE START: FALLBACK WRAPPER
|
||||
void wrapper_cpu_fallback(
|
||||
const c10::OperatorHandle& op,
|
||||
torch::jit::Stack* stack) {
|
||||
at::native::openreg::cpu_fallback(op, stack);
|
||||
}
|
||||
// LITERALINCLUDE END: FALLBACK WRAPPER
|
||||
|
||||
} // namespace
|
||||
|
||||
// LITERALINCLUDE START: TORCH_LIBRARY_IMPL DEFAULT
|
||||
TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
|
||||
m.impl("empty.memory_format", wrapper_empty_memory_format);
|
||||
m.impl("empty_strided", wrapper_empty_strided);
|
||||
@ -113,16 +128,21 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
|
||||
wrapper_set_source_Storage_storage_offsetset_);
|
||||
m.impl("view", wrapper_view);
|
||||
}
|
||||
// LITERALINCLUDE END: TORCH_LIBRARY_IMPL DEFAULT
|
||||
|
||||
void wrapper_cpu_fallback(
|
||||
const c10::OperatorHandle& op,
|
||||
torch::jit::Stack* stack) {
|
||||
at::native::cpu_fallback_openreg(op, stack);
|
||||
}
|
||||
|
||||
// LITERALINCLUDE START: FALLBACK GLOBAL
|
||||
TORCH_LIBRARY_IMPL(_, PrivateUse1, m) {
|
||||
m.fallback(
|
||||
torch::CppFunction::makeFromBoxedFunction<&wrapper_cpu_fallback>());
|
||||
}
|
||||
// LITERALINCLUDE END: FALLBACK GLOBAL
|
||||
|
||||
// LITERALINCLUDE START: FALLBACK SINGLE
|
||||
TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
|
||||
m.impl(
|
||||
"sub.Tensor",
|
||||
torch::CppFunction::makeFromBoxedFunction<&wrapper_cpu_fallback>());
|
||||
}
|
||||
// LITERALINCLUDE END: FALLBACK SINGLE
|
||||
|
||||
} // namespace at::openreg

@ -10,6 +10,7 @@
#include <ATen/native/transformers/sdp_utils_cpp.h>
#include <ATen/ops/_local_scalar_dense_native.h>
#include <ATen/ops/_reshape_alias_native.h>
#include <ATen/ops/abs_native.h>
#include <ATen/ops/as_strided_cpu_dispatch.h>
#include <ATen/ops/copy_native.h>
#include <ATen/ops/quantize_per_tensor_native.h>
@ -24,26 +25,18 @@

#include <c10/core/Allocator.h>

#include <set>

#include <include/openreg.h>

namespace at::native {
namespace at::native::openreg {

class MemoryGuard {
 public:
  explicit MemoryGuard(const torch::jit::Stack& stack) {
    for (const c10::IValue& ivalue : stack) {
      find_and_unprotect_tensors(ivalue);
    }
  }

  template <typename... Args>
  explicit MemoryGuard(const Args&... args) {
    (handler(args), ...);
    (find_and_unprotect_tensors(args), ...);
  }

  ~MemoryGuard() {
  ~MemoryGuard() noexcept {
    for (void* ptr : unprotected_pointers_) {
      orMemoryProtect(ptr);
    }
@ -55,26 +48,31 @@ class MemoryGuard {
  MemoryGuard& operator=(MemoryGuard&&) = delete;

 private:
  void find_and_unprotect_tensors(const c10::IValue& ivalue) {
    if (ivalue.isTensor()) {
      unprotect_if_needed(ivalue.toTensor());
    } else if (ivalue.isTensorList()) {
      for (const at::Tensor& tensor : ivalue.toTensorList()) {
        unprotect_if_needed(tensor);
      }
    } else if (ivalue.isList()) {
      for (const c10::IValue& element : ivalue.toListRef()) {
        find_and_unprotect_tensors(element);
      }
    } else if (ivalue.isGenericDict()) {
      for (const auto& pair : ivalue.toGenericDict()) {
        find_and_unprotect_tensors(pair.key());
        find_and_unprotect_tensors(pair.value());
  template <typename T>
  void find_and_unprotect_tensors(const T& item) {
    if constexpr (std::is_base_of_v<at::TensorBase, T>) {
      unprotect_if_needed(item);
    } else if constexpr (std::is_same_v<T, c10::IValue>) {
      if (item.isTensor()) {
        unprotect_if_needed(item.toTensor());
      } else if (item.isTensorList()) {
        for (const at::Tensor& tensor : item.toTensorListRef()) {
          unprotect_if_needed(tensor);
        }
      } else if (item.isList()) {
        for (const c10::IValue& element : item.toListRef()) {
          find_and_unprotect_tensors(element);
        }
      } else if (item.isGenericDict()) {
        for (const auto& [key, value] : item.toGenericDict()) {
          find_and_unprotect_tensors(key);
          find_and_unprotect_tensors(value);
        }
      }
    }
  }

  void unprotect_if_needed(const at::Tensor& tensor) {
  void unprotect_if_needed(const at::TensorBase& tensor) {
    if (!tensor.defined() || !tensor.has_storage()) {
      return;
    }
@ -82,25 +80,18 @@ class MemoryGuard {
    void* ptr = tensor.data_ptr();
    orPointerAttributes attr;

    if (orPointerGetAttributes(&attr, ptr) == orSuccess) {
      if (attr.type == orMemoryTypeDevice) {
        if (unprotected_pointers_.find(attr.pointer) ==
            unprotected_pointers_.end()) {
          orMemoryUnprotect(attr.pointer);
          unprotected_pointers_.insert(attr.pointer);
        }
      }
    if (orPointerGetAttributes(&attr, ptr) != orSuccess ||
        attr.type != orMemoryTypeDevice) {
      return;
    }

    auto [it, inserted] = unprotected_pointers_.insert(attr.pointer);
    if (inserted) {
      orMemoryUnprotect(attr.pointer);
    }
  }

  template <typename T>
  void handler(const T& x) {
    if constexpr (std::is_same_v<std::decay_t<T>, at::Tensor>) {
      unprotect_if_needed(x);
    }
  }

  std::set<void*> unprotected_pointers_;
  std::unordered_set<void*> unprotected_pointers_;
};

} // namespace at::native
} // namespace at::native::openreg
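
The guard above unprotects every openreg device allocation reachable from its arguments on construction and re-protects it on destruction, so host-side code may dereference data_ptr() in between. A minimal usage sketch (the helper function below is hypothetical and not part of the diff):

#include <ATen/ATen.h>

// Hypothetical helper, for illustration only: fills a float openreg tensor on
// the host. MemoryGuard lifts the openreg memory protection for this scope and
// restores it when the guard goes out of scope.
void fill_with_ones(at::Tensor& t) {
  at::native::openreg::MemoryGuard guard(t); // orMemoryUnprotect on t's storage
  float* data = static_cast<float*>(t.data_ptr());
  for (int64_t i = 0; i < t.numel(); ++i) {
    data[i] = 1.0f;
  }
} // ~MemoryGuard calls orMemoryProtect again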

@ -1,8 +1,8 @@
#include "Extra.h"

namespace at::native {
namespace at::native::openreg {

at::Tensor quantize_per_tensor_openreg(
at::Tensor quantize_per_tensor(
    const at::Tensor& self,
    double scale,
    int64_t zero_point,
@ -10,7 +10,7 @@ at::Tensor quantize_per_tensor_openreg(
  return at::native::quantize_per_tensor(self, scale, zero_point, dtype);
}

int64_t _fused_sdp_choice_openreg(
int64_t _fused_sdp_choice(
    const at::Tensor& query,
    const at::Tensor& key,
    const at::Tensor& value,
@ -23,6 +23,12 @@ int64_t _fused_sdp_choice_openreg(
  return static_cast<int64_t>(backend);
}

void quantize_tensor_per_tensor_affine_stub(
    const at::Tensor& rtensor,
    at::Tensor& qtensor,
    double scale,
    int64_t zero_point) {}

std::tuple<
    at::Tensor,
    at::Tensor,
@ -33,7 +39,7 @@ std::tuple<
    at::Tensor,
    at::Tensor,
    at::Tensor>
_scaled_dot_product_fused_attention_overrideable_openreg(
_scaled_dot_product_fused_attention_overrideable(
    const at::Tensor& query,
    const at::Tensor& key,
    const at::Tensor& value,
@ -72,7 +78,7 @@ _scaled_dot_product_fused_attention_overrideable_openreg(
}

std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
_scaled_dot_product_fused_attention_overrideable_backward_openreg(
_scaled_dot_product_fused_attention_overrideable_backward(
    const at::Tensor& grad_out,
    const at::Tensor& query,
    const at::Tensor& key,
@ -97,104 +103,6 @@ _scaled_dot_product_fused_attention_overrideable_backward_openreg(
      at::empty_like(attn_bias));
}

} // namespace at::native

namespace at::native {

void abs_kernel_openreg(at::TensorIteratorBase& iter) {
  // Abs only has an input tensor and an output tensor.
  auto& output_operand = iter.operand(0);
  auto& input_operand = iter.operand(1);
  auto& output_tensor_base = output_operand.tensor_base();
  auto& input_tensor_base = input_operand.tensor_base();
  TORCH_CHECK(
      !input_operand.original_tensor_base().defined(),
      "input original tensor is defined.");
  TORCH_CHECK(
      !output_operand.original_tensor_base().defined(),
      "output original tensor is defined.");
  // For easy testing, only accept contiguous input tensors for the calculation.
  auto memory_format = input_tensor_base.suggest_memory_format();
  TORCH_CHECK(
      input_tensor_base.is_contiguous(memory_format),
      "Input tensor need be contiguous.");
  // Add necessary restrictions to ensure the security of the demo.
  TORCH_CHECK(
      input_tensor_base.sizes() == output_tensor_base.sizes(),
      "Intput and output tensor size are not equal.");
  // Common dtype is calculated in TensorIteratorBase.
  TORCH_CHECK(
      iter.common_dtype() == at::ScalarType::Float, "Only support float type.")
  // Use a for loop for the abs calculation.
  auto abs_function =
      [](float* output_ptr, const float* input_ptr, const int64_t NUM) {
        for (int64_t i = 0; i < NUM; ++i) {
          *(output_ptr + i) = std::abs(*(input_ptr + i));
        }
      };
  // To simplify the logic of the test demo code,
  // we only use contiguous tensors to calculate on the device side,
  // and we use the input tensor's memory format.
  if (iter.is_contiguous()) {
    // Check the will_resize flag. You can convert to a different
    // tensor memory format when will_resize is True.
    // If the TensorIteratorConfig resize_outputs_ flag is true, there are two
    // situations:
    // 1) Out tensor is undefined, and TensorIterator sets will_resize to true;
    // 2) Out tensor is defined and its size is not equal to the input tensor
    // size;
    // TensorIterator then sets will_resize to true and calls
    // set_output_raw_strided to resize the output tensor.
    // When the output operand will_resize flag is true, a dummy
    // device can convert the tensor to the dummy device's preferred memory format.
    // Here we don't convert the tensor memory format, because it becomes
    // complex when the dummy device wants to keep the same memory format while
    // training a network.
    TORCH_CHECK(
        output_operand.will_resize,
        "output operand will_resize flag need be True.");
    abs_function(
        (float*)iter.data_ptr(0), (float*)iter.data_ptr(1), iter.numel());
  } else {
    // Strided copy is not supported for the foo device, so use the cpu device instead.
    // For the abs op, the remaining situation is: the output tensor is not contiguous
    // and the operand's will_resize is False.
    TORCH_CHECK(
        !output_operand.will_resize, "output operand will_resize is True.");
    // Get a contiguous tensor with the input memory format.
    at::Tensor output = at::empty(
        output_tensor_base.sizes(),
        input_tensor_base.options().memory_format(memory_format));
    // For a structured op which inherits from TensorIteratorBase, you may need
    // to call set_output_raw_strided to update the output stored in the op
    // structure. The abs op does not need to do this.
    output_operand.exchange_tensor(
        c10::MaybeOwned<at::TensorBase>::owned(std::in_place, output));
    abs_function(
        (float*)output_operand.tensor_base().mutable_data_ptr(),
        (float*)iter.data_ptr(1),
        iter.numel());
    // Copy the tensor base back to the original tensor base, keeping the same
    // scalar type and stride as cpu and gpu.
    if (output_operand.original_tensor_base().defined() &&
        !output_operand.original_tensor_base().is_same(
            output_operand.tensor_base())) {
      output_operand.original_tensor().copy_(output_operand.tensor());
      output_operand.restore_original_tensor();
    }
  }
}

void quantize_tensor_per_tensor_affine_stub_openreg(
    const at::Tensor& rtensor,
    at::Tensor& qtensor,
    double scale,
    int64_t zero_point) {}

} // namespace at::native

namespace at::native {

namespace {
struct CustomAutogradFnReturnsSelf
    : public torch::autograd::Function<CustomAutogradFnReturnsSelf> {
@ -235,4 +143,68 @@ at::Tensor custom_autograd_fn_aliasing(at::Tensor x) {
  return CustomAutogradFnAliasing::apply(x);
}

} // namespace at::native
/*
This implementation is only used to test stub registration, so not all
capabilities are fully supported.

Current Limitations:
- dtype: Float only
- input tensor: must be contiguous layout
*/
// LITERALINCLUDE START: STUB ABS
void abs_kernel(at::TensorIteratorBase& iter) {
  TORCH_CHECK(iter.ntensors() == 2, "Abs kernel expects 2 tensors");
  TORCH_CHECK(
      iter.common_dtype() == at::ScalarType::Float,
      "Abs kernel only supports float type");

  auto& output_tensor = iter.tensor(0);
  auto& input_tensor = iter.tensor(1);

  TORCH_CHECK(
      input_tensor.sizes() == output_tensor.sizes(),
      "Input and output tensor sizes must match.");

  auto abs_loop = [](float* out_ptr, const float* in_ptr, int64_t n) {
    for (int64_t i = 0; i < n; ++i) {
      out_ptr[i] = std::abs(in_ptr[i]);
    }
  };

  MemoryGuard guard(input_tensor, output_tensor);

  if (iter.is_contiguous()) {
    abs_loop(
        static_cast<float*>(iter.data_ptr(0)),
        static_cast<float*>(iter.data_ptr(1)),
        iter.numel());
  } else {
    TORCH_CHECK(
        input_tensor.is_contiguous(), "Input tensor must be contiguous.")

    auto output = at::empty(
        input_tensor.sizes(),
        input_tensor.options().memory_format(
            input_tensor.suggest_memory_format()));

    MemoryGuard guard(output);

    abs_loop(
        static_cast<float*>(output.data_ptr()),
        static_cast<float*>(iter.data_ptr(1)),
        iter.numel());

    output_tensor.copy_(output);
  }
}
// LITERALINCLUDE END: STUB ABS

at::Tensor& abs_out(const at::Tensor& self, at::Tensor& out) {
  return at::native::abs_out(self, out);
}

at::Tensor custom_abs(at::Tensor x) {
  return at::abs(x);
}

} // namespace at::native::openreg
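
The kernels above are presumably wired into ATen's dispatch-stub mechanism elsewhere in the extension; a sketch of what such a registration typically looks like for a PrivateUse1 backend (the exact form and location in torch_openreg may differ):

#include <ATen/native/DispatchStub.h>
#include <ATen/native/UnaryOps.h>
#include <ATen/native/quantized/AffineQuantizer.h>

// Sketch only: hook the stub kernels defined above into the existing ATen
// dispatch stubs under the PrivateUse1 key.
namespace at::native {
REGISTER_PRIVATEUSE1_DISPATCH(abs_stub, &openreg::abs_kernel);
REGISTER_PRIVATEUSE1_DISPATCH(
    quantize_tensor_per_tensor_affine_stub,
    &openreg::quantize_tensor_per_tensor_affine_stub);
} // namespace at::native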

@ -1,12 +1,13 @@
#include "Common.h"

namespace at::native {
at::Tensor quantize_per_tensor_openreg(
namespace at::native::openreg {

at::Tensor quantize_per_tensor(
    const at::Tensor& self,
    double scale,
    int64_t zero_point,
    at::ScalarType dtype);
int64_t _fused_sdp_choice_openreg(
int64_t _fused_sdp_choice(
    const at::Tensor& query,
    const at::Tensor& key,
    const at::Tensor& value,
@ -15,6 +16,11 @@ int64_t _fused_sdp_choice_openreg(
    bool is_causal,
    std::optional<double> scale,
    bool enable_gqa);
void quantize_tensor_per_tensor_affine_stub(
    const at::Tensor& rtensor,
    at::Tensor& qtensor,
    double scale,
    int64_t zero_point);
std::tuple<
    at::Tensor,
    at::Tensor,
@ -25,7 +31,7 @@ std::tuple<
    at::Tensor,
    at::Tensor,
    at::Tensor>
_scaled_dot_product_fused_attention_overrideable_openreg(
_scaled_dot_product_fused_attention_overrideable(
    const at::Tensor& query,
    const at::Tensor& key,
    const at::Tensor& value,
@ -35,7 +41,7 @@ _scaled_dot_product_fused_attention_overrideable_openreg(
    bool return_debug_mask,
    std::optional<double> scale);
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
_scaled_dot_product_fused_attention_overrideable_backward_openreg(
_scaled_dot_product_fused_attention_overrideable_backward(
    const at::Tensor& grad_out,
    const at::Tensor& query,
    const at::Tensor& key,
@ -53,18 +59,11 @@ _scaled_dot_product_fused_attention_overrideable_backward_openreg(
    const at::Tensor& philox_seed,
    const at::Tensor& philox_offset,
    std::optional<double> scale);
} // namespace at::native

namespace at::native {
void abs_kernel_openreg(at::TensorIteratorBase& iter);
void quantize_tensor_per_tensor_affine_stub_openreg(
    const at::Tensor& rtensor,
    at::Tensor& qtensor,
    double scale,
    int64_t zero_point);
} // namespace at::native

namespace at::native {
at::Tensor custom_autograd_fn_returns_self(at::Tensor x);
at::Tensor custom_autograd_fn_aliasing(at::Tensor x);
} // namespace at::native
at::Tensor& abs_out(const at::Tensor& self, at::Tensor& out);
void abs_kernel(at::TensorIteratorBase& iter);
at::Tensor custom_abs(at::Tensor x);

} // namespace at::native::openreg

@ -1,8 +1,11 @@
#include "Minimal.h"

namespace at::native {
#include <unordered_set>

at::Tensor empty_memory_format_openreg(
namespace at::native::openreg {

// LITERALINCLUDE START: EMPTY.MEMORY_FORMAT IMPL
at::Tensor empty_memory_format(
    c10::IntArrayRef size,
    std::optional<c10::ScalarType> dtype_opt,
    std::optional<c10::Layout> layout_opt,
@ -24,8 +27,9 @@ at::Tensor empty_memory_format_openreg(
  return at::detail::empty_generic(
      size, allocator, pu1_dks, dtype, memory_format_opt);
}
// LITERALINCLUDE END: EMPTY.MEMORY_FORMAT IMPL

at::Tensor empty_strided_openreg(
at::Tensor empty_strided(
    c10::IntArrayRef size,
    c10::IntArrayRef stride,
    std::optional<c10::ScalarType> dtype_opt,
@ -48,7 +52,7 @@ at::Tensor empty_strided_openreg(
      size, stride, allocator, pu1_dks, dtype);
}

at::Tensor as_strided_openreg(
at::Tensor as_strided(
    const at::Tensor& self,
    c10::SymIntArrayRef size,
    c10::SymIntArrayRef stride,
@ -58,7 +62,7 @@ at::Tensor as_strided_openreg(
  return at::cpu::as_strided_symint(self, size, stride, storage_offset);
}

const at::Tensor& resize_openreg_(
const at::Tensor& resize_(
    const at::Tensor& self,
    c10::SymIntArrayRef size,
    ::std::optional<at::MemoryFormat> memory_format) {
@ -66,7 +70,7 @@ const at::Tensor& resize_openreg_(
      self, C10_AS_INTARRAYREF_SLOW(size), memory_format);
}

at::Tensor _reshape_alias_openreg(
at::Tensor _reshape_alias(
    const at::Tensor& self,
    c10::SymIntArrayRef size,
    c10::SymIntArrayRef stride) {
@ -74,7 +78,7 @@ at::Tensor _reshape_alias_openreg(
      self, C10_AS_INTARRAYREF_SLOW(size), C10_AS_INTARRAYREF_SLOW(stride));
}

at::Tensor _copy_from_openreg(
at::Tensor _copy_from(
    const at::Tensor& self,
    const at::Tensor& dst,
    bool non_blocking) {
@ -124,50 +128,58 @@ at::Tensor _copy_from_openreg(
  return dst;
}

at::Tensor _copy_from_and_resize_openreg(
at::Tensor _copy_from_and_resize(
    const at::Tensor& self,
    const at::Tensor& dst) {
  at::native::resize_(dst, self.sizes(), std::nullopt);

  MemoryGuard guard(self, dst);

  return at::native::copy_(const_cast<at::Tensor&>(dst), self, false);
}

at::Scalar _local_scalar_dense_openreg(const at::Tensor& self) {
at::Scalar _local_scalar_dense(const at::Tensor& self) {
  MemoryGuard guard(self);
  return at::native::_local_scalar_dense_cpu(self);
}

at::Tensor& set_source_Tensor_openreg_(
    at::Tensor& self,
    const at::Tensor& source) {
at::Tensor& set_source_Tensor_(at::Tensor& self, const at::Tensor& source) {
  return at::native::set_tensor_(self, source);
}

at::Tensor& set_source_Storage_openreg_(at::Tensor& self, at::Storage source) {
at::Tensor& set_source_Storage_(at::Tensor& self, at::Storage source) {
  return at::native::set_(self, source);
}

at::Tensor& set_source_Storage_storage_offset_openreg_(
at::Tensor& set_source_Storage_storage_offset_(
    at::Tensor& result,
    at::Storage storage,
    int64_t storage_offset,
    c10::IntArrayRef size,
    c10::IntArrayRef stride) {
  // call native::
  return at::cpu::set_(result, storage, storage_offset, size, stride);
}

at::Tensor view_openreg(const at::Tensor& self, c10::SymIntArrayRef size) {
at::Tensor view(const at::Tensor& self, c10::SymIntArrayRef size) {
  MemoryGuard guard(self);
  return at::native::view(self, C10_AS_INTARRAYREF_SLOW(size));
}

void cpu_fallback_openreg(
    const c10::OperatorHandle& op,
    torch::jit::Stack* stack) {
  at::native::cpu_fallback(op, stack);
}
// LITERALINCLUDE START: FALLBACK IMPL
void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
  static const std::unordered_set<c10::OperatorName> cpu_fallback_blacklist = {
      c10::OperatorName("aten::abs", ""),
      c10::OperatorName("aten::abs", "out"),
  };

} // namespace at::native
  const auto& op_name = op.schema().operator_name();
  if (cpu_fallback_blacklist.count(op_name)) {
    TORCH_CHECK(
        false,
        "Operator '",
        op_name,
        "' is not implemented for device openreg.");
  } else {
    at::native::cpu_fallback(op, stack);
  }
}
// LITERALINCLUDE END: FALLBACK IMPL

} // namespace at::native::openreg
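
Roughly speaking (illustration, not part of the diff): with this blacklist, operators that lack an openreg kernel are still routed through at::native::cpu_fallback, while the listed ops fail loudly so the backend's own implementations cannot be silently bypassed. A small sketch of the resulting behavior, assuming a tensor that lives on the openreg device:

#include <ATen/ATen.h>

// Illustration only: behavior of the selective CPU fallback above.
void fallback_behavior(const at::Tensor& t /* openreg tensor */) {
  // aten::sub has no openreg kernel and is not blacklisted, so the boxed
  // fallback copies inputs to CPU, runs the CPU kernel, and copies back.
  at::Tensor ok = t.sub(t);

  // aten::abs and aten::abs.out are blacklisted: if they ever reached
  // cpu_fallback, the TORCH_CHECK above would throw instead of falling back.
  // In this extension they are expected to be served by the native abs stub.
}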

@ -1,8 +1,8 @@
#include "Common.h"

namespace at::native {
namespace at::native::openreg {

at::Tensor empty_memory_format_openreg(
at::Tensor empty_memory_format(
    c10::IntArrayRef size,
    std::optional<c10::ScalarType> dtype_opt,
    std::optional<c10::Layout> layout_opt,
@ -10,7 +10,7 @@ at::Tensor empty_memory_format_openreg(
    std::optional<bool> pin_memory_opt,
    std::optional<c10::MemoryFormat> memory_format_opt);

at::Tensor empty_strided_openreg(
at::Tensor empty_strided(
    c10::IntArrayRef size,
    c10::IntArrayRef stride,
    std::optional<c10::ScalarType> dtype_opt,
@ -18,50 +18,44 @@ at::Tensor empty_strided_openreg(
    std::optional<c10::Device> device_opt,
    std::optional<bool> pin_memory_opt);

at::Tensor as_strided_openreg(
at::Tensor as_strided(
    const at::Tensor& self,
    c10::SymIntArrayRef size,
    c10::SymIntArrayRef stride,
    std::optional<c10::SymInt> storage_offset);

const at::Tensor& resize_openreg_(
const at::Tensor& resize_(
    const at::Tensor& self,
    c10::SymIntArrayRef size,
    ::std::optional<at::MemoryFormat> memory_format);

at::Tensor _reshape_alias_openreg(
at::Tensor _reshape_alias(
    const at::Tensor& self,
    c10::SymIntArrayRef size,
    c10::SymIntArrayRef stride);

at::Tensor _copy_from_openreg(
at::Tensor _copy_from(
    const at::Tensor& self,
    const at::Tensor& dst,
    bool non_blocking);

at::Tensor _copy_from_and_resize_openreg(
    const at::Tensor& self,
    const at::Tensor& dst);
at::Tensor _copy_from_and_resize(const at::Tensor& self, const at::Tensor& dst);

at::Scalar _local_scalar_dense_openreg(const at::Tensor& self);
at::Scalar _local_scalar_dense(const at::Tensor& self);

at::Tensor& set_source_Tensor_openreg_(
    at::Tensor& self,
    const at::Tensor& source);
at::Tensor& set_source_Tensor_(at::Tensor& self, const at::Tensor& source);

at::Tensor& set_source_Storage_openreg_(at::Tensor& self, at::Storage source);
at::Tensor& set_source_Storage_(at::Tensor& self, at::Storage source);

at::Tensor& set_source_Storage_storage_offset_openreg_(
at::Tensor& set_source_Storage_storage_offset_(
    at::Tensor& result,
    at::Storage storage,
    int64_t storage_offset,
    c10::IntArrayRef size,
    c10::IntArrayRef stride);

at::Tensor view_openreg(const at::Tensor& self, c10::SymIntArrayRef size);
at::Tensor view(const at::Tensor& self, c10::SymIntArrayRef size);

void cpu_fallback_openreg(
    const c10::OperatorHandle& op,
    torch::jit::Stack* stack);
void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack);

} // namespace at::native
} // namespace at::native::openreg

@ -103,6 +103,7 @@ def main():
        "-Wno-unused-parameter",
        "-Wno-missing-field-initializers",
        "-Wno-unknown-pragmas",
        "-fno-strict-aliasing",
    ]

    ext_modules = [

@ -2,6 +2,8 @@ import torch

import torch_openreg._C  # type: ignore[misc]

from . import meta  # noqa: F401


_initialized = False

@ -42,6 +44,10 @@ def set_device(device) -> None:
    return torch_openreg._C._set_device(device)


def init():
    _lazy_init()


def is_initialized():
    return _initialized

@ -64,6 +70,7 @@ __all__ = [
    "set_device",
    "initial_seed",
    "is_available",
    "init",
    "is_initialized",
    "random",
    "manual_seed",

@ -0,0 +1,13 @@
import torch


# LITERALINCLUDE START: CUSTOM OPERATOR META
lib = torch.library.Library("openreg", "IMPL", "Meta")  # noqa: TOR901


@torch.library.impl(lib, "custom_abs")
def custom_abs(self):
    return torch.empty_like(self)


# LITERALINCLUDE END: CUSTOM OPERATOR META