Compare commits


54 Commits

SHA1 Message Date
7669445a70 Merge remote-tracking branch 'origin/add_op_tests' into perf_ops 2025-09-28 01:55:13 -07:00
b7eae1cc34 b200 benchmarks separate run 2025-09-28 01:51:07 -07:00
a710d65523 Update measurement for cuda 2025-09-26 22:16:33 -07:00
eddf149b0c Remove short config from cpu run 2025-09-24 09:10:53 -07:00
62a91acda9 Updates 2025-09-24 09:05:42 -07:00
45760a2f7f Don't configure AWS credentials on A100/H100 2025-09-23 23:48:43 -07:00
5fa2fe9539 Separate the b100 and h100+a100 run 2025-09-23 22:18:47 -07:00
2e3d0429c2 Update and re-run 2025-09-23 20:04:26 -07:00
c8a53c3383 <Replace this line with a title. Use 1 line only, 67 chars or less> 2025-09-23 19:59:50 -07:00
682d542bfb Change cron schedule comment to 'everyday' 2025-09-23 15:18:58 -07:00
a60037fd72 Add B200 and separate add addmm 2025-09-23 15:12:52 -07:00
bfc9680175 Remove py2.8 separate code 2025-09-22 11:24:24 -07:00
5031e026fc Build with fallback 2025-09-21 23:44:31 -07:00
195779ec3b [Testing] Operator benchmark baseline 2.8 2025-09-19 18:12:49 -07:00
e934b6ab40 Update add_test 2025-09-18 11:50:06 -07:00
ca59a71675 Test 2025-09-16 18:06:05 -07:00
93ad3fec44 Merge remote-tracking branch 'origin/main' into add_op_tests 2025-09-16 10:01:05 -07:00
783f8064d1 Fix yml string 2025-09-15 21:29:02 -07:00
f47fa8d2f8 Fix syntax for running operator benchmarks 2025-09-14 11:04:24 -07:00
3635731fc2 Update yml 2025-09-12 18:47:13 -07:00
98a71c71b2 Fix extra_flags formatting in benchmark workflow 2025-09-12 17:38:22 -07:00
86e3803f3b Update Docker image name for operator benchmark 2025-09-12 16:05:05 -07:00
056bcfc333 Update operator_microbenchmark.yml to use a bigger runner 2025-09-12 15:38:56 -07:00
cc2b171704 Update docker 2025-09-12 15:36:11 -07:00
7b16f72b09 Fix quotes 2025-09-12 13:50:46 -07:00
f47e539765 fix trailing comma 2025-09-12 13:23:30 -07:00
49e5e122fe Fix params for python cmd 2025-09-12 12:55:57 -07:00
6b8cc19597 Fix concurrency and cuda arch 2025-09-12 11:43:08 -07:00
d683fb9ebe Add include into the test matrix 2025-09-12 11:14:54 -07:00
9eca494626 Tweak the build step 2025-09-12 10:56:27 -07:00
7f5b0bcec8 Fix CI 2025-09-11 23:30:42 -07:00
4c257bca07 Fix CI 2025-09-11 21:26:28 -07:00
05bb4d4fc6 Fix CI 2025-09-11 17:00:49 -07:00
8d0cafb8bb Add h100, a100 2025-09-11 14:23:36 -07:00
629de8d7ba Fixes 2025-09-11 14:18:51 -07:00
71ae2d8280 Add ci flow 2025-09-10 22:05:03 -07:00
2fe66701c1 Merge remote-tracking branch 'origin/main' into add_op_tests 2025-09-10 21:54:16 -07:00
c021d0349e Add ci flow 2025-09-10 21:47:50 -07:00
c6f1a29b17 Merge remote-tracking branch 'origin/main' into add_op_tests 2025-09-09 21:23:23 -07:00
54c9527a81 Add mm benchmarking tests 2025-09-09 14:25:07 -07:00
cf31d4b744 Add mm benchmarking tests 2025-09-09 14:17:08 -07:00
9c701f03ee update json 2025-09-08 22:16:33 -07:00
c193ed6c84 Merge remote-tracking branch 'origin/main' into add_compile_benchmarking 2025-09-08 12:42:06 -07:00
eab7bd0d4c Remove mm_bwd_test.py 2025-09-08 11:34:58 -07:00
199318f978 Remove cpu benchmarking 2025-09-08 11:28:30 -07:00
9b226b2ce4 Add cpu memory calculation 2025-09-08 00:10:10 -07:00
6357d4e05a Add cpu memory calculation 2025-09-08 00:03:50 -07:00
162e7d3c20 Updates 2025-09-07 22:02:53 -07:00
ada9c165dd Lint fixes 2025-09-04 13:12:34 -07:00
461c7ad698 Enable bwd pass 2025-09-03 21:51:42 -07:00
819159610d Add fixes 2025-09-03 20:47:09 -07:00
d257ebf9c7 Add peak memory calculation 2025-09-03 11:10:16 -07:00
aab478833d Make jit and compile mutually exclusive 2025-08-27 14:21:37 -07:00
ba1319f414 Update the op benchmarking, to benchmark using torch.compile 2025-08-25 00:15:50 -07:00
1104 changed files with 19687 additions and 37381 deletions

View File

@@ -31,7 +31,8 @@ pip install -r /pytorch/requirements.txt
 pip install auditwheel==6.2.0 wheel
 if [ "$DESIRED_CUDA" = "cpu" ]; then
 echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
-python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
+#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
+USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
 else
 echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
 export USE_SYSTEM_NCCL=1
@@ -45,5 +46,6 @@ else
 export USE_NVIDIA_PYPI_LIBS=1
 fi
-python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
+#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
+USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
 fi

View File

@@ -317,7 +317,7 @@ if __name__ == "__main__":
 ).decode()
 print("Building PyTorch wheel")
-build_vars = ""
+build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
 # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
 if enable_cuda:
 build_vars += "MAX_JOBS=5 "

View File

@@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
 try:
 with socket.create_connection((addr, port), timeout=timeout):
 return
-except (ConnectionRefusedError, TimeoutError): # noqa: PERF203
+except (ConnectionRefusedError, socket.timeout): # noqa: PERF203
 if i == attempt_cnt - 1:
 raise
 time.sleep(timeout)
@@ -1004,7 +1004,7 @@ if __name__ == "__main__":
 install_condaforge_python(host, args.python_version)
 sys.exit(0)
-python_version = args.python_version if args.python_version is not None else "3.10"
+python_version = args.python_version if args.python_version is not None else "3.9"
 if args.use_torch_from_pypi:
 configure_system(host, compiler=args.compiler, python_version=python_version)

View File

@@ -69,8 +69,7 @@ RUN bash ./install_cuda.sh 13.0
 ENV DESIRED_CUDA=13.0
 FROM ${ROCM_IMAGE} as rocm
-ARG PYTORCH_ROCM_ARCH
-ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
+ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
 ADD ./common/install_mkl.sh install_mkl.sh
 RUN bash ./install_mkl.sh && rm install_mkl.sh
 ENV MKLROOT /opt/intel

View File

@@ -36,12 +36,6 @@ case ${DOCKER_TAG_PREFIX} in
 ;;
 rocm*)
 BASE_TARGET=rocm
-PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-# add gfx950 conditionally starting in ROCm 7.0
-if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
-PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
-fi
-EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
 ;;
 *)
 echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"

View File

@@ -262,10 +262,13 @@ case "$tag" in
 TRITON_CPU=yes
 ;;
 pytorch-linux-jammy-linter)
-PYTHON_VERSION=3.10
+# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
+# We will need to update mypy version eventually, but that's for another day. The task
+# would be to upgrade mypy to 1.0.0 with Python 3.11
+PYTHON_VERSION=3.9
 ;;
-pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter)
-PYTHON_VERSION=3.10
+pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
+PYTHON_VERSION=3.9
 CUDA_VERSION=12.8.1
 ;;
 pytorch-linux-jammy-aarch64-py3.10-gcc11)

View File

@@ -1 +1 @@
-e0dda9059d082537cee36be6c5e4fe3b18c880c0
+56392aa978594cc155fa8af48cd949f5b5f1823a

View File

@@ -1,2 +1,2 @@
-transformers==4.56.0
+transformers==4.54.0
 soxr==0.5.0

View File

@@ -1 +1 @@
-v2.28.3-1
+v2.27.5-1

View File

@@ -1 +1 @@
-v2.28.3-1
+v2.27.7-1

View File

@@ -1 +1 @@
-bbb06c0334a6772b92d24bde54956e675c8c6604
+5ae38bdb0dc066c5823e34dc9797afb9de42c866

View File

@@ -42,27 +42,22 @@ install_pip_dependencies() {
 # A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current
 # numba and scipy version used in PyTorch CI
 conda_run pip uninstall -y numba scipy
-# Yaspin is needed for running CI test (get_benchmark_analysis_data.py)
-pip_install yaspin==3.1.0
 popd
 }
 setup_executorch() {
-pushd executorch
 export PYTHON_EXECUTABLE=python
-export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON"
+export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
 as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
-popd
 }
-if [ $# -eq 0 ]; then
-clone_executorch
-install_buck2
-install_conda_dependencies
-install_pip_dependencies
-pushd executorch
-setup_executorch
-popd
-else
-"$@"
-fi
+clone_executorch
+install_buck2
+install_conda_dependencies
+install_pip_dependencies
+setup_executorch

View File

@@ -12,8 +12,8 @@ function do_install() {
 rocm_version_nodot=${rocm_version//./}
-# https://github.com/icl-utk-edu/magma/pull/65
-MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+# Version 2.7.2 + ROCm related updates
+MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
 magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
 rocm_dir="/opt/rocm"

View File

@@ -40,16 +40,12 @@ case ${DOCKER_TAG_PREFIX} in
 ;;
 rocm*)
 # we want the patch version of 6.4 instead
-if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
+if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
 GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
 fi
 BASE_TARGET=rocm
 GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
 PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-# add gfx950 conditionally starting in ROCm 7.0
-if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
-fi
 DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
 ;;
 *)

View File

@@ -82,7 +82,7 @@ case ${image} in
 ;;
 manylinux2_28-builder:rocm*)
 # we want the patch version of 6.4 instead
-if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
+if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
 GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
 fi
 TARGET=rocm_final
@@ -90,10 +90,6 @@ case ${image} in
 DEVTOOLSET_VERSION="11"
 GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
 PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-# add gfx950 conditionally starting in ROCm 7.0
-if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
-fi
 DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
 ;;
 manylinux2_28-builder:xpu)

View File

@@ -93,9 +93,8 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
 #Pinned versions:
 #test that import:
-mypy==1.16.0 ; platform_system != "Windows"
+mypy==1.16.0
 # Pin MyPy version because new errors are likely to appear with each release
-# Skip on Windows as lots of type annotations are POSIX specific
 #Description: linter
 #Pinned versions: 1.16.0
 #test that import: test_typing.py, test_type_hints.py
@@ -112,6 +111,8 @@ ninja==1.11.1.3
 #Pinned versions: 1.11.1.3
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
+numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
+numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
 numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
 numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #Description: Just-In-Time Compiler for Numerical Functions
@@ -132,7 +133,7 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
 #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
 #test_binary_ufuncs.py
-numpy==1.22.4; python_version == "3.10"
+numpy==1.22.4; python_version == "3.9" or python_version == "3.10"
 numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
 numpy==2.1.2; python_version >= "3.13"
@@ -324,6 +325,8 @@ pywavelets==1.7.0 ; python_version >= "3.12"
 lxml==5.3.0
 #Description: This is a requirement of unittest-xml-reporting
+# Python-3.9 binaries
 PyGithub==2.3.0
 sympy==1.13.3

View File

@@ -1,15 +1,8 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
--e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
-standard-imghdr==3.13.0; python_version >= "3.13"
-#Description: This is needed by Sphinx, so it needs to be added here.
-# The reasons are as follows:
-# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
-# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
-# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably
 # something related to Docker setup. We can investigate this later.

View File

@@ -41,6 +41,7 @@ def sample_vllm_test_library():
 "pytest -v -s basic_correctness/test_cumem.py",
 "pytest -v -s basic_correctness/test_basic_correctness.py",
 "pytest -v -s basic_correctness/test_cpu_offload.py",
+"VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py",
 ],
 },
 "vllm_basic_models_test": {
@@ -67,12 +68,15 @@ def sample_vllm_test_library():
 "-v",
 "-s",
 "entrypoints/llm",
+"--ignore=entrypoints/llm/test_lazy_outlines.py",
 "--ignore=entrypoints/llm/test_generate.py",
+"--ignore=entrypoints/llm/test_generate_multiple_loras.py",
 "--ignore=entrypoints/llm/test_collective_rpc.py",
 ]
 ),
-"pytest -v -s entrypoints/llm/test_generate.py",
-"pytest -v -s entrypoints/offline_mode",
+"pytest -v -s entrypoints/llm/test_lazy_outlines.py",
+"pytest -v -s entrypoints/llm/test_generate.py ",
+"VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
 ],
 },
 "vllm_regression_test": {

View File

@@ -1,11 +1,11 @@
 SHELL=/usr/bin/env bash
 DOCKER_CMD ?= docker
-DESIRED_ROCM ?= 7.0
+DESIRED_ROCM ?= 6.4
 DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
 PACKAGE_NAME = magma-rocm
 # inherit this from underlying docker image, do not pass this env var to docker
-#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
+#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
 DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 -v $(shell git rev-parse --show-toplevel)/.ci:/builder \
@@ -16,7 +16,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 magma-rocm/build_magma.sh
 .PHONY: all
-all: magma-rocm70
 all: magma-rocm64
 all: magma-rocm63
@@ -25,11 +24,6 @@ clean:
 $(RM) -r magma-*
 $(RM) -r output
-.PHONY: magma-rocm70
-magma-rocm70: DESIRED_ROCM := 7.0
-magma-rocm70:
-$(DOCKER_RUN)
 .PHONY: magma-rocm64
 magma-rocm64: DESIRED_ROCM := 6.4
 magma-rocm64:

View File

@@ -6,8 +6,8 @@ set -eou pipefail
 # The script expects DESIRED_CUDA and PACKAGE_NAME to be set
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-# https://github.com/icl-utk-edu/magma/pull/65
-MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+# Version 2.7.2 + ROCm related updates
+MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
 # Folders for the build
 PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
@@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE
 # Fetch magma sources and verify checksum
 pushd ${PACKAGE_DIR}
-git clone https://github.com/jeffdaily/magma
+git clone https://bitbucket.org/icl/magma.git
 pushd magma
 git checkout ${MAGMA_VERSION}
 popd

View File

@@ -58,7 +58,7 @@ time python tools/setup_helpers/generate_code.py \
 # Build the docs
 pushd docs/cpp
-time make VERBOSE=1 html
+time make VERBOSE=1 html -j
 popd
 popd

View File

@@ -0,0 +1,40 @@
+#!/bin/bash
+# This is where the local pytorch install in the docker image is located
+pt_checkout="/var/lib/jenkins/workspace"
+source "$pt_checkout/.ci/pytorch/common_utils.sh"
+echo "functorch_doc_push_script.sh: Invoked with $*"
+set -ex -o pipefail
+version=${DOCS_VERSION:-nightly}
+echo "version: $version"
+# Build functorch docs
+pushd $pt_checkout/functorch/docs
+make html
+popd
+git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages
+pushd functorch_ghpages
+if [ "$version" == "main" ]; then
+version=nightly
+fi
+git rm -rf "$version" || true
+mv "$pt_checkout/functorch/docs/build/html" "$version"
+git add "$version" || true
+git status
+git config user.email "soumith+bot@pytorch.org"
+git config user.name "pytorchbot"
+# If there aren't changes, don't make a commit; push is no-op
+git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true
+git status
+if [[ "${WITH_PUSH:-}" == true ]]; then
+git push -u origin gh-pages
+fi
+popd

View File

@@ -35,11 +35,10 @@ fi
 print_cmake_info
 if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
-# Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls
-USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
+USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
 else
-# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
-# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
+# NB: we always build with distributed; USE_DISTRIBUTED turns off all
+# backends (specifically the gloo backend), so test that this case works too
 USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
 fi
 if which sccache > /dev/null; then

View File

@@ -13,9 +13,13 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
 fi
 popd
-python -mpip install -r requirements.txt
 # enable debug asserts in serialization
 export TORCH_SERIALIZATION_DEBUG=1
+python -mpip install --no-input -r requirements.txt
 setup_test_python() {
 # The CircleCI worker hostname doesn't resolve to an address.
 # This environment variable makes ProcessGroupGloo default to
@@ -55,7 +59,7 @@ test_python_shard() {
 setup_test_python
-time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS"
+time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS"
 assert_git_not_dirty
 }

View File

@@ -1,25 +0,0 @@
-From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001
-From: Michael Wang <13521008+isVoid@users.noreply.github.com>
-Date: Tue, 1 Apr 2025 17:28:05 -0700
-Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage
-(#185)
-Co-authored-by: isVoid <isVoid@users.noreply.github.com>
----
-numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++
-1 file changed, 3 insertions(+)
-diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
-index 1641bf77..233e9ed7 100644
---- a/numba_cuda/numba/cuda/cudadrv/driver.py
-+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
-@@ -365,6 +365,9 @@ def _find_api(self, fname):
-else:
-variants = ('_v2', '')
-+ if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
-+ return getattr(self.lib, fname)
-+
-for variant in variants:
-try:
-return getattr(self.lib, f'{fname}{variant}')

View File

@@ -32,16 +32,6 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v
 git config --global --add safe.directory /var/lib/jenkins/workspace
 fi
-# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
-NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
-if [ -n "$NUMBA_CUDA_DIR" ]; then
-NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
-pushd "$NUMBA_CUDA_DIR"
-patch -p4 <"$NUMBA_PATCH"
-popd
-fi
 echo "Environment variables:"
 env
@@ -322,29 +312,23 @@ test_python_shard() {
 # modify LD_LIBRARY_PATH to ensure it has the conda env.
 # This set of tests has been shown to be buggy without it for the split-build
-time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
 assert_git_not_dirty
 }
 test_python() {
 # shellcheck disable=SC2086
-time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
+time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
 assert_git_not_dirty
 }
 test_python_smoke() {
-# Smoke tests for H100/B200
+# Smoke tests for H100
 time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
 assert_git_not_dirty
 }
-test_python_smoke_b200() {
-# Targeted smoke tests for B200 - staged approach to avoid too many failures
-time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
-assert_git_not_dirty
-}
 test_h100_distributed() {
 # Distributed tests at H100
 time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
@@ -390,7 +374,6 @@ test_dynamo_wrapped_shard() {
 --exclude-distributed-tests \
 --exclude-torch-export-tests \
 --exclude-aot-dispatch-tests \
---exclude-quantization-tests \
 --shard "$1" "$NUM_TEST_SHARDS" \
 --verbose \
 --upload-artifacts-while-running
@@ -1163,12 +1146,6 @@ test_distributed() {
 fi
 }
-test_quantization() {
-echo "Testing quantization"
-python test/test_quantization.py
-}
 test_rpc() {
 echo "Testing RPC C++ tests"
 # NB: the ending test_rpc must match the current function name for the current
@@ -1563,10 +1540,14 @@ test_executorch() {
 install_torchvision
 install_torchaudio
-INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh"
 pushd /executorch
-"${INSTALL_SCRIPT}" setup_executorch
+export PYTHON_EXECUTABLE=python
+export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
+# NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
+# from the PR
+bash .ci/scripts/setup-linux.sh --build-tool cmake
 echo "Run ExecuTorch unit tests"
 pytest -v -n auto
@@ -1580,14 +1561,17 @@ test_executorch() {
 popd
+# Test torchgen generated code for Executorch.
+echo "Testing ExecuTorch op registration"
+"$BUILD_BIN_DIR"/test_edge_op_registration
 assert_git_not_dirty
 }
 test_linux_aarch64() {
 python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
 test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
-test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \
-distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
+test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
 --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
 # Dynamo tests
@@ -1635,11 +1619,12 @@ test_operator_microbenchmark() {
 mkdir -p "$TEST_REPORTS_DIR"
 TEST_DIR=$(pwd)
+pip_uninstall torch torchvision torchaudio
+pip_install torch==2.8.0 torchvision torchaudio ninja --force-reinstall
 cd benchmarks/operator_benchmark/pt_extension
 python -m pip install .
 cd "${TEST_DIR}"/benchmarks/operator_benchmark
 for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do
 $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
 --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
@@ -1681,8 +1666,6 @@ elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
 test_executorch
 elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
 test_python_legacy_jit
-elif [[ "$TEST_CONFIG" == 'quantization' ]]; then
-test_quantization
 elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
 # TODO: run some C++ tests
 echo "no-op at the moment"
@@ -1809,14 +1792,10 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
 test_xpu_bin
 elif [[ "${TEST_CONFIG}" == smoke ]]; then
 test_python_smoke
-elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then
-test_python_smoke_b200
 elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
 test_h100_distributed
 elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
 test_h100_symm_mem
-elif [[ "${TEST_CONFIG}" == "b200-symm-mem" ]]; then
-test_h100_symm_mem
 elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then
 test_h100_cutlass_backend
 else

View File

@@ -137,7 +137,7 @@ sccache --show-stats
 python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])"
 (
 if "%BUILD_ENVIRONMENT%"=="" (
-echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash.
+echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash.
 ) else (
 copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%"

View File

@@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" (
 ) else (
 set CONDA_PARENT_DIR=C:\Jenkins
 )
-set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3
 :: Be conservative here when rolling out the new AMI with conda. This will try
 :: to install conda as before if it couldn't find the conda installation. This
 :: can be removed eventually after we gain enough confidence in the AMI
-if not exist %CONDA_ROOT_DIR% (
+if not exist %CONDA_PARENT_DIR%\Miniconda3 (
 set INSTALL_FRESH_CONDA=1
 )
@@ -17,14 +17,10 @@ if "%INSTALL_FRESH_CONDA%"=="1" (
 if errorlevel 1 exit /b
 if not errorlevel 0 exit /b
-%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR%
+%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
 if errorlevel 1 exit /b
 if not errorlevel 0 exit /b
 )
 :: Activate conda so that we can use its commands, i.e. conda, python, pip
-call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%
-:: Activate conda so that we can use its commands, i.e. conda, python, pip
-call conda activate py_tmp
-call pip install -r .ci/docker/requirements-ci.txt
+call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3

View File

@@ -14,7 +14,7 @@ if not errorlevel 0 exit /b
 :: build\torch. Rather than changing all these references, making a copy of torch folder
 :: from conda to the current workspace is easier. The workspace will be cleaned up after
 :: the job anyway
-xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
+xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
 pushd .
 if "%VC_VERSION%" == "" (

View File

@@ -25,7 +25,7 @@ echo Copying over test times file
 robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files"
 echo Run nn tests
-python run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
+python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
 if ERRORLEVEL 1 goto fail
 popd

View File

@@ -38,14 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
 fi
 # TODO: Move both of them to Windows AMI
-python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
-# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments
-# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node'
-# scipy from 1.6.3 to 1.10
-# expecttest from 0.1.3 to 0.3.0
-# xdoctest from 1.0.2 to 1.3.0
-python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42"
+python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
 # Install Z3 optional dependency for Windows builds.
 python -m pip install z3-solver==4.15.1.0
@@ -59,6 +52,9 @@ python -m pip install parameterized==0.8.1
 # Install pulp for testing ilps under torch\distributed\_tools
 python -m pip install pulp==2.9.0
+# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308
+python -m pip install expecttest==0.3.0
 run_tests() {
 # Run nvidia-smi if available
 for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do

View File

@@ -63,7 +63,7 @@ if errorlevel 1 exit /b 1
 call %CONDA_HOME%\condabin\activate.bat testenv
 if errorlevel 1 exit /b 1
-call conda install -y -q -c conda-forge libuv=1.51
+call conda install -y -q -c conda-forge libuv=1.39
 call conda install -y -q intel-openmp
 echo "install and test libtorch"

View File

@@ -177,7 +177,8 @@ source ~/${desired_python}-build/bin/activate
 retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
 retry brew install libomp
-# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
+# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
+# is build as part of tensorpipe submodule
 export USE_DISTRIBUTED=1
 export USE_MKLDNN=OFF

View File

@@ -0,0 +1,47 @@
+#!/bin/bash
+# =================== The following code **should** be executed inside Docker container ===================
+# Install dependencies
+sudo apt-get -y update
+sudo apt-get -y install expect-dev
+# This is where the local pytorch install in the docker image is located
+pt_checkout="/var/lib/jenkins/workspace"
+source "$pt_checkout/.ci/pytorch/common_utils.sh"
+echo "functorch_doc_push_script.sh: Invoked with $*"
+set -ex
+version=${DOCS_VERSION:-nightly}
+echo "version: $version"
+# Build functorch docs
+pushd $pt_checkout/functorch/docs
+pip -q install -r requirements.txt
+make html
+popd
+git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages
+pushd functorch_ghpages
+if [ $version == "main" ]; then
+version=nightly
+fi
+git rm -rf "$version" || true
+mv "$pt_checkout/functorch/docs/build/html" "$version"
+git add "$version" || true
+git status
+git config user.email "soumith+bot@pytorch.org"
+git config user.name "pytorchbot"
+# If there aren't changes, don't make a commit; push is no-op
+git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true
+git status
+if [[ "${WITH_PUSH:-}" == true ]]; then
+git push -u origin gh-pages
+fi
+popd
+# =================== The above code **should** be executed inside Docker container ===================

View File

@@ -69,8 +69,6 @@ readability-string-compare,
 '
 HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
 WarningsAsErrors: '*'
-LineFilter:
-- name: '/usr/include/.*'
 CheckOptions:
 cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true
 cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true

View File

@@ -1,10 +1,6 @@
 ---
 name: "⚠️ CI SEV"
 about: Tracking incidents for PyTorch's CI infra.
-title: ''
-labels: ''
-assignees: ''
 ---
 > NOTE: Remember to label this issue with "`ci: sev`"

View File

@@ -1,18 +0,0 @@
----
-name: DISABLE AUTOREVERT
-about: Disables autorevert when open
-title: "❌​\U0001F519 [DISABLE AUTOREVERT]"
-labels: 'ci: disable-autorevert'
-assignees: ''
----
-This issue, while open, disables the autorevert functionality.
-More details can be found [here](https://github.com/pytorch/test-infra/blob/main/aws/lambda/pytorch-auto-revert/README.md)
-## Why are you disabling autorevert?
-## Links to any issues/commits/errors that shows the source of problem

View File

@@ -1,10 +1,8 @@
 ---
 name: Disable CI jobs (PyTorch Dev Infra only)
 about: Use this template to disable CI jobs
-title: DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]
-labels: 'module: ci'
-assignees: ''
+title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]"
+labels: "module: ci"
 ---
 > For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once

View File

@@ -22,9 +22,6 @@ self-hosted-runner:
 - linux.arm64.m7g.4xlarge
 - linux.arm64.m7g.4xlarge.ephemeral
 - linux.arm64.r7g.12xlarge.memory
-- linux.aws.h100
-- linux.aws.h100.4
-- linux.aws.h100.8
 - linux.4xlarge.nvidia.gpu
 - linux.8xlarge.nvidia.gpu
 - linux.16xlarge.nvidia.gpu

View File

@@ -264,7 +264,7 @@ def unzip_artifact_and_replace_files() -> None:
 change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py")
 for file in Path(f"artifacts/dist/{old_stem}").glob(
-"*.dist-info/*",
+"*.dist-info/**",
 ):
 change_content_to_new_version(file)

View File

@@ -6,12 +6,6 @@ inputs:
 cuda-version:
 description: which cuda version to install, 'cpu' for none
 required: true
-python-version:
-required: false
-type: string
-default: "3.10"
-description: |
-The python version to be used. Will be 3.10 by default
 runs:
 using: composite
@@ -44,24 +38,18 @@ runs:
 CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat"
 {
-echo "CONDA=${CONDA}";
 echo "CONDA_RUN=${CONDA} run --no-capture-output";
 echo "CONDA_BUILD=${CONDA} run conda-build";
 echo "CONDA_INSTALL=${CONDA} install";
 } >> "${GITHUB_ENV}"
 - name: Setup Python3
-env:
-PYTHON_VERSION: ${{ inputs.python-version }}
 shell: bash
 run: |
 set +e
 set -x
-# Create new py_tmp env with python-version
-${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv
-PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
+PYTHON3=$(${CONDA_RUN} which python3)
 EXIT_CODE=$?
 if [[ "${EXIT_CODE}" == "0" ]]; then
@@ -74,7 +62,7 @@ runs:
 # installation, which is Python 3 based. Its Python is default to Python 3. Further, there
 # is also the Miniconda installation that is Python 2 based, and both can be installed if
 # needed. In both cases, Python binary is just called python
-PYTHON=$(${CONDA_RUN} -n py_tmp which python)
+PYTHON=$(${CONDA_RUN} which python)
 EXIT_CODE=$?
 if [[ "${EXIT_CODE}" == "0" ]]; then

View File

@@ -1 +1 @@
-0307428d65acf5cf1a73a70a7722e076bbb83f22
+5bcc153d7bf69ef34bc5788a33f60f1792cf2861

View File

@@ -1 +1 @@
-0fc62aa26a30ed7ca419d285f285cb5ba02c4394
+c77852e117bdf056c8e9a087e51d6f65cf6ba53d

.github/labeler.yml vendored
View File

@@ -130,6 +130,3 @@
 - torch/csrc/inductor/aoti_include/**
 - torchgen/aoti/**
 - torchgen/gen_aoti_c_shim.py
-"ciflow/vllm":
-- .github/ci_commit_pins/vllm.txt

View File

@@ -525,21 +525,6 @@
 - Lint
 - pull
-- name: typechecking
-patterns:
-- 'pyrefly.toml'
-- 'mypy.ini'
-- 'mypy-strict.ini'
-approved_by:
-- lolpack
-- maggiemoss
-- ndmitchell
-- kinto0
-mandatory_checks_name:
-- EasyCLA
-- Lint
-- pull
 - name: superuser
 patterns:
 - '*'

View File

@@ -1,44 +1,41 @@
 tracking_issue: 24422
 ciflow_tracking_issue: 64124
 ciflow_push_tags:
-- ciflow/b200
-- ciflow/b200-symm-mem
 - ciflow/binaries
 - ciflow/binaries_libtorch
 - ciflow/binaries_wheel
-- ciflow/h100
-- ciflow/h100-cutlass-backend
-- ciflow/h100-distributed
-- ciflow/h100-symm-mem
+- ciflow/triton_binaries
 - ciflow/inductor
-- ciflow/inductor-cu126
-- ciflow/inductor-micro-benchmark
-- ciflow/inductor-micro-benchmark-cpu-x86
-- ciflow/inductor-perf-compare
-- ciflow/inductor-perf-test-nightly-rocm
-- ciflow/inductor-perf-test-nightly-x86-zen
 - ciflow/inductor-periodic
 - ciflow/inductor-rocm
+- ciflow/inductor-perf-test-nightly-rocm
+- ciflow/inductor-perf-compare
+- ciflow/inductor-micro-benchmark
+- ciflow/inductor-micro-benchmark-cpu-x86
+- ciflow/inductor-perf-test-nightly-x86-zen
+- ciflow/inductor-cu126
 - ciflow/linux-aarch64
 - ciflow/mps
 - ciflow/nightly
-- ciflow/op-benchmark
 - ciflow/periodic
 - ciflow/periodic-rocm-mi300
-- ciflow/pull
-- ciflow/quantization-periodic
-- ciflow/riscv64
 - ciflow/rocm
 - ciflow/rocm-mi300
 - ciflow/s390
+- ciflow/riscv64
 - ciflow/slow
-- ciflow/torchbench
-- ciflow/triton_binaries
 - ciflow/trunk
 - ciflow/unstable
-- ciflow/vllm
-- ciflow/win-arm64
 - ciflow/xpu
+- ciflow/vllm
+- ciflow/torchbench
+- ciflow/op-benchmark
+- ciflow/pull
+- ciflow/h100
+- ciflow/h100-distributed
+- ciflow/win-arm64
+- ciflow/h100-symm-mem
+- ciflow/h100-cutlass-backend
 retryable_workflows:
 - pull
 - trunk
@@ -47,4 +44,4 @@ retryable_workflows:
 - inductor-A100-perf-nightly
 labeler_config: labeler.yml
 label_to_label_config: label_to_label.yml
-mergebot: true
+mergebot: True

View File

@@ -30,7 +30,7 @@ CUDA_ARCHES_CUDNN_VERSION = {
 }
 # NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
-ROCM_ARCHES = ["6.4", "7.0"]
+ROCM_ARCHES = ["6.3", "6.4"]
 XPU_ARCHES = ["xpu"]
@@ -53,7 +53,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
 "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | "
 "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
 "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
-"nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
+"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
 "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
 "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
 "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
@@ -70,7 +70,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
 "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | "
 "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
 "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
-"nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
+"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
 "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
 "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
 "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
@@ -87,7 +87,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
 "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
 "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
 "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
-"nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | "
+"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
 "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
 "nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
 "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "

View File

@@ -135,7 +135,7 @@ ROCM_SMOKE_WORKFLOWS = [
 build_configs=generate_binary_build_matrix.generate_wheels_matrix(
 OperatingSystem.LINUX,
 arches=["6.4"],
-python_versions=["3.10"],
+python_versions=["3.9"],
 ),
 ciflow_config=CIFlowConfig(
 labels={
@@ -155,7 +155,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
 package_type="manywheel",
 build_configs=generate_binary_build_matrix.generate_wheels_matrix(
 OperatingSystem.LINUX,
-arches=["13.0"],
+arches=["12.8"],
 python_versions=["3.12"],
 ),
 branches="main",

View File

@@ -71,15 +71,12 @@ jobs:
 with:!{{ upload.binary_env_as_input(config) }}
 {%- if "aarch64" in build_environment %}
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 {%- elif "s390x" in build_environment %}
 runs_on: linux.s390x
 ALPINE_IMAGE: "docker.io/s390x/alpine"
 timeout-minutes: 420
-{%- elif config["gpu_arch_type"] == "rocm" %}
-runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-timeout-minutes: 300
 {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
 runs_on: linux.24xlarge.ephemeral

View File

@@ -187,6 +187,8 @@ jobs:
 - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
 uses: pytorch/test-infra/.github/actions/setup-nvidia@main
+with:
+driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }}
 if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}
- name: configure aws credentials - name: configure aws credentials

View File

@@ -67,7 +67,7 @@ jobs:
 # an OOM issue when running the job, so this upgrades the runner from 4xlarge
 # to the next available tier of 12xlarge. So much memory just to generate cpp
 # doc
-runner: ${{ inputs.runner_prefix }}linux.12xlarge.memory
+runner: ${{ inputs.runner_prefix }}linux.12xlarge
 # TODO: Nightly cpp docs take longer and longer to finish (more than 3h now)
 # Let's try to figure out how this can be improved
 timeout-minutes: 360
@@ -75,6 +75,10 @@ jobs:
 runner: ${{ inputs.runner_prefix }}linux.2xlarge
 # It takes less than 30m to finish python docs unless there are issues
 timeout-minutes: 30
+- docs_type: functorch
+  runner: ${{ inputs.runner_prefix }}linux.2xlarge
+  # It takes less than 15m to finish functorch docs unless there are issues
+  timeout-minutes: 15
 # Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
 # The current name requires updating the database last docs push query from test-infra every time the matrix is updated
 name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}
@@ -207,6 +211,16 @@ jobs:
 path: cppdocs/
 s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs
+- name: Upload functorch Docs Preview
+  uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
+  if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }}
+  with:
+    retention-days: 14
+    s3-bucket: doc-previews
+    if-no-files-found: error
+    path: functorch_ghpages/nightly/
+    s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs
 - name: Teardown Linux
   uses: pytorch/test-infra/.github/actions/teardown-linux@main
   if: always()
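The docs job above fans out via matrix include entries that each carry their own runner and timeout. A reduced sketch of that wiring, assuming the fields shown in the hunk:

  build-docs:
    strategy:
      matrix:
        include:
          - docs_type: python
            runner: linux.2xlarge
            timeout-minutes: 30
          - docs_type: functorch
            runner: linux.2xlarge
            timeout-minutes: 15
    runs-on: ${{ matrix.runner }}
    timeout-minutes: ${{ matrix.timeout-minutes }}
    steps:
      - run: echo "building ${{ matrix.docs_type }} docs"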

View File

@@ -2,12 +2,6 @@ name: Get Changed Files
 on:
   workflow_call:
-    inputs:
-      all_files:
-        description: "Whether to return all files instead of just changed files"
-        required: false
-        type: boolean
-        default: false
     outputs:
       changed-files:
         description: "List of changed files (space-separated) or '*' if not in a PR"
@@ -32,23 +26,17 @@ jobs:
           # Get the PR number from the github context
           PR_NUMBER="${{ github.event.number }}"
-          # Check if all_files is requested
-          if [ "${{ inputs.all_files }}" = "true" ]; then
-            echo "all_files input is true, returning all files"
-            echo "changed-files=*" >> "$GITHUB_OUTPUT"
-          else
-            # Use gh CLI to get changed files in the PR with explicit repo
-            CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
-            if [ -z "$CHANGED_FILES" ]; then
-              echo "No changed files found, setting to '*'"
-              CHANGED_FILES="*"
-            fi
-            echo "Changed files: $CHANGED_FILES"
-            echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
-          fi
+          # Use gh CLI to get changed files in the PR with explicit repo
+          CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
+          if [ -z "$CHANGED_FILES" ]; then
+            echo "No changed files found, setting to '*'"
+            CHANGED_FILES="*"
+          fi
+          echo "Changed files: $CHANGED_FILES"
+          echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
         else
           echo "Not in PR context, setting changed files to '*'"
           echo "changed-files=*" >> "$GITHUB_OUTPUT"

View File

@@ -169,7 +169,7 @@ jobs:
 id: install-nvidia-driver
 uses: pytorch/test-infra/.github/actions/setup-nvidia@main
 with:
-  driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }}
+  driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
 if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
 - name: Setup GPU_FLAG for docker run

View File

@@ -151,7 +151,7 @@ jobs:
 BUILD_WHEEL: 1
 MAX_JOBS: 8
 CUDA_VERSION: ${{ inputs.cuda-version }}
-PYTHON_VERSION: "3.10"
+PYTHON_VERSION: "3.9"
 SCCACHE_BUCKET: "ossci-compiler-cache"
 SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
 SCCACHE_REGION: us-east-1

View File

@@ -184,7 +184,7 @@ jobs:
 env:
 USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }}
 INSTALL_WINDOWS_SDK: 1
-PYTHON_VERSION: "3.10"
+PYTHON_VERSION: 3.9
 CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
 VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
 TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}

View File

@@ -1,60 +0,0 @@
-name: Limited CI for symmetric memory tests on B200
-on:
-  pull_request:
-    paths:
-      - .github/workflows/b200-symm-mem.yml
-  workflow_dispatch:
-  push:
-    tags:
-      - ciflow/b200-symm-mem/*
-  schedule:
-    - cron: 22 8 * * *  # about 1:22am PDT
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
-  cancel-in-progress: true
-permissions:
-  id-token: write
-  contents: read
-jobs:
-  get-label-type:
-    if: github.repository_owner == 'pytorch'
-    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-  linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm:
-    name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      cuda-arch-list: '10.0'
-      test-matrix: |
-        { include: [
-          { config: "b200-symm-mem", shard: 1, num_shards: 1, runner: "linux.dgx.b200.8" },
-        ]}
-    secrets: inherit
-  linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
-    name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
-    uses: ./.github/workflows/_linux-test.yml
-    needs:
-      - linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm
-    with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
-      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.test-matrix }}
-      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-    secrets: inherit
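The deleted workflow follows the repo's standard two-stage pattern: a reusable build workflow publishes docker-image and test-matrix outputs, and the test workflow consumes them through needs. A stripped-down sketch of that chaining, assuming _linux-build.yml and _linux-test.yml expose the inputs and outputs the deleted file references (build-environment value is illustrative):

  build:
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: example-env
      docker-image-name: ci-image:example
      test-matrix: |
        { include: [ { config: "default", shard: 1, num_shards: 1, runner: "linux.4xlarge" } ] }
  test:
    needs: build
    uses: ./.github/workflows/_linux-test.yml
    with:
      build-environment: example-env
      docker-image: ${{ needs.build.outputs.docker-image }}
      test-matrix: ${{ needs.build.outputs.test-matrix }}
    secrets: inherit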

View File

@@ -36,7 +36,7 @@ jobs:
 runs-on: linux.9xlarge.ephemeral
 strategy:
   matrix:
-    tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "rocm7.0", "cpu"]
+    tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"]
 steps:
 - name: Build docker image
   uses: pytorch/pytorch/.github/actions/binary-docker-build@main

View File

@@ -52,8 +52,8 @@ jobs:
 { tag: "cuda12.9" },
 { tag: "cuda12.8" },
 { tag: "cuda12.6" },
+{ tag: "rocm6.3" },
 { tag: "rocm6.4" },
-{ tag: "rocm7.0" },
 { tag: "cpu" },
 ]
 steps:

View File

@@ -34,7 +34,7 @@ jobs:
 id-token: write
 strategy:
   matrix:
-    rocm_version: ["70", "64"]
+    rocm_version: ["64", "63"]
 steps:
 - name: Checkout PyTorch
   uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@@ -52,8 +52,8 @@ jobs:
 { name: "manylinuxaarch64-builder", tag: "cuda13.0", runner: "linux.arm64.2xlarge.ephemeral" },
 { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" },
 { name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
+{ name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" },
 { name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },
-{ name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" },
 { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" },
 { name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
 { name: "manylinuxcxx11-abi-builder", tag: "cpu-cxx11-abi", runner: "linux.9xlarge.ephemeral" },

View File

@@ -50,12 +50,12 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
+py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
 device: ["cuda", "rocm", "xpu", "aarch64"]
 docker-image: ["pytorch/manylinux2_28-builder:cpu"]
 include:
 - device: "rocm"
-  rocm_version: "7.0"
+  rocm_version: "6.4"
   runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
 - device: "cuda"
   rocm_version: ""
@@ -108,6 +108,9 @@ jobs:
 # Determine python executable for given version
 case $PY_VERS in
+  3.9)
+    PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python
+    ;;
   3.10)
     PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python
     ;;
@@ -191,7 +194,7 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
+py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
 device: ["xpu"]
 timeout-minutes: 40
 env:
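The restored 3.9 arm maps a matrix version onto the manylinux interpreter layout under /opt/python/<tag>/bin/python. A compact sketch that derives the cpXY tag instead of enumerating each version; the string substitution and the cp313-cp313t free-threaded handling are my generalization, not taken from the workflow:

  - name: Resolve manylinux interpreter
    env:
      PY_VERS: ${{ matrix.py_vers }}
    run: |
      tag="cp${PY_VERS//./}"                              # "3.10" -> cp310, "3.13t" -> cp313t
      PYTHON_EXECUTABLE="/opt/python/${tag%t}-${tag}/bin/python"
      "$PYTHON_EXECUTABLE" --version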

View File

@@ -178,7 +178,7 @@ jobs:
 contents: read
 container:
   image: continuumio/miniconda3:4.12.0
-environment: ${{ ((github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && 'nightly-wheel-upload' || '' }}
+environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }}
 steps:
 - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@@ -35,7 +35,6 @@ jobs:
 contents: write
 outputs:
 pt_release_name: ${{ steps.release_name.outputs.pt_release_name }}
-pt_pep517_release_name: ${{ steps.release_name.outputs.pt_pep517_release_name }}
 steps:
 - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
   with:
@@ -54,12 +53,8 @@ jobs:
 tag_or_branch="${tag_or_branch#refs/heads/}"
 # replace directory separators with _ in branch name
 tag_or_branch="${tag_or_branch//\//_}"
-torch_version="$(python -c 'from tools.generate_torch_version import get_torch_version; print(get_torch_version())')"
-{
-  echo "PT_RELEASE_NAME=pytorch-$tag_or_branch";
-  echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz";
-  echo "PT_PEP517_RELEASE_FILE=torch-${torch_version}.tar.gz";
-} >> "$GITHUB_ENV"
+echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV"
+echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
 - name: Checkout optional submodules
   run: python3 tools/optional_submodules.py
 - name: Copy docs requirements for inclusion
@@ -69,47 +64,30 @@ jobs:
 cp .ci/docker/requirements-docs.txt docs/requirements.txt
 - name: Create source distribution
   run: |
     # Create new folder with specified name so extracting the archive yields that
     rm -rf "/tmp/$PT_RELEASE_NAME"
     cp -r "$PWD" "/tmp/$PT_RELEASE_NAME"
     mv "/tmp/$PT_RELEASE_NAME" .
     # Cleanup
     rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci}
     find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true
     # Create archive
     tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME"
     echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")"
-- name: Create PEP 517 compatible source distribution
-  run: |
-    pip install build==1.2.2.post1 || exit 1
-    python -m build --sdist || exit 1
-    cd dist || exit 1
 - name: Upload source distribution for release
   if: ${{ github.event_name == 'release' }}
   uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2
   with:
-    files: |
-      ${{ env.PT_RELEASE_FILE }}
-      ${{ env.PT_PEP517_RELEASE_FILE }}
+    files: ${{env.PT_RELEASE_FILE}}
-- name: Upload source distribution to GHA artifacts # for release tags
+- name: Upload source distribution to GHA artifacts for release tags
   if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
   uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
   with:
     name: ${{ env.PT_RELEASE_FILE }}
     path: ${{ env.PT_RELEASE_FILE }}
-- name: Upload PEP 517 source distribution to GHA artifacts # for release tags
-  if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
-  uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
-  with:
-    name: ${{ env.PT_PEP517_RELEASE_FILE }}
-    path: dist/${{ env.PT_PEP517_RELEASE_FILE }}
 - name: Set output
   id: release_name
-  run: |
-    {
-      echo "pt_release_name=${{ env.PT_RELEASE_FILE }}";
-      echo "pt_pep517_release_name=${{ env.PT_PEP517_RELEASE_FILE }}";
-    } >> "${GITHUB_OUTPUT}"
+  run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}"
 upload_source_code_to_s3:
 if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
@@ -125,9 +103,6 @@ jobs:
 - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
   with:
     name: ${{ needs.release.outputs.pt_release_name }}
-- uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
-  with:
-    name: ${{ needs.release.outputs.pt_pep517_release_name }}
 - name: Configure AWS credentials(PyTorch account)
   uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
   with:
@@ -138,9 +113,7 @@ jobs:
 s3-bucket: pytorch
 s3-prefix: source_code/test
 if-no-files-found: warn
-path: |
-  ${{ needs.release.outputs.pt_release_name }}
-  ${{ needs.release.outputs.pt_pep517_release_name }}
+path: ${{ needs.release.outputs.pt_release_name }}
 concurrency:
 group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
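Both sides of this diff write release metadata by appending to $GITHUB_ENV; the removed side groups the echos into a single redirect, which is the shellcheck-friendly way to append several lines at once. A minimal sketch of the idiom with illustrative values:

  - name: Export release vars
    run: |
      tag_or_branch=example-branch
      {
        echo "PT_RELEASE_NAME=pytorch-${tag_or_branch}"
        echo "PT_RELEASE_FILE=pytorch-${tag_or_branch}.tar.gz"
      } >> "$GITHUB_ENV"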

View File

@@ -70,8 +70,9 @@ jobs:
 pytorch-linux-jammy-py3-clang18-asan,
 pytorch-linux-jammy-py3-clang12-onnx,
 pytorch-linux-jammy-linter,
-pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter,
+pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
-pytorch-linux-jammy-py3-clang12-executorch,
+# Executorch pin needs update
+# pytorch-linux-jammy-py3-clang12-executorch,
 pytorch-linux-jammy-py3.12-triton-cpu,
 pytorch-linux-noble-riscv64-py3.12-gcc14
 ]

View File

@@ -62,7 +62,7 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
 DESIRED_PYTHON: "3.10"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_10-cpu-aarch64
 build_environment: linux-aarch64-binary-manywheel
@@ -128,11 +128,11 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
 DESIRED_PYTHON: "3.10"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_10-cuda-aarch64-12_6
 build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
 timeout-minutes: 420
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
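
Each |-separated entry in PYTORCH_EXTRA_INSTALL_REQUIREMENTS is a PEP 508 requirement with an environment marker, so the NVIDIA wheels are only resolved on Linux. The same marker syntax works directly with pip; a one-step illustration (the pin is copied from the hunk, the step itself is not from the workflow):

  - name: Install one marker-gated pin
    run: pip install "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux'"
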
@@ -174,11 +174,11 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
 DESIRED_PYTHON: "3.10"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_10-cuda-aarch64-12_8
 build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
 timeout-minutes: 420
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -220,11 +220,11 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
 DESIRED_PYTHON: "3.10"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_10-cuda-aarch64-13_0
 build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
 timeout-minutes: 420
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -265,7 +265,7 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
 DESIRED_PYTHON: "3.11"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_11-cpu-aarch64
 build_environment: linux-aarch64-binary-manywheel
@@ -331,11 +331,11 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
 DESIRED_PYTHON: "3.11"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_11-cuda-aarch64-12_6
 build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
 timeout-minutes: 420
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -377,11 +377,11 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
 DESIRED_PYTHON: "3.11"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_11-cuda-aarch64-12_8
 build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
 timeout-minutes: 420
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -423,11 +423,11 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
 DESIRED_PYTHON: "3.11"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_11-cuda-aarch64-13_0
 build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
 timeout-minutes: 420
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -468,7 +468,7 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
 DESIRED_PYTHON: "3.12"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_12-cpu-aarch64
 build_environment: linux-aarch64-binary-manywheel
@@ -534,11 +534,11 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
 DESIRED_PYTHON: "3.12"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_12-cuda-aarch64-12_6
 build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
 timeout-minutes: 420
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -580,11 +580,11 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
 DESIRED_PYTHON: "3.12"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_12-cuda-aarch64-12_8
 build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
 timeout-minutes: 420
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -626,11 +626,11 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
 DESIRED_PYTHON: "3.12"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_12-cuda-aarch64-13_0
 build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
 timeout-minutes: 420
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -671,7 +671,7 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
 DESIRED_PYTHON: "3.13"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_13-cpu-aarch64
 build_environment: linux-aarch64-binary-manywheel
@@ -737,11 +737,11 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
 DESIRED_PYTHON: "3.13"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_13-cuda-aarch64-12_6
 build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
 timeout-minutes: 420
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -783,11 +783,11 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
 DESIRED_PYTHON: "3.13"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_13-cuda-aarch64-12_8
 build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
 timeout-minutes: 420
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -829,11 +829,11 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
 DESIRED_PYTHON: "3.13"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_13-cuda-aarch64-13_0
 build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
 timeout-minutes: 420
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -874,7 +874,7 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
 DESIRED_PYTHON: "3.13t"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_13t-cpu-aarch64
 build_environment: linux-aarch64-binary-manywheel
@@ -940,11 +940,11 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
 DESIRED_PYTHON: "3.13t"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_13t-cuda-aarch64-12_6
 build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
 timeout-minutes: 420
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -986,11 +986,11 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
 DESIRED_PYTHON: "3.13t"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_13t-cuda-aarch64-12_8
 build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
 timeout-minutes: 420
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1032,11 +1032,11 @@ jobs:
 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
 DESIRED_PYTHON: "3.13t"
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
 ALPINE_IMAGE: "arm64v8/alpine"
 build_name: manywheel-py3_13t-cuda-aarch64-13_0
 build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
 timeout-minutes: 420
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1077,7 +1077,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14" DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cpu-aarch64 build_name: manywheel-py3_14-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
@ -1143,11 +1143,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.14" DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_6 build_name: manywheel-py3_14-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1189,11 +1189,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.14" DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_8 build_name: manywheel-py3_14-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1235,11 +1235,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14" DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0 build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1280,7 +1280,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14t" DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cpu-aarch64 build_name: manywheel-py3_14t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
@ -1346,11 +1346,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.14t" DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_6 build_name: manywheel-py3_14t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1392,11 +1392,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.14t" DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_8 build_name: manywheel-py3_14t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1438,11 +1438,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14t" DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0 build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
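
The PYTORCH_EXTRA_INSTALL_REQUIREMENTS values above pack one pinned PEP 508 requirement per |-separated field, each gated on platform_system == 'Linux'. A minimal Python sketch of how such a value can be split and inspected follows; the " | " delimiter handling is an assumption inferred from the value itself, not taken from the build scripts that actually consume the variable:

# Sketch: split a PYTORCH_EXTRA_INSTALL_REQUIREMENTS-style value into
# individual PEP 508 requirements (needs the "packaging" library).
# The " | " delimiter is inferred from the values above; the real
# consumer lives in the binary-build tooling.
from packaging.requirements import Requirement

extra = (
    "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | "
    "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux'"
)

for field in extra.split(" | "):
    req = Requirement(field)  # parses the name, the ==pin, and the marker
    print(f"{req.name} {req.specifier} (marker: {req.marker})")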

View File

@@ -316,6 +316,120 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm6_3-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.3
GPU_ARCH_VERSION: "6.3"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: libtorch-rocm6_3-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm6_3-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm6_3-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.3
GPU_ARCH_VERSION: "6.3"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm6_3-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: libtorch-cxx11-builder
custom-tag-prefix: rocm6.3
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm6_3-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm6_3-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.3
GPU_ARCH_VERSION: "6.3"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm6_3-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
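
The ROCm test job above wires up device access dynamically: the "ROCm set GPU_FLAG" step writes --device/--group-add flags into GITHUB_ENV, and the test-pytorch-binary action picks them up when it launches the container. A hypothetical Python sketch of that hand-off; the docker run shape is illustrative only, since the real invocation lives inside the action:

# Illustrative only: show how a GPU_FLAG string exported via GITHUB_ENV
# could be turned into docker arguments. The actual launch logic is in
# ./pytorch/.github/actions/test-pytorch-binary, not reproduced here.
import os
import shlex
import subprocess

gpu_flag = os.environ.get(
    "GPU_FLAG",
    "--device=/dev/mem --device=/dev/kfd --device=/dev/dri "
    "--group-add video --group-add daemon",
)
image = os.environ.get("DOCKER_IMAGE", "libtorch-cxx11-builder:rocm6.3")

cmd = ["docker", "run", "--rm", *shlex.split(gpu_flag), image, "rocminfo"]
subprocess.run(cmd, check=True)  # fails loudly if the ROCm devices are absent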
libtorch-rocm6_4-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -333,7 +447,6 @@ jobs:
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
- timeout-minutes: 300
build_name: libtorch-rocm6_4-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
@@ -430,118 +543,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm7_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.0
GPU_ARCH_VERSION: "7.0"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: libtorch-rocm7_0-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm7_0-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm7_0-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.0
GPU_ARCH_VERSION: "7.0"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm7_0-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: libtorch-cxx11-builder
custom-tag-prefix: rocm7.0
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm7_0-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm7_0-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.0
GPU_ARCH_VERSION: "7.0"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm7_0-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@@ -42,7 +42,7 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
- manywheel-py3_12-cuda13_0-build:
+ manywheel-py3_12-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
@@ -51,22 +51,22 @@ jobs:
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
- DESIRED_CUDA: cu130
- GPU_ARCH_VERSION: "13.0"
+ DESIRED_CUDA: cu128
+ GPU_ARCH_VERSION: "12.8"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
- DOCKER_IMAGE_TAG_PREFIX: cuda13.0
+ DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
- build_name: manywheel-py3_12-cuda13_0
+ build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
- manywheel-py3_12-cuda13_0-test: # Testing
+ manywheel-py3_12-cuda12_8-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- - manywheel-py3_12-cuda13_0-build
+ - manywheel-py3_12-cuda12_8-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
@@ -74,13 +74,13 @@ jobs:
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
- DESIRED_CUDA: cu130
- GPU_ARCH_VERSION: "13.0"
+ DESIRED_CUDA: cu128
+ GPU_ARCH_VERSION: "12.8"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
- DOCKER_IMAGE_TAG_PREFIX: cuda13.0
+ DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.12"
- build_name: manywheel-py3_12-cuda13_0
+ build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner

File diff suppressed because it is too large

View File

@@ -44,7 +44,7 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
- manywheel-py3_10-rocm6_4-build:
+ manywheel-py3_9-rocm6_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
@@ -58,17 +58,16 @@ jobs:
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
- DESIRED_PYTHON: "3.10"
+ DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
- timeout-minutes: 300
- build_name: manywheel-py3_10-rocm6_4
+ build_name: manywheel-py3_9-rocm6_4
build_environment: linux-binary-manywheel-rocm
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
- manywheel-py3_10-rocm6_4-test: # Testing
+ manywheel-py3_9-rocm6_4-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- - manywheel-py3_10-rocm6_4-build
+ - manywheel-py3_9-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
@@ -83,14 +82,14 @@ jobs:
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
- DESIRED_PYTHON: "3.10"
+ DESIRED_PYTHON: "3.9"
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
- name: manywheel-py3_10-rocm6_4
+ name: manywheel-py3_9-rocm6_4
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4

View File

@@ -31,8 +31,6 @@ jobs:
if: github.repository_owner == 'pytorch'
name: Get changed files
uses: ./.github/workflows/_get-changed-files.yml
- with:
- all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') }}
lintrunner-clang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@@ -55,7 +53,7 @@ jobs:
with:
timeout: 120
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
- docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter
+ docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter
# NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
# to run git rev-parse HEAD~:.ci/docker when a new image is needed
fetch-depth: 0
@@ -266,10 +264,10 @@ jobs:
with:
submodules: false
fetch-depth: 1
- - name: Setup Python 3.10
+ - name: Setup Python 3.9
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
- python-version: '3.10'
+ python-version: '3.9'
architecture: x64
cache: pip
- name: Install dependencies

View File

@@ -0,0 +1,46 @@
name: operator_microbenchmark_b200
on:
push:
tags:
- ciflow/op-benchmark/*
workflow_dispatch:
schedule:
# Run at 06:00 UTC everyday
- cron: 0 6 * * *
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
opmicrobenchmark-build:
if: github.repository_owner == 'pytorch'
name: opmicrobenchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
secrets: inherit
opmicrobenchmark-test:
name: opmicrobenchmark-test
uses: ./.github/workflows/_linux-test.yml
needs: opmicrobenchmark-build
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit
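
The concurrency group in this new workflow leans on GitHub expression semantics: a || b returns the first truthy operand, and a && b returns b only when a is truthy (otherwise it returns a, so a false guard interpolates as the string "false"). A rough Python model of the resulting key, with made-up event values as the assumptions:

# Rough model of the operator_microbenchmark_b200 concurrency key.
# The sample event values below are invented for illustration.
def gh_or(a, b):   # GitHub `a || b`: first truthy operand
    return a if a else b

def gh_and(a, b):  # GitHub `a && b`: b when a is truthy, otherwise a
    return b if a else a

def gh_str(v):     # booleans interpolate as lowercase strings
    return str(v).lower() if isinstance(v, bool) else str(v)

workflow = "operator_microbenchmark_b200"
pr_number, ref_name = "", "ciflow/op-benchmark/1234"   # a tag push, no PR
ref_type, sha, event = "tag", "deadbeef123", "push"

group = "-".join(gh_str(p) for p in (
    workflow,
    gh_or(pr_number, ref_name),
    gh_and(ref_type == "branch", sha),
    event == "workflow_dispatch",
    event == "schedule",
))
print(group)  # operator_microbenchmark_b200-ciflow/op-benchmark/1234-false-false-false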

View File

@@ -127,8 +127,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
- # More memory is needed to build with asan
- runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
@@ -318,6 +316,32 @@ jobs:
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
if: false # Docker build needs pin update
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-test:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3-clang12-executorch-build
if: false # Has been broken for a while
with:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml

View File

@@ -1,54 +0,0 @@
name: quantization-periodic
on:
push:
tags:
- ciflow/quantization-periodic/*
workflow_dispatch:
schedule:
# run weekly
- cron: "45 0 * * 0"
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-default-label-prefix:
name: get-default-label-prefix
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
periodic-quantization-build:
name: periodic-quantization-build
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '8.9'
test-matrix: |
{ include: [
{ config: "quantization", shard: 1, num_shards: 1, runner: "${{ needs.get-default-label-prefix.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
periodic-test-quantization:
name: periodic-test-quantization
uses: ./.github/workflows/_linux-test.yml
needs: periodic-quantization-build
with:
build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11
docker-image: ${{ needs.periodic-quantization-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-quantization-build.outputs.test-matrix }}
secrets: inherit

View File

@@ -140,8 +140,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
- # More memory is needed to build with asan
- runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

View File

@@ -1,76 +0,0 @@
# B200 Smoke Tests CI Workflow
#
# This workflow runs smoke tests on B200 hardware
#
# Flow:
# 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200
# 2. Runs smoke tests on linux.dgx.b200 runner
# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function
#
# Triggered by:
# - Pull requests modifying this workflow file
# - Manual dispatch
# - Schedule (every 6 hours)
# - Adding ciflow/b200 label to a PR (creates ciflow/b200/* tag)
name: B200 Smoke Tests
on:
pull_request:
paths:
- .github/workflows/test-b200.yml
workflow_dispatch:
schedule:
- cron: 0 4,10,16,22 * * * # every 6 hours
push:
tags:
- ciflow/b200/*
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_8-py3_10-gcc11-sm100-build:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "smoke_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
# config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm100-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit
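
The removed workflow's cuda-arch-list: '10.0' targets sm_100, the compute capability of B200. A quick runtime sanity check along those lines, assuming a CUDA-enabled torch build on the runner:

# Sanity check that the GPU matches the sm_100 (compute capability 10.0)
# target implied by cuda-arch-list above. Assumes a CUDA-enabled torch.
import torch

major, minor = torch.cuda.get_device_capability(0)
print(f"running on sm_{major}{minor}")
assert (major, minor) >= (10, 0), "expected a B200-class (sm_100) GPU"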

View File

@@ -259,27 +259,3 @@ jobs:
docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-test:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3-clang12-executorch-build
with:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit

View File

@@ -53,3 +53,27 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit
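
Each test-matrix block above is a JSON string that the reusable workflow feeds through fromJSON into strategy.matrix, so every include entry becomes one shard. A small sketch of that expansion under standard matrix-include semantics; the literal runner name stands in for the ${{ ... }} prefix interpolation:

# Sketch: expand a test-matrix JSON string the way a reusable
# _linux-test.yml workflow consumes it via fromJSON. The runner name
# is hard-coded here in place of the ${{ ... }} prefix interpolation.
import json

test_matrix = """
{ "include": [
    { "config": "xla", "shard": 1, "num_shards": 1, "runner": "linux.12xlarge" }
]}
"""

for cfg in json.loads(test_matrix)["include"]:
    print(f"{cfg['config']}: shard {cfg['shard']}/{cfg['num_shards']} on {cfg['runner']}")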

View File

@@ -36,8 +36,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
- # When building vLLM, uv doesn't like that we rename wheel without changing the wheel metadata
- allow-reuse-old-whl: false
build-additional-packages: "vision audio"
build-external-packages: "vllm"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11

.gitignore
View File

@@ -82,7 +82,6 @@ torch/return_types.pyi
torch/nn/functional.pyi
torch/utils/data/datapipes/datapipe.pyi
torch/csrc/autograd/generated/*
- torch/csrc/functionalization/generated/*
torch/csrc/lazy/generated/*.[!m]*
torch_compile_debug/
# Listed manually because some files in this directory are not generated
@@ -260,9 +259,6 @@ gen
.pytest_cache
aten/build/*
- # Linker scripts for prioritized text optimization
- cmake/linker_script.ld
# Bram
plsdontbreak

View File

@@ -49,7 +49,7 @@ init_command = [
'mccabe==0.7.0',
'pycodestyle==2.14.0',
'pyflakes==3.4.0',
- 'torchfix==0.4.0 ; python_version >= "3.10" and python_version < "3.13"',
+ 'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"',
]
@@ -123,7 +123,6 @@ is_formatter = true
code = 'MYPY'
include_patterns = [
'setup.py',
- 'functorch/dim/**/*.py',
'torch/**/*.py',
'torch/**/*.pyi',
'caffe2/**/*.py',
@@ -153,7 +152,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
- 'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"',
+ 'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"',
'numpy==2.1.0 ; python_version >= "3.12"',
'expecttest==0.3.0',
'mypy==1.16.0',
@@ -196,7 +195,6 @@ exclude_patterns = [
'tools/test/gen_operators_yaml_test.py',
'tools/test/gen_oplist_test.py',
'tools/test/test_selective_build.py',
- 'tools/experimental/dynamic_shapes/torchfuzz/**',
]
command = [
'python3',
@@ -966,6 +964,7 @@ exclude_patterns = [
'test/jit/**', # should be run through test/test_jit.py
'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py
'test/fx/**', # should be run through test/test_fx.py
+ 'test/bottleneck_test/**', # excluded by test/run_test.py
'test/package/**', # excluded by test/run_test.py
'test/distributed/argparse_util_test.py',
'test/distributed/bin/test_script.py',
@@ -1411,6 +1410,8 @@ exclude_patterns = [
'torch/utils/benchmark/utils/timer.py',
'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py',
'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py',
+ 'torch/utils/bottleneck/__init__.py',
+ 'torch/utils/bottleneck/__main__.py',
'torch/utils/bundled_inputs.py',
'torch/utils/checkpoint.py',
'torch/utils/collect_env.py',
@@ -1453,7 +1454,7 @@ init_command = [
'--dry-run={{DRYRUN}}',
'usort==1.0.8.post1',
'isort==6.0.1',
- 'ruff==0.13.1', # sync with RUFF
+ 'ruff==0.12.9', # sync with RUFF
]
is_formatter = true
@@ -1587,7 +1588,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
- 'ruff==0.13.1', # sync with PYFMT
+ 'ruff==0.12.9', # sync with PYFMT
]
is_formatter = true

View File

@@ -22,7 +22,6 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
- "-DUSE_DISTRIBUTED",
"-DAT_PER_OPERATOR_HEADERS",
"-DATEN_THREADING=NATIVE",
"-DNO_CUDNN_DESTROY_HANDLE",
@@ -91,8 +90,6 @@ generated_cpu_cpp = [
"aten/src/ATen/NativeMetaFunctions.h",
"aten/src/ATen/RegistrationDeclarations.h",
"aten/src/ATen/VmapGeneratedPlumbing.h",
- "aten/src/ATen/ViewMetaClasses.h",
- "aten/src/ATen/ViewMetaClasses.cpp",
"aten/src/ATen/core/aten_interned_strings.h",
"aten/src/ATen/core/enum_tag.h",
"aten/src/ATen/core/TensorBody.h",
@@ -813,7 +810,7 @@ cc_library(
name = "torch_python",
srcs = libtorch_python_core_sources
+ if_cuda(libtorch_python_cuda_sources)
- + if_cuda(libtorch_python_distributed_sources)
+ + libtorch_python_distributed_sources
+ GENERATED_AUTOGRAD_PYTHON,
hdrs = glob([
"torch/csrc/generic/*.cpp",
@@ -835,6 +832,36 @@ pybind_extension(
],
)
cc_library(
name = "functorch",
hdrs = glob([
"functorch/csrc/dim/*.h",
]),
srcs = glob([
"functorch/csrc/dim/*.cpp",
]),
deps = [
":aten_nvrtc",
":torch_python",
"@pybind11",
],
)
pybind_extension(
name = "functorch/_C",
copts=[
"-DTORCH_EXTENSION_NAME=_C"
],
srcs = [
"functorch/csrc/init_dim_only.cpp",
],
deps = [
":functorch",
":torch_python",
":aten_nvrtc",
],
)
cc_binary(
name = "torch/bin/torch_shm_manager",
srcs = [
@@ -875,6 +902,7 @@ py_library(
],
data = [
":torch/_C.so",
+ ":functorch/_C.so",
":torch/bin/torch_shm_manager",
],
)
@@ -1077,7 +1105,6 @@ test_suite(
"aten/src/ATen/templates/LazyNonNativeIr.h",
"aten/src/ATen/templates/RegisterDispatchKey.cpp",
"aten/src/ATen/templates/RegisterDispatchDefinitions.ini",
- "aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp",
"aten/src/ATen/native/native_functions.yaml",
"aten/src/ATen/native/tags.yaml",
"aten/src/ATen/native/ts_native_functions.yaml",

View File

@ -1,4 +1,5 @@
cmake_minimum_required(VERSION 3.27 FATAL_ERROR) cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW)
# Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this
# sometimes makes XCode C compiler gets detected as "Clang", even when the C++ # sometimes makes XCode C compiler gets detected as "Clang", even when the C++
@ -180,8 +181,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
set(CPU_POWER ON) set(CPU_POWER ON)
endif() endif()
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not # For non-supported platforms, turn USE_DISTRIBUTED off by default.
# tested and likely won't work without additional changes. # NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
if(NOT LINUX AND NOT WIN32) if(NOT LINUX AND NOT WIN32)
set(USE_DISTRIBUTED set(USE_DISTRIBUTED
OFF OFF
@@ -261,11 +263,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF) option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF) option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF) cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Use distributed" ON) option(USE_DISTRIBUTED "Enable default distributed backends" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_XPU;UNIX;NOT APPLE" OFF) "USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF) cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF) cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@@ -378,13 +380,6 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder"
OFF "USE_CUDA" OFF) OFF "USE_CUDA" OFF)
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
"CPU_AARCH64" OFF) "CPU_AARCH64" OFF)
# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le.
set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
if(LINUX AND CPU_AARCH64)
set(USE_PRIORITIZED_TEXT_DEFAULT ON)
endif()
cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
"${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)
option(USE_MIMALLOC "Use mimalloc" OFF) option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance # Enable third party mimalloc library to improve memory allocation performance
@@ -437,12 +432,11 @@ if(WIN32)
PATH_SUFFIXES lib PATH_SUFFIXES lib
NO_DEFAULT_PATH) NO_DEFAULT_PATH)
if(NOT libuv_tmp_LIBRARY) if(NOT libuv_tmp_LIBRARY)
set(USE_DISTRIBUTED OFF)
set(USE_GLOO OFF) set(USE_GLOO OFF)
message( message(
WARNING WARNING
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. " "Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.51' to install libuv." "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
) )
else() else()
set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../) set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../)
@@ -663,11 +657,6 @@ endif(MSVC)
string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all") string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
# Set linker max-page-size to 64KiB on AArch64 Linux
if(LINUX AND CPU_AARCH64)
add_link_options_if_supported("-z,max-page-size=0x10000")
endif()
# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not # Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
# applicable to mobile are disabled by this variable. Setting # applicable to mobile are disabled by this variable. Setting
# `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it # `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it
@@ -902,7 +891,7 @@ IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
endif() endif()
# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100. # Set USE_FBGEMM_GENAI to ON for CUDA build on SM100.
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32) if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a") message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a")
set(USE_FBGEMM_GENAI ON) set(USE_FBGEMM_GENAI ON)
endif() endif()
@@ -1390,6 +1379,10 @@ endif()
include(cmake/Summary.cmake) include(cmake/Summary.cmake)
caffe2_print_configuration_summary() caffe2_print_configuration_summary()
if(BUILD_FUNCTORCH)
add_subdirectory(functorch)
endif()
# Parse custom debug info # Parse custom debug info
if(DEFINED USE_CUSTOM_DEBINFO) if(DEFINED USE_CUSTOM_DEBINFO)
string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}") string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}")
@@ -1428,57 +1421,3 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas" install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
DESTINATION "${CMAKE_INSTALL_BINDIR}") DESTINATION "${CMAKE_INSTALL_BINDIR}")
endif() endif()
if(USE_PRIORITIZED_TEXT_FOR_LD)
add_compile_options(
$<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
$<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
)
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
add_custom_command(
OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
COMMENT "Generating prioritized text linker files"
VERBATIM
)
add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
if(BUILD_PYTHON)
set(LINKER_OPT_TARGETS torch_python)
endif()
if(NOT BUILD_LIBTORCHLESS)
list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
if(USE_CUDA)
list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
endif()
if(USE_XPU)
list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
endif()
if(USE_ROCM)
list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
endif()
endif()
foreach(tgt IN LISTS LINKER_OPT_TARGETS)
if(TARGET ${tgt})
add_dependencies("${tgt}" generate_linker_script)
target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
else()
message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
endif()
endforeach()
else()
if(LINUX AND CPU_AARCH64)
message(WARNING [[
It is strongly recommended to enable linker script optimization for all AArch64 Linux builds.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
]])
endif()
endif()

View File

@@ -1,61 +1,20 @@
# Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html # Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html
# Include individual top-level files # Include source files in SDist
include CITATION.cff include CMakeLists.txt
include CODEOWNERS include *.bzl *.bazel .bazel* BUILD *.BUILD BUILD.* WORKSPACE
include Dockerfile include BUCK BUCK.*
include LICENSE include requirements*.txt
include MANIFEST.in include version.txt
include Makefile include [Mm]akefile *.[Mm]akefile [Mm]akefile.*
include NOTICE include [Dd]ockerfile *.[Dd]ockerfile [Dd]ockerfile.* .dockerignore
include .bc-linter.yml
include .clang-format .clang-tidy
include .cmakelintrc
include .coveragerc
include .dockerignore
include .editorconfig
include .flake8
include .gdbinit
include .lintrunner.toml
include .lldbinit
include codex_setup.sh
include docker.Makefile
include pyrefly.toml
include ubsan.supp
# Include bazel and BUCK related files
include BUILD.bazel BUCK.oss
include WORKSPACE
include *.bzl
include .bazelignore .bazelrc .bazelversion
# Include general configuration files
include *.ini
# Include important top-level information
include *.md
# Include technical text files at the moment, comprises
# version.txt, CMakeLists.txt, requirements.txt
include *.txt
# Include ctags configuration
include .ctags.d/*.ctags
# Include subfolders completely
graft .devcontainer
graft .vscode
graft android graft android
graft aten graft aten
graft benchmarks
graft binaries graft binaries
graft c10 graft c10
graft caffe2 graft caffe2
graft cmake graft cmake
graft docs
graft functorch graft functorch
graft ios
graft mypy_plugins
graft scripts
graft test
graft third_party graft third_party
graft tools graft tools
graft torch graft torch
@@ -63,37 +22,29 @@ graft torchgen
# FIXME: torch-xla build during codegen will fail if we include this file in wheel # FIXME: torch-xla build during codegen will fail if we include this file in wheel
exclude torchgen/BUILD.bazel exclude torchgen/BUILD.bazel
# The following exclusions omit parts from third-party dependencies that # Misc files and directories in SDist
# contain invalid symlinks[1] and that are not needed for pytorch, such as include *.md
# bindings for unused languages include CITATION.cff
prune third_party/flatbuffers/java include LICENSE NOTICE
prune third_party/flatbuffers/kotlin include mypy*.ini
prune third_party/ittapi/rust graft benchmarks
prune third_party/nccl/pkg/debian graft docs
prune third_party/opentelemetry-cpp/third_party/prometheus-cpp/cmake/project-import-* graft mypy_plugins
graft scripts
# The following document is also an invalid symlink[1] and superfluous
exclude third_party/flatbuffers/docs/source/CONTRIBUTING.md
# Omit autogenerated code
prune torchgen/packaged
# Omit caches, compiled, and scm related content
prune */__pycache__
prune **/.github
prune **/.gitlab
global-exclude *.o *.obj *.so *.dylib *.a *.pxd *.dll *.lib
global-exclude *.py[cod] *.swp *~
global-exclude .git .git-blame-ignore-revs .gitattributes .gitignore .gitmodules
global-exclude .gitlab-ci.yml
# Misc files needed for custom setuptools command # Misc files needed for custom setuptools command
include .gitignore include .gitignore
include .gitmodules include .gitmodules
# [1] Invalid symlinks for the purposes of Python source distributions are, # Include test suites in SDist
# according to the source distribution format[2] links pointing outside the graft test
# destination directory or links with a `..` component, which is those of include pytest.ini
# concern here. include .coveragerc
# [2] https://packaging.python.org/en/latest/specifications/source-distribution-format/#source-distribution-archive-features # Prune generated/compiled files
prune torchgen/packaged
prune */__pycache__
global-exclude *.o *.obj *.so *.a *.dylib *.pxd *.dll *.lib *.py[cod]
prune */.git
global-exclude .git *~ *.swp

View File

@@ -161,7 +161,7 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv)
#### Prerequisites #### Prerequisites
If you are installing from source, you will need: If you are installing from source, you will need:
- Python 3.10 or later - Python 3.9 or later
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux) - A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux)
- Visual Studio or Visual Studio Build Tool (Windows only) - Visual Studio or Visual Studio Build Tool (Windows only)
@@ -275,7 +275,7 @@ conda install pkg-config libuv
pip install mkl-static mkl-include pip install mkl-static mkl-include
# Add these packages if torch.distributed is needed. # Add these packages if torch.distributed is needed.
# Distributed package support on Windows is a prototype feature and is subject to changes. # Distributed package support on Windows is a prototype feature and is subject to changes.
conda install -c conda-forge libuv=1.51 conda install -c conda-forge libuv
``` ```
#### Install PyTorch #### Install PyTorch

View File

@@ -317,20 +317,10 @@ IF(USE_FBGEMM_GENAI)
-greedy-reverse-local-assignment=1 -greedy-reverse-local-assignment=1
-fhip-new-launch-api) -fhip-new-launch-api)
# Only compile for gfx942 for now.
# This is rather hacky, I could not figure out a clean solution :(
set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
endif()
set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
hip_add_library( hip_add_library(
fbgemm_genai STATIC fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip} ${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)

View File

@@ -65,24 +65,14 @@ DLDataType getDLDataType(const Tensor& t) {
break; break;
// TODO(#146647): use macro here instead of spelling out each shell dtype // TODO(#146647): use macro here instead of spelling out each shell dtype
case ScalarType::Float8_e5m2: case ScalarType::Float8_e5m2:
dtype.code = DLDataTypeCode::kDLFloat8_e5m2;
break;
case ScalarType::Float8_e5m2fnuz: case ScalarType::Float8_e5m2fnuz:
dtype.code = DLDataTypeCode::kDLFloat8_e5m2fnuz;
break;
case ScalarType::Float8_e4m3fn: case ScalarType::Float8_e4m3fn:
dtype.code = DLDataTypeCode::kDLFloat8_e4m3fn;
break;
case ScalarType::Float8_e4m3fnuz: case ScalarType::Float8_e4m3fnuz:
dtype.code = DLDataTypeCode::kDLFloat8_e4m3fnuz;
break;
case ScalarType::Float8_e8m0fnu: case ScalarType::Float8_e8m0fnu:
dtype.code = DLDataTypeCode::kDLFloat8_e8m0fnu; TORCH_CHECK_BUFFER(false, "float8 types are not supported by dlpack");
break; break;
case ScalarType::Float4_e2m1fn_x2: case ScalarType::Float4_e2m1fn_x2:
dtype.code = DLDataTypeCode::kDLFloat4_e2m1fn; TORCH_CHECK_BUFFER(false, "float4 types are not supported by dlpack");
dtype.lanes = 2;
dtype.bits = 4;
break; break;
case ScalarType::QInt8: case ScalarType::QInt8:
case ScalarType::QUInt8: case ScalarType::QUInt8:
@@ -187,11 +177,7 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat
ScalarType toScalarType(const DLDataType& dtype) { ScalarType toScalarType(const DLDataType& dtype) {
ScalarType stype = ScalarType::Undefined; ScalarType stype = ScalarType::Undefined;
if (dtype.code != DLDataTypeCode::kDLFloat4_e2m1fn) { TORCH_CHECK_BUFFER(dtype.lanes == 1, "ATen does not support lanes != 1");
TORCH_CHECK_BUFFER(
dtype.lanes == 1,
"ATen does not support lanes != 1 for dtype code", std::to_string(dtype.code));
}
switch (dtype.code) { switch (dtype.code) {
case DLDataTypeCode::kDLUInt: case DLDataTypeCode::kDLUInt:
switch (dtype.bits) { switch (dtype.bits) {
@@ -283,73 +269,6 @@ ScalarType toScalarType(const DLDataType& dtype) {
false, "Unsupported kDLBool bits ", std::to_string(dtype.bits)); false, "Unsupported kDLBool bits ", std::to_string(dtype.bits));
} }
break; break;
case DLDataTypeCode::kDLFloat8_e5m2:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e5m2;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e5m2 bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e5m2fnuz:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e5m2fnuz;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e5m2fnuz bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e4m3fn:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e4m3fn;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e4m3fn bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e4m3fnuz:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e4m3fnuz;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e4m3fnuz bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e8m0fnu:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e8m0fnu;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e8m0fnu bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat4_e2m1fn:
switch (dtype.bits) {
case 4:
switch (dtype.lanes) {
case 2:
stype = ScalarType::Float4_e2m1fn_x2;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat4_e2m1fn lanes ", std::to_string(dtype.lanes));
}
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat4_e2m1fn bits ", std::to_string(dtype.bits));
}
break;
default: default:
TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code)); TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code));
} }
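With the float8 and float4 branches removed on this side of the diff, `toScalarType` reduces to a lanes check followed by a switch on the `(code, bits)` pair. A self-contained sketch of that dispatch shape, using hypothetical stand-in types rather than the real DLPack/ATen enums:

```cpp
// Sketch only: Code, DLDataTypeSketch, ScalarTypeSketch, and
// toScalarTypeSketch are hypothetical stand-ins for the DLPack/ATen types.
#include <cstdint>
#include <stdexcept>
#include <string>

enum class Code : uint8_t { Int, UInt, Float, Bfloat, Bool };

struct DLDataTypeSketch {
  Code code;
  uint8_t bits;
  uint16_t lanes;
};

enum class ScalarTypeSketch { Char, Byte, Float, Double, BFloat16, Bool };

ScalarTypeSketch toScalarTypeSketch(const DLDataTypeSketch& dt) {
  // Lanes are checked first; multi-lane dtypes are rejected outright.
  if (dt.lanes != 1) throw std::runtime_error("ATen does not support lanes != 1");
  switch (dt.code) {
    case Code::Int:
      if (dt.bits == 8) return ScalarTypeSketch::Char;
      break;
    case Code::UInt:
      if (dt.bits == 8) return ScalarTypeSketch::Byte;
      break;
    case Code::Float:
      if (dt.bits == 32) return ScalarTypeSketch::Float;
      if (dt.bits == 64) return ScalarTypeSketch::Double;
      break;
    case Code::Bfloat:
      if (dt.bits == 16) return ScalarTypeSketch::BFloat16;
      break;
    case Code::Bool:
      if (dt.bits == 8) return ScalarTypeSketch::Bool;
      break;
  }
  // Anything outside the supported (code, bits) set is an error.
  throw std::runtime_error("Unsupported dtype, bits " + std::to_string(dt.bits));
}
```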
@@ -401,13 +320,30 @@ T* toDLPackImpl(const Tensor& src) {
// The following code detects whether the src follows // The following code detects whether the src follows
// a continuous pattern. If the src follows such pattern (common-case) // a continuous pattern. If the src follows such pattern (common-case)
// then we do not need to normalize the strides. // then we do not need to normalize the strides.
bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1; bool need_normalize_strides = false;
int64_t expected_stride = 1;
for (int i = src.dim() - 1; i >= 0; i--) {
// detect if we do not meet continuous pattern
// and the size is 1, so there is opportunity to normalize
if (src.stride(i) != expected_stride && src.size(i) == 1) {
need_normalize_strides = true;
break;
}
expected_stride *= src.size(i);
}
// less common case, try normalizing the strides // less common case, try normalizing the strides
if (need_normalize_strides) { if (need_normalize_strides) {
// create a new tensor with possibly normalized strides // create a new tensor with possibly normalized strides
// gh-83069 // gh-83069
auto shape = src.sizes(); auto shape = src.sizes();
view = src.as_strided(shape, {1}, src.storage_offset()); auto strides = src.strides().vec();
for (int i = 0; i < src.dim(); i++) {
if (shape[i] < 2) {
strides[i] = 1;
}
}
view = src.as_strided(shape, strides, src.storage_offset());
} }
ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>); ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>);
@@ -418,8 +354,8 @@ T* toDLPackImpl(const Tensor& src) {
atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device()); atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device());
atDLMTensor->tensor.dl_tensor.ndim = static_cast<int32_t>(src.dim()); atDLMTensor->tensor.dl_tensor.ndim = static_cast<int32_t>(src.dim());
atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src); atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src);
atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(view.sizes().data()); atDLMTensor->tensor.dl_tensor.shape = view.sizes().data();
atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(view.strides().data()); atDLMTensor->tensor.dl_tensor.strides = view.strides().data();
atDLMTensor->tensor.dl_tensor.byte_offset = 0; atDLMTensor->tensor.dl_tensor.byte_offset = 0;
fillVersion(&atDLMTensor->tensor); fillVersion(&atDLMTensor->tensor);
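The stride-normalization hunk above (gh-83069) hinges on one fact: a dimension of size 0 or 1 can carry an arbitrary stride in ATen, which trips DLPack consumers that validate contiguity. A self-contained sketch of the detection-plus-rewrite logic, mirroring the loop shown in the diff (function name is hypothetical):

```cpp
// Hedged sketch of the stride normalization above: walk dims from innermost
// to outermost tracking the contiguous "expected" stride; if a size-1 dim
// deviates, rewrite every size<2 dim's stride to 1 before export.
#include <cstddef>
#include <cstdint>
#include <vector>

// sizes/strides mirror Tensor::sizes()/strides(); returns fixed strides.
std::vector<int64_t> normalizeStridesSketch(
    const std::vector<int64_t>& sizes,
    std::vector<int64_t> strides) {
  int64_t expected_stride = 1;
  bool need_normalize = false;
  for (int64_t i = static_cast<int64_t>(sizes.size()) - 1; i >= 0; --i) {
    // A mismatch only matters (and is only fixable) when the dim has size 1.
    if (strides[i] != expected_stride && sizes[i] == 1) {
      need_normalize = true;
      break;
    }
    expected_stride *= sizes[i];
  }
  if (need_normalize) {
    for (size_t i = 0; i < sizes.size(); ++i) {
      if (sizes[i] < 2) {
        strides[i] = 1;  // any stride is valid for a size-0/1 dim; pick 1
      }
    }
  }
  return strides;
}
```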

View File

@@ -468,7 +468,7 @@ inline Tensor _sum_to(
// if we assume no reduction due to unbacked we ensure that at runtime. // if we assume no reduction due to unbacked we ensure that at runtime.
TORCH_MAYBE_SYM_CHECK( TORCH_MAYBE_SYM_CHECK(
sym_eq(shape[i - leading_dims], sizes[i]), sym_eq(shape[i - leading_dims], sizes[i]),
"non-reduction path was assumed due to unbacked symbols expected those two sizes to be the same:", "non-reduction path was assumed due to unabcked symbols expected those two sizes to be the same:",
shape[i - leading_dims], shape[i - leading_dims],
", ", ", ",
sizes[i]) sizes[i])

View File

@@ -9,6 +9,11 @@
namespace at::functionalization { namespace at::functionalization {
ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
if (out_idx == this->out_index) return *this;
return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx);
}
// Note [Functionalization: Alias Removal Part 2] // Note [Functionalization: Alias Removal Part 2]
// See Note [Functionalization: Alias Removal] for more details. // See Note [Functionalization: Alias Removal] for more details.
// This function applies a single update from one of the views to the StorageImpl. // This function applies a single update from one of the views to the StorageImpl.
@@ -37,12 +42,12 @@ namespace at::functionalization {
static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) { static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) {
at::Tensor t = update.new_val; at::Tensor t = update.new_val;
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
if (update.view_metas.empty()) { return t; } if (update.view_metas.empty()) return t;
std::vector<at::Tensor> tmp_values({base}); std::vector<at::Tensor> tmp_values({base});
tmp_values.reserve(update.view_metas.size()); tmp_values.reserve(update.view_metas.size());
for (size_t i = 0; i < update.view_metas.size() - 1; ++i) { for (size_t i = 0; i < update.view_metas.size() - 1; ++i) {
at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back()); at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index);
// NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided // NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided
// All of these ops require additional information to recover the sizes of the original tensor. // All of these ops require additional information to recover the sizes of the original tensor.
// If need to, we could probably apply this optimization and only bother computing tmp_values // If need to, we could probably apply this optimization and only bother computing tmp_values
@@ -50,8 +55,9 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co
tmp_values.push_back(std::move(next_view)); tmp_values.push_back(std::move(next_view));
} }
for(int64_t i = static_cast<int64_t>(update.view_metas.size()) - 1; i >= 0; --i) { for(int64_t i = static_cast<int64_t>(update.view_metas.size()) - 1; i >= 0; --i) {
int64_t out_idx = update.view_metas[i].out_index;
// Each view inverse is implemented in ViewInverses.cpp. // Each view inverse is implemented in ViewInverses.cpp.
t = update.view_metas[i]->reverse(tmp_values[i], t); t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx);
} }
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
return t; return t;
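The `apply_update` hunk above is the heart of mutation replay: walk the view chain forward collecting intermediate bases, then walk it backward applying each inverse so the mutation lands on the original storage. A self-contained sketch of that two-pass loop, using the lambda-based `ViewMeta` shape from this branch (`TensorSketch`/`ViewMetaSketch` are illustrative stand-ins, not the ATen types):

```cpp
// Hedged sketch of the forward-then-reverse replay in apply_update above.
#include <cstdint>
#include <functional>
#include <vector>

struct TensorSketch { std::vector<double> data; };  // stand-in for at::Tensor

struct ViewMetaSketch {
  std::function<TensorSketch(const TensorSketch&, int64_t)> forward_fn;
  std::function<TensorSketch(const TensorSketch&, const TensorSketch&, int64_t)>
      reverse_fn;
  int64_t out_index = 0;
};

TensorSketch applyUpdateSketch(const TensorSketch& new_val,
                               const TensorSketch& base,
                               const std::vector<ViewMetaSketch>& metas) {
  TensorSketch t = new_val;
  if (metas.empty()) return t;
  // Forward pass: record each intermediate base so every reverse_fn later
  // sees a tensor with the right metadata (needed by select/slice/etc.).
  std::vector<TensorSketch> tmp_values{base};
  for (size_t i = 0; i + 1 < metas.size(); ++i) {
    tmp_values.push_back(
        metas[i].forward_fn(tmp_values.back(), metas[i].out_index));
  }
  // Reverse pass: fold the mutated value back onto the base, view by view.
  for (int64_t i = static_cast<int64_t>(metas.size()) - 1; i >= 0; --i) {
    t = metas[i].reverse_fn(tmp_values[i], t, metas[i].out_index);
  }
  return t;
}
```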
@@ -105,13 +111,13 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_)); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_));
} }
void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<std::shared_ptr<ViewMeta>>& metas) { void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& metas) {
TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage"); TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage");
if (metas.size() > 1) { if (metas.size() > 1) {
for (size_t i = 1; i < metas.size(); ++i) { for (size_t i = 1; i < metas.size(); ++i) {
// Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI // Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI
TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided, TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided,
"During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i, "During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i,
" was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today," " was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today,"
"so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you " "so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you "

View File

@@ -8,89 +8,44 @@ namespace at::functionalization {
// See Note [Functionalization Pass In Core] // See Note [Functionalization Pass In Core]
enum class InverseReturnMode {
/// Specifies that functional inverses should always return a view.
AlwaysView,
/// Specifies that functional inverses should always return a non-view / copy.
NeverView,
/// Specifies that functional inverses should return a view unless a (copying)
/// scatter
/// inverse exists, in which case that will be used instead.
/// This avoids as_strided() calls that can be difficult for subclasses to
/// handle.
ViewOrScatterInverse,
};
#define FUNCTIONALIZATION_VIEWMETA_NAME(TYPE) \
static const char* name() { \
return #TYPE; \
}
#define FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(...) \
using SerializableTuple = std::tuple<__VA_ARGS__>
// ViewMeta is a class used by the functionalization pass to navigate between // ViewMeta is a class used by the functionalization pass to navigate between
// a base tensor and a view tensor. // a base tensor and a view tensor.
// For example, if I call `b = a.view1(...)` // For example, if I call `b = a.view1(...)`
// the functionalization pass will generate and store a ViewMeta specialization // the functionalization pass will generate and store a ViewMeta on b that looks
// for `view1` operation on b that looks like: // like:
// //
// struct TORCH_API view1_ViewMeta : public ViewMeta { // ViewMeta(
// FUNCTIONALIZATION_VIEWMETA_NAME(view1_ViewMeta); // [<captures>](const Tensor& base, int64_t mutated_view_idx) {
// FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( // return base.view1(...);
// bool /* reapply_views */, // },
// const std::vector<int64_t>&); // [<captures>](const at::Tensor& base, const at::Tensor& mutated_view,
// // int64_t mutated_view_idx) -> at::Tensor {
// view1_ViewMeta(const SerializableTuple& tpl) // return at::functionalization::impl::view1_inverse(base, mutated_view,
// : view1_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} // ...);
//
// view1_ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
// : ViewMeta(/*has_symbolic_inputs=*/false),
// reapply_views(reapply_views),
// size(size) {}
//
// Tensor forward(const Tensor& base) override {
// return base.view1(...);
// } // }
// //
// Tensor reverse(const Tensor& base, const Tensor& mutated_view) override { // The forward_fn lambda describes how to replay view1 on a tensor.
// return at::functionalization::impl::view1_inverse(base, mutated_view,
// ...);
// }
// //
// SerializableTuple to_serializable_tuple() { // The reverse_fn lambda describes how, given a tensor that is already a view,
// return std::make_tuple(reapply_views, size);
// }
//
// bool reapply_views;
// std::vector<int64_t> size;
// };
//
// The forward function describes how to replay view1 on a tensor.
//
// The reverse function describes how, given a tensor that is already a view,
// how to get the corresponding base tensor. See Note [Functionalization Pass: // how to get the corresponding base tensor. See Note [Functionalization Pass:
// View Inverses] for details. // View Inverses] for details.
//
// `SerializedTuple` is a typedef that defines an `std::tuple<...>` type
// representing the `ViewMeta` instance state. Methods that take in/return such
// a type are used for supporting pickle serialization.
struct ViewMeta { struct ViewMeta {
ViewMeta( ViewMeta(
std::function<Tensor(const Tensor&, int64_t)> forward,
std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse,
bool has_symbolic_inputs, bool has_symbolic_inputs,
bool is_multi_output = false, bool is_multi_output = false,
bool is_as_strided = false, bool is_as_strided = false,
int64_t out_idx = 0) int64_t out_idx = 0)
: out_index(out_idx), : forward_fn(std::move(forward)),
reverse_fn(std::move(reverse)),
out_index(out_idx),
is_multi_output(is_multi_output), is_multi_output(is_multi_output),
is_as_strided(is_as_strided), is_as_strided(is_as_strided),
has_symbolic_inputs(has_symbolic_inputs) {} has_symbolic_inputs(has_symbolic_inputs) {}
virtual ~ViewMeta() = default; std::function<Tensor(const Tensor&, int64_t)> forward_fn;
std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse_fn;
virtual Tensor forward(const Tensor& base) = 0;
virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0;
// See Note [out_idx in ViewMeta] // See Note [out_idx in ViewMeta]
int64_t out_index; int64_t out_index;
@@ -102,17 +57,10 @@ struct ViewMeta {
// Tells us if this view operation has any symbolic inputs // Tells us if this view operation has any symbolic inputs
bool has_symbolic_inputs; bool has_symbolic_inputs;
// Returns a new ViewMeta with the same forward/reverse // Returns a copy of the current ViewMeta, if out_idx matches the current
// out_index. Otherwise, returns a new ViewMeta with the same forward/reverse
// functions, but a new out index. // functions, but a new out index.
// ViewMeta to_out_idx(int64_t out_idx);
// This method should be implemented by those `ViewMeta` that have more than
// one output.
virtual std::shared_ptr<ViewMeta> to_out_index(int64_t out_index) {
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"ViewMeta::to_out_index not implemented. ",
"Likely because there's only one output.");
}
}; };
// FunctionalStorageImpl is a subclass of StorageImpl used by the // FunctionalStorageImpl is a subclass of StorageImpl used by the
@@ -145,14 +93,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
// NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
const at::Tensor new_val; const at::Tensor new_val;
// NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
const std::vector<std::shared_ptr<ViewMeta>> view_metas; const std::vector<ViewMeta> view_metas;
}; };
explicit FunctionalStorageImpl(const Tensor& value); explicit FunctionalStorageImpl(const Tensor& value);
void add_update( void add_update(
const Tensor& updated_val, const Tensor& updated_val,
const std::vector<std::shared_ptr<ViewMeta>>& view_metas); const std::vector<ViewMeta>& view_metas);
bool apply_updates(); bool apply_updates();
const Tensor& base() { const Tensor& base() {
return base_; return base_;
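To make the lambda-based `ViewMeta` concrete, here is a sketch of how a single-output view op could package its replay and inverse logic against the constructor shown in this header. The op choice (transpose) and the factory function are illustrative assumptions; only the constructor signature comes from the diff above.

```cpp
// Hedged sketch against the branch's FunctionalStorageImpl.h shown above;
// make_transpose_view_meta is a hypothetical helper, not ATen codegen output.
#include <ATen/ATen.h>
#include <ATen/FunctionalStorageImpl.h>

at::functionalization::ViewMeta make_transpose_view_meta(int64_t d0, int64_t d1) {
  return at::functionalization::ViewMeta(
      // forward_fn: replay the view on a (possibly regenerated) base.
      [d0, d1](const at::Tensor& base, int64_t /*mutated_view_idx*/) {
        return base.transpose(d0, d1);
      },
      // reverse_fn: map a mutated view back onto the base; for transpose,
      // applying the same transpose again undoes the view.
      [d0, d1](const at::Tensor& /*base*/, const at::Tensor& mutated_view,
               int64_t /*mutated_view_idx*/) {
        return mutated_view.transpose(d0, d1);
      },
      /*has_symbolic_inputs=*/false);
}
```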

View File

@@ -129,19 +129,17 @@ void FunctionalTensorWrapper::freeze_storage() const {
// - view_value: The output tensor that we need to wrap. // - view_value: The output tensor that we need to wrap.
// - base: The "base" of the view that `view_value` was generated from. // - base: The "base" of the view that `view_value` was generated from.
// See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic. // See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic.
FunctionalTensorWrapper::FunctionalTensorWrapper( FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta)
const Tensor& view_value, : c10::TensorImpl(
const FunctionalTensorWrapper* base, c10::DispatchKeySet(DispatchKey::Functionalize),
const std::shared_ptr<functionalization::ViewMeta>& meta) view_value.dtype(),
: c10::TensorImpl( view_value.device()
c10::DispatchKeySet(DispatchKey::Functionalize), ),
view_value.dtype(), value_(view_value),
base->storage().data_ptr().device()), is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
value_(view_value), was_storage_changed_(base->was_storage_changed_),
is_multi_output_view_( is_symbolic_(base->is_symbolic_)
base->is_multi_output_view_ || meta->is_multi_output), {
was_storage_changed_(base->was_storage_changed_),
is_symbolic_(base->is_symbolic_) {
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_)); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_));
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
set_constructor_metadata(); set_constructor_metadata();
@@ -150,10 +148,11 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(
view_metas_ = base->view_metas_; // copy view_metas_ = base->view_metas_; // copy
} }
view_metas_.push_back(meta); view_metas_.push_back(meta);
maybe_mark_symbolic(meta.get()); maybe_mark_symbolic(meta);
storage_ = base->storage_; // alias this tensor's storage with the base tensor's storage_ = base->storage_; // alias this tensor's storage with the base tensor's
} }
functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const { functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const {
return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl()); return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl());
} }
@@ -177,18 +176,18 @@ bool FunctionalTensorWrapper::is_up_to_date() const {
} }
// See Note [Functionalization Pass - Inplace View Ops] // See Note [Functionalization Pass - Inplace View Ops]
void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr<at::functionalization::ViewMeta>& meta) { void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) {
view_metas_.push_back(meta); view_metas_.push_back(meta);
// Manually track the fact that this tensor received a metadata mutation! // Manually track the fact that this tensor received a metadata mutation!
has_metadata_mutation_ = true; has_metadata_mutation_ = true;
// Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation. // Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation.
maybe_mark_symbolic(meta.get()); maybe_mark_symbolic(meta);
// Note [Functionalization Pass - Inplace View Ops] // Note [Functionalization Pass - Inplace View Ops]
// So, these ops are special - they're mutation AND view ops. They get special codegen. // So, these ops are special - they're mutation AND view ops. They get special codegen.
// An example is transpose_, e.g. `a.transpose_()` // An example is transpose_, e.g. `a.transpose_()`
// Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas. // Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas.
at::AutoDispatchSkipFunctionalize guard; at::AutoDispatchSkipFunctionalize guard;
value_ = meta->forward(value_); value_ = meta.forward_fn(value_, meta.out_index);
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
} }
@@ -369,8 +368,15 @@ void FunctionalTensorWrapper::sync_() {
regenerate_from_base(); regenerate_from_base();
} }
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& FunctionalTensorWrapper::view_metas() const { Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) {
return view_metas_; auto t = base;
// Reapply views to get the viewed tensor from the base in alias_
for (auto& view_meta: view_metas_) {
t = view_meta.forward_fn(t, view_meta.out_index);
}
return t;
} }
void FunctionalTensorWrapper::regenerate_from_base() { void FunctionalTensorWrapper::regenerate_from_base() {
@@ -379,7 +385,7 @@ void FunctionalTensorWrapper::regenerate_from_base() {
auto t = storage_impl->base(); auto t = storage_impl->base();
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_); t = apply_view_metas(t);
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
replace_(t, /*from_lazy_regenerate=*/true); replace_(t, /*from_lazy_regenerate=*/true);
@@ -479,10 +485,7 @@ void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptr<TensorI
c10::Device FunctionalTensorWrapper::device_custom() const { c10::Device FunctionalTensorWrapper::device_custom() const {
// The storage pointer already uses the underlying tensor custom device (if return value_.unsafeGetTensorImpl()->device();
// applicable) to extract the device. So, we don't have to recurse again by
// doing value_.unsafeGetTensorImpl()->device().
return storage().data_ptr().device();
} }
at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const { at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const {
return value_.unsafeGetTensorImpl()->sizes(); return value_.unsafeGetTensorImpl()->sizes();
@@ -721,11 +724,11 @@ bool isFunctionalTensor(const std::optional<Tensor>& t) {
} }
bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) { bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
if (t_list.empty()) { return false; } if (t_list.empty()) return false;
auto functional_count = 0; auto functional_count = 0;
for (const auto i : c10::irange(t_list.size())) { for (const auto i : c10::irange(t_list.size())) {
auto const & e= t_list[i]; auto const & e= t_list[i];
if (!e.has_value() || !e->defined()) { continue; } if (!e.has_value() || !e->defined()) continue;
if (isFunctionalTensor(e)) { if (isFunctionalTensor(e)) {
++functional_count; ++functional_count;
} }
@@ -735,10 +738,10 @@ bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
template <typename T> template <typename T>
static bool isFunctionalTensorIListRef(c10::IListRef<T> list) { static bool isFunctionalTensorIListRef(c10::IListRef<T> list) {
if (list.size() == 0) { return false; } if (list.size() == 0) return false;
auto functional_count = 0; auto functional_count = 0;
for (const auto& tensor : list) { for (const auto& tensor : list) {
if (!tensor.defined()) { continue; } if (!tensor.defined()) continue;
if (isFunctionalTensor(tensor)) { if (isFunctionalTensor(tensor)) {
++functional_count; ++functional_count;
} }
@@ -756,28 +759,20 @@ void freeze_functional_tensor(const Tensor& tensor) {
functional_base_impl->freeze_storage(); functional_base_impl->freeze_storage();
} }
Tensor create_functional_tensor_with_view_meta( Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) {
const at::Tensor& view_to_wrap,
const at::Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta,
int64_t out_idx) {
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap)); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap));
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base)); TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base));
auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base); auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base);
auto meta_ = meta;
if (out_idx != 0) { if (out_idx != 0) {
// Note [out_idx in ViewMeta] // Note [out_idx in ViewMeta]
// When a view op outputs multiple tensors, each output needs its own separate ViewMeta. // When a view op outputs multiple tensors, each output needs its own separate ViewMeta.
// Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function. // Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function.
meta_ = meta->to_out_index(out_idx); meta = meta.to_out_idx(out_idx);
} }
return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta_); return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta);
} }
std::vector<Tensor> create_functional_tensor_with_view_meta( std::vector<Tensor> create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) {
ITensorListRef view_to_wrap,
const at::Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta) {
std::vector<Tensor> outputs(view_to_wrap.size()); std::vector<Tensor> outputs(view_to_wrap.size());
int64_t i = 0; int64_t i = 0;
for (const auto& tensor : view_to_wrap) { for (const auto& tensor : view_to_wrap) {
@@ -787,22 +782,12 @@ std::vector<Tensor> create_functional_tensor_with_view_meta(
return outputs; return outputs;
} }
void mutate_view_meta(const at::Tensor& self, const std::shared_ptr<functionalization::ViewMeta>& meta) { void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) {
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self)); TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self));
auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self); auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
self_impl->mutate_view_meta(meta); self_impl->mutate_view_meta(meta);
} }
Tensor apply_view_meta_sequence(
const Tensor& base,
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence) {
Tensor r = base;
for (auto& vm : sequence) {
r = vm->forward(r);
}
return r;
}
// Note [Propagating strides in the functionalization pass] // Note [Propagating strides in the functionalization pass]
// In order to properly compute stride information, the functionalization pass // In order to properly compute stride information, the functionalization pass
// calls each {view} reference implementations with meta tensors. // calls each {view} reference implementations with meta tensors.
@@ -896,7 +881,7 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s
const auto& ivalue = returns[idx]; const auto& ivalue = returns[idx];
if (ivalue.isTensor()) { if (ivalue.isTensor()) {
const auto& t = ivalue.toTensor(); const auto& t = ivalue.toTensor();
if (!t.defined()) { continue; } if (!t.defined()) continue;
at::functionalization::impl::sync(t); at::functionalization::impl::sync(t);
auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t)); auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t));
(*stack)[returns_begin + idx] = t_new; (*stack)[returns_begin + idx] = t_new;

View File

@@ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
explicit FunctionalTensorWrapper( explicit FunctionalTensorWrapper(
const Tensor& view_value, const Tensor& view_value,
const FunctionalTensorWrapper* base, const FunctionalTensorWrapper* base,
const std::shared_ptr<functionalization::ViewMeta>& meta); const functionalization::ViewMeta& meta);
// Get the underlying, actual tensor, that doesn't know anything about // Get the underlying, actual tensor, that doesn't know anything about
// functionalization. // functionalization.
@@ -99,17 +99,17 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
->are_all_mutations_under_no_grad_or_inference_mode(); ->are_all_mutations_under_no_grad_or_inference_mode();
} }
void maybe_mark_symbolic(functionalization::ViewMeta* meta) { void maybe_mark_symbolic(const functionalization::ViewMeta& meta) {
is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs; is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs;
} }
bool is_symbolic() const { bool is_symbolic() const {
return is_symbolic_; return is_symbolic_;
} }
// Retrieves the ViewMeta sequence of this tensor. // Runs the forward_fn of every ViewMeta collected in the current instance
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& view_metas() // to some other base.
const; Tensor apply_view_metas(const Tensor& base);
// Sync's the underlying tensor with its alias, if it's out of date. This // Sync's the underlying tensor with its alias, if it's out of date. This
// involves two steps: 1) Apply any pending updates/mutations to the alias 2) // involves two steps: 1) Apply any pending updates/mutations to the alias 2)
@@ -146,8 +146,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
// from the base tensor. This method is used by inplace-view ops like // from the base tensor. This method is used by inplace-view ops like
// transpose_. It appends a ViewMeta to the existing stack, and refreshes the // transpose_. It appends a ViewMeta to the existing stack, and refreshes the
// tensor by replaying the views off of the alias. // tensor by replaying the views off of the alias.
void mutate_view_meta( void mutate_view_meta(const at::functionalization::ViewMeta& meta);
const std::shared_ptr<at::functionalization::ViewMeta>& meta);
// Custom implementation of self.set_(src) // Custom implementation of self.set_(src)
void set__impl(const FunctionalTensorWrapper* other); void set__impl(const FunctionalTensorWrapper* other);
@@ -286,7 +285,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
bool is_symbolic_ = false; bool is_symbolic_ = false;
size_t generation_ = 0; size_t generation_ = 0;
std::vector<std::shared_ptr<at::functionalization::ViewMeta>> view_metas_; std::vector<at::functionalization::ViewMeta> view_metas_;
protected: protected:
static void copy_tensor_metadata( static void copy_tensor_metadata(
@@ -378,20 +377,16 @@ TORCH_API void propagate_xla_data_direct(
Tensor create_functional_tensor_with_view_meta( Tensor create_functional_tensor_with_view_meta(
const Tensor& view_to_wrap, const Tensor& view_to_wrap,
const Tensor& base, const Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta, functionalization::ViewMeta meta,
int64_t out_idx = 0); int64_t out_idx = 0);
std::vector<Tensor> create_functional_tensor_with_view_meta( std::vector<Tensor> create_functional_tensor_with_view_meta(
ITensorListRef view_to_wrap, ITensorListRef view_to_wrap,
const Tensor& base, const Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta); const functionalization::ViewMeta& meta);
void mutate_view_meta( void mutate_view_meta(
const Tensor& self, const Tensor& self,
const std::shared_ptr<functionalization::ViewMeta>& meta); const functionalization::ViewMeta& meta);
TORCH_API Tensor apply_view_meta_sequence(
const Tensor& base,
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence);
void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out); void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out);
void set_sizes_strides_offset( void set_sizes_strides_offset(

View File

@@ -1,5 +1,3 @@
#include <ATen/FunctionalizeFallbackKernel.h>
#include <ATen/core/dispatch/Dispatcher.h> #include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/LegacyTypeDispatch.h> #include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/EmptyTensor.h> #include <ATen/EmptyTensor.h>
@@ -9,6 +7,7 @@
#include <torch/library.h> #include <torch/library.h>
#include <c10/util/irange.h> #include <c10/util/irange.h>
#include <c10/util/strides.h> #include <c10/util/strides.h>
#include <ATen/EmptyTensor.h>
#ifndef AT_PER_OPERATOR_HEADERS #ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/ATen.h> #include <ATen/ATen.h>
@@ -29,31 +28,6 @@
#include <utility> #include <utility>
#endif #endif
namespace at::functionalization {
Tensor resize__ViewMeta::forward(const Tensor& base) {
if (reapply_views) {
return base.as_strided(size, c10::contiguous_strides(size));
} else {
return at::as_strided_copy(base, size, c10::contiguous_strides(size));
}
}
Tensor resize__ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
return base.as_strided_scatter(
mutated_view, size, c10::contiguous_strides(size));
}
Tensor _unsafe_view_ViewMeta::forward(const Tensor& base) {
return at::_unsafe_view_symint(base, size);
}
Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
}
} // namespace at::functionalization
namespace { namespace {
void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) { void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) {
const auto& schema = op.schema(); const auto& schema = op.schema();
@@ -132,9 +106,7 @@ namespace {
const auto& ivalue = returns[idx]; const auto& ivalue = returns[idx];
if (ivalue.isTensor() && should_wrap_outputs) { if (ivalue.isTensor() && should_wrap_outputs) {
const auto& t = ivalue.toTensor(); const auto& t = ivalue.toTensor();
if (!t.defined()) { if (!t.defined()) continue;
continue;
}
auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t)); auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t));
(*stack)[returns_begin + idx] = t_new; (*stack)[returns_begin + idx] = t_new;
} else if (ivalue.isTensorList() && should_wrap_outputs) { } else if (ivalue.isTensorList() && should_wrap_outputs) {
@@ -197,8 +169,19 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch
// The output of resizing is equivalent to taking a slice of a larger tensor. // The output of resizing is equivalent to taking a slice of a larger tensor.
// We have to emulate this "slicing" with an as_strided call. // We have to emulate this "slicing" with an as_strided call.
auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS(); auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS();
auto view_meta = std::make_shared<at::functionalization::resize__ViewMeta>( at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
reapply_views, size.vec()); [reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
if (reapply_views) {
return base.as_strided(size, c10::contiguous_strides(size));
} else {
return at::as_strided_copy(base, size, c10::contiguous_strides(size));
}
},
[size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size));
},
/*has_symbolic_inputs=*/false
);
at::functionalization::impl::mutate_view_meta(self, view_meta); at::functionalization::impl::mutate_view_meta(self, view_meta);
return self; return self;
} }
@@ -317,11 +300,17 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt
tmp_output = at::_unsafe_view_symint(self_, size); tmp_output = at::_unsafe_view_symint(self_, size);
} }
bool has_symbolic_inputs = std::any_of( bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
auto view_meta = at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
std::make_shared<at::functionalization::_unsafe_view_ViewMeta>( [size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
has_symbolic_inputs, size.vec()); return at::_unsafe_view_symint(base, size);
},
[size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
},
/*has_symbolic_inputs=*/has_symbolic_inputs
);
auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta)); auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta));
// See Note [Propagating strides in the functionalization pass] // See Note [Propagating strides in the functionalization pass]

View File

@@ -1,58 +0,0 @@
#pragma once
#include <ATen/FunctionalStorageImpl.h>
namespace at::functionalization {
// `ViewMeta` implementation for `resize_` operation.
struct TORCH_API resize__ViewMeta : public ViewMeta {
FUNCTIONALIZATION_VIEWMETA_NAME(resize__ViewMeta)
FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
bool /* reapply_views */,
const std::vector<int64_t>&);
resize__ViewMeta(const SerializableTuple& tpl)
: resize__ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
resize__ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
: ViewMeta(/*has_symbolic_inputs=*/false),
reapply_views(reapply_views),
size(size) {}
Tensor forward(const Tensor& base) override;
Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
SerializableTuple to_serializable_tuple() {
return std::make_tuple(reapply_views, size);
}
bool reapply_views;
std::vector<int64_t> size;
};
// `ViewMeta` implementation for `_unsafe_view` operation.
struct TORCH_API _unsafe_view_ViewMeta : public ViewMeta {
FUNCTIONALIZATION_VIEWMETA_NAME(_unsafe_view_ViewMeta)
FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
bool /* has_symbolic_inputs */,
const std::vector<c10::SymInt>&);
_unsafe_view_ViewMeta(const SerializableTuple& tpl)
: _unsafe_view_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
_unsafe_view_ViewMeta(
bool has_symbolic_inputs,
const std::vector<c10::SymInt>& size)
: ViewMeta(has_symbolic_inputs), size(size) {}
Tensor forward(const Tensor& base) override;
Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
SerializableTuple to_serializable_tuple() {
return std::make_tuple(has_symbolic_inputs, size);
}
std::vector<c10::SymInt> size;
};
} // namespace at::functionalization

View File

@@ -45,39 +45,7 @@ inline void infer_size_impl(
} }
} }
if (infer_dim) { auto set_infer_dim = [&]() {
// numel is the product of known sizes, it has to be divisible by newsize.
// and newsize should be positive unless newsize == numel (we throw
// different) error message in that case.
if constexpr (std::is_same_v<NumelType, c10::SymInt>) {
auto v = newsize.maybe_as_int();
if (v and *v == 0) {
// Avoid div by 0 when sym_eq(numel % newsize, 0) is constructed!
// which may happen when newsize is not a symbol! if its a symbol
// division won't happen anyway during compile.
TORCH_MAYBE_SYM_CHECK(
numel == newsize,
"shape '",
shape,
"' is invalid for input of size ",
numel);
} else {
auto cond = sym_gt(newsize, 0)
.sym_and(sym_eq(numel % newsize, 0))
.sym_or(sym_eq(numel, newsize));
TORCH_MAYBE_SYM_CHECK(
cond, "shape '", shape, "' is invalid for input of size ", numel);
}
} else {
TORCH_CHECK(
(newsize > 0 && (numel % newsize == 0)) || numel == newsize,
"shape '",
shape,
"' is invalid for input of size ",
numel);
}
// We have a degree of freedom here to select the dimension size; follow // We have a degree of freedom here to select the dimension size; follow
// NumPy semantics and just bail. However, a nice error message is needed // NumPy semantics and just bail. However, a nice error message is needed
// because users often use `view` as a way to flatten & unflatten // because users often use `view` as a way to flatten & unflatten
@@ -86,15 +54,19 @@ inline void infer_size_impl(
// works yet // works yet
// empty_tensor.view(-1, 0) // empty_tensor.view(-1, 0)
// doesn't. // doesn't.
TORCH_MAYBE_SYM_CHECK( TORCH_CHECK(
newsize != 0, newsize != 0,
"cannot reshape tensor of 0 elements into shape ", "cannot reshape tensor of 0 elements into shape ",
shape, shape,
" because the unspecified dimension size -1 can be any " " because the unspecified dimension size -1 can be any "
"value and is ambiguous"); "value and is ambiguous");
res[*infer_dim] = numel / newsize; res[*infer_dim] = numel / newsize;
return; return;
};
if (infer_dim && newsize > 0 && numel % newsize == 0) {
set_infer_dim();
return;
} }
TORCH_MAYBE_SYM_CHECK( TORCH_MAYBE_SYM_CHECK(
@ -103,6 +75,9 @@ inline void infer_size_impl(
shape, shape,
"' is invalid for input of size ", "' is invalid for input of size ",
numel); numel);
if (infer_dim) {
set_infer_dim();
}
} }
inline std::vector<int64_t> infer_size(IntArrayRef shape, int64_t numel) { inline std::vector<int64_t> infer_size(IntArrayRef shape, int64_t numel) {
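In this version of the function, the `-1` dimension is filled in eagerly when the known sizes divide `numel` evenly, and otherwise only after the overall size check has produced its error message. A simplified sketch of the inference semantics for concrete integer sizes (illustrative only: it omits the `SymInt` branch and folds the two `set_infer_dim` call sites into one):

// Simplified, concrete-int sketch of `-1` dimension inference; not the
// ATen implementation.
#include <cstdint>
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>

std::vector<int64_t> infer_size(const std::vector<int64_t>& shape, int64_t numel) {
  int64_t newsize = 1;
  std::optional<size_t> infer_dim;
  std::vector<int64_t> res(shape.size());
  for (size_t d = 0; d < shape.size(); d++) {
    if (shape[d] == -1) {
      if (infer_dim) throw std::runtime_error("only one dimension can be inferred");
      infer_dim = d;
    } else {
      res[d] = shape[d];
      newsize *= shape[d];
    }
  }
  // The known sizes must tile numel exactly; -1 absorbs the remaining factor.
  if (!((newsize > 0 && numel % newsize == 0) || numel == newsize)) {
    throw std::runtime_error(
        "shape is invalid for input of size " + std::to_string(numel));
  }
  if (infer_dim) {
    if (newsize == 0)
      throw std::runtime_error("cannot reshape tensor of 0 elements: -1 is ambiguous");
    res[*infer_dim] = numel / newsize;
  }
  return res;
}
// infer_size({-1, 4}, 12) -> {3, 4}; infer_size({-1, 5}, 12) throws.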

View File

@@ -103,9 +103,7 @@ std::string get_cpu_capability() {
 #elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
     case native::CPUCapability::ZVECTOR:
       return "Z VECTOR";
-#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
-    case native::CPUCapability::SVE128:
-      return "SVE128";
+#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
     case native::CPUCapability::SVE256:
       return "SVE256";
 #else
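The string produced by this switch is what PyTorch reports as its runtime CPU dispatch capability. A minimal sketch of querying it from C++ (assuming `at::get_cpu_capability()` is the declaration behind the hunk header above, available via <ATen/Context.h>; the exact header and namespace are assumptions):

// Minimal sketch: print the dispatch capability string produced by the
// switch above. On an SVE256-enabled AArch64 build this would print
// "SVE256"; elsewhere e.g. "AVX2" or "DEFAULT".
#include <ATen/Context.h>
#include <iostream>

int main() {
  std::cout << "cpu capability: " << at::get_cpu_capability() << "\n";
  return 0;
}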

View File

@@ -1,22 +1,32 @@
 #include <ATen/core/PythonOpRegistrationTrampoline.h>
-#include <c10/core/impl/PyInterpreterHooks.h>
 
-// TODO: delete this
+// The strategy is that all python interpreters attempt to register themselves
+// as the main interpreter, but only one wins. Only that interpreter is
+// allowed to interact with the C++ dispatcher. Furthermore, when we execute
+// logic on that interpreter, we do so hermetically, never setting pyobj field
+// on Tensor.
 
 namespace at::impl {
 
-c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::interpreter_ = nullptr;
+std::atomic<c10::impl::PyInterpreter*>
+    PythonOpRegistrationTrampoline::interpreter_{nullptr};
 
 c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::getInterpreter() {
-  return c10::impl::getGlobalPyInterpreter();
+  return PythonOpRegistrationTrampoline::interpreter_.load();
 }
 
 bool PythonOpRegistrationTrampoline::registerInterpreter(
     c10::impl::PyInterpreter* interp) {
-  if (interpreter_ != nullptr) {
+  c10::impl::PyInterpreter* expected = nullptr;
+  interpreter_.compare_exchange_strong(expected, interp);
+  if (expected != nullptr) {
+    // This is the second (or later) Python interpreter, which means we need
+    // non-trivial hermetic PyObject TLS
+    c10::impl::HermeticPyObjectTLS::init_state();
     return false;
+  } else {
+    return true;
   }
-  interpreter_ = interp;
-  return true;
 }
 
 } // namespace at::impl
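The registration logic above relies on a detail of `std::atomic::compare_exchange_strong`: on failure it writes the current value back into `expected`, so `expected != nullptr` afterwards means another interpreter won the race. A standalone sketch of the same first-wins pattern (toy `Interpreter` type and names, not the ATen code):

// First-wins registration via compare_exchange_strong: only the first
// caller installs itself; later callers observe the winner in `expected`.
#include <atomic>
#include <iostream>

struct Interpreter { const char* name; };

std::atomic<Interpreter*> g_main{nullptr};

bool register_interpreter(Interpreter* interp) {
  Interpreter* expected = nullptr;
  g_main.compare_exchange_strong(expected, interp);
  if (expected != nullptr) {
    // Lost the race: a main interpreter already exists.
    return false;
  }
  return true;
}

int main() {
  Interpreter a{"a"}, b{"b"};
  std::cout << register_interpreter(&a) << register_interpreter(&b) << "\n"; // 10
  std::cout << g_main.load()->name << "\n"; // a
}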

View File

@@ -2,21 +2,19 @@
 
 #include <ATen/core/dispatch/Dispatcher.h>
 
-// TODO: We can get rid of this
+// TODO: this can probably live in c10
 
 namespace at::impl {
 
-// Manages the single Python interpreter instance for PyTorch.
 class TORCH_API PythonOpRegistrationTrampoline final {
-  static c10::impl::PyInterpreter* interpreter_;
+  static std::atomic<c10::impl::PyInterpreter*> interpreter_;
 
  public:
-  // Register the Python interpreter. Returns true on first registration,
-  // false if an interpreter was already registered.
+  // Returns true if you successfully registered yourself (that means
+  // you are in the hot seat for doing the operator registrations!)
   static bool registerInterpreter(c10::impl::PyInterpreter*);
 
-  // Returns the registered interpreter via the global PyInterpreter hooks.
   // Returns nullptr if no interpreter has been registered yet.
   static c10::impl::PyInterpreter* getInterpreter();
 };

View File

@@ -1234,7 +1234,7 @@ struct TORCH_API TupleType : public NamedType {
   std::shared_ptr<FunctionSchema> schema_;
 };
 
-// the common supertype of all Enums, only used in operator registration.
+// the common supertype of all Enums, only used in operator registraion.
 // EnumType <: AnyEnumType for all Enums
 struct AnyEnumType;
 using AnyEnumTypePtr = SingletonTypePtr<AnyEnumType>;

View File

@@ -102,31 +102,8 @@ struct VecReduceAllSIMD<float, Op> {
 #endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) &&
        // !defined(C10_MOBILE)
 
-#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
-#if defined(CPU_CAPABILITY_SVE256)
-template <typename Op>
-struct VecReduceAllSIMD<float, Op> {
-  static inline float apply(
-      const Op& vec_fun,
-      const Vectorized<float>& acc_vec) {
-    using Vec = Vectorized<float>;
-    Vec v = acc_vec;
-    // 128-bit shuffle
-    svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
-    Vec v1 = svtbl_f32(v, ind);
-    v = vec_fun(v, v1);
-    // 64-bit shuffle
-    ind = svdupq_n_u32(2, 3, 0, 1);
-    v1 = svtbl_f32(v, ind);
-    v = vec_fun(v, v1);
-    // 32-bit shuffle
-    ind = svdupq_n_u32(1, 0, 2, 3);
-    v1 = svtbl_f32(v, ind);
-    v = vec_fun(v, v1);
-    return svlasta(svpfalse(), v);
-  }
-};
-#else
+#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
+    !defined(CPU_CAPABILITY_SVE)
 template <typename Op>
 struct VecReduceAllSIMD<float, Op> {
   static inline float apply(
@@ -163,8 +140,35 @@ struct VecReduceAllSIMD<float, std::plus<Vectorized<float>>> {
     return vaddvq_f32(acc_vec);
   }
 };
-#endif // defined(CPU_CAPABILITY_SVE256)
 #endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
+       // && !defined(CPU_CAPABILITY_SVE)
+
+#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
+    defined(CPU_CAPABILITY_SVE256)
+template <typename Op>
+struct VecReduceAllSIMD<float, Op> {
+  static inline float apply(
+      const Op& vec_fun,
+      const Vectorized<float>& acc_vec) {
+    using Vec = Vectorized<float>;
+    Vec v = acc_vec;
+    // 128-bit shuffle
+    svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
+    Vec v1 = svtbl_f32(v, ind);
+    v = vec_fun(v, v1);
+    // 64-bit shuffle
+    ind = svdupq_n_u32(2, 3, 0, 1);
+    v1 = svtbl_f32(v, ind);
+    v = vec_fun(v, v1);
+    // 32-bit shuffle
+    ind = svdupq_n_u32(1, 0, 2, 3);
+    v1 = svtbl_f32(v, ind);
+    v = vec_fun(v, v1);
+    return svlasta(svpfalse(), v);
+  }
+};
+#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
+       // && defined(CPU_CAPABILITY_SVE256)
 
 template <typename scalar_t, typename Op>
 inline scalar_t vec_reduce_all(

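The SVE256 specialization reduces eight float lanes in three shuffle-and-combine rounds (128-, 64-, then 32-bit granularity) and then reads lane 0; `svlasta(svpfalse(), v)` yields the first element when the predicate is all-false. A portable scalar sketch of the same logarithmic tree reduction (illustrative only, not the ATen code path):

// Portable sketch of the log2 shuffle-reduce above: combine each lane with
// a partner at decreasing strides, so 8 lanes need 3 rounds. The SVE
// version does the same with svtbl_f32 shuffles and reads lane 0.
#include <array>
#include <functional>
#include <iostream>

template <typename Op>
float reduce8(std::array<float, 8> v, Op op) {
  for (int stride = 4; stride >= 1; stride /= 2) {
    for (int i = 0; i < stride; i++) {
      v[i] = op(v[i], v[i + stride]); // combine lane i with its partner
    }
  }
  return v[0];
}

int main() {
  std::array<float, 8> lanes{1, 2, 3, 4, 5, 6, 7, 8};
  std::cout << reduce8(lanes, std::plus<float>{}) << "\n"; // 36
}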
View File

@@ -1,21 +1,9 @@
 #pragma once
 
 #include <ATen/cpu/vec/intrinsics.h>
-#include <c10/macros/Macros.h>
-#include <cstdint>
 
 #include <ATen/cpu/vec/vec_base.h>
 
-#if defined(__aarch64__) && \
-    (defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) || \
-     defined(AT_BUILD_ARM_VECSVE_WITH_SLEEF))
-#define SLEEF_STATIC_LIBS
-#include <sleef.h>
-#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
-#else
-#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
-#endif
-
 #if defined(CPU_CAPABILITY_SVE)
 
 // Define the data type of VLS(vector-length specific).

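The block removed above is the usual SLEEF wiring: `USE_SLEEF(a, b)` expands to the SLEEF expression when the library is linked and to a fallback otherwise, so call sites stay free of `#ifdef`s. A sketch of how such a selector macro is consumed (`HAVE_SLEEF` and `my_sin` are illustrative stand-ins, not the real ATen build flags or call sites):

// The macro picks one of two expressions at preprocessing time, so a
// single call site covers both the SLEEF and the fallback build.
#include <cmath>
#include <cstdio>

#if defined(HAVE_SLEEF) // stand-in for the AT_BUILD_ARM_*_WITH_SLEEF checks
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
#else
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
#endif

float my_sin(float x) {
  // Real ATen call sites put a Sleef_* vector call in the first slot; here
  // both slots use libm so the sketch compiles without SLEEF installed.
  return USE_SLEEF(std::sin(x) /* imagine a Sleef_* call here */, std::sin(x));
}

int main() { std::printf("%f\n", my_sin(0.5f)); }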
Some files were not shown because too many files have changed in this diff.