Compare commits

..

1 Commits

Author SHA1 Message Date
ba053355cc lint: add .pyi to changed files on .pyi.in changes
We were observing issues where the lint on trunk vs. PRs would be different
due to missing .pyi files. This change adds the .pyi files to the changed files
list when .pyi.in files are changed.

Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
ghstack-source-id: a3ebda896c5f97854db0a143b666658c64a54f21
Pull-Request: https://github.com/pytorch/pytorch/pull/164603
2025-10-03 13:39:45 -07:00
2265 changed files with 26731 additions and 54090 deletions

View File

@ -8,8 +8,6 @@ if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
fi

View File

@ -37,9 +37,9 @@ case ${DOCKER_TAG_PREFIX} in
rocm*)
BASE_TARGET=rocm
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950, gfx115x conditionally starting in ROCm 7.0
# add gfx950 conditionally starting in ROCm 7.0
if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
fi
EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
;;

View File

@ -113,7 +113,6 @@ case "$tag" in
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INSTALL_MINGW=yes
;;
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
CUDA_VERSION=13.0.0
@ -182,7 +181,7 @@ case "$tag" in
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100"
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
if [[ $tag =~ "benchmarks" ]]; then
INDUCTOR_BENCHMARKS=yes
fi
@ -345,7 +344,7 @@ docker build \
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
--build-arg "KATEX=${KATEX:-}" \
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" \
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \
--build-arg "IMAGE_NAME=${IMAGE_NAME}" \
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \
@ -362,7 +361,6 @@ docker build \
--build-arg "OPENBLAS=${OPENBLAS:-}" \
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
--build-arg "INSTALL_MINGW=${INSTALL_MINGW:-}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \

View File

@ -1 +1 @@
deb42f2a8e48f5032b4a98ee781a15fa87a157cf
e0dda9059d082537cee36be6c5e4fe3b18c880c0

View File

@ -1 +1 @@
7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd
27664085f804afc83df26f740bb46c365854f2c4

View File

@ -83,6 +83,10 @@ function build_cpython {
py_suffix=${py_ver::-1}
py_folder=$py_suffix
fi
# Update to rc2 due to https://github.com/python/cpython/commit/c72699086fe4
if [ "$py_suffix" == "3.14.0" ]; then
py_suffix="3.14.0rc2"
fi
wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz
do_cpython_build $py_ver Python-$py_suffix

View File

@ -1,10 +0,0 @@
#!/bin/bash
set -ex
# Install MinGW-w64 for Windows cross-compilation
apt-get update
apt-get install -y g++-mingw-w64-x86-64-posix
echo "MinGW-w64 installed successfully"
x86_64-w64-mingw32-g++ --version

View File

@ -19,8 +19,8 @@ pip_install \
transformers==4.36.2
pip_install coloredlogs packaging
pip_install onnxruntime==1.23.0
pip_install onnxscript==0.5.4
pip_install onnxruntime==1.22.1
pip_install onnxscript==0.4.0
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/

View File

@ -39,20 +39,16 @@ case ${DOCKER_TAG_PREFIX} in
DOCKER_GPU_BUILD_ARG=""
;;
rocm*)
# we want the patch version of 7.0 instead
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
# we want the patch version of 6.4 instead
if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4"
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
BASE_TARGET=rocm
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950, gfx115x conditionally starting in ROCm 7.0
# add gfx950 conditionally starting in ROCm 7.0
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
fi
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
;;

View File

@ -115,9 +115,6 @@ RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
# cmake-3.28.0 from pip for onnxruntime
RUN python3 -mpip install cmake==3.28.0
ADD ./common/patch_libstdc.sh patch_libstdc.sh
RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
# build onnxruntime 1.21.0 from sources.
# it is not possible to build it from sources using pip,
# so just build it from upstream repository.

View File

@ -75,22 +75,18 @@ case ${image} in
DOCKERFILE_SUFFIX="_cuda_aarch64"
;;
manylinux2_28-builder:rocm*)
# we want the patch version of 7.0 instead
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
# we want the patch version of 6.4 instead
if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4"
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
TARGET=rocm_final
MANY_LINUX_VERSION="2_28"
DEVTOOLSET_VERSION="11"
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950, gfx115x conditionally starting in ROCm 7.0
# add gfx950 conditionally starting in ROCm 7.0
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
fi
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
;;

View File

@ -10,6 +10,11 @@ BAD_SSL = "https://self-signed.badssl.com"
print("Testing SSL certificate checking for Python:", sys.version)
if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4):
print("This version never checks SSL certs; skipping tests")
sys.exit(0)
EXC = OSError
print(f"Connecting to {GOOD_SSL} should work")

View File

@ -120,8 +120,9 @@ ninja==1.11.1.4
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.55.2, 0.60.0
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
#test that import: test_numba_integration.py
#For numba issue see https://github.com/pytorch/pytorch/issues/51511
#Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073
#numpy
@ -241,9 +242,10 @@ pygments==2.15.0
#Pinned versions: 14.1.0
#test that import:
scikit-image==0.22.0
scikit-image==0.19.3 ; python_version < "3.10"
scikit-image==0.22.0 ; python_version >= "3.10"
#Description: image processing routines
#Pinned versions: 0.22.0
#Pinned versions:
#test that import: test_nn.py
#scikit-learn
@ -339,7 +341,7 @@ onnx==1.18.0
#Pinned versions:
#test that import:
onnxscript==0.5.3
onnxscript==0.4.0
#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
#Pinned versions:
#test that import:

View File

@ -1,11 +1,15 @@
sphinx==7.2.6
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 7.2.6
#Pinned versions: 5.3.0
pytorch_sphinx_theme2==0.1.0
#Description: This is needed to generate PyTorch docs
#Pinned versions: 0.1.0
standard-imghdr==3.13.0; python_version >= "3.13"
#Description: This is needed by Sphinx, so it needs to be added here.
# The reasons are as follows:
# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
# something related to Docker setup. We can investigate this later.
@ -32,17 +36,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
#Description: This is used to generate PyTorch docs
#Pinned versions: 2.13.0
breathe==4.36.0
breathe==4.34.0
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 4.36.0
#Pinned versions: 4.34.0
exhale==0.3.7
exhale==0.2.3
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.3.7
#Pinned versions: 0.2.3
docutils==0.20
docutils==0.16
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.20
#Pinned versions: 0.16
bs4==0.0.1
#Description: This is used to generate PyTorch C++ docs
@ -52,13 +56,13 @@ IPython==8.12.0
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 8.12.0
myst-nb==1.3.0
myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch and torch.compile docs.
#Pinned versions: 1.3.0
#Pinned versions: 0.17.2
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
sphinx-copybutton==0.5.0
sphinx-design==0.6.1
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
myst-parser==4.0.1
myst-parser==0.18.1

View File

@ -103,11 +103,6 @@ COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
ARG INSTALL_MINGW
COPY ./common/install_mingw.sh install_mingw.sh
RUN if [ -n "${INSTALL_MINGW}" ]; then bash ./install_mingw.sh; fi
RUN rm install_mingw.sh
ARG TRITON
ARG TRITON_CPU

View File

@ -57,8 +57,8 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules
logger.info("Successfully cloned %s", target)
return r, commit
except GitCommandError:
logger.exception("Git operation failed")
except GitCommandError as e:
logger.error("Git operation failed: %s", e)
raise

View File

@ -143,7 +143,7 @@ def sample_vllm_test_library():
"pytest -v -s compile/test_decorator.py",
],
},
"vllm_language_model_test_extended_generation_28_failure_test": {
"vllm_languagde_model_test_extended_generation_28_failure_test": {
"title": "Language Models Test (Extended Generation) 2.8 release failure",
"id": "vllm_languagde_model_test_extended_generation_28_failure_test",
"package_install": [

View File

@ -63,7 +63,7 @@ class VllmBuildParameters:
# DOCKERFILE_PATH: path to Dockerfile used when use_local_dockerfile is True"
use_local_dockerfile: bool = env_bool_field("USE_LOCAL_DOCKERFILE", True)
dockerfile_path: Path = env_path_field(
"DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile"
"DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm"
)
# the cleaning script to remove torch dependencies from pip

View File

@ -5,7 +5,7 @@ DESIRED_ROCM ?= 7.0
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
PACKAGE_NAME = magma-rocm
# inherit this from underlying docker image, do not pass this env var to docker
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
@ -18,6 +18,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
.PHONY: all
all: magma-rocm70
all: magma-rocm64
all: magma-rocm63
.PHONY:
clean:
@ -33,3 +34,8 @@ magma-rocm70:
magma-rocm64: DESIRED_ROCM := 6.4
magma-rocm64:
$(DOCKER_RUN)
.PHONY: magma-rocm63
magma-rocm63: DESIRED_ROCM := 6.3
magma-rocm63:
$(DOCKER_RUN)

View File

@ -187,22 +187,19 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
export USE_CUFILE=0
else
DEPS_LIST+=(
"/usr/local/cuda/lib64/libnvToolsExt.so.1"
"/usr/local/cuda/lib64/libcublas.so.12"
"/usr/local/cuda/lib64/libcublasLt.so.12"
"/usr/local/cuda/lib64/libcudart.so.12"
"/usr/local/cuda/lib64/libnvrtc.so.12"
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12")
DEPS_SONAME+=(
"libnvToolsExt.so.1"
"libcublas.so.12"
"libcublasLt.so.12"
"libcudart.so.12"
"libnvrtc.so.12"
"libcupti.so.12")
if [[ $CUDA_VERSION != 12.9* ]]; then
DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1")
DEPS_SONAME+=("libnvToolsExt.so.1")
fi
fi
else
echo "Using nvidia libs from pypi."

View File

@ -233,9 +233,7 @@ if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
fi
if [[ "$BUILD_ENVIRONMENT" == *-full-debug* ]]; then
export CMAKE_BUILD_TYPE=Debug
elif [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
export CMAKE_BUILD_TYPE=RelWithAssert
fi
@ -301,11 +299,6 @@ else
python -m build --wheel --no-isolation
fi
pip_install_whl "$(echo dist/*.whl)"
if [[ "$BUILD_ENVIRONMENT" == *full-debug* ]]; then
# Regression test for https://github.com/pytorch/pytorch/issues/164297
# Torch should be importable and that's about it
pushd /; python -c "import torch;print(torch.__config__.show(), torch.randn(5) + 1.7)"; popd
fi
if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *vision* ]]; then
install_torchvision

View File

@ -67,7 +67,7 @@ fi
# wheels with cxx11-abi
echo "Checking that the gcc ABI is what we expect"
if [[ "$(uname)" != 'Darwin' ]]; then
if [[ "$(uname)" != 'Darwin' && "$(uname -m)" != "s390x" ]]; then
# We also check that there are cxx11 symbols in libtorch
#
echo "Checking that symbols in libtorch.so have the right gcc abi"

View File

@ -256,7 +256,7 @@ test_torchbench_smoketest() {
local device=mps
local dtypes=(undefined float16 bfloat16 notset)
local dtype=${dtypes[$1]}
local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
for backend in eager inductor; do
@ -319,7 +319,7 @@ test_aoti_torchbench_smoketest() {
local device=mps
local dtypes=(undefined float16 bfloat16 notset)
local dtype=${dtypes[$1]}
local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}"
local dtype_arg="--${dtype}"

View File

@ -102,18 +102,8 @@ if [ "$is_main_doc" = true ]; then
echo coverage output not found
exit 1
elif [ $undocumented -gt 0 ]; then
echo "======================================"
echo "ERROR: $undocumented undocumented objects found!"
echo "======================================"
echo ""
echo "Full coverage report:"
echo undocumented objects found:
cat build/coverage/python.txt
echo ""
echo "======================================"
echo "Undocumented modules/objects (lines after TOTAL):"
tail -n +$((lines - undocumented + 1)) build/coverage/python.txt
echo "======================================"
echo ""
echo "Make sure you've updated relevant .rsts in docs/source!"
echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
exit 1

View File

@ -34,14 +34,12 @@ fi
# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
if [ -n "$NUMBA_CUDA_DIR" ]; then
NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
pushd "$NUMBA_CUDA_DIR"
patch -p4 <"$NUMBA_PATCH"
popd
fi
NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
if [ -n "$NUMBA_CUDA_DIR" ]; then
NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
pushd "$NUMBA_CUDA_DIR"
patch -p4 <"$NUMBA_PATCH"
popd
fi
echo "Environment variables:"
@ -337,13 +335,13 @@ test_python() {
test_python_smoke() {
# Smoke tests for H100/B200
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
test_python_smoke_b200() {
# Targeted smoke tests for B200 - staged approach to avoid too many failures
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
@ -485,22 +483,6 @@ test_inductor_aoti() {
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
}
test_inductor_aoti_cross_compile_for_windows() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
# Set WINDOWS_CUDA_HOME environment variable
WINDOWS_CUDA_HOME="$(pwd)/win-torch-wheel-extracted"
export WINDOWS_CUDA_HOME
echo "WINDOWS_CUDA_HOME is set to: $WINDOWS_CUDA_HOME"
echo "Contents:"
ls -lah "$(pwd)/win-torch-wheel-extracted/lib/x64/" || true
python test/inductor/test_aoti_cross_compile_windows.py -k compile --package-dir "$TEST_REPORTS_DIR" --win-torch-lib-dir "$(pwd)/win-torch-wheel-extracted/torch/lib"
}
test_inductor_cpp_wrapper_shard() {
if [[ -z "$NUM_TEST_SHARDS" ]]; then
echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
@ -854,7 +836,7 @@ test_dynamo_benchmark() {
elif [[ "${suite}" == "timm_models" ]]; then
export TORCHBENCH_ONLY_MODELS="inception_v3"
elif [[ "${suite}" == "torchbench" ]]; then
export TORCHBENCH_ONLY_MODELS="BERT_pytorch"
export TORCHBENCH_ONLY_MODELS="hf_Bert"
fi
fi
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
@ -885,13 +867,13 @@ test_inductor_torchbench_smoketest_perf() {
mkdir -p "$TEST_REPORTS_DIR"
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only BERT_pytorch \
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
--output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
# The threshold value needs to be actively maintained to make this check useful
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4
# Check memory compression ratio for a few models
for test in BERT_pytorch yolov3; do
for test in hf_Albert timm_vision_transformer; do
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
--disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \
--only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv"
@ -902,7 +884,7 @@ test_inductor_torchbench_smoketest_perf() {
done
# Perform some "warm-start" runs for a few huggingface models.
for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
--only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
python benchmarks/dynamo/check_accuracy.py \
@ -916,7 +898,7 @@ test_inductor_set_cpu_affinity(){
export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD"
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
if [[ "$(uname -m)" != "aarch64" ]]; then
if [[ "${TEST_CONFIG}" != *aarch64* ]]; then
# Use Intel OpenMP for x86
IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so"
export LD_PRELOAD="$IOMP_LIB":"$LD_PRELOAD"
@ -930,7 +912,7 @@ test_inductor_set_cpu_affinity(){
cores=$((cpus / thread_per_core))
# Set number of cores to 16 on aarch64 for performance runs
if [[ "$(uname -m)" == "aarch64" && $cores -gt 16 ]]; then
if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then
cores=16
fi
export OMP_NUM_THREADS=$cores
@ -1631,7 +1613,6 @@ test_operator_benchmark() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
TEST_DIR=$(pwd)
ARCH=$(uname -m)
test_inductor_set_cpu_affinity
@ -1646,7 +1627,7 @@ test_operator_benchmark() {
pip_install pandas
python check_perf_csv.py \
--actual "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \
--expected "${ARCH}_expected_ci_operator_benchmark_eager_float32_cpu.csv"
--expected "expected_ci_operator_benchmark_eager_float32_cpu.csv"
}
test_operator_microbenchmark() {
@ -1683,7 +1664,7 @@ if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then
python -m pip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0
fi
python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py
elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]]; then
elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" != *perf_cpu_aarch64* ]]; then
test_linux_aarch64
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
test_forward_backward_compatibility
@ -1734,8 +1715,6 @@ elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
test_inductor_triton_cpu
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
test_inductor_micro_benchmark
elif [[ "${TEST_CONFIG}" == *aoti_cross_compile_for_windows* ]]; then
test_inductor_aoti_cross_compile_for_windows
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
install_torchvision
id=$((SHARD_NUMBER-1))

View File

@ -15,35 +15,37 @@ if errorlevel 1 exit /b 1
if not errorlevel 0 exit /b 1
cd %TMP_DIR_WIN%\build\torch\test
:: Enable delayed variable expansion to make the list
setlocal enabledelayedexpansion
set EXE_LIST=
for /r "." %%a in (*.exe) do (
if "%%~na" == "c10_intrusive_ptr_benchmark" (
@REM NB: This is not a gtest executable file, thus couldn't be handled by
@REM pytest-cpp and is excluded from test discovery by run_test
call "%%~fa"
call :libtorch_check "%%~na" "%%~fa"
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
) else (
if "%%~na" == "verify_api_visibility" (
@REM Skip verify_api_visibility as it is a compile-level test
) else (
set EXE_LIST=!EXE_LIST! cpp/%%~na
)
)
)
goto :eof
:libtorch_check
cd %CWD%
set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\test
:: Run python test\run_test.py on the list
set NO_TD=True && python test\run_test.py --cpp --verbose -i !EXE_LIST!
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
:: Skip verify_api_visibility as it a compile level test
if "%~1" == "verify_api_visibility" goto :eof
goto :eof
echo Running "%~2"
if "%~1" == "c10_intrusive_ptr_benchmark" (
:: NB: This is not a gtest executable file, thus couldn't be handled by pytest-cpp
call "%~2"
goto :eof
)
python test\run_test.py --cpp --verbose -i "cpp/%~1"
if errorlevel 1 (
echo %1 failed with exit code %errorlevel%
goto fail
)
if not errorlevel 0 (
echo %1 failed with exit code %errorlevel%
goto fail
)
:eof
exit /b 0

View File

@ -38,7 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
fi
# TODO: Move this to .ci/docker/requirements-ci.txt
python -m pip install "psutil==5.9.1" nvidia-ml-py "pytest-shard==0.1.2"
python -m pip install "psutil==5.9.1" "pynvml==11.4.1" "pytest-shard==0.1.2"
run_tests() {
# Run nvidia-smi if available

View File

@ -71,7 +71,14 @@ export PYTORCH_BUILD_NUMBER=1
# Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
TRITON_CONSTRAINT="platform_system == 'Linux'"
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries.
if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then
TRITON_CONSTRAINT="platform_system == 'Linux'"
fi
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
@ -163,13 +170,8 @@ if [[ "$(uname)" != Darwin ]]; then
MEMORY_LIMIT_MAX_JOBS=12
NUM_CPUS=$(( $(nproc) - 2 ))
if [[ "$(uname)" == Linux ]]; then
# Defaults here for **binary** linux builds so they can be changed in one place
export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}
else
# For other builds
export MAX_JOBS=${NUM_CPUS}
fi
# Defaults here for **binary** linux builds so they can be changed in one place
export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}
cat >>"$envfile" <<EOL
export MAX_JOBS="${MAX_JOBS}"

View File

@ -66,7 +66,6 @@ readability-simplify-subscript-expr,
readability-string-compare,
-readability-redundant-access-specifiers,
-readability-redundant-control-flow,
-readability-redundant-inline-specifier,
'
HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
WarningsAsErrors: '*'

View File

@ -7,12 +7,16 @@ max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
ignore =
E203,E305,E402,E501,E704,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
# these ignores are from flake8-bugbear; please fix!
B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910
B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907,B908,B910
# these ignores are from flake8-comprehensions; please fix!
C407,
# these ignores are from flake8-logging-format; please fix!
G100,G101,G200
# these ignores are from flake8-simplify. please fix or ignore with commented reason
SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
# SIM104 is already covered by pyupgrade ruff

View File

@ -8,7 +8,6 @@ assignees: ''
---
> NOTE: Remember to label this issue with "`ci: sev`"
> If you want autorevert to be disabled, keep the ci: disable-autorevert label
<!-- Add the `merge blocking` label to this PR to prevent PRs from being merged while this issue is open -->

View File

@ -1,7 +1,7 @@
---
name: "D❌\U0001F519 ISABLE AUTOREVERT"
name: DISABLE AUTOREVERT
about: Disables autorevert when open
title: "[DISABLE AUTOREVERT]"
title: "❌​\U0001F519 [DISABLE AUTOREVERT]"
labels: 'ci: disable-autorevert'
assignees: ''

View File

@ -65,7 +65,7 @@ runs:
cd .ci/lumen_cli
python3 -m pip install -e .
)
MAX_JOBS="$(nproc --ignore=10)"
MAX_JOBS="$(nproc --ignore=6)"
export MAX_JOBS
# Split the comma-separated list and build each target

View File

@ -274,6 +274,8 @@ runs:
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
# Propagate download.pytorch.org IP to container
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"

View File

@ -28,10 +28,6 @@ runs:
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: Print GPU info (if present)
shell: bash
run: if [ -f /usr/bin/nvidia-smi ]; then nvidia-smi; fi
- name: Check if in a container runner
shell: bash
id: check_container_runner
@ -86,6 +82,37 @@ runs:
# Prune all of the docker images
docker system prune -af
- name: Manually resolve download.pytorch.org
shell: bash
continue-on-error: true
run: |
set +e
set -x
PT_DOMAIN=download.pytorch.org
# TODO: Flaky access to download.pytorch.org https://github.com/pytorch/pytorch/issues/100400,
# cleaning this up once the issue is fixed. There are more than one resolved IP here, the last
# one is returned at random
RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1)
if [ -z "${RESOLVED_IP}" ]; then
echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..."
RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1)
if [ -z "${RESOLVED_IP}" ]; then
echo "Couldn't resolve ${PT_DOMAIN}, exiting..."
exit 1
fi
fi
if grep -r "${PT_DOMAIN}" /etc/hosts; then
# Clean up any old records first
sudo sed -i "/${PT_DOMAIN}/d" /etc/hosts
fi
echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
cat /etc/hosts
- name: Check that the docker daemon is running
shell: bash
continue-on-error: true

View File

@ -111,16 +111,3 @@ runs:
# This video group ID maps to subgid 1 inside the docker image due to the /etc/subgid entries.
# The group name corresponding to group ID 1 can change depending on the OS, so both are necessary.
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd $DEVICE_FLAG --group-add video --group-add $render_gid --group-add daemon --group-add bin --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --network=host" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Login to Amazon ECR
id: login-ecr
continue-on-error: true
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1

View File

@ -33,6 +33,10 @@ runs:
)
echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" && "${GPU_ARCH_TYPE}" != "xpu" ]]; then
# Propagate download.pytorch.org IP to container. This is only needed on Linux non aarch64 runner
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts"
fi
docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
# Generate test script

View File

@ -1 +1 @@
69bbe7363897764f9e758d851cd0340147d27f94
87ff22e49ed0e92576c4935ccb8c143daac4a3cd

View File

@ -1 +1 @@
faffd5cf673615583da6517275e361cb3dbc77e6
966da7e46f65d6d49df3e31214470a4fe5cc8e66

View File

@ -1 +1 @@
e5192819208c4d68194844b7dfafbc00020d0dea
0ad9951c416d33c5da4f7a504fb162cbe62386f5

View File

@ -1 +1 @@
0fa6e3129e61143224663e1ec67980d12b7ec4eb
0fc62aa26a30ed7ca419d285f285cb5ba02c4394

View File

@ -1,41 +1,59 @@
# TODO(elainwy): remove this file after the torch nightly dockerfile is in sync in vllm repo
# The vLLM Dockerfile is used to construct vLLM image against torch nightly and torch main that can be directly used for testing
ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION=3.12
# BUILD_BASE_IMAGE: used to setup python build xformers, and vllm wheels, It can be replaced with a different base image from local machine,
# by default, it uses the torch-nightly-base stage from this docker image
ARG BUILD_BASE_IMAGE=torch-nightly-base
# FINAL_BASE_IMAGE: used to set up vllm-instaled environment and build flashinfer,
# by default, it uses devel-ubuntu22.04 official image.
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
# The logic is copied from https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile
ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"
#################### TORCH NIGHTLY BASE IMAGE ####################
# A base image for building vLLM with devel ubuntu 22.04, this is mainly used to build vllm in vllm builtkite ci
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG GET_PIP_URL
# Install system dependencies and uv, then create Python virtual environment
# Install Python and other dependencies
RUN apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
&& ln -s /opt/venv/bin/python3 /usr/bin/python3 \
&& ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \
&& ln -s /opt/venv/bin/pip /usr/bin/pip \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim \
&& add-apt-repository -y ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF
# Ensure gcc >= 10 to avoid CUTLASS issues (bug 92519)
RUN current_gcc_version=$(gcc -dumpversion | cut -f1 -d.) && \
if command -v apt-get >/dev/null; then \
if [ "$current_gcc_version" -lt 10 ]; then \
echo "GCC version is $current_gcc_version, installing gcc-10..."; \
apt-get update \
&& apt-get install -y gcc-10 g++-10 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 \
&& update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \
else \
echo "GCC version is $current_gcc_version, no need to install gcc-10."; \
fi \
fi \
&& gcc --version && g++ --version
# Install uv for faster pip installs
# install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv==0.8.4
@ -43,32 +61,36 @@ ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
#################### TORCH NIGHTLY BASE IMAGE ####################
#################### BASE BUILD IMAGE ####################
# A base image for building vLLM with torch nightly or torch wheels
# prepare basic build environment
FROM ${BUILD_BASE_IMAGE} AS base
USER root
ARG CUDA_VERSION
ARG PYTHON_VERSION
# Only work with PyTorch manylinux builder
# TODO (huydhn): Only work with PyTorch manylinux builder
ENV PATH="/opt/python/cp312-cp312/bin:${PATH}"
# Install some system dependencies and double check python version
RUN if command -v apt-get >/dev/null; then \
apt-get update -y \
&& apt-get install -y ccache software-properties-common git wget sudo vim; \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim; \
else \
dnf install -y git wget sudo; \
dnf install -y git curl wget sudo; \
fi \
&& python3 --version && python3 -m pip --version
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv==0.8.4
if ! python3 -m uv --version >/dev/null 2>&1; then \
python3 -m pip install uv==0.8.4; \
fi
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
@ -76,15 +98,15 @@ ENV UV_LINK_MODE=copy
WORKDIR /workspace
# Install build and runtime dependencies
# install build and runtime dependencies
COPY requirements/common.txt requirements/common.txt
COPY use_existing_torch.py use_existing_torch.py
COPY pyproject.toml pyproject.toml
# Install build and runtime dependencies without stable torch version
# install build and runtime dependencies without stable torch version
RUN python3 use_existing_torch.py
# Default mount file as placeholder, this just avoid the mount error
# default mount file as placeholder, this just avoid the mount error
# change to a different vllm folder if this does not exist anymore
ARG TORCH_WHEELS_PATH="./requirements"
ARG PINNED_TORCH_VERSION
@ -116,36 +138,56 @@ RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/common.txt
# Must put before installing xformers, so it can install the correct version of xfomrers.
ARG xformers_cuda_arch_list='7.5;8.0+PTX;9.0a'
ENV TORCH_CUDA_ARCH_LIST=${xformers_cuda_arch_list}
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
git clone https://github.com/facebookresearch/xformers.git
RUN echo ${TORCH_CUDA_ARCH_LIST}
RUN echo ${MAX_JOBS}
RUN pip freeze | grep -E 'ninja'
pushd xformers
git checkout v0.0.32.post2
git submodule update --init --recursive
python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose
popd
# Build xformers with cuda and torch nightly/wheel
# following official xformers guidance: https://github.com/facebookresearch/xformers#build
# sha for https://github.com/facebookresearch/xformers/tree/v0.0.32.post2
ARG XFORMERS_COMMIT=5d4b92a5e5a9c6c6d4878283f47d82e17995b468
ENV CCACHE_DIR=/root/.cache/ccache
rm -rf xformers
BASH
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
echo 'git clone xformers...' \
&& git clone https://github.com/facebookresearch/xformers.git --recursive \
&& cd xformers \
&& git checkout ${XFORMERS_COMMIT} \
&& git submodule update --init --recursive \
&& echo 'finish git clone xformers...' \
&& rm -rf build \
&& python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
&& cd .. \
&& rm -rf xformers
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system xformers-dist/*.whl
uv pip install --system xformers-dist/*.whl --verbose
# Build can take a long time, and the torch nightly version fetched from url can be different in next docker stage.
# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
RUN cat torch_build_versions.txt
RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio'
#################### BASE BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
# Image used to build vllm wheel
FROM base AS build
ARG TARGETPLATFORM
COPY . .
RUN python3 use_existing_torch.py
RUN --mount=type=cache,target=/root/.cache/uv \
@ -155,17 +197,20 @@ ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
# Max jobs used by Ninja to build extensions
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=8
ARG nvcc_threads=4
ENV NVCC_THREADS=$nvcc_threads
ARG torch_cuda_arch_list='8.0 8.6 8.9 9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# Use sccache to speed up compilation
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
@ -190,9 +235,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
&& sccache --show-stats; \
fi
ARG torch_cuda_arch_list='8.0 8.6 8.9 9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
ARG vllm_target_device="cuda"
ENV VLLM_TARGET_DEVICE=${vllm_target_device}
ENV CCACHE_DIR=/root/.cache/ccache
@ -206,10 +248,17 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \
fi
RUN echo "[INFO] Listing current directory:" && \
ls -al && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
#################### WHEEL BUILD IMAGE ####################
################### VLLM INSTALLED IMAGE ####################
# Setup clean environment for vLLM for test and api server using ubuntu22.04 with AOT flashinfer
FROM ${FINAL_BASE_IMAGE} AS vllm-base
USER root
@ -217,7 +266,7 @@ ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG GET_PIP_URL
# Only work with PyTorch manylinux builder
# TODO (huydhn): Only work with PyTorch manylinux builder
ENV PATH="/opt/python/cp312-cp312/bin:${PATH}"
# prepare for environment starts
@ -226,19 +275,20 @@ WORKDIR /workspace
# Install Python and other dependencies
RUN if command -v apt-get >/dev/null; then \
apt-get update -y \
&& apt-get install -y ccache software-properties-common git sudo vim python3-pip; \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim \
&& add-apt-repository -y ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
else \
dnf install -y git wget sudo; \
dnf install -y git curl wget sudo; \
fi \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
&& ln -s /opt/venv/bin/python3 /usr/bin/python3 \
&& ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \
&& ln -s /opt/venv/bin/pip /usr/bin/pip \
&& python3 --version && python3 -m pip --version
# Get the torch versions, and whls used in previous stage
# Get the torch versions, and whls used in previous stagtes for consistency
COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt
COPY --from=base /workspace/xformers-dist /wheels/xformers
COPY --from=build /workspace/vllm-dist /wheels/vllm
@ -247,29 +297,33 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv==0.8.4
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
# Install build and runtime dependencies, this is needed for flashinfer install
COPY requirements/build.txt requirements/build.txt
COPY use_existing_torch.py use_existing_torch.py
RUN python3 use_existing_torch.py
RUN cat requirements/build.txt
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \
python3 -m pip install uv==0.8.4; \
fi
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt
# Default mount file as placeholder, this just avoid the mount error
ARG TORCH_WHEELS_PATH="./requirements"
# Install torch, torchaudio and torchvision. If TORCH_WHEELS_PATH is default
# to ./requirements, it will pull the nightly versions using pip. Otherwise,
# it will use the local wheels from TORCH_WHEELS_PATH
# Install torch, torchaudio and torchvision
# if TORCH_WHEELS_PATH is default "./requirements", it will pull the nightly versions using pip using torch_build_versions.txt
# otherwise, it will use the whls from TORCH_WHEELS_PATH from the host machine
RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
--mount=type=cache,target=/root/.cache/uv \
if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
@ -290,14 +344,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Install xformers wheel from previous stage
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system /wheels/xformers/*.whl --verbose
# Build FlashInfer from source
# Build flashinfer from source.
ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0'
# install package for build flashinfer
# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
RUN pip freeze | grep -E 'setuptools|packaging|build'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Build flashinfer for torch nightly from source around 10 mins
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
ARG FLASHINFER_GIT_REF="v0.2.14.post1"
RUN --mount=type=cache,target=/root/.cache/uv \
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \
@ -309,7 +367,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
&& cd .. \
&& rm -rf flashinfer
# Install FlashInfer
# install flashinfer python
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system wheels/flashinfer/*.whl --verbose
@ -319,6 +377,49 @@ RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio\|^xformers\|^vllm
################### VLLM INSTALLED IMAGE ####################
#################### UNITTEST IMAGE #############################
FROM vllm-base as test
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
COPY tests/ tests/
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
COPY requirements/common.txt requirements/common.txt
COPY use_existing_torch.py use_existing_torch.py
COPY pyproject.toml pyproject.toml
# Install build and runtime dependencies without stable torch version
COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt
RUN python3 use_existing_torch.py
# install packages
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/common.txt
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/nightly_torch_test.txt
# Logging to confirm the torch versions
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
# Logging to confirm all the packages are installed
RUN pip freeze
#################### UNITTEST IMAGE #############################
#################### EXPORT STAGE ####################
FROM scratch as export-wheels

29
.github/labeler.yml vendored
View File

@ -133,32 +133,3 @@
"ciflow/vllm":
- .github/ci_commit_pins/vllm.txt
"ciflow/b200":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm
"ciflow/h100":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm
"ciflow/rocm":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm

View File

@ -3,7 +3,6 @@ ciflow_tracking_issue: 64124
ciflow_push_tags:
- ciflow/b200
- ciflow/b200-symm-mem
- ciflow/b200-distributed
- ciflow/binaries
- ciflow/binaries_libtorch
- ciflow/binaries_wheel
@ -16,8 +15,7 @@ ciflow_push_tags:
- ciflow/inductor-micro-benchmark
- ciflow/inductor-micro-benchmark-cpu-x86
- ciflow/inductor-perf-compare
- ciflow/inductor-perf-test-nightly-rocm-mi300
- ciflow/inductor-perf-test-nightly-rocm-mi355
- ciflow/inductor-perf-test-nightly-rocm
- ciflow/inductor-perf-test-nightly-x86-zen
- ciflow/inductor-periodic
- ciflow/inductor-rocm
@ -32,8 +30,6 @@ ciflow_push_tags:
- ciflow/riscv64
- ciflow/rocm
- ciflow/rocm-mi300
- ciflow/rocm-mi355
- ciflow/rocm-navi31
- ciflow/s390
- ciflow/slow
- ciflow/torchbench

Binary file not shown.

View File

@ -512,8 +512,6 @@ def perform_misc_tasks(
"keep-going",
branch == MAIN_BRANCH
or bool(tag and re.match(r"^trunk/[a-f0-9]{40}$", tag))
# Pattern for tags created via manual run on HUD
or bool(tag and re.match(r"^ciflow/[^/]+/[a-f0-9]{40}$", tag))
or check_for_setting(labels, pr_body, "keep-going"),
)
set_output(

View File

@ -16,18 +16,16 @@ from typing import Optional
# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
CUDA_ARCHES = ["12.6", "12.8", "13.0"]
CUDA_STABLE = "12.8"
CUDA_ARCHES_FULL_VERSION = {
"12.6": "12.6.3",
"12.8": "12.8.1",
"12.9": "12.9.1",
"13.0": "13.0.0",
}
CUDA_ARCHES_CUDNN_VERSION = {
"12.6": "9",
"12.8": "9",
"12.9": "9",
"13.0": "9",
}
@ -40,7 +38,7 @@ CPU_AARCH64_ARCH = ["cpu-aarch64"]
CPU_S390X_ARCH = ["cpu-s390x"]
CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "12.9-aarch64", "13.0-aarch64"]
CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"]
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
@ -78,23 +76,6 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
),
"12.9": (
"nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | "
"nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | "
"nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
"nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | "
"nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | "
"nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | "
"nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
),
"13.0": (
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
"nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
@ -241,11 +222,7 @@ def generate_libtorch_matrix(
arches += CUDA_ARCHES
arches += ROCM_ARCHES
elif os == "windows":
# TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up
# in 2.10
windows_cuda_arches = CUDA_ARCHES.copy()
windows_cuda_arches.remove("12.9")
arches += windows_cuda_arches
arches += CUDA_ARCHES
if libtorch_variants is None:
libtorch_variants = [
"shared-with-deps",
@ -309,11 +286,7 @@ def generate_wheels_matrix(
if os == "linux":
arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
elif os == "windows":
# TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up
# in 2.10
windows_cuda_arches = CUDA_ARCHES.copy()
windows_cuda_arches.remove("12.9")
arches += windows_cuda_arches + XPU_ARCHES
arches += CUDA_ARCHES + XPU_ARCHES
elif os == "linux-aarch64":
# Separate new if as the CPU type is different and
# uses different build/test scripts
@ -349,7 +322,7 @@ def generate_wheels_matrix(
# cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
if (
arch_version in ["13.0", "12.9", "12.8", "12.6"]
arch_version in ["13.0", "12.8", "12.6"]
and os == "linux"
or arch_version in CUDA_AARCH64_ARCHES
):
@ -413,6 +386,5 @@ def generate_wheels_matrix(
validate_nccl_dep_consistency("13.0")
validate_nccl_dep_consistency("12.9")
validate_nccl_dep_consistency("12.8")
validate_nccl_dep_consistency("12.6")

View File

@ -18,7 +18,6 @@ class GitHubComment:
body_text: str
created_at: str
author_login: str
author_url: Optional[str]
author_association: str
editor_login: Optional[str]
database_id: int

Binary file not shown.

View File

@ -38,7 +38,6 @@ def mock_get_comments() -> list[GitHubComment]:
body_text="mock_body_text",
created_at="",
author_login="",
author_url=None,
author_association="",
editor_login=None,
database_id=1,
@ -49,7 +48,6 @@ def mock_get_comments() -> list[GitHubComment]:
body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""),
created_at="",
author_login=BOT_AUTHORS[1],
author_url=None,
author_association="",
editor_login=None,
database_id=2,

View File

@ -32,7 +32,6 @@ from trymerge import (
main as trymerge_main,
MandatoryChecksMissingError,
MergeRule,
PostCommentError,
RE_GHSTACK_DESC,
read_merge_rules,
remove_job_name_suffix,
@ -589,23 +588,6 @@ class TestTryMerge(TestCase):
self.assertEqual(mock_merge_base, pr.get_merge_base())
mocked_gh_fetch_merge_base.assert_called_once()
def test_app_can_revert(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 164660)
repo = DummyGitRepo()
app_comment_id, impostor_comment_id = 3375785595, 3377647892
# Check that app can revert
self.assertIsNotNone(validate_revert(repo, pr, comment_id=app_comment_id))
# But impostor can not
self.assertRaises(
PostCommentError,
lambda: validate_revert(repo, pr, comment_id=impostor_comment_id),
)
# Despite it's name being the name of the bot
self.assertEqual(
pr.get_comment_by_id(impostor_comment_id).author_login,
"pytorch-auto-revert",
)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")

View File

@ -234,7 +234,6 @@ query ($owner: String!, $name: String!, $number: Int!) {
createdAt
author {
login
url
}
authorAssociation
editor {
@ -1092,9 +1091,8 @@ class GitHubPR:
editor = node["editor"]
return GitHubComment(
body_text=node["bodyText"],
created_at=node.get("createdAt", ""),
created_at=node["createdAt"] if "createdAt" in node else "",
author_login=node["author"]["login"],
author_url=node["author"].get("url", None),
author_association=node["authorAssociation"],
editor_login=editor["login"] if editor else None,
database_id=node["databaseId"],
@ -2031,17 +2029,16 @@ def validate_revert(
# For some reason, one can not be a member of private repo, only CONTRIBUTOR
if pr.is_base_repo_private():
allowed_reverters.append("CONTRIBUTOR")
# Special case the pytorch-auto-revert app, whose does not have association
# But should be able to issue revert command
if comment.author_url == "https://github.com/apps/pytorch-auto-revert":
allowed_reverters.append("NONE")
if author_association not in allowed_reverters:
raise PostCommentError(
f"Will not revert as @{author_login} is not one of "
f"[{', '.join(allowed_reverters)}], but instead is {author_association}."
)
# Raises exception if matching rule is not found, but ignores all status checks
find_matching_merge_rule(
pr, repo, skip_mandatory_checks=True, skip_internal_checks=True
)
commit_sha = get_pr_commit_sha(repo, pr)
return (author_login, commit_sha)

View File

@ -177,9 +177,6 @@ jobs:
runs-on: linux.rocm.gpu.mi250
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config) }}
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm

View File

@ -26,8 +26,9 @@ name: !{{ build_environment }}
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "!{{ py_ver.strip('t') + ('.4' if '3.14' not in py_ver else '.0') }}"
python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}"
freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
{%- endmacro %}

View File

@ -79,9 +79,9 @@ jobs:
runs-on: "windows-11-arm64-preview"
{%- else %}
{%- if branches == "nightly" %}
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
{%- else %}
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge.nonephemeral"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
{%- endif %}
{%- endif %}
timeout-minutes: !{{ common.timeout_minutes_windows_binary }}

View File

@ -72,7 +72,7 @@ jobs:
# Let's try to figure out how this can be improved
timeout-minutes: 360
- docs_type: python
runner: ${{ inputs.runner_prefix }}linux.c7i.2xlarge
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 30m to finish python docs unless there are issues
timeout-minutes: 30
# Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)

View File

@ -37,7 +37,7 @@ on:
runner:
required: false
type: string
default: "linux.c7i.2xlarge"
default: "linux.2xlarge"
description: |
Label of the runner this job should run on.
test-matrix:

View File

@ -224,46 +224,6 @@ jobs:
continue-on-error: true
uses: ./.github/actions/download-td-artifacts
- name: Download Windows torch wheel for cross-compilation
if: matrix.win_torch_wheel_artifact != ''
uses: seemethere/download-artifact-s3@1da556a7aa0a088e3153970611f6c432d58e80e6 # v4.2.0
with:
name: ${{ matrix.win_torch_wheel_artifact }}
path: win-torch-wheel
- name: Extract Windows wheel and setup CUDA libraries
if: matrix.win_torch_wheel_artifact != ''
shell: bash
run: |
set -x
# Find the wheel file
WHEEL_FILE=$(find win-torch-wheel -name "*.whl" -type f | head -n 1)
if [ -z "$WHEEL_FILE" ]; then
echo "Error: No wheel file found in win-torch-wheel directory"
exit 1
fi
echo "Found wheel file: $WHEEL_FILE"
# Unzip the wheel file
unzip -q "$WHEEL_FILE" -d win-torch-wheel-extracted
echo "Extracted wheel contents"
# Setup CUDA libraries (cuda.lib and cudart.lib) directory
mkdir -p win-torch-wheel-extracted/lib/x64
if [ -f "win-torch-wheel/cuda.lib" ]; then
mv win-torch-wheel/cuda.lib win-torch-wheel-extracted/lib/x64/
echo "Moved cuda.lib to win-torch-wheel-extracted/lib/x64/"
fi
if [ -f "win-torch-wheel/cudart.lib" ]; then
mv win-torch-wheel/cudart.lib win-torch-wheel-extracted/lib/x64/
echo "Moved cudart.lib to win-torch-wheel-extracted/lib/x64/"
fi
# Verify CUDA libraries are present
echo "CUDA libraries:"
ls -la win-torch-wheel-extracted/lib/x64/ || echo "No CUDA libraries found"
- name: Parse ref
id: parse-ref
run: .github/scripts/parse_ref.py
@ -429,6 +389,8 @@ jobs:
"${DOCKER_IMAGE}" \
${DOCKER_SHELL_CMD}
)
# Propagate download.pytorch.org IP to container
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then

View File

@ -102,6 +102,19 @@ jobs:
exit 1
fi
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Login to Amazon ECR
id: login-ecr
continue-on-error: true
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main

View File

@ -168,31 +168,6 @@ jobs:
run: |
.ci/pytorch/win-build.sh
# Collect Windows torch libs and CUDA libs for cross-compilation
- name: Collect Windows CUDA libs for cross-compilation
if: steps.build.outcome != 'skipped' && inputs.cuda-version != 'cpu'
shell: bash
run: |
set -ex
# Create directory structure if does not exist
mkdir -p /c/${{ github.run_id }}/build-results
# Copy CUDA libs
CUDA_PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${{ inputs.cuda-version }}"
if [ -f "${CUDA_PATH}/lib/x64/cuda.lib" ]; then
cp "${CUDA_PATH}/lib/x64/cuda.lib" /c/${{ github.run_id }}/build-results/
fi
if [ -f "${CUDA_PATH}/lib/x64/cudart.lib" ]; then
cp "${CUDA_PATH}/lib/x64/cudart.lib" /c/${{ github.run_id }}/build-results/
fi
# List collected files
echo "Collected CUDA libs:"
ls -lah /c/${{ github.run_id }}/build-results/*.lib
# Upload to github so that people can click and download artifacts
- name: Upload artifacts to s3
if: steps.build.outcome != 'skipped'

View File

@ -1,62 +0,0 @@
name: CI for distributed tests on B200
on:
pull_request:
paths:
- .github/workflows/b200-distributed.yml
workflow_dispatch:
push:
tags:
- ciflow/b200-distributed/*
schedule:
- cron: 46 8 * * * # about 1:46am PDT
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200:
name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed-b200
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 2, runner: "linux.dgx.b200.8" },
{ config: "distributed", shard: 2, num_shards: 2, runner: "linux.dgx.b200.8" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-test-distributed-b200:
name: linux-jammy-cuda12.8-py3.10-gcc11-test-b200
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200
with:
timeout-minutes: 1200
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit

View File

@ -46,12 +46,10 @@ jobs:
fail-fast: false
matrix:
include: [
{ name: "manylinux2_28-builder", tag: "cuda13.0", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cuda13.0", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cuda12.8", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cuda12.9", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cuda12.6", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda13.0", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },

View File

@ -27,8 +27,9 @@ jobs:
fail-fast: false
matrix:
python-version: [ '3.12' ]
# TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129', 'cu130' ]
device: [ 'cu128', 'cu129' ]
include:
- platform: manylinux_2_28_x86_64
device: cu128
@ -38,10 +39,6 @@ jobs:
device: cu129
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_x86_64
device: cu130
manylinux-image: 'pytorch/manylinux2_28-builder:cuda13.0'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu128
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8'
@ -50,11 +47,6 @@ jobs:
device: cu129
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9'
runner: linux.arm64.r7g.12xlarge.memory
exclude:
# TODO (huydhn): Add cu130 aarch64 once PyTorch is on 2.9+ and
# xformers is update to support 13.0
- platform: manylinux_2_28_aarch64
device: cu130
name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
runs-on: ${{ matrix.runner }}
timeout-minutes: 480
@ -177,12 +169,7 @@ jobs:
fail-fast: false
matrix:
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129', 'cu130' ]
exclude:
# TODO (huydhn): Add cu130 aarch64 once PyTorch is on 2.9+ and
# xformers is update to support 13.0
- platform: manylinux_2_28_aarch64
device: cu130
device: [ 'cu128', 'cu129' ]
env:
PLATFORM: ${{ matrix.platform }}
BUILD_DEVICE: ${{ matrix.device }}

View File

@ -204,52 +204,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -453,52 +407,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -702,52 +610,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -951,52 +813,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -1200,52 +1016,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -1449,52 +1219,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -1698,52 +1422,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml

View File

@ -248,74 +248,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda12_9-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: libtorch-cuda12_9-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-cuda12_9-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda12_9-shared-with-deps-release-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-cuda12_9-shared-with-deps-release
build_environment: linux-binary-libtorch
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda12_9-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-cuda12_9-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda13_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -426,9 +358,6 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@ -544,9 +473,6 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm

View File

@ -241,72 +241,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_10-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -413,9 +347,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@ -528,9 +459,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
DESIRED_PYTHON: "3.10"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@ -907,72 +835,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_11-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -1079,9 +941,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.11"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@ -1194,9 +1053,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
DESIRED_PYTHON: "3.11"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@ -1573,72 +1429,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_12-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -1745,9 +1535,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.12"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@ -1860,9 +1647,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
DESIRED_PYTHON: "3.12"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@ -2239,72 +2023,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_13-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -2411,9 +2129,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.13"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@ -2526,9 +2241,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
DESIRED_PYTHON: "3.13"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@ -2905,72 +2617,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_13t-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -3077,9 +2723,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.13t"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@ -3192,9 +2835,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
DESIRED_PYTHON: "3.13t"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@ -3571,72 +3211,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_14-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -3743,9 +3317,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.14"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@ -3858,9 +3429,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
DESIRED_PYTHON: "3.14"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@ -4237,72 +3805,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_14t-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -4409,9 +3911,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.14t"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@ -4524,9 +4023,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
DESIRED_PYTHON: "3.14t"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm

View File

@ -63,6 +63,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false

View File

@ -59,6 +59,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false
@ -168,6 +169,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.11.4"
freethreaded: false
@ -277,6 +279,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.12.4"
freethreaded: false
@ -386,6 +389,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: false
@ -495,6 +499,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: true
@ -604,8 +609,9 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.14.0"
python-version: "3.14.0-rc.2"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
@ -713,8 +719,9 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.14.0"
python-version: "3.14.0-rc.2"
freethreaded: true
- name: Checkout PyTorch
uses: actions/checkout@v4

View File

@ -44,7 +44,7 @@ jobs:
libtorch-cpu-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -291,7 +291,7 @@ jobs:
libtorch-cuda12_6-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -541,7 +541,7 @@ jobs:
libtorch-cuda12_8-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -791,7 +791,7 @@ jobs:
libtorch-cuda13_0-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch

View File

@ -44,7 +44,7 @@ jobs:
libtorch-cpu-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -291,7 +291,7 @@ jobs:
libtorch-cuda12_6-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -541,7 +541,7 @@ jobs:
libtorch-cuda12_8-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -791,7 +791,7 @@ jobs:
libtorch-cuda13_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch

View File

@ -44,7 +44,7 @@ jobs:
wheel-py3_10-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -279,7 +279,7 @@ jobs:
wheel-py3_10-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -517,7 +517,7 @@ jobs:
wheel-py3_10-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -755,7 +755,7 @@ jobs:
wheel-py3_10-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -993,7 +993,7 @@ jobs:
wheel-py3_10-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -1229,7 +1229,7 @@ jobs:
wheel-py3_11-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -1464,7 +1464,7 @@ jobs:
wheel-py3_11-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -1702,7 +1702,7 @@ jobs:
wheel-py3_11-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -1940,7 +1940,7 @@ jobs:
wheel-py3_11-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -2178,7 +2178,7 @@ jobs:
wheel-py3_11-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -2414,7 +2414,7 @@ jobs:
wheel-py3_12-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -2649,7 +2649,7 @@ jobs:
wheel-py3_12-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -2887,7 +2887,7 @@ jobs:
wheel-py3_12-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -3125,7 +3125,7 @@ jobs:
wheel-py3_12-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -3363,7 +3363,7 @@ jobs:
wheel-py3_12-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -3599,7 +3599,7 @@ jobs:
wheel-py3_13-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -3834,7 +3834,7 @@ jobs:
wheel-py3_13-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -4072,7 +4072,7 @@ jobs:
wheel-py3_13-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -4310,7 +4310,7 @@ jobs:
wheel-py3_13-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -4548,7 +4548,7 @@ jobs:
wheel-py3_13-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -4784,7 +4784,7 @@ jobs:
wheel-py3_13t-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -5019,7 +5019,7 @@ jobs:
wheel-py3_13t-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -5257,7 +5257,7 @@ jobs:
wheel-py3_13t-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -5495,7 +5495,7 @@ jobs:
wheel-py3_13t-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -5733,7 +5733,7 @@ jobs:
wheel-py3_13t-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -5969,7 +5969,7 @@ jobs:
wheel-py3_14-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -6204,7 +6204,7 @@ jobs:
wheel-py3_14-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -6442,7 +6442,7 @@ jobs:
wheel-py3_14-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -6680,7 +6680,7 @@ jobs:
wheel-py3_14-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -6918,7 +6918,7 @@ jobs:
wheel-py3_14-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -7154,7 +7154,7 @@ jobs:
wheel-py3_14t-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -7389,7 +7389,7 @@ jobs:
wheel-py3_14t-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -7627,7 +7627,7 @@ jobs:
wheel-py3_14t-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -7865,7 +7865,7 @@ jobs:
wheel-py3_14t-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -8103,7 +8103,7 @@ jobs:
wheel-py3_14t-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch

View File

@ -37,7 +37,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: "linux.c7i.12xlarge"
runner: "linux.12xlarge"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '9.0'

View File

@ -2,7 +2,7 @@ name: inductor-perf-nightly-h100
on:
schedule:
- cron: 15 0 * * 1-6
- cron: 15 0,12 * * 1-6
- cron: 0 7 * * 0
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
@ -130,7 +130,7 @@ jobs:
name: test-periodically
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '15 0 * * 1-6'
if: github.event.schedule == '15 0,12 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true

View File

@ -63,7 +63,6 @@ jobs:
# Same as the build job
python-version: 3.12.7
test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }}
timeout-minutes: 300
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4

View File

@ -1,132 +0,0 @@
name: inductor-perf-nightly-rocm-mi300
on:
push:
tags:
- ciflow/inductor-perf-test-nightly-rocm-mi300/*
schedule:
- cron: 15 0 * * *
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
inputs:
training:
description: Run training (on by default)?
required: false
type: boolean
default: true
inference:
description: Run inference (on by default)?
required: false
type: boolean
default: true
default:
description: Run inductor_default?
required: false
type: boolean
default: false
dynamic:
description: Run inductor_dynamic_shapes?
required: false
type: boolean
default: false
cppwrapper:
description: Run inductor_cpp_wrapper?
required: false
type: boolean
default: false
cudagraphs:
description: Run inductor_cudagraphs?
required: false
type: boolean
default: true
freezing_cudagraphs:
description: Run inductor_cudagraphs with freezing for inference?
required: false
type: boolean
default: false
aotinductor:
description: Run aot_inductor for inference?
required: false
type: boolean
default: false
maxautotune:
description: Run inductor_max_autotune?
required: false
type: boolean
default: false
benchmark_configs:
description: The list of configs used the benchmark
required: false
type: string
default: inductor_huggingface_perf_rocm_mi300,inductor_timm_perf_rocm_mi300,inductor_torchbench_perf_rocm_mi300
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
linux-jammy-rocm-py3_10-inductor-benchmark-build:
if: github.repository_owner == 'pytorch'
name: rocm-py3_10-inductor-benchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm_mi300", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm_mi300", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm_mi300", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm_mi300", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm_mi300", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 1, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 2, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 3, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 4, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 5, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 6, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 7, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 1, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 2, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 3, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 4, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 5, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 6, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 7, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 8, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 9, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-inductor-benchmark-test:
permissions:
id-token: write
contents: read
name: rocm-py3_10-inductor-benchmark-test
uses: ./.github/workflows/_rocm-test.yml
needs: linux-jammy-rocm-py3_10-inductor-benchmark-build
with:
build-environment: linux-jammy-rocm-py3_10
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.test-matrix }}
timeout-minutes: 720
# Disable monitor in perf tests for more investigation
disable-monitor: true
monitor-log-interval: 10
monitor-data-collect-interval: 2
secrets: inherit

View File

@ -1,11 +1,11 @@
name: inductor-perf-nightly-rocm-mi355
name: inductor-perf-nightly-rocm
on:
push:
tags:
- ciflow/inductor-perf-test-nightly-rocm-mi355/*
- ciflow/inductor-perf-test-nightly-rocm/*
schedule:
- cron: 15 0 * * *
- cron: 0 7 * * 0,3
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
@ -59,7 +59,7 @@ on:
description: The list of configs used the benchmark
required: false
type: string
default: inductor_huggingface_perf_rocm_mi355,inductor_timm_perf_rocm_mi355,inductor_torchbench_perf_rocm_mi355
default: inductor_huggingface_perf_rocm,inductor_timm_perf_rocm,inductor_torchbench_perf_rocm
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
@ -88,27 +88,23 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 1, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 2, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 3, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 4, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 5, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 6, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 7, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 1, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 2, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 3, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 4, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 5, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 6, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 7, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 8, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 9, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit

View File

@ -106,16 +106,6 @@ jobs:
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit

View File

@ -12,7 +12,6 @@ on:
- landchecks/*
tags:
- ciflow/pull/*
- ciflow/trunk/*
workflow_dispatch:
permissions: read-all
@ -33,12 +32,10 @@ jobs:
name: Get changed files
uses: ./.github/workflows/_get-changed-files.yml
with:
all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') || github.event_name == 'push' }}
all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') }}
lintrunner-clang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
# Needed to prevent deduping on HUD
name: lintrunner-clang-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }}
needs: [get-label-type, get-changed-files]
# Only run if there are changed files relevant to clangtidy / clangformat
if: |
@ -78,7 +75,6 @@ jobs:
# fails to find types when it should
lintrunner-mypy:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
name: lintrunner-mypy-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }}
needs: [get-label-type, get-changed-files]
# Only run if there are changed files relevant to mypy
if: |
@ -103,7 +99,6 @@ jobs:
lintrunner-noclang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
name: lintrunner-noclang-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }}
needs: [get-label-type, get-changed-files]
with:
timeout: 120
@ -118,9 +113,9 @@ jobs:
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
echo "Running all other linters"
if [ "$CHANGED_FILES" = '*' ]; then
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT,PYREFLY --all-files" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
else
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT,PYREFLY ${CHANGED_FILES}" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT ${CHANGED_FILES}" .github/scripts/lintrunner.sh
fi
quick-checks:

View File

@ -7,11 +7,9 @@ on:
workflow_dispatch:
inputs:
test_mode:
type: choice
options:
- 'short'
- 'long'
- 'all'
required: false
type: string
default: 'short'
description: tag filter for operator benchmarks, options from long, short, all
schedule:
# Run at 07:00 UTC every Sunday
@ -30,49 +28,38 @@ permissions:
contents: read
jobs:
x86-opbenchmark-build:
opbenchmark-build:
if: github.repository_owner == 'pytorch'
name: x86-opbenchmark-build
name: opbenchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.10-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_${{ inputs.test_mode || 'short' }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
{ config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets: inherit
x86-opbenchmark-test:
name: x86-opbenchmark-test
uses: ./.github/workflows/_linux-test.yml
needs: x86-opbenchmark-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
docker-image: ${{ needs.x86-opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.x86-opbenchmark-build.outputs.test-matrix }}
secrets: inherit
aarch64-opbenchmark-build:
if: github.repository_owner == 'pytorch'
name: aarch64-opbenchmark-build
opbenchmark-on-demand-build:
if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }}
name: opbenchmark-on-demand-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-aarch64-py3.10
runner: linux.arm64.m7g.4xlarge
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11
build-environment: linux-jammy-py3.10-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.arm64.m8g.4xlarge" },
{ config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets: inherit
aarch64-opbenchmark-test:
name: aarch64-opbenchmark-test
opbenchmark-test:
name: opbenchmark-test
uses: ./.github/workflows/_linux-test.yml
needs: aarch64-opbenchmark-build
needs: opbenchmark-build
with:
build-environment: linux-jammy-aarch64-py3.10
docker-image: ${{ needs.aarch64-opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.aarch64-opbenchmark-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.10-gcc11-build
docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
secrets: inherit

View File

@ -182,11 +182,11 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
]}
secrets: inherit
@ -213,9 +213,9 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
]}
secrets: inherit

View File

@ -127,6 +127,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan

View File

@ -1,9 +1,6 @@
name: rocm-mi355
on:
push:
tags:
- ciflow/rocm-mi355/*
workflow_dispatch:
schedule:
- cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT
@ -45,12 +42,12 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
]}
secrets: inherit
@ -67,7 +64,5 @@ jobs:
build-environment: linux-noble-rocm-py3.12-mi355
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
tests-to-include: >-
${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor test_matmul_cuda test_scaled_matmul_cuda'
|| '' }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
secrets: inherit

View File

@ -1,63 +0,0 @@
name: rocm-navi31
on:
push:
tags:
- ciflow/rocm-navi31/*
workflow_dispatch:
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 45 */2 * * 1-5
- cron: 45 4,12 * * 0,6
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
target-determination:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/target_determination.yml
permissions:
id-token: write
contents: read
linux-jammy-rocm-py3_10-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3_10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
tests-to-include: >-
${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
inductor/test_flex_attention inductor/test_max_autotune' || '' }}
secrets: inherit

View File

@ -140,6 +140,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan

View File

@ -56,7 +56,7 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
build-generates-artifacts: false
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: "linux.c7i.4xlarge"
runner: "linux.4xlarge"
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
@ -160,10 +160,9 @@ jobs:
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
]}
secrets: inherit
@ -180,50 +179,16 @@ jobs:
disable-monitor: false
secrets: inherit
win-vs2022-cuda12_8-py3-build:
name: win-vs2022-cuda12.8-py3
win-vs2022-cuda12_6-py3-build:
name: win-vs2022-cuda12.6-py3
uses: ./.github/workflows/_win-build.yml
needs: get-label-type
with:
build-environment: win-vs2022-cuda12.8-py3
cuda-version: "12.8"
build-environment: win-vs2022-cuda12.6-py3
cuda-version: "12.6"
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
secrets: inherit
linux-jammy-rocm-py3_10-build:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
secrets: inherit
inductor-build:
name: inductor-build
uses: ./.github/workflows/_linux-build.yml
@ -234,23 +199,6 @@ jobs:
cuda-arch-list: '8.0'
secrets: inherit
# Test cross-compiled models with Windows libs extracted from wheel
cross-compile-linux-test:
name: cross-compile-linux-test
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-build
- get-label-type
- win-vs2022-cuda12_8-py3-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }}
test-matrix: |
{ include: [
{ config: "aoti_cross_compile_for_windows", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", win_torch_wheel_artifact: "win-vs2022-cuda12.8-py3" },
]}
secrets: inherit
verify-cachebench-cpu-build:
name: verify-cachebench-cpu-build
uses: ./.github/workflows/_linux-build.yml
@ -300,14 +248,3 @@ jobs:
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_10-gcc11-full-debug-build-only:
name: linux-jammy-py3.10-gcc11-full-debug-build-only
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.2xlarge.memory
build-environment: linux-jammy-py3.10-gcc11-full-debug-build-only
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
secrets: inherit

View File

@ -46,7 +46,7 @@ jobs:
runner: linux.24xlarge.memory
test-matrix: |
{ include: [
{ config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_entrypoints_test", shard: 1, num_shards: 1,runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_regression_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
@ -54,7 +54,7 @@ jobs:
{ config: "vllm_pytorch_compilation_unit_tests", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_lora_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_multi_model_test_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"},
{ config: "vllm_language_model_test_extended_generation_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"},
{ config: "vllm_languagde_model_test_extended_generation_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"},
{ config: "vllm_distributed_test_2_gpu_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_lora_test", shard: 0, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_lora_test", shard: 1, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },

View File

@ -35,7 +35,7 @@ jobs:
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-1-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3
runner: linux.c7i.12xlarge
runner: linux.12xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
@ -56,7 +56,7 @@ jobs:
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
runner: linux.c7i.12xlarge
runner: linux.12xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },

4
.gitignore vendored
View File

@ -88,7 +88,7 @@ torch_compile_debug/
# Listed manually because some files in this directory are not generated
torch/testing/_internal/generated/annotated_fn_args.py
torch/testing/_internal/data/*.pt
torch/headeronly/version.h
torch/csrc/api/include/torch/version.h
torch/csrc/cudnn/cuDNN.cpp
torch/csrc/generated
torch/csrc/generic/TensorMethods.cpp
@ -374,7 +374,6 @@ third_party/ruy/
third_party/glog/
# Virtualenv
.venv/
venv/
# Log files
@ -396,4 +395,3 @@ android/pytorch_android_torchvision/.cxx
CLAUDE.local.md
/test_*.py
/debug_*.py
CLAUDE_CONTEXT/

View File

@ -28,7 +28,7 @@ exclude_patterns = [
'torch/lib/**',
'venv/**',
'**/*.pyi',
"tools/experimental/torchfuzz/**",
"tools/experimental/dynamic_shapes/torchfuzz/**",
'tools/test/test_selective_build.py',
]
command = [
@ -198,7 +198,7 @@ exclude_patterns = [
'tools/test/gen_operators_yaml_test.py',
'tools/test/gen_oplist_test.py',
'tools/test/test_selective_build.py',
'tools/experimental/torchfuzz/**',
'tools/experimental/dynamic_shapes/torchfuzz/**',
]
command = [
'python3',
@ -209,46 +209,6 @@ command = [
'@{{PATHSFILE}}'
]
[[linter]]
code = 'PYREFLY'
include_patterns = [
'torch/**/*.py',
'torch/**/*.pyi',
'torchgen/**/*.py',
'torchgen/**/*.pyi',
'functorch/**/*.py',
'functorch/**/*.pyi',
]
exclude_patterns = []
command = [
'python3',
'tools/linter/adapters/pyrefly_linter.py',
'--config=pyrefly.toml',
]
init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'numpy==2.1.0 ; python_version >= "3.12"',
'expecttest==0.3.0',
'pyrefly==0.36.2',
'sympy==1.13.3',
'types-requests==2.27.25',
'types-pyyaml==6.0.2',
'types-tabulate==0.8.8',
'types-protobuf==5.29.1.20250403',
'types-setuptools==79.0.0.20250422',
'types-jinja2==2.11.9',
'types-colorama==0.4.6',
'filelock==3.18.0',
'junitparser==2.1.1',
'rich==14.1.0',
'optree==0.17.0',
'types-openpyxl==3.1.5.20250919',
'types-python-dateutil==2.9.0.20251008'
]
[[linter]]
code = 'CLANGTIDY'
include_patterns = [
@ -1613,7 +1573,6 @@ exclude_patterns = [
'torch/_inductor/fx_passes/serialized_patterns/**',
'torch/_inductor/autoheuristic/artifacts/**',
'test/dynamo/cpython/**',
'test/test_torchfuzz_repros.py',
'scripts/**',
'third_party/**',
'fb/**',

View File

@ -13,9 +13,6 @@ load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libt
load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources")
load("//:tools/bazel.bzl", "rules")
# Export files for use by torch/headeronly (where version.h generation now lives)
exports_files(["version.txt"])
define_targets(rules = rules)
COMMON_COPTS = [
@ -693,9 +690,7 @@ cc_library(
"torch/csrc/*/generated/*.h",
"torch/csrc/jit/serialization/mobile_bytecode_generated.h",
] + torch_cuda_headers,
) + GENERATED_AUTOGRAD_CPP + [
"//torch/headeronly:version_h",
],
) + GENERATED_AUTOGRAD_CPP + [":version_h"],
includes = [
"third_party/kineto/libkineto/include",
"torch/csrc",

View File

@ -388,9 +388,9 @@ cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker
option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance
# on Windows and AArch64.
# on Windows.
option(USE_MIMALLOC_ON_MKL "Use mimalloc on MKL" OFF)
if(WIN32 OR (CPU_AARCH64 AND NOT APPLE))
if(WIN32)
set(USE_MIMALLOC ON)
# Not enable USE_MIMALLOC_ON_MKL due to it caused issue:

View File

@ -201,17 +201,3 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A
/torch/csrc/stable/ @janeyx99 @mikaylagawarecki
/torch/headeronly/ @janeyx99
/torch/header_only_apis.txt @janeyx99
# FlexAttention
/torch/nn/attention/flex_attention.py @drisspg
/torch/_higher_order_ops/flex_attention.py @drisspg
/torch/_inductor/kernel/flex/ @drisspg
/torch/_inductor/codegen/cpp_flex_attention_template.py @drisspg
/test/inductor/test_flex_attention.py @drisspg
/test/inductor/test_flex_decoding.py @drisspg
# Low Precision GEMMs
/aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58
/test/test_scaled_matmul_cuda.py @drisspg @slayton58

View File

@ -53,7 +53,7 @@ ARG CUDA_PATH=cu121
ARG INSTALL_CHANNEL=whl/nightly
# Automatically set by buildx
# pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574
RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0
RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION} conda=25.7.0
ARG TARGETPLATFORM

View File

@ -28,19 +28,4 @@ inline std::ostream& operator<<(std::ostream& stream, at::BlasBackend backend) {
return stream << BlasBackendToString(backend);
}
namespace blas {
enum class ScalingType : std::uint8_t {
TensorWise, // fp32 scales
RowWise, // fp32 scales
BlockWise1x16, // fp8_e4m3fn scales
BlockWise1x32, // fp8_e8m0fnu scales
BlockWise1x128, // fp32 scales
BlockWise128x128, // fp32 scales
};
enum class SwizzleType : std::uint8_t { NO_SWIZZLE = 0, SWIZZLE_32_4_4 = 1 };
} // namespace blas
} // namespace at

View File

@ -256,7 +256,6 @@ endif()
IF(USE_FBGEMM_GENAI)
set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/)
set(FBGEMM_GENAI_SRCS ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize)
if(USE_CUDA)
# To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build.
# If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
@ -293,65 +292,58 @@ IF(USE_FBGEMM_GENAI)
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
)
target_include_directories(fbgemm_genai PRIVATE
target_include_directories(fbgemm_genai PUBLIC
${FBGEMM_THIRD_PARTY}/cutlass/include
${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
${fbgemm_genai_mx8mx8bf16_grouped}
${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h
)
else()
if(USE_ROCM)
# Only include the kernels we want to build to avoid increasing binary size.
file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
"${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
"${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
# Add FBGEMM_GENAI include directories for torch_ops.h
list(APPEND ATen_CUDA_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
list(APPEND ATen_CUDA_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
elseif(USE_ROCM)
# Only include the kernels we want to build to avoid increasing binary size.
file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
"${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
"${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-amdgpu-coerce-illegal-types=1
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
if(DEFINED ROCM_VERSION_DEV AND ROCM_VERSION_DEV VERSION_LESS "7.2.0")
list(PREPEND FBGEMM_GENAI_EXTRA_HIPCC_FLAGS -mllvm -amdgpu-coerce-illegal-types=1)
# Only compile for gfx942 for now.
# This is rather hacky, I could not figure out a clean solution :(
set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
endif()
set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
# Only compile for gfx942 for now.
# This is rather hacky, I could not figure out a clean solution :(
set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
target_include_directories(fbgemm_genai PUBLIC
# FBGEMM version of Composable Kernel is used due to some customizations
${FBGEMM_THIRD_PARTY}/composable_kernel/include
${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
${FBGEMM_THIRD_PARTY}/cutlass/include
${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h
)
endif()
set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
target_include_directories(fbgemm_genai PRIVATE
# FBGEMM version of Composable Kernel is used due to some customizations
${FBGEMM_THIRD_PARTY}/composable_kernel/include
${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
${FBGEMM_THIRD_PARTY}/cutlass/include
${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h
)
# Add FBGEMM_GENAI include directories for torch_ops.h
list(APPEND ATen_HIP_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
list(APPEND ATen_HIP_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
endif()
endif()
@ -700,6 +692,12 @@ if(USE_CUDA AND NOT USE_ROCM)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include)
# Add FBGEMM_GENAI include directories for torch_ops.h
if(USE_FBGEMM_GENAI)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
endif()
if($ENV{ATEN_STATIC_CUDA})
if(CUDA_VERSION VERSION_LESS_EQUAL 12.9)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS

View File

@ -144,7 +144,8 @@ inline std::string _all_equal_numel_error(at::ArrayRef<Tensor> tensors) {
inline bool _apply_preamble(ArrayRef<Tensor> tensors) {
checkDeviceType("CPU_tensor_apply", tensors, kCPU);
checkLayout("CPU_tensor_apply", tensors, kStrided);
TORCH_CHECK(_all_equal_numel(tensors), _all_equal_numel_error(tensors));
if (!_all_equal_numel(tensors))
TORCH_CHECK(false, _all_equal_numel_error(tensors));
// An empty tensor has no elements
for (auto& t : tensors)
if (t.numel() == 0)

View File

@ -40,6 +40,41 @@ namespace {
->conv
->rnn
*/
const std::map<std::string, std::vector<std::string>> _fp32_precisions = {
{"generic", {{"ieee", "tf32", "bf16", "none"}}},
{"mkldnn", {{"ieee", "tf32", "bf16", "none"}}},
{"cuda", {{"ieee", "tf32", "none"}}}};
// Check whether the backend and op are legal
void check_fp32_prec_backend_and_op(
const std::string& backend,
const std::string& op) {
static std::vector<std::string> backends = {"generic", "mkldnn", "cuda"};
static std::vector<std::string> operators = {"conv", "matmul", "rnn", "all"};
TORCH_CHECK(
std::find(backends.begin(), backends.end(), backend) != backends.end(),
"Invalid backend: ",
backend);
TORCH_CHECK(
std::find(operators.begin(), operators.end(), op) != operators.end(),
"Invalid operator: ",
op);
if (backend == "generic") {
TORCH_CHECK(op == "all", "Invalid operation for generic backend: ", op);
}
}
// Return whether the precision is supported by backends
bool validate_fp32_prec(
const std::string& backend,
const std::string& precision) {
auto iterp = _fp32_precisions.find(backend);
TORCH_CHECK(iterp != _fp32_precisions.end());
auto precisions = iterp->second;
bool valid = std::find(precisions.begin(), precisions.end(), precision) !=
precisions.end();
return valid;
}
C10_ALWAYS_INLINE void warn_deprecated_fp32_precision_api(){
TORCH_WARN_ONCE(
@ -51,54 +86,6 @@ namespace {
}
} // namespace
Float32Backend str2backend(const std::string& name) {
if (name == "generic")
return Float32Backend::GENERIC;
else if (name == "cuda")
return Float32Backend::CUDA;
else if (name == "mkldnn")
return Float32Backend::MKLDNN;
TORCH_CHECK(false, "Unknown backend: ", name);
}
Float32Op str2op(const std::string& name) {
if (name == "all")
return Float32Op::ALL;
else if (name == "conv")
return Float32Op::CONV;
else if (name == "rnn")
return Float32Op::RNN;
else if (name == "matmul")
return Float32Op::MATMUL;
TORCH_CHECK(false, "Unknown op: ", name);
}
Float32Precision str2precision(const std::string& name) {
if (name == "none")
return Float32Precision::NONE;
else if (name == "ieee")
return Float32Precision::IEEE;
else if (name == "tf32")
return Float32Precision::TF32;
else if (name == "bf16")
return Float32Precision::BF16;
TORCH_CHECK(false, "Unknown precision: ", name);
}
std::string precision2str(Float32Precision prec) {
switch (prec) {
case Float32Precision::NONE:
return "none";
case Float32Precision::IEEE:
return "ieee";
case Float32Precision::TF32:
return "tf32";
case Float32Precision::BF16:
return "bf16";
}
TORCH_CHECK(false, "Invalid enum Float32Precision(", static_cast<int>(prec), ")");
}
Context::Context() = default;
// TODO: This could be bad juju if someone calls globalContext() in the
@ -192,10 +179,10 @@ void Context::setUserEnabledNNPACK(bool e) {
enabled_nnpack = e;
}
bool Context::allowTF32CuDNN(std::optional<Float32Op> op) const {
if (!op.has_value()) {
bool allow_tf32_rnn = float32Precision(Float32Backend::CUDA, Float32Op::RNN) == Float32Precision::TF32;
bool allow_tf32_conv = float32Precision(Float32Backend::CUDA, Float32Op::CONV) == Float32Precision::TF32;
bool Context::allowTF32CuDNN(const std::string& op) const {
if (op.empty()){
bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32";
bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32";
TORCH_CHECK(
allow_tf32_rnn == allow_tf32_conv && allow_tf32_rnn == allow_tf32_cudnn,
"PyTorch is checking whether allow_tf32 is enabled for cuDNN without a specific operator name,",
@ -204,15 +191,15 @@ bool Context::allowTF32CuDNN(std::optional<Float32Op> op) const {
"We suggest only using the new API to set the TF32 flag(s). See also: ",
"https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices");
} else {
return float32Precision(Float32Backend::CUDA, op.value()) == Float32Precision::TF32;
return float32Precision("cuda", op) == "tf32";
}
warn_deprecated_fp32_precision_api();
return allow_tf32_cudnn;
}
void Context::setAllowTF32CuDNN(bool b) {
setFloat32Precision(Float32Backend::CUDA, Float32Op::RNN, b ? Float32Precision::TF32 : Float32Precision::NONE);
setFloat32Precision(Float32Backend::CUDA, Float32Op::CONV, b ? Float32Precision::TF32 : Float32Precision::NONE);
setFloat32Precision("cuda", "rnn", b ? "tf32" : "none");
setFloat32Precision("cuda", "conv", b ? "tf32" : "none");
allow_tf32_cudnn = b;
warn_deprecated_fp32_precision_api();
}
@ -318,7 +305,7 @@ void Context::setImmediateMiopen(bool b) {
bool Context::allowTF32CuBLAS() const {
bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
bool allow_tf32_new = float32Precision(Float32Backend::CUDA, Float32Op::MATMUL) == Float32Precision::TF32;
bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32";
TORCH_CHECK(
legacy_allow_tf32 == allow_tf32_new,
"PyTorch is checking whether allow_tf32_new is enabled for cuBlas matmul,",
@ -331,17 +318,17 @@ bool Context::allowTF32CuBLAS() const {
void Context::setAllowTF32CuBLAS(bool b) {
float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST;
setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, b ? Float32Precision::TF32 : Float32Precision::IEEE);
setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee");
}
Float32MatmulPrecision Context::float32MatmulPrecision() const {
bool invalid = float32Precision(Float32Backend::CUDA, Float32Op::MATMUL) == Float32Precision::TF32 &&
bool invalid = float32Precision("cuda", "matmul") == "tf32" &&
float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST;
invalid = invalid ||
(float32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL) == Float32Precision::BF16 &&
(float32Precision("mkldnn", "matmul") == "bf16" &&
float32_matmul_precision != at::Float32MatmulPrecision::MEDIUM);
invalid = invalid ||
(float32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL) == Float32Precision::TF32 &&
(float32Precision("mkldnn", "matmul") == "tf32" &&
float32_matmul_precision != at::Float32MatmulPrecision::HIGH);
TORCH_CHECK(
!invalid,
@ -353,26 +340,15 @@ Float32MatmulPrecision Context::float32MatmulPrecision() const {
return float32_matmul_precision;
}
Float32Precision Context::float32Precision(Float32Backend backend, Float32Op op) const {
std::pair<Float32Backend, Float32Op> key{backend, op};
auto it = fp32_precision.find(key);
TORCH_CHECK(it != fp32_precision.end(), "Invalid (backend, op) pair: (", backend, ", ", op, ")");
Float32Precision precision = it->second;
if (precision == Float32Precision::NONE) {
key.second = Float32Op::ALL;
precision = fp32_precision.find(key)->second;
}
if (precision == Float32Precision::NONE) {
key.first = Float32Backend::GENERIC;
precision = fp32_precision.find(key)->second;
}
// "cuda" does not support "bf16"
if (backend == Float32Backend::CUDA && precision == Float32Precision::BF16) {
return Float32Precision::NONE;
}
return precision;
std::string Context::float32Precision(const std::string& backend, const std::string& op) const {
check_fp32_prec_backend_and_op(backend, op);
auto precision = fp32_precision.find(backend)->second.find(op)->second;
if (precision == "none")
precision = fp32_precision.find(backend)->second.find("all")->second;
if (precision == "none")
precision = fp32_precision.find("generic")->second.find("all")->second;
bool valid_prec = validate_fp32_prec(backend, precision);
return valid_prec ? precision : "none";
}
void Context::setFloat32MatmulPrecision(const std::string &s) {
@ -381,18 +357,18 @@ void Context::setFloat32MatmulPrecision(const std::string &s) {
// TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention
if (s_ == "highest") {
float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST;
setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::IEEE);
setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::IEEE);
setFloat32Precision("cuda", "matmul", "ieee");
setFloat32Precision("mkldnn", "matmul", "ieee");
return true;
} else if (s_ == "high") {
float32_matmul_precision = at::Float32MatmulPrecision::HIGH;
setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::TF32);
setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::TF32);
setFloat32Precision("cuda", "matmul", "tf32");
setFloat32Precision("mkldnn", "matmul", "tf32");
return true;
} else if (s_ == "medium") {
float32_matmul_precision = at::Float32MatmulPrecision::MEDIUM;
setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::TF32);
setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::BF16);
setFloat32Precision("cuda", "matmul", "tf32");
setFloat32Precision("mkldnn", "matmul", "bf16");
return true;
}
return false;
@ -406,16 +382,25 @@ void Context::setFloat32MatmulPrecision(const std::string &s) {
"setFloat32MatmulPrecision call has no effect.");
}
void Context::setFloat32Precision(Float32Backend backend, Float32Op op, Float32Precision p) {
auto it = fp32_precision.find(std::make_pair(backend, op));
TORCH_CHECK(
it != fp32_precision.end(),
"Invalid (backend, op) pair: (", backend, ", ", op, ")");
TORCH_CHECK(
!(backend == Float32Backend::CUDA && p == Float32Precision::BF16),
"backend 'cuda' does not support precision 'bf16'");
it->second = p;
void Context::setFloat32Precision(const std::string& backend, const std::string& op, const std::string& p) {
check_fp32_prec_backend_and_op(backend, op);
if (validate_fp32_prec(backend, p)) {
fp32_precision[backend][op] = p;
} else {
std::string msg;
auto iterp = _fp32_precisions.find(backend);
TORCH_CHECK(iterp != _fp32_precisions.end());
for (const auto& p : iterp->second) {
msg += p;
msg += " ";
}
TORCH_WARN(
"you have set wrong precision for backend:",
backend,
" setFloat32Precision call has no effect.",
"Please choose precision from: ",
msg);
}
}
at::LinalgBackend Context::linalgPreferredBackend() const {
@ -483,8 +468,8 @@ at::BlasBackend Context::blasPreferredBackend() {
#if ROCM_VERSION >= 60300
"gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
#endif
#if ROCM_VERSION >= 70000
"gfx950", "gfx1150", "gfx1151"
#if ROCM_VERSION >= 60500
"gfx950"
#endif
};
for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
@ -587,33 +572,20 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {
rocm_fa_preferred_backend = b;
}
CuBLASReductionOption Context::allowFP16ReductionCuBLAS() const {
bool Context::allowFP16ReductionCuBLAS() const {
return allow_fp16_reduction_cublas;
}
CuBLASReductionOption inline get_reduction_option(bool allow_reduced_precision, bool allow_splitk) {
TORCH_CHECK(
!(allow_reduced_precision && !allow_splitk),
"allow_splitk=False is not supported when reduced precision reductions are enabled");
if (allow_reduced_precision) {
return CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
} else if (allow_splitk) {
return CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK;
} else {
return CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK;
}
void Context::setAllowFP16ReductionCuBLAS(bool b) {
allow_fp16_reduction_cublas = b;
}
void Context::setAllowFP16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
allow_fp16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
}
CuBLASReductionOption Context::allowBF16ReductionCuBLAS() const {
bool Context::allowBF16ReductionCuBLAS() const {
return allow_bf16_reduction_cublas;
}
void Context::setAllowBF16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
allow_bf16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
void Context::setAllowBF16ReductionCuBLAS(bool b) {
allow_bf16_reduction_cublas = b;
}
bool Context::allowFP16AccumulationCuBLAS() const {

View File

@ -25,13 +25,11 @@
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <c10/util/env.h>
#include <c10/util/hash.h>
#include <c10/util/irange.h>
#include <cstdint>
#include <map>
#include <mutex>
#include <unordered_map>
namespace at {
@ -39,20 +37,6 @@ class Tensor;
enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM };
enum class CuBLASReductionOption : uint8_t {
AllowReducedPrecisionWithSplitK = 0,
DisallowReducedPrecisionAllowSplitK = 1,
DisallowReducedPrecisionDisallowSplitK = 2,
};
enum class TORCH_API Float32Backend { GENERIC, CUDA, MKLDNN };
enum class TORCH_API Float32Op { ALL, CONV, RNN, MATMUL };
enum class TORCH_API Float32Precision { NONE, IEEE, TF32, BF16 };
TORCH_API Float32Backend str2backend(const std::string& name);
TORCH_API Float32Op str2op(const std::string& name);
TORCH_API Float32Precision str2precision(const std::string& name);
TORCH_API std::string precision2str(Float32Precision prec);
class TORCH_API Context {
public:
Context();
@ -226,15 +210,15 @@ class TORCH_API Context {
bool userEnabledMkldnn() const;
void setUserEnabledMkldnn(bool e);
bool benchmarkCuDNN() const;
void setBenchmarkCuDNN(bool /*b*/);
void setBenchmarkCuDNN(bool);
int benchmarkLimitCuDNN() const;
void setBenchmarkLimitCuDNN(int /*b*/);
void setBenchmarkLimitCuDNN(int);
bool immediateMiopen() const;
void setImmediateMiopen(bool /*b*/);
void setImmediateMiopen(bool);
bool deterministicCuDNN() const;
void setDeterministicCuDNN(bool /*b*/);
void setDeterministicCuDNN(bool);
bool deterministicMkldnn() const;
void setDeterministicMkldnn(bool /*b*/);
void setDeterministicMkldnn(bool);
bool userEnabledNNPACK() const;
void setUserEnabledNNPACK(bool e);
@ -252,32 +236,32 @@ class TORCH_API Context {
void setSDPPriorityOrder(const std::vector<int64_t>& order);
std::array<at::SDPBackend, at::num_sdp_backends> sDPPriorityOrder();
void setSDPUseFlash(bool /*e*/);
void setSDPUseFlash(bool);
bool userEnabledFlashSDP() const;
void setSDPUseMemEfficient(bool /*e*/);
void setSDPUseMemEfficient(bool);
bool userEnabledMemEfficientSDP() const;
void setSDPUseMath(bool /*e*/);
void setSDPUseMath(bool);
bool userEnabledMathSDP() const;
void setSDPUseCuDNN(bool /*e*/);
void setSDPUseCuDNN(bool);
bool userEnabledCuDNNSDP() const;
void setAllowFP16BF16ReductionMathSDP(bool /*e*/);
void setAllowFP16BF16ReductionMathSDP(bool);
bool allowFP16BF16ReductionMathSDP() const;
void setSDPUseOverrideable(bool /*e*/);
void setSDPUseOverrideable(bool);
bool userEnabledOverrideableSDP() const;
at::LinalgBackend linalgPreferredBackend() const;
void setLinalgPreferredBackend(at::LinalgBackend /*b*/);
void setLinalgPreferredBackend(at::LinalgBackend);
at::BlasBackend blasPreferredBackend();
void setBlasPreferredBackend(at::BlasBackend /*b*/);
void setBlasPreferredBackend(at::BlasBackend);
at::ROCmFABackend getROCmFAPreferredBackend();
void setROCmFAPreferredBackend(at::ROCmFABackend /*b*/);
void setROCmFAPreferredBackend(at::ROCmFABackend);
// Note [Enabling Deterministic Operations]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -310,9 +294,9 @@ class TORCH_API Context {
bool deterministicAlgorithms() const;
bool deterministicAlgorithmsWarnOnly() const;
void setDeterministicAlgorithms(bool /*b*/, bool /*warn_only*/);
void setDeterministicAlgorithms(bool, bool);
bool deterministicFillUninitializedMemory() const;
void setDeterministicFillUninitializedMemory(bool /*b*/);
void setDeterministicFillUninitializedMemory(bool);
// Note [Writing Nondeterministic Operations]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -352,27 +336,25 @@ class TORCH_API Context {
void setFloat32MatmulPrecision(const std::string& s);
void setFloat32Precision(
Float32Backend backend,
Float32Op op,
Float32Precision p);
bool allowTF32CuDNN(std::optional<Float32Op> op = std::nullopt) const;
void setAllowTF32CuDNN(bool /*b*/);
const std::string& backend,
const std::string& op,
const std::string& s);
bool allowTF32CuDNN(const std::string& op = std::string()) const;
void setAllowTF32CuDNN(bool);
bool allowTF32OneDNN() const;
void setAllowTF32OneDNN(bool /*b*/);
void setAllowTF32OneDNN(bool);
bool allowTF32CuBLAS() const;
void setAllowTF32CuBLAS(bool /*b*/);
void setAllowTF32CuBLAS(bool);
Float32MatmulPrecision float32MatmulPrecision() const;
Float32Precision float32Precision(Float32Backend backend, Float32Op op) const;
CuBLASReductionOption allowFP16ReductionCuBLAS() const;
void setAllowFP16ReductionCuBLAS(
bool allow_reduced_precision,
bool allow_splitk = true);
CuBLASReductionOption allowBF16ReductionCuBLAS() const;
void setAllowBF16ReductionCuBLAS(
bool allow_reduced_precision,
bool allow_splitk = true);
std::string float32Precision(
const std::string& backend,
const std::string& op) const;
bool allowFP16ReductionCuBLAS() const;
void setAllowFP16ReductionCuBLAS(bool);
bool allowBF16ReductionCuBLAS() const;
void setAllowBF16ReductionCuBLAS(bool);
bool allowFP16AccumulationCuBLAS() const;
void setAllowFP16AccumulationCuBLAS(bool /*b*/);
void setAllowFP16AccumulationCuBLAS(bool);
// Matmuls can use a so-called "persistent" kernel which launches one CUDA
// block for each SM on the GPU, and each block then iterates over multiple
@ -384,7 +366,7 @@ class TORCH_API Context {
// to make matmuls target only a subset of the SMs, so they can fully schedule
// even next to a comms kernel, and only be a few percent slower.
std::optional<int32_t> _SMCarveout_EXPERIMENTAL() const;
void _setSMCarveout_EXPERIMENTAL(std::optional<int32_t> /*c*/);
void _setSMCarveout_EXPERIMENTAL(std::optional<int32_t>);
at::QEngine qEngine() const;
void setQEngine(at::QEngine e);
@ -405,7 +387,7 @@ class TORCH_API Context {
void setDefaultMobileCPUAllocator();
void unsetDefaultMobileCPUAllocator();
bool allowFP16ReductionCPU() const;
void setAllowFP16ReductionCPU(bool /*b*/);
void setAllowFP16ReductionCPU(bool);
// Preserved for BC
void lazyInitCUDA() {
@ -462,10 +444,8 @@ class TORCH_API Context {
: at::Float32MatmulPrecision::HIGHEST;
int benchmark_limit_cudnn = 10;
bool allow_tf32_cudnn = true;
CuBLASReductionOption allow_fp16_reduction_cublas =
CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
CuBLASReductionOption allow_bf16_reduction_cublas =
CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
bool allow_fp16_reduction_cublas = true;
bool allow_bf16_reduction_cublas = true;
bool allow_fp16_accumulation_cublas = false;
std::optional<int32_t> sm_carveout = std::nullopt;
bool enabled_mkldnn = true;
@ -495,20 +475,21 @@ class TORCH_API Context {
bool enable_sparse_tensor_invariant_checks = false;
bool allow_fp16_reduction_cpu = false;
using Key = std::pair<Float32Backend, Float32Op>;
std::unordered_map<Key, Float32Precision, c10::hash<Key>> fp32_precision = {
{{Float32Backend::GENERIC, Float32Op::ALL}, Float32Precision::NONE},
{{Float32Backend::MKLDNN, Float32Op::ALL}, Float32Precision::NONE},
{{Float32Backend::MKLDNN, Float32Op::CONV}, Float32Precision::NONE},
{{Float32Backend::MKLDNN, Float32Op::RNN}, Float32Precision::NONE},
{{Float32Backend::MKLDNN, Float32Op::MATMUL}, Float32Precision::NONE},
{{Float32Backend::CUDA, Float32Op::ALL}, Float32Precision::NONE},
{{Float32Backend::CUDA, Float32Op::CONV}, Float32Precision::TF32},
{{Float32Backend::CUDA, Float32Op::RNN}, Float32Precision::TF32},
{{Float32Backend::CUDA, Float32Op::MATMUL},
float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST
? Float32Precision::NONE
: Float32Precision::TF32},
std::map<std::string, std::map<std::string, std::string>> fp32_precision = {
{"generic", {{"all", "none"}}},
{"mkldnn",
{{"matmul", "none"},
{"conv", "none"},
{"rnn", "none"},
{"all", "none"}}},
{"cuda",
{{"matmul",
float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST
? "none"
: "tf32"},
{"conv", "tf32"},
{"rnn", "tf32"},
{"all", "none"}}},
};
Allocator* prev_allocator_ptr_{nullptr};
@ -690,4 +671,5 @@ struct TORCH_API ROCmBackwardPassGuard {
~ROCmBackwardPassGuard();
static bool is_backward_pass();
};
} // namespace at

View File

@ -389,16 +389,37 @@ void fillVersion<DLManagedTensorVersioned>(
// constructed out of ATen tensor
template <class T>
T* toDLPackImpl(const Tensor& src) {
auto view = src;
// Detect whether there is need to normalize the strides
// Background: gh-83069
//
// However, normalizing strides can come at a high-cost
// to slow down toDLPack conversion 3x, so we
// only normalize if needed.
//
// The following code detects whether the src follows
// a continuous pattern. If the src follows such pattern (common-case)
// then we do not need to normalize the strides.
bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1;
// less common case, try normalizing the strides
if (need_normalize_strides) {
// create a new tensor with possibly normalized strides
// gh-83069
auto shape = src.sizes();
view = src.as_strided(shape, {1}, src.storage_offset());
}
ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>);
atDLMTensor->handle = src;
atDLMTensor->handle = view;
atDLMTensor->tensor.manager_ctx = atDLMTensor;
atDLMTensor->tensor.deleter = &deleter<T>;
atDLMTensor->tensor.dl_tensor.data = src.data_ptr();
atDLMTensor->tensor.dl_tensor.data = view.data_ptr();
atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device());
atDLMTensor->tensor.dl_tensor.ndim = static_cast<int32_t>(src.dim());
atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src);
atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(src.sizes().data());
atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(src.strides().data());
atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(view.sizes().data());
atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(view.strides().data());
atDLMTensor->tensor.dl_tensor.byte_offset = 0;
fillVersion(&atDLMTensor->tensor);

View File

@ -52,16 +52,16 @@ struct DLPackTraits {};
template <>
struct DLPackTraits<DLManagedTensor> {
inline static constexpr const char* capsule = "dltensor";
inline static constexpr const char* used = "used_dltensor";
inline static const char* capsule = "dltensor";
inline static const char* used = "used_dltensor";
inline static auto toDLPack = at::toDLPack;
inline static auto fromDLPack = at::fromDLPack;
};
template <>
struct DLPackTraits<DLManagedTensorVersioned> {
inline static constexpr const char* capsule = "dltensor_versioned";
inline static constexpr const char* used = "used_dltensor_versioned";
inline static const char* capsule = "dltensor_versioned";
inline static const char* used = "used_dltensor_versioned";
inline static auto toDLPack = at::toDLPackVersioned;
inline static auto fromDLPack = at::fromDLPackVersioned;
};

Some files were not shown because too many files have changed in this diff Show More