Compare commits


3 Commits

21aa086ecc  [Dynamo][Hierarchical Compile] Flatten tuple inputs for regions  (2025-08-15 23:45:18 -07:00)
ghstack-source-id: e99eea21f6c2e02a15b0027ae1cedffbf4003231
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158812

c5f23c5cbf  [Dynamo][Hierarchical Compile] Flatten tuple outputs in graph dedupe pass  (2025-08-15 18:45:06 -07:00)
ghstack-source-id: 9b509d723379eee9e38c7ad61ea0c5620ef0d844
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158811

4b146389a4  [Dynamo][Hierarchical Compile] Refactor for tuple flattening  (2025-08-14 16:14:27 -07:00)
ghstack-source-id: f168b556bb440ea93f5ed3001baa9b36acf929ff
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158810
396 changed files with 2308 additions and 13741 deletions
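
Note on the commits above: all three concern flattening tuple inputs and outputs when Dynamo's hierarchical compile deduplicates repeated graph regions. As a rough illustration of what that flattening means, here is a minimal Python sketch using torch.utils._pytree (a real PyTorch utility; the wrapper itself is hypothetical, not the PRs' actual API):

import torch.utils._pytree as pytree

def call_with_flat_io(fn, args):
    # Flatten nested tuple arguments, e.g. (a, (b, c)) -> [a, b, c] plus a spec.
    flat_args, in_spec = pytree.tree_flatten(args)
    # Rebuild the original structure before calling the wrapped function.
    out = fn(*pytree.tree_unflatten(flat_args, in_spec))
    # Flatten tuple outputs too, so callers see a flat list of leaves.
    flat_out, out_spec = pytree.tree_flatten(out)
    return flat_out, out_spec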

View File

@@ -92,7 +92,6 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libnccl.so.2",
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
"/usr/local/cuda/lib64/libcudnn_graph.so.9",
@@ -210,6 +209,8 @@ if __name__ == "__main__":
# MAX_JOBS=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "
# nvshmem is broken for aarch64 see https://github.com/pytorch/pytorch/issues/160425
build_vars += "USE_NVSHMEM=OFF "
override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
desired_cuda = os.getenv("DESIRED_CUDA")
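
Aside: the hunk above accumulates KEY=VALUE pairs into a single build_vars string. A small Python sketch of that pattern, with an assumed way of applying the pairs to a build subprocess (the real script's consumer may differ):

import os
import subprocess

build_vars = ""
enable_cuda = os.getenv("DESIRED_CUDA", "").startswith("cu")  # assumption for this sketch
if enable_cuda:
    build_vars += "MAX_JOBS=5 "
    build_vars += "USE_NVSHMEM=OFF "  # aarch64 workaround noted in the diff above
# Turn "K1=V1 K2=V2 " into environment overrides and run the build with them.
overrides = dict(kv.split("=", 1) for kv in build_vars.split())
subprocess.run(["python", "setup.py", "bdist_wheel"], env={**os.environ, **overrides}, check=True)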

View File

@@ -64,10 +64,6 @@ FROM cuda as cuda12.9
RUN bash ./install_cuda.sh 12.9
ENV DESIRED_CUDA=12.9
FROM cuda as cuda13.0
RUN bash ./install_cuda.sh 13.0
ENV DESIRED_CUDA=13.0
FROM ${ROCM_IMAGE} as rocm
ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
ADD ./common/install_mkl.sh install_mkl.sh
@@ -80,10 +76,10 @@ ADD ./common/install_mnist.sh install_mnist.sh
RUN bash ./install_mnist.sh
FROM base as all_cuda
COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8
COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6
COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8
COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9
COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0
# Final step
FROM ${BASE_TARGET} as final

View File

@@ -168,7 +168,7 @@ case "$tag" in
TRITON=yes
;;
pytorch-linux-jammy-py3-clang12-onnx)
ANACONDA_PYTHON_VERSION=3.10
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=12
VISION=yes
ONNX=yes
@@ -288,6 +288,7 @@ case "$tag" in
GCC_VERSION=11
ACL=yes
VISION=yes
CONDA_CMAKE=yes
OPENBLAS=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
@@ -298,6 +299,7 @@ case "$tag" in
GCC_VERSION=11
ACL=yes
VISION=yes
CONDA_CMAKE=yes
OPENBLAS=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific

View File

@@ -1,2 +0,0 @@
transformers==4.54.0
soxr==0.5.0

View File

@@ -0,0 +1 @@
v4.54.0

View File

@@ -1 +0,0 @@
v2.27.7-1

View File

@@ -1 +1 @@
0958dc9b2bb815e428f721f9da599dab0dc1c5d7
ae324eeac8e102a2b40370e341460f3791353398

View File

@@ -10,7 +10,7 @@ else
arch_path='sbsa'
fi
NVSHMEM_VERSION=3.3.20
NVSHMEM_VERSION=3.3.9
function install_cuda {
version=$1
@@ -62,16 +62,14 @@ function install_nvshmem {
mkdir -p "${tmpdir}" && cd "${tmpdir}"
# nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html
# This pattern is a lie as it is not consistent across versions; for 3.3.9 it was cuda_ver-arch-nvshmem-ver
filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive"
suffix=".tar.xz"
url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}${suffix}"
filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"
url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"
# download, unpack, install
wget -q "${url}"
tar xf "${filename}${suffix}"
cp -a "${filename}/include/"* /usr/local/cuda/include/
cp -a "${filename}/lib/"* /usr/local/cuda/lib64/
tar xf "${filename}.tar.gz"
cp -a "libnvshmem/include/"* /usr/local/cuda/include/
cp -a "libnvshmem/lib/"* /usr/local/cuda/lib64/
# cleanup
cd ..
@@ -128,6 +126,74 @@ function install_129 {
ldconfig
}
function prune_124 {
echo "Pruning CUDA 12.4"
#####################################################################################
# CUDA 12.4 prune static libs
#####################################################################################
export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
if [[ -n "$OVERRIDE_GENCODE" ]]; then
export GENCODE=$OVERRIDE_GENCODE
fi
if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
fi
# all CUDA libs except CuDNN and CuBLAS
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
| xargs -I {} bash -c \
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
# prune CuDNN and CuBLAS
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
#####################################################################################
# CUDA 12.4 prune visual tools
#####################################################################################
export CUDA_BASE="/usr/local/cuda-12.4/"
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
}
function prune_126 {
echo "Pruning CUDA 12.6"
#####################################################################################
# CUDA 12.6 prune static libs
#####################################################################################
export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
if [[ -n "$OVERRIDE_GENCODE" ]]; then
export GENCODE=$OVERRIDE_GENCODE
fi
if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
fi
# all CUDA libs except CuDNN and CuBLAS
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
| xargs -I {} bash -c \
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
# prune CuDNN and CuBLAS
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
#####################################################################################
# CUDA 12.6 prune visual tools
#####################################################################################
export CUDA_BASE="/usr/local/cuda-12.6/"
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
}
function install_128 {
CUDNN_VERSION=9.8.0.87
echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
@@ -146,39 +212,18 @@ function install_128 {
ldconfig
}
function install_130 {
CUDNN_VERSION=9.12.0.46
NVSHMEM_VERSION=3.3.20
echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
# install CUDA 13.0 in the same container
install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
install_cudnn 13 $CUDNN_VERSION
install_nvshmem 13 $NVSHMEM_VERSION
CUDA_VERSION=13.0 bash install_nccl.sh
CUDA_VERSION=13.0 bash install_cusparselt.sh
ldconfig
}
# idiomatic parameter and option handling in sh
while test $# -gt 0
do
case "$1" in
12.4) install_124;
12.4) install_124; prune_124
;;
12.6|12.6.*) install_126;
12.6|12.6.*) install_126; prune_126
;;
12.8|12.8.*) install_128;
;;
12.9|12.9.*) install_129;
;;
13.0|13.0.*) install_130;
;;
*) echo "bad argument $1"; exit 1
;;
esac
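
Side note on install_nvshmem above: the "pattern is a lie" comment is the crux of the change, since 3.3.20 and 3.3.9 publish differently shaped archives. The two naming schemes side by side, with values taken from this diff (Python used only to render the strings):

arch_path, cuda_major = "sbsa", "12"
# 3.3.20 scheme (removed here): arch first, "-archive" suffix, .tar.xz,
# and the tarball unpacks into a directory named after the file.
name_3320 = f"libnvshmem-linux-{arch_path}-3.3.20_cuda{cuda_major}-archive.tar.xz"
# 3.3.9 scheme (restored here): cuda version first, .tar.gz,
# and the tarball unpacks into a plain libnvshmem/ directory.
name_339 = f"libnvshmem_cuda{cuda_major}-linux-{arch_path}-3.3.9.tar.gz"
print(name_3320)
print(name_339)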

View File

@@ -5,15 +5,7 @@ set -ex
# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && cd tmp_cusparselt
if [[ ${CUDA_VERSION:0:4} =~ "13" ]]; then
arch_path='sbsa'
export TARGETARCH=${TARGETARCH:-$(uname -m)}
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
arch_path='x86_64'
fi
CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.8.0.4_cuda13-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
arch_path='sbsa'
export TARGETARCH=${TARGETARCH:-$(uname -m)}
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then

View File

@@ -5,7 +5,9 @@ set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
function install_huggingface() {
pip_install -r huggingface-requirements.txt
local version
commit=$(get_pinned_commit huggingface)
pip_install "git+https://github.com/huggingface/transformers@${commit}"
}
function install_timm() {
@@ -24,6 +26,9 @@ function install_torchbench() {
python install.py --continue_on_fail
# soxr comes from https://github.com/huggingface/transformers/pull/39429
pip install transformers==4.54.0 soxr==0.5.0
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
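
The restored install_huggingface resolves its pin via get_pinned_commit, and the restored huggingface.txt earlier in this diff contains v4.54.0. A rough Python equivalent of what that helper is assumed to do (the real implementation lives in common_utils.sh and may differ):

from pathlib import Path

def get_pinned_commit(name: str) -> str:
    # Assumed behavior: read the pin file for this dependency and strip whitespace.
    return Path(f".ci/docker/ci_commit_pins/{name}.txt").read_text().strip()

# e.g. get_pinned_commit("huggingface") -> "v4.54.0", which then feeds:
#   pip install "git+https://github.com/huggingface/transformers@v4.54.0"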

View File

@@ -7,8 +7,6 @@ if [[ ${CUDA_VERSION:0:2} == "11" ]]; then
NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt)
elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt)
elif [[ ${CUDA_VERSION:0:2} == "13" ]]; then
NCCL_VERSION=$(cat ci_commit_pins/nccl-cu13.txt)
else
echo "Unexpected CUDA_VERSION ${CUDA_VERSION}"
exit 1

View File

@@ -19,7 +19,7 @@ pip_install \
transformers==4.36.2
pip_install coloredlogs packaging
pip_install onnxruntime==1.22.1
pip_install onnxruntime==1.18.1
pip_install onnxscript==0.3.1
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers

View File

@@ -96,11 +96,11 @@ ARG ANACONDA_PYTHON_VERSION
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
# (optional) Install non-default Ninja version
ARG NINJA_VERSION

View File

@@ -56,10 +56,10 @@ RUN rm install_openssl.sh
ARG INDUCTOR_BENCHMARKS
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
# Install XPU Dependencies
ARG XPU_VERSION

View File

@@ -96,11 +96,11 @@ RUN rm install_openssl.sh
ARG INDUCTOR_BENCHMARKS
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
ARG TRITON
ARG TRITON_CPU

View File

@@ -62,7 +62,7 @@ class VllmBuildParameters:
)
# OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm")
output_dir: Path = env_path_field("OUTPUT_DIR", "shared")
# --- Build args ----------------------------------------------------------
target_stage: str = env_str_field("TARGET_STAGE", "export-wheels")

View File

@@ -16,7 +16,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
magma/build_magma.sh
.PHONY: all
all: magma-cuda130
all: magma-cuda129
all: magma-cuda128
all: magma-cuda126
@@ -26,12 +25,6 @@ clean:
$(RM) -r magma-*
$(RM) -r output
.PHONY: magma-cuda130
magma-cuda130: DESIRED_CUDA := 13.0
magma-cuda130: CUDA_ARCH_LIST := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
magma-cuda130:
$(DOCKER_RUN)
.PHONY: magma-cuda129
magma-cuda129: DESIRED_CUDA := 12.9
magma-cuda129: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120

View File

@@ -28,7 +28,6 @@ pushd ${PACKAGE_DIR}/magma-${MAGMA_VERSION}
patch < ${PACKAGE_FILES}/CMake.patch
patch < ${PACKAGE_FILES}/cmakelists.patch
patch -p0 < ${PACKAGE_FILES}/thread_queue.patch
patch -p1 < ${PACKAGE_FILES}/cuda13.patch
patch -p1 < ${PACKAGE_FILES}/getrf_shfl.patch
patch -p1 < ${PACKAGE_FILES}/getrf_nbparam.patch
# The build.sh script expects to be executed from the sources root folder
@@ -38,7 +37,6 @@ popd
# Package recipe, license and tarball
# Folder and package name are backward compatible for the build workflow
cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh
cp ${PACKAGE_FILES}/cuda13.patch ${PACKAGE_RECIPE}/cuda13.patch
cp ${PACKAGE_FILES}/thread_queue.patch ${PACKAGE_RECIPE}/thread_queue.patch
cp ${PACKAGE_FILES}/cmakelists.patch ${PACKAGE_RECIPE}/cmakelists.patch
cp ${PACKAGE_FILES}/getrf_shfl.patch ${PACKAGE_RECIPE}/getrf_shfl.patch

View File

@@ -1,26 +0,0 @@
diff --git a/interface_cuda/interface.cpp b/interface_cuda/interface.cpp
index 73fed1b20..e77519bfe 100644
--- a/interface_cuda/interface.cpp
+++ b/interface_cuda/interface.cpp
@@ -438,14 +438,20 @@ magma_print_environment()
cudaDeviceProp prop;
err = cudaGetDeviceProperties( &prop, dev );
check_error( err );
+ #ifdef MAGMA_HAVE_CUDA
+#if CUDA_VERSION < 13000
printf( "%% device %d: %s, %.1f MHz clock, %.1f MiB memory, capability %d.%d\n",
dev,
prop.name,
prop.clockRate / 1000.,
+#else
+ printf( "%% device %d: %s, ??? MHz clock, %.1f MiB memory, capability %d.%d\n",
+ dev,
+ prop.name,
+#endif
prop.totalGlobalMem / (1024.*1024.),
prop.major,
prop.minor );
- #ifdef MAGMA_HAVE_CUDA
int arch = prop.major*100 + prop.minor*10;
if ( arch < MAGMA_CUDA_ARCH_MIN ) {
printf("\n"

View File

@@ -134,7 +134,7 @@ if [[ $CUDA_VERSION == 12* ]]; then
"/usr/local/cuda/lib64/libnvrtc-builtins.so"
"/usr/local/cuda/lib64/libcufile.so.0"
"/usr/local/cuda/lib64/libcufile_rdma.so.1"
"/usr/local/cuda/lib64/libnvshmem_host.so.3"
"/usr/local/cuda/lib64/libnvshem_host.so.3"
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12"
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so"
)

View File

@@ -174,15 +174,13 @@ checkout_install_torchbench() {
# to install and test other models
python install.py --continue_on_fail
fi
popd
pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt
# https://github.com/pytorch/pytorch/issues/160689 to remove torchao because
# its current version 0.12.0 doesn't work with transformers 4.54.0
pip uninstall -y torchao
# soxr comes from https://github.com/huggingface/transformers/pull/39429
pip install transformers==4.54.0 soxr==0.5.0
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
torchbench_setup_macos() {

View File

@@ -1701,7 +1701,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
PYTHONPATH=/torchbench:$PYTHONPATH test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti
fi

View File

@@ -133,25 +133,6 @@ EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
RENAME_WHEEL=true
case $desired_python in
3.14t)
echo "Using 3.14 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
PYYAML_PINNED_VERSION=">=6.0.1"
NUMPY_PINNED_VERSION="=2.1.0"
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
RENAME_WHEEL=false
;;
3.14)
echo "Using 3.14t deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
PYYAML_PINNED_VERSION=">=6.0.1"
NUMPY_PINNED_VERSION="=2.1.0"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
RENAME_WHEEL=false
;;
3.13t)
echo "Using 3.13 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"

View File

@@ -1,80 +0,0 @@
# .github/workflows/build-external.yml
name: Build External packages
description: build external packages for PyTorch
inputs:
cuda-arch-list:
description: TORCH_CUDA_ARCH_LIST (e.g., "8.0;8.9;9.0")
type: string
required: true
default: ""
docker-image:
description: Base image to use
type: string
required: true
build-targets:
description: Build targets
type: string
required: true
torch-wheel-dir:
description: Directory of the built torch wheel
type: string
required: false
default: dist
output-dir:
description: Directory to store build artifact
default: external
type: string
required: false
outputs:
build_time:
description: "Total build time in seconds"
value: ${{ steps.build-external.outputs.build_time }}
output_dir:
description: "Directory where build artifact is stored"
value: ${{ steps.build-external.outputs.output_dir }}
runs:
using: composite
steps:
- name: Build external packages in sequence
id: build-external
env:
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
BASE_IMAGE: ${{ inputs.docker-image }}
BUILD_TARGETS: ${{ inputs.build-targets }}
PARENT_OUTPUT_DIR: ${{ inputs.output-dir}}
shell: bash
run: |
set -euo pipefail
python3 --version
docker images
START_TIME=$(date +%s)
(
cd .ci/lumen_cli
python3 -m pip install -e .
)
MAX_JOBS="$(nproc --ignore=6)"
export MAX_JOBS
# Split the comma-separated list and build each target
IFS=',' read -ra TARGETS <<< "$BUILD_TARGETS"
for target in "${TARGETS[@]}"; do
OUTPUT_DIR="$PARENT_OUTPUT_DIR/$target"
export OUTPUT_DIR
echo "Building external package: $target in directory $OUTPUT_DIR"
python3 -m cli.run build external "$target"
done
END_TIME=$(date +%s)
{
echo "build_time=$((END_TIME - START_TIME))"
if [ -d "$PARENT_OUTPUT_DIR" ]; then
echo "output_dir=$PARENT_OUTPUT_DIR"
fi
} >> "$GITHUB_OUTPUT"

View File

@@ -1 +1 @@
02351a683668dd65bc82343e55245e308eb97b4e
bdb88e1d66f272cad72156c90ac8428ca61a601c

View File

@@ -1 +1 @@
0fc8fa751a4321d6531467537ff77cf3c1c70260
0ca2393b47e72c4424a49aa3b32c7c5d0e378a72

View File

@@ -1 +1 @@
a1c6ee92c85e8b0955c20892ed68f032a6015c09
095faec1e7b6cc47220181e74ae9cde2605f9b00

View File

@@ -1,20 +0,0 @@
version: 2
updates:
# Update to the latest transformers version with dependabot
- package-ecosystem: "pip"
directory: "/.ci/docker/ci_commit_pins"
schedule:
interval: "daily"
target-branch: "main"
allow:
- dependency-name: "transformers"
commit-message:
prefix: "[Dependabot] Update"
include: "scope"
labels:
- "dependencies"
- "open source"
- "python"
- "topic: not user facing"
- "module: ci"
- "module: inductor"

View File

@@ -27,7 +27,6 @@ ciflow_push_tags:
- ciflow/trunk
- ciflow/unstable
- ciflow/xpu
- ciflow/vllm
- ciflow/torchbench
- ciflow/op-benchmark
- ciflow/pull

View File

@@ -54,7 +54,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
@@ -71,7 +71,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
@@ -88,7 +88,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
@@ -315,7 +315,7 @@ def generate_wheels_matrix(
if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
continue
# TODO: Enable python 3.14 on non linux OSes
if os not in ["linux", "linux-aarch64", "macos-arm64"] and (
if os != "linux" and (
python_version == "3.14" or python_version == "3.14t"
):
continue
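
Note on PYTORCH_EXTRA_INSTALL_REQUIREMENTS above: each value packs many PEP 508 requirement strings into one, joined with " | ". A quick Python sketch of splitting such a value back into individual requirements (the separator is read off the diff; the consumer code is assumed):

extra = (
    "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64'"
)
# Each element is a standalone requirement with its environment marker intact.
requirements = [r.strip() for r in extra.split("|")]
for req in requirements:
    print(req)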

View File

@@ -110,33 +110,12 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

View File

@@ -96,13 +96,6 @@ on:
required: false
type: string
default: ""
build-external-packages:
description: |
If set, builds the listed external packages and saves their wheels as artifacts;
use a comma-separated list of packages to build, e.g. 'vllm,transformers'.
required: false
type: string
default: ""
secrets:
HUGGING_FACE_HUB_TOKEN:
@@ -363,26 +356,6 @@ jobs:
END_TIME=$(date +%s)
echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"
- name: Build external packages
id: build-external-packages
if: inputs.build-external-packages != '' && steps.build.outcome != 'skipped'
uses: ./.github/actions/build-external-packages
with:
build-targets: ${{ inputs.build-external-packages }}
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
cuda-arch-list: ${{ inputs.cuda-arch-list }}
output-dir: external
- name: Move external packages to dist
if: steps.build-external-packages.outputs.output_dir != '' && steps.build-external-packages.outcome != 'skipped'
shell: bash
run: |
src="${{ steps.build-external-packages.outputs.output_dir }}"
if [ -d "$src" ]; then
mkdir -p "dist/$(dirname "$src")"
mv "$src" "dist/$(dirname "$src")/"
fi
- name: Stop monitoring script
if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
shell: bash

View File

@@ -136,7 +136,7 @@ jobs:
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
run: |
"$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_json==0.6.7
"$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_sajson==0.6.7
"$VENV_PATH/bin/python3" -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

View File

@@ -36,7 +36,7 @@ jobs:
runs-on: linux.9xlarge.ephemeral
strategy:
matrix:
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"]
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "rocm6.3", "rocm6.4", "cpu"]
steps:
- name: Build docker image
uses: pytorch/pytorch/.github/actions/binary-docker-build@main

View File

@@ -34,7 +34,7 @@ jobs:
id-token: write
strategy:
matrix:
cuda_version: ["130", "129", "128", "126"]
cuda_version: ["129", "128", "126"]
steps:
- name: Checkout PyTorch
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@@ -57,11 +57,6 @@ jobs:
echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
- name: Checkout optional submodules
run: python3 tools/optional_submodules.py
- name: Copy docs requirements for inclusion
run: |
# Replace symlink with actual file
rm docs/requirements.txt || true
cp .ci/docker/requirements-docs.txt docs/requirements.txt
- name: Create source distribution
run: |
# Create new folder with specified name so extracting the archive yields that

View File

@@ -132,7 +132,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -243,7 +243,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -354,7 +354,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -465,7 +465,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -576,7 +576,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -687,7 +687,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -712,225 +712,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cpu-aarch64-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_14-cpu-aarch64-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cpu-aarch64-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cpu-aarch64-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cpu-aarch64
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9-aarch64
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9-aarch64
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cpu-aarch64-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_14t-cpu-aarch64-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cpu-aarch64-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cpu-aarch64-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cpu-aarch64
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9-aarch64
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9-aarch64
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@@ -60,7 +60,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
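The PYTORCH_EXTRA_INSTALL_REQUIREMENTS value packs all of these pins into a single YAML scalar, with " | " separating individual PEP 508 requirements; downstream tooling presumably splits the string back apart when wheel metadata is generated. A rough sketch of that split (parse_extra_requirements is a hypothetical helper, not PyTorch's actual API):

    # Split the '|'-separated requirements value into individual PEP 508 strings.
    import os

    def parse_extra_requirements(raw: str) -> list[str]:
        # Hypothetical helper: trim whitespace and drop empty fragments.
        return [part.strip() for part in raw.split("|") if part.strip()]

    raw = os.environ.get("PYTORCH_EXTRA_INSTALL_REQUIREMENTS", "")
    for spec in parse_extra_requirements(raw):
        print(spec)  # e.g. "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' ..."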

View File

@@ -127,7 +127,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-test: # Testing
@@ -193,7 +193,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-test: # Testing
@@ -259,7 +259,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-test: # Testing
@@ -719,7 +719,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_6-test: # Testing
@@ -785,7 +785,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_8-test: # Testing
@@ -851,7 +851,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-test: # Testing
@@ -1311,7 +1311,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_6-test: # Testing
@@ -1377,7 +1377,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-test: # Testing
@@ -1508,7 +1508,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-test: # Testing
@@ -1968,7 +1968,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_6-test: # Testing
@@ -2034,7 +2034,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
@@ -2100,7 +2100,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-test: # Testing
@@ -2560,7 +2560,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_6-test: # Testing
@@ -2626,7 +2626,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_8-test: # Testing
@@ -2692,7 +2692,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-test: # Testing
@@ -3152,7 +3152,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_6-test: # Testing
@@ -3218,7 +3218,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_8-test: # Testing
@@ -3284,7 +3284,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_9-test: # Testing
@ -3744,7 +3744,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_6-test: # Testing
@ -3810,7 +3810,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_8-test: # Testing
@ -3876,7 +3876,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_9-test: # Testing
@ -4336,7 +4336,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_6-test: # Testing
@ -4402,7 +4402,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_8-test: # Testing
@ -4468,7 +4468,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_9-test: # Testing
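The `|`-separated entries above are PEP 508 requirement specifiers with environment markers. A rough sketch of how a consumer turns them into per-wheel dependencies (the consumer and its mechanics are assumptions, not shown in this diff):
import os
# Assumed consumer: the build splits the "|"-separated PEP 508 specifiers
# into the wheel's dependency list. Each entry keeps its environment marker,
# so the nvidia-* packages are only pulled in on x86_64 Linux.
reqs = os.environ.get("PYTORCH_EXTRA_INSTALL_REQUIREMENTS", "")
install_requires = [spec.strip() for spec in reqs.split("|") if spec.strip()]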

View File

@ -115,33 +115,12 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -260,33 +239,12 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -405,33 +363,12 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -550,33 +487,12 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -695,33 +611,12 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -840,33 +735,12 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -900,293 +774,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
wheel-py3_14-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.14"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
# shellcheck disable=SC2129
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
# Build
USE_PYTORCH_METAL_EXPORT=1
USE_COREML_DELEGATE=1
TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}"
export USE_PYTORCH_METAL_EXPORT
export USE_COREML_DELEGATE
export TORCH_PACKAGE_NAME
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
python "${PYTORCH_ROOT}/.ci/pytorch/smoke_test/smoke_test.py" --package torchonly ${SMOKE_TEST_PARAMS}
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: wheel-py3_14-cpu
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
wheel-py3_14-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: wheel-py3_14-cpu-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cpu
DESIRED_PYTHON: "3.14"
build_name: wheel-py3_14-cpu
use_s3: False
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
wheel-py3_14t-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.14t"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
# shellcheck disable=SC2129
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
# Build
USE_PYTORCH_METAL_EXPORT=1
USE_COREML_DELEGATE=1
TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}"
export USE_PYTORCH_METAL_EXPORT
export USE_COREML_DELEGATE
export TORCH_PACKAGE_NAME
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
python "${PYTORCH_ROOT}/.ci/pytorch/smoke_test/smoke_test.py" --package torchonly ${SMOKE_TEST_PARAMS}
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: wheel-py3_14t-cpu
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
wheel-py3_14t-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: wheel-py3_14t-cpu-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cpu
DESIRED_PYTHON: "3.14t"
build_name: wheel-py3_14t-cpu
use_s3: False
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@ -4,12 +4,9 @@ on:
pull_request:
paths:
- .github/workflows/h100-cutlass-backend.yml
- torch/_inductor/codegen/cuda/**
- test/inductor/test_cutlass_backend.py
- test/inductor/test_cutlass_evt.py
workflow_dispatch:
schedule:
- cron: 22 9,21 * * * # every 12 hours
- cron: 22 9 * * * # every 24 hours about 2:22am PDT
push:
tags:
- ciflow/h100-cutlass-backend/*

View File

@ -93,7 +93,7 @@ jobs:
script: |
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
echo "Running mypy"
ADDITIONAL_LINTRUNNER_ARGS="--take MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--take MYPY --all-files" .github/scripts/lintrunner.sh
lintrunner-noclang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -111,9 +111,9 @@ jobs:
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
echo "Running all other linters"
if [ "$CHANGED_FILES" = '*' ]; then
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY --all-files" .github/scripts/lintrunner.sh
else
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT ${CHANGED_FILES}" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY ${CHANGED_FILES}" .github/scripts/lintrunner.sh
fi
quick-checks:
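To approximate the new job split locally (flags copied from the hunks above; assumes a lintrunner setup matching the repo):
# type-checking job: MYPYSTRICT is retired, only MYPY remains
lintrunner --take MYPY --all-files
# "everything else" job: clang linters and mypy excluded
lintrunner --skip CLANGTIDY,CLANGFORMAT,MYPY --all-files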

View File

@ -156,13 +156,13 @@ jobs:
sync-tag: asan-test
secrets: inherit
linux-jammy-py3_10-clang12-onnx-build:
name: linux-jammy-py3.10-clang12-onnx
linux-jammy-py3_9-clang12-onnx-build:
name: linux-jammy-py3.9-clang12-onnx
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang12-onnx
build-environment: linux-jammy-py3.9-clang12-onnx
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-onnx
test-matrix: |
{ include: [
@ -171,16 +171,16 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_10-clang12-onnx-test:
name: linux-jammy-py3.10-clang12-onnx
linux-jammy-py3_9-clang12-onnx-test:
name: linux-jammy-py3.9-clang12-onnx
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_10-clang12-onnx-build
- linux-jammy-py3_9-clang12-onnx-build
- target-determination
with:
build-environment: linux-jammy-py3.10-clang12-onnx
docker-image: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.9-clang12-onnx
docker-image: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_9-clang12-build:

View File

@ -1,45 +0,0 @@
name: vllm-test
on:
push:
tags:
- ciflow/vllm/*
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
torch-build-sm89:
name: sm89-vllm-test
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-additional-packages: "vision audio torchao"
build-external-packages: "vllm"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm
cuda-arch-list: '8.9'
runner: linux.24xlarge.memory
test-matrix: |
{ include: [
{ config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit


.gitignore vendored
View File

@ -32,7 +32,6 @@ coverage.xml
aten/build/
aten/src/ATen/Config.h
aten/src/ATen/cuda/CUDAConfig.h
aten/src/ATen/hip/HIPConfig.h
benchmarks/.data
caffe2/cpp_test/
dist/

View File

@ -121,7 +121,7 @@ inline int64_t legacy_cat_wrap_dim_symint(
const std::vector<std::vector<c10::SymInt>>& tensor_sizes) {
for (auto& sizes : tensor_sizes) {
if (sizes.size() == 1) {
if (TORCH_GUARD_OR_FALSE(sizes[0].sym_eq(0))) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[0].sym_eq(0))) {
continue;
}
}
@ -135,7 +135,7 @@ inline int64_t legacy_cat_wrap_dim(
const MaterializedITensorListRef& tensors) {
for (const Tensor& tensor : tensors) {
if (tensor.dim() == 1) {
if (TORCH_GUARD_OR_FALSE(tensor.sym_sizes()[0].sym_eq(0))) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(tensor.sym_sizes()[0].sym_eq(0))) {
continue;
}
}
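The helper touched here implements cat's legacy special case: a 1-D tensor with zero elements is skipped when resolving the concatenation dim; the two guard macros differ only in how that size-0 test is decided under symbolic shapes. The eager behavior being preserved, as a sketch:
import torch
a = torch.randn(2, 3)
e = torch.empty(0)              # 1-D, zero elements: skipped by legacy_cat_wrap_dim
out = torch.cat([a, e], dim=1)  # works despite the otherwise-incompatible shapes
assert out.shape == (2, 3)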

View File

@ -1847,12 +1847,8 @@ int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fa
switch (scaling_type) {
case ScalingType::BlockWise1x32:
TORCH_CHECK(scale_dtype == kFloat8_e8m0fnu);
#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000)
#ifdef USE_ROCM
return HIPBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0;
#else
#if CUDA_VERSION >= 12080
return CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0;
#endif // USE_ROCM
#else
TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales of 1x32 blocks is only supported for CUDA 12.8 and above");
#endif // if CUDA_VERSION >= 12080
@ -1950,26 +1946,12 @@ void scaled_gemm(
// hipblaslt supported row-wise before cublas, and did so their own way (via
// the SCALE_POINTERs), but then migrated to match how cublas does it (via
// the SCALE_MODEs). Here we check for this early custom mode.
bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise);
#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
if (use_rowwise) {
if (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise) {
matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
}
else if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) {
#if ROCM_VERSION >= 70000
if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) {
// TODO: add constraints based on hipblaslt internals
TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0),
"Matrix dimensions must be multiples of 32 for MX format. "
"Got m=", m, ", n=", n, ", k=", k);
}
#endif
}
#else
// rowwise isn't supported using cublaslt or older hipblaslt
TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt");
#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
computeDesc.setAttribute(matmulDescA, mat1_scale_ptr);
computeDesc.setAttribute(matmulDescB, mat2_scale_ptr);
if (result_scale_ptr != nullptr) {
@ -2008,16 +1990,15 @@ void scaled_gemm(
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, CUBLASLT_EPILOGUE_BIAS);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype));
}
// For other data types, use the get_scale_mode function based on scaling type
// The SCALE_MODE attrs only exist in cuBLAS 12.8+/ROCm 7.0 or in recent hipblaslt,
// but we must invoke get_scale_mode anyways to trigger the version checks.
// Note that AMD/ROCm follows OCP Spec 1.0, which is different from NVIDIA's implementation. See get_scale_mode() for details.
[[maybe_unused]] int a_scale_mode = get_scale_mode(mat1_scaling_type, mat1_scale_dtype, use_fast_accum);
[[maybe_unused]] int b_scale_mode = get_scale_mode(mat2_scaling_type, mat2_scale_dtype, use_fast_accum);
#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000 && defined(HIPBLASLT_OUTER_VEC))
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, a_scale_mode);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, b_scale_mode);
#endif // if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000 && defined(HIPBLASLT_OUTER_VEC))
// The SCALE_MODE attrs only exist in cuBLAS 12.8+ or in recent hipblaslt,
// but we must invoke get_scale_mode anyways to trigger the version checks.
[[maybe_unused]] int a_scale_mode = get_scale_mode(mat1_scaling_type, mat1_scale_dtype, use_fast_accum);
[[maybe_unused]] int b_scale_mode = get_scale_mode(mat2_scaling_type, mat2_scale_dtype, use_fast_accum);
#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC))
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, a_scale_mode);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, b_scale_mode);
#endif
CuBlasLtMatmulPreference preference;
auto ltworkspace = CublasLtWorkspace();

View File

@ -90,7 +90,7 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type)
case c10::ScalarType::Float8_e5m2fnuz:
return HIP_R_8F_E5M2_FNUZ;
#endif
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12080) || (defined(USE_ROCM) && ROCM_VERSION >= 70000)
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12080)
case c10::ScalarType::Float4_e2m1fn_x2:
return CUDA_R_4F_E2M1;
#endif

View File

@ -4,9 +4,6 @@
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGuard.h>
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/driver_api.h>
#endif
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
@ -15,7 +12,6 @@
namespace at::cuda {
static std::vector<int8_t> p2pAccessEnabled_;
static std::vector<int8_t> fabricAccessEnabled_;
static int64_t num_devices_ = -1;
namespace detail {
@ -33,23 +29,20 @@ void init_p2p_access_cache(int64_t num_devices) {
for (const auto i : c10::irange(num_devices)) {
p2pAccessEnabled_[i * num_devices + i] = 1;
}
fabricAccessEnabled_.clear();
fabricAccessEnabled_.resize(num_devices, -1);
}
} // namespace detail
} // namespace detail
bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) {
at::globalContext().lazyInitDevice(c10::DeviceType::CUDA);
TORCH_CHECK(dev >= 0 || dev < num_devices_, dev, " is not a device");
TORCH_CHECK(
dev_to_access >= 0 || dev_to_access < num_devices_,
dev_to_access,
" is not a device");
TORCH_CHECK(dev >= 0 || dev < num_devices_,
dev, " is not a device");
TORCH_CHECK(dev_to_access >= 0 || dev_to_access < num_devices_,
dev_to_access, " is not a device");
TORCH_INTERNAL_ASSERT(num_devices_ >= 0, "p2p access cache not initialized");
auto& cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access];
auto &cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access];
if (cache != -1) {
return cache;
@ -65,118 +58,4 @@ bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) {
return cache;
}
namespace {
#if !defined USE_ROCM && defined CUDA_VERSION && CUDA_VERSION >= 12040 && defined PYTORCH_C10_DRIVER_API_SUPPORTED
nvmlDevice_t get_nvml_device(c10::DeviceIndex dev) {
static bool nvml_init [[maybe_unused]] = []() {
TORCH_INTERNAL_ASSERT(NVML_SUCCESS == DriverAPI::get()->nvmlInit_v2_());
return true;
}();
auto prop = at::cuda::getDeviceProperties(dev);
char pci_id // NOLINT(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
snprintf(
pci_id,
sizeof(pci_id),
NVML_DEVICE_PCI_BUS_ID_FMT,
prop->pciDomainID,
prop->pciBusID,
prop->pciDeviceID);
nvmlDevice_t nvml_device = nullptr;
TORCH_INTERNAL_ASSERT(
NVML_SUCCESS ==
DriverAPI::get()->nvmlDeviceGetHandleByPciBusId_v2_(
pci_id, &nvml_device));
return nvml_device;
}
bool isFabricSupported() {
// 1. try allocating memory
CUmemGenericAllocationHandle handle = 0;
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
size_t granularity{};
const auto driver_api = c10::cuda::DriverAPI::get();
C10_CUDA_DRIVER_CHECK(driver_api->cuMemGetAllocationGranularity_(
&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
auto status = driver_api->cuMemCreate_(&handle, granularity, &prop, 0);
if (status != CUDA_SUCCESS) {
LOG(INFO)
<< "status " << status
<< " Could not allocate memory with FABRIC handle, falling back to fd handle exchange\n";
return false;
}
// 2. check export
CUmemFabricHandle sharedHandle;
status = driver_api->cuMemExportToShareableHandle_(
&sharedHandle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0);
if (status != CUDA_SUCCESS) {
LOG(INFO)
<< "status " << status
<< " Could not export FABRIC handle, falling back to fd handle exchange\n";
driver_api->cuMemRelease_(handle);
return false;
}
// 3. check import
CUmemGenericAllocationHandle import_handle = 0;
status = driver_api->cuMemImportFromShareableHandle_(
&import_handle, &sharedHandle, CU_MEM_HANDLE_TYPE_FABRIC);
if (status != CUDA_SUCCESS) {
LOG(INFO)
<< "status " << status
<< " Could not import FABRIC handle, falling back to fd handle exchange\n";
driver_api->cuMemRelease_(handle);
return false;
}
driver_api->cuMemRelease_(import_handle);
driver_api->cuMemRelease_(handle);
LOG(INFO) << "using fabric to exchange memory handles\n";
return true;
}
#endif
} // namespace
bool get_fabric_access(c10::DeviceIndex dev) {
#if !defined USE_ROCM && defined CUDA_VERSION && CUDA_VERSION >= 12040 && defined PYTORCH_C10_DRIVER_API_SUPPORTED
at::globalContext().lazyInitDevice(c10::DeviceType::CUDA);
TORCH_CHECK(dev >= 0 || dev < num_devices_, dev, " is not a device");
auto& cache = fabricAccessEnabled_[dev];
if (cache != -1) {
return cache;
}
auto nvml_device = get_nvml_device(dev);
if (nvml_device != nullptr) {
nvmlGpuFabricInfoV_t fabricInfo;
fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED;
fabricInfo.version = nvmlGpuFabricInfo_v2;
if (DriverAPI::get()->nvmlDeviceGetGpuFabricInfoV_ == nullptr) {
return false;
}
TORCH_CHECK(
NVML_SUCCESS ==
DriverAPI::get()->nvmlDeviceGetGpuFabricInfoV_(
nvml_device, &fabricInfo));
auto state = fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED;
if (state) {
// now perform the full cycle of allocating - exporting - importing memory
state = isFabricSupported();
}
cache = state ? 1 : 0;
return cache;
} else {
return false;
}
#else
return false;
#endif
}
} // namespace at::cuda
} // namespace at::cuda::detail

View File

@ -8,6 +8,5 @@ void init_p2p_access_cache(int64_t num_devices);
}
TORCH_CUDA_CPP_API bool get_p2p_access(c10::DeviceIndex source_dev, c10::DeviceIndex dest_dev);
TORCH_CUDA_CPP_API bool get_fabric_access(c10::DeviceIndex device);
} // namespace at::cuda
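A hedged usage sketch for the declaration above (the header path and a two-GPU setup are assumptions, not taken from this diff):
#include <ATen/cuda/PeerToPeerAccess.h>  // path assumed
bool can_copy_directly() {
  // Whether device 0 may map device 1's memory peer-to-peer; the result is
  // cached per (src, dst) pair after the first query, per the .cpp diff above.
  return at::cuda::get_p2p_access(/*source_dev=*/0, /*dest_dev=*/1);
}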

View File

@ -85,15 +85,6 @@ constexpr hipDataType HipDataTypeFor<c10::Float8_e8m0fnu>() {
return static_cast<hipDataType>(500);
}
template <>
constexpr hipDataType HipDataTypeFor<c10::Float4_e2m1fn_x2>() {
#if ROCM_VERSION >= 70000
return HIP_R_4F_E2M1;
#else
return static_cast<hipDataType>(33);
#endif
}
template <typename T>
int GetBatchFromParams(const GemmParams<T>* params) {
return 1;

View File

@ -411,8 +411,7 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) {
Tensor fbgemm_linear_fp16_weight_fp32_activation(
const Tensor& input,
const Tensor& packed_weight,
const std::optional<Tensor>& bias,
at::Tensor& output) {
const std::optional<Tensor>& bias) {
TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated "
"and will be removed in a future PyTorch release.")
@ -437,11 +436,9 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation(
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
const int64_t N = packed_weight_fp16.numCols();
std::vector<int64_t> output_size = input.sizes().vec();
output_size.back() = N;
// Resize output Tensor
output.resize_(output_size);
Tensor output = at::empty(output_size, input.options().dtype(at::kFloat));
// Call the fp16 gemm interface
fbgemm::cblas_gemm_compute(
@ -463,14 +460,6 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation(
return output;
}
Tensor fbgemm_linear_fp16_weight_fp32_activation(
const Tensor& input,
const Tensor& packed_weight,
const std::optional<Tensor>& bias) {
at::Tensor output = at::empty({0}, input.options().dtype(at::kFloat));
return at::native::fbgemm_linear_fp16_weight_fp32_activation(input, packed_weight, bias, output);
}
Tensor fbgemm_linear_fp16_weight(
const Tensor& input,
const Tensor& packed_weight,
@ -479,15 +468,6 @@ Tensor fbgemm_linear_fp16_weight(
input, packed_weight, bias);
}
Tensor fbgemm_linear_fp16_weight(
const Tensor& input,
const Tensor& packed_weight,
const Tensor& bias,
at::Tensor& output) {
return at::native::fbgemm_linear_fp16_weight_fp32_activation(
input, packed_weight, bias, output);
}
#else // USE_FBGEMM
Tensor fbgemm_linear_int8_weight_fp32_activation(
@ -574,21 +554,6 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) {
false, "This PyTorch installation was not built with FBGEMM operators");
}
Tensor fbgemm_linear_fp16_weight_fp32_activation(
const Tensor& input,
const Tensor& packed_weight,
const std::optional<Tensor>& bias,
at::Tensor& output) {
TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated "
"and will be removed in a future PyTorch release.")
// We make a strong guarantee that models using these operators will have the
// same numerics across different machines. Therefore, we do not provide a
// fallback path and rather fail loudly if we cannot run FBGEMM.
TORCH_CHECK(
false, "This PyTorch installation was not built with FBGEMM operators");
}
Tensor fbgemm_linear_fp16_weight_fp32_activation(
const Tensor& input,
const Tensor& packed_weight,
@ -603,21 +568,6 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation(
false, "This PyTorch installation was not built with FBGEMM operators");
}
Tensor fbgemm_linear_fp16_weight(
const Tensor& input,
const Tensor& packed_weight,
const Tensor& bias,
at::Tensor& output) {
TORCH_WARN_ONCE("fbgemm_linear_fp16_weight is deprecated "
"and will be removed in a future PyTorch release.")
// We make a strong guarantee that models using these operators will have the
// same numerics across different machines. Therefore, we do not provide a
// fallback path and rather fail loudly if we cannot run FBGEMM.
TORCH_CHECK(
false, "This PyTorch installation was not built with FBGEMM operators");
}
Tensor fbgemm_linear_fp16_weight(
const Tensor& input,
const Tensor& packed_weight,

View File

@ -1283,35 +1283,15 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
if (use_fast_accum) {
TORCH_CHECK(mat1.scalar_type() != ScalarType::Float4_e2m1fn_x2 && mat2.scalar_type() != ScalarType::Float4_e2m1fn_x2, "`use_fast_accum` is not supported when `mat1` or `mat2` tensors have the `Float4_e2m1fn_x2` dtype.");
}
#ifdef USE_ROCM
if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2 || mat2.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
TORCH_CHECK(ROCM_VERSION >= 70000, "Float4_e2m1fn_x2 is only supported for ROCm 7.0 and above");
}
if (mat1.scalar_type() == ScalarType::Float8_e5m2 || mat2.scalar_type() == ScalarType::Float8_e5m2) {
TORCH_CHECK(ROCM_VERSION >= 60500, "Float8_e5m2 is only supported for ROCm 6.5 and above");
}
if (mat1.scalar_type() == ScalarType::Float8_e4m3fn || mat2.scalar_type() == ScalarType::Float8_e4m3fn) {
TORCH_CHECK(ROCM_VERSION >= 60500, "Float8_e4m3fn is only supported for ROCm 6.5 and above");
}
#endif
if (bias) {
TORCH_CHECK(out.scalar_type() != kFloat,
"Bias is not supported when out_dtype is set to Float32");
TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 ||
bias->scalar_type() == ScalarType::Half,
"Bias must be BFloat16 or Half, but got ", bias->scalar_type());
TORCH_CHECK((out.scalar_type() != kFloat &&
out.scalar_type() != ScalarType::BFloat16) ||
bias->scalar_type() == ScalarType::BFloat16,
"Bias must be BFloat16 to compute ", out.scalar_type(),
" output, but got ", bias->scalar_type());
TORCH_CHECK(out.scalar_type() != ScalarType::Half ||
bias->scalar_type() == ScalarType::Half,
"Bias must be Float16 to compute ", out.scalar_type(),
" output, but got ", bias->scalar_type());
TORCH_CHECK(out.scalar_type() != kFloat, "Bias is not supported when out_dtype is set to Float32");
TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 || bias->scalar_type() == ScalarType::Half,
"Bias must be either Half or BFloat16, but got ", bias->scalar_type());
TORCH_CHECK((out.scalar_type() != kFloat && out.scalar_type() != ScalarType::BFloat16) ||
bias->scalar_type() == ScalarType::BFloat16,
"Bias must be BFloat16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type());
TORCH_CHECK(out.scalar_type() != ScalarType::Half || bias->scalar_type() == ScalarType::Half,
"Bias must be Float16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type());
}
{
auto bias_ = bias.value_or(Tensor());
@ -1373,22 +1353,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16,
"hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type());
}
else if (scaling_choice_a == ScalingType::BlockWise1x32 && scaling_choice_b == ScalingType::BlockWise1x32) {
#if ROCM_VERSION >= 70000
TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
"Matrix dimensions must be multiples of 32 for block-wise scaling");
TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
out.scalar_type() == ScalarType::Half,
"Block-wise scaling only supports BFloat16 or Half output types");
#else
TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later");
#endif
}
#endif
cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, scale_result, scaling_choice_a, scaling_choice_b);
@ -1466,14 +1430,12 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
params.k = args.k;
params.a = args.mata->data_ptr();
params.a_scale_ptr = args.scale_mata_ptr;
params.a_scale_dtype = args.scale_mata_dtype.value();
params.lda = args.lda;
params.a_dtype = args.mata->scalar_type();
params.a_scale_dtype = args.scale_mata_dtype.value();
params.a_scaling_type = args.scaling_mata_type.value();
params.b = args.matb->data_ptr();
params.b_scale_ptr = args.scale_matb_ptr;
params.b_scale_dtype = args.scale_matb_dtype.value();
params.ldb = args.ldb;
params.b_dtype = args.matb->scalar_type();
params.b_scale_dtype = args.scale_matb_dtype.value();

View File

@ -148,56 +148,6 @@ namespace fe = cudnn_frontend;
#define MAX_MHA_DIM 4
// Whether we will use ragged offsets in the dense (non-nested) path
// to avoid recompilation
bool use_ragged_in_dense(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const Tensor& o,
bool has_bias) {
static bool flag =
c10::utils::check_env("TORCH_CUDNN_SDPA_AVOID_RECOMPILE") == true;
if (!flag) {
return flag;
}
TORCH_WARN_ONCE(
"TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 is currently experimental. "
"Please report any issues to https://github.com/pytorch/pytorch/issues.");
if (has_bias) {
TORCH_WARN_ONCE(
"TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 only works without bias."
"Consider using the is_causal hint instead of bias for causal masking."
"Falling back to regular dense case, which may trigger excessive recompilation.");
return !has_bias;
}
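// BSHD means the underlying memory layout is (batch, seq, heads, dim) while
// the logical shape stays (B, H, S, D); e.g., a tensor created as
// torch.randn((B, S, H, D)).transpose(1, 2) satisfies this check.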
bool all_bshd = q.dim() == 4 && q.transpose(1, 2).is_contiguous() &&
k.dim() == 4 && k.transpose(1, 2).is_contiguous() && v.dim() == 4 &&
v.transpose(1, 2).is_contiguous() && o.dim() == 4 &&
o.transpose(1, 2).is_contiguous();
if (!all_bshd) {
TORCH_WARN_ONCE(
"TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 only works with Q, K, V, and output in BSHD memory layout,"
"e.g., Q, K, V must be allocated with torch.randn((B, S, H, D).transpose(1, 2)."
"Falling back to regualr dense case, which may trigger excessive recompilation.");
}
return all_bshd;
}
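// Round `dim` up to the next power of two (e.g., 5 -> 8, 8 -> 8) by smearing
// the highest set bit into every lower bit and adding one. This lets many
// distinct sequence lengths share a single cached cuDNN graph.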
int roundup_power2(int dim) {
if (!dim) {
return 1;
}
dim--;
dim |= dim >> 1;
dim |= dim >> 2;
dim |= dim >> 4;
dim |= dim >> 8;
dim |= dim >> 16;
dim++;
return dim;
}
struct MHAParams {
c10::DeviceIndex device_id;
fe::DataType_t dataType;
@ -221,7 +171,6 @@ struct MHAParams {
// might be redundant if we take 0 dim/stride
// as signaling no-bias
bool has_attn_bias;
bool use_ragged;
};
void setMHAParams(
@ -279,20 +228,6 @@ void setMHAParams(
std::copy(k.strides().begin(), k.strides().end(), params.k_stride.begin());
std::copy(v.sizes().begin(), v.sizes().end(), params.v_dim.begin());
std::copy(v.strides().begin(), v.strides().end(), params.v_stride.begin());
bool use_ragged = use_ragged_in_dense(q, k, v, q, params.has_attn_bias);
params.use_ragged = use_ragged;
if (use_ragged) {
// ignore B - stride in BSHD (THD) avoid-recompile
params.q_stride[0] = INT_MAX;
params.k_stride[0] = INT_MAX;
params.v_stride[0] = INT_MAX;
// fix seqlen to rounded value
params.s_q = roundup_power2(params.s_q);
params.s_kv = roundup_power2(params.s_kv);
params.q_dim[2] = roundup_power2(params.q_dim[2]);
params.k_dim[2] = roundup_power2(params.k_dim[2]);
params.v_dim[2] = roundup_power2(params.v_dim[2]);
}
// uninit is OK as the struct is memset 0'd
if (params.has_attn_bias) {
std::copy(
@ -342,29 +277,15 @@ struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
template <typename T, typename KeyType>
struct MHAGraphCache {
std::unordered_map<KeyType, T, ParamsWrapperHash<KeyType>> engine_cache;
int count = 0;
int hits = 0;
// no mutexes here as caches are now thread local for v8, can also return a
// pointer to the Execution Plan if we know it will not be invalidated by
// another thread
T* find(const KeyType& key) {
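// With TORCH_CUDNN_SDPA_CACHE_DEBUG=1, report the running cache hit rate on
// each lookup; a low hit rate suggests shapes/strides keep changing and
// graphs are being rebuilt.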
static bool flag =
c10::utils::check_env("TORCH_CUDNN_SDPA_CACHE_DEBUG") == true;
if (flag && count) {
TORCH_WARN(
"SDPA Cache Called ",
count,
" times. Hit rate: ",
100 * hits / count,
"%");
}
count++;
auto it = engine_cache.find(key);
if (it == engine_cache.end()) {
return nullptr;
}
hits++;
return &(it->second);
}
@ -481,25 +402,6 @@ auto build_graph(
.set_is_inference(return_softmaxstats == false)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
auto SEQ_LEN_Q_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(SEQ_LEN_Q)
.set_name("Seq_q")
.set_dim({b, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto SEQ_LEN_KV_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(SEQ_LEN_KV)
.set_name("Seq_kv")
.set_dim({b, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
scaled_dot_product_flash_attention_options.set_seq_len_q(SEQ_LEN_Q_)
.set_seq_len_kv(SEQ_LEN_KV_)
.set_padding_mask(true);
}
if (dropout_probability != 0.0f) {
auto seed = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(SEED)
@ -523,11 +425,23 @@ auto build_graph(
dropout_probability, seed, offset);
}
auto Q_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(Q).set_name("Q"));
fe::graph::Tensor_attributes()
.set_uid(Q)
.set_name("Q")
.set_dim(q.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(q.sizes(), q.strides().vec())));
auto K_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(K).set_name("K"));
fe::graph::Tensor_attributes()
.set_uid(K)
.set_name("K")
.set_dim(k.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(k.sizes(), k.strides().vec())));
auto V_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(V).set_name("V"));
fe::graph::Tensor_attributes()
.set_uid(V)
.set_name("V")
.set_dim(v.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(v.sizes(), v.strides().vec())));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
bias =
@ -541,90 +455,12 @@ auto build_graph(
auto [O_, Stats] =
mha_graph->sdpa(Q_, K_, V_, scaled_dot_product_flash_attention_options);
O_->set_uid(O).set_output(true);
O_->set_uid(O);
O_->set_output(true).set_dim(o.sizes().vec()).set_stride(o.strides().vec());
if (Stats) {
Stats->set_uid(LSE)
.set_output(true)
.set_data_type(fe::DataType_t::FLOAT)
.set_stride(softmaxstats.strides().vec());
}
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
auto RAG_Q_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_Q_OFF)
.set_name("cum_seq_q")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_K_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_K_OFF)
.set_name("cum_seq_k")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_V_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_V_OFF)
.set_name("cum_seq_v")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_O_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_O_OFF)
.set_name("cum_seq_o")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_STATS_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_LSE_OFF)
.set_name("cum_seq_stats")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
O_->set_ragged_offset(RAG_O_OFF_);
Q_->set_ragged_offset(RAG_Q_OFF_);
K_->set_ragged_offset(RAG_K_OFF_);
V_->set_ragged_offset(RAG_V_OFF_);
auto qsizevec = q.sizes().vec();
auto ksizevec = k.sizes().vec();
auto vsizevec = v.sizes().vec();
auto osizevec = o.sizes().vec();
qsizevec[2] = roundup_power2(qsizevec[2]);
ksizevec[2] = roundup_power2(ksizevec[2]);
vsizevec[2] = roundup_power2(vsizevec[2]);
osizevec[2] = roundup_power2(osizevec[2]);
// we checked for BSHD contig., set fake strides as cuDNN will complain
// if e.g., a ragged dim is smaller than a non-ragged one:
// consider HBSD tensor where H is 1
Q_->set_dim(qsizevec).set_stride(
{INT_MAX, qsizevec[3], qsizevec[1] * qsizevec[3], 1});
K_->set_dim(ksizevec).set_stride(
{INT_MAX, ksizevec[3], ksizevec[1] * ksizevec[3], 1});
V_->set_dim(vsizevec).set_stride(
{INT_MAX, vsizevec[3], vsizevec[1] * vsizevec[3], 1});
O_->set_dim(osizevec).set_stride(
{INT_MAX, osizevec[3], osizevec[1] * osizevec[3], 1});
if (Stats) {
Stats->set_ragged_offset(RAG_STATS_OFF_);
auto statssizevec = softmaxstats.sizes().vec();
statssizevec[2] = roundup_power2(statssizevec[2]);
Stats->set_dim(statssizevec);
}
} else {
Q_->set_dim(q.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(q.sizes(), q.strides().vec()));
K_->set_dim(k.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(k.sizes(), k.strides().vec()));
V_->set_dim(v.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(v.sizes(), v.strides().vec()));
O_->set_dim(o.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(o.sizes(), o.strides().vec()));
if (Stats) {
Stats->set_dim(softmaxstats.sizes().vec());
}
Stats->set_uid(LSE);
Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT);
}
AT_CUDNN_FRONTEND_CHECK(mha_graph->validate());
@ -730,7 +566,7 @@ auto build_graph_nestedtensor(
auto q_strides = q.strides();
auto k_strides = k.strides();
auto v_strides = v.strides();
// NB: cuDNN API shape is transposed: we pass it nominally as HTD
// NB: cuDNN API shape is transposed
constexpr int strideidx0 = 1;
constexpr int strideidx1 = 0;
constexpr int strideidx2 = 2;
@ -888,32 +724,21 @@ auto build_graph_backward(
.set_name("CUDNN_SDPA_BACKWARD")
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
auto SEQ_LEN_Q_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(SEQ_LEN_Q)
.set_name("Seq_q")
.set_dim({b, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto SEQ_LEN_KV_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(SEQ_LEN_KV)
.set_name("Seq_kv")
.set_dim({b, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
sdpa_backward_options.set_seq_len_q(SEQ_LEN_Q_)
.set_seq_len_kv(SEQ_LEN_KV_)
.set_padding_mask(true);
}
auto Q_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(Q).set_name("Q"));
auto K_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(K).set_name("K"));
auto V_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(V).set_name("V"));
auto Q_ = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(Q)
.set_name("Q")
.set_dim(q.sizes().vec())
.set_stride(q.strides().vec()));
auto K_ = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(K)
.set_name("K")
.set_dim(k.sizes().vec())
.set_stride(k.strides().vec()));
auto V_ = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(V)
.set_name("V")
.set_dim(v.sizes().vec())
.set_stride(v.strides().vec()));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
bias =
@ -945,108 +770,31 @@ auto build_graph_backward(
: fe::DataType_t::INT64));
sdpa_backward_options.set_dropout(dropout_probability, seed, offset);
}
auto O_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(O).set_name("O"));
auto O_ = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(O)
.set_name("O")
.set_dim(o.sizes().vec())
.set_stride(o.strides().vec()));
auto Stats = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(LSE)
.set_name("Stats")
.set_dim(softmaxstats.sizes().vec())
.set_stride(softmaxstats.strides().vec())
.set_data_type(fe::DataType_t::FLOAT));
auto Do = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(DO).set_name("DO"));
auto Do = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(DO)
.set_name("DO")
.set_dim(dO.sizes().vec())
.set_stride(dO.strides().vec()));
auto [Dq, Dk, Dv] = mha_graph->sdpa_backward(
Q_, K_, V_, O_, Do, Stats, sdpa_backward_options);
Dq->set_uid(DQ).set_output(true);
Dk->set_uid(DK).set_output(true);
Dv->set_uid(DV).set_output(true);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
auto RAG_Q_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_Q_OFF)
.set_name("cum_seq_q")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_K_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_K_OFF)
.set_name("cum_seq_k")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_V_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_V_OFF)
.set_name("cum_seq_v")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_O_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_O_OFF)
.set_name("cum_seq_o")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_STATS_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_LSE_OFF)
.set_name("cum_seq_stats")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
O_->set_ragged_offset(RAG_O_OFF_);
Q_->set_ragged_offset(RAG_Q_OFF_);
K_->set_ragged_offset(RAG_K_OFF_);
V_->set_ragged_offset(RAG_V_OFF_);
Dq->set_ragged_offset(RAG_Q_OFF_);
Dk->set_ragged_offset(RAG_K_OFF_);
Dv->set_ragged_offset(RAG_V_OFF_);
Do->set_ragged_offset(RAG_O_OFF_);
auto qsizevec = q.sizes().vec();
auto ksizevec = k.sizes().vec();
auto vsizevec = v.sizes().vec();
auto osizevec = o.sizes().vec();
qsizevec[2] = roundup_power2(qsizevec[2]);
ksizevec[2] = roundup_power2(ksizevec[2]);
vsizevec[2] = roundup_power2(vsizevec[2]);
osizevec[2] = roundup_power2(osizevec[2]);
// see corresponding section in the forward about the hardcoding
// of strides here
Q_->set_dim(qsizevec).set_stride(
{INT_MAX, qsizevec[3], qsizevec[1] * qsizevec[3], 1});
K_->set_dim(ksizevec).set_stride(
{INT_MAX, ksizevec[3], ksizevec[1] * ksizevec[3], 1});
V_->set_dim(vsizevec).set_stride(
{INT_MAX, vsizevec[3], vsizevec[1] * vsizevec[3], 1});
O_->set_dim(osizevec).set_stride(
{INT_MAX, osizevec[3], osizevec[1] * osizevec[3], 1});
// dims/strides should be identical to their non-gradient (non-"D") counterparts
Dq->set_dim(qsizevec).set_stride(
{INT_MAX, qsizevec[3], qsizevec[1] * qsizevec[3], 1});
Dk->set_dim(ksizevec).set_stride(
{INT_MAX, ksizevec[3], ksizevec[1] * ksizevec[3], 1});
Dv->set_dim(vsizevec).set_stride(
{INT_MAX, vsizevec[3], vsizevec[1] * vsizevec[3], 1});
Do->set_dim(osizevec).set_stride(
{INT_MAX, osizevec[3], osizevec[1] * osizevec[3], 1});
Stats->set_ragged_offset(RAG_STATS_OFF_);
auto statssizevec = softmaxstats.sizes().vec();
statssizevec[2] = roundup_power2(statssizevec[2]);
Stats->set_dim(statssizevec);
} else {
O_->set_dim(o.sizes().vec()).set_stride(o.strides().vec());
Q_->set_dim(q.sizes().vec()).set_stride(q.strides().vec());
K_->set_dim(k.sizes().vec()).set_stride(k.strides().vec());
V_->set_dim(v.sizes().vec()).set_stride(v.strides().vec());
Dq->set_dim(dQ.sizes().vec()).set_stride(dQ.strides().vec());
Dk->set_dim(dK.sizes().vec()).set_stride(dK.strides().vec());
Dv->set_dim(dV.sizes().vec()).set_stride(dV.strides().vec());
Do->set_dim(dO.sizes().vec()).set_stride(dO.strides().vec());
Stats->set_dim(softmaxstats.sizes().vec());
}
Dq->set_uid(DQ);
Dq->set_output(true).set_dim(dQ.sizes().vec()).set_stride(dQ.strides().vec());
Dk->set_uid(DK);
Dk->set_output(true).set_dim(dK.sizes().vec()).set_stride(dK.strides().vec());
Dv->set_uid(DV);
Dv->set_output(true).set_dim(dV.sizes().vec()).set_stride(dV.strides().vec());
AT_CUDNN_FRONTEND_CHECK(mha_graph->validate());
AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle));
AT_CUDNN_FRONTEND_CHECK(
@ -1318,47 +1066,6 @@ void run_cudnn_SDP_fprop(
Tensor& o,
Tensor& dropoutseed,
Tensor& dropoutoffset) {
// do nothing if we got 0-element tensors
if (!q.numel() || !k.numel() || !v.numel()) {
return;
}
Tensor seqlen_q, seqlen_kv;
Tensor rag_off_q, rag_off_k, rag_off_v, rag_off_o, rag_off_lse;
if (!o.defined()) {
// q is passed to us in BHSD dim order
alloc_with_matching_layout(q, o, {b, h, s_q, d_v});
}
bool use_ragged = use_ragged_in_dense(q, k, v, o, attn_bias.has_value());
if (return_softmaxstats && !softmaxstats.defined()) {
// TODO(eqy): investigate why cuDNN doesn't like BSH layout softmaxstats
if (!use_ragged) {
softmaxstats = at::empty({b, h, s_q, 1}, q.options().dtype(kFloat));
} else {
softmaxstats =
at::empty({b, s_q, h, 1}, q.options().dtype(kFloat)).transpose(1, 2);
}
}
if (use_ragged) {
seqlen_q = at::full({b, 1, 1, 1}, s_q, q.options().dtype(kInt));
seqlen_kv = at::full({b, 1, 1, 1}, s_kv, q.options().dtype(kInt));
auto cum_seqlen_q = at::full({b + 1, 1, 1, 1}, s_q, q.options().dtype(kInt))
.cumsum(0, kInt)
.add_(-s_q);
auto cum_seqlen_kv =
at::full({b + 1, 1, 1, 1}, s_kv, q.options().dtype(kInt))
.cumsum(0, kInt)
.add_(-s_kv);
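// e.g., for b = 2, s_q = 4: cum_seqlen_q = [0, 4, 8], the cumulative
// sequence lengths in the varlen convention; the mul() below scales these
// row counts by the sequence stride to get per-batch element offsets.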
rag_off_q = cum_seqlen_q.mul(q.stride(-2));
rag_off_k = cum_seqlen_kv.mul(k.stride(-2));
rag_off_v = cum_seqlen_kv.mul(v.stride(-2));
rag_off_o = cum_seqlen_q.mul(o.stride(-2));
if (return_softmaxstats) {
rag_off_lse = cum_seqlen_q.mul(softmaxstats.stride(-2));
}
}
const auto dprops = at::cuda::getCurrentDeviceProperties();
auto _dropoutseed = dropoutseed;
auto _dropoutoffset = dropoutoffset;
@ -1369,10 +1076,21 @@ void run_cudnn_SDP_fprop(
}
cudnnHandle_t handle = getCudnnHandle();
if (!o.defined()) {
// q is passed to us in BHSD dim order
alloc_with_matching_layout(q, o, {b, h, s_q, d_v});
}
if (return_softmaxstats && !softmaxstats.defined()) {
// TODO(eqy): verify that this is correct
softmaxstats = at::empty({b, h, s_q}, q.options().dtype(kFloat));
}
// do nothing if we got 0-element tensors
if (!q.numel() || !k.numel() || !v.numel()) {
return;
}
// NB: The key initialization will round up sequence length, stride data etc.
// if use_ragged_in_dense is enabled (to allow multiple sequence lengths to
// reuse the same cached value/graph)
auto key = MHACacheKeyWrapper(
b,
h,
@ -1429,17 +1147,6 @@ void run_cudnn_SDP_fprop(
variant_pack[SEED] = _dropoutseed.data_ptr();
variant_pack[OFFSET] = _dropoutoffset.data_ptr();
}
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
variant_pack[SEQ_LEN_Q] = seqlen_q.data_ptr();
variant_pack[SEQ_LEN_KV] = seqlen_kv.data_ptr();
variant_pack[RAG_Q_OFF] = rag_off_q.data_ptr();
variant_pack[RAG_K_OFF] = rag_off_k.data_ptr();
variant_pack[RAG_V_OFF] = rag_off_v.data_ptr();
variant_pack[RAG_O_OFF] = rag_off_o.data_ptr();
if (return_softmaxstats) {
variant_pack[RAG_LSE_OFF] = rag_off_lse.data_ptr();
}
}
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
@ -1571,9 +1278,6 @@ void run_cudnn_SDP_bprop(
!softmaxstats.numel()) {
return;
}
Tensor seqlen_q, seqlen_kv;
Tensor rag_off_q, rag_off_k, rag_off_v, rag_off_o, rag_off_lse;
auto dprops = at::cuda::getCurrentDeviceProperties();
auto _dropoutseed = dropoutseed;
auto _dropoutoffset = dropoutoffset;
@ -1600,28 +1304,10 @@ void run_cudnn_SDP_bprop(
"with matching strides...");
#else
const auto innermost_dO_stride = dO.strides()[dO.strides().size() - 1];
if (innermost_dO_stride != 1 ||
use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
if (innermost_dO_stride != 1) {
permute_to_matching_layout(o, dO_);
}
#endif
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
seqlen_q = at::full({b, 1, 1, 1}, s_q, q.options().dtype(kInt));
seqlen_kv = at::full({b, 1, 1, 1}, s_kv, q.options().dtype(kInt));
auto cum_seqlen_q = at::full({b + 1, 1, 1, 1}, s_q, q.options().dtype(kInt))
.cumsum(0, kInt)
.add_(-s_q);
auto cum_seqlen_kv =
at::full({b + 1, 1, 1, 1}, s_kv, q.options().dtype(kInt))
.cumsum(0, kInt)
.add_(-s_kv);
rag_off_q = cum_seqlen_q.mul(q.stride(-2));
rag_off_k = cum_seqlen_kv.mul(k.stride(-2));
rag_off_v = cum_seqlen_kv.mul(v.stride(-2));
rag_off_o = cum_seqlen_q.mul(o.stride(-2));
rag_off_lse = cum_seqlen_q.mul(softmaxstats.stride(-2));
}
cudnnHandle_t handle = getCudnnHandle();
auto key = MHACacheKeyWrapper(
b,
@ -1686,16 +1372,6 @@ void run_cudnn_SDP_bprop(
if (attn_bias.has_value()) {
variant_pack[BIAS] = attn_bias.value().data_ptr();
}
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
variant_pack[SEQ_LEN_Q] = seqlen_q.data_ptr();
variant_pack[SEQ_LEN_KV] = seqlen_kv.data_ptr();
variant_pack[RAG_Q_OFF] = rag_off_q.data_ptr();
variant_pack[RAG_K_OFF] = rag_off_k.data_ptr();
variant_pack[RAG_V_OFF] = rag_off_v.data_ptr();
variant_pack[RAG_O_OFF] = rag_off_o.data_ptr();
variant_pack[RAG_LSE_OFF] = rag_off_lse.data_ptr();
}
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);

View File

@ -1,25 +0,0 @@
#pragma once
#include <c10/metal/common.h>
#ifdef __METAL__
enum class GridSamplerInterpolation { Bilinear, Nearest, Bicubic };
enum class GridSamplerPadding { Zeros, Border, Reflection };
#else
#include <ATen/native/GridSamplerUtils.h>
using at::native::GridSamplerInterpolation;
using at::native::GridSamplerPadding;
#endif
template <unsigned N = 5, typename idx_type_t = int32_t>
struct GridSamplerParams {
int32_t sampler_dims;
::c10::metal::array<idx_type_t, N> output_sizes;
::c10::metal::array<idx_type_t, N> output_strides;
::c10::metal::array<idx_type_t, N> input_sizes;
::c10::metal::array<idx_type_t, N> input_strides;
::c10::metal::array<idx_type_t, N> grid_sizes;
::c10::metal::array<idx_type_t, N> grid_strides;
GridSamplerInterpolation interpolation_mode;
GridSamplerPadding padding_mode;
bool align_corners;
};

View File

@ -1,324 +0,0 @@
#include <ATen/native/mps/kernels/GridSampler.h>
#include <c10/metal/utils.h>
#include <metal_array>
#include <metal_stdlib>
using namespace metal;
using namespace c10::metal;
struct GridSamplerOffsets {
int32_t output;
int32_t input;
int32_t grid;
GridSamplerOffsets() : output(0), input(0), grid(0) {}
};
// Find offsets into the tensors that this thread will operate on,
// based on the thread ID.
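// e.g., with output sizes (N, C, Hout, Wout) = (2, 3, 4, 5), tid = 37
// decomposes innermost-first into w = 37 % 5 = 2, h = 7 % 4 = 3,
// c = 1 % 3 = 1, n = 0 -- a mixed-radix unflatten of the linear index.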
static GridSamplerOffsets find_grid_sampler_offsets(
constant int32_t* output_sizes,
constant int32_t* output_strides,
constant int32_t* input_strides,
constant int32_t* grid_strides,
int32_t sampler_dims,
uint tid) {
auto dims = sampler_dims + 2;
auto output_idx = static_cast<int32_t>(tid);
GridSamplerOffsets offsets;
for (auto dim = dims - 1; dim >= 0; dim--) {
auto dim_idx = output_idx % output_sizes[dim];
output_idx = output_idx / output_sizes[dim];
// Select the output element that this thread will calculate.
// output shape:
// 2 sampler dims: (N, C, Hout, Wout)
// 3 sampler dims: (N, C, Dout, Hout, Wout)
offsets.output += output_strides[dim] * dim_idx;
// Select the batch and channel for the input.
// input shape:
// 2 sampler dims: (N, C, Hin, Win)
// 3 sampler dims: (N, C, Din, Hin, Win)
if (dim < 2) {
offsets.input += input_strides[dim] * dim_idx;
}
// Select the grid coordinates for the output element.
// grid shape:
// 2 sampler dims: (N, Hout, Wout, 2)
// 3 sampler dims: (N, Dout, Hout, Wout, 3)
if (dim == 0) {
offsets.grid += grid_strides[dim] * dim_idx;
} else if (dim >= 2) {
offsets.grid += grid_strides[dim - 1] * dim_idx;
}
}
return offsets;
}
// Mod function which gives positive output when `a` is negative
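// e.g., mod(-1, 5) == 4, whereas the builtin remainder (-1 % 5) yields -1.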
static int32_t mod(int32_t a, int32_t b) {
auto r = a % b;
return r + (r < 0 ? b : 0);
}
// Sentinel index value to indicate zero padding
constant int32_t IDX_ZERO = -1;
// Apply padding to an index into the input
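// e.g., with input_size = 4: Zeros maps -1 and 4 to the IDX_ZERO sentinel,
// Border clamps -1 -> 0 and 4 -> 3, and Reflection (align_corners = false)
// folds them back into range, -1 -> 0 and 4 -> 3.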
static int32_t pad_input_index(
int32_t idx,
int32_t input_size,
GridSamplerPadding padding_mode,
bool align_corners) {
int32_t idx_padded = idx;
if (padding_mode == GridSamplerPadding::Zeros) {
idx_padded = (idx < 0) ? IDX_ZERO : idx_padded;
idx_padded = (idx >= input_size) ? IDX_ZERO : idx_padded;
} else if (padding_mode == GridSamplerPadding::Border) {
idx_padded = (idx < 0) ? 0 : idx_padded;
idx_padded = (idx >= input_size) ? input_size - 1 : idx_padded;
} else if (padding_mode == GridSamplerPadding::Reflection) {
auto scale_length = align_corners ? (input_size - 1) : input_size;
auto idx_mod = mod(idx, scale_length);
auto idx_mod_reverse = (input_size - 1) - idx_mod;
bool is_reverse = (abs(idx - idx_mod) / scale_length) % 2 == 1;
idx_padded = is_reverse ? idx_mod_reverse : idx_mod;
}
return idx_padded;
}
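// Read one input element given per-dimension indices; if any index is the
// IDX_ZERO sentinel (zero padding), return 0 without dereferencing memory.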
template <int32_t dims, typename T>
T get_tensor_val(
constant T* input,
constant int32_t* input_strides,
int32_t indices[dims]) {
bool found_idx_zero = false;
int32_t offset = 0;
for (auto dim = 0; dim < dims; dim++) {
auto idx = indices[dim];
found_idx_zero = found_idx_zero || (idx == IDX_ZERO);
offset += (found_idx_zero ? 0 : idx) * input_strides[dim];
}
return found_idx_zero ? 0 : input[offset];
}
// This function performs 3D linear interpolation for one value. One way to
// think of how this works is to imagine a unit cube where each corner of the
// cube has one scalar value associated with it. Inside the cube, the values
// change linearly along each axis, so each axial gradient is constant. The values associated with each
// corner are given by the `input`, indexed at all eight different combinations
// of the `left_indices` and `right_indices`. Given a 3D coordinate anywhere
// within the cube, specified by the `scales` argument, we must calculate the
// value associated with that position.
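// In 1D this reduces to the familiar lerp: value = (1 - s) * left + s * right;
// here that weighting is applied independently along all three axes, producing
// the eight corner terms summed below.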
template <typename T>
T interpolate_linear_3d(
constant T* input,
constant int32_t* input_strides,
int32_t left_indices[3],
int32_t right_indices[3],
opmath_t<T> scales[3]) {
int32_t a_idx[3] = {left_indices[0], left_indices[1], left_indices[2]};
int32_t b_idx[3] = {left_indices[0], left_indices[1], right_indices[2]};
int32_t c_idx[3] = {left_indices[0], right_indices[1], left_indices[2]};
int32_t d_idx[3] = {left_indices[0], right_indices[1], right_indices[2]};
int32_t e_idx[3] = {right_indices[0], left_indices[1], left_indices[2]};
int32_t f_idx[3] = {right_indices[0], left_indices[1], right_indices[2]};
int32_t g_idx[3] = {right_indices[0], right_indices[1], left_indices[2]};
int32_t h_idx[3] = {right_indices[0], right_indices[1], right_indices[2]};
auto a =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, a_idx));
auto b =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, b_idx));
auto c =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, c_idx));
auto d =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, d_idx));
auto e =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, e_idx));
auto f =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, f_idx));
auto g =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, g_idx));
auto h =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, h_idx));
auto scale0_right = scales[0];
auto scale1_right = scales[1];
auto scale2_right = scales[2];
auto scale0_left = 1 - scale0_right;
auto scale1_left = 1 - scale1_right;
auto scale2_left = 1 - scale2_right;
return static_cast<T>(
scale0_left * scale1_left * scale2_left * a +
scale0_left * scale1_left * scale2_right * b +
scale0_left * scale1_right * scale2_left * c +
scale0_left * scale1_right * scale2_right * d +
scale0_right * scale1_left * scale2_left * e +
scale0_right * scale1_left * scale2_right * f +
scale0_right * scale1_right * scale2_left * g +
scale0_right * scale1_right * scale2_right * h);
}
// Calculates a single output element.
// `input` shape:
// 2 sampler dims: (Hin, Win)
// 3 sampler dims: (Din, Hin, Win)
// `coords` values:
// 2 sampler dims: (Wcoord, Hcoord)
// 3 sampler dims: (Wcoord, Hcoord, Dcoord)
template <typename T>
void grid_sampler_single_element(
device T* output,
constant T* input,
constant T* coords,
int32_t dims,
constant int32_t* input_sizes,
constant int32_t* input_strides,
GridSamplerInterpolation interpolation_mode,
GridSamplerPadding padding_mode,
bool align_corners) {
int32_t left_indices[3];
int32_t right_indices[3];
opmath_t<T> scales[3];
// For each dimension, find the pair of indices in the corresponding dimension
// of `input` which surround the grid coordinate in that dimension. We'll do
// this by mapping different coordinate spaces onto each other. There are
// basically three different coordinate spaces to keep in mind:
//
// * aligned grid space
// - `-1` refers to the leftmost input value.
// - `1` refers to the rightmost input value.
//
// * unaligned grid space
// - `-1` refers to the midpoint between the leftmost input value and
// a padding value to the left of that.
// - `1` refers to the midpoint between the rightmost input value and
// a padding value to the right of that.
//
// * input index space
// - `n` refers to the n-th value of the input.
// - `0` refers to the leftmost input value.
// - `N-1` refers to the rightmost input value.
//
// If `align_corners == False`, then the coordinates are in unaligned grid
// space, and we will map them onto aligned grid space. If `align_corners ==
// True`, then coordinates are already in aligned grid space.
//
// Then we will map unaligned grid space onto input index space, making it
// relatively simple to find the two input indices that surround the
// coordinate.
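// Worked example: input_size = 4, align_corners = true. coord = -1 maps to
// input index 0, coord = 1 maps to index 3, and coord = 0 maps to 1.5, which
// interpolates between indices 1 and 2 with scale 0.5.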
for (auto coord_dim = 0; coord_dim < dims; coord_dim++) {
auto input_dim = dims - coord_dim - 1;
auto input_size = input_sizes[input_dim];
auto coord = static_cast<opmath_t<T>>(coords[coord_dim]);
// Interpret nan as -1
coord = isnan(coord) ? -1 : coord;
if (!align_corners) {
// Map unaligned grid space to aligned grid space
auto corner_alignment_factor = static_cast<opmath_t<T>>(input_size) /
static_cast<opmath_t<T>>(input_size - 1);
coord = coord * corner_alignment_factor;
}
// Map aligned grid space to input index space
coord = (coord + 1) * (static_cast<opmath_t<T>>(input_size - 1) / 2);
// Get the input indices surrounding the coordinate, apply padding to them,
// and obtain the scaling factor between the two for interpolation.
auto left_idx = static_cast<int32_t>(floor(coord));
auto right_idx = static_cast<int32_t>(ceil(coord));
left_indices[input_dim] =
pad_input_index(left_idx, input_size, padding_mode, align_corners);
right_indices[input_dim] =
pad_input_index(right_idx, input_size, padding_mode, align_corners);
auto scale = coord - left_idx;
if (interpolation_mode == GridSamplerInterpolation::Nearest) {
// TODO: For some reason, rounding the scale to 0 or 1 and then using
// linear interpolation seems to work perfectly with zero padding mode,
// but we get flaky failures with border and reflection padding modes.
// Need to investigate and fix it.
scale = (scale <= 0.5) ? 0 : 1;
}
scales[input_dim] = scale;
}
// Now that we have the bounding indices and scale factor for each dimension
// of the input, we can interpolate.
if (dims == 3) {
*output = interpolate_linear_3d(
input, input_strides, left_indices, right_indices, scales);
}
}
template <typename T>
kernel void grid_sampler(
device T* output [[buffer(0)]],
constant T* input [[buffer(1)]],
constant T* grid [[buffer(2)]],
constant GridSamplerParams<5>& params [[buffer(3)]],
uint tid [[thread_position_in_grid]]) {
auto output_sizes = params.output_sizes.data();
auto output_strides = params.output_strides.data();
auto input_sizes = params.input_sizes.data();
auto input_strides = params.input_strides.data();
auto grid_strides = params.grid_strides.data();
auto sampler_dims = params.sampler_dims;
auto offsets = find_grid_sampler_offsets(
output_sizes,
output_strides,
input_strides,
grid_strides,
sampler_dims,
tid);
output += offsets.output;
input += offsets.input;
auto coords = grid + offsets.grid;
input_sizes += 2;
input_strides += 2;
auto interpolation_mode = params.interpolation_mode;
auto padding_mode = params.padding_mode;
auto align_corners = params.align_corners;
grid_sampler_single_element(
output,
input,
coords,
sampler_dims,
input_sizes,
input_strides,
interpolation_mode,
padding_mode,
align_corners);
}
#define REGISTER_GRID_SAMPLER_OP(DTYPE) \
template [[host_name("grid_sampler_" #DTYPE)]] \
kernel void grid_sampler<DTYPE>( \
device DTYPE * output [[buffer(0)]], \
constant DTYPE * input [[buffer(1)]], \
constant DTYPE * grid [[buffer(2)]], \
constant GridSamplerParams<5> & params [[buffer(3)]], \
uint tid [[thread_position_in_grid]]);
REGISTER_GRID_SAMPLER_OP(float);
REGISTER_GRID_SAMPLER_OP(half);
REGISTER_GRID_SAMPLER_OP(bfloat);

View File

@ -1,10 +1,7 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/GridSamplerUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/GridSampler.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -12,17 +9,9 @@
#else
#include <ATen/ops/grid_sampler_2d.h>
#include <ATen/ops/grid_sampler_2d_native.h>
#include <ATen/ops/grid_sampler_3d_native.h>
#endif
namespace at::native {
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/GridSampler_metallib.h>
#endif
namespace mps {
static void grid_sampler_2d_mps_impl(Tensor& output,
const Tensor& input,
@ -131,96 +120,6 @@ static void grid_sampler_2d_mps_impl(Tensor& output,
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
}
}
static void grid_sampler_template(Tensor& output,
const Tensor& input,
const Tensor& grid,
int64_t _interpolation_mode,
int64_t _padding_mode,
bool align_corners,
int32_t sampler_dims,
const std::string& op_name) {
check_grid_sampler_common(input, grid);
switch (sampler_dims) {
case 2:
check_grid_sampler_2d(input, grid);
break;
case 3:
check_grid_sampler_3d(input, grid, _interpolation_mode);
break;
default:
TORCH_INTERNAL_ASSERT(false, "Only 2D and 3D sampling are supported, but got: ", sampler_dims);
}
TORCH_CHECK(input.scalar_type() == grid.scalar_type(),
"expected input and grid to have the same type, but got ",
input.scalar_type(),
" and ",
grid.scalar_type());
auto interpolation_mode = static_cast<GridSamplerInterpolation>(_interpolation_mode);
auto padding_mode = static_cast<GridSamplerPadding>(_padding_mode);
switch (interpolation_mode) {
case GridSamplerInterpolation::Bilinear:
break;
case GridSamplerInterpolation::Nearest:
TORCH_CHECK(false, op_name, ": Unsupported Nearest interpolation");
break;
case GridSamplerInterpolation::Bicubic:
TORCH_CHECK(false, op_name, ": Unsupported Bicubic interpolation");
break;
default:
TORCH_CHECK(false, op_name, ": Unrecognised interpolation mode: ", _interpolation_mode);
}
switch (padding_mode) {
case GridSamplerPadding::Zeros:
case GridSamplerPadding::Border:
case GridSamplerPadding::Reflection:
break;
default:
TORCH_CHECK(false, op_name, ": Unrecognised Padding Mode: ", _padding_mode);
}
auto input_size = input.sizes();
auto grid_size = grid.sizes();
output.resize_({input_size[0], input_size[1], grid_size[1], grid_size[2], grid_size[3]}, MemoryFormat::Contiguous);
auto dims = input.dim();
GridSamplerParams<5> params;
params.sampler_dims = sampler_dims;
params.padding_mode = padding_mode;
params.interpolation_mode = interpolation_mode;
params.align_corners = align_corners;
for (const auto dim : c10::irange(dims)) {
params.output_sizes[dim] = safe_downcast<int32_t, int64_t>(output.size(dim));
params.output_strides[dim] = safe_downcast<int32_t, int64_t>(output.stride(dim));
params.input_sizes[dim] = safe_downcast<int32_t, int64_t>(input.size(dim));
params.input_strides[dim] = safe_downcast<int32_t, int64_t>(input.stride(dim));
params.grid_sizes[dim] = safe_downcast<int32_t, int64_t>(grid.size(dim));
params.grid_strides[dim] = safe_downcast<int32_t, int64_t>(grid.stride(dim));
}
auto num_threads = output.numel();
MPSStream* mpsStream = getCurrentMPSStream();
dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = mpsStream->commandEncoder();
auto pso = lib.getPipelineStateForFunc("grid_sampler_" + scalarToMetalTypeString(input));
getMPSProfiler().beginProfileKernel(pso, op_name, {input, grid});
[computeEncoder setComputePipelineState:pso];
mtl_setArgs(computeEncoder, output, input, grid, params);
mtl_dispatch1DJob(computeEncoder, pso, num_threads);
getMPSProfiler().endProfileKernel(pso);
}
});
}
} // namespace mps
Tensor grid_sampler_2d_mps(const Tensor& input,
@ -236,21 +135,4 @@ Tensor grid_sampler_2d_mps(const Tensor& input,
return output;
}
Tensor grid_sampler_3d_mps(const Tensor& input,
const Tensor& grid,
int64_t interpolation_mode,
int64_t padding_mode,
bool align_corners) {
auto output = at::empty({0}, input.options(), MemoryFormat::Contiguous);
mps::grid_sampler_template(output,
input,
grid,
interpolation_mode,
padding_mode,
align_corners,
/*sampler_dims=*/3,
/*op_name=*/"grid_sampler_3d");
return output;
}
} // namespace at::native

View File

@ -456,7 +456,7 @@ static Tensor std_var_common_impl_mps(const Tensor& input_t,
errMessage += ": reduction dim must be in the range of input shape";
for (const auto dim : dim_value) {
auto wrap_dim = maybe_wrap_dim(dim, num_input_dims);
TORCH_CHECK(wrap_dim < (num_input_dims ? num_input_dims : 1), errMessage.c_str())
TORCH_CHECK(wrap_dim < static_cast<decltype(wrap_dim)>(input_shape.size()), errMessage.c_str())
}
}

View File

@ -2931,7 +2931,6 @@
dispatch:
CPU: grid_sampler_3d_cpu
CUDA: grid_sampler_3d_cuda
MPS: grid_sampler_3d_mps
autogen: grid_sampler_3d.out
# `grid_sampler_3d_backward` takes in `output_mask` to optimize performance for
@ -3448,12 +3447,8 @@
- func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor? bias) -> Tensor
- func: fbgemm_linear_fp16_weight_fp32_activation.out(Tensor input, Tensor packed_weight, Tensor? bias, Tensor(a!) output) -> Tensor
- func: fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor
- func: fbgemm_linear_fp16_weight.out(Tensor input, Tensor packed_weight, Tensor bias, Tensor(a!) output) -> Tensor
- func: fbgemm_pack_quantized_matrix(Tensor input) -> Tensor
- func: fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor

View File

@ -260,7 +260,7 @@ std::tuple<Tensor, Tensor, Tensor> _cudnn_attention_backward(
attn_bias_ /*const std::optional<Tensor>& attn_bias*/,
out /*const Tensor& o*/,
grad_out/*const Tensor& dO*/,
logsumexp/*const Tensor& softmaxstats*/,
logsumexp.unsqueeze(-1)/*const Tensor& softmaxstats*/,
dq/*Tensor& dQ*/,
dk/*Tensor& dK*/,
dv/*Tensor& dV*/,

View File

@ -243,6 +243,12 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x
} else {
softmax_fa_t = at::empty({ 0, 0, 0, 0 }, opts);
}
at::Tensor atomic_counter;
if (is_causal) {
atomic_counter = at::zeros({1}, opts.dtype(at::kInt));
}
auto [needs_swa, window_left, window_right] = calculate_swa(window_size_left,
window_size_right,
seqlen_q,
@ -256,14 +262,6 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x
constexpr bool uses_swa = false;
#endif
// SWA in AOTriton Kernels is treated as "Generalized Causal masks"
is_causal = is_causal || uses_swa;
at::Tensor atomic_counter;
if (is_causal) {
atomic_counter = at::zeros({1}, opts.dtype(at::kInt));
}
hipError_t err; // TODO: Error handling
using aotriton::v2::flash::attn_fwd;
using sdp::aotriton_adapter::mk_aotensor;
@ -457,9 +455,6 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot
constexpr bool uses_swa = false;
#endif
// SWA in AOTriton Kernels is treated as "Generalized Causal masks"
is_causal = is_causal || needs_swa;
auto [seed_t, offset_t, philox_state, use_philox_state] =
prepare_philox_arguments(p_dropout, batch_size * num_heads * 32);

View File

@ -4190,7 +4190,7 @@ def run(runner, args, original_dir=None):
nonlocal marked
for i, s in enumerate(t.size()):
if s == batch_size:
torch._dynamo.maybe_mark_dynamic(t, i)
torch._dynamo.mark_dynamic(t, i)
marked = True
break

View File

@ -370,7 +370,6 @@ class HuggingfaceRunner(BenchmarkRunner):
return name in [
"ElectraForQuestionAnswering",
"MegatronBertForQuestionAnswering",
"GPT2ForSequenceClassification",
]
def _get_model_cls_and_config(self, model_name):

View File

@ -631,9 +631,6 @@ libtorch_nativert_sources = [
"torch/nativert/kernels/NativeKernels.cpp",
"torch/nativert/kernels/GeneratedStaticDispatchKernels.cpp",
"torch/nativert/kernels/GeneratedNativeStaticDispatchKernels.cpp",
"torch/nativert/graph/passes/SubgraphRewriter.cpp",
"torch/nativert/graph/passes/pass_manager/GraphPasses.cpp",
"torch/nativert/graph/passes/pass_manager/PassManager.cpp",
]
torch_mobile_tracer_sources = [

View File

@ -38,13 +38,6 @@ DriverAPI create_driver_api() {
C10_NVML_DRIVER_API(LOOKUP_NVML_ENTRY)
#undef LOOKUP_NVML_ENTRY
}
if (handle_1) {
#define LOOKUP_NVML_ENTRY_OPTIONAL(name) \
r.name##_ = ((decltype(&name))dlsym(handle_1, #name));
C10_NVML_DRIVER_API_OPTIONAL(LOOKUP_NVML_ENTRY_OPTIONAL)
#undef LOOKUP_NVML_ENTRY_OPTIONAL
}
return r;
}

View File

@ -67,8 +67,6 @@
_(nvmlDeviceGetComputeRunningProcesses) \
_(nvmlSystemGetCudaDriverVersion_v2)
#define C10_NVML_DRIVER_API_OPTIONAL(_) _(nvmlDeviceGetGpuFabricInfoV)
namespace c10::cuda {
struct DriverAPI {
@ -77,7 +75,6 @@ struct DriverAPI {
C10_LIBCUDA_DRIVER_API_REQUIRED(CREATE_MEMBER_VERSIONED)
C10_LIBCUDA_DRIVER_API_OPTIONAL(CREATE_MEMBER_VERSIONED)
C10_NVML_DRIVER_API(CREATE_MEMBER)
C10_NVML_DRIVER_API_OPTIONAL(CREATE_MEMBER)
#undef CREATE_MEMBER_VERSIONED
#undef CREATE_MEMBER

View File

@ -1122,11 +1122,6 @@ elseif(USE_CUDA)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
endif()
# Set driver api defined for PeerToPeerAccess
if(NOT WIN32)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/cuda/PeerToPeerAccess.cpp PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1")
endif()
endif()
if(USE_XPU)

Binary file not shown.


View File

@ -202,7 +202,6 @@ Below are some useful tools for debugging AOT Inductor.
logging
torch.compiler_aot_inductor_minifier
torch.compiler_aot_inductor_debugging_guide
```
To enable runtime checks on inputs, set the environment variable `AOTI_RUNTIME_CHECK_INPUTS` to 1. This will raise a `RuntimeError` if the inputs to the compiled model differ in size, data type, or strides from those used during export.

View File

@ -1,73 +0,0 @@
# AOTInductor Debugging Guide
If you encounter CUDA illegal memory access (IMA) errors while using [AOT Inductor](./torch.compiler_aot_inductor.md), this guide provides a systematic approach to debugging them. AOT Inductor is part of the PT2 stack, similar to torch.compile, but it produces a compilation artifact that can work in a C++ environment. CUDA illegal memory access errors can happen non-deterministically and may even appear transient.
On a high-level, there are three main steps in debugging CUDA IMA errors:
- **Sanity checks**: Use basic debugging flags to catch common issues before diving deeper.
- **Pinpoint the CUDA IMA**: Make the error deterministic and identify the problematic kernel.
- **Identify problematic kernels**: Use intermediate value debugging to inspect kernel inputs and outputs.
## Step 1: Sanity Checks
Before diving deep into reliably reproducing the error, try out some existing debugging flags:
```bash
AOTI_RUNTIME_CHECK_INPUTS=1
TORCHINDUCTOR_NAN_ASSERTS=1
```
These flags take effect at compilation time (more precisely, at codegen time):
- `AOTI_RUNTIME_CHECK_INPUTS=1` checks if the inputs satisfy the same set of guards used during compilation. See {ref}`torch.compiler_troubleshooting` for more details.
- `TORCHINDUCTOR_NAN_ASSERTS=1` adds codegen before and after each Inductor's kernel to check for NaN.
## Step 2: Pinpoint the CUDA IMA
One hard part is that CUDA IMA errors can be non-deterministic. They can happen at different locations, and sometimes not happen at all (though that just means the numerics are silently incorrect). With the following two flags, we can trigger the error deterministically:
```bash
PYTORCH_NO_CUDA_MEMORY_CACHING=1
CUDA_LAUNCH_BLOCKING=1
```
These flags take effect at runtime:
- `PYTORCH_NO_CUDA_MEMORY_CACHING=1` disables PyTorch's caching allocator, which normally allocates a bigger buffer than is immediately needed to reduce the number of buffer allocations. This over-allocation is usually the reason why CUDA illegal memory access errors are non-deterministic.
![How PyTorch's caching allocator can mask CUDA illegal memory access errors](./_static/img/aoti_debugging_guide/cuda_ima_cca.png)
*Figure: How PyTorch's caching allocator can mask CUDA illegal memory access errors*
- `CUDA_LAUNCH_BLOCKING=1` forces the kernels to launch one at a time. Without this, we would get the famous "CUDA kernel errors might be asynchronously reported at some other API call" warning since kernels are launched asynchronously.
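As a rough illustration of this masking effect, consider a hypothetical host-side allocator (a minimal sketch, not PyTorch's actual caching allocator) that rounds requests up and hands out oversized blocks; a small out-of-bounds access then lands in slack space instead of faulting:

```cpp
#include <cstdio>
#include <cstdlib>

// Hypothetical caching allocator: every request is rounded up to a
// 512-byte multiple, so a small overrun past the requested size still
// lands inside the allocated block.
static void* cached_alloc(std::size_t n) {
  std::size_t rounded = (n + 511) / 512 * 512;
  return std::malloc(rounded);
}

int main() {
  float* buf = static_cast<float*>(cached_alloc(100 * sizeof(float)));
  buf[100] = 1.0f;  // past the 100 floats requested, but inside the slack:
                    // no fault, just silent corruption -- analogous to how
                    // the CUDA caching allocator can hide an illegal access
  std::printf("%f\n", buf[100]);
  std::free(buf);
  return 0;
}
```

Disabling the cache removes the slack, so the same bad access faults at its true location.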
## Step 3: Identify Problematic Kernels with Intermediate Value Debugger
The AOTI Intermediate Value Debugger can help pinpoint the problematic kernel and get information about the inputs and outputs of said kernel.
First, use:
```bash
AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=3
```
This flag takes effect at compilation time and prints the kernels one by one at runtime. Together with the previous flags, this would let us know which kernel was launched right before the error happened.
However, the kernel in which the error surfaces is not necessarily the faulty one. For example, an earlier kernel may be problematic and produce wrong outputs that only trigger the failure later. So the natural next step is to inspect the inputs to the suspect kernel:
```bash
AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINT="triton_poi_fused_add_ge_logical_and_logical_or_lt_231,_add_position_embeddings_kernel_5" AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=2
```
`AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINT` takes a comma-separated list of the kernel names you want to inspect. If the inputs to a kernel are not as expected, move on and inspect the kernel that produced the bad input.
## Additional Debugging Tools
### Logging and Tracing
- **tlparse / TORCH_TRACE**: Provides complete output codes for inspection and records the set of guards used. See {ref}`tlparse / TORCH_TRACE <tlparse-torch-trace>` for more details.
- **TORCH_LOGS**: Use `TORCH_LOGS="+inductor,output_code"` to see more PT2 internal logs. See {ref}`TORCH_LOGS <torch-logs>` for more details.
- **TORCH_SHOW_CPP_STACKTRACES**: Set `TORCH_SHOW_CPP_STACKTRACES=1` to potentially see more stack traces.
### Common Sources of Issues
- [**Dynamic shapes**](./torch.compiler_dynamic_shapes.md): Historically a source of many IMAs. Pay special attention when debugging dynamic shape scenarios.
- **Custom ops**: Especially when implemented in C++ and used with dynamic shapes; the op's meta function must be SymInt-ified so shapes can be traced symbolically.

View File

@ -192,8 +192,6 @@ For more information on dynamic shapes, see [The dynamic shapes manual](https://
## Logging Tools
(tlparse-torch-trace)=
### tlparse / TORCH_TRACE
`tlparse` / `TORCH_TRACE` are a pair of tools that produce compilation reports that look like this:
@ -254,8 +252,6 @@ Here are some insights you can gain from a `tlparse`:
For example, you can look at the high-level generated FX graph or the generated Triton code.
- Is there relevant information for a particular frame? You can find these in `compilation_metrics`.
(torch-logs)=
### TORCH_LOGS
You can use the `TORCH_LOGS` environment variable to selectively enable parts of the `torch.compile` stack to log.

View File

@ -1 +0,0 @@
1171719005974771805808300960005001569062

View File

@ -1 +0,0 @@
6

View File

@ -1,33 +0,0 @@
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
project(aoti_example LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
add_executable(aoti_example main.cpp)
set_property(TARGET aoti_example PROPERTY CXX_STANDARD 17)
find_package(TorchStandalone REQUIRED)
# Set up include directories to find headers at the correct paths
target_include_directories(aoti_example PRIVATE ${TorchStandalone_INCLUDE_DIRS})
target_include_directories(aoti_example PRIVATE ${TorchStandalone_INCLUDE_DIRS}/standalone)
enable_language(CUDA)
set(CMAKE_CUDA_STANDARD 17)
find_package(CUDAToolkit REQUIRED)
target_compile_definitions(aoti_example PRIVATE NOMINMAX USE_CUDA)
# Add compile flags
target_compile_options(aoti_example PRIVATE /O2 /DLL /MD /std:c++20 /wd4819 /wd4251 /wd4244 /wd4267 /wd4275 /wd4018 /wd4190 /wd4624 /wd4067 /wd4068 /EHsc /Zc:__cplusplus /permissive- /openmp /openmp:experimental )
target_link_libraries(aoti_example PRIVATE
${TorchStandalone_LIBRARIES} # if you have this variable from find_package(TorchStandalone)
CUDA::cudart # CUDA runtime library
cuda # CUDA driver library (usually nvcuda.lib on Windows)
)
# cmake -DTorchStandalone_DIR="C:/Users/shangdiy/source/repos/torchnative/standalone/build/torchstandalone_install/lib/cmake/TorchStandalone" ..
# cmake --build . --config Release

View File

@ -1 +0,0 @@
pt2

View File

@ -1 +0,0 @@
0

View File

@ -1 +0,0 @@
little

View File

@ -1,69 +0,0 @@
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
project(model LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
# Set a library target
add_library(model SHARED)
# TODO: change to TorchStandalone
find_package(TorchStandalone REQUIRED)
# Set up include directories to find headers at the correct paths
target_include_directories(model PRIVATE ${TorchStandalone_INCLUDE_DIRS})
target_include_directories(model PRIVATE ${TorchStandalone_INCLUDE_DIRS}/standalone)
# Add macro definitions
target_compile_definitions(model PRIVATE NOMINMAX TORCH_INDUCTOR_CPP_WRAPPER STANDALONE_TORCH_HEADER C10_USING_CUSTOM_GENERATED_MACROS USE_CUDA) # CPU_CAPABILITY_AVX512
# Add compile flags
target_compile_options(model PRIVATE /O2 /DLL /MD /std:c++20 /wd4819 /wd4251 /wd4244 /wd4267 /wd4275 /wd4018 /wd4190 /wd4624 /wd4067 /wd4068 /EHsc /Zc:__cplusplus /permissive- /openmp /openmp:experimental )
# Backend-specific flags
# target_compile_options(model PRIVATE -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512bf16 -c) # TODO remove
enable_language(CUDA)
set(CMAKE_CUDA_STANDARD 17)
find_package(CUDAToolkit REQUIRED)
# Make output use .pyd instead of .dll
set_target_properties(model PROPERTIES
SUFFIX ".pyd"
LINK_FLAGS "/DEF:${CMAKE_CURRENT_SOURCE_DIR}/model_exports.def"
)
set(KERNEL_TARGETS "")
set(KERNEL_OBJECT_FILES "")
# Function to compile ptx to cubin
function(embed_gpu_kernel KERNEL_NAME PTX_FILE)
set(CUBIN_BASENAME ${KERNEL_NAME}.cubin)
set(CUBIN_FILE ${CMAKE_CURRENT_BINARY_DIR}/${CUBIN_BASENAME})
# --- PTX to FATBIN Command & Target ---
add_custom_command(
OUTPUT ${CUBIN_FILE}
COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} --cubin ${PTX_FILE}
-o ${CUBIN_FILE} ${NVCC_GENCODE_FLAGS}
-gencode arch=compute_89,code=sm_89
DEPENDS ${PTX_FILE}
)
add_custom_target(build_kernel_object_${KERNEL_NAME} DEPENDS ${CUBIN_FILE})
set(KERNEL_TARGETS ${KERNEL_TARGETS} build_kernel_object_${KERNEL_NAME} PARENT_SCOPE)
endfunction()
target_sources(model PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/model.wrapper.cpp)
target_sources(model PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/model_consts.weights.cpp)
embed_gpu_kernel(model_triton_tem_fused_addmm_relu_t_0 ${CMAKE_CURRENT_SOURCE_DIR}/model_triton_tem_fused_addmm_relu_t_0.ptx)
embed_gpu_kernel(model_triton_tem_fused_addmm_relu_sigmoid_t_1 ${CMAKE_CURRENT_SOURCE_DIR}/model_triton_tem_fused_addmm_relu_sigmoid_t_1.ptx)
add_dependencies(model ${KERNEL_TARGETS})
target_link_libraries(model PRIVATE ${KERNEL_OBJECT_FILES})
# Add linker flags
target_link_options(model PRIVATE )
# Add libraries
# TODO: change to TorchStandalone
target_link_libraries(model PRIVATE ${TorchStandalone_LIBRARIES} cuda CUDA::cudart)

File diff suppressed because it is too large

View File

@ -1 +0,0 @@
{"compiler": "/home/shangdiy/miniconda3/envs/pytorch-3.10/bin/x86_64-conda-linux-gnu-c++", "definitions": ["NOMINMAX", "TORCH_INDUCTOR_CPP_WRAPPER", "STANDALONE_TORCH_HEADER", " C10_USING_CUSTOM_GENERATED_MACROS", "CPU_CAPABILITY_AVX512", " USE_CUDA"], "include_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/include/python3.10", "/home/shangdiy/miniconda3/envs/pytorch-3.10/Include", "/home/shangdiy/pytorch/torch/include", "/home/shangdiy/pytorch/torch/include/torch/csrc/api/include", "/usr/local/cuda-12/include"], "cflags": ["O2", "DLL", "MD", "std:c++20", "wd4819", "wd4251", "wd4244", "wd4267", "wd4275", "wd4018", "wd4190", "wd4624", "wd4067", "wd4068", "EHsc", "Zc:__cplusplus", "permissive-", "openmp", "openmp:experimental"], "ldflags": [], "libraries_dirs": [], "libraries": [], "passthrough_args": ["", "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512bf16"], "aot_mode": true, "use_relative_path": false, "compile_only": true}

View File

@ -1 +0,0 @@
{"compiler": "/home/shangdiy/miniconda3/envs/pytorch-3.10/bin/x86_64-conda-linux-gnu-c++", "definitions": ["NOMINMAX", "TORCH_INDUCTOR_CPP_WRAPPER", "STANDALONE_TORCH_HEADER", " C10_USING_CUSTOM_GENERATED_MACROS", "CPU_CAPABILITY_AVX512", " USE_CUDA"], "include_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/include/python3.10", "/home/shangdiy/miniconda3/envs/pytorch-3.10/Include", "/home/shangdiy/pytorch/torch/include", "/home/shangdiy/pytorch/torch/include/torch/csrc/api/include", "/usr/local/cuda-12/include"], "cflags": ["O2", "DLL", "MD", "std:c++20", "wd4819", "wd4251", "wd4244", "wd4267", "wd4275", "wd4018", "wd4190", "wd4624", "wd4067", "wd4068", "EHsc", "Zc:__cplusplus", "permissive-", "openmp", "openmp:experimental"], "ldflags": [], "libraries_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/libs", "/home/shangdiy/pytorch/torch/lib", "/usr/local/cuda-12.8/targets/x86_64-linux/lib", "/usr/local/cuda-12.8/targets/x86_64-linux/lib/stubs"], "libraries": ["torch", "torch_cpu", "sleef", "c10", "c10_cuda", "cuda", "torch_cuda"], "passthrough_args": ["", "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512bf16"], "aot_mode": true, "use_relative_path": false, "compile_only": false}

View File

@ -1 +0,0 @@
{"AOTI_DEVICE_KEY": "cuda"}

View File

@ -1,58 +0,0 @@
#if defined(__clang__) || defined (__GNUC__)
#define ATTRIBUTE_NO_SANITIZE_ADDRESS __attribute__((no_sanitize("address")))
#else
#define ATTRIBUTE_NO_SANITIZE_ADDRESS
#endif
ATTRIBUTE_NO_SANITIZE_ADDRESS
alignas(64) extern unsigned char _binary_constants_bin_start[768] = {
69, 165, 123, 190, 252, 181, 16, 62, 242, 69, 193, 59, 44, 80, 100, 62,
237, 163, 142, 188, 138, 139, 109, 190, 61, 248, 20, 190, 77, 84, 143, 60,
111, 90, 163, 60, 96, 140, 230, 189, 101, 69, 38, 190, 132, 190, 12, 188,
28, 113, 159, 190, 252, 128, 154, 62, 247, 234, 217, 61, 206, 79, 58, 61,
224, 209, 135, 61, 211, 238, 147, 62, 231, 229, 27, 190, 169, 208, 57, 62,
100, 20, 153, 190, 161, 160, 85, 190, 207, 10, 156, 62, 234, 107, 155, 190,
188, 85, 116, 62, 27, 211, 114, 60, 94, 21, 158, 189, 147, 210, 34, 62,
203, 109, 80, 62, 28, 242, 141, 189, 205, 27, 152, 190, 38, 104, 6, 189,
211, 16, 249, 189, 72, 103, 143, 190, 163, 44, 140, 189, 178, 223, 127, 189,
5, 112, 160, 189, 177, 55, 132, 190, 218, 22, 159, 62, 115, 30, 35, 190,
26, 247, 9, 62, 251, 219, 27, 62, 165, 86, 135, 62, 99, 168, 66, 190,
238, 64, 93, 62, 65, 147, 86, 62, 167, 108, 97, 62, 183, 219, 50, 190,
138, 83, 106, 62, 90, 122, 208, 189, 149, 140, 161, 188, 44, 145, 194, 189,
5, 142, 186, 61, 202, 230, 153, 190, 133, 72, 136, 62, 251, 1, 3, 62,
225, 146, 54, 190, 91, 176, 219, 189, 118, 244, 10, 189, 232, 107, 142, 62,
185, 6, 151, 62, 241, 137, 223, 61, 124, 100, 114, 190, 15, 240, 168, 189,
149, 252, 58, 190, 238, 93, 243, 188, 144, 218, 115, 61, 159, 91, 6, 62,
57, 139, 74, 190, 84, 200, 49, 61, 193, 78, 32, 190, 84, 121, 26, 190,
219, 39, 115, 190, 171, 127, 94, 62, 248, 253, 177, 61, 63, 18, 127, 62,
146, 18, 137, 189, 203, 90, 161, 190, 139, 194, 239, 58, 126, 54, 40, 190,
47, 247, 30, 190, 106, 93, 191, 61, 22, 48, 120, 61, 56, 123, 7, 62,
150, 229, 210, 189, 118, 231, 158, 188, 7, 98, 215, 60, 72, 251, 89, 190,
190, 160, 137, 190, 173, 194, 158, 62, 225, 26, 118, 190, 174, 199, 4, 189,
205, 148, 16, 62, 20, 225, 155, 61, 90, 124, 133, 190, 88, 196, 34, 61,
26, 104, 51, 190, 149, 106, 40, 62, 25, 136, 177, 60, 169, 111, 138, 190,
214, 181, 226, 189, 109, 17, 77, 62, 224, 166, 55, 62, 250, 128, 160, 61,
104, 223, 250, 61, 34, 182, 210, 187, 60, 87, 149, 190, 189, 55, 98, 188,
58, 86, 85, 190, 170, 43, 132, 190, 81, 220, 87, 190, 47, 226, 138, 62,
189, 162, 36, 190, 30, 232, 34, 60, 138, 147, 167, 61, 151, 129, 157, 61,
206, 33, 152, 62, 109, 227, 113, 190, 147, 255, 11, 190, 175, 56, 46, 189,
46, 238, 1, 189, 123, 159, 85, 188, 14, 126, 148, 189, 226, 226, 169, 189,
255, 106, 134, 61, 38, 140, 187, 60, 119, 73, 49, 62, 32, 236, 43, 62,
78, 33, 232, 189, 72, 188, 139, 62, 94, 206, 20, 62, 25, 230, 75, 189,
171, 239, 26, 190, 136, 218, 121, 62, 96, 115, 85, 62, 126, 92, 55, 190,
112, 108, 134, 61, 64, 212, 69, 190, 253, 118, 214, 188, 210, 116, 66, 62,
204, 131, 123, 190, 13, 151, 38, 190, 56, 17, 252, 189, 153, 151, 138, 62,
21, 30, 216, 61, 146, 103, 32, 62, 140, 60, 78, 62, 183, 149, 174, 61,
95, 153, 164, 61, 144, 167, 187, 189, 112, 53, 153, 190, 127, 195, 105, 61,
169, 167, 251, 189, 42, 204, 123, 62, 116, 193, 86, 62, 98, 147, 30, 61,
176, 138, 137, 62, 245, 244, 17, 62, 201, 90, 140, 62, 177, 110, 77, 62,
188, 31, 129, 190, 66, 203, 85, 62, 182, 209, 112, 188, 216, 91, 222, 59,
18, 208, 131, 189, 151, 142, 150, 190, 36, 252, 31, 62, 241, 2, 180, 61,
83, 240, 159, 62, 37, 152, 115, 190, 13, 52, 107, 62, 169, 178, 148, 62,
171, 54, 38, 62, 33, 4, 199, 189, 201, 247, 216, 189, 225, 89, 146, 190,
192, 118, 79, 189, 92, 171, 12, 62, 136, 235, 3, 62, 180, 202, 87, 62,
8, 129, 122, 61, 160, 75, 170, 188, 20, 84, 6, 62, 60, 194, 56, 190,
182, 99, 44, 190, 88, 96, 228, 189, 50, 106, 5, 190, 34, 133, 12, 190,
26, 50, 0, 190, 176, 25, 127, 61, 48, 69, 219, 61, 192, 237, 252, 187,
};
alignas(64) extern unsigned char * _binary_constants_bin_end;

View File

@ -1,6 +0,0 @@
LIBRARY model
EXPORTS
AOTInductorModelContainerCreate
AOTInductorModelContainerCreateWithDevice
AOTInductorModelContainerRun
AOTInductorModelContainerDelete

View File

@ -1,684 +0,0 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 8.7
.target sm_89
.address_size 64
// .globl model_triton_tem_fused_addmm_relu_sigmoid_t_1 // -- Begin function model_triton_tem_fused_addmm_relu_sigmoid_t_1
.extern .shared .align 16 .b8 global_smem[];
// @model_triton_tem_fused_addmm_relu_sigmoid_t_1
.visible .entry model_triton_tem_fused_addmm_relu_sigmoid_t_1(
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_0,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_1,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_2,
.param .u32 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_3,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_4
)
.reqntid 32
{
.reg .pred %p<12>;
.reg .b32 %r<375>;
.reg .b64 %rd<27>;
.loc 1 18 0 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:18:0
$L__func_begin0:
.loc 1 18 0 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:18:0
// %bb.0:
ld.param.b32 %r1, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_3];
$L__tmp0:
.loc 1 34 16 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:34:16
setp.ne.s32 %p1, %r1, 0;
@%p1 bra $L__BB0_2;
bra.uni $L__BB0_1;
$L__BB0_2:
.loc 1 0 16 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0:16
ld.param.b64 %rd3, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_2];
ld.param.b64 %rd2, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_1];
ld.param.b64 %rd1, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_0];
.loc 1 43 24 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:43:24
mov.u32 %r26, %ctaid.x;
.loc 1 44 28 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:44:28
add.s32 %r27, %r1, 15;
.loc 1 44 34 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:44:34
shr.s32 %r28, %r27, 31;
shr.u32 %r29, %r28, 28;
add.s32 %r30, %r27, %r29;
shr.s32 %r31, %r30, 4;
.loc 1 50 41 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:41
and.b32 %r32, %r26, 2147483640;
.loc 1 50 30 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:30
sub.s32 %r33, %r31, %r32;
.loc 1 50 50 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:50
min.s32 %r34, %r33, 8;
.loc 1 51 40 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:51:40
rem.s32 %r35, %r26, %r34;
.loc 1 51 34 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:51:34
add.s32 %r36, %r35, %r32;
.loc 1 52 19 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:52:19
and.b32 %r37, %r26, 7;
.loc 1 52 30 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:52:30
div.s32 %r38, %r37, %r34;
.loc 1 56 17 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:17
shl.b32 %r39, %r36, 4;
.loc 1 56 40 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:40
mov.u32 %r40, %tid.x;
shr.u32 %r41, %r40, 2;
and.b32 %r42, %r41, 2;
bfe.u32 %r43, %r40, 2, 2;
and.b32 %r44, %r40, 16;
shr.u32 %r45, %r44, 2;
or.b32 %r46, %r43, %r45;
and.b32 %r47, %r40, 15;
bfe.u32 %r48, %r40, 4, 1;
.loc 1 56 27 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:27
or.b32 %r49, %r39, %r46;
or.b32 %r50, %r49, 8;
or.b32 %r51, %r39, %r47;
.loc 1 0 0 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0
rem.s32 %r52, %r50, %r1;
rem.s32 %r53, %r49, %r1;
.loc 1 71 30 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:71:30
shl.b32 %r54, %r40, 2;
and.b32 %r55, %r54, 12;
.loc 1 76 28 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:76:28
shl.b32 %r56, %r53, 4;
shl.b32 %r57, %r52, 4;
.loc 1 76 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:76:25
or.b32 %r58, %r56, %r55;
or.b32 %r59, %r57, %r55;
.loc 1 77 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:25
mul.wide.s32 %rd16, %r58, 4;
add.s64 %rd4, %rd1, %rd16;
mul.wide.s32 %rd17, %r59, 4;
add.s64 %rd5, %rd1, %rd17;
.loc 1 77 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:20
// begin inline asm
mov.u32 %r2, 0x0;
mov.u32 %r3, 0x0;
mov.u32 %r4, 0x0;
mov.u32 %r5, 0x0;
ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd4 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r6, 0x0;
mov.u32 %r7, 0x0;
mov.u32 %r8, 0x0;
mov.u32 %r9, 0x0;
ld.global.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd5 + 0 ];
// end inline asm
shl.b32 %r60, %r46, 6;
shl.b32 %r61, %r55, 2;
or.b32 %r62, %r60, %r61;
mov.b32 %r63, global_smem;
add.s32 %r64, %r63, %r62;
st.shared.v4.b32 [%r64], {%r2, %r3, %r4, %r5};
st.shared.v4.b32 [%r64+512], {%r6, %r7, %r8, %r9};
.loc 1 82 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:25
mul.wide.u32 %rd18, %r55, 4;
add.s64 %rd6, %rd2, %rd18;
.loc 1 82 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:20
// begin inline asm
mov.u32 %r10, 0x0;
mov.u32 %r11, 0x0;
mov.u32 %r12, 0x0;
mov.u32 %r13, 0x0;
ld.global.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd6 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r14, 0x0;
mov.u32 %r15, 0x0;
mov.u32 %r16, 0x0;
mov.u32 %r17, 0x0;
ld.global.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd6 + 0 ];
// end inline asm
add.s32 %r65, %r63, 1024;
add.s32 %r66, %r65, %r62;
st.shared.v4.b32 [%r66], {%r10, %r11, %r12, %r13};
st.shared.v4.b32 [%r66+512], {%r14, %r15, %r16, %r17};
.loc 1 90 17 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:90:17
shl.b32 %r67, %r38, 4;
.loc 1 90 27 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:90:27
or.b32 %r68, %r67, %r48;
.loc 1 93 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:20
setp.lt.s32 %p10, %r51, %r1;
.loc 1 93 34 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:34
setp.eq.s32 %p11, %r68, 0;
.loc 1 93 26 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:26
and.pred %p2, %p10, %p11;
.loc 1 96 21 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:96:21
add.s32 %r69, %r51, %r68;
add.s32 %r70, %r69, 2;
add.s32 %r71, %r69, 4;
add.s32 %r72, %r69, 6;
add.s32 %r73, %r69, 8;
add.s32 %r74, %r69, 10;
add.s32 %r75, %r69, 12;
add.s32 %r76, %r69, 14;
.loc 1 77 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:20
bar.sync 0;
or.b32 %r77, %r42, %r45;
shl.b32 %r78, %r77, 6;
add.s32 %r79, %r63, %r78;
ld.shared.v4.b32 {%r80, %r81, %r82, %r83}, [%r79+112];
ld.shared.v4.b32 {%r84, %r85, %r86, %r87}, [%r79+96];
ld.shared.v4.b32 {%r88, %r89, %r90, %r91}, [%r79+80];
ld.shared.v4.b32 {%r92, %r93, %r94, %r95}, [%r79+64];
ld.shared.v4.b32 {%r96, %r97, %r98, %r99}, [%r79+48];
ld.shared.v4.b32 {%r100, %r101, %r102, %r103}, [%r79+32];
ld.shared.v4.b32 {%r104, %r105, %r106, %r107}, [%r79+16];
ld.shared.v4.b32 {%r108, %r109, %r110, %r111}, [%r79];
ld.shared.v4.b32 {%r112, %r113, %r114, %r115}, [%r79+624];
ld.shared.v4.b32 {%r116, %r117, %r118, %r119}, [%r79+608];
ld.shared.v4.b32 {%r120, %r121, %r122, %r123}, [%r79+592];
ld.shared.v4.b32 {%r124, %r125, %r126, %r127}, [%r79+576];
ld.shared.v4.b32 {%r128, %r129, %r130, %r131}, [%r79+560];
ld.shared.v4.b32 {%r132, %r133, %r134, %r135}, [%r79+544];
ld.shared.v4.b32 {%r136, %r137, %r138, %r139}, [%r79+528];
ld.shared.v4.b32 {%r140, %r141, %r142, %r143}, [%r79+512];
.loc 1 82 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:20
shl.b32 %r144, %r40, 1;
and.b32 %r145, %r144, 14;
shl.b32 %r146, %r145, 6;
add.s32 %r147, %r65, %r146;
ld.shared.v4.b32 {%r148, %r149, %r150, %r151}, [%r147];
ld.shared.v4.b32 {%r152, %r153, %r154, %r155}, [%r147+64];
ld.shared.v4.b32 {%r156, %r157, %r158, %r159}, [%r147+16];
ld.shared.v4.b32 {%r160, %r161, %r162, %r163}, [%r147+80];
ld.shared.v4.b32 {%r164, %r165, %r166, %r167}, [%r147+32];
ld.shared.v4.b32 {%r168, %r169, %r170, %r171}, [%r147+96];
ld.shared.v4.b32 {%r172, %r173, %r174, %r175}, [%r147+48];
ld.shared.v4.b32 {%r176, %r177, %r178, %r179}, [%r147+112];
.loc 1 85 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:85:25
fma.rn.f32 %r180, %r108, %r148, 0fBE0AE428;
fma.rn.f32 %r181, %r109, %r149, %r180;
fma.rn.f32 %r182, %r110, %r150, %r181;
fma.rn.f32 %r183, %r111, %r151, %r182;
fma.rn.f32 %r184, %r104, %r156, %r183;
fma.rn.f32 %r185, %r105, %r157, %r184;
fma.rn.f32 %r186, %r106, %r158, %r185;
fma.rn.f32 %r187, %r107, %r159, %r186;
fma.rn.f32 %r188, %r100, %r164, %r187;
fma.rn.f32 %r189, %r101, %r165, %r188;
fma.rn.f32 %r190, %r102, %r166, %r189;
fma.rn.f32 %r191, %r103, %r167, %r190;
fma.rn.f32 %r192, %r96, %r172, %r191;
fma.rn.f32 %r193, %r97, %r173, %r192;
fma.rn.f32 %r194, %r98, %r174, %r193;
fma.rn.f32 %r195, %r99, %r175, %r194;
fma.rn.f32 %r196, %r108, %r152, 0fBE0AE428;
fma.rn.f32 %r197, %r109, %r153, %r196;
fma.rn.f32 %r198, %r110, %r154, %r197;
fma.rn.f32 %r199, %r111, %r155, %r198;
fma.rn.f32 %r200, %r104, %r160, %r199;
fma.rn.f32 %r201, %r105, %r161, %r200;
fma.rn.f32 %r202, %r106, %r162, %r201;
fma.rn.f32 %r203, %r107, %r163, %r202;
fma.rn.f32 %r204, %r100, %r168, %r203;
fma.rn.f32 %r205, %r101, %r169, %r204;
fma.rn.f32 %r206, %r102, %r170, %r205;
fma.rn.f32 %r207, %r103, %r171, %r206;
fma.rn.f32 %r208, %r96, %r176, %r207;
fma.rn.f32 %r209, %r97, %r177, %r208;
fma.rn.f32 %r210, %r98, %r178, %r209;
fma.rn.f32 %r211, %r99, %r179, %r210;
fma.rn.f32 %r212, %r92, %r148, 0fBE0AE428;
fma.rn.f32 %r213, %r93, %r149, %r212;
fma.rn.f32 %r214, %r94, %r150, %r213;
fma.rn.f32 %r215, %r95, %r151, %r214;
fma.rn.f32 %r216, %r88, %r156, %r215;
fma.rn.f32 %r217, %r89, %r157, %r216;
fma.rn.f32 %r218, %r90, %r158, %r217;
fma.rn.f32 %r219, %r91, %r159, %r218;
fma.rn.f32 %r220, %r84, %r164, %r219;
fma.rn.f32 %r221, %r85, %r165, %r220;
fma.rn.f32 %r222, %r86, %r166, %r221;
fma.rn.f32 %r223, %r87, %r167, %r222;
fma.rn.f32 %r224, %r80, %r172, %r223;
fma.rn.f32 %r225, %r81, %r173, %r224;
fma.rn.f32 %r226, %r82, %r174, %r225;
fma.rn.f32 %r227, %r83, %r175, %r226;
fma.rn.f32 %r228, %r92, %r152, 0fBE0AE428;
fma.rn.f32 %r229, %r93, %r153, %r228;
fma.rn.f32 %r230, %r94, %r154, %r229;
fma.rn.f32 %r231, %r95, %r155, %r230;
fma.rn.f32 %r232, %r88, %r160, %r231;
fma.rn.f32 %r233, %r89, %r161, %r232;
fma.rn.f32 %r234, %r90, %r162, %r233;
fma.rn.f32 %r235, %r91, %r163, %r234;
fma.rn.f32 %r236, %r84, %r168, %r235;
fma.rn.f32 %r237, %r85, %r169, %r236;
fma.rn.f32 %r238, %r86, %r170, %r237;
fma.rn.f32 %r239, %r87, %r171, %r238;
fma.rn.f32 %r240, %r80, %r176, %r239;
fma.rn.f32 %r241, %r81, %r177, %r240;
fma.rn.f32 %r242, %r82, %r178, %r241;
fma.rn.f32 %r243, %r83, %r179, %r242;
fma.rn.f32 %r244, %r140, %r148, 0fBE0AE428;
fma.rn.f32 %r245, %r141, %r149, %r244;
fma.rn.f32 %r246, %r142, %r150, %r245;
fma.rn.f32 %r247, %r143, %r151, %r246;
fma.rn.f32 %r248, %r136, %r156, %r247;
fma.rn.f32 %r249, %r137, %r157, %r248;
fma.rn.f32 %r250, %r138, %r158, %r249;
fma.rn.f32 %r251, %r139, %r159, %r250;
fma.rn.f32 %r252, %r132, %r164, %r251;
fma.rn.f32 %r253, %r133, %r165, %r252;
fma.rn.f32 %r254, %r134, %r166, %r253;
fma.rn.f32 %r255, %r135, %r167, %r254;
fma.rn.f32 %r256, %r128, %r172, %r255;
fma.rn.f32 %r257, %r129, %r173, %r256;
fma.rn.f32 %r258, %r130, %r174, %r257;
fma.rn.f32 %r259, %r131, %r175, %r258;
fma.rn.f32 %r260, %r140, %r152, 0fBE0AE428;
fma.rn.f32 %r261, %r141, %r153, %r260;
fma.rn.f32 %r262, %r142, %r154, %r261;
fma.rn.f32 %r263, %r143, %r155, %r262;
fma.rn.f32 %r264, %r136, %r160, %r263;
fma.rn.f32 %r265, %r137, %r161, %r264;
fma.rn.f32 %r266, %r138, %r162, %r265;
fma.rn.f32 %r267, %r139, %r163, %r266;
fma.rn.f32 %r268, %r132, %r168, %r267;
fma.rn.f32 %r269, %r133, %r169, %r268;
fma.rn.f32 %r270, %r134, %r170, %r269;
fma.rn.f32 %r271, %r135, %r171, %r270;
fma.rn.f32 %r272, %r128, %r176, %r271;
fma.rn.f32 %r273, %r129, %r177, %r272;
fma.rn.f32 %r274, %r130, %r178, %r273;
fma.rn.f32 %r275, %r131, %r179, %r274;
fma.rn.f32 %r276, %r124, %r148, 0fBE0AE428;
fma.rn.f32 %r277, %r125, %r149, %r276;
fma.rn.f32 %r278, %r126, %r150, %r277;
fma.rn.f32 %r279, %r127, %r151, %r278;
fma.rn.f32 %r280, %r120, %r156, %r279;
fma.rn.f32 %r281, %r121, %r157, %r280;
fma.rn.f32 %r282, %r122, %r158, %r281;
fma.rn.f32 %r283, %r123, %r159, %r282;
fma.rn.f32 %r284, %r116, %r164, %r283;
fma.rn.f32 %r285, %r117, %r165, %r284;
fma.rn.f32 %r286, %r118, %r166, %r285;
fma.rn.f32 %r287, %r119, %r167, %r286;
fma.rn.f32 %r288, %r112, %r172, %r287;
fma.rn.f32 %r289, %r113, %r173, %r288;
fma.rn.f32 %r290, %r114, %r174, %r289;
fma.rn.f32 %r291, %r115, %r175, %r290;
fma.rn.f32 %r292, %r124, %r152, 0fBE0AE428;
fma.rn.f32 %r293, %r125, %r153, %r292;
fma.rn.f32 %r294, %r126, %r154, %r293;
fma.rn.f32 %r295, %r127, %r155, %r294;
fma.rn.f32 %r296, %r120, %r160, %r295;
fma.rn.f32 %r297, %r121, %r161, %r296;
fma.rn.f32 %r298, %r122, %r162, %r297;
fma.rn.f32 %r299, %r123, %r163, %r298;
fma.rn.f32 %r300, %r116, %r168, %r299;
fma.rn.f32 %r301, %r117, %r169, %r300;
fma.rn.f32 %r302, %r118, %r170, %r301;
fma.rn.f32 %r303, %r119, %r171, %r302;
fma.rn.f32 %r304, %r112, %r176, %r303;
fma.rn.f32 %r305, %r113, %r177, %r304;
fma.rn.f32 %r306, %r114, %r178, %r305;
fma.rn.f32 %r307, %r115, %r179, %r306;
mov.b32 %r308, 0f00000000;
$L__tmp1:
.loc 2 47 30 // standard.py:47:30 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
sub.f32 %r309, %r308, %r195;
sub.f32 %r310, %r308, %r211;
sub.f32 %r311, %r308, %r227;
sub.f32 %r312, %r308, %r243;
sub.f32 %r313, %r308, %r259;
sub.f32 %r314, %r308, %r275;
sub.f32 %r315, %r308, %r291;
sub.f32 %r316, %r308, %r307;
.loc 2 47 29 // standard.py:47:29 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
mul.f32 %r317, %r309, 0f3FB8AA3B;
ex2.approx.f32 %r318, %r317;
mul.f32 %r319, %r310, 0f3FB8AA3B;
ex2.approx.f32 %r320, %r319;
mul.f32 %r321, %r311, 0f3FB8AA3B;
ex2.approx.f32 %r322, %r321;
mul.f32 %r323, %r312, 0f3FB8AA3B;
ex2.approx.f32 %r324, %r323;
mul.f32 %r325, %r313, 0f3FB8AA3B;
ex2.approx.f32 %r326, %r325;
mul.f32 %r327, %r314, 0f3FB8AA3B;
ex2.approx.f32 %r328, %r327;
mul.f32 %r329, %r315, 0f3FB8AA3B;
ex2.approx.f32 %r330, %r329;
mul.f32 %r331, %r316, 0f3FB8AA3B;
ex2.approx.f32 %r332, %r331;
.loc 2 47 20 // standard.py:47:20 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
add.f32 %r333, %r318, 0f3F800000;
add.f32 %r334, %r320, 0f3F800000;
add.f32 %r335, %r322, 0f3F800000;
add.f32 %r336, %r324, 0f3F800000;
add.f32 %r337, %r326, 0f3F800000;
add.f32 %r338, %r328, 0f3F800000;
add.f32 %r339, %r330, 0f3F800000;
add.f32 %r340, %r332, 0f3F800000;
mov.b32 %r341, 0f3F800000;
.loc 2 47 16 // standard.py:47:16 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
div.full.f32 %r342, %r341, %r333;
div.full.f32 %r343, %r341, %r334;
div.full.f32 %r344, %r341, %r335;
div.full.f32 %r345, %r341, %r336;
div.full.f32 %r346, %r341, %r337;
div.full.f32 %r347, %r341, %r338;
div.full.f32 %r348, %r341, %r339;
div.full.f32 %r349, %r341, %r340;
$L__tmp2:
.loc 1 100 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:100:25
mul.wide.s32 %rd19, %r69, 4;
add.s64 %rd8, %rd3, %rd19;
mul.wide.s32 %rd20, %r70, 4;
add.s64 %rd9, %rd3, %rd20;
mul.wide.s32 %rd21, %r71, 4;
add.s64 %rd10, %rd3, %rd21;
mul.wide.s32 %rd22, %r72, 4;
add.s64 %rd11, %rd3, %rd22;
mul.wide.s32 %rd23, %r73, 4;
add.s64 %rd12, %rd3, %rd23;
mul.wide.s32 %rd24, %r74, 4;
add.s64 %rd13, %rd3, %rd24;
mul.wide.s32 %rd25, %r75, 4;
add.s64 %rd14, %rd3, %rd25;
mul.wide.s32 %rd26, %r76, 4;
add.s64 %rd15, %rd3, %rd26;
.loc 1 100 68 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:100:68
bar.sync 0;
and.b32 %r350, %r54, 96;
or.b32 %r351, %r145, %r350;
shl.b32 %r352, %r40, 4;
and.b32 %r353, %r352, 240;
shr.u32 %r354, %r350, 1;
add.s32 %r355, %r63, %r354;
shl.b32 %r356, %r351, 2;
add.s32 %r357, %r355, %r356;
st.shared.v2.b32 [%r357], {%r342, %r343};
or.b32 %r358, %r350, 16;
shr.u32 %r359, %r358, 1;
add.s32 %r360, %r63, %r359;
add.s32 %r361, %r360, %r356;
st.shared.v2.b32 [%r361+64], {%r344, %r345};
or.b32 %r362, %r350, 128;
shr.u32 %r363, %r362, 1;
add.s32 %r364, %r63, %r363;
add.s32 %r365, %r364, %r356;
st.shared.v2.b32 [%r365+512], {%r346, %r347};
or.b32 %r366, %r350, 144;
shr.u32 %r367, %r366, 1;
add.s32 %r368, %r63, %r367;
add.s32 %r369, %r368, %r356;
st.shared.v2.b32 [%r369+576], {%r348, %r349};
bar.sync 0;
shr.u32 %r370, %r353, 1;
add.s32 %r371, %r63, %r370;
shl.b32 %r372, %r353, 2;
add.s32 %r373, %r371, %r372;
add.s32 %r374, %r373, %r45;
ld.shared.b32 %r18, [%r374];
ld.shared.b32 %r19, [%r374+8];
ld.shared.b32 %r20, [%r374+16];
ld.shared.b32 %r21, [%r374+24];
ld.shared.b32 %r22, [%r374+32];
ld.shared.b32 %r23, [%r374+40];
ld.shared.b32 %r24, [%r374+48];
ld.shared.b32 %r25, [%r374+56];
// begin inline asm
@%p2 st.global.b32 [ %rd8 + 0 ], { %r18 };
// end inline asm
mov.pred %p3, 0;
// begin inline asm
@%p3 st.global.b32 [ %rd9 + 0 ], { %r19 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd10 + 0 ], { %r20 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd11 + 0 ], { %r21 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd12 + 0 ], { %r22 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd13 + 0 ], { %r23 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd14 + 0 ], { %r24 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd15 + 0 ], { %r25 };
// end inline asm
$L__BB0_1: // %common.ret
.loc 1 0 0 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0
ret;
$L__tmp3:
$L__func_end0:
// -- End function
}
.file 1 "/tmp/torchinductor_shangdiy/q4/cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py"
.file 2 "/home/shangdiy/miniconda3/envs/pytorch-3.10/lib/python3.10/site-packages/triton/language/standard.py"
.section .debug_abbrev
{
.b8 1 // Abbreviation Code
.b8 17 // DW_TAG_compile_unit
.b8 1 // DW_CHILDREN_yes
.b8 37 // DW_AT_producer
.b8 8 // DW_FORM_string
.b8 19 // DW_AT_language
.b8 5 // DW_FORM_data2
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 16 // DW_AT_stmt_list
.b8 6 // DW_FORM_data4
.b8 27 // DW_AT_comp_dir
.b8 8 // DW_FORM_string
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 2 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 0 // DW_CHILDREN_no
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 32 // DW_AT_inline
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 3 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 1 // DW_CHILDREN_yes
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 4 // Abbreviation Code
.b8 29 // DW_TAG_inlined_subroutine
.b8 0 // DW_CHILDREN_no
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 88 // DW_AT_call_file
.b8 11 // DW_FORM_data1
.b8 89 // DW_AT_call_line
.b8 11 // DW_FORM_data1
.b8 87 // DW_AT_call_column
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(3)
}
.section .debug_info
{
.b32 203 // Length of Unit
.b8 2 // DWARF version number
.b8 0
.b32 .debug_abbrev // Offset Into Abbrev. Section
.b8 8 // Address Size (in bytes)
.b8 1 // Abbrev [1] 0xb:0xc4 DW_TAG_compile_unit
.b8 116 // DW_AT_producer
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
.b8 2 // DW_AT_language
.b8 0
.b8 99 // DW_AT_name
.b8 113
.b8 52
.b8 54
.b8 118
.b8 111
.b8 108
.b8 114
.b8 122
.b8 109
.b8 97
.b8 54
.b8 55
.b8 105
.b8 110
.b8 100
.b8 116
.b8 119
.b8 104
.b8 106
.b8 53
.b8 97
.b8 52
.b8 110
.b8 54
.b8 122
.b8 114
.b8 51
.b8 54
.b8 55
.b8 103
.b8 113
.b8 112
.b8 107
.b8 99
.b8 119
.b8 111
.b8 50
.b8 104
.b8 108
.b8 112
.b8 104
.b8 121
.b8 110
.b8 104
.b8 106
.b8 116
.b8 104
.b8 110
.b8 51
.b8 117
.b8 107
.b8 46
.b8 112
.b8 121
.b8 0
.b32 .debug_line // DW_AT_stmt_list
.b8 47 // DW_AT_comp_dir
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 115
.b8 104
.b8 97
.b8 110
.b8 103
.b8 100
.b8 105
.b8 121
.b8 47
.b8 113
.b8 52
.b8 0
.b8 2 // Abbrev [2] 0x70:0x30 DW_TAG_subprogram
.b8 109 // DW_AT_name
.b8 111
.b8 100
.b8 101
.b8 108
.b8 95
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 116
.b8 101
.b8 109
.b8 95
.b8 102
.b8 117
.b8 115
.b8 101
.b8 100
.b8 95
.b8 97
.b8 100
.b8 100
.b8 109
.b8 109
.b8 95
.b8 114
.b8 101
.b8 108
.b8 117
.b8 95
.b8 115
.b8 105
.b8 103
.b8 109
.b8 111
.b8 105
.b8 100
.b8 95
.b8 116
.b8 95
.b8 49
.b8 0
.b8 1 // DW_AT_inline
.b8 3 // Abbrev [3] 0xa0:0x2e DW_TAG_subprogram
.b64 $L__func_begin0 // DW_AT_low_pc
.b64 $L__func_end0 // DW_AT_high_pc
.b32 112 // DW_AT_abstract_origin
.b8 4 // Abbrev [4] 0xb5:0x18 DW_TAG_inlined_subroutine
.b32 112 // DW_AT_abstract_origin
.b64 $L__tmp1 // DW_AT_low_pc
.b64 $L__tmp2 // DW_AT_high_pc
.b8 1 // DW_AT_call_file
.b8 99 // DW_AT_call_line
.b8 22 // DW_AT_call_column
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
}
.section .debug_macinfo { }

View File

@ -1,727 +0,0 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 8.7
.target sm_89
.address_size 64
// .globl model_triton_tem_fused_addmm_relu_t_0 // -- Begin function model_triton_tem_fused_addmm_relu_t_0
.extern .shared .align 16 .b8 global_smem[];
// @model_triton_tem_fused_addmm_relu_t_0
.visible .entry model_triton_tem_fused_addmm_relu_t_0(
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_0,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_1,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_2,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_3,
.param .u32 model_triton_tem_fused_addmm_relu_t_0_param_4,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_5
)
.reqntid 32
{
.reg .pred %p<27>;
.reg .b32 %r<398>;
.reg .b64 %rd<29>;
.loc 1 18 0 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:18:0
$L__func_begin0:
.loc 1 18 0 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:18:0
// %bb.0:
ld.param.b32 %r1, [model_triton_tem_fused_addmm_relu_t_0_param_4];
$L__tmp0:
.loc 1 34 16 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:34:16
and.b32 %r2, %r1, 268435455;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
bra.uni $L__BB0_1;
$L__BB0_2:
.loc 1 0 16 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0:16
ld.param.b64 %rd4, [model_triton_tem_fused_addmm_relu_t_0_param_3];
ld.param.b64 %rd3, [model_triton_tem_fused_addmm_relu_t_0_param_2];
ld.param.b64 %rd2, [model_triton_tem_fused_addmm_relu_t_0_param_1];
ld.param.b64 %rd1, [model_triton_tem_fused_addmm_relu_t_0_param_0];
.loc 1 43 24 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:43:24
mov.u32 %r51, %ctaid.x;
.loc 1 44 28 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:44:28
add.s32 %r52, %r1, 15;
.loc 1 44 34 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:44:34
shr.s32 %r53, %r52, 31;
shr.u32 %r54, %r53, 28;
add.s32 %r55, %r52, %r54;
shr.s32 %r56, %r55, 4;
.loc 1 50 41 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:41
and.b32 %r57, %r51, 2147483640;
.loc 1 50 30 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:30
sub.s32 %r58, %r56, %r57;
.loc 1 50 50 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:50
min.s32 %r59, %r58, 8;
.loc 1 51 40 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:51:40
rem.s32 %r60, %r51, %r59;
.loc 1 51 34 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:51:34
add.s32 %r61, %r60, %r57;
.loc 1 52 19 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:52:19
and.b32 %r62, %r51, 7;
.loc 1 52 30 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:52:30
div.s32 %r63, %r62, %r59;
.loc 1 56 17 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:17
shl.b32 %r64, %r61, 4;
.loc 1 56 40 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:40
mov.u32 %r65, %tid.x;
and.b32 %r66, %r65, 4;
bfe.u32 %r67, %r65, 2, 1;
shr.u32 %r68, %r65, 2;
and.b32 %r69, %r68, 6;
or.b32 %r70, %r69, %r67;
bfe.u32 %r71, %r65, 3, 2;
and.b32 %r72, %r65, 1;
shl.b32 %r73, %r72, 2;
shl.b32 %r74, %r65, 2;
and.b32 %r75, %r74, 12;
or.b32 %r76, %r75, 2;
.loc 1 56 27 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:27
or.b32 %r77, %r64, %r70;
or.b32 %r78, %r77, 8;
.loc 1 57 17 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:57:17
shl.b32 %r79, %r63, 4;
.loc 1 57 27 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:57:27
or.b32 %r80, %r79, %r75;
.loc 1 0 0 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0
rem.s32 %r81, %r78, %r1;
rem.s32 %r82, %r77, %r1;
.loc 1 71 36 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:71:36
setp.lt.u32 %p2, %r75, 10;
setp.lt.u32 %p3, %r76, 10;
.loc 1 72 24 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:72:24
shl.b32 %r83, %r65, 1;
and.b32 %r84, %r83, 6;
shl.b32 %r85, %r66, 1;
or.b32 %r86, %r84, %r85;
.loc 1 72 36 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:72:36
setp.lt.u32 %p6, %r86, 10;
.loc 1 79 28 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:79:28
mul.lo.s32 %r87, %r82, 10;
mul.lo.s32 %r88, %r81, 10;
.loc 1 79 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:79:25
add.s32 %r89, %r87, %r75;
add.s32 %r90, %r87, %r76;
add.s32 %r91, %r88, %r75;
add.s32 %r92, %r88, %r76;
.loc 1 80 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:25
mul.wide.s32 %rd21, %r89, 4;
add.s64 %rd5, %rd1, %rd21;
mul.wide.s32 %rd22, %r90, 4;
add.s64 %rd6, %rd1, %rd22;
mul.wide.s32 %rd23, %r91, 4;
add.s64 %rd7, %rd1, %rd23;
mul.wide.s32 %rd24, %r92, 4;
add.s64 %rd8, %rd1, %rd24;
mov.b32 %r5, 0;
.loc 1 80 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:20
// begin inline asm
mov.u32 %r3, %r5;
mov.u32 %r4, %r5;
@%p2 ld.global.v2.b32 { %r3, %r4 }, [ %rd5 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r7, %r5;
mov.u32 %r8, %r5;
@%p3 ld.global.v2.b32 { %r7, %r8 }, [ %rd6 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r11, %r5;
mov.u32 %r12, %r5;
@%p2 ld.global.v2.b32 { %r11, %r12 }, [ %rd7 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r15, %r5;
mov.u32 %r16, %r5;
@%p3 ld.global.v2.b32 { %r15, %r16 }, [ %rd8 + 0 ];
// end inline asm
shl.b32 %r93, %r75, 2;
mov.b32 %r94, global_smem;
add.s32 %r95, %r94, %r93;
shl.b32 %r96, %r70, 6;
add.s32 %r97, %r95, %r96;
st.shared.v4.b32 [%r97], {%r3, %r4, %r7, %r8};
st.shared.v4.b32 [%r97+512], {%r11, %r12, %r15, %r16};
.loc 1 85 50 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:50
mad.lo.s32 %r98, %r71, 10, %r86;
.loc 1 85 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:25
mul.wide.u32 %rd25, %r98, 4;
add.s64 %rd9, %rd2, %rd25;
add.s64 %rd10, %rd9, 160;
add.s64 %rd11, %rd9, 320;
add.s64 %rd12, %rd9, 480;
.loc 1 85 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
// begin inline asm
mov.u32 %r19, %r5;
mov.u32 %r20, %r5;
@%p6 ld.global.v2.b32 { %r19, %r20 }, [ %rd9 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r23, %r5;
mov.u32 %r24, %r5;
@%p6 ld.global.v2.b32 { %r23, %r24 }, [ %rd10 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r27, %r5;
mov.u32 %r28, %r5;
@%p6 ld.global.v2.b32 { %r27, %r28 }, [ %rd11 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r31, %r5;
mov.u32 %r32, %r5;
@%p6 ld.global.v2.b32 { %r31, %r32 }, [ %rd12 + 0 ];
// end inline asm
shl.b32 %r99, %r86, 2;
add.s32 %r100, %r94, 1024;
add.s32 %r101, %r100, %r99;
shl.b32 %r102, %r71, 6;
add.s32 %r103, %r101, %r102;
st.shared.v2.b32 [%r103], {%r19, %r20};
st.shared.v2.b32 [%r103+256], {%r23, %r24};
st.shared.v2.b32 [%r103+512], {%r27, %r28};
st.shared.v2.b32 [%r103+768], {%r31, %r32};
.loc 1 96 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:20
setp.lt.s32 %p14, %r77, %r1;
setp.lt.s32 %p15, %r78, %r1;
.loc 1 96 34 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:34
setp.eq.s32 %p16, %r63, 0;
.loc 1 96 26 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:26
and.pred %p10, %p16, %p14;
and.pred %p11, %p15, %p16;
.loc 1 100 30 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:30
mul.wide.u32 %rd26, %r80, 4;
add.s64 %rd14, %rd3, %rd26;
.loc 1 100 66 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66
// begin inline asm
mov.u64 %rd15, 0x0;
createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0;
// end inline asm
// begin inline asm
mov.u32 %r35, 0x0;
mov.u32 %r36, 0x0;
mov.u32 %r37, 0x0;
mov.u32 %r38, 0x0;
@%p10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd14 + 0 ], %rd15;
// end inline asm
// begin inline asm
mov.u64 %rd18, 0x0;
createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0;
// end inline asm
// begin inline asm
mov.u32 %r39, 0x0;
mov.u32 %r40, 0x0;
mov.u32 %r41, 0x0;
mov.u32 %r42, 0x0;
@%p11 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r39, %r40, %r41, %r42 }, [ %rd14 + 0 ], %rd18;
// end inline asm
.loc 1 80 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:20
bar.sync 0;
shl.b32 %r104, %r69, 6;
add.s32 %r105, %r94, %r104;
ld.shared.v4.b32 {%r106, %r107, %r108, %r109}, [%r105+48];
ld.shared.v4.b32 {%r110, %r111, %r112, %r113}, [%r105+112];
ld.shared.v4.b32 {%r114, %r115, %r116, %r117}, [%r105+32];
ld.shared.v4.b32 {%r118, %r119, %r120, %r121}, [%r105+96];
ld.shared.v4.b32 {%r122, %r123, %r124, %r125}, [%r105+16];
ld.shared.v4.b32 {%r126, %r127, %r128, %r129}, [%r105+80];
ld.shared.v4.b32 {%r130, %r131, %r132, %r133}, [%r105];
ld.shared.v4.b32 {%r134, %r135, %r136, %r137}, [%r105+64];
ld.shared.v4.b32 {%r138, %r139, %r140, %r141}, [%r105+560];
ld.shared.v4.b32 {%r142, %r143, %r144, %r145}, [%r105+624];
ld.shared.v4.b32 {%r146, %r147, %r148, %r149}, [%r105+544];
ld.shared.v4.b32 {%r150, %r151, %r152, %r153}, [%r105+608];
ld.shared.v4.b32 {%r154, %r155, %r156, %r157}, [%r105+528];
ld.shared.v4.b32 {%r158, %r159, %r160, %r161}, [%r105+592];
ld.shared.v4.b32 {%r162, %r163, %r164, %r165}, [%r105+512];
ld.shared.v4.b32 {%r166, %r167, %r168, %r169}, [%r105+576];
.loc 1 85 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
mad.lo.s32 %r170, %r86, 60, %r101;
or.b32 %r171, %r84, 1;
or.b32 %r172, %r171, %r85;
shl.b32 %r173, %r172, 6;
add.s32 %r174, %r100, %r173;
ld.shared.b32 %r175, [%r170+16];
ld.shared.b32 %r176, [%r174+16];
ld.shared.b32 %r177, [%r174+52];
ld.shared.b32 %r178, [%r170+60];
.loc 1 100 66 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66
shr.u32 %r179, %r66, 1;
setp.eq.s32 %p17, %r66, 0;
bfe.u32 %r180, %r65, 1, 1;
or.b32 %r181, %r73, %r180;
or.b32 %r182, %r181, %r179;
and.b32 %r183, %r65, 24;
or.b32 %r184, %r182, %r183;
selp.b32 %r185, %r35, %r37, %p17;
shfl.sync.idx.b32 %r186, %r185, %r184, 31, -1;
selp.b32 %r187, %r36, %r38, %p17;
shfl.sync.idx.b32 %r188, %r187, %r184, 31, -1;
selp.b32 %r189, %r37, %r35, %p17;
xor.b32 %r190, %r184, 4;
shfl.sync.idx.b32 %r191, %r189, %r190, 31, -1;
selp.b32 %r192, %r38, %r36, %p17;
shfl.sync.idx.b32 %r193, %r192, %r190, 31, -1;
selp.b32 %r194, %r39, %r41, %p17;
shfl.sync.idx.b32 %r195, %r194, %r184, 31, -1;
selp.b32 %r196, %r40, %r42, %p17;
shfl.sync.idx.b32 %r197, %r196, %r184, 31, -1;
selp.b32 %r198, %r41, %r39, %p17;
shfl.sync.idx.b32 %r199, %r198, %r190, 31, -1;
selp.b32 %r200, %r42, %r40, %p17;
shfl.sync.idx.b32 %r201, %r200, %r190, 31, -1;
setp.eq.s32 %p18, %r72, 0;
.loc 1 85 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
ld.shared.v4.b32 {%r202, %r203, %r204, %r205}, [%r170];
ld.shared.v4.b32 {%r206, %r207, %r208, %r209}, [%r174];
.loc 1 100 66 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66
selp.f32 %r210, %r186, %r191, %p18;
selp.f32 %r211, %r188, %r193, %p18;
selp.f32 %r212, %r191, %r186, %p18;
selp.f32 %r213, %r193, %r188, %p18;
selp.f32 %r214, %r195, %r199, %p18;
selp.f32 %r215, %r197, %r201, %p18;
selp.f32 %r216, %r199, %r195, %p18;
selp.f32 %r217, %r201, %r197, %p18;
.loc 1 88 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:88:25
fma.rn.f32 %r218, %r166, %r206, %r217;
fma.rn.f32 %r219, %r166, %r202, %r216;
fma.rn.f32 %r220, %r162, %r206, %r215;
fma.rn.f32 %r221, %r162, %r202, %r214;
fma.rn.f32 %r222, %r134, %r206, %r213;
fma.rn.f32 %r223, %r134, %r202, %r212;
fma.rn.f32 %r224, %r130, %r206, %r211;
fma.rn.f32 %r225, %r130, %r202, %r210;
fma.rn.f32 %r226, %r131, %r203, %r225;
fma.rn.f32 %r227, %r131, %r207, %r224;
fma.rn.f32 %r228, %r135, %r203, %r223;
fma.rn.f32 %r229, %r135, %r207, %r222;
fma.rn.f32 %r230, %r163, %r203, %r221;
fma.rn.f32 %r231, %r163, %r207, %r220;
fma.rn.f32 %r232, %r167, %r203, %r219;
fma.rn.f32 %r233, %r167, %r207, %r218;
fma.rn.f32 %r234, %r168, %r208, %r233;
fma.rn.f32 %r235, %r168, %r204, %r232;
fma.rn.f32 %r236, %r164, %r208, %r231;
fma.rn.f32 %r237, %r164, %r204, %r230;
fma.rn.f32 %r238, %r136, %r208, %r229;
fma.rn.f32 %r239, %r136, %r204, %r228;
fma.rn.f32 %r240, %r132, %r208, %r227;
fma.rn.f32 %r241, %r132, %r204, %r226;
fma.rn.f32 %r242, %r133, %r205, %r241;
fma.rn.f32 %r243, %r133, %r209, %r240;
fma.rn.f32 %r244, %r137, %r205, %r239;
fma.rn.f32 %r245, %r137, %r209, %r238;
fma.rn.f32 %r246, %r165, %r205, %r237;
fma.rn.f32 %r247, %r165, %r209, %r236;
fma.rn.f32 %r248, %r169, %r205, %r235;
fma.rn.f32 %r249, %r169, %r209, %r234;
fma.rn.f32 %r250, %r158, %r176, %r249;
fma.rn.f32 %r251, %r158, %r175, %r248;
fma.rn.f32 %r252, %r154, %r176, %r247;
fma.rn.f32 %r253, %r154, %r175, %r246;
fma.rn.f32 %r254, %r126, %r176, %r245;
fma.rn.f32 %r255, %r126, %r175, %r244;
fma.rn.f32 %r256, %r122, %r176, %r243;
fma.rn.f32 %r257, %r122, %r175, %r242;
.loc 1 85 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
ld.shared.b32 %r258, [%r170+24];
ld.shared.b32 %r259, [%r170+20];
ld.shared.b32 %r260, [%r174+24];
ld.shared.b32 %r261, [%r174+20];
ld.shared.b32 %r262, [%r170+32];
ld.shared.b32 %r263, [%r170+28];
ld.shared.b32 %r264, [%r174+32];
ld.shared.b32 %r265, [%r174+28];
ld.shared.b32 %r266, [%r170+40];
ld.shared.b32 %r267, [%r170+36];
ld.shared.b32 %r268, [%r174+40];
ld.shared.b32 %r269, [%r174+36];
ld.shared.b32 %r270, [%r170+48];
ld.shared.b32 %r271, [%r170+44];
ld.shared.b32 %r272, [%r174+48];
ld.shared.b32 %r273, [%r174+44];
ld.shared.b32 %r274, [%r170+56];
ld.shared.b32 %r275, [%r170+52];
ld.shared.v2.b32 {%r276, %r277}, [%r174+56];
.loc 1 88 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:88:25
fma.rn.f32 %r278, %r123, %r259, %r257;
fma.rn.f32 %r279, %r123, %r261, %r256;
fma.rn.f32 %r280, %r127, %r259, %r255;
fma.rn.f32 %r281, %r127, %r261, %r254;
fma.rn.f32 %r282, %r155, %r259, %r253;
fma.rn.f32 %r283, %r155, %r261, %r252;
fma.rn.f32 %r284, %r159, %r259, %r251;
fma.rn.f32 %r285, %r159, %r261, %r250;
fma.rn.f32 %r286, %r160, %r260, %r285;
fma.rn.f32 %r287, %r160, %r258, %r284;
fma.rn.f32 %r288, %r156, %r260, %r283;
fma.rn.f32 %r289, %r156, %r258, %r282;
fma.rn.f32 %r290, %r128, %r260, %r281;
fma.rn.f32 %r291, %r128, %r258, %r280;
fma.rn.f32 %r292, %r124, %r260, %r279;
fma.rn.f32 %r293, %r124, %r258, %r278;
fma.rn.f32 %r294, %r125, %r263, %r293;
fma.rn.f32 %r295, %r125, %r265, %r292;
fma.rn.f32 %r296, %r129, %r263, %r291;
fma.rn.f32 %r297, %r129, %r265, %r290;
fma.rn.f32 %r298, %r157, %r263, %r289;
fma.rn.f32 %r299, %r157, %r265, %r288;
fma.rn.f32 %r300, %r161, %r263, %r287;
fma.rn.f32 %r301, %r161, %r265, %r286;
fma.rn.f32 %r302, %r150, %r264, %r301;
fma.rn.f32 %r303, %r150, %r262, %r300;
fma.rn.f32 %r304, %r146, %r264, %r299;
fma.rn.f32 %r305, %r146, %r262, %r298;
fma.rn.f32 %r306, %r118, %r264, %r297;
fma.rn.f32 %r307, %r118, %r262, %r296;
fma.rn.f32 %r308, %r114, %r264, %r295;
fma.rn.f32 %r309, %r114, %r262, %r294;
fma.rn.f32 %r310, %r115, %r267, %r309;
fma.rn.f32 %r311, %r115, %r269, %r308;
fma.rn.f32 %r312, %r119, %r267, %r307;
fma.rn.f32 %r313, %r119, %r269, %r306;
fma.rn.f32 %r314, %r147, %r267, %r305;
fma.rn.f32 %r315, %r147, %r269, %r304;
fma.rn.f32 %r316, %r151, %r267, %r303;
fma.rn.f32 %r317, %r151, %r269, %r302;
fma.rn.f32 %r318, %r152, %r268, %r317;
fma.rn.f32 %r319, %r152, %r266, %r316;
fma.rn.f32 %r320, %r148, %r268, %r315;
fma.rn.f32 %r321, %r148, %r266, %r314;
fma.rn.f32 %r322, %r120, %r268, %r313;
fma.rn.f32 %r323, %r120, %r266, %r312;
fma.rn.f32 %r324, %r116, %r268, %r311;
fma.rn.f32 %r325, %r116, %r266, %r310;
fma.rn.f32 %r326, %r117, %r271, %r325;
fma.rn.f32 %r327, %r117, %r273, %r324;
fma.rn.f32 %r328, %r121, %r271, %r323;
fma.rn.f32 %r329, %r121, %r273, %r322;
fma.rn.f32 %r330, %r149, %r271, %r321;
fma.rn.f32 %r331, %r149, %r273, %r320;
fma.rn.f32 %r332, %r153, %r271, %r319;
fma.rn.f32 %r333, %r153, %r273, %r318;
fma.rn.f32 %r334, %r142, %r272, %r333;
fma.rn.f32 %r335, %r142, %r270, %r332;
fma.rn.f32 %r336, %r138, %r272, %r331;
fma.rn.f32 %r337, %r138, %r270, %r330;
fma.rn.f32 %r338, %r110, %r272, %r329;
fma.rn.f32 %r339, %r110, %r270, %r328;
fma.rn.f32 %r340, %r106, %r272, %r327;
fma.rn.f32 %r341, %r106, %r270, %r326;
fma.rn.f32 %r342, %r107, %r275, %r341;
fma.rn.f32 %r343, %r107, %r177, %r340;
fma.rn.f32 %r344, %r111, %r275, %r339;
fma.rn.f32 %r345, %r111, %r177, %r338;
fma.rn.f32 %r346, %r139, %r275, %r337;
fma.rn.f32 %r347, %r139, %r177, %r336;
fma.rn.f32 %r348, %r143, %r275, %r335;
fma.rn.f32 %r349, %r143, %r177, %r334;
fma.rn.f32 %r350, %r144, %r276, %r349;
fma.rn.f32 %r351, %r144, %r274, %r348;
fma.rn.f32 %r352, %r140, %r276, %r347;
fma.rn.f32 %r353, %r140, %r274, %r346;
fma.rn.f32 %r354, %r112, %r276, %r345;
fma.rn.f32 %r355, %r112, %r274, %r344;
fma.rn.f32 %r356, %r108, %r276, %r343;
fma.rn.f32 %r357, %r108, %r274, %r342;
fma.rn.f32 %r358, %r109, %r178, %r357;
fma.rn.f32 %r359, %r109, %r277, %r356;
fma.rn.f32 %r360, %r113, %r178, %r355;
fma.rn.f32 %r361, %r113, %r277, %r354;
fma.rn.f32 %r362, %r141, %r178, %r353;
fma.rn.f32 %r363, %r141, %r277, %r352;
fma.rn.f32 %r364, %r145, %r178, %r351;
fma.rn.f32 %r365, %r145, %r277, %r350;
$L__tmp1:
.loc 2 110 15 // triton_helpers.py:110:15 @[ cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:103:40 ]
setp.lt.f32 %p19, %r365, 0f00000000;
setp.lt.f32 %p20, %r364, 0f00000000;
setp.lt.f32 %p21, %r363, 0f00000000;
setp.lt.f32 %p22, %r362, 0f00000000;
setp.lt.f32 %p23, %r361, 0f00000000;
setp.lt.f32 %p24, %r360, 0f00000000;
setp.lt.f32 %p25, %r359, 0f00000000;
setp.lt.f32 %p26, %r358, 0f00000000;
.loc 2 113 29 // triton_helpers.py:113:29 @[ cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:103:40 ]
selp.f32 %r366, 0f00000000, %r358, %p26;
selp.f32 %r367, 0f00000000, %r359, %p25;
selp.f32 %r368, 0f00000000, %r360, %p24;
selp.f32 %r369, 0f00000000, %r361, %p23;
selp.f32 %r370, 0f00000000, %r362, %p22;
selp.f32 %r371, 0f00000000, %r363, %p21;
selp.f32 %r372, 0f00000000, %r364, %p20;
selp.f32 %r373, 0f00000000, %r365, %p19;
$L__tmp2:
.loc 1 104 52 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:52
shl.b32 %r374, %r77, 4;
shl.b32 %r375, %r78, 4;
.loc 1 104 49 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:49
add.s32 %r376, %r374, %r80;
add.s32 %r377, %r80, %r375;
.loc 1 104 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:25
mul.wide.s32 %rd27, %r376, 4;
add.s64 %rd19, %rd4, %rd27;
mul.wide.s32 %rd28, %r377, 4;
add.s64 %rd20, %rd4, %rd28;
.loc 1 104 78 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:78
selp.f32 %r378, %r366, %r368, %p18;
or.b32 %r379, %r183, %r67;
or.b32 %r380, %r379, %r84;
shfl.sync.idx.b32 %r381, %r378, %r380, 31, -1;
selp.f32 %r382, %r367, %r369, %p18;
shfl.sync.idx.b32 %r383, %r382, %r380, 31, -1;
selp.f32 %r384, %r368, %r366, %p18;
xor.b32 %r385, %r171, %r67;
or.b32 %r386, %r183, %r385;
shfl.sync.idx.b32 %r387, %r384, %r386, 31, -1;
selp.f32 %r388, %r369, %r367, %p18;
shfl.sync.idx.b32 %r389, %r388, %r386, 31, -1;
selp.f32 %r390, %r370, %r372, %p18;
shfl.sync.idx.b32 %r391, %r390, %r380, 31, -1;
selp.f32 %r392, %r371, %r373, %p18;
shfl.sync.idx.b32 %r393, %r392, %r380, 31, -1;
selp.f32 %r394, %r372, %r370, %p18;
shfl.sync.idx.b32 %r395, %r394, %r386, 31, -1;
selp.f32 %r396, %r373, %r371, %p18;
shfl.sync.idx.b32 %r397, %r396, %r386, 31, -1;
selp.b32 %r45, %r387, %r381, %p17;
selp.b32 %r46, %r389, %r383, %p17;
selp.b32 %r49, %r395, %r391, %p17;
selp.b32 %r50, %r397, %r393, %p17;
selp.b32 %r43, %r381, %r387, %p17;
selp.b32 %r44, %r383, %r389, %p17;
// begin inline asm
@%p10 st.global.v4.b32 [ %rd19 + 0 ], { %r43, %r44, %r45, %r46 };
// end inline asm
selp.b32 %r47, %r391, %r395, %p17;
selp.b32 %r48, %r393, %r397, %p17;
// begin inline asm
@%p11 st.global.v4.b32 [ %rd20 + 0 ], { %r47, %r48, %r49, %r50 };
// end inline asm
$L__BB0_1: // %common.ret
.loc 1 0 0 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0
ret;
$L__tmp3:
$L__func_end0:
// -- End function
}
.file 1 "/tmp/torchinductor_shangdiy/uw/cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py"
.file 2 "/home/shangdiy/pytorch/torch/_inductor/runtime/triton_helpers.py"
.section .debug_abbrev
{
.b8 1 // Abbreviation Code
.b8 17 // DW_TAG_compile_unit
.b8 1 // DW_CHILDREN_yes
.b8 37 // DW_AT_producer
.b8 8 // DW_FORM_string
.b8 19 // DW_AT_language
.b8 5 // DW_FORM_data2
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 16 // DW_AT_stmt_list
.b8 6 // DW_FORM_data4
.b8 27 // DW_AT_comp_dir
.b8 8 // DW_FORM_string
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 2 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 0 // DW_CHILDREN_no
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 32 // DW_AT_inline
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 3 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 1 // DW_CHILDREN_yes
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 4 // Abbreviation Code
.b8 29 // DW_TAG_inlined_subroutine
.b8 0 // DW_CHILDREN_no
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 88 // DW_AT_call_file
.b8 11 // DW_FORM_data1
.b8 89 // DW_AT_call_line
.b8 11 // DW_FORM_data1
.b8 87 // DW_AT_call_column
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(3)
}
.section .debug_info
{
.b32 195 // Length of Unit
.b8 2 // DWARF version number
.b8 0
.b32 .debug_abbrev // Offset Into Abbrev. Section
.b8 8 // Address Size (in bytes)
.b8 1 // Abbrev [1] 0xb:0xbc DW_TAG_compile_unit
.b8 116 // DW_AT_producer
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
.b8 2 // DW_AT_language
.b8 0
.b8 99 // DW_AT_name
.b8 117
.b8 119
.b8 104
.b8 98
.b8 112
.b8 101
.b8 99
.b8 100
.b8 50
.b8 117
.b8 107
.b8 101
.b8 115
.b8 111
.b8 51
.b8 106
.b8 120
.b8 101
.b8 107
.b8 99
.b8 122
.b8 118
.b8 103
.b8 104
.b8 98
.b8 55
.b8 97
.b8 104
.b8 50
.b8 104
.b8 107
.b8 50
.b8 122
.b8 111
.b8 98
.b8 54
.b8 55
.b8 111
.b8 112
.b8 99
.b8 52
.b8 51
.b8 97
.b8 103
.b8 104
.b8 101
.b8 53
.b8 119
.b8 108
.b8 118
.b8 51
.b8 46
.b8 112
.b8 121
.b8 0
.b32 .debug_line // DW_AT_stmt_list
.b8 47 // DW_AT_comp_dir
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 115
.b8 104
.b8 97
.b8 110
.b8 103
.b8 100
.b8 105
.b8 121
.b8 47
.b8 117
.b8 119
.b8 0
.b8 2 // Abbrev [2] 0x70:0x28 DW_TAG_subprogram
.b8 109 // DW_AT_name
.b8 111
.b8 100
.b8 101
.b8 108
.b8 95
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 116
.b8 101
.b8 109
.b8 95
.b8 102
.b8 117
.b8 115
.b8 101
.b8 100
.b8 95
.b8 97
.b8 100
.b8 100
.b8 109
.b8 109
.b8 95
.b8 114
.b8 101
.b8 108
.b8 117
.b8 95
.b8 116
.b8 95
.b8 48
.b8 0
.b8 1 // DW_AT_inline
.b8 3 // Abbrev [3] 0x98:0x2e DW_TAG_subprogram
.b64 $L__func_begin0 // DW_AT_low_pc
.b64 $L__func_end0 // DW_AT_high_pc
.b32 112 // DW_AT_abstract_origin
.b8 4 // Abbrev [4] 0xad:0x18 DW_TAG_inlined_subroutine
.b32 112 // DW_AT_abstract_origin
.b64 $L__tmp1 // DW_AT_low_pc
.b64 $L__tmp2 // DW_AT_high_pc
.b8 1 // DW_AT_call_file
.b8 103 // DW_AT_call_line
.b8 40 // DW_AT_call_column
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
}
.section .debug_macinfo { }

View File

@ -1,8 +0,0 @@
SECTIONS {
/* By default, in LLD 16, .lrodata is placed immediately after .rodata.
* However, .lrodata can be very large in our compiled models, which leads to
* relocation out-of-range errors for relative relocations. So we place it
* after the other sections that are referenced from .text using relative
* relocations. This is the default behavior in GNU ld. */
.lrodata : { *(.lrodata) }
} INSERT AFTER .bss;
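Because the script contains only a section placement plus an INSERT command, it augments the linker's default layout rather than replacing it (GNU ld documents INSERT as intended for exactly this use with -T; LLD behaves the same for INSERT-only scripts). A minimal sketch of wiring it into a CMake target like the one earlier in this diff, assuming the script is saved as lrodata.ld next to the CMakeLists.txt (path and file name are placeholders):
# Hypothetical: pass the INSERT-only script via -T; it augments the default
# linker script instead of replacing it, and LINK_DEPENDS triggers a relink
# when the script changes.
target_link_options(model PRIVATE
  "LINKER:-T,${CMAKE_CURRENT_SOURCE_DIR}/lrodata.ld")
set_property(TARGET model APPEND PROPERTY
  LINK_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/lrodata.ld")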

View File

@ -1,147 +0,0 @@
// Windows replacement for #include <dlfcn.h>: LoadLibrary/GetProcAddress instead of dlopen/dlsym
#include <windows.h>
#include <stdio.h>
#include <iostream>
#include <memory>
#include <vector>
#include <string>
// Include the AOTInductor headers
// #include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
#include <torch/csrc/inductor/aoti_runtime/interface.h>
// #include <torch/csrc/inductor/aoti_runtime/model_container.h>
// #include <torch/csrc/inductor/aoti_torch/tensor_converter.h> // @manual
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
#include <standalone/slim/core/Empty.h>
#include <standalone/slim/cuda/Guard.h>
#include <standalone/torch/csrc/inductor/aoti_torch/tensor_converter.h>
static std::wstring u8u16(const char* s) {
int len = MultiByteToWideChar(CP_UTF8, 0, s, -1, NULL, 0);
std::wstring wbuf(len, L'\0');
MultiByteToWideChar(CP_UTF8, 0, s, -1, &wbuf[0], len);
if (!wbuf.empty() && wbuf.back() == L'\0') {
wbuf.pop_back();
}
return wbuf;
}
int main() {
try {
// Load the DLL (model.pyd is a DLL on Windows)
HMODULE handle = nullptr;
{
auto wname = u8u16(R"(C:\Users\shangdiy\source\repos\pytorch\model2\model.pyd)");
// Try LoadLibraryExW with safe search flags if supported
if (GetProcAddress(GetModuleHandleW(L"KERNEL32.DLL"), "AddDllDirectory") != NULL) {
handle = LoadLibraryExW(
wname.c_str(),
NULL,
LOAD_LIBRARY_SEARCH_DEFAULT_DIRS);
}
// Fallback if that failed
if (!handle) {
handle = LoadLibraryW(wname.c_str());
}
if (!handle) {
DWORD dw = GetLastError();
char buf[512];
FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, dw, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
buf, sizeof(buf), NULL);
std::cerr << "Failed to load model.pyd. WinError " << dw << ": " << buf << std::endl;
return 1;
} else {
std::cout << "Loaded model.pyd" << std::endl;
}
}
decltype(&AOTInductorModelContainerCreateWithDevice) create_model{nullptr};
decltype(&AOTInductorModelContainerDelete) delete_model{nullptr};
decltype(&AOTInductorModelContainerRun) run_model{nullptr};
#define AOTI_LOAD_SYMBOL(handle_, var, name_str) \
var = reinterpret_cast<decltype(var)>(GetProcAddress(handle_, name_str)); \
if (!var) { \
throw std::runtime_error("Could not GetProcAddress " name_str); \
}
AOTI_LOAD_SYMBOL(handle, create_model, "AOTInductorModelContainerCreateWithDevice");
AOTI_LOAD_SYMBOL(handle, run_model, "AOTInductorModelContainerRun");
AOTI_LOAD_SYMBOL(handle, delete_model, "AOTInductorModelContainerDelete");
#undef AOTI_LOAD_SYMBOL
// Create array of input/output handles
slim::SlimTensor x = slim::empty({8, 10}, c10::kFloat, c10::Device(c10::kCUDA, 0));
float fill_value = 1.0;
x.fill_(fill_value);
// AOTInductorModel::run will steal the ownership of the input and output
// tensor pointers
std::vector<slim::SlimTensor> inputs = {x};
std::vector<AtenTensorHandle> input_handles =
unsafe_alloc_new_handles_from_tensors(inputs);
AtenTensorHandle output_handle;
AOTInductorModelContainerHandle container_handle;
cudaStream_t stream = slim::cuda::getCurrentCUDAStream(0);
// aoti_torch_get_current_cuda_stream(0, (void**)&stream);
// Reinterpret as the opaque handle for AOTInductor
AOTInductorStreamHandle stream_handle = reinterpret_cast<AOTInductorStreamHandle>(stream);
// Construct model
const char* cubin_dir = R"(C:\Users\shangdiy\source\repos\pytorch\model2\)";
AOTIRuntimeError err =
create_model(&container_handle, 1, "cuda", cubin_dir);
if (err != AOTI_RUNTIME_SUCCESS) {
throw std::runtime_error("Failed to create model container");
} else {
std::cout << "Created model\n";
}
// Run the model
err = run_model(container_handle, input_handles.data(),
1, // num_inputs
&output_handle,
1, // num_outputs
stream_handle, // stream
nullptr // proxy_executor
);
if (err != AOTI_RUNTIME_SUCCESS) {
throw std::runtime_error("Failed to run model");
} else {
std::cout << "Finish model\n";
}
std::vector<slim::SlimTensor> outputs =
alloc_tensors_by_stealing_from_handles(&output_handle, 1);
// Print the result
slim::SlimTensor slim_tensor = outputs[0];
auto slim_cpu = slim_tensor.cpu();
float *slim_data = static_cast<float *>(slim_cpu.data_ptr());
std::cout << "Output" << std::endl;
std::cout << "slim_data ptr: " << slim_data << "\n";
size_t num_elements = slim_cpu.numel(); // or equivalent method
std::cout << num_elements << std::endl;
for (size_t i = 0; i < num_elements; ++i) {
std::cout << slim_data[i] << "\n";
}
std::cout << "Done" << std::endl;
delete_model(container_handle);
FreeLibrary(handle);
return 0;
} catch (const std::exception &e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
}
}
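For completeness, a minimal sketch of a CMake build for this loader; TORCH_ROOT and the CUDA include path are placeholders for a local checkout and toolkit, not values taken from this diff:
# Hypothetical CMakeLists.txt for the loader above. model.pyd is loaded at
# runtime via LoadLibraryExW, so nothing from it is linked at build time.
cmake_minimum_required(VERSION 3.18)
project(aoti_loader LANGUAGES CXX)
add_executable(aoti_loader main.cpp)
target_compile_features(aoti_loader PRIVATE cxx_std_17)
target_include_directories(aoti_loader PRIVATE
  "${TORCH_ROOT}/torch/include"
  "${TORCH_ROOT}/torch/include/torch/csrc/api/include"
  "$ENV{CUDA_PATH}/include")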

Binary file not shown.

View File

@ -10,7 +10,7 @@ filelock
fsspec>=0.8.5
hypothesis
jinja2
lintrunner ; platform_machine != "s390x" and platform_machine != "riscv64"
lintrunner ; platform_machine != "s390x"
networkx>=2.5.1
optree>=0.13.0
psutil

View File

@ -2709,7 +2709,6 @@ TEST(ProfilerDisableInCallbackTest, Basic) {
}
TEST(RecordDebugHandles, Basic) {
GTEST_SKIP() << "Test is flaky and sometimes hangs on CI. ";
// Enable the profiler in this thread
const std::set<torch::autograd::profiler::ActivityType> activities(
{torch::autograd::profiler::ActivityType::CPU});

View File

@ -36,9 +36,6 @@ set(NATIVERT_TEST_SRCS
${TORCH_ROOT}/torch/nativert/kernels/AutoFunctionalizeKernel.cpp
${TORCH_ROOT}/torch/nativert/kernels/CallTorchBindKernel.cpp
${TORCH_ROOT}/torch/nativert/kernels/HigherOrderKernel.cpp
${TORCH_ROOT}/torch/nativert/graph/passes/SubgraphRewriter.cpp
${TORCH_ROOT}/torch/nativert/graph/passes/pass_manager/GraphPasses.cpp
${TORCH_ROOT}/torch/nativert/graph/passes/pass_manager/PassManager.cpp
)
add_executable(test_nativert

View File

@ -1,33 +0,0 @@
#include <gtest/gtest.h>
#include <torch/nativert/graph/Graph.h>
#include <torch/nativert/graph/passes/pass_manager/PassManager.h>
#include <torch/csrc/jit/testing/file_check.h>
using namespace ::testing;
using namespace torch::nativert;
TEST(PassManagerTest, TestEmptyPass) {
GraphPassManager manager({"EmptyPass"});
EXPECT_FALSE(manager.run(Graph::createGraph().get()));
}
TEST(PassPipelineTest, TestConcat) {
GraphPassPipeline p1({"test"});
EXPECT_EQ(p1.size(), 1);
EXPECT_EQ(p1.at(0), "test");
p1.concat({"test1", "test2"});
EXPECT_EQ(p1.at(0), "test");
EXPECT_EQ(p1.at(1), "test1");
EXPECT_EQ(p1.at(2), "test2");
}
TEST(PassPipelineTest, TestPushFront) {
GraphPassPipeline p1({"test"});
EXPECT_EQ(p1.size(), 1);
EXPECT_EQ(p1.at(0), "test");
p1.push_front("test1");
EXPECT_EQ(p1.at(0), "test1");
EXPECT_EQ(p1.at(1), "test");
}

View File

@ -288,16 +288,6 @@ void boxed_empty_like(StableIValue* stack, uint64_t num_args, uint64_t num_outpu
stack[0] = from(res);
}
bool my_is_cpu(Tensor t) {
return t.is_cpu();
}
void boxed_my_is_cpu(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
auto res = my_is_cpu(to<Tensor>(stack[0]));
stack[0] = from(res);
}
Tensor fill_infinity(Tensor t) {
auto value = std::numeric_limits<float>::infinity();
return fill_(t, value);
@ -354,7 +344,6 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
m.impl("my_transpose", &boxed_my_transpose);
m.impl("my_empty_like", &boxed_empty_like);
m.impl("fill_infinity", &boxed_fill_infinity);
m.impl("my_is_cpu", &boxed_my_is_cpu);
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeImplicitAutograd, m) {
@ -373,8 +362,6 @@ void boxed_my_zero_(StableIValue* stack, uint64_t num_args, uint64_t num_outputs
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
m.def("my_zero_(Tensor(a!) t) -> Tensor(a!)");
m.def("my_is_cpu(Tensor t) -> bool");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {

View File

@ -51,19 +51,6 @@ def my_abs(t) -> Tensor:
return torch.ops.libtorch_agnostic.my_abs.default(t)
def my_is_cpu(t) -> bool:
"""
Returns is_cpu on the input tensor.
Args:
t: any Tensor
Returns:
a bool
"""
return torch.ops.libtorch_agnostic.my_is_cpu.default(t)
def my_ones_like(tensor, device) -> Tensor:
"""
Returns a new Tensor like the input tensor, but with all ones

View File

@ -209,13 +209,6 @@ if not IS_WINDOWS:
self.assertEqual(id(out), id(t))
self.assertEqual(out, torch.zeros_like(t))
def test_my_is_cpu(self, device):
import libtorch_agnostic
t = torch.rand(2, 7, device=device)
out = libtorch_agnostic.ops.my_is_cpu(t)
self.assertEqual(out, t.is_cpu)
def test_fill_infinity(self, device):
import libtorch_agnostic

Some files were not shown because too many files have changed in this diff.