Compare commits


1 Commit

Author SHA1 Message Date
56e1eb05b0 move some of vec into headeronly in preparation for Half.h
ghstack-source-id: febe660fe7110543f81855c4b3cbf2b33fa44040
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158976
2025-07-28 11:44:31 -07:00
1039 changed files with 20503 additions and 75361 deletions


@ -438,7 +438,9 @@ def build_torchvision(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
build_vars += (
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
@ -493,7 +495,9 @@ def build_torchdata(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
build_vars += (
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
@ -549,7 +553,9 @@ def build_torchtext(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
build_vars += (
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
@ -607,7 +613,9 @@ def build_torchaudio(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
build_vars += (
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"


@ -104,6 +104,7 @@ If your new Docker image needs a library installed from a specific pinned commit
```bash
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-new1)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
VISION=yes


@ -93,6 +93,7 @@ tag=$(echo $image | awk -F':' '{print $2}')
case "$tag" in
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11)
CUDA_VERSION=12.4
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
@ -103,6 +104,7 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
@ -113,6 +115,7 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
@ -124,6 +127,7 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
@ -135,6 +139,7 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
@ -144,8 +149,20 @@ case "$tag" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9)
CUDA_VERSION=12.6.3
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
VISION=yes
@ -154,8 +171,45 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
@ -176,7 +230,19 @@ case "$tag" in
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3)
pytorch-linux-jammy-py3.11-clang12)
ANACONDA_PYTHON_VERSION=3.11
CLANG_VERSION=12
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.9-gcc9)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=9
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3)
if [[ $tag =~ "jammy" ]]; then
ANACONDA_PYTHON_VERSION=3.10
else
@ -190,9 +256,7 @@ case "$tag" in
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
if [[ $tag =~ "benchmarks" ]]; then
INDUCTOR_BENCHMARKS=yes
fi
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-noble-rocm-alpha-py3)
ANACONDA_PYTHON_VERSION=3.12
@ -204,6 +268,7 @@ case "$tag" in
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
INDUCTOR_BENCHMARKS=yes
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
;;
pytorch-linux-jammy-xpu-2025.0-py3)
@ -234,6 +299,7 @@ case "$tag" in
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
CLANG_VERSION=12
VISION=yes
TRITON=yes
@ -312,6 +378,7 @@ case "$tag" in
fi
if [[ "$image" == *cuda* ]]; then
extract_version_from_image_name cuda CUDA_VERSION
extract_version_from_image_name cudnn CUDNN_VERSION
fi
if [[ "$image" == *rocm* ]]; then
extract_version_from_image_name rocm ROCM_VERSION
@ -363,6 +430,9 @@ docker build \
--build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
--build-arg "GCC_VERSION=${GCC_VERSION}" \
--build-arg "CUDA_VERSION=${CUDA_VERSION}" \
--build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
--build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \
--build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
--build-arg "KATEX=${KATEX:-}" \
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \


@ -1 +1 @@
f7888497a1eb9e98d4c07537f0d0bcfe180d1363
11ec6354315768a85da41032535e3b7b99c5f706


@ -66,9 +66,8 @@ function do_cpython_build {
ln -s pip3 ${prefix}/bin/pip
fi
# install setuptools since python 3.12 is required to use distutils
# packaging is needed to create symlink since wheel no longer provides needed information
${prefix}/bin/pip install packaging==25.0 wheel==0.45.1 setuptools==80.9.0
local abi_tag=$(${prefix}/bin/python -c "from packaging.tags import interpreter_name, interpreter_version; import sysconfig ; from sysconfig import get_config_var; print('{0}{1}-{0}{1}{2}'.format(interpreter_name(), interpreter_version(), 't' if sysconfig.get_config_var('Py_GIL_DISABLED') else ''))")
${prefix}/bin/pip install wheel==0.45.1 setuptools==80.9.0
local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))")
ln -sf ${prefix} /opt/python/${abi_tag}
}
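One side of this hunk computes the /opt/python symlink name with wheel.pep425tags, the other with packaging.tags plus the Py_GIL_DISABLED sysconfig flag. A standalone sketch of the packaging-based form (prints e.g. cp312-cp312, or cp313-cp313t on a free-threaded interpreter):

```python
import sysconfig
from packaging.tags import interpreter_name, interpreter_version

# A trailing "t" marks a free-threaded (Py_GIL_DISABLED) build, e.g. cp313-cp313t.
suffix = "t" if sysconfig.get_config_var("Py_GIL_DISABLED") else ""
abi_tag = "{0}{1}-{0}{1}{2}".format(interpreter_name(), interpreter_version(), suffix)
print(abi_tag)  # e.g. cp312-cp312
```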


@ -68,8 +68,8 @@ function install_nvshmem {
# download, unpack, install
wget -q "${url}"
tar xf "${filename}.tar.gz"
cp -a "libnvshmem/include/"* /usr/local/cuda/include/
cp -a "libnvshmem/lib/"* /usr/local/cuda/lib64/
cp -a "libnvshmem/include/"* /usr/local/include/
cp -a "libnvshmem/lib/"* /usr/local/lib/
# cleanup
cd ..


@ -0,0 +1,26 @@
#!/bin/bash
if [[ -n "${CUDNN_VERSION}" ]]; then
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn
pushd tmp_cudnn
if [[ ${CUDA_VERSION:0:4} == "12.9" || ${CUDA_VERSION:0:4} == "12.8" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
else
print "Unsupported CUDA version ${CUDA_VERSION}"
exit 1
fi
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
tar xf ${CUDNN_NAME}.tar.xz
cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/
cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/
popd
rm -rf tmp_cudnn
ldconfig
fi
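The new install script picks a cuDNN archive from the CUDA version prefix; all of the CUDA 12.x branches shown currently resolve to the same 9.10.2.21 archive. A hedged Python rendering of that selection (archive names copied from the script, helper name illustrative):

```python
def cudnn_archive(cuda_version: str) -> str:
    # Mirrors the prefix checks in the script above.
    if cuda_version[:4] in ("12.9", "12.8", "12.6", "12.4"):
        return "cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
    if cuda_version[:2] == "11":
        return "cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
    raise SystemExit(f"Unsupported CUDA version {cuda_version}")

print(cudnn_archive("12.8.1"))  # -> cudnn-linux-x86_64-9.10.2.21_cuda12-archive
```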


@ -15,37 +15,11 @@ function install_timm() {
commit=$(get_pinned_commit timm)
pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
}
function install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
python install.py --continue_on_fail
# TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
# is regressing speedup metric. This needs to be investigated further
pip install transformers==4.38.1
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
chown -R jenkins torchbench
# Clean up
conda_run pip uninstall -y torch torchvision triton
}
# Pango is needed for weasyprint which is needed for doctr
conda_install pango
# Stable packages are ok here, just to satisfy TorchBench check
pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
install_torchbench
install_huggingface
install_timm
# Clean up
conda_run pip uninstall -y torch torchvision torchaudio triton


@ -34,27 +34,18 @@ function install_ubuntu() {
# The xpu-smi packages
apt-get install -y flex bison xpu-smi
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
# Compute and Media Runtimes
apt-get install -y \
intel-opencl-icd intel-level-zero-gpu level-zero \
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
else # rolling driver
apt-get install -y \
intel-opencl-icd libze-intel-gpu1 libze1 \
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
# Compute and Media Runtimes
apt-get install -y \
intel-opencl-icd intel-level-zero-gpu level-zero \
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
apt-get install -y intel-ocloc
fi
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
# Install Intel Support Packages
apt-get install -y ${XPU_PACKAGES}
@ -139,11 +130,11 @@ function install_sles() {
}
# Default use GPU driver rolling releases
XPU_DRIVER_VERSION=""
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
# Use GPU driver LTS releases
XPU_DRIVER_VERSION="/lts/2350"
# Default use GPU driver LTS releases
XPU_DRIVER_VERSION="/lts/2350"
if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
# Use GPU driver rolling releases
XPU_DRIVER_VERSION=""
fi
# Default use Intel® oneAPI Deep Learning Essentials 2025.0


@ -63,12 +63,11 @@ lark==0.12.0
#Pinned versions: 0.12.0
#test that import:
librosa>=0.6.2 ; python_version < "3.11" and platform_machine != "s390x"
librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
librosa>=0.6.2 ; python_version < "3.11"
librosa==0.10.2 ; python_version == "3.12"
#Description: A python package for music and audio analysis
#Pinned versions: >=0.6.2
#test that import: test_spectral_ops.py
#librosa depends on numba; disable it for s390x while numba is disabled too
#mkl #this breaks linux-bionic-rocm4.5-py3.7
#Description: Intel oneAPI Math Kernel Library
@ -111,15 +110,14 @@ ninja==1.11.1.3
#Pinned versions: 1.11.1.3
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
numba==0.49.0 ; python_version < "3.9"
numba==0.55.2 ; python_version == "3.9"
numba==0.55.2 ; python_version == "3.10"
numba==0.60.0 ; python_version == "3.12"
#Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
#test that import: test_numba_integration.py
#For numba issue see https://github.com/pytorch/pytorch/issues/51511
#Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073
#numpy
#Description: Provides N-dimensional arrays and linear algebra
@ -309,7 +307,7 @@ pytest-cpp==2.3.0
#Pinned versions: 2.3.0
#test that import:
z3-solver==4.15.1.0 ; platform_machine != "s390x"
z3-solver==4.15.1.0
#Description: The Z3 Theorem Prover Project
#Pinned versions:
#test that import:
@ -363,6 +361,7 @@ pwlf==2.2.1
#Pinned versions: 2.2.1
#test that import: test_sac_estimator.py
# To build PyTorch itself
pyyaml
pyzstd
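The `platform_machine != "s390x"` qualifiers that appear and disappear throughout this requirements diff are standard PEP 508 environment markers. A small sketch of how pip-style tooling evaluates one, using the packaging library:

```python
from packaging.markers import Marker

marker = Marker('python_version == "3.12" and platform_machine != "s390x"')
# With no argument the marker is evaluated against the current interpreter;
# passing an environment dict simulates another platform.
print(marker.evaluate())
print(marker.evaluate({"python_version": "3.12", "platform_machine": "s390x"}))  # False
```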


@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
@ -50,8 +50,8 @@ IPython==8.12.0
#Pinned versions: 8.12.0
myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch and torch.compile docs.
#Pinned versions: 0.17.2
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 0.13.2
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
@ -59,3 +59,4 @@ sphinx-copybutton==0.5.0
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
myst-parser==0.18.1
myst-nb


@ -98,9 +98,8 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
# (optional) Install non-default Ninja version
ARG NINJA_VERSION


@ -98,9 +98,8 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
ARG TRITON
ARG TRITON_CPU


@ -138,11 +138,28 @@ fi
echo "Calling setup.py bdist at $(date)"
time CMAKE_ARGS=${CMAKE_ARGS[@]} \
EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 \
BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \
BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
CMAKE_FRESH=1 python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
else
time CMAKE_ARGS=${CMAKE_ARGS[@]} \
EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
fi
echo "Finished setup.py bdist at $(date)"
# Build libtorch packages
@ -255,6 +272,10 @@ ls /tmp/$WHEELHOUSE_DIR
mkdir -p "/$WHEELHOUSE_DIR"
mv /tmp/$WHEELHOUSE_DIR/torch*linux*.whl /$WHEELHOUSE_DIR/
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
mv /tmp/$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/ || true
fi
if [[ -n "$BUILD_PYTHONLESS" ]]; then
mkdir -p /$LIBTORCH_HOUSE_DIR
mv /tmp/$LIBTORCH_HOUSE_DIR/*.zip /$LIBTORCH_HOUSE_DIR
@ -431,8 +452,16 @@ if [[ -z "$BUILD_PYTHONLESS" ]]; then
pushd $PYTORCH_ROOT/test
# Install the wheel for this Python version
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
pip uninstall -y "$TORCH_NO_PYTHON_PACKAGE_NAME" || true
fi
pip uninstall -y "$TORCH_PACKAGE_NAME"
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
pip install "$TORCH_NO_PYTHON_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
fi
pip install "$TORCH_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
# Print info on the libraries installed in this wheel


@ -194,7 +194,7 @@ ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library
ROCBLAS_LIB_DST=lib/rocblas/library
ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH)
ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx)
ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $ROCBLAS_OTHER_FILES)
ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES)
# hipblaslt library files
HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library


@ -50,9 +50,6 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
export ATEN_THREADING=NATIVE
fi
# Enable LLVM dependency for TensorExpr testing
export USE_LLVM=/opt/llvm
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
if ! which conda; then
# In ROCm CIs, we are doing cross compilation on build machines with
@ -176,7 +173,7 @@ fi
# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
# memory to build and will OOM
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1; then
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j 2"
fi
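Both variants of the FlashAttention gate ask whether TORCH_CUDA_ARCH_LIST contains a compute capability of 8.0 or higher, one checking every entry through bc and the other only the first expression. A hedged Python rendering of the per-entry form (assuming purely numeric entries; the arch list below is hypothetical):

```python
# Hypothetical arch list, for illustration only.
torch_cuda_arch_list = "7.5 8.0 9.0"

# True if any listed compute capability is >= 8.0, which is when the
# flash_attention build step gets enabled.
builds_flash_attention = any(float(arch) >= 8.0 for arch in torch_cuda_arch_list.split())
print(builds_flash_attention)  # -> True
```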
@ -192,7 +189,6 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
export USE_ASAN=1
export REL_WITH_DEB_INFO=1
export UBSAN_FLAGS="-fno-sanitize-recover=all"
unset USE_LLVM
fi
if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then
@ -265,13 +261,22 @@ else
WERROR=1 python setup.py clean
WERROR=1 python setup.py bdist_wheel
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
python3 tools/packaging/split_wheel.py bdist_wheel
else
WERROR=1 python setup.py bdist_wheel
fi
else
python setup.py clean
if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
source .ci/pytorch/install_cache_xla.sh
fi
python setup.py bdist_wheel
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
echo "USE_SPLIT_BUILD cannot be used with xla or rocm"
exit 1
else
python setup.py bdist_wheel
fi
fi
pip_install_whl "$(echo dist/*.whl)"


@ -229,6 +229,7 @@ function install_torchrec_and_fbgemm() {
pip_install tabulate # needed for newer fbgemm
pip_install patchelf # needed for rocm fbgemm
pushd /tmp
local wheel_dir=dist/fbgemm_gpu
local found_whl=0
@ -244,7 +245,7 @@ function install_torchrec_and_fbgemm() {
if [ "${found_whl}" == "0" ]; then
git clone --recursive https://github.com/pytorch/fbgemm
pushd fbgemm/fbgemm_gpu
git checkout "${fbgemm_commit}" --recurse-submodules
git checkout "${fbgemm_commit}"
python setup.py bdist_wheel \
--build-variant=rocm \
-DHIP_ROOT_DIR="${ROCM_PATH}" \
@ -263,6 +264,7 @@ function install_torchrec_and_fbgemm() {
done
rm -rf fbgemm
popd
else
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
@ -281,6 +283,30 @@ function clone_pytorch_xla() {
fi
}
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
if [ "$1" ]; then
python install.py --continue_on_fail models "$@"
else
# Occasionally the installation may fail on one model but it is ok to continue
# to install and test other models
python install.py --continue_on_fail
fi
# TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
# is regressing speedup metric. This needs to be investigated further
pip install transformers==4.38.1
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
function install_torchao() {
local commit
commit=$(get_pinned_commit torchao)


@ -157,29 +157,6 @@ test_jit_hooks() {
assert_git_not_dirty
}
# Shellcheck doesn't like it when you pass no arguments to a function
# that can take args. See https://www.shellcheck.net/wiki/SC2120
# shellcheck disable=SC2120
checkout_install_torchbench() {
local commit
commit=$(cat .ci/docker/ci_commit_pins/torchbench.txt)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
if [ "$1" ]; then
python install.py --continue_on_fail models "$@"
else
# Occasionally the installation may fail on one model but it is ok to continue
# to install and test other models
python install.py --continue_on_fail
fi
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
torchbench_setup_macos() {
git clone --recursive https://github.com/pytorch/vision torchvision
git clone --recursive https://github.com/pytorch/audio torchaudio
@ -202,6 +179,8 @@ torchbench_setup_macos() {
USE_OPENMP=0 python setup.py develop
popd
# Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120
# shellcheck disable=SC2119,SC2120
checkout_install_torchbench
}


@ -627,8 +627,6 @@ test_perf_for_dashboard() {
device=cuda_a10g
elif [[ "${TEST_CONFIG}" == *h100* ]]; then
device=cuda_h100
elif [[ "${TEST_CONFIG}" == *b200* ]]; then
device=cuda_b200
elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
device=rocm
fi
@ -803,16 +801,6 @@ test_dynamo_benchmark() {
if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
# TODO (huydhn): Just smoke test some sample models
if [[ "${TEST_CONFIG}" == *b200* ]]; then
if [[ "${suite}" == "huggingface" ]]; then
export TORCHBENCH_ONLY_MODELS="DistillGPT2"
elif [[ "${suite}" == "timm_models" ]]; then
export TORCHBENCH_ONLY_MODELS="inception_v3"
elif [[ "${suite}" == "torchbench" ]]; then
export TORCHBENCH_ONLY_MODELS="hf_Bert"
fi
fi
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
else
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
@ -1051,20 +1039,10 @@ test_libtorch_api() {
mkdir -p $TEST_REPORTS_DIR
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
"$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
else
# Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"
# On s390x, pytorch is built without llvm.
# Even if it would be built with llvm, llvm currently doesn't support used features on s390x and
# test fails with errors like:
# JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer
# unknown file: Failure
# C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) }
if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
fi
fi
# quantization is not fully supported on s390x yet
@ -1684,11 +1662,13 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
elif [[ "${TEST_CONFIG}" == cachebench ]]; then
install_torchaudio
install_torchvision
PYTHONPATH=/torchbench test_cachebench
checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco
PYTHONPATH=$(pwd)/torchbench test_cachebench
elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
install_torchaudio
install_torchvision
PYTHONPATH=/torchbench test_verify_cachebench
checkout_install_torchbench nanogpt
PYTHONPATH=$(pwd)/torchbench test_verify_cachebench
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
install_torchaudio
install_torchvision
@ -1697,22 +1677,28 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
# https://github.com/opencv/opencv-python/issues/885
pip_install opencv-python==4.8.0.74
if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
PYTHONPATH=/torchbench test_inductor_torchbench_smoketest_perf
checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
PYTHONPATH=/torchbench test_inductor_torchbench_cpu_smoketest_perf
checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \
llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \
functorch_maml_omniglot yolov3 mobilenet_v2 resnext50_32x4d densenet121 mnasnet1_0
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf
elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
TORCHBENCHPATH=/torchbench test_torchbench_gcp_smoketest
checkout_install_torchbench
TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest
else
checkout_install_torchbench
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti
fi


@ -192,6 +192,9 @@ retry brew install libomp
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
export USE_DISTRIBUTED=1
if [[ -n "$CROSS_COMPILE_ARM64" ]]; then
export CMAKE_OSX_ARCHITECTURES=arm64
fi
export USE_MKLDNN=OFF
export USE_QNNPACK=OFF
export BUILD_TEST=OFF
@ -199,7 +202,16 @@ export BUILD_TEST=OFF
pushd "$pytorch_rootdir"
echo "Calling setup.py bdist_wheel at $(date)"
python setup.py bdist_wheel -d "$whl_tmp_dir"
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel -d "$whl_tmp_dir"
echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 CMAKE_FRESH=1 python setup.py bdist_wheel -d "$whl_tmp_dir"
echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
else
python setup.py bdist_wheel -d "$whl_tmp_dir"
fi
echo "Finished setup.py bdist_wheel at $(date)"


@ -65,8 +65,16 @@ fi
if [[ "$PACKAGE_TYPE" != libtorch ]]; then
if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
pkg_no_python="$(ls -1 /final_pkgs/torch_no_python* | sort |tail -1)"
pkg_torch="$(ls -1 /final_pkgs/torch-* | sort |tail -1)"
# todo: after folder is populated use the pypi_pkg channel instead
pip install "\$pkg_no_python" "\$pkg_torch" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}_pypi_pkg"
retry pip install -q numpy protobuf typing-extensions
else
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
fi
else
pip install "\$pkg"
retry pip install -q numpy protobuf typing-extensions


@ -134,6 +134,7 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}"
export DESIRED_CUDA="$DESIRED_CUDA"
export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}"
export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}"
export USE_SPLIT_BUILD="${USE_SPLIT_BUILD:-}"
if [[ "${OSTYPE}" == "msys" ]]; then
export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}"
if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then


@ -23,6 +23,10 @@ if [[ "${DRY_RUN}" = "disabled" ]]; then
AWS_S3_CP="aws s3 cp"
fi
if [[ "${USE_SPLIT_BUILD:-false}" == "true" ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_pkg"
fi
# this is special build with all dependencies packaged
if [[ ${BUILD_NAME} == *-full* ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_full"


@ -53,12 +53,16 @@ self-hosted-runner:
- linux.rocm.gpu.mi250
- linux.rocm.gpu.2
- linux.rocm.gpu.4
# gfx942 runners
- linux.rocm.gpu.gfx942.2
- linux.rocm.gpu.gfx942.4
# MI300 runners
- linux.rocm.gpu.mi300.2
- linux.rocm.gpu.mi300.4
- rocm-docker
# Repo-specific Apple hosted runners
- macos-m1-ultra
- macos-m2-14
# Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
- macos-m1-stable
- macos-m1-13
- macos-m1-14
# GitHub-hosted MacOS runners
- macos-latest-xlarge


@ -24,6 +24,7 @@ runs:
-e PYTORCH_FINAL_PACKAGE_DIR \
-e PYTORCH_ROOT \
-e SKIP_ALL_TESTS \
-e USE_SPLIT_BUILD \
--tty \
--detach \
-v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \


@ -1 +1 @@
0c22347335f4c9a5b92a2f5bad65e05e2464c184
f6dfe1231dcdd221a68416e49ab85c2575cbb824


@ -1 +1 @@
7e3a8dc90670fd312ce1e0d4eba9bf11c571e3ad
8f605ee30912541126c0fe46d0c8c413101b600a


@ -1 +1 @@
b6a5b82b9948b610fa4c304d0d869c82b8f17db1
29ae4c76c026185f417a25e841d2cd5e65f087a3


@ -488,10 +488,6 @@
- torch/_dynamo/**
- torch/csrc/dynamo/**
- test/dynamo/**
- test/dynamo_expected_failures/**
- test/dynamo_skips/**
- test/inductor_expected_failures/**
- test/inductor_skips/**
approved_by:
- guilhermeleobas
mandatory_checks_name:


@ -2,7 +2,7 @@ boto3==1.35.42
cmake==3.27.*
expecttest==0.3.0
fbscribelogger==0.1.7
filelock==3.18.0
filelock==3.13.1
hypothesis==6.56.4
librosa>=0.6.2
mpmath==1.3.0


@ -193,7 +193,7 @@ LIBTORCH_CONTAINER_IMAGES: dict[str, str] = {
"cpu": "libtorch-cxx11-builder:cpu",
}
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]
def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
@ -273,6 +273,7 @@ def generate_wheels_matrix(
os: str,
arches: Optional[list[str]] = None,
python_versions: Optional[list[str]] = None,
use_split_build: bool = False,
) -> list[dict[str, str]]:
package_type = "wheel"
if os == "linux" or os == "linux-aarch64" or os == "linux-s390x":
@ -314,11 +315,15 @@ def generate_wheels_matrix(
# TODO: Enable python 3.13t on cpu-s390x
if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
continue
# TODO: Enable python 3.14 on non linux OSes
if os != "linux" and (
python_version == "3.14" or python_version == "3.14t"
if use_split_build and (
arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux"
):
continue
raise RuntimeError(
"Split build is only supported on linux with cuda 12* and cpu.\n"
f"Currently attempting to build on arch version {arch_version} and os {os}.\n"
"Please modify the matrix generation to exclude this combination."
)
# cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
@ -334,6 +339,7 @@ def generate_wheels_matrix(
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": desired_cuda,
"use_split_build": "True" if use_split_build else "False",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version].split(
":"
)[0],
@ -366,6 +372,7 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True" if use_split_build else "False",
"container_image": WHEEL_CONTAINER_IMAGES[
arch_version
].split(":")[0],
@ -388,6 +395,7 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True" if use_split_build else "False",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version].split(
":"
)[0],
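The use_split_build parameter added to generate_wheels_matrix is validated against a short allow-list before any configs are emitted. A minimal, self-contained sketch of that check (the allowed arches are copied from the hunk above; the helper name is illustrative):

```python
SPLIT_BUILD_ARCHES = {"12.6", "12.8", "12.9", "cpu"}

def check_split_build(arch_version: str, os: str) -> None:
    # Mirrors the RuntimeError raised in generate_wheels_matrix when
    # use_split_build is requested for an unsupported combination.
    if arch_version not in SPLIT_BUILD_ARCHES or os != "linux":
        raise RuntimeError(
            "Split build is only supported on linux with cuda 12* and cpu.\n"
            f"Currently attempting to build on arch version {arch_version} and os {os}."
        )

check_split_build("12.8", "linux")   # passes silently
# check_split_build("12.8", "macos") # would raise RuntimeError
```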


@ -59,7 +59,9 @@ class BinaryBuildWorkflow:
is_scheduled: str = ""
branches: str = "nightly"
# Mainly for macos
cross_compile_arm64: bool = False
macos_runner: str = "macos-14-xlarge"
use_split_build: bool = False
# Mainly used for libtorch builds
build_variant: str = ""
@ -70,6 +72,9 @@ class BinaryBuildWorkflow:
for item in [self.os, "binary", self.package_type, self.build_variant]
if item != ""
)
if self.use_split_build:
# added to distinguish concurrency groups
self.build_environment += "-split"
def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
output_file_path = (
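The __post_init__ change above appends "-split" to the derived build_environment so split-build workflows get their own concurrency group. A hedged dataclass sketch of that naming (class and field values below are illustrative):

```python
from dataclasses import dataclass, field

@dataclass
class WorkflowNameSketch:
    os: str
    package_type: str
    build_variant: str = ""
    use_split_build: bool = False
    build_environment: str = field(init=False, default="")

    def __post_init__(self) -> None:
        self.build_environment = "-".join(
            item for item in [self.os, "binary", self.package_type, self.build_variant] if item
        )
        if self.use_split_build:
            # added to distinguish concurrency groups
            self.build_environment += "-split"

print(WorkflowNameSketch("linux", "manywheel", use_split_build=True).build_environment)
# -> linux-binary-manywheel-split
```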
@ -112,6 +117,21 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
isolated_workflow=True,
),
),
# See https://github.com/pytorch/pytorch/issues/138750
# BinaryBuildWorkflow(
# os=OperatingSystem.LINUX,
# package_type="manywheel",
# build_configs=generate_binary_build_matrix.generate_wheels_matrix(
# OperatingSystem.LINUX,
# use_split_build=True,
# arches=["11.8", "12.1", "12.4", "cpu"],
# ),
# ciflow_config=CIFlowConfig(
# labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
# isolated_workflow=True,
# ),
# use_split_build=True,
# ),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="libtorch",
@ -155,11 +175,27 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["12.8"],
python_versions=["3.12"],
arches=["12.6", "12.8", "12.9"],
python_versions=["3.9"],
),
branches="main",
),
# See https://github.com/pytorch/pytorch/issues/138750
# BinaryBuildWorkflow(
# os=OperatingSystem.LINUX,
# package_type="manywheel",
# build_configs=generate_binary_build_matrix.generate_wheels_matrix(
# OperatingSystem.LINUX,
# arches=["11.8", "12.1", "12.4"],
# python_versions=["3.9"],
# use_split_build=True,
# ),
# ciflow_config=CIFlowConfig(
# labels={LABEL_CIFLOW_PERIODIC},
# ),
# branches="main",
# use_split_build=True,
# ),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="libtorch",
@ -302,6 +338,7 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
generate_binary_build_matrix.RELEASE,
libtorch_variants=["shared-with-deps"],
),
cross_compile_arm64=False,
macos_runner="macos-14-xlarge",
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
@ -314,6 +351,7 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.MACOS_ARM64
),
cross_compile_arm64=False,
macos_runner="macos-14-xlarge",
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},


@ -262,12 +262,7 @@ def is_exception_branch(branch: str) -> bool:
"""
Branches that get opted out of experiments by default, until they're explicitly enabled.
"""
return branch.split("/", maxsplit=1)[0] in {
"main",
"nightly",
"release",
"landchecks",
}
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
def load_yaml(yaml_text: str) -> Any:
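Both versions of is_exception_branch test only the first path segment of the branch name, so maxsplit=1 and an unbounded split return the same element [0]. A quick illustration (the branch names are hypothetical):

```python
EXCEPTION_PREFIXES = {"main", "nightly", "release", "landchecks"}

def is_exception_branch(branch: str) -> bool:
    return branch.split("/", maxsplit=1)[0] in EXCEPTION_PREFIXES

print(is_exception_branch("release/2.8"))       # True  (hypothetical branch)
print(is_exception_branch("gh/someuser/123"))   # False (hypothetical branch)
```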


@ -1891,9 +1891,7 @@ def validate_revert(
else pr.get_comment_by_id(comment_id)
)
if comment.editor_login is not None:
raise PostCommentError(
"Halting the revert as the revert comment has been edited."
)
raise PostCommentError("Don't want to revert based on edited command")
author_association = comment.author_association
author_login = comment.author_login
allowed_reverters = ["COLLABORATOR", "MEMBER", "OWNER"]


@ -47,6 +47,9 @@ env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
SKIP_ALL_TESTS: 0
{%- if cross_compile_arm64 %}
CROSS_COMPILE_ARM64: 1
{% endif %}
!{{ common.concurrency(build_environment) }}
jobs:


@ -25,6 +25,11 @@
DOCKER_IMAGE: !{{ config["container_image"] }}
DOCKER_IMAGE_TAG_PREFIX: !{{ config["container_image_tag_prefix"] }}
{%- endif %}
{%- if config["package_type"] == "manywheel" %}
{%- if config.use_split_build is defined %}
use_split_build: !{{ config["use_split_build"] }}
{%- endif %}
{%- endif %}
{%- if config["package_type"] == "libtorch" %}
{%- if config["libtorch_config"] %}
LIBTORCH_CONFIG: !{{ config["libtorch_config"] }}


@ -26,6 +26,13 @@ on:
default: 240
type: number
description: timeout for the job
use_split_build:
description: |
[Experimental] Build a libtorch only wheel and build pytorch such that
are built from the libtorch wheel.
required: false
type: boolean
default: false
ALPINE_IMAGE:
required: false
type: string
@ -110,6 +117,7 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
steps:
- name: Make the env permanent during this workflow (but not the secrets)
shell: bash
@ -134,6 +142,7 @@ jobs:
echo "PR_NUMBER=${{ env.PR_NUMBER }}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
echo "SHA1=${{ env.SHA1 }}"
echo "USE_SPLIT_BUILD=${{ env.use_split_build }}"
} >> "${GITHUB_ENV} }}"
- name: List the env
@ -252,6 +261,7 @@ jobs:
-e PYTORCH_ROOT \
-e SKIP_ALL_TESTS \
-e PYTORCH_EXTRA_INSTALL_REQUIREMENTS \
-e USE_SPLIT_BUILD \
--tty \
--detach \
-v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \


@ -64,6 +64,13 @@ on:
required: true
type: string
description: Hardware to run this job on. Valid values are linux.4xlarge, linux.4xlarge.nvidia.gpu, linux.arm64.2xlarge, and linux.rocm.gpu
use_split_build:
description: |
[Experimental] Build a libtorch only wheel and build pytorch such that
are built from the libtorch wheel.
required: false
type: boolean
default: false
secrets:
github-token:
required: true
@ -97,6 +104,7 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
steps:
- name: Make the env permanent during this workflow (but not the secrets)
shell: bash
@ -121,6 +129,7 @@ jobs:
echo "PR_NUMBER=${{ env.PR_NUMBER }}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
echo "SHA1=${{ env.SHA1 }}"
echo "USE_SPLIT_BUILD=${{ env.USE_SPLIT_BUILD }}"
} >> "${GITHUB_ENV} }}"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"


@ -51,6 +51,13 @@ on:
required: false
type: string
description: Desired python version
use_split_build:
description: |
[Experimental] Build a libtorch only wheel and build pytorch such that
are built from the libtorch wheel.
required: false
type: boolean
default: false
secrets:
github-token:
required: true
@ -79,6 +86,7 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main


@ -306,6 +306,7 @@ jobs:
-e OUR_GITHUB_JOB_ID \
-e HUGGING_FACE_HUB_TOKEN \
-e SCRIBE_GRAPHQL_ACCESS_TOKEN \
-e USE_SPLIT_BUILD \
-e BUILD_ADDITIONAL_PACKAGES \
--memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
--memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \


@ -96,7 +96,7 @@ jobs:
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
if: ${{ matrix.runner != 'B200' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
instructions: |
@ -109,7 +109,7 @@ jobs:
no-sudo: true
- name: Setup Python
if: contains(matrix.runner, 'b200')
if: matrix.runner == 'B200'
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.12'
@ -117,7 +117,7 @@ jobs:
- name: Setup Linux
uses: ./.github/actions/setup-linux
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && !contains(matrix.runner, 'b200')
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && matrix.runner != 'B200'
- name: configure aws credentials
if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
@ -128,7 +128,7 @@ jobs:
aws-region: us-east-1
- name: Login to Amazon ECR
if: ${{ inputs.aws-role-to-assume != '' && contains(matrix.runner, 'b200') }}
if: ${{ inputs.aws-role-to-assume != '' && matrix.runner == 'B200' }}
id: login-ecr
continue-on-error: true
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
@ -166,17 +166,17 @@ jobs:
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && matrix.runner != 'B200' }}
- name: Setup GPU_FLAG for docker run
id: setup-gpu-flag
run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || contains(matrix.runner, 'b200')) }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || matrix.runner == 'B200') }}
- name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
id: setup-sscache-port-flag
run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && !contains(matrix.runner, 'b200') }}
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && matrix.runner != 'B200' }}
- name: Lock NVIDIA A100 40GB Frequency
run: |
@ -277,8 +277,8 @@ jobs:
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
# Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
SCCACHE_BUCKET: ${{ matrix.runner != 'B200' && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ matrix.runner != 'B200' && 'us-east-1' || '' }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
@ -403,7 +403,7 @@ jobs:
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
- name: Authenticate with AWS
if: ${{ contains(matrix.runner, 'b200') }}
if: ${{ matrix.runner == 'B200' }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
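The conditions in this workflow switch between an exact `matrix.runner == 'B200'` comparison and a case-insensitive `contains(matrix.runner, 'b200')` substring check. A hedged Python rendering of the difference (the runner labels below are hypothetical):

```python
# Hypothetical runner labels, for illustration only.
runners = ["B200", "linux.dgx.b200.8", "linux.rocm.gpu.2"]

for runner in runners:
    exact = runner == "B200"               # matches only the literal label
    substring = "b200" in runner.lower()   # roughly what contains(matrix.runner, 'b200') accepts
    print(f"{runner:20} exact={exact} substring={substring}")
```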


@ -269,8 +269,8 @@ jobs:
# copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
- name: Change permissions (only needed for kubernetes runners for now)
if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'gfx942') || contains(matrix.runner, 'mi355')) }}
- name: Change permissions (only needed for MI300 and MI355 kubernetes runners for now)
if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'mi300') || contains(matrix.runner, 'mi355')) }}
run: |
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"


@ -50,7 +50,7 @@ jobs:
strategy:
fail-fast: false
matrix:
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ]
device: ["cuda", "rocm", "xpu", "aarch64"]
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
include:
@ -126,12 +126,6 @@ jobs:
3.13t)
PYTHON_EXECUTABLE=/opt/python/cp313-cp313t/bin/python
;;
3.14)
PYTHON_EXECUTABLE=/opt/python/cp314-cp314/bin/python
;;
3.14t)
PYTHON_EXECUTABLE=/opt/python/cp314-cp314t/bin/python
;;
*)
echo "Unsupported python version ${PY_VERS}"
exit 1
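The case statement in this hunk resolves a matrix python version to a manylinux interpreter path. A hedged Python rendering of that mapping (only the 3.13t/3.14/3.14t paths appear in the diff; the 3.12 and 3.13 entries are inferred from the same /opt/python layout):

```python
PYTHON_EXECUTABLES = {
    "3.12": "/opt/python/cp312-cp312/bin/python",    # assumed, not shown in this hunk
    "3.13": "/opt/python/cp313-cp313/bin/python",    # assumed, not shown in this hunk
    "3.13t": "/opt/python/cp313-cp313t/bin/python",
    "3.14": "/opt/python/cp314-cp314/bin/python",
    "3.14t": "/opt/python/cp314-cp314t/bin/python",
}

def python_executable(py_vers: str) -> str:
    try:
        return PYTHON_EXECUTABLES[py_vers]
    except KeyError:
        raise SystemExit(f"Unsupported python version {py_vers}")

print(python_executable("3.14t"))  # -> /opt/python/cp314-cp314t/bin/python
```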


@ -34,8 +34,7 @@ jobs:
contents: read
pull-requests: write
name: Check labels
# Disabling the job until https://github.com/pytorch/pytorch/issues/159825 is resolved
if: github.repository_owner == 'pytorch' && false
if: github.repository_owner == 'pytorch'
runs-on: linux.24_04.4x
steps:
- name: Checkout PyTorch


@ -7,8 +7,7 @@ on:
jobs:
ghstack-mergeability-check:
# Disabling the job until https://github.com/pytorch/pytorch/issues/159825 is resolved
if: github.repository_owner == 'pytorch' && false
if: github.repository_owner == 'pytorch'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2


@ -51,17 +51,21 @@ jobs:
docker-image-name: [
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.9-clang12,
pytorch-linux-jammy-py3.11-clang12,
pytorch-linux-jammy-py3.12-clang12,
pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-rocm-n-py3,
pytorch-linux-noble-rocm-n-py3,
pytorch-linux-noble-rocm-alpha-py3,
pytorch-linux-jammy-rocm-n-py3-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
pytorch-linux-jammy-py3.9-gcc11,
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,
@ -72,8 +76,7 @@ jobs:
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
# Executorch pin needs update
# pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu
]
include:


@ -60,6 +60,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -83,6 +84,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -106,6 +108,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-aarch64
secrets:
@ -126,6 +129,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -152,6 +156,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda-aarch64-12_9
secrets:
@ -171,6 +176,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -194,6 +200,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -217,6 +224,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-aarch64
secrets:
@ -237,6 +245,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -263,6 +272,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda-aarch64-12_9
secrets:
@ -282,6 +292,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -305,6 +316,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -328,6 +340,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-aarch64
secrets:
@ -348,6 +361,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -374,6 +388,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda-aarch64-12_9
secrets:
@ -393,6 +408,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -416,6 +432,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -439,6 +456,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-aarch64
secrets:
@ -459,6 +477,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -485,6 +504,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda-aarch64-12_9
secrets:
@ -504,6 +524,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -527,6 +548,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -550,6 +572,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-aarch64
secrets:
@ -570,6 +593,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -596,6 +620,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda-aarch64-12_9
secrets:
@ -615,6 +640,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -638,6 +664,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -661,6 +688,7 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cpu-aarch64
secrets:
@ -681,6 +709,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -707,6 +736,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
secrets:

View File

@ -42,7 +42,54 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_12-cuda12_8-build:
manywheel-py3_9-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: 12.6
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_6-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: 12.6
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
@ -56,17 +103,18 @@ jobs:
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.12"
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
manywheel-py3_9-cuda12_8-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_12-cuda12_8-build
- manywheel-py3_9-cuda12_8-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
@ -79,8 +127,56 @@ jobs:
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_8
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner

File diff suppressed because it is too large

View File

@ -58,6 +58,7 @@ jobs:
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-rocm6_4
@ -82,6 +83,7 @@ jobs:
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
use_split_build: False
DESIRED_PYTHON: "3.9"
steps:
- name: Setup ROCm

View File

@ -60,6 +60,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -83,6 +84,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -105,6 +107,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
secrets:
@ -124,6 +127,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -147,6 +151,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -169,6 +174,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-s390x
secrets:
@ -188,6 +194,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -211,6 +218,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -233,6 +241,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-s390x
secrets:
@ -252,6 +261,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -275,6 +285,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -297,6 +308,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-s390x
secrets:
@ -316,6 +328,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -339,6 +352,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -361,6 +375,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-s390x
secrets:

View File

@ -1,154 +0,0 @@
name: inductor-perf-b200
on:
schedule:
- cron: 0 7 * * 1-6
- cron: 0 7 * * 0
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
inputs:
training:
description: Run training (on by default)?
required: false
type: boolean
default: true
inference:
description: Run inference (on by default)?
required: false
type: boolean
default: true
default:
description: Run inductor_default?
required: false
type: boolean
default: false
dynamic:
description: Run inductor_dynamic_shapes?
required: false
type: boolean
default: false
cppwrapper:
description: Run inductor_cpp_wrapper?
required: false
type: boolean
default: false
cudagraphs:
description: Run inductor_cudagraphs?
required: false
type: boolean
default: true
freezing_cudagraphs:
description: Run inductor_cudagraphs with freezing for inference?
required: false
type: boolean
default: false
aotinductor:
description: Run aot_inductor for inference?
required: false
type: boolean
default: false
maxautotune:
description: Run inductor_max_autotune?
required: false
type: boolean
default: false
benchmark_configs:
description: The list of configs used the benchmark
required: false
type: string
default: inductor_huggingface_perf_cuda_b200,inductor_timm_perf_cuda_b200,inductor_torchbench_perf_cuda_b200
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
build:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Use a bigger runner here because CUDA_ARCH 9.0 is only built for H100
# or newer GPUs, so it doesn't benefit much from existing compiler cache
# from trunk. Also use a memory-intensive runner here because memory is
# usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
{ config: "inductor_timm_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
{ config: "inductor_torchbench_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
test-periodically:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 1440
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@ -85,26 +85,26 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit

View File

@ -81,21 +81,21 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit

View File

@ -47,8 +47,8 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit

View File

@ -28,6 +28,7 @@ jobs:
# than our AWS macos-m1-14 runners
test-matrix: |
{ include: [
{ config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m1-13" },
{ config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m1-14" },
{ config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m2-15" },
]}

View File

@ -75,11 +75,10 @@ jobs:
repo-owner: pytorch
branch: main
pin-folder: .github/ci_commit_pins
# executorch jobs are disabled since it needs some manual work for the hash update
# - repo-name: executorch
# repo-owner: pytorch
# branch: main
# pin-folder: .ci/docker/ci_commit_pins
- repo-name: executorch
repo-owner: pytorch
branch: main
pin-folder: .ci/docker/ci_commit_pins
- repo-name: triton
repo-owner: triton-lang
branch: main

View File

@ -59,9 +59,9 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
]}
secrets: inherit

View File

@ -51,6 +51,37 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_4-py3_10-gcc11-sm89-build:
name: linux-jammy-cuda12.4-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.4-py3.10-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11
cuda-arch-list: 8.9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_4-py3_10-gcc11-sm89-test:
name: linux-jammy-cuda12.4-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_4-py3_10-gcc11-sm89-build
- target-determination
with:
build-environment: linux-jammy-cuda12.4-py3.10-gcc11-sm89
docker-image: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_4-py3_10-gcc11-build:
name: linux-jammy-cuda12.4-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml

View File

@ -254,6 +254,36 @@ jobs:
timeout-minutes: 600
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-build-distributed:
name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-test-distributed:
name: linux-jammy-cuda12.8-py3.10-gcc11-test
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-build-distributed
- target-determination
with:
timeout-minutes: 360
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-build:
name: linux-jammy-cuda12.8-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml
@ -262,18 +292,13 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '7.5 8.9'
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
secrets: inherit
@ -304,6 +329,30 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cpu-py3_10-gcc11-bazel-test:
name: linux-jammy-cpu-py3.10-gcc11-bazel-test
uses: ./.github/workflows/_bazel-build-test.yml
@ -353,8 +402,38 @@ jobs:
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm89-build:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: 8.9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm89-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm89-build
- target-determination
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
if: false # Docker build needs pin update
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
@ -379,6 +458,31 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-xpu-2025_1-py3_9-build:
name: linux-jammy-xpu-2025.1-py3.9
uses: ./.github/workflows/_linux-build.yml

View File

@ -48,12 +48,12 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit

View File

@ -3,7 +3,7 @@ name: rocm-mi355
on:
workflow_dispatch:
schedule:
- cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT
- cron: 30 9 * * * # about 2:30am PDT
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}

View File

@ -10,10 +10,6 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-default-label-prefix:
if: github.repository_owner == 'pytorch'

View File

@ -94,6 +94,7 @@ jobs:
{ config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" },
{ config: "default", shard: 2, num_shards: 3, runner: "macos-m1-stable" },
{ config: "default", shard: 3, num_shards: 3, runner: "macos-m1-stable" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-13" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-15" },
]}
@ -205,7 +206,7 @@ jobs:
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
test-matrix: |
{ include: [
{ config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },

View File

@ -12,9 +12,7 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
permissions: read-all
jobs:
# There must be at least one job here to satisfy GitHub action workflow syntax
@ -53,27 +51,3 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit

View File

@ -23,7 +23,7 @@ jobs:
with:
repository: pytorch/pytorch
stable-branch: viable/strict
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\", \"linux-aarch64\"]'
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\"]'
secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }}
clickhouse-url: ${{ secrets.CLICKHOUSE_URL }}
clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }}

View File

@ -164,7 +164,7 @@ init_command = [
'types-setuptools==79.0.0.20250422',
'types-jinja2==2.11.9',
'types-colorama==0.4.6',
'filelock==3.18.0',
'filelock==3.13.1',
'junitparser==2.1.1',
'rich==14.1.0',
'pyyaml==6.0.2',
@ -1452,6 +1452,8 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'--no-black-binary',
'black==23.12.1',
'usort==1.0.8.post1',
'isort==6.0.1',
'ruff==0.12.2', # sync with RUFF

View File

@ -1,17 +1 @@
- This is the only AGENTS.md, there are no recursive AGENTS.md
- When you are working on a bug, first create a standalone file that
reproduces the bug and verify it fails in the expected way. Use this to
test if your changes work. Once the change is passing, find an appropriate
test file to add the test to and make sure to follow local conventions on
the test file.
- If you are running the real test suite, DO NOT run the entire test suite.
Instead run only a single test case, e.g., 'python test/test_torch.py TestTorch.test_dir'
- Do NOT run setup.py, you do not have a working build environment
- Do NOT run pre-commit, it is not setup
- To run lint, run 'lintrunner -a' (which will autoapply changes)
- Do NOT attempt to install dependencies, you do not have Internet access
- When you are ready to make a PR, do exactly these steps:
- git stash -u
- git reset --hard $(cat /tmp/orig_work.txt) # NB: reset to the LOCAL branch, do NOT fetch
- git stash pop
- Resolve conflicts if necessary

View File

@ -679,7 +679,6 @@ cc_library(
[
"torch/*.h",
"torch/csrc/**/*.h",
"torch/nativert/**/*.h",
"torch/csrc/distributed/c10d/**/*.hpp",
"torch/lib/libshm/*.h",
],

View File

@ -564,7 +564,7 @@ if(MSVC)
set(CMAKE_NINJA_CMCLDEPS_RC OFF)
if(MSVC_Z7_OVERRIDE)
# CMake set debug flags to use /Z7
set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$<CONFIG:Debug,RelWithDebInfo>:Embedded>")
set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded)
endif()
foreach(
flag_var
@ -872,14 +872,6 @@ cmake_dependent_option(
"USE_CUDA OR USE_ROCM;NOT MSVC"
OFF)
cmake_dependent_option(
USE_FBGEMM_GENAI
"Whether to build FBGEMM GenAI quantized GEMM kernels.\
Will be disabled if not supported by the platform"
OFF
"USE_CUDA OR USE_ROCM"
OFF)
# CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem
# Eff Attention won't
cmake_dependent_option(
@ -913,10 +905,6 @@ if(USE_FBGEMM)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM")
endif()
if(USE_FBGEMM_GENAI)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM_GENAI")
endif()
if(USE_PYTORCH_QNNPACK)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK")
endif()

View File

@ -14,6 +14,7 @@
/torch/csrc/autograd/ @albanD @soulitzer
/torch/autograd/ @albanD @soulitzer
/tools/autograd/ @albanD @soulitzer
/torch/header_only_apis.txt @janeyx99
/torch/nn/ @albanD @jbschlosser @mikaylagawarecki
/torch/optim/ @albanD @janeyx99
/test/test_public_bindings.py @albanD
@ -50,12 +51,12 @@ nn/qat/ @jerryzh168
/torch/csrc/distributed/c10d/Ops.* @kwen2501
# ONNX Export
/torch/_dynamo/backends/onnxrt.py @titaiwangms @xadupre @justinchuby
/torch/csrc/jit/passes/onnx.h @titaiwangms @xadupre
/torch/csrc/jit/passes/onnx.cpp @titaiwangms @xadupre
/torch/csrc/jit/passes/onnx/ @titaiwangms @xadupre
/torch/onnx/ @titaiwangms @xadupre @justinchuby
/test/onnx/ @titaiwangms @xadupre @justinchuby
/torch/_dynamo/backends/onnxrt.py @wschin
/torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1
/torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1
/torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1
/torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin
/test/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin
# CI
/.ci @pytorch/pytorch-dev-infra
@ -195,8 +196,3 @@ torch/backends/cudnn/ @eqy @syed-ahmed
/torch/utils/_cxx_pytree.py @XuehaiPan
/torch/utils/pytree/ @XuehaiPan
/torch/_dynamo/polyfills/pytree.py @XuehaiPan
# Relating to libtorch ABI
/torch/csrc/stable/ @janeyx99 @mikaylagawarecki
/torch/headeronly/ @janeyx99
/torch/header_only_apis.txt @janeyx99

View File

@ -276,7 +276,7 @@ conda install pkg-config libuv
pip install mkl-static mkl-include
# Add these packages if torch.distributed is needed.
# Distributed package support on Windows is a prototype feature and is subject to changes.
conda install -c conda-forge libuv
conda install -c conda-forge libuv=1.39
```
#### Install PyTorch

View File

@ -119,8 +119,6 @@ file(GLOB_RECURSE native_mps_cpp "native/mps/*.cpp")
file(GLOB_RECURSE native_mps_mm "native/mps/*.mm")
file(GLOB_RECURSE native_mps_metal "native/mps/*.metal")
file(GLOB_RECURSE native_mps_h "native/mps/*.h")
file(GLOB_RECURSE native_sparse_mps_mm "native/sparse/mps/*.mm")
file(GLOB_RECURSE native_mps_sparse_metal "native/sparse/mps/*.metal")
file(GLOB native_sparse_cpp "native/sparse/*.cpp")
file(GLOB native_quantized_cpp
@ -249,50 +247,6 @@ if(USE_MEM_EFF_ATTENTION)
list(APPEND ATen_ATTENTION_KERNEL_SRCS ${mem_eff_attention_cuda_kernels_cu})
endif()
IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF")
set(USE_FBGEMM_GENAI off)
endif()
# FBGEMM GenAI
IF(USE_FBGEMM_GENAI)
set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/)
set(FBGEMM_GENAI_DIR ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize)
if(USE_ROCM)
# Only include the kernels we want to build to avoid increasing binary size.
file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
"${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
"${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-amdgpu-coerce-illegal-types=1
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
target_include_directories(fbgemm_genai PUBLIC
# FBGEMM version of Composable Kernel is used due to some customizations
${FBGEMM_THIRD_PARTY}/composable_kernel/include
${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
${FBGEMM_GENAI_DIR}/include/
${FBGEMM_GENAI_DIR}/common/include/
)
endif()
endif()
# XNNPACK
file(GLOB native_xnnpack "native/xnnpack/*.cpp")
@ -441,7 +395,6 @@ if(USE_ROCM)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/example/ck_tile/01_fmha)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include)
_pytorch_rocm_generate_ck_conf()
@ -701,25 +654,29 @@ endif()
if(USE_MPS)
include(../../../cmake/Metal.cmake)
set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h} ${native_sparse_mps_mm})
set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h})
if(CAN_COMPILE_METAL)
foreach(SHADER ${native_mps_metal} ${native_mps_sparse_metal})
foreach(SHADER ${native_mps_metal})
cmake_path(GET SHADER STEM TGT_STEM)
string(CONCAT TGT_BASIC ${TGT_STEM} "_31.air")
string(CONCAT TGT_BASIC ${TGT_STEM} "_30.air")
string(CONCAT TGT_BFLOAT ${TGT_STEM} "_31.air")
list(APPEND AIR_BASIC ${TGT_BASIC})
metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.1")
list(APPEND AIR_BFLOAT ${TGT_BFLOAT})
metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.0")
metal_to_air(${SHADER} ${TGT_BFLOAT} "-std=metal3.1")
endforeach()
air_to_metallib(kernels_basic.metallib ${AIR_BASIC})
air_to_metallib(kernels_bfloat.metallib ${AIR_BFLOAT})
add_custom_command(
COMMAND echo "// $$(date)" > metallib_dummy.cpp
DEPENDS kernels_basic.metallib
DEPENDS kernels_basic.metallib kernels_bfloat.metallib
OUTPUT metallib_dummy.cpp
COMMENT "Updating metallibs timestamp")
add_custom_target(metallibs DEPENDS kernels_basic.metallib metallib_dummy.cpp)
add_custom_target(metallibs DEPENDS kernels_basic.metallib kernels_bfloat.metallib metallib_dummy.cpp)
else()
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/native/mps")
foreach(SHADER ${native_mps_metal} ${native_mps_sparse_metal})
foreach(SHADER ${native_mps_metal})
cmake_path(GET SHADER STEM TGT_STEM)
string(CONCAT SHADER_HDR_NAME "${CMAKE_CURRENT_BINARY_DIR}" /native/mps/ ${TGT_STEM} "_metallib.h")
metal_to_metallib_h(${SHADER} ${SHADER_HDR_NAME})

View File

@ -31,9 +31,7 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) {
return at::globalContext().getPinnedMemoryAllocator(opt_device_type);
} else {
TORCH_CHECK(
false,
"pin_memory=True requires a CUDA or other accelerator backend; "
"no pinned memory allocator is available on this system.")
false, "Need to provide pin_memory allocator to use pin memory.")
}
}
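The hunk above only rewords the error raised when pinned host memory is requested but no accelerator backend provides a pinned-memory allocator. For context, a minimal sketch of the call path that ends in that TORCH_CHECK, assuming a CPU-only libtorch build (the sizes are arbitrary; the APIs are standard ATen):

```cpp
#include <ATen/ATen.h>

int main() {
  // pin_memory=True from Python lowers to a TensorOptions flag like this one.
  auto opts = at::TensorOptions().dtype(at::kFloat).pinned_memory(true);
  // On a CPU-only build this throws a c10::Error carrying the message above,
  // because GetCPUAllocatorMaybePinned finds no pinned-memory allocator.
  auto t = at::empty({1024}, opts);
  return t.is_pinned() ? 0 : 1;
}
```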

View File

@ -162,7 +162,7 @@ struct CUDACachingHostAllocatorImpl
}
bool pinned_use_background_threads() override {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
pinned_use_background_threads();
}

View File

@ -43,6 +43,7 @@ TensorBase empty_mps(
int64_t nelements = c10::multiply_integers(size);
auto dtype = dtype_or_default(dtype_opt);
TORCH_CHECK_TYPE(dtype != ScalarType::Double, MPS_ERROR_DOUBLE_NOT_SUPPORTED);
TORCH_CHECK_TYPE(dtype != ScalarType::BFloat16 || is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_14_0_PLUS), "MPS BFloat16 is only supported on MacOS 14 or newer");
auto dtype_meta = scalarTypeToTypeMeta(dtype);

View File

@ -18,7 +18,11 @@ namespace at::mps {
// Helper enum to check if a MPSGraph op is supported in a given macOS version
enum class MacOSVersion : uint32_t {
MACOS_VER_14_4_PLUS = 0,
MACOS_VER_13_1_PLUS = 0,
MACOS_VER_13_2_PLUS,
MACOS_VER_13_3_PLUS,
MACOS_VER_14_0_PLUS,
MACOS_VER_14_4_PLUS,
MACOS_VER_15_0_PLUS,
MACOS_VER_15_1_PLUS,
MACOS_VER_15_2_PLUS,

View File

@ -32,11 +32,11 @@ MPSDevice::~MPSDevice() {
MPSDevice::MPSDevice() : _mtl_device(nil) {
// Check that MacOS 13.0+ version of MPS framework is available
// Create the MPSGraph and check method introduced in 14.0
// Create the MPSGraph and check method introduced in 13.0
// which is used by MPS backend.
id mpsCD = NSClassFromString(@"MPSGraph");
if ([mpsCD instancesRespondToSelector:@selector(HermiteanToRealFFTWithTensor:axes:descriptor:name:)] == NO) {
if ([mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == NO) {
return;
}
@ -66,12 +66,24 @@ bool MPSDevice::isMacOS13Plus(MacOSVersion version) const {
isOperatingSystemAtLeastVersion:{.majorVersion = major, .minorVersion = minor, .patchVersion = 0}];
}
};
static bool _macos_13_1_plus = is_os_version_at_least(13, 1);
static bool _macos_13_2_plus = is_os_version_at_least(13, 2);
static bool _macos_13_3_plus = is_os_version_at_least(13, 3);
static bool _macos_14_0_plus = is_os_version_at_least(14, 0);
static bool _macos_14_4_plus = is_os_version_at_least(14, 4);
static bool _macos_15_0_plus = is_os_version_at_least(15, 0);
static bool _macos_15_1_plus = is_os_version_at_least(15, 1);
static bool _macos_15_2_plus = is_os_version_at_least(15, 2);
switch (version) {
case MacOSVersion::MACOS_VER_13_1_PLUS:
return _macos_13_1_plus;
case MacOSVersion::MACOS_VER_13_2_PLUS:
return _macos_13_2_plus;
case MacOSVersion::MACOS_VER_13_3_PLUS:
return _macos_13_3_plus;
case MacOSVersion::MACOS_VER_14_0_PLUS:
return _macos_14_0_plus;
case MacOSVersion::MACOS_VER_14_4_PLUS:
return _macos_14_4_plus;
case MacOSVersion::MACOS_VER_15_0_PLUS:

View File

@ -34,7 +34,7 @@ bool MPSHooks::isOnMacOSorNewer(unsigned major, unsigned minor) const {
case 14:
switch (minor) {
case 0:
return true;
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS);
case 4:
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS);
default:
@ -42,7 +42,19 @@ bool MPSHooks::isOnMacOSorNewer(unsigned major, unsigned minor) const {
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS);
}
case 13:
return true;
switch (minor) {
case 0:
return true;
case 1:
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_1_PLUS);
case 2:
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS);
case 3:
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS);
default:
TORCH_WARN("Can't check whether running on 13.", minor, "+ returning one for 13.3+");
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS);
}
default:
TORCH_WARN("Checking for unexpected MacOS ", major, ".", minor, " returning false");
return false;
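Taken together, the MPS hunks above follow one pattern: probe the host macOS version once per threshold, cache the answer in function-local statics, and map each MacOSVersion enumerator onto its cached flag. A stripped-down C++ sketch of that pattern, with a hypothetical os_at_least() stub standing in for the NSProcessInfo isOperatingSystemAtLeastVersion: query used by the real code:

```cpp
#include <cstdint>
#include <cstdio>

enum class MacOSVersion : uint32_t { MACOS_VER_14_4_PLUS = 0, MACOS_VER_15_0_PLUS };

// Stand-in for the NSProcessInfo query; pretends the host runs macOS 15.0.
static bool os_at_least(int major, int minor) {
  const int host_major = 15, host_minor = 0;  // hypothetical host version
  return host_major > major || (host_major == major && host_minor >= minor);
}

static bool is_version_supported(MacOSVersion version) {
  static const bool macos_14_4_plus = os_at_least(14, 4);  // probed once, then cached
  static const bool macos_15_0_plus = os_at_least(15, 0);
  switch (version) {
    case MacOSVersion::MACOS_VER_14_4_PLUS: return macos_14_4_plus;
    case MacOSVersion::MACOS_VER_15_0_PLUS: return macos_15_0_plus;
  }
  return false;
}

int main() {
  std::printf("14.4+: %d, 15.0+: %d\n",
              is_version_supported(MacOSVersion::MACOS_VER_14_4_PLUS),
              is_version_supported(MacOSVersion::MACOS_VER_15_0_PLUS));
}
```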

View File

@ -51,7 +51,7 @@ extern "C" void zaxpy_(int *n, void *a, const void *x, int *incx, void *y, int *
// brgemm_pack_B is changed to transform and the setting of brgemm beta is changed to set_add_C
#if (IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR == 5)
#define ONEDNN_UKERNEL_1
#elif ((IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR >= 6) || (IDEEP_VERSION_MAJOR > 3))
#elif (IDEEP_VERSION_MAJOR >= 3 && IDEEP_VERSION_MINOR >= 6)
#define ONEDNN_UKERNEL_2
#endif
#if ((defined(ONEDNN_UKERNEL_1) || defined(ONEDNN_UKERNEL_2)) && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC))))
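The only difference between the two #elif guards above is how they treat an ideep release whose major version exceeds 3 while its minor version resets below 6 (a hypothetical 4.0): the compound form accepts it, the plain >= form rejects it. A small self-contained check illustrating the divergence:

```cpp
constexpr bool plain_guard(int major, int minor) {
  return major >= 3 && minor >= 6;                 // rejects 4.0
}
constexpr bool compound_guard(int major, int minor) {
  return (major == 3 && minor >= 6) || major > 3;  // accepts 4.0
}

static_assert(plain_guard(3, 6) && compound_guard(3, 6), "both forms accept 3.6");
static_assert(!plain_guard(4, 0) && compound_guard(4, 0), "only the compound form accepts 4.0");

int main() { return 0; }
```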

View File

@ -206,16 +206,6 @@ void copy(int64_t n, const c10::complex<float> *x, int64_t incx, c10::complex<fl
// B Base pointer to a tensor B.
// C Pointer to a tensor C (accumulation buffer).
// Note only batch size 1 is used currently
// Define macros for available brgemm APIs
// so that callers can determine which APIs are available
#define CPUBLAS_BRGEMM_F16F16F32 // half * half -> float
#define CPUBLAS_BRGEMM_BF16BF16F32 // bfloat16 * bfloat16 -> float
#define CPUBLAS_BRGEMM_F32F32F32 // float * float -> float
#define CPUBLAS_BRGEMM_U8U8I32 // unsigned char * unsigned char -> int32
#define CPUBLAS_BRGEMM_U8I8I32 // unsigned char * signed char -> int32
#define CPUBLAS_BRGEMM_I8I8I32 // signed char * signed char -> int32
TORCH_API void brgemm(
int64_t M,
int64_t N,
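The CPUBLAS_BRGEMM_* defines in the hunk above exist so that callers can feature-test, at compile time, which input/output dtype combinations the brgemm entry point supports. A hedged caller-side sketch (the wrapper function and its fallback are ours, not part of the diff):

```cpp
#include <ATen/native/CPUBlas.h>  // assumed location of the macros and brgemm declaration

void matmul_bf16(/* tensor arguments omitted for brevity */) {
#if defined(CPUBLAS_BRGEMM_BF16BF16F32)
  // The bfloat16 x bfloat16 -> float kernel is advertised; dispatch to
  // at::native::cpublas::brgemm(...) here.
#else
  // Macro absent: fall back to a reference GEMM path instead.
#endif
}
```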

View File

@ -24,29 +24,6 @@ static void _assert_match(const O& original, const C& compared, const std::strin
}
}
template<>
void _assert_match<c10::Device, std::optional<c10::Device>>(
const c10::Device& original,
const std::optional<c10::Device>& compared,
const std::string& name) {
if (compared) {
const c10::Device& expected = compared.value();
if (original.type() != expected.type()) {
std::stringstream msg;
msg << "Tensor " << name << " mismatch! Expected: " << expected << ", Got: " << original;
throw std::runtime_error(msg.str());
}
// If the expected device doesn't have an index (e.g., just "cuda"),
// or if both devices have the same index, consider them equal
if (expected.has_index() && original.has_index() && expected.index() != original.index()) {
std::stringstream msg;
msg << "Tensor " << name << " mismatch! Expected: " << expected << ", Got: " << original;
throw std::runtime_error(msg.str());
}
}
}
void _assert_tensor_metadata_meta_symint(at::Tensor const& tensor, at::OptionalSymIntArrayRef sizes, at::OptionalSymIntArrayRef strides, std::optional<c10::ScalarType> dtype, std::optional<c10::Device> device, std::optional<c10::Layout> layout) {
_assert_match(tensor.sym_sizes(), sizes, "sizes");
_assert_match(tensor.sym_strides(), strides, "strides");

View File

@ -367,27 +367,27 @@ void int8pack_mm_kernel_(
auto* C_data = C.data_ptr<T>();
const auto* S_data = scales.const_data_ptr<T>();
int64_t M = A.size(0);
int64_t N = B.size(0);
int64_t K = A.size(1);
int64_t lda = A.stride(0);
constexpr int64_t BLOCK_M = 4;
constexpr int64_t BLOCK_N = 4;
int M = A.size(0);
int N = B.size(0);
int K = A.size(1);
int lda = A.stride(0);
constexpr int BLOCK_M = 4;
constexpr int BLOCK_N = 4;
const int64_t MB = (M + BLOCK_M - 1) / BLOCK_M;
const int64_t NB = (N + BLOCK_N - 1) / BLOCK_N;
const int MB = (M + BLOCK_M - 1) / BLOCK_M;
const int NB = (N + BLOCK_N - 1) / BLOCK_N;
at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) {
int64_t mb{0}, nb{0};
at::parallel_for(0, MB * NB, 0, [&](int begin, int end) {
int mb{0}, nb{0};
data_index_init(begin, mb, MB, nb, NB);
for (const auto i : c10::irange(begin, end)) {
(void)i;
int64_t mb_start = mb * BLOCK_M;
int64_t mb_size = std::min(BLOCK_M, M - mb_start);
int64_t nb_start = nb * BLOCK_N;
int64_t nb_size = std::min(BLOCK_N, N - nb_start);
int mb_start = mb * BLOCK_M;
int mb_size = std::min(BLOCK_M, M - mb_start);
int nb_start = nb * BLOCK_N;
int nb_size = std::min(BLOCK_N, N - nb_start);
const auto* A_ptr = A_data + mb_start * lda;
const auto* B_ptr = B_data + nb_start * K;

View File

@ -526,7 +526,7 @@ namespace {
// we are dealing with packed tensor here. max index is the same as numel.
// TODO: to really support input tensor large enough to go beyond int32,
// TODO: to really support input tensor large enought to go beyond int32,
// we will need to restrict our shared memory usage and adjust the launch
// config;
AT_ASSERT(input_.numel() < std::numeric_limits<int32_t>::max());
@ -681,7 +681,7 @@ namespace {
const dim3 grid(grid_x, grid_y, grid_z);
// we are dealing with packed tensor here. max index is the same as numel.
// TODO: to really support input tensor large enough to go beyond int32,
// TODO: to really support input tensor large enought to go beyond int32,
// we will need to restrict our shared memory usage and adjust the launch
// config;
AT_ASSERT(input.numel() < std::numeric_limits<int32_t>::max());

View File

@ -21,10 +21,6 @@
#include <ATen/native/cuda/GroupMM.h>
#include <ATen/ceil_div.h>
#ifdef USE_FBGEMM_GENAI
#include <fbgemm_gpu/torch_ops.h>
#endif
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
@ -1220,7 +1216,7 @@ std::pair<ScalingType, ScalingType> get_joint_scaling(
// - `scale_a`: a tensor with the inverse scale of `mat1`, whose shape/strides/dtype depend on the scaling scheme
// - `scale_b`: a tensor with the inverse scale of `mat2`, whose shape/strides/dtype depend on the scaling scheme
// - `scale_result`: a scalar tensor with the scale of the output, only utilized if the output is a float8 type
// - `use_fast_accum`: if true, enables fast float8 accumulation. Backends may ignore this option if not applicable.
// - `use_fast_accum`: if true, enables fast float8 accumulation
// - `out`: a reference to the output tensor
Tensor&
@ -1529,7 +1525,6 @@ namespace {
const auto out_dtype_ = out_dtype.value_or(kBFloat16);
TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm");
#ifndef USE_ROCM
// For TMA transfers, strides of output tensor have to be either
// 1, or aligned to 16 bytes.
const auto last_dim = out_size.size() - 1;
@ -1541,10 +1536,9 @@ namespace {
} else {
out_stride = {out_size[1] * size_padded, size_padded, 1};
}
return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype_));
#else
return at::empty(out_size, mat_a.options().dtype(out_dtype_));
#endif
auto out = at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype_));
return out;
}
bool check_valid_strides_and_return_transposed(const Tensor& mat) {
@ -1625,18 +1619,18 @@ const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum) {
bool allowed_device = _scaled_mm_allowed_device();
TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0, or ROCm MI300+");
#ifndef USE_ROCM
bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true);
TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0");
TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type());
TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_b.scalar_type());
TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed");
TORCH_CHECK(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed");
TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d");
TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d");
const bool a_is_2d = mat_a.dim() == 2;
const bool b_is_2d = mat_b.dim() == 2;
if (!a_is_2d || !b_is_2d) {
TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
}
TORCH_CHECK(
mat_a.size(-1) % 16 == 0,
"Expected trailing dimension of mat_a to be divisible by 16 ",
@ -1670,10 +1664,6 @@ bool use_fast_accum) {
Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype);
#ifndef USE_ROCM
TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type());
TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_b.scalar_type());
at::cuda::detail::f8f8bf16_grouped_mm(
mat_a,
mat_b,
@ -1684,23 +1674,12 @@ bool use_fast_accum) {
use_fast_accum,
out);
return out;
#else
#ifdef USE_FBGEMM_GENAI
TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fnuz, "Expected mat_a to be Float8_e4m3fnuz matrix got ", mat_a.scalar_type());
TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fnuz, "Expected mat_a to be Float8_e4m3fnuz matrix got ", mat_b.scalar_type());
fbgemm_gpu::f8f8bf16_rowwise_grouped_mm(
mat_a,
// FBGEMM expects B matrix shape to be (.., N, K)
mat_b.transpose(-2, -1),
scale_a,
scale_b,
offs,
out);
return out;
#else
TORCH_CHECK(false, "grouped gemm is not supported without USE_FBGEMM_GENAI on ROCM")
#endif
TORCH_CHECK(false, "grouped gemm is not supported on ROCM")
#endif
}
@ -1719,9 +1698,6 @@ std::optional<c10::ScalarType> out_dtype) {
TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d");
const bool a_is_2d = mat_a.dim() == 2;
const bool b_is_2d = mat_b.dim() == 2;
if (!a_is_2d || !b_is_2d) {
TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
}
// check that the strides are valid, the fn will throw an error if not
check_valid_strides_and_return_transposed(mat_a);
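
As a worked example of the alignment rule noted above (output strides must be 1 or aligned to 16 bytes for TMA transfers), the padded-dimension round-up can be sketched as a standalone helper (illustrative only, not the kernel's actual code):

#include <cstdint>

// Round `size` elements up so the resulting stride is a multiple of 16 bytes
// for the given element size (e.g. 2 bytes for bf16 -> align to 8 elements).
inline int64_t pad_to_16_bytes(int64_t size, int64_t elem_size_bytes) {
  const int64_t align_elems = 16 / elem_size_bytes;
  return (size + align_elems - 1) / align_elems * align_elems;
}
// pad_to_16_bytes(1000, 2) == 1000; pad_to_16_bytes(1001, 2) == 1008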

View File

@ -223,7 +223,7 @@ inline CuFFTDataLayout as_cufft_embed(IntArrayRef strides, IntArrayRef sizes, bo
class CuFFTConfig {
public:
// Only move semantics is enough for this class. Although we already use
// Only move semantics is enought for this class. Although we already use
// unique_ptr for the plan, still remove copy constructor and assignment op so
// we don't accidentally copy and take perf hit.
CuFFTConfig(const CuFFTConfig&) = delete;

View File

@ -38,19 +38,17 @@ static inline std::string _cudaGetErrorEnum(cufftResult error)
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
case CUFFT_INCOMPLETE_PARAMETER_LIST:
return "CUFFT_INCOMPLETE_PARAMETER_LIST";
case CUFFT_INVALID_DEVICE:
return "CUFFT_INVALID_DEVICE";
case CUFFT_PARSE_ERROR:
return "CUFFT_PARSE_ERROR";
case CUFFT_NO_WORKSPACE:
return "CUFFT_NO_WORKSPACE";
case CUFFT_NOT_IMPLEMENTED:
return "CUFFT_NOT_IMPLEMENTED";
#if CUDA_VERSION <= 12090
case CUFFT_INCOMPLETE_PARAMETER_LIST:
return "CUFFT_INCOMPLETE_PARAMETER_LIST";
case CUFFT_PARSE_ERROR:
return "CUFFT_PARSE_ERROR";
#endif
#if !defined(USE_ROCM) && CUDA_VERSION <= 12090
#if !defined(USE_ROCM)
case CUFFT_LICENSE_ERROR:
return "CUFFT_LICENSE_ERROR";
#endif

View File

@ -241,8 +241,6 @@ void bf16bf16_grouped_gemm_impl_sm90_sm100(
Strides tensor_StrideA = make_strides(mat_a.strides());
Strides tensor_StrideB = make_strides(mat_b.strides());
Strides tensor_StrideOutput = make_strides(out.strides());
Strides tensor_ShapeA = make_strides(mat_a.sizes());
Strides tensor_ShapeB = make_strides(mat_b.sizes());
at::cuda::detail::prepare_grouped_gemm_data<<<1, group_count, 0, stream>>>(
reinterpret_cast<DtypeA*>(mat_a.data_ptr()),
@ -266,8 +264,6 @@ void bf16bf16_grouped_gemm_impl_sm90_sm100(
tensor_StrideA,
tensor_StrideB,
tensor_StrideOutput,
tensor_ShapeA,
tensor_ShapeB,
0,
0,
a_row_major,

View File

@ -38,20 +38,18 @@ __global__ void prepare_grouped_gemm_data(
Strides tensor_StrideA,
Strides tensor_StrideB,
Strides tensor_StrideOutput,
Strides tensor_ShapeA,
Strides tensor_ShapeB,
int64_t a_scale_stride,
int64_t b_scale_stride,
bool a_row_major = true,
bool b_row_major = false) {
int32_t tid = threadIdx.x;
int32_t delta = 0;
int32_t offset = 0;
if (offs != nullptr) {
int32_t start = tid == 0 ? 0 : offs[tid - 1];
offset = offs[tid];
delta = offset - start;
CUDA_KERNEL_ASSERT(delta >=0 && "expected gemm dimension to be greater or equal 0\n");
delta = offs[tid] - start;
if (K < 0) {
CUDA_KERNEL_ASSERT(delta >=0 && "expected ofsets to be greater or equal 0\n");
}
// TMA transfers require global memory tensor addresses to be
// aligned to 16 bytes.
@ -86,7 +84,6 @@ __global__ void prepare_grouped_gemm_data(
int64_t lda, ldb, ldoutput;
if (M < 0) {
// A and output is 2d
CUDA_KERNEL_ASSERT(offset <= tensor_ShapeA[0] && "expected offset to be less than tensor size\n");
M = delta;
lda = a_row_major ? tensor_StrideA[0] : tensor_StrideA[1];
ldb = b_row_major ? tensor_StrideB[1] : tensor_StrideB[2];
@ -99,7 +96,6 @@ __global__ void prepare_grouped_gemm_data(
output_ptrs[tid] = tid == 0 ? output : output + offs[tid - 1] * ldoutput;
B_ptrs[tid] = B + tid * tensor_StrideB[0];
} else if (N < 0) {
CUDA_KERNEL_ASSERT(offset <= tensor_ShapeB[1] && "expected offset to be less than tensor size\n");
N = delta;
lda = a_row_major ? tensor_StrideA[1] : tensor_StrideA[2];
ldb = b_row_major ? tensor_StrideB[0] : tensor_StrideB[1]; // B is transposed
@ -112,7 +108,6 @@ __global__ void prepare_grouped_gemm_data(
inputB_scale_ptrs[tid] = tid == 0 ? scale_B : scale_B + offs[tid - 1];
}
} else if (K < 0) {
CUDA_KERNEL_ASSERT(offset <= tensor_ShapeA[1] && offset <= tensor_ShapeB[0] && "expected offset to be less than tensor size\n");
// A, B is 2d, output is 3d
K = delta;
lda = a_row_major ? tensor_StrideA[0] : tensor_StrideA[1];

View File

@ -644,12 +644,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_
Tensor grad = at::full_like(log_probs, neginf, LEGACY_CONTIGUOUS_MEMORY_FORMAT); // initialization for log(sum (alpha beta))
// As above, there may be better configurations to use.
constexpr int max_threads_ = std::is_same_v<scalar_t, float> ? 1024 : 896; // we need 72 or so 32 bit registers for double
int max_threads = max_threads_;
// Blackwell launch bounds
if (at::cuda::getCurrentDeviceProperties()->major >= 10) {
max_threads = 512;
}
constexpr int max_threads = std::is_same_v<scalar_t, float> ? 1024 : 896; // we need 72 or so 32 bit registers for double
int threads_target = max_threads;
while (threads_target / 2 >= 2*max_target_length+1) {
threads_target /= 2;

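The halving loop above shrinks the per-block thread count as long as half of it still covers the 2*max_target_length+1 targets handled per block; a standalone sketch with a worked value (illustrative only):

// Mirrors the loop in the hunk above.
inline int pick_threads_target(int max_threads, int max_target_length) {
  int threads_target = max_threads;
  while (threads_target / 2 >= 2 * max_target_length + 1) {
    threads_target /= 2;
  }
  return threads_target;  // pick_threads_target(1024, 100) == 256
}
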
View File

@ -9,7 +9,6 @@
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wmissing-field-initializers")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")
// Determine if the architecture supports rowwise scaled mm
// Currently failing on windows with:
@ -45,7 +44,6 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")
#include <ATen/native/cuda/cutlass_common.cuh>
C10_DIAGNOSTIC_POP()
C10_DIAGNOSTIC_POP()
C10_DIAGNOSTIC_POP()

View File

@ -10,7 +10,6 @@
// Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")
// Determine if the architecture supports rowwise scaled mm
// Currently failing on windows with:
@ -45,7 +44,6 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")
#include <cutlass/gemm/kernel/gemm_universal.hpp>
#include <cutlass/util/packed_stride.hpp>
C10_DIAGNOSTIC_POP()
C10_DIAGNOSTIC_POP()
C10_DIAGNOSTIC_POP()
@ -298,9 +296,6 @@ void f8f8bf16_grouped_gemm_impl_sm90(
Strides tensor_StrideA = make_strides(mat_a.strides());
Strides tensor_StrideB = make_strides(mat_b.strides());
Strides tensor_StrideOutput = make_strides(out.strides());
Strides tensor_ShapeA = make_strides(mat_a.sizes());
Strides tensor_ShapeB = make_strides(mat_b.sizes());
// scale stride will be used inside the kernel only if needed,
// so for 1d scales the "1" assigned here won't be used
int64_t a_scale_stride = scale_a.stride(0);
@ -328,8 +323,6 @@ void f8f8bf16_grouped_gemm_impl_sm90(
tensor_StrideA,
tensor_StrideB,
tensor_StrideOutput,
tensor_ShapeA,
tensor_ShapeB,
a_scale_stride,
b_scale_stride);

View File

@ -1,74 +0,0 @@
#include <ATen/ATen.h>
#include <ATen/core/Tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
namespace at::native {
__global__ void weight_int8pack_mm_kernel(const float* x, const int8_t* w, const float* scale, float* out, int B, int K, int N) {
// one thread per output element: [B, N]
int b = blockIdx.y * blockDim.y + threadIdx.y;
int n = blockIdx.x * blockDim.x + threadIdx.x;
if (b >= B || n >= N) return;
float acc = 0.0f;
for (int k = 0; k < K; ++k) {
acc += x[b * K + k] * static_cast<float>(w[n * K + k]);
}
out[b * N + n] = acc * scale[n];
}
void launch_weight_int8pack_mm_cuda_kernel(const Tensor& x, const Tensor& w_int8, const Tensor& scale, Tensor& out) {
const int B = x.size(0);
const int K = x.size(1);
const int N = w_int8.size(0);
const dim3 block(16, 16);
const dim3 grid((N + block.x - 1) / block.x, (B + block.y - 1) / block.y);
auto stream = at::cuda::getCurrentCUDAStream();
weight_int8pack_mm_kernel<<<grid, block, 0, stream>>>(
x.data_ptr<float>(),
w_int8.data_ptr<int8_t>(),
scale.data_ptr<float>(),
out.data_ptr<float>(),
B, K, N);
}
// Main GPU entry point
at::Tensor _weight_int8pack_mm_cuda(const at::Tensor& x, const at::Tensor& w_int8, const at::Tensor& scale) {
// --- Check inputs ---
TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor");
TORCH_CHECK(w_int8.is_cuda(), "w must be a CUDA tensor");
TORCH_CHECK(scale.is_cuda(), "scale must be a CUDA tensor");
TORCH_CHECK(x.dim() == 2, "x must be 2D");
TORCH_CHECK(w_int8.dim() == 2, "w must be 2D");
TORCH_CHECK(scale.dim() == 1, "scale must be 1D");
TORCH_CHECK(x.size(1) == w_int8.size(1), "K dimension mismatch: x.size(1) != w.size(1)");
TORCH_CHECK(w_int8.size(0) == scale.size(0), "Output dim mismatch: w.size(0) != scale.size(0)");
// --- Determine shapes ---
auto B = x.size(0); // batch size
auto N = w_int8.size(0); // output dim
// Ensure inputs are in the correct types for the kernel
auto x_f32 = x.to(at::kFloat);
auto w_int8_contiguous = w_int8.contiguous();
auto scale_f32 = scale.to(at::kFloat);
// --- Allocate output ---
auto out = at::empty({B, N}, x.options().dtype(at::kFloat));
// --- Launch kernel ---
launch_weight_int8pack_mm_cuda_kernel(x_f32, w_int8_contiguous, scale_f32, out);
return out;
}
} // namespace at::native
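
As a quick check of the launch geometry above (one thread per [B, N] output element, 16x16 thread blocks), the grid computation works out as follows for illustrative sizes:

#include <cstdio>

int main() {
  const int B = 32, N = 100;         // illustrative batch and output-feature sizes
  const int bx = 16, by = 16;        // block dims used by the kernel above
  const int gx = (N + bx - 1) / bx;  // 7 blocks along N
  const int gy = (B + by - 1) / by;  // 2 blocks along B
  std::printf("grid = (%d, %d), threads per block = %d\n", gx, gy, bx * by);
  return 0;
}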

View File

@ -45,7 +45,7 @@ namespace at::cuda::jit {
// Copied from aten/src/ATen/cuda/llvm_basic.cpp, then modified as above.
// If not compiling for ROCm, return the original get_traits_string().
std::string get_traits_string_but_hiprtc_safe() {
#if defined(USE_ROCM) && HIP_VERSION_MAJOR < 7
#if defined(USE_ROCM) && ROCM_VERSION < 70000
return R"ESCAPE(
namespace std {

View File

@ -28,22 +28,6 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
TORCH_CHECK(false, "cudnn_batch_norm: ATen not compiled with cuDNN support");
}
std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
const Tensor& input,
const Tensor& weight,
const std::optional<Tensor>& bias,
const std::optional<Tensor>& running_mean,
const std::optional<Tensor>& running_var,
bool training,
double exponential_average_factor,
double epsilon,
Tensor& out,
Tensor& save_mean,
Tensor& save_var,
Tensor& reserve) {
AT_ERROR("cudnn_batch_norm_out: ATen not compiled with cuDNN support");
}
std::tuple<Tensor, Tensor, Tensor> cudnn_batch_norm_backward(
const Tensor& input,
const Tensor& grad_output,
@ -136,12 +120,7 @@ size_t _get_cudnn_batch_norm_reserve_space_size(
return reserve_size;
}
// Param `reserve` is a placeholder, just passing an empty tensor.
// usage:
// auto reserve = torch::empty({0}, torch::device(torch::kCUDA));
// at::native::cudnn_batch_norm_out(..., epsilon, output, save_mean, save_var,
// reserve);
std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
const Tensor& input_t,
const Tensor& weight_t,
const std::optional<Tensor>& bias_t_opt,
@ -149,11 +128,7 @@ std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
const std::optional<Tensor>& running_var_t_opt,
bool training,
double exponential_average_factor,
double epsilon,
Tensor& output_t,
Tensor& save_mean,
Tensor& save_var,
Tensor& reserve) {
double epsilon) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> bias_t_maybe_owned =
at::borrow_from_optional_tensor(bias_t_opt);
@ -193,6 +168,9 @@ std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
cudnnBatchNormMode_t mode = getCudnnBatchNormMode(
training, input->suggest_memory_format(), input->dim());
auto output_t =
at::empty_like(*input, input->options(), input->suggest_memory_format());
TensorArg output{output_t, "output", 0};
auto handle = getCudnnHandle();
@ -204,8 +182,15 @@ std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
Constant one(dataType, 1);
Constant zero(dataType, 0);
Tensor save_mean, save_var;
Tensor reserve;
if (training) {
int64_t num_features = input_t.size(1);
save_mean = at::empty({num_features}, weight_t.options());
save_var = at::empty({num_features}, weight_t.options());
auto op = CUDNN_BATCHNORM_OPS_BN;
size_t workspace_size;
AT_CUDNN_CHECK(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
@ -253,6 +238,9 @@ std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
reserve_size));
} else {
reserve = at::empty({0}, input->options().dtype(kByte));
// This keeps a consistent output with native_batch_norm
save_mean = at::empty({0}, weight_t.options());
save_var = at::empty({0}, weight_t.options());
AT_CUDNN_CHECK(cudnnBatchNormalizationForwardInference(
handle,
mode,
@ -273,48 +261,10 @@ std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
// save_mean and save_var can be undefined
// If this causes problems, we can initialize them to empty tensors
// of the correct type
return std::tuple<Tensor&, Tensor&, Tensor&, Tensor&>{
return std::tuple<Tensor, Tensor, Tensor, Tensor>{
output_t, save_mean, save_var, reserve};
}
std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
const Tensor& input_t,
const Tensor& weight_t,
const std::optional<Tensor>& bias_t_opt,
const std::optional<Tensor>& running_mean_t_opt,
const std::optional<Tensor>& running_var_t_opt,
bool training,
double exponential_average_factor,
double epsilon) {
auto output_t = at::empty_like(
input_t, input_t.options(), input_t.suggest_memory_format());
Tensor save_mean, save_var, reserve;
if (training) {
int64_t num_features = input_t.size(1);
save_mean = at::empty({num_features}, weight_t.options());
save_var = at::empty({num_features}, weight_t.options());
} else {
// This keeps a consistent output with native_batch_norm
save_mean = at::empty({0}, weight_t.options());
save_var = at::empty({0}, weight_t.options());
}
return cudnn_batch_norm_out(
input_t,
weight_t,
bias_t_opt,
running_mean_t_opt,
running_var_t_opt,
training,
exponential_average_factor,
epsilon,
output_t,
save_mean,
save_var,
reserve);
}
// NB: CuDNN only implements the backward algorithm for batchnorm
// in training mode (evaluation mode batchnorm has a different algorithm),
// which is why this doesn't accept a 'training' parameter.
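
The removed usage note above shows the intended call pattern for the out-variant with caller-allocated buffers; expanded slightly into a sketch (shapes, options, and the surrounding setup are illustrative, and the call assumes a CUDA + cuDNN build):

#include <torch/torch.h>
// at::native::cudnn_batch_norm_out is an internal ATen function; its exact
// header is build-internal, so treat this as a call-site sketch only.

void example_call() {
  auto input     = torch::randn({8, 32, 16, 16}, torch::device(torch::kCUDA));
  auto weight    = torch::ones({32}, torch::device(torch::kCUDA));
  auto output    = torch::empty_like(input);
  auto save_mean = torch::empty({32}, weight.options());
  auto save_var  = torch::empty({32}, weight.options());
  auto reserve   = torch::empty({0}, torch::device(torch::kCUDA).dtype(torch::kByte));
  at::native::cudnn_batch_norm_out(
      input, weight, /*bias=*/std::nullopt,
      /*running_mean=*/std::nullopt, /*running_var=*/std::nullopt,
      /*training=*/true, /*exponential_average_factor=*/0.1, /*epsilon=*/1e-5,
      output, save_mean, save_var, reserve);
}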

View File

@ -342,8 +342,8 @@ Tensor rms_norm_symint(
if (weight_opt.has_value() && weight_opt.value().defined() && weight_opt.value().dtype() != input.dtype()) {
TORCH_WARN_ONCE(
"Mismatch dtype between input and weight: input dtype = ", input.dtype(),
", weight dtype = ", weight_opt.value().dtype(), ", Cannot dispatch to fused implementation."
"Mismatch dtype between input and module: input dtype = ", input.dtype(),
", module dtype = ", weight_opt.value().dtype(), ", Can not dispatch to fused implementation"
);
return std::get<0>(rms_norm_composite(input, IntArrayRef(reinterpret_cast<const int64_t*>(normalized_shape.data()), normalized_shape.size()), weight_opt, eps));
}

View File

@ -1,6 +1,7 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/Config.h>
#include <ATen/Context.h>
#include <ATen/Dispatch.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/mkldnn/Matmul.h>
@ -427,74 +428,56 @@ static inline bool checksize(const Tensor& mat1, const Tensor& mat2){
}
}
bool use_mkldnn_bf16_matmul(
template <typename T>
bool use_mkldnn_typed_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
bool dtype_check = false;
if constexpr (std::is_same_v<T, c10::BFloat16>) {
#if defined(__aarch64__)
if (mkldnn_bf16_device_check_arm()) {
// onednn fastmath mode can leverage bf16 HW even for the fp32 input, e.g.
// Arm Neoverse V1 so, don't restrict the mkldnn_matmul only for bf16
// inputs, allow it for float as well
return (
use_mkldnn_bf16_matmul() &&
(mat1.scalar_type() == mat2.scalar_type()) &&
(!result.defined() || (mat1.scalar_type() == result.scalar_type())) &&
((mat1.scalar_type() == kFloat) || (mat1.scalar_type() == kBFloat16)) &&
mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
} else
if (mkldnn_bf16_device_check_arm()) {
// onednn fastmath mode can leverage bf16 HW even for the fp32 input, e.g.
// Arm Neoverse V1 so, don't restrict the mkldnn_matmul only for bf16
// inputs, allow it for float as well
dtype_check = use_mkldnn_bf16_matmul() &&
((mat1.scalar_type() == kFloat) || (mat1.scalar_type() == kBFloat16));
}
#else
dtype_check = dtype_check && use_mkldnn_bf16_matmul() &&
(mat1.scalar_type() == kBFloat16);
#endif
{
return (
use_mkldnn_bf16_matmul() && mat1.scalar_type() == kBFloat16 &&
mat2.scalar_type() == kBFloat16 &&
(!result.defined() || result.scalar_type() == kBFloat16) &&
mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
} else if constexpr (std::is_same_v<T, c10::Half>) {
dtype_check = dtype_check && use_mkldnn_fp16_matmul() &&
(mat1.scalar_type() == kHalf);
} else if constexpr (std::is_same_v<T, float>) {
dtype_check = dtype_check &&
(use_mkldnn_bf32_matmul() || use_mkldnn_tf32_matmul()) &&
(mat1.scalar_type() == kFloat);
}
}
bool use_mkldnn_fp16_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
return (
use_mkldnn_fp16_matmul() && mat1.scalar_type() == kHalf &&
mat2.scalar_type() == kHalf &&
(!result.defined() || result.scalar_type() == kHalf) &&
mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
}
bool use_mkldnn_bf32_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
return (
use_mkldnn_bf32_matmul() && mat1.scalar_type() == kFloat &&
mat2.scalar_type() == kFloat &&
(!result.defined() || result.scalar_type() == kFloat) &&
mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
}
bool use_mkldnn_tf32_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
return (
use_mkldnn_tf32_matmul() && mat1.scalar_type() == kFloat &&
mat2.scalar_type() == kFloat &&
(!result.defined() || result.scalar_type() == kFloat) &&
mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
if (!dtype_check) {
return false;
}
bool size_check =
mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2);
dtype_check = (mat1.scalar_type() == mat2.scalar_type()) &&
(!result.defined() || result.scalar_type() == mat1.scalar_type());
return dtype_check && size_check;
}
bool use_mkldnn_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
return (
use_mkldnn_bf16_matmul(mat1, mat2, result) ||
use_mkldnn_fp16_matmul(mat1, mat2, result) ||
use_mkldnn_bf32_matmul(mat1, mat2, result) ||
use_mkldnn_tf32_matmul(mat1, mat2, result));
auto mat1_type = mat1.scalar_type();
if (mat1_type != kBFloat16 && mat1_type != kHalf && mat1_type != kFloat) {
return false;
}
AT_DISPATCH_FLOATING_TYPES_AND2(
kBFloat16, kHalf, mat1.scalar_type(), "use_mkldnn_matmul", [&] {
return use_mkldnn_typed_matmul<scalar_t>(mat1, mat2, result);
});
return false;
}
static void _mkldnn_matmul_i8i8i32_with_primitive(

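If a caller needs the per-type boolean computed inside the dispatch above, one way to propagate it out of an AT_DISPATCH_* block is to capture it in a local that the lambda writes to, roughly like this (a generic sketch of the function body, not code from this diff):

  bool use_mkldnn = false;
  AT_DISPATCH_FLOATING_TYPES_AND2(
      kBFloat16, kHalf, mat1.scalar_type(), "use_mkldnn_matmul", [&] {
        use_mkldnn = use_mkldnn_typed_matmul<scalar_t>(mat1, mat2, result);
      });
  return use_mkldnn;
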
View File

@ -469,94 +469,4 @@ Tensor _weight_int4pack_mm_xpu(
return C;
}
Tensor& _int_mm_out_xpu(
const Tensor& self,
const Tensor& mat2,
Tensor& result) {
TORCH_CHECK(
self.dim() == 2,
"Expected self to be of dimension 2 but got ",
self.dim());
TORCH_CHECK(
mat2.dim() == 2,
"Expected mat2 to be of dimension 2 but got ",
mat2.dim());
TORCH_CHECK(
self.size(1) == mat2.size(0),
"self.size(1) needs to match mat2.size(0) but got ",
self.size(1),
" and ",
mat2.size(0));
TORCH_CHECK(
self.dtype() == at::kChar,
"Expected self dtype to be of type int8 but got ",
self.dtype());
TORCH_CHECK(
mat2.dtype() == at::kChar,
"Expected mat2 dtype to be of type int8 but got ",
mat2.dtype());
TORCH_CHECK(
result.dtype() == at::kInt,
"Expected result dtype to be of type kInt but got ",
result.dtype());
TORCH_CHECK(
result.size(0) == self.size(0),
"Expected result.size(0) to be ",
self.size(0),
" but got ",
result.size(0));
TORCH_CHECK(
result.size(1) == mat2.size(1),
"Expected result.size(1) to be ",
mat2.size(1),
" but got ",
result.size(1));
TORCH_CHECK(
result.dim() == 2,
"Expected result to be of dimension 2 but got ",
result.dim());
TORCH_CHECK(result.is_contiguous(), "Expected result to be contiguous.");
if (result.numel() == 0 || self.size(1) == 0) {
return result.zero_();
}
Tensor bias = at::Tensor();
Tensor mat2_scales = at::ones({1}, mat2.options().dtype(at::kFloat));
Tensor mat2_zero_points = at::Tensor();
auto post_op_args = torch::List<std::optional<at::Scalar>>();
at::native::onednn::quantized_matmul(
self.contiguous(),
1.0,
0,
mat2.contiguous(),
mat2_scales,
mat2_zero_points,
bias,
result,
1.0,
0,
result.scalar_type(),
/*other*/ std::nullopt,
/*other scale*/ 1.0,
/*other zp*/ 0,
/*binary post op*/ "none",
/*binary alpha*/ 1.0,
/*post_op_name*/ "none",
post_op_args,
/*post_op_algorithm*/ "none",
/*m2_trans*/ true);
return result;
}
Tensor _int_mm_xpu(const Tensor& self, const Tensor& mat2) {
Tensor result =
at::empty({self.size(0), mat2.size(1)}, self.options().dtype(at::kInt));
return _int_mm_out_xpu(self, mat2, result);
}
} // namespace at::native

View File

@ -88,8 +88,14 @@ std::string getArrayRefString(const IntArrayRef s);
// use has_storage() on the returned tensor to determine if src actually is a view
Tensor gatherViewTensor(const Tensor& src, Tensor& dst);
Tensor& scatterViewTensor(const Tensor& src, Tensor& output);
MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const TensorBase& input);
MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const TensorBase& input);
MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph,
MPSGraphTensor* inputTensor,
const TensorBase& input,
bool includesInt64 = false);
MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph,
MPSGraphTensor* inputTensor,
const TensorBase& input,
bool includesInt64 = false);
MPSNDArray* getStridedMPSNDArray(const TensorBase& src, MPSNDArray* srcNDArray);
MPSNDArray* getMPSNDArray(const TensorBase& t, const IntArrayRef& sizes = {}, const IntArrayRef& strides = {});
@ -429,6 +435,14 @@ inline T* LookUpOrCreateCachedGraph(const std::string& key, std::function<void(M
// Common math operations
MPSGraphTensor* log1p(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor);
#define MPS_CHECK_INT64_OP_SUPPORTED(input_tensor, mac_os_13_3_plus, op_name) \
if (!mac_os_13_3_plus && input_tensor.scalar_type() == kLong) { \
TORCH_WARN_ONCE( \
"MPS: no support for int64 for ", \
op_name, \
", downcasting to a smaller data type (int32/float32). Native support for int64 has been added in macOS 13.3."); \
}
/**
* Returns distance from lowest to highest element offset in given tensor.
*/
@ -604,6 +618,10 @@ inline void runMPSGraph(MPSStream* stream, MPSGraph* graph, NSDictionary* feeds,
runMPSGraph(stream, graph, feeds, dictionaryFromPlaceholders(result));
}
inline bool supportsComplex() {
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS);
}
// MPS yet to support double types, but starting from MacOS 14, supports bfloat16
inline bool supportedFloatingType(ScalarType dtype) {
return dtype == kFloat || dtype == kHalf || dtype == kBFloat16;
@ -615,7 +633,7 @@ inline bool supportedFloatingType(const TensorBase& t) {
inline bool supportedFloatingOrComplexType(ScalarType dtype) {
if (dtype == kComplexFloat || dtype == kComplexHalf) {
return true;
return supportsComplex();
}
return supportedFloatingType(dtype);
}
@ -623,6 +641,11 @@ inline bool supportedFloatingOrComplexType(const TensorBase& t) {
return supportedFloatingOrComplexType(t.scalar_type());
}
inline void checkSupportsBFloat16() {
TORCH_CHECK_TYPE(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS),
"MPS bfloat16 type is supported on MacOS 14.0 or newer.");
}
inline bool needsGather(const TensorBase& t) {
static const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
return !is_macOS_15_0_or_newer && (!t.is_contiguous() || t.storage_offset());
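
To illustrate how the helpers declared above are typically combined in an MPS op prologue (the op and tensor names below are made up; only the helpers come from this header):

// Hypothetical guard sequence for an op that may see int64, complex, or bf16 inputs.
static void check_mps_support_for_my_op(const at::Tensor& self) {
  const bool macos_13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS);
  MPS_CHECK_INT64_OP_SUPPORTED(self, macos_13_3_plus, "my_op");
  if (self.is_complex()) {
    TORCH_CHECK(supportsComplex(), "Complex MPS tensors require macOS 14.0 or newer");
  }
  if (self.scalar_type() == at::kBFloat16) {
    checkSupportsBFloat16();
  }
}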

Some files were not shown because too many files have changed in this diff