mirror of https://github.com/pytorch/pytorch.git
synced 2025-10-26 16:44:54 +08:00

Compare commits: flex_flash...VLA_exp

9 Commits:

- 3411990fa0
- fc32f3d5eb
- ef8f493676
- 92eaa3d3b8
- e0340e599e
- c3e4e4079e
- 62f61292e3
- 41cbceee59
- 46706e7c34
@@ -438,7 +438,9 @@ def build_torchvision(
         )
         build_vars += f"BUILD_VERSION={version}.dev{build_date}"
     elif build_version is not None:
-        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
+        build_vars += (
+            f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
+        )
     if host.using_docker():
         build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

@@ -493,7 +495,9 @@ def build_torchdata(
         )
         build_vars += f"BUILD_VERSION={version}.dev{build_date}"
     elif build_version is not None:
-        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
+        build_vars += (
+            f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
+        )
     if host.using_docker():
         build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

@@ -549,7 +553,9 @@ def build_torchtext(
         )
         build_vars += f"BUILD_VERSION={version}.dev{build_date}"
     elif build_version is not None:
-        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
+        build_vars += (
+            f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
+        )
     if host.using_docker():
         build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

@@ -607,7 +613,9 @@ def build_torchaudio(
         )
         build_vars += f"BUILD_VERSION={version}.dev{build_date}"
     elif build_version is not None:
-        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
+        build_vars += (
+            f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
+        )
     if host.using_docker():
         build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

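Note: the four hunks above make the same two changes in each builder: the branch-derived version drops `maxsplit=1`, and the assignment is wrapped in parentheses for line length. Both forms yield the same first field, as this minimal sketch shows (the branch name here is a made-up example):

```python
# Hypothetical release branch name of the form the build script expects
# ("v" + version + "-" + suffix).
branch = "v2.8.0-rc3"
build_version = "2.8.0"

# Element [0] is identical with or without maxsplit; maxsplit only limits
# how much of the remainder gets split.
assert branch[1:].split("-", maxsplit=1)[0] == branch[1:].split("-")[0] == "2.8.0"

build_vars = ""
build_vars += (
    f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
print(build_vars)  # BUILD_VERSION=2.8.0 PYTORCH_VERSION=2.8.0
```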
@@ -144,6 +144,16 @@ case "$tag" in
     TRITON=yes
     INDUCTOR_BENCHMARKS=yes
     ;;
+  pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9)
+    CUDA_VERSION=12.6.3
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=9
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    TRITON=yes
+    ;;
   pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
     CUDA_VERSION=12.8.1
     ANACONDA_PYTHON_VERSION=3.12
@@ -154,6 +164,39 @@ case "$tag" in
     UCC_COMMIT=${_UCC_COMMIT}
     TRITON=yes
     ;;
+  pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.6
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=9
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
+    ;;
+  pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.6
+    ANACONDA_PYTHON_VERSION=3.12
+    GCC_VERSION=9
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
+    ;;
+  pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.6
+    ANACONDA_PYTHON_VERSION=3.13
+    GCC_VERSION=9
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
+    ;;
   pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
     CUDA_VERSION=12.8.1
     ANACONDA_PYTHON_VERSION=3.10
@@ -176,7 +219,19 @@ case "$tag" in
     VISION=yes
     TRITON=yes
     ;;
-  pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3)
+  pytorch-linux-jammy-py3.11-clang12)
+    ANACONDA_PYTHON_VERSION=3.11
+    CLANG_VERSION=12
+    VISION=yes
+    TRITON=yes
+    ;;
+  pytorch-linux-jammy-py3.9-gcc9)
+    ANACONDA_PYTHON_VERSION=3.9
+    GCC_VERSION=9
+    VISION=yes
+    TRITON=yes
+    ;;
+  pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3)
     if [[ $tag =~ "jammy" ]]; then
       ANACONDA_PYTHON_VERSION=3.10
     else
@@ -190,9 +245,7 @@ case "$tag" in
     KATEX=yes
     UCX_COMMIT=${_UCX_COMMIT}
     UCC_COMMIT=${_UCC_COMMIT}
-    if [[ $tag =~ "benchmarks" ]]; then
-      INDUCTOR_BENCHMARKS=yes
-    fi
+    INDUCTOR_BENCHMARKS=yes
     ;;
   pytorch-linux-noble-rocm-alpha-py3)
     ANACONDA_PYTHON_VERSION=3.12
@@ -204,6 +257,7 @@ case "$tag" in
     KATEX=yes
     UCX_COMMIT=${_UCX_COMMIT}
     UCC_COMMIT=${_UCC_COMMIT}
+    INDUCTOR_BENCHMARKS=yes
     PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
     ;;
   pytorch-linux-jammy-xpu-2025.0-py3)
@@ -1 +1 @@
-f7888497a1eb9e98d4c07537f0d0bcfe180d1363
+11ec6354315768a85da41032535e3b7b99c5f706
@@ -66,9 +66,8 @@ function do_cpython_build {
         ln -s pip3 ${prefix}/bin/pip
     fi
     # install setuptools since python 3.12 is required to use distutils
-    # packaging is needed to create symlink since wheel no longer provides needed information
-    ${prefix}/bin/pip install packaging==25.0 wheel==0.45.1 setuptools==80.9.0
-    local abi_tag=$(${prefix}/bin/python -c "from packaging.tags import interpreter_name, interpreter_version; import sysconfig ; from sysconfig import get_config_var; print('{0}{1}-{0}{1}{2}'.format(interpreter_name(), interpreter_version(), 't' if sysconfig.get_config_var('Py_GIL_DISABLED') else ''))")
+    ${prefix}/bin/pip install wheel==0.45.1 setuptools==80.9.0
+    local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))")
     ln -sf ${prefix} /opt/python/${abi_tag}
 }
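Note: the `abi_tag` computed above names the `/opt/python/<tag>` symlink. A standalone sketch of the `packaging.tags` variant (requires `pip install packaging`; the printed value depends on the interpreter, e.g. a free-threaded 3.13 build would yield `cp313-cp313t`):

```python
import sysconfig
from packaging.tags import interpreter_name, interpreter_version

# Mirrors the inline `python -c` command in the hunk above.
gil_disabled = sysconfig.get_config_var("Py_GIL_DISABLED")
abi_tag = "{0}{1}-{0}{1}{2}".format(
    interpreter_name(),           # e.g. "cp" for CPython
    interpreter_version(),        # e.g. "312"
    "t" if gil_disabled else "",  # "t" marks a free-threaded (no-GIL) build
)
print(abi_tag)  # e.g. "cp312-cp312"
```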
@@ -68,8 +68,8 @@ function install_nvshmem {
     # download, unpack, install
     wget -q "${url}"
     tar xf "${filename}.tar.gz"
-    cp -a "libnvshmem/include/"* /usr/local/cuda/include/
-    cp -a "libnvshmem/lib/"*     /usr/local/cuda/lib64/
+    cp -a "libnvshmem/include/"* /usr/local/include/
+    cp -a "libnvshmem/lib/"*     /usr/local/lib/

     # cleanup
     cd ..
@@ -15,37 +15,11 @@ function install_timm() {
   commit=$(get_pinned_commit timm)

   pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
 }

-function install_torchbench() {
-  local commit
-  commit=$(get_pinned_commit torchbench)
-  git clone https://github.com/pytorch/benchmark torchbench
-  pushd torchbench
-  git checkout "$commit"
-
-  python install.py --continue_on_fail
-
-  # TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
-  # is regressing speedup metric. This needs to be investigated further
-  pip install transformers==4.38.1
-
-  echo "Print all dependencies after TorchBench is installed"
-  python -mpip freeze
-  popd
-
-  chown -R jenkins torchbench
-  # Clean up
-  conda_run pip uninstall -y torch torchvision triton
-}
-
-# Pango is needed for weasyprint which is needed for doctr
-conda_install pango
-
-# Stable packages are ok here, just to satisfy TorchBench check
-pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
-
-install_torchbench
 install_huggingface
 install_timm

 # Clean up
 conda_run pip uninstall -y torch torchvision torchaudio triton torchao
@@ -103,5 +103,5 @@ fi
 # It depends on torch and triton. We don't want to install
 # triton and torch from production on Docker CI images
 if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then
-  pip_install helion --no-deps
+  pip_install helion==0.0.10 --no-deps
 fi
@@ -34,27 +34,18 @@ function install_ubuntu() {

     # The xpu-smi packages
     apt-get install -y flex bison xpu-smi
-    if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
-        # Compute and Media Runtimes
-        apt-get install -y \
-            intel-opencl-icd intel-level-zero-gpu level-zero \
-            intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
-            libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
-            libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
-            mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
-        # Development Packages
-        apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
-    else # rolling driver
-        apt-get install -y \
-            intel-opencl-icd libze-intel-gpu1 libze1 \
-            intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
-            libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
-            libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
-            mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
-        apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
-    fi
+    # Compute and Media Runtimes
+    apt-get install -y \
+        intel-opencl-icd intel-level-zero-gpu level-zero \
+        intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
+        libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
+        libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
+        mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
+    if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
+        apt-get install -y intel-ocloc
+    fi
+
+    # Development Packages
+    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
     # Install Intel Support Packages
     apt-get install -y ${XPU_PACKAGES}
@@ -139,11 +130,11 @@ function install_sles() {

 }

-# Default use GPU driver rolling releases
-XPU_DRIVER_VERSION=""
-if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
-    # Use GPU driver LTS releases
-    XPU_DRIVER_VERSION="/lts/2350"
+# Default use GPU driver LTS releases
+XPU_DRIVER_VERSION="/lts/2350"
+if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
+    # Use GPU driver rolling releases
+    XPU_DRIVER_VERSION=""
 fi

 # Default use Intel® oneAPI Deep Learning Essentials 2025.0
@@ -63,12 +63,11 @@ lark==0.12.0
 #Pinned versions: 0.12.0
 #test that import:

-librosa>=0.6.2 ; python_version < "3.11" and platform_machine != "s390x"
-librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
+librosa>=0.6.2 ; python_version < "3.11"
+librosa==0.10.2 ; python_version == "3.12"
 #Description: A python package for music and audio analysis
 #Pinned versions: >=0.6.2
 #test that import: test_spectral_ops.py
-#librosa depends on numba; disable it for s390x while numba is disabled too

 #mkl #this breaks linux-bionic-rocm4.5-py3.7
 #Description: Intel oneAPI Math Kernel Library
@@ -111,15 +110,14 @@ ninja==1.11.1.3
 #Pinned versions: 1.11.1.3
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py

-numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
-numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
-numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
-numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
+numba==0.49.0 ; python_version < "3.9"
+numba==0.55.2 ; python_version == "3.9"
+numba==0.55.2 ; python_version == "3.10"
+numba==0.60.0 ; python_version == "3.12"
 #Description: Just-In-Time Compiler for Numerical Functions
 #Pinned versions: 0.54.1, 0.49.0, <=0.49.1
 #test that import: test_numba_integration.py
 #For numba issue see https://github.com/pytorch/pytorch/issues/51511
-#Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073

 #numpy
 #Description: Provides N-dimensional arrays and linear algebra
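Note: the `; python_version ... and platform_machine != "s390x"` suffixes being removed above are PEP 508 environment markers, which pip evaluates against the installing interpreter. A small sketch of that evaluation, using `packaging.markers` (the same machinery pip builds on; version values taken from the requirement lines above):

```python
from packaging.markers import Marker

marker = Marker('python_version == "3.12" and platform_machine != "s390x"')

# Against the current interpreter's environment:
print(marker.evaluate())

# Against an explicit environment: numba==0.60.0 would be skipped on an
# s390x Python 3.12 host because the marker is false there.
print(marker.evaluate({"python_version": "3.12", "platform_machine": "s390x"}))  # False
```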
@@ -309,7 +307,7 @@ pytest-cpp==2.3.0
 #Pinned versions: 2.3.0
 #test that import:

-z3-solver==4.15.1.0 ; platform_machine != "s390x"
+z3-solver==4.15.1.0
 #Description: The Z3 Theorem Prover Project
 #Pinned versions:
 #test that import:
@@ -363,6 +361,7 @@ pwlf==2.2.1
 #Pinned versions: 2.2.1
 #test that import: test_sac_estimator.py


 # To build PyTorch itself
 pyyaml
 pyzstd
@@ -1,7 +1,7 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
--e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2

 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably
@@ -50,7 +50,7 @@ IPython==8.12.0
 #Pinned versions: 8.12.0

 myst-nb==0.17.2
-#Description: This is used to generate PyTorch functorch and torch.compile docs.
+#Description: This is used to generate PyTorch functorch and torch.compile docs
 #Pinned versions: 0.17.2

 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
@@ -98,9 +98,8 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface.txt huggingface.txt
 COPY ci_commit_pins/timm.txt timm.txt
-COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
@@ -98,9 +98,8 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface.txt huggingface.txt
 COPY ci_commit_pins/timm.txt timm.txt
-COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

 ARG TRITON
 ARG TRITON_CPU
@@ -138,11 +138,28 @@ fi

 echo "Calling setup.py bdist at $(date)"

-time CMAKE_ARGS=${CMAKE_ARGS[@]} \
-    EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
-    BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
-    USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
-    python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
+if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
+    echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
+    time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
+        BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 \
+        BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
+        USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
+        python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
+    echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
+    echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
+    time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
+        BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \
+        BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
+        USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
+        CMAKE_FRESH=1 python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
+    echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
+else
+    time CMAKE_ARGS=${CMAKE_ARGS[@]} \
+        EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
+        BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
+        USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
+        python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
+fi
 echo "Finished setup.py bdist at $(date)"

 # Build libtorch packages
@@ -255,6 +272,10 @@ ls /tmp/$WHEELHOUSE_DIR
 mkdir -p "/$WHEELHOUSE_DIR"
 mv /tmp/$WHEELHOUSE_DIR/torch*linux*.whl /$WHEELHOUSE_DIR/

+if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
+    mv /tmp/$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/ || true
+fi
+
 if [[ -n "$BUILD_PYTHONLESS" ]]; then
     mkdir -p /$LIBTORCH_HOUSE_DIR
     mv /tmp/$LIBTORCH_HOUSE_DIR/*.zip /$LIBTORCH_HOUSE_DIR
@@ -431,8 +452,16 @@ if [[ -z "$BUILD_PYTHONLESS" ]]; then
     pushd $PYTORCH_ROOT/test

     # Install the wheel for this Python version
+    if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
+        pip uninstall -y "$TORCH_NO_PYTHON_PACKAGE_NAME" || true
+    fi
+
     pip uninstall -y "$TORCH_PACKAGE_NAME"

+    if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
+        pip install "$TORCH_NO_PYTHON_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
+    fi
+
     pip install "$TORCH_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v

     # Print info on the libraries installed in this wheel
@@ -194,7 +194,7 @@ ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library
 ROCBLAS_LIB_DST=lib/rocblas/library
 ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH)
 ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx)
-ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $ROCBLAS_OTHER_FILES)
+ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES)

 # hipblaslt library files
 HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library
@@ -50,6 +50,9 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
   export ATEN_THREADING=NATIVE
 fi

+# Enable LLVM dependency for TensorExpr testing
+export USE_LLVM=/opt/llvm
+export LLVM_DIR=/opt/llvm/lib/cmake/llvm

 if ! which conda; then
   # In ROCm CIs, we are doing cross compilation on build machines with
@@ -173,7 +176,7 @@ fi

 # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
 # memory to build and will OOM
-if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1; then
+if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
   export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j 2"
 fi

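Note: the two conditions above differ when `TORCH_CUDA_ARCH_LIST` holds several space-separated architectures: `echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc` only behaves sensibly for a single numeric entry, while the `tr`/`sed` pipeline tests each entry. A Python sketch of the per-entry logic (assumes plain numeric entries; suffixed forms such as `8.0+PTX` would need stripping first):

```python
def builds_flash_attention(arch_list: str) -> bool:
    # Mirrors: echo "$arch_list" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1
    # i.e. true if ANY listed compute capability is >= 8.0.
    return any(float(arch) >= 8.0 for arch in arch_list.split())

assert builds_flash_attention("8.6") is True
assert builds_flash_attention("7.5 8.0") is True   # multi-entry lists now work
assert builds_flash_attention("7.5") is False
```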
@@ -189,6 +192,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
   export USE_ASAN=1
   export REL_WITH_DEB_INFO=1
   export UBSAN_FLAGS="-fno-sanitize-recover=all"
+  unset USE_LLVM
 fi

 if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then
@@ -261,13 +265,22 @@ else

     WERROR=1 python setup.py clean

-    WERROR=1 python setup.py bdist_wheel
+    if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
+      python3 tools/packaging/split_wheel.py bdist_wheel
+    else
+      WERROR=1 python setup.py bdist_wheel
+    fi
   else
     python setup.py clean
     if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
       source .ci/pytorch/install_cache_xla.sh
     fi
-    python setup.py bdist_wheel
+    if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
+      echo "USE_SPLIT_BUILD cannot be used with xla or rocm"
+      exit 1
+    else
+      python setup.py bdist_wheel
+    fi
   fi
   pip_install_whl "$(echo dist/*.whl)"

@@ -229,6 +229,7 @@ function install_torchrec_and_fbgemm() {

   pip_install tabulate  # needed for newer fbgemm
   pip_install patchelf  # needed for rocm fbgemm
+  pushd /tmp

   local wheel_dir=dist/fbgemm_gpu
   local found_whl=0
@@ -244,7 +245,7 @@ function install_torchrec_and_fbgemm() {
     if [ "${found_whl}" == "0" ]; then
       git clone --recursive https://github.com/pytorch/fbgemm
       pushd fbgemm/fbgemm_gpu
-      git checkout "${fbgemm_commit}" --recurse-submodules
+      git checkout "${fbgemm_commit}"
       python setup.py bdist_wheel \
         --build-variant=rocm \
         -DHIP_ROOT_DIR="${ROCM_PATH}" \
@@ -263,6 +264,7 @@ function install_torchrec_and_fbgemm() {
     done

     rm -rf fbgemm
+    popd
   else
     pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
     pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
@@ -281,6 +283,30 @@ function clone_pytorch_xla() {
   fi
 }

+function checkout_install_torchbench() {
+  local commit
+  commit=$(get_pinned_commit torchbench)
+  git clone https://github.com/pytorch/benchmark torchbench
+  pushd torchbench
+  git checkout "$commit"
+
+  if [ "$1" ]; then
+    python install.py --continue_on_fail models "$@"
+  else
+    # Occasionally the installation may fail on one model but it is ok to continue
+    # to install and test other models
+    python install.py --continue_on_fail
+  fi
+
+  # TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
+  # is regressing speedup metric. This needs to be investigated further
+  pip install transformers==4.38.1
+
+  echo "Print all dependencies after TorchBench is installed"
+  python -mpip freeze
+  popd
+}
+
 function install_torchao() {
   local commit
   commit=$(get_pinned_commit torchao)
@@ -157,29 +157,6 @@ test_jit_hooks() {
   assert_git_not_dirty
 }

-# Shellcheck doesn't like it when you pass no arguments to a function
-# that can take args. See https://www.shellcheck.net/wiki/SC2120
-# shellcheck disable=SC2120
-checkout_install_torchbench() {
-  local commit
-  commit=$(cat .ci/docker/ci_commit_pins/torchbench.txt)
-  git clone https://github.com/pytorch/benchmark torchbench
-  pushd torchbench
-  git checkout "$commit"
-
-  if [ "$1" ]; then
-    python install.py --continue_on_fail models "$@"
-  else
-    # Occasionally the installation may fail on one model but it is ok to continue
-    # to install and test other models
-    python install.py --continue_on_fail
-  fi
-
-  echo "Print all dependencies after TorchBench is installed"
-  python -mpip freeze
-  popd
-}
-
 torchbench_setup_macos() {
   git clone --recursive https://github.com/pytorch/vision torchvision
   git clone --recursive https://github.com/pytorch/audio torchaudio
@@ -202,6 +179,8 @@ torchbench_setup_macos() {
   USE_OPENMP=0 python setup.py develop
   popd

+  # Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120
+  # shellcheck disable=SC2119,SC2120
   checkout_install_torchbench
 }

@@ -627,8 +627,6 @@ test_perf_for_dashboard() {
       device=cuda_a10g
     elif [[ "${TEST_CONFIG}" == *h100* ]]; then
       device=cuda_h100
-    elif [[ "${TEST_CONFIG}" == *b200* ]]; then
-      device=cuda_b200
     elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
       device=rocm
     fi
@@ -803,16 +801,6 @@ test_dynamo_benchmark() {
   if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
     test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
   elif [[ "${TEST_CONFIG}" == *perf* ]]; then
-    # TODO (huydhn): Just smoke test some sample models
-    if [[ "${TEST_CONFIG}" == *b200* ]]; then
-      if [[ "${suite}" == "huggingface" ]]; then
-        export TORCHBENCH_ONLY_MODELS="DistillGPT2"
-      elif [[ "${suite}" == "timm_models" ]]; then
-        export TORCHBENCH_ONLY_MODELS="inception_v3"
-      elif [[ "${suite}" == "torchbench" ]]; then
-        export TORCHBENCH_ONLY_MODELS="hf_Bert"
-      fi
-    fi
     test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
   else
     if [[ "${TEST_CONFIG}" == *cpu* ]]; then
@@ -1051,10 +1039,20 @@ test_libtorch_api() {
     mkdir -p $TEST_REPORTS_DIR

     OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
+    "$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
   else
     # Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy
     OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"
+
+    # On s390x, pytorch is built without llvm.
+    # Even if it would be built with llvm, llvm currently doesn't support used features on s390x and
+    # test fails with errors like:
+    # JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer
+    # unknown file: Failure
+    # C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) }
+    if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
+      python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
+    fi
   fi

   # quantization is not fully supported on s390x yet
@@ -1674,11 +1672,13 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
 elif [[ "${TEST_CONFIG}" == cachebench ]]; then
   install_torchaudio
   install_torchvision
-  PYTHONPATH=/torchbench test_cachebench
+  checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco
+  PYTHONPATH=$(pwd)/torchbench test_cachebench
 elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
   install_torchaudio
   install_torchvision
-  PYTHONPATH=/torchbench test_verify_cachebench
+  checkout_install_torchbench nanogpt
+  PYTHONPATH=$(pwd)/torchbench test_verify_cachebench
 elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
   install_torchaudio
   install_torchvision
@@ -1687,22 +1687,28 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
     # https://github.com/opencv/opencv-python/issues/885
     pip_install opencv-python==4.8.0.74
     if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
-      PYTHONPATH=/torchbench test_inductor_torchbench_smoketest_perf
+      checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer
+      PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
     elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
-      PYTHONPATH=/torchbench test_inductor_torchbench_cpu_smoketest_perf
+      checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \
+        llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \
+        functorch_maml_omniglot yolov3 mobilenet_v2 resnext50_32x4d densenet121 mnasnet1_0
+      PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf
     elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
-      TORCHBENCHPATH=/torchbench test_torchbench_gcp_smoketest
+      checkout_install_torchbench
+      TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest
     else
+      checkout_install_torchbench
       # Do this after checkout_install_torchbench to ensure we clobber any
       # nightlies that torchbench may pull in
       if [[ "${TEST_CONFIG}" != *cpu* ]]; then
         install_torchrec_and_fbgemm
       fi
-      PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
+      PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
     fi
 elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
   install_torchvision
-  PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
+  PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
   if [[ "$SHARD_NUMBER" -eq "1" ]]; then
     test_inductor_aoti
   fi
@@ -192,6 +192,9 @@ retry brew install libomp
 # For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
 export USE_DISTRIBUTED=1

+if [[ -n "$CROSS_COMPILE_ARM64" ]]; then
+    export CMAKE_OSX_ARCHITECTURES=arm64
+fi
 export USE_MKLDNN=OFF
 export USE_QNNPACK=OFF
 export BUILD_TEST=OFF
@@ -199,7 +202,16 @@ export BUILD_TEST=OFF
 pushd "$pytorch_rootdir"
 echo "Calling setup.py bdist_wheel at $(date)"

-python setup.py bdist_wheel -d "$whl_tmp_dir"
+if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
+    echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
+    BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel -d "$whl_tmp_dir"
+    echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
+    echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
+    BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 CMAKE_FRESH=1 python setup.py bdist_wheel -d "$whl_tmp_dir"
+    echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
+else
+    python setup.py bdist_wheel -d "$whl_tmp_dir"
+fi

 echo "Finished setup.py bdist_wheel at $(date)"

@@ -65,8 +65,16 @@ fi

 if [[ "$PACKAGE_TYPE" != libtorch ]]; then
   if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
-    pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
-    retry pip install -q numpy protobuf typing-extensions
+    if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
+      pkg_no_python="$(ls -1 /final_pkgs/torch_no_python* | sort |tail -1)"
+      pkg_torch="$(ls -1 /final_pkgs/torch-* | sort |tail -1)"
+      # todo: after folder is populated use the pypi_pkg channel instead
+      pip install "\$pkg_no_python" "\$pkg_torch" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}_pypi_pkg"
+      retry pip install -q numpy protobuf typing-extensions
+    else
+      pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
+      retry pip install -q numpy protobuf typing-extensions
+    fi
   else
     pip install "\$pkg"
     retry pip install -q numpy protobuf typing-extensions

@@ -134,6 +134,7 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}"
 export DESIRED_CUDA="$DESIRED_CUDA"
 export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}"
 export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}"
+export USE_SPLIT_BUILD="${USE_SPLIT_BUILD:-}"
 if [[ "${OSTYPE}" == "msys" ]]; then
   export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}"
   if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then
@@ -23,6 +23,10 @@ if [[ "${DRY_RUN}" = "disabled" ]]; then
   AWS_S3_CP="aws s3 cp"
 fi

+if [[ "${USE_SPLIT_BUILD:-false}" == "true" ]]; then
+  UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_pkg"
+fi
+
 # this is special build with all dependencies packaged
 if [[ ${BUILD_NAME} == *-full* ]]; then
   UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_full"
@@ -24,6 +24,7 @@ runs:
           -e PYTORCH_FINAL_PACKAGE_DIR \
           -e PYTORCH_ROOT \
           -e SKIP_ALL_TESTS \
+          -e USE_SPLIT_BUILD \
           --tty \
           --detach \
           -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
.github/ci_commit_pins/audio.txt (vendored, 2 changes)

@@ -1 +1 @@
-e500f0cf88bc57ffd8b0029033da305eef24ae25
+bf305f538005f2e900f8850ed57146024a8bc559
.github/ci_commit_pins/vllm.txt (vendored, 2 changes)

@@ -1 +1 @@
-35afe1b30b154114dc2ee8329e12f8cf3fe9f576
+ca9e2be3ed6320b51f52f536595cd24e254f8bb2
.github/ci_commit_pins/xla.txt (vendored, 2 changes)

@@ -1 +1 @@
-095faec1e7b6cc47220181e74ae9cde2605f9b00
+29ae4c76c026185f417a25e841d2cd5e65f087a3
.github/merge_rules.yaml (vendored, 4 changes)

@@ -488,10 +488,6 @@
     - torch/_dynamo/**
     - torch/csrc/dynamo/**
     - test/dynamo/**
-    - test/dynamo_expected_failures/**
-    - test/dynamo_skips/**
-    - test/inductor_expected_failures/**
-    - test/inductor_skips/**
   approved_by:
     - guilhermeleobas
   mandatory_checks_name:
.github/scripts/generate_binary_build_matrix.py (vendored, 18 changes)

@@ -193,7 +193,7 @@ LIBTORCH_CONTAINER_IMAGES: dict[str, str] = {
     "cpu": "libtorch-cxx11-builder:cpu",
 }

-FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
+FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]


 def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:

@@ -273,6 +273,7 @@ def generate_wheels_matrix(
     os: str,
     arches: Optional[list[str]] = None,
     python_versions: Optional[list[str]] = None,
+    use_split_build: bool = False,
 ) -> list[dict[str, str]]:
     package_type = "wheel"
     if os == "linux" or os == "linux-aarch64" or os == "linux-s390x":

@@ -314,11 +315,15 @@ def generate_wheels_matrix(
                 # TODO: Enable python 3.13t on cpu-s390x
                 if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
                     continue
-                # TODO: Enable python 3.14 on non linux OSes
-                if os != "linux" and (
-                    python_version == "3.14" or python_version == "3.14t"
+
+                if use_split_build and (
+                    arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux"
                 ):
-                    continue
+                    raise RuntimeError(
+                        "Split build is only supported on linux with cuda 12* and cpu.\n"
+                        f"Currently attempting to build on arch version {arch_version} and os {os}.\n"
+                        "Please modify the matrix generation to exclude this combination."
+                    )

                 # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install

@@ -334,6 +339,7 @@ def generate_wheels_matrix(
                         "gpu_arch_type": gpu_arch_type,
                         "gpu_arch_version": gpu_arch_version,
                         "desired_cuda": desired_cuda,
+                        "use_split_build": "True" if use_split_build else "False",
                         "container_image": WHEEL_CONTAINER_IMAGES[arch_version].split(
                             ":"
                         )[0],

@@ -366,6 +372,7 @@ def generate_wheels_matrix(
                         "desired_cuda": translate_desired_cuda(
                             gpu_arch_type, gpu_arch_version
                         ),
+                        "use_split_build": "True" if use_split_build else "False",
                         "container_image": WHEEL_CONTAINER_IMAGES[
                             arch_version
                         ].split(":")[0],

@@ -388,6 +395,7 @@ def generate_wheels_matrix(
                         "desired_cuda": translate_desired_cuda(
                             gpu_arch_type, gpu_arch_version
                         ),
+                        "use_split_build": "True" if use_split_build else "False",
                         "container_image": WHEEL_CONTAINER_IMAGES[arch_version].split(
                             ":"
                         )[0],
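Note: per the hunks above, `generate_wheels_matrix` gains a `use_split_build` flag that is validated eagerly and threaded into every config dict. A hedged usage sketch (the import path and call shapes are inferred from the diff, not from running the script):

```python
from generate_binary_build_matrix import generate_wheels_matrix

# Supported combination: linux with cuda 12* or cpu.
configs = generate_wheels_matrix(
    "linux", arches=["12.8"], python_versions=["3.9"], use_split_build=True
)
assert all(cfg["use_split_build"] == "True" for cfg in configs)

# Unsupported combinations now raise instead of being silently skipped.
try:
    generate_wheels_matrix("macos-arm64", use_split_build=True)
except RuntimeError as err:
    print(err)  # "Split build is only supported on linux with cuda 12* and cpu. ..."
```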
.github/scripts/generate_ci_workflows.py (vendored, 42 changes)

@@ -59,7 +59,9 @@ class BinaryBuildWorkflow:
     is_scheduled: str = ""
     branches: str = "nightly"
     # Mainly for macos
+    cross_compile_arm64: bool = False
     macos_runner: str = "macos-14-xlarge"
+    use_split_build: bool = False
     # Mainly used for libtorch builds
     build_variant: str = ""

@@ -70,6 +72,9 @@ class BinaryBuildWorkflow:
             for item in [self.os, "binary", self.package_type, self.build_variant]
             if item != ""
         )
+        if self.use_split_build:
+            # added to distinguish concurrency groups
+            self.build_environment += "-split"

     def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         output_file_path = (

@@ -112,6 +117,21 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
             isolated_workflow=True,
         ),
     ),
+    # See https://github.com/pytorch/pytorch/issues/138750
+    # BinaryBuildWorkflow(
+    #     os=OperatingSystem.LINUX,
+    #     package_type="manywheel",
+    #     build_configs=generate_binary_build_matrix.generate_wheels_matrix(
+    #         OperatingSystem.LINUX,
+    #         use_split_build=True,
+    #         arches=["11.8", "12.1", "12.4", "cpu"],
+    #     ),
+    #     ciflow_config=CIFlowConfig(
+    #         labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
+    #         isolated_workflow=True,
+    #     ),
+    #     use_split_build=True,
+    # ),
     BinaryBuildWorkflow(
         os=OperatingSystem.LINUX,
         package_type="libtorch",

@@ -155,11 +175,27 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
         package_type="manywheel",
         build_configs=generate_binary_build_matrix.generate_wheels_matrix(
             OperatingSystem.LINUX,
-            arches=["12.8"],
-            python_versions=["3.12"],
+            arches=["12.6", "12.8", "12.9"],
+            python_versions=["3.9"],
         ),
         branches="main",
     ),
+    # See https://github.com/pytorch/pytorch/issues/138750
+    # BinaryBuildWorkflow(
+    #     os=OperatingSystem.LINUX,
+    #     package_type="manywheel",
+    #     build_configs=generate_binary_build_matrix.generate_wheels_matrix(
+    #         OperatingSystem.LINUX,
+    #         arches=["11.8", "12.1", "12.4"],
+    #         python_versions=["3.9"],
+    #         use_split_build=True,
+    #     ),
+    #     ciflow_config=CIFlowConfig(
+    #         labels={LABEL_CIFLOW_PERIODIC},
+    #     ),
+    #     branches="main",
+    #     use_split_build=True,
+    # ),
     BinaryBuildWorkflow(
         os=OperatingSystem.LINUX,
         package_type="libtorch",

@@ -302,6 +338,7 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
             generate_binary_build_matrix.RELEASE,
             libtorch_variants=["shared-with-deps"],
         ),
+        cross_compile_arm64=False,
        macos_runner="macos-14-xlarge",
         ciflow_config=CIFlowConfig(
             labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},

@@ -314,6 +351,7 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
         build_configs=generate_binary_build_matrix.generate_wheels_matrix(
             OperatingSystem.MACOS_ARM64
         ),
+        cross_compile_arm64=False,
         macos_runner="macos-14-xlarge",
         ciflow_config=CIFlowConfig(
             labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
.github/scripts/runner_determinator.py (vendored, 7 changes)

@@ -262,12 +262,7 @@ def is_exception_branch(branch: str) -> bool:
     """
     Branches that get opted out of experiments by default, until they're explicitly enabled.
     """
-    return branch.split("/", maxsplit=1)[0] in {
-        "main",
-        "nightly",
-        "release",
-        "landchecks",
-    }
+    return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}


 def load_yaml(yaml_text: str) -> Any:
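Note: the one-line rewrite of `is_exception_branch` is behavior-preserving; element `[0]` of `str.split` does not depend on `maxsplit`. A quick check:

```python
def is_exception_branch(branch: str) -> bool:
    return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}

assert is_exception_branch("release/2.8")
assert not is_exception_branch("gh/user/123/head")
# maxsplit only changes how the remainder is split, never the first element:
assert "release/2.8/x".split("/", maxsplit=1)[0] == "release/2.8/x".split("/")[0]
```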
@@ -47,6 +47,9 @@ env:
   GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
   PR_NUMBER: ${{ github.event.pull_request.number }}
   SKIP_ALL_TESTS: 0
+{%- if cross_compile_arm64 %}
+  CROSS_COMPILE_ARM64: 1
+{% endif %}
 !{{ common.concurrency(build_environment) }}

 jobs:
.github/templates/upload.yml.j2 (vendored, 5 changes)

@@ -25,6 +25,11 @@
       DOCKER_IMAGE: !{{ config["container_image"] }}
       DOCKER_IMAGE_TAG_PREFIX: !{{ config["container_image_tag_prefix"] }}
 {%- endif %}
+{%- if config["package_type"] == "manywheel" %}
+{%- if config.use_split_build is defined %}
+      use_split_build: !{{ config["use_split_build"] }}
+{%- endif %}
+{%- endif %}
 {%- if config["package_type"] == "libtorch" %}
 {%- if config["libtorch_config"] %}
       LIBTORCH_CONFIG: !{{ config["libtorch_config"] }}
.github/workflows/_binary-build-linux.yml (vendored, 10 changes)

@@ -26,6 +26,13 @@ on:
         default: 240
         type: number
         description: timeout for the job
+      use_split_build:
+        description: |
+          [Experimental] Build a libtorch only wheel and build pytorch such that
+          are built from the libtorch wheel.
+        required: false
+        type: boolean
+        default: false
       ALPINE_IMAGE:
         required: false
         type: string

@@ -110,6 +117,7 @@ jobs:
       PR_NUMBER: ${{ github.event.pull_request.number }}
       PYTORCH_FINAL_PACKAGE_DIR: /artifacts
       SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+      USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
     steps:
       - name: Make the env permanent during this workflow (but not the secrets)
         shell: bash

@@ -134,6 +142,7 @@ jobs:
             echo "PR_NUMBER=${{ env.PR_NUMBER }}"
             echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
             echo "SHA1=${{ env.SHA1 }}"
+            echo "USE_SPLIT_BUILD=${{ env.use_split_build }}"
           } >> "${GITHUB_ENV} }}"

       - name: List the env

@@ -252,6 +261,7 @@ jobs:
           -e PYTORCH_ROOT \
           -e SKIP_ALL_TESTS \
           -e PYTORCH_EXTRA_INSTALL_REQUIREMENTS \
+          -e USE_SPLIT_BUILD \
           --tty \
           --detach \
           -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
.github/workflows/_binary-test-linux.yml (vendored, 9 changes)

@@ -64,6 +64,13 @@ on:
         required: true
         type: string
         description: Hardware to run this job on. Valid values are linux.4xlarge, linux.4xlarge.nvidia.gpu, linux.arm64.2xlarge, and linux.rocm.gpu
+      use_split_build:
+        description: |
+          [Experimental] Build a libtorch only wheel and build pytorch such that
+          are built from the libtorch wheel.
+        required: false
+        type: boolean
+        default: false
     secrets:
       github-token:
         required: true

@@ -97,6 +104,7 @@ jobs:
       PR_NUMBER: ${{ github.event.pull_request.number }}
       PYTORCH_FINAL_PACKAGE_DIR: /artifacts
       SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+      USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
     steps:
       - name: Make the env permanent during this workflow (but not the secrets)
         shell: bash

@@ -121,6 +129,7 @@ jobs:
             echo "PR_NUMBER=${{ env.PR_NUMBER }}"
             echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
             echo "SHA1=${{ env.SHA1 }}"
+            echo "USE_SPLIT_BUILD=${{ env.USE_SPLIT_BUILD }}"
           } >> "${GITHUB_ENV} }}"

       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
.github/workflows/_binary-upload.yml (vendored, 8 changes)

@@ -51,6 +51,13 @@ on:
         required: false
         type: string
         description: Desired python version
+      use_split_build:
+        description: |
+          [Experimental] Build a libtorch only wheel and build pytorch such that
+          are built from the libtorch wheel.
+        required: false
+        type: boolean
+        default: false
     secrets:
       github-token:
         required: true

@@ -79,6 +86,7 @@ jobs:
       PR_NUMBER: ${{ github.event.pull_request.number }}
       PYTORCH_FINAL_PACKAGE_DIR: /artifacts
       SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+      USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
     steps:
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
.github/workflows/_linux-build.yml (vendored, 1 change)

@@ -306,6 +306,7 @@ jobs:
           -e OUR_GITHUB_JOB_ID \
           -e HUGGING_FACE_HUB_TOKEN \
           -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
+          -e USE_SPLIT_BUILD \
           -e BUILD_ADDITIONAL_PACKAGES \
           --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
           --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
.github/workflows/_linux-test.yml (vendored, 20 changes)

@@ -96,7 +96,7 @@ jobs:
     steps:
       - name: Setup SSH (Click me for login details)
         uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
+        if: ${{ matrix.runner != 'B200' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
           instructions: |

@@ -109,7 +109,7 @@ jobs:
           no-sudo: true

       - name: Setup Python
-        if: contains(matrix.runner, 'b200')
+        if: matrix.runner == 'B200'
         uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
         with:
           python-version: '3.12'

@@ -117,7 +117,7 @@ jobs:

       - name: Setup Linux
         uses: ./.github/actions/setup-linux
-        if: inputs.build-environment != 'linux-s390x-binary-manywheel' && !contains(matrix.runner, 'b200')
+        if: inputs.build-environment != 'linux-s390x-binary-manywheel' && matrix.runner != 'B200'

       - name: configure aws credentials
         if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}

@@ -128,7 +128,7 @@ jobs:
           aws-region: us-east-1

       - name: Login to Amazon ECR
-        if: ${{ inputs.aws-role-to-assume != '' && contains(matrix.runner, 'b200') }}
+        if: ${{ inputs.aws-role-to-assume != '' && matrix.runner == 'B200' }}
         id: login-ecr
         continue-on-error: true
         uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1

@@ -166,17 +166,17 @@ jobs:
         uses: pytorch/test-infra/.github/actions/setup-nvidia@main
         with:
           driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
-        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
+        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && matrix.runner != 'B200' }}

       - name: Setup GPU_FLAG for docker run
         id: setup-gpu-flag
         run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
-        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || contains(matrix.runner, 'b200')) }}
+        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || matrix.runner == 'B200') }}

       - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
         id: setup-sscache-port-flag
         run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
-        if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && !contains(matrix.runner, 'b200') }}
+        if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && matrix.runner != 'B200' }}

       - name: Lock NVIDIA A100 40GB Frequency
         run: |

@@ -277,8 +277,8 @@ jobs:
           NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
           TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
           # Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
-          SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
-          SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
+          SCCACHE_BUCKET: ${{ matrix.runner != 'B200' && 'ossci-compiler-cache-circleci-v2' || '' }}
+          SCCACHE_REGION: ${{ matrix.runner != 'B200' && 'us-east-1' || '' }}
           SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
           DOCKER_IMAGE: ${{ inputs.docker-image }}
           XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}

@@ -403,7 +403,7 @@ jobs:
           job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}

       - name: Authenticate with AWS
-        if: ${{ contains(matrix.runner, 'b200') }}
+        if: ${{ matrix.runner == 'B200' }}
         uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
         with:
           role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
.github/workflows/check-labels.yml
vendored
3
.github/workflows/check-labels.yml
vendored
@ -34,8 +34,7 @@ jobs:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
name: Check labels
|
||||
# Disabling the job until https://github.com/pytorch/pytorch/issues/159825 is resolved
|
||||
if: github.repository_owner == 'pytorch' && false
|
||||
if: github.repository_owner == 'pytorch'
|
||||
runs-on: linux.24_04.4x
|
||||
steps:
|
||||
- name: Checkout PyTorch
|
||||
|
||||
@@ -7,8 +7,7 @@ on:

 jobs:
   ghstack-mergeability-check:
-    # Disabling the job until https://github.com/pytorch/pytorch/issues/159825 is resolved
-    if: github.repository_owner == 'pytorch' && false
+    if: github.repository_owner == 'pytorch'
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
.github/workflows/docker-builds.yml (vendored, 9 changes)

@@ -51,17 +51,21 @@ jobs:
         docker-image-name: [
           pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
           pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks,
           pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks,
           pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
           pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
           pytorch-linux-jammy-py3.9-clang12,
           pytorch-linux-jammy-py3.11-clang12,
           pytorch-linux-jammy-py3.12-clang12,
           pytorch-linux-jammy-py3.13-clang12,
           pytorch-linux-jammy-rocm-n-py3,
           pytorch-linux-noble-rocm-n-py3,
           pytorch-linux-noble-rocm-alpha-py3,
           pytorch-linux-jammy-rocm-n-py3-benchmarks,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
           pytorch-linux-jammy-py3.9-gcc11,
           pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,

@@ -72,8 +76,7 @@ jobs:
           pytorch-linux-jammy-py3-clang12-onnx,
           pytorch-linux-jammy-linter,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
-          # Executorch pin needs update
-          # pytorch-linux-jammy-py3-clang12-executorch,
+          pytorch-linux-jammy-py3-clang12-executorch,
           pytorch-linux-jammy-py3.12-triton-cpu
         ]
         include:
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml (generated, vendored, 30 changes)

@@ -60,6 +60,7 @@ jobs:
       GPU_ARCH_TYPE: cpu-aarch64
       DOCKER_IMAGE: manylinux2_28_aarch64-builder
      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
+      use_split_build: False
       DESIRED_PYTHON: "3.9"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       runs_on: linux.arm64.m7g.4xlarge.ephemeral

@@ -83,6 +84,7 @@ jobs:
       GPU_ARCH_TYPE: cpu-aarch64
       DOCKER_IMAGE: manylinux2_28_aarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
+      use_split_build: False
       DESIRED_PYTHON: "3.9"
       build_name: manywheel-py3_9-cpu-aarch64
       build_environment: linux-aarch64-binary-manywheel

@@ -106,6 +108,7 @@ jobs:
       GPU_ARCH_TYPE: cpu-aarch64
       DOCKER_IMAGE: manylinux2_28_aarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
+      use_split_build: False
       DESIRED_PYTHON: "3.9"
       build_name: manywheel-py3_9-cpu-aarch64
     secrets:

@@ -126,6 +129,7 @@ jobs:
       GPU_ARCH_TYPE: cuda-aarch64
       DOCKER_IMAGE: manylinuxaarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      use_split_build: False
       DESIRED_PYTHON: "3.9"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       runs_on: linux.arm64.m7g.4xlarge.ephemeral

@@ -152,6 +156,7 @@ jobs:
       GPU_ARCH_TYPE: cuda-aarch64
       DOCKER_IMAGE: manylinuxaarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      use_split_build: False
       DESIRED_PYTHON: "3.9"
       build_name: manywheel-py3_9-cuda-aarch64-12_9
     secrets:

@@ -171,6 +176,7 @@ jobs:
       GPU_ARCH_TYPE: cpu-aarch64
       DOCKER_IMAGE: manylinux2_28_aarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
+      use_split_build: False
       DESIRED_PYTHON: "3.10"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       runs_on: linux.arm64.m7g.4xlarge.ephemeral

@@ -194,6 +200,7 @@ jobs:
       GPU_ARCH_TYPE: cpu-aarch64
       DOCKER_IMAGE: manylinux2_28_aarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
+      use_split_build: False
       DESIRED_PYTHON: "3.10"
       build_name: manywheel-py3_10-cpu-aarch64
       build_environment: linux-aarch64-binary-manywheel

@@ -217,6 +224,7 @@ jobs:
       GPU_ARCH_TYPE: cpu-aarch64
       DOCKER_IMAGE: manylinux2_28_aarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
+      use_split_build: False
       DESIRED_PYTHON: "3.10"
       build_name: manywheel-py3_10-cpu-aarch64
     secrets:

@@ -237,6 +245,7 @@ jobs:
       GPU_ARCH_TYPE: cuda-aarch64
       DOCKER_IMAGE: manylinuxaarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      use_split_build: False
       DESIRED_PYTHON: "3.10"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       runs_on: linux.arm64.m7g.4xlarge.ephemeral

@@ -263,6 +272,7 @@ jobs:
       GPU_ARCH_TYPE: cuda-aarch64
       DOCKER_IMAGE: manylinuxaarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      use_split_build: False
       DESIRED_PYTHON: "3.10"
       build_name: manywheel-py3_10-cuda-aarch64-12_9
     secrets:

@@ -282,6 +292,7 @@ jobs:
       GPU_ARCH_TYPE: cpu-aarch64
       DOCKER_IMAGE: manylinux2_28_aarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
+      use_split_build: False
       DESIRED_PYTHON: "3.11"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       runs_on: linux.arm64.m7g.4xlarge.ephemeral

@@ -305,6 +316,7 @@ jobs:
       GPU_ARCH_TYPE: cpu-aarch64
       DOCKER_IMAGE: manylinux2_28_aarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
+      use_split_build: False
       DESIRED_PYTHON: "3.11"
       build_name: manywheel-py3_11-cpu-aarch64
       build_environment: linux-aarch64-binary-manywheel

@@ -328,6 +340,7 @@ jobs:
       GPU_ARCH_TYPE: cpu-aarch64
       DOCKER_IMAGE: manylinux2_28_aarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
+      use_split_build: False
       DESIRED_PYTHON: "3.11"
       build_name: manywheel-py3_11-cpu-aarch64
     secrets:

@@ -348,6 +361,7 @@ jobs:
       GPU_ARCH_TYPE: cuda-aarch64
       DOCKER_IMAGE: manylinuxaarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      use_split_build: False
       DESIRED_PYTHON: "3.11"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       runs_on: linux.arm64.m7g.4xlarge.ephemeral

@@ -374,6 +388,7 @@ jobs:
       GPU_ARCH_TYPE: cuda-aarch64
       DOCKER_IMAGE: manylinuxaarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      use_split_build: False
       DESIRED_PYTHON: "3.11"
       build_name: manywheel-py3_11-cuda-aarch64-12_9
     secrets:

@@ -393,6 +408,7 @@ jobs:
       GPU_ARCH_TYPE: cpu-aarch64
       DOCKER_IMAGE: manylinux2_28_aarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
+      use_split_build: False
       DESIRED_PYTHON: "3.12"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       runs_on: linux.arm64.m7g.4xlarge.ephemeral

@@ -416,6 +432,7 @@ jobs:
       GPU_ARCH_TYPE: cpu-aarch64
       DOCKER_IMAGE: manylinux2_28_aarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
+      use_split_build: False
       DESIRED_PYTHON: "3.12"
       build_name: manywheel-py3_12-cpu-aarch64
       build_environment: linux-aarch64-binary-manywheel

@@ -439,6 +456,7 @@ jobs:
       GPU_ARCH_TYPE: cpu-aarch64
       DOCKER_IMAGE: manylinux2_28_aarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
+      use_split_build: False
       DESIRED_PYTHON: "3.12"
       build_name: manywheel-py3_12-cpu-aarch64
     secrets:

@@ -459,6 +477,7 @@ jobs:
       GPU_ARCH_TYPE: cuda-aarch64
       DOCKER_IMAGE: manylinuxaarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      use_split_build: False
       DESIRED_PYTHON: "3.12"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       runs_on: linux.arm64.m7g.4xlarge.ephemeral

@@ -485,6 +504,7 @@ jobs:
       GPU_ARCH_TYPE: cuda-aarch64
       DOCKER_IMAGE: manylinuxaarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      use_split_build: False
       DESIRED_PYTHON: "3.12"
       build_name: manywheel-py3_12-cuda-aarch64-12_9
     secrets:

@@ -504,6 +524,7 @@ jobs:
       GPU_ARCH_TYPE: cpu-aarch64
       DOCKER_IMAGE: manylinux2_28_aarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
+      use_split_build: False
       DESIRED_PYTHON: "3.13"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       runs_on: linux.arm64.m7g.4xlarge.ephemeral

@@ -527,6 +548,7 @@ jobs:
       GPU_ARCH_TYPE: cpu-aarch64
       DOCKER_IMAGE: manylinux2_28_aarch64-builder
       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
+      use_split_build: False
       DESIRED_PYTHON: "3.13"
       build_name: manywheel-py3_13-cpu-aarch64
       build_environment: linux-aarch64-binary-manywheel

@@ -550,6 +572,7 @@
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: manylinux2_28_aarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cpu-aarch64
|
||||
secrets:
|
||||
@ -570,6 +593,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.13"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
@ -596,6 +620,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_9
|
||||
secrets:
|
||||
@ -615,6 +640,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: manylinux2_28_aarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.13t"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
@ -638,6 +664,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: manylinux2_28_aarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.13t"
|
||||
build_name: manywheel-py3_13t-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -661,6 +688,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: manylinux2_28_aarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.13t"
|
||||
build_name: manywheel-py3_13t-cpu-aarch64
|
||||
secrets:
|
||||
@ -681,6 +709,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.13t"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
@ -707,6 +736,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.13t"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_9
|
||||
secrets:
|
||||
|
||||
110  .github/workflows/generated-linux-binary-manywheel-main.yml  generated  vendored

@@ -42,7 +42,54 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_12-cuda12_8-build:
manywheel-py3_9-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: 12.6
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-test:  # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_6-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: 12.6
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

manywheel-py3_9-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type

@@ -56,17 +103,18 @@ jobs:
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.12"
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test:  # Testing
manywheel-py3_9-cuda12_8-test:  # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_12-cuda12_8-build
- manywheel-py3_9-cuda12_8-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:

@@ -79,8 +127,56 @@ jobs:
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_8
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

manywheel-py3_9-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-test:  # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner
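The `PYTORCH_EXTRA_INSTALL_REQUIREMENTS` values above are `|`-separated PEP 508 requirement strings whose environment markers pin each NVIDIA wheel to Linux x86_64. As a minimal sketch of how one such entry is parsed and gated (assuming the third-party `packaging` library; the requirement shown is a single entry taken from the cu126 list):

```python
from packaging.requirements import Requirement

# One entry from the |-separated cu126 list above.
req = Requirement(
    "nvidia-cuda-runtime-cu12==12.6.77; "
    "platform_system == 'Linux' and platform_machine == 'x86_64'"
)

# Marker.evaluate() checks the current interpreter's environment:
# True on a Linux x86_64 interpreter, False elsewhere, so the pin is skipped
# on platforms where the CUDA wheel does not apply.
print(req.name, str(req.specifier), req.marker.evaluate())
```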
1313  .github/workflows/generated-linux-binary-manywheel-nightly.yml  generated  vendored

File diff suppressed because it is too large
2  .github/workflows/generated-linux-binary-manywheel-rocm-main.yml  generated  vendored

@@ -58,6 +58,7 @@ jobs:
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-rocm6_4

@@ -82,6 +83,7 @@ jobs:
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
use_split_build: False
DESIRED_PYTHON: "3.9"
steps:
- name: Setup ROCm
15  .github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml  generated  vendored

@@ -60,6 +60,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"

@@ -83,6 +84,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
build_environment: linux-s390x-binary-manywheel

@@ -105,6 +107,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
secrets:

@@ -124,6 +127,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"

@@ -147,6 +151,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-s390x
build_environment: linux-s390x-binary-manywheel

@@ -169,6 +174,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-s390x
secrets:

@@ -188,6 +194,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"

@@ -211,6 +218,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-s390x
build_environment: linux-s390x-binary-manywheel

@@ -233,6 +241,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-s390x
secrets:

@@ -252,6 +261,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"

@@ -275,6 +285,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-s390x
build_environment: linux-s390x-binary-manywheel

@@ -297,6 +308,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-s390x
secrets:

@@ -316,6 +328,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"

@@ -339,6 +352,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-s390x
build_environment: linux-s390x-binary-manywheel

@@ -361,6 +375,7 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-s390x
secrets:
154  .github/workflows/inductor-perf-test-b200.yml  vendored

@@ -1,154 +0,0 @@
name: inductor-perf-b200

on:
schedule:
- cron: 0 7 * * 1-6
- cron: 0 7 * * 0
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
inputs:
training:
description: Run training (on by default)?
required: false
type: boolean
default: true
inference:
description: Run inference (on by default)?
required: false
type: boolean
default: true
default:
description: Run inductor_default?
required: false
type: boolean
default: false
dynamic:
description: Run inductor_dynamic_shapes?
required: false
type: boolean
default: false
cppwrapper:
description: Run inductor_cpp_wrapper?
required: false
type: boolean
default: false
cudagraphs:
description: Run inductor_cudagraphs?
required: false
type: boolean
default: true
freezing_cudagraphs:
description: Run inductor_cudagraphs with freezing for inference?
required: false
type: boolean
default: false
aotinductor:
description: Run aot_inductor for inference?
required: false
type: boolean
default: false
maxautotune:
description: Run inductor_max_autotune?
required: false
type: boolean
default: false
benchmark_configs:
description: The list of configs used the benchmark
required: false
type: string
default: inductor_huggingface_perf_cuda_b200,inductor_timm_perf_cuda_b200,inductor_torchbench_perf_cuda_b200

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions:
id-token: write
contents: read

jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf

build:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Use a bigger runner here because CUDA_ARCH 9.0 is only built for H100
# or newer GPUs, so it doesn't benefit much from existing compiler cache
# from trunk. Also use a memory-intensive runner here because memory is
# usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
{ config: "inductor_timm_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
{ config: "inductor_torchbench_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit

test-periodically:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

test-weekly:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 1440
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

test:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
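The deleted workflow above keys its `test-periodically` and `test-weekly` jobs off the literal cron strings, so `0 7 * * 1-6` (Monday through Saturday at 07:00 UTC) and `0 7 * * 0` (Sunday) partition the week between the two dashboard configurations. A small sketch of how those expressions expand, assuming the third-party `croniter` package is available:

```python
from datetime import datetime

from croniter import croniter  # third-party; assumed available

base = datetime(2025, 1, 1)  # a Wednesday
for expr in ("0 7 * * 1-6", "0 7 * * 0"):
    it = croniter(expr, base)
    # Show the next three fire times for each schedule.
    fires = [it.get_next(datetime).strftime("%a %H:%M") for _ in range(3)]
    print(expr, "->", fires)  # Mon-Sat runs vs. the single Sunday run
```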
@@ -85,7 +85,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
30  .github/workflows/inductor-periodic.yml  vendored

@@ -81,21 +81,21 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit
9  .github/workflows/nightly.yml  vendored

@@ -75,11 +75,10 @@ jobs:
repo-owner: pytorch
branch: main
pin-folder: .github/ci_commit_pins
# executorch jobs are disabled since it needs some manual work for the hash update
# - repo-name: executorch
# repo-owner: pytorch
# branch: main
# pin-folder: .ci/docker/ci_commit_pins
- repo-name: executorch
repo-owner: pytorch
branch: main
pin-folder: .ci/docker/ci_commit_pins
- repo-name: triton
repo-owner: triton-lang
branch: main
31  .github/workflows/periodic.yml  vendored

@@ -51,6 +51,37 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-jammy-cuda12_4-py3_10-gcc11-sm89-build:
name: linux-jammy-cuda12.4-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.4-py3.10-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11
cuda-arch-list: 8.9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit

linux-jammy-cuda12_4-py3_10-gcc11-sm89-test:
name: linux-jammy-cuda12.4-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_4-py3_10-gcc11-sm89-build
- target-determination
with:
build-environment: linux-jammy-cuda12.4-py3.10-gcc11-sm89
docker-image: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-cuda12_4-py3_10-gcc11-build:
name: linux-jammy-cuda12.4-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml
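The `shard`/`num_shards` pairs in these test matrices split a single config across identical runners. The exact partitioning lives in PyTorch's test infrastructure; purely as an illustrative sketch (a simple round-robin split, not the repository's actual algorithm):

```python
def select_shard(tests, shard, num_shards):
    """Round-robin split; `shard` is 1-indexed, matching the matrix entries."""
    return tests[shard - 1::num_shards]

tests = [f"test_{i}" for i in range(12)]
# Five shards, as in the "default" config above; their union covers every
# test exactly once, so the shards can run in parallel without overlap.
for shard in range(1, 6):
    print(shard, select_shard(tests, shard, 5))
```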
126  .github/workflows/pull.yml  vendored

@@ -254,6 +254,36 @@ jobs:
timeout-minutes: 600
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc11-build-distributed:
name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
]}
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc11-test-distributed:
name: linux-jammy-cuda12.8-py3.10-gcc11-test
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-build-distributed
- target-determination
with:
timeout-minutes: 360
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.test-matrix }}
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc11-build:
name: linux-jammy-cuda12.8-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml

@@ -262,18 +292,13 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '7.5 8.9'
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
secrets: inherit

@@ -304,6 +329,30 @@ jobs:
]}
secrets: inherit

linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit

linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-cpu-py3_10-gcc11-bazel-test:
name: linux-jammy-cpu-py3.10-gcc11-bazel-test
uses: ./.github/workflows/_bazel-build-test.yml

@@ -353,8 +402,38 @@ jobs:
]}
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc11-sm89-build:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: 8.9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc11-sm89-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm89-build
- target-determination
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-py3-clang12-executorch-build:
if: false # Docker build needs pin update
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type

@@ -379,6 +458,31 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-xpu-2025_1-py3_9-build:
name: linux-jammy-xpu-2025.1-py3.9
uses: ./.github/workflows/_linux-build.yml
4  .github/workflows/torchbench.yml  vendored

@@ -10,10 +10,6 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions:
id-token: write
contents: read

jobs:
get-default-label-prefix:
if: github.repository_owner == 'pytorch'
2  .github/workflows/trunk.yml  vendored

@@ -205,7 +205,7 @@ jobs:
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
test-matrix: |
{ include: [
{ config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
28  .github/workflows/unstable.yml  vendored

@@ -12,9 +12,7 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true

permissions:
id-token: write
contents: read
permissions: read-all

jobs:
# There must be at least one job here to satisfy GitHub action workflow syntax

@@ -53,27 +51,3 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit

linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit
2  .github/workflows/update-viablestrict.yml  vendored

@@ -23,7 +23,7 @@ jobs:
with:
repository: pytorch/pytorch
stable-branch: viable/strict
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\", \"linux-aarch64\"]'
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\"]'
secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }}
clickhouse-url: ${{ secrets.CLICKHOUSE_URL }}
clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }}
3  .gitignore  vendored

@@ -146,9 +146,6 @@ merge_record.json
torchgen/packaged/*
!torchgen/packaged/README.md

# This file is injected by ROCm build scripts to bootstrap in torch/__init__.py.
torch/_rocm_init.py

# IPython notebook checkpoints
.ipynb_checkpoints
@@ -1452,6 +1452,8 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'--no-black-binary',
'black==23.12.1',
'usort==1.0.8.post1',
'isort==6.0.1',
'ruff==0.12.2', # sync with RUFF
12  .pre-commit-config.yaml  Normal file

@@ -0,0 +1,12 @@
repos:
  - repo: local
    hooks:
      - id: lintrunner
        name: Run Lintrunner in an isolated venv before every push. The first run may be slow...
        entry: python scripts/run_lintrunner.py  # wrapper below
        language: python  # pre-commit manages venv for the wrapper
        additional_dependencies: []  # wrapper handles lintrunner install
        always_run: true
        stages: [pre-push]  # fire only on pre-push
        pass_filenames: false  # Lintrunner gets no per-file args
        verbose: true  # stream output as it is produced...allegedly anyways
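The hook's `entry` points at `scripts/run_lintrunner.py`, which is not shown in this diff. Purely as a hypothetical sketch of what such a wrapper might do (install `lintrunner` into the hook's venv on first use, then delegate; none of these details are confirmed by the diff itself):

```python
#!/usr/bin/env python3
"""Hypothetical sketch of a pre-push wrapper around lintrunner."""
import shutil
import subprocess
import sys


def main() -> int:
    # On the first run in a fresh pre-commit venv, lintrunner is absent.
    if shutil.which("lintrunner") is None:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "lintrunner"])
        subprocess.check_call(["lintrunner", "init"])  # fetch linter toolchain
    # pass_filenames is false in the hook above, so forward no per-file args.
    return subprocess.call(["lintrunner"])


if __name__ == "__main__":
    sys.exit(main())
```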
16  AGENTS.md

@@ -1,17 +1 @@
- This is the only AGENTS.md, there are no recursive AGENTS.md
- When you are working on a bug, first create a standalone file that
reproduces the bug and verify it fails in the expected way. Use this to
test if your changes work. Once the change is passing, find an appropriate
test file to add the test to and make sure to follow local conventions on
the test file.
- If you are running the real test suite, DO NOT run the entire test suite.
Instead run only a single test case, e.g., 'python test/test_torch.py TestTorch.test_dir'
- Do NOT run setup.py, you do not have a working build environment
- Do NOT run pre-commit, it is not setup
- To run lint, run 'lintrunner -a' (which will autoapply changes)
- Do NOT attempt to install dependencies, you do not have Internet access
- When you are ready to make a PR, do exactly these steps:
- git stash -u
- git reset --hard $(cat /tmp/orig_work.txt) # NB: reset to the LOCAL branch, do NOT fetch
- git stash pop
- Resolve conflicts if necessary
@@ -239,9 +239,7 @@ option(USE_XPU "Use XPU" ON)
cmake_dependent_option(
BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON
"USE_CUDA AND LINUX AND BUILD_PYTHON" OFF)
cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX OR WIN32" OFF)
cmake_dependent_option(USE_ROCM_CK_GEMM "Use ROCm Composable Kernel for GEMMs" ON "USE_ROCM;NOT WIN32" OFF)
option(USE_ROCM_CK_SDPA "Use ROCm Composable Kernel for SDPA" OFF)
cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF)
option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF)
cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF)
cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF

@@ -253,6 +251,7 @@ cmake_dependent_option(USE_CUFILE "Use cuFile" ON "USE_CUDA AND NOT WIN32" OFF)
option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
option(USE_KINETO "Use Kineto profiling library" ON)
option(USE_CUPTI_SO "Use CUPTI as a shared library" ON)
option(USE_FAKELOWP "Use FakeLowp operators" OFF)
option(USE_GFLAGS "Use GFLAGS" OFF)
option(USE_GLOG "Use GLOG" OFF)
option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF)

@@ -261,13 +260,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
"USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL"
OFF)

@@ -325,6 +322,7 @@ set(MKLDNN_ENABLE_CONCURRENT_EXEC ${USE_MKLDNN})
cmake_dependent_option(USE_MKLDNN_CBLAS "Use CBLAS in MKLDNN" OFF "USE_MKLDNN"
OFF)
option(USE_STATIC_MKL "Prefer to link with MKL statically (Unix only)" OFF)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(
USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON
"USE_DISTRIBUTED" OFF)

@@ -836,11 +834,10 @@ include(ExternalProject)

# ---[ Dependencies ---[ FBGEMM doesn't work on x86 32bit and
# CMAKE_SYSTEM_PROCESSOR thinks its 64bit
if(USE_FBGEMM AND NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
message(WARNING
"x64 operating system is required for FBGEMM. "
"Not compiling with FBGEMM. "
"Turn this warning off by USE_FBGEMM=OFF.")
if(USE_FBGEMM
AND((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND CMAKE_SIZEOF_VOID_P EQUAL
4)
OR CMAKE_SYSTEM_PROCESSOR STREQUAL "x86"))
set(USE_FBGEMM OFF)
endif()
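Several of the changed lines above hinge on `cmake_dependent_option(<opt> <doc> <default> <depends> <force>)`: the option keeps its user-visible default only while every condition in the depends list holds, and is forced to the fallback value otherwise, which is how `USE_RCCL` gains the `NOT WIN32` guard. A Python sketch of that rule (an illustration of the semantics, not CMake's implementation):

```python
def dependent_option(default, depends, force, env):
    """Visible with `default` if every dependency holds; otherwise forced."""
    return default if all(env.get(cond, False) for cond in depends) else force

env = {"USE_NCCL": True, "NOT WIN32": True}
print(dependent_option(True, ["USE_NCCL", "NOT WIN32"], False, env))  # True

env["NOT WIN32"] = False  # e.g. a Windows configure run
print(dependent_option(True, ["USE_NCCL", "NOT WIN32"], False, env))  # False
```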
@@ -14,6 +14,7 @@
/torch/csrc/autograd/ @albanD @soulitzer
/torch/autograd/ @albanD @soulitzer
/tools/autograd/ @albanD @soulitzer
/torch/header_only_apis.txt @janeyx99
/torch/nn/ @albanD @jbschlosser @mikaylagawarecki
/torch/optim/ @albanD @janeyx99
/test/test_public_bindings.py @albanD

@@ -164,7 +165,6 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd
# torch.export
/torch/export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi
/torch/_export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi
/torch/_export/serde/schema.py @SherlockNoMad @zhxchen17

# Dynamic Shapes
/torch/fx/experimental/symbolic_shapes.py @bobrenjc93 @laithsakka

@@ -196,8 +196,3 @@ torch/backends/cudnn/ @eqy @syed-ahmed
/torch/utils/_cxx_pytree.py @XuehaiPan
/torch/utils/pytree/ @XuehaiPan
/torch/_dynamo/polyfills/pytree.py @XuehaiPan

# Relating to libtorch ABI
/torch/csrc/stable/ @janeyx99 @mikaylagawarecki
/torch/headeronly/ @janeyx99
/torch/header_only_apis.txt @janeyx99
@@ -243,7 +243,7 @@ git submodule update --init --recursive

```bash
conda install cmake ninja
# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section above
# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section below
pip install -r requirements.txt
```

@@ -276,7 +276,7 @@ conda install pkg-config libuv
pip install mkl-static mkl-include
# Add these packages if torch.distributed is needed.
# Distributed package support on Windows is a prototype feature and is subject to changes.
conda install -c conda-forge libuv
conda install -c conda-forge libuv=1.39
```

#### Install PyTorch

@@ -560,7 +560,7 @@ To learn more about making a contribution to Pytorch, please see our [Contributi

PyTorch is a community-driven project with several skillful engineers and researchers contributing to it.

PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), [Alban Desmaison](https://github.com/albanD), [Piotr Bialecki](https://github.com/ptrblck) and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means.
PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means.
A non-exhaustive but growing list needs to mention: [Trevor Killeen](https://github.com/killeent), [Sasank Chilamkurthy](https://github.com/chsasank), [Sergey Zagoruyko](https://github.com/szagoruyko), [Adam Lerer](https://github.com/adamlerer), [Francisco Massa](https://github.com/fmassa), [Alykhan Tejani](https://github.com/alykhantejani), [Luca Antiga](https://github.com/lantiga), [Alban Desmaison](https://github.com/albanD), [Andreas Koepf](https://github.com/andreaskoepf), [James Bradbury](https://github.com/jekbradbury), [Zeming Lin](https://github.com/ebetica), [Yuandong Tian](https://github.com/yuandong-tian), [Guillaume Lample](https://github.com/glample), [Marat Dukhan](https://github.com/Maratyszcza), [Natalia Gimelshein](https://github.com/ngimel), [Christian Sarofeen](https://github.com/csarofeen), [Martin Raison](https://github.com/martinraison), [Edward Yang](https://github.com/ezyang), [Zachary Devito](https://github.com/zdevito). <!-- codespell:ignore -->

Note: This project is unrelated to [hughperkins/pytorch](https://github.com/hughperkins/pytorch) with the same name. Hugh is a valuable contributor to the Torch community and has helped with many things Torch and PyTorch.
@@ -119,8 +119,6 @@ file(GLOB_RECURSE native_mps_cpp "native/mps/*.cpp")
file(GLOB_RECURSE native_mps_mm "native/mps/*.mm")
file(GLOB_RECURSE native_mps_metal "native/mps/*.metal")
file(GLOB_RECURSE native_mps_h "native/mps/*.h")
file(GLOB_RECURSE native_sparse_mps_mm "native/sparse/mps/*.mm")
file(GLOB_RECURSE native_mps_sparse_metal "native/sparse/mps/*.metal")

file(GLOB native_sparse_cpp "native/sparse/*.cpp")
file(GLOB native_quantized_cpp
@@ -180,27 +178,26 @@ file(GLOB native_flash_attn_api_cpp "native/transformers/cuda/flash_attn/flash_a
file(GLOB flash_attention_hip_hip "native/transformers/hip/flash_attn/*.hip")
# if USE_FLASH_ATTENTION is set, ensure CK instances get generated
if(USE_FLASH_ATTENTION)
if("$ENV{USE_CK_FLASH_ATTENTION}" STREQUAL "1")
message(STATUS "USE_CK_FLASH_ATTENTION is being deprecated. Please use USE_ROCM_CK_SDPA instead")
caffe2_update_option(USE_ROCM_CK_SDPA ON)
endif()
if(USE_ROCM_CK_SDPA)
if(DEFINED ENV{PYTORCH_ROCM_ARCH})
list(LENGTH PYTORCH_ROCM_ARCH NUM_ARCHS)
if(NUM_ARCHS GREATER 1)
message(WARNING "Building CK for multiple archs can increase build time considerably!
Consider setting PYTORCH_ROCM_ARCH env var value as the gfx arch you need to build for")
if(DEFINED ENV{USE_CK_FLASH_ATTENTION})
set(USE_CK_FLASH_ATTENTION $ENV{USE_CK_FLASH_ATTENTION})
if(USE_CK_FLASH_ATTENTION STREQUAL "1")
if(DEFINED ENV{PYTORCH_ROCM_ARCH})
list(LENGTH PYTORCH_ROCM_ARCH NUM_ARCHS)
if(NUM_ARCHS GREATER 1)
message(WARNING "Building CK for multiple archs can increase build time considerably!
Consider setting PYTORCH_ROCM_ARCH env var value as the gfx arch you need to build for")
endif()
endif()
message(STATUS "USE_CK_FLASH_ATTENTION is set; building PyTorch with CK Flash Attention enabled")
message(STATUS "Generating CK kernel instances...")
add_subdirectory(native/transformers/hip/flash_attn/ck)
file(GLOB flash_attention_hip_ck_hip "native/transformers/hip/flash_attn/ck/*.hip")
list(APPEND native_transformers_hip_hip ${flash_attention_hip_ck_hip})
# FAv3 Generation
add_subdirectory(native/transformers/hip/flash_attn/ck/fav_v3)
file(GLOB flash_attention_v3_hip "native/transformers/hip/flash_attn/ck/fav_v3/*.hip")
list(APPEND native_transformers_hip_hip ${flash_attention_v3_hip})
endif()
endif()
message(STATUS "USE_ROCM_CK_SDPA is set; building PyTorch with CK SDPA enabled")
message(STATUS "Generating CK kernel instances...")
add_subdirectory(native/transformers/hip/flash_attn/ck)
file(GLOB flash_attention_hip_ck_hip "native/transformers/hip/flash_attn/ck/*.hip")
list(APPEND native_transformers_hip_hip ${flash_attention_hip_ck_hip})
# FAv3 Generation
add_subdirectory(native/transformers/hip/flash_attn/ck/fav_v3)
file(GLOB flash_attention_v3_hip "native/transformers/hip/flash_attn/ck/fav_v3/*.hip")
list(APPEND native_transformers_hip_hip ${flash_attention_v3_hip})
endif()
file(GLOB flash_attention_hip_aot_hip "native/transformers/hip/flash_attn/aot/*.hip")
file(GLOB flash_attention_src_hip_hip "native/transformers/hip/flash_attn/src/*.hip")
@@ -419,42 +416,39 @@ if(USE_CUDA)
endif()

if(USE_ROCM)
if((USE_FLASH_ATTENTION AND USE_ROCM_CK_SDPA) OR USE_ROCM_CK_GEMM)
# NOTE: The PyTorch build does not actually add_subdirectory
# third_party/composable_kernel or use it as a CMake library. What is used
# is header only, so this should be ok, except that the CMake build generates
# a ck/config.h. We just do that part here. Without this, the ck.h from the
# ROCM SDK may get accidentally used instead.
function(_pytorch_rocm_generate_ck_conf)
set(CK_ENABLE_INT8 "ON")
set(CK_ENABLE_FP16 "ON")
set(CK_ENABLE_FP32 "ON")
set(CK_ENABLE_FP64 "ON")
set(CK_ENABLE_BF16 "ON")
set(CK_ENABLE_FP8 "ON")
set(CK_ENABLE_BF8 "ON")
set(CK_USE_XDL "ON")
set(CK_USE_WMMA "ON")
configure_file(
"${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in"
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h"
)
endfunction()
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/example/ck_tile/01_fmha)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include)
_pytorch_rocm_generate_ck_conf()
endif()
# NOTE: The PyTorch build does not actually add_subdirectory
# third_party/composable_kernel or use it as a CMake library. What is used
# is header only, so this should be ok, except that the CMake build generates
# a ck/config.h. We just do that part here. Without this, the ck.h from the
# ROCM SDK may get accidentally used instead.
function(_pytorch_rocm_generate_ck_conf)
set(CK_ENABLE_INT8 "ON")
set(CK_ENABLE_FP16 "ON")
set(CK_ENABLE_FP32 "ON")
set(CK_ENABLE_FP64 "ON")
set(CK_ENABLE_BF16 "ON")
set(CK_ENABLE_FP8 "ON")
set(CK_ENABLE_BF8 "ON")
set(CK_USE_XDL "ON")
set(CK_USE_WMMA "ON")
configure_file(
"${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in"
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h"
)
endfunction()
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include)
_pytorch_rocm_generate_ck_conf()

# Next two lines are needed because TunableOp uses third-party/fmt
list(APPEND ATen_HIP_INCLUDE $<TARGET_PROPERTY:fmt::fmt-header-only,INTERFACE_INCLUDE_DIRECTORIES>)
list(APPEND ATen_HIP_DEPENDENCY_LIBS fmt::fmt-header-only)
if(USE_FLASH_ATTENTION AND USE_ROCM_CK_SDPA)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/transformers/hip/flash_attn/ck)
endif()
if(USE_FLASH_ATTENTION)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/transformers/hip/flash_attn/ck)
endif()
list(APPEND ATen_HIP_SRCS
${ATen_HIP_SRCS}
${hip_hip}
@@ -464,17 +458,12 @@ if(USE_ROCM)
${native_quantized_hip_hip}
${native_transformers_hip_hip} ${native_transformers_src_hip_hip}
)
if(NOT USE_ROCM_CK_GEMM)
if(WIN32) # Windows doesn't support Composable Kernels
file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip")
file(GLOB native_hip_ck "native/hip/ck*.hip")
exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}"
${native_hip_bgemm} ${native_hip_ck})
endif()
if(WIN32) # Windows doesn't support Composable Kernels and Triton
exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}"
${native_transformers_hip_hip} ${native_transformers_hip_cpp})
endif()

# TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources)
list(APPEND all_hip_cpp
${native_nested_hip_cpp}
@@ -709,25 +698,29 @@ endif()
if(USE_MPS)
include(../../../cmake/Metal.cmake)

set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h} ${native_sparse_mps_mm})
set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h})

if(CAN_COMPILE_METAL)
foreach(SHADER ${native_mps_metal} ${native_mps_sparse_metal})
foreach(SHADER ${native_mps_metal})
cmake_path(GET SHADER STEM TGT_STEM)
string(CONCAT TGT_BASIC ${TGT_STEM} "_31.air")
string(CONCAT TGT_BASIC ${TGT_STEM} "_30.air")
string(CONCAT TGT_BFLOAT ${TGT_STEM} "_31.air")
list(APPEND AIR_BASIC ${TGT_BASIC})
metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.1")
list(APPEND AIR_BFLOAT ${TGT_BFLOAT})
metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.0")
metal_to_air(${SHADER} ${TGT_BFLOAT} "-std=metal3.1")
endforeach()
air_to_metallib(kernels_basic.metallib ${AIR_BASIC})
air_to_metallib(kernels_bfloat.metallib ${AIR_BFLOAT})
add_custom_command(
COMMAND echo "// $$(date)" > metallib_dummy.cpp
DEPENDS kernels_basic.metallib
DEPENDS kernels_basic.metallib kernels_bfloat.metallib
OUTPUT metallib_dummy.cpp
COMMENT "Updating metallibs timestamp")
add_custom_target(metallibs DEPENDS kernels_basic.metallib metallib_dummy.cpp)
add_custom_target(metallibs DEPENDS kernels_basic.metallib kernels_bfloat.metallib metallib_dummy.cpp)
else()
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/native/mps")
foreach(SHADER ${native_mps_metal} ${native_mps_sparse_metal})
foreach(SHADER ${native_mps_metal})
cmake_path(GET SHADER STEM TGT_STEM)
string(CONCAT SHADER_HDR_NAME "${CMAKE_CURRENT_BINARY_DIR}" /native/mps/ ${TGT_STEM} "_metallib.h")
metal_to_metallib_h(${SHADER} ${SHADER_HDR_NAME})

@@ -480,9 +480,6 @@ at::BlasBackend Context::blasPreferredBackend() {
// call site for blasPreferredBackend(), we set it to an actual value.
if (blas_preferred_backend == at::BlasBackend::Default) {
blas_preferred_backend = at::BlasBackend::Cublas;
// This logic sits in the getter because it needs to validate
// values set via env vars such as TORCH_BLAS_PREFER_CUBLASLT
// which initialize the backend without calling the setter
#ifdef USE_ROCM
// AMD Instinct targets prefer hipblaslt
static const bool hipblaslt_preferred = []() {
@@ -512,10 +509,6 @@ at::BlasBackend Context::blasPreferredBackend() {
// hipblaslt support for all archs is not as complete as hipblas
if (blas_preferred_backend == at::BlasBackend::Cublaslt) {
static const bool hipblaslt_unsupported = []() {
if(!hasCuBLASLt())
{
return true;
}
static const std::vector<std::string> archs = {
"gfx90a", "gfx942",
#if ROCM_VERSION >= 60300
@@ -541,24 +534,6 @@ at::BlasBackend Context::blasPreferredBackend() {
return blas_preferred_backend;
}

bool Context::ckSupported() {
#ifdef USE_ROCM
static const std::vector<std::string> supported_archs = {
"gfx90a", "gfx942", "gfx950"
};
for (auto index : c10::irange(detail::getCUDAHooks().deviceCount())) {
if(!detail::getCUDAHooks().isGPUArch(supported_archs, index)) {
TORCH_WARN_ONCE(
"Attempting to use CK on an unsupported architecture! Cannot set backend to CK");
return false;
}
}
return true;
#else
return false;
#endif
}

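The comments in blasPreferredBackend() above describe a pattern worth calling out: environment variables such as TORCH_BLAS_PREFER_CUBLASLT can seed the preferred backend without ever passing through the setter, so the getter is the one chokepoint where every value can be validated and, if unsupported, demoted. A minimal standalone sketch of that pattern (hypothetical Backend enum and isSupported() probe, not the real PyTorch types):

```cpp
#include <cstdlib>

enum class Backend { Default, Fast, Fallback };

// Hypothetical capability probe standing in for the real arch/build checks.
static bool isSupported(Backend b) { return b != Backend::Fast; }

Backend& rawBackend() {
  // May be seeded from an env var, bypassing any setter-side validation.
  static Backend backend =
      std::getenv("PREFER_FAST") != nullptr ? Backend::Fast : Backend::Default;
  return backend;
}

Backend preferredBackend() {
  // Validate lazily, exactly once, at the only common read path.
  static const bool validated = [] {
    if (!isSupported(rawBackend())) {
      rawBackend() = Backend::Fallback;  // demote instead of failing hard
    }
    return true;
  }();
  (void)validated;
  return rawBackend();
}
```

The diff applies the same shape to both the BLAS backend and, below, the ROCm flash-attention backend, warning once and falling back rather than erroring.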
void Context::setBlasPreferredBackend(at::BlasBackend b) {
#ifdef _MSC_VER
TORCH_WARN_ONCE(
@@ -568,14 +543,8 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) {
#else
TORCH_CHECK((b != at::BlasBackend::Cublaslt) || hasCuBLASLt(),
"Cannot set preferred backend to cuBLASLt if PyTorch has not been compiled with cuBLASLt.");
#ifdef USE_ROCM
static const bool ckSupportedFlag = ckSupported();
static const bool hasCKGEMMFlag = hasCKGEMM();
TORCH_CHECK((b != at::BlasBackend::Ck) || (ckSupportedFlag && hasCKGEMMFlag),
"Cannot set preferred blas backend to CK since following conditions are not true: ",
"architecture supported for CK: ", ckSupportedFlag,
", PyTorch built with CK GEMM support: ", hasCKGEMMFlag);
#endif
TORCH_CHECK((b != at::BlasBackend::Ck) || hasROCM(),
"Cannot set preferred backend to Ck if PyTorch has not been compiled for ROCm.");
if (b != at::BlasBackend::Default && b != at::BlasBackend::Cublas) {
TORCH_WARN_ONCE(
"torch.backends.cuda.preferred_blas_library is an experimental feature. "
@@ -587,40 +556,35 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) {
#endif
}

at::ROCmFABackend Context::getROCmFAPreferredBackend() {
#ifdef USE_ROCM
// Set potential "Default" value so we don't have to interpret at call sites.
// We use aotriton backend as the default, for now.
if(rocm_fa_preferred_backend == at::ROCmFABackend::Default) {
rocm_fa_preferred_backend = at::ROCmFABackend::AOTriton;
} else if (rocm_fa_preferred_backend == at::ROCmFABackend::Ck) {
// This logic sits in the getter because it needs to validate
// values set via env vars such as TORCH_ROCM_FA_PREFER_CK
// which initialize the backend without calling the setter
// Perform validity checking
static const bool hasCKSDPAFlag = hasCKSDPA();
static const bool ckSupportedFlag = ckSupported();
if(!(hasCKSDPAFlag && ckSupportedFlag)){
TORCH_WARN_ONCE(
"Cannot set preferred SDPA backend to CK since following conditions are not true: ",
"architecture supported for CK: ", ckSupportedFlag,
", PyTorch built with CK SDPA support: ", hasCKSDPAFlag);
rocm_fa_preferred_backend = at::ROCmFABackend::AOTriton;
}
}
#endif

at::ROCmFABackend Context::getROCmFAPreferredBackend() const {
return rocm_fa_preferred_backend;
}

void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {

// TODO: add plumbing for hasCK for validity checking
TORCH_CHECK((b != at::ROCmFABackend::Ck) || hasROCM(),
"Cannot set preferred flash attention backend to Ck if PyTorch has not been compiled for ROCm.");
#ifdef USE_ROCM
static const bool hasCKSDPAFlag = hasCKSDPA();
static const bool ckSupportedFlag = ckSupported();
TORCH_CHECK((b != at::ROCmFABackend::Ck) || (hasCKSDPAFlag && ckSupportedFlag),
"Cannot set preferred SDPA backend to CK since following conditions are not true: ",
"architecture supported for CK: ", ckSupportedFlag,
", PyTorch built with CK SDPA support: ", hasCKSDPAFlag);
if(b == at::ROCmFABackend::Ck) {
static const bool ck_unsupported = []() {
static const std::vector<std::string> archs = {
"gfx90a", "gfx942"
};
for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
if (!detail::getCUDAHooks().isGPUArch(archs, index)) {
TORCH_WARN_ONCE(
"Attempting to use CK on an unsupported architecture! Cannot set backend to CK");
return true;
}
}
return false;
}();
if(!ck_unsupported) rocm_fa_preferred_backend = b;
}
else {
rocm_fa_preferred_backend = b;
}
#endif
rocm_fa_preferred_backend = b;
}

@@ -132,7 +132,6 @@ class TORCH_API Context {
static bool hasKleidiAI();
static bool hasLAPACK();
static bool hasMKLDNN();
static bool ckSupported();
static bool hasMAGMA() {
return detail::getCUDAHooks().hasMAGMA();
}
@@ -163,12 +162,6 @@ class TORCH_API Context {
static bool hasROCM() {
return detail::getCUDAHooks().hasROCM();
}
static bool hasCKSDPA() {
return detail::getCUDAHooks().hasCKSDPA();
}
static bool hasCKGEMM() {
return detail::getCUDAHooks().hasCKGEMM();
}
static bool hasHIP() {
return detail::getHIPHooks().hasHIP();
}
@@ -259,7 +252,7 @@ class TORCH_API Context {
at::BlasBackend blasPreferredBackend();
void setBlasPreferredBackend(at::BlasBackend);

at::ROCmFABackend getROCmFAPreferredBackend();
at::ROCmFABackend getROCmFAPreferredBackend() const;
void setROCmFAPreferredBackend(at::ROCmFABackend);

// Note [Enabling Deterministic Operations]

@@ -1,6 +1,5 @@
#pragma once

#include <c10/core/CachingDeviceAllocator.h>
#include <c10/core/DeviceType.h>
#include <c10/macros/Macros.h>

@@ -73,27 +72,6 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index);
// original device index that was active before the change.
TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index);

TORCH_API inline void emptyCache() {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->emptyCache();
}

TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats(
c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
return at::getDeviceAllocator(device_type)->getDeviceStats(device_index);
}

TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index);
}

TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
}

} // namespace at::accelerator

namespace at {

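These inline helpers route through the device-agnostic caching allocator for whatever accelerator is active, so the same call works on CUDA, MPS, or XPU builds. A hedged usage sketch (the header path is assumed from the hunk context, and a PyTorch build with at least one accelerator is required):

```cpp
#include <ATen/DeviceAccelerator.h>  // assumed location of the helpers shown above
#include <iostream>

void trim_and_report(c10::DeviceIndex idx) {
  // Backend-agnostic allocator statistics for one device; DeviceStats is
  // the c10::CachingDeviceAllocator type used in the declarations above.
  const auto stats = at::accelerator::getDeviceStats(idx);
  std::cout << "current allocated bytes: "
            << stats.allocated_bytes[0].current << '\n';
  // Return cached blocks to the driver and restart peak tracking.
  at::accelerator::emptyCache();
  at::accelerator::resetPeakStats(idx);
}
```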
@@ -31,9 +31,7 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) {
return at::globalContext().getPinnedMemoryAllocator(opt_device_type);
} else {
TORCH_CHECK(
false,
"pin_memory=True requires a CUDA or other accelerator backend; "
"no pinned memory allocator is available on this system.")
false, "Need to provide pin_memory allocator to use pin memory.")
}
}

@@ -10,6 +10,10 @@
#include <ideep.hpp>
#endif

#if !defined(__s390x__) && !defined(__powerpc__)
#include <cpuinfo.h>
#endif

#include <caffe2/core/common.h>

#include <ATen/native/DispatchStub.h>
@@ -103,7 +107,9 @@ std::string get_cpu_capability() {
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
case native::CPUCapability::ZVECTOR:
return "Z VECTOR";
#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
case native::CPUCapability::SVE:
return "SVE";
case native::CPUCapability::SVE256:
return "SVE256";
#else
@@ -118,6 +124,12 @@ std::string get_cpu_capability() {
return "";
}

int get_sve_len() {
// It is possible that we override the cpu_capability with
// environment variable
return cpuinfo_get_max_arm_sve_length();
}

static std::string used_cpu_capability() {
// It is possible that we override the cpu_capability with
// environment variable

@@ -15,4 +15,6 @@ TORCH_API std::string get_cxx_flags();

TORCH_API std::string get_cpu_capability();

TORCH_API int get_sve_len();

} // namespace at

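get_sve_len() exposes the hardware SVE vector length at runtime. SVE is vector-length agnostic: the same binary can run with 128- to 2048-bit vectors, so lane counts must be queried rather than hard-coded, which is also why the Vectorized&lt;float&gt; changes further down switch size() to svcntw(). A hedged sketch of the same query on the intrinsics side (requires an SVE-enabled AArch64 toolchain):

```cpp
#include <arm_sve.h>  // SVE ACLE intrinsics
#include <cstdio>

int main() {
  // svcntb(): vector length in bytes; svcntw(): number of 32-bit lanes.
  // On a 256-bit SVE implementation this prints 32 bytes / 8 lanes.
  std::printf("vector bytes: %llu, float lanes: %llu\n",
              static_cast<unsigned long long>(svcntb()),
              static_cast<unsigned long long>(svcntw()));
  return 0;
}
```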
@@ -239,7 +239,6 @@ TORCH_LIBRARY_IMPL(aten, AutocastMPS, m) {
KERNEL_MPS(scaled_dot_product_attention, lower_precision_fp)

// fp32
KERNEL_MPS(conv_transpose3d, input, fp32)
KERNEL_MPS(acos, fp32)
KERNEL_MPS(asin, fp32)
KERNEL_MPS(cosh, fp32)

@@ -97,8 +97,6 @@ c10::TypePtr IValue::TagType<c10::Type>::get(const IValue& v) {
return ComplexType::get();
case Tag::Int:
return IntType::get();
case Tag::UInt:
return IntType::get();
case Tag::SymInt:
return c10::SymIntType::get();
case Tag::SymFloat:
@@ -322,8 +320,6 @@ IValue IValue::equals(const IValue& rhs) const {
return rhs.isComplexDouble() && lhs.toComplexDouble() == rhs.toComplexDouble();
case Tag::Int:
return rhs.isInt() && lhs.toInt() == rhs.toInt();
case Tag::UInt:
return rhs.isUnsigned() && lhs.toUInt() == rhs.toUInt();
case Tag::SymInt:
return rhs.isSymInt() && lhs.toSymInt() == rhs.toSymInt();
case Tag::SymFloat:
@@ -383,8 +379,6 @@ size_t IValue::hash(const IValue& v) {
case Tag::Int:
return c10::get_hash(v.payload.u.as_int);
// NB: these are technically strict aliasing violations
case Tag::UInt:
return c10::get_hash(v.payload.u.as_int);
case Tag::SymInt:
return c10::get_hash(v.payload.u.as_int);
case Tag::SymFloat:
@@ -812,8 +806,6 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
return printComplex(out, v);
} case IValue::Tag::Int:
return out << v.toInt();
case IValue::Tag::UInt:
return out << v.toUInt();
case IValue::Tag::SymInt:
return out << v.toSymInt();
case IValue::Tag::SymFloat:

@@ -12,7 +12,6 @@
#include <c10/macros/Export.h>
#include <c10/util/MaybeOwned.h>
#include <c10/util/intrusive_ptr.h>
#include <limits>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
@@ -161,7 +160,6 @@ struct Capsule {
_(Double) \
_(ComplexDouble) \
_(Int) \
_(UInt) \
_(SymInt) \
_(SymFloat) \
_(SymBool) \
@@ -655,29 +653,6 @@ struct TORCH_API IValue final {
}
}

// Unsigned
IValue(uint64_t u) : tag( u <= std::numeric_limits<int64_t>::max() ? Tag::Int : Tag::UInt) {
payload.u.as_uint = u;
}


// See Note [Meaning of HAS_u]
// IValue type model closely follows that of c10::Scalar
// Where all integers are upcast to 64-bit representation, and `as_int` is used as default
// representation unless value could not be represented as signed int
bool isUnsigned() const {
return Tag::UInt == tag || (Tag::Int == tag && payload.u.as_int >= 0);
}

uint64_t toUInt() const {
if (isUnsigned()) {
return payload.u.as_uint;
} else {
TORCH_INTERNAL_ASSERT(0, "expected unsigned int");
}
}


// Bool
IValue(bool b) : tag(Tag::Bool) {
#if defined(__clang__) && defined(__x86_64__)
@@ -918,14 +893,8 @@ struct TORCH_API IValue final {
} else {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
s.isIntegral(false), "Unknown type in Scalar");
if (s.isUnsigned()) {
const auto val = s.toUInt64();
payload.u.as_uint = val;
tag = val <= std::numeric_limits<int64_t>::max() ? Tag::Int : Tag::UInt;
} else {
payload.u.as_int = s.toLong();
tag = Tag::Int;
}
tag = Tag::Int;
payload.u.as_int = s.toLong();
}
}

@@ -949,8 +918,6 @@ struct TORCH_API IValue final {
return toSymFloat();
else if (isSymBool())
return toSymBool();
else if (isUnsigned())
return toUInt();
TORCH_CHECK(false, "IValue is not a Scalar");
}

@@ -1280,8 +1247,6 @@ struct TORCH_API IValue final {
return true;
case Tag::Int:
return false;
case Tag::UInt:
return false;
case Tag::SymInt:
return true;
case Tag::SymFloat:
@@ -1378,8 +1343,6 @@ struct TORCH_API IValue final {
union TriviallyCopyablePayload {
TriviallyCopyablePayload() : as_int(0) {}
int64_t as_int;
// See Note [Meaning of HAS_u]
uint64_t as_uint;
double as_double;
bool as_bool;
// Invariant: never nullptr; null state is represented as

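The uint64_t constructor in this hunk encodes the IValue tagging rule: any unsigned value that still fits in int64_t is stored under Tag::Int, and Tag::UInt marks only values above INT64_MAX, which is why isUnsigned() also accepts a non-negative Int payload. A minimal standalone model of the same rule (hypothetical MiniValue type, not the real IValue):

```cpp
#include <cassert>
#include <cstdint>
#include <limits>

// Miniature model of the IValue integer tagging scheme shown above.
struct MiniValue {
  enum class Tag { Int, UInt } tag;
  // Like the real payload, reading the inactive member is technically a
  // strict-aliasing violation; the source acknowledges the same caveat.
  union { int64_t as_int; uint64_t as_uint; } payload;

  explicit MiniValue(uint64_t u)
      // Values representable as signed stay Tag::Int; only the upper half
      // of the uint64_t range needs the UInt tag.
      : tag(u <= static_cast<uint64_t>(std::numeric_limits<int64_t>::max())
                ? Tag::Int : Tag::UInt) {
    payload.as_uint = u;
  }

  bool isUnsigned() const {
    return tag == Tag::UInt || (tag == Tag::Int && payload.as_int >= 0);
  }
};

int main() {
  assert(MiniValue(42).tag == MiniValue::Tag::Int);            // fits in int64_t
  assert(MiniValue(~uint64_t{0}).tag == MiniValue::Tag::UInt); // > INT64_MAX
  assert(MiniValue(42).isUnsigned());  // non-negative Int still reads as unsigned
  return 0;
}
```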
@@ -34,9 +34,9 @@ inline scalar_t vec_reduce_all(
scalar_t acc_arr[Vec::size()];
acc_vec.store(acc_arr);
for (const auto i : c10::irange(1, size)) {
std::array<scalar_t, Vec::size()> acc_arr_next = {0};
scalar_t acc_arr_next[Vec::size()] = {0};
acc_arr_next[0] = acc_arr[i];
Vec acc_vec_next = Vec::loadu(acc_arr_next.data());
Vec acc_vec_next = Vec::loadu(acc_arr_next);
acc_vec = vec_fun(acc_vec, acc_vec_next);
}
acc_vec.store(acc_arr);
@@ -102,8 +102,7 @@ struct VecReduceAllSIMD<float, Op> {
#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) &&
// !defined(C10_MOBILE)

#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
!defined(CPU_CAPABILITY_SVE)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
template <typename Op>
struct VecReduceAllSIMD<float, Op> {
static inline float apply(
@@ -143,8 +142,7 @@ struct VecReduceAllSIMD<float, std::plus<Vectorized<float>>> {
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
// && !defined(CPU_CAPABILITY_SVE)

#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
defined(CPU_CAPABILITY_SVE256)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE))
template <typename Op>
struct VecReduceAllSIMD<float, Op> {
static inline float apply(

@@ -4,7 +4,7 @@

#include <ATen/cpu/vec/vec_base.h>

#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)

// Define the data type of VLS(vector-length specific).
typedef svbool_t vls_pred_t
@@ -77,4 +77,4 @@ typedef svfloat64_t vls_float64_t
#define ALL_F64_TRUE_MASK svreinterpret_f64_s64(ALL_S64_TRUE_MASK)
#define ALL_F64_FALSE_MASK svreinterpret_f64_s64(ALL_S64_FALSE_MASK)

#endif // defined(CPU_CAPABILITY_SVE)
#endif // defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)

@@ -19,7 +19,7 @@ namespace vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
#if (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)) && defined(__ARM_FEATURE_BF16)

template <>
struct is_vec_specialized_for<BFloat16> : std::bool_constant<true> {};
@@ -230,8 +230,6 @@ __attribute__((optimize("no-tree-vectorize")))
#endif
inline std::tuple<Vectorized<float>, Vectorized<float>>
convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
static_assert(
Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f));
auto bf16_vec1 = svzip1_bf16(zero, a);
auto bf16_vec2 = svzip2_bf16(zero, a);
@@ -243,19 +241,18 @@ convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
inline Vectorized<c10::BFloat16> convert_float_bfloat16(
const Vectorized<float>& a,
const Vectorized<float>& b) {
static_assert(
Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
svbfloat16_t x1 = svcvt_bf16_f32_z(ptrue, a);
svbfloat16_t x2 = svcvt_bf16_f32_z(ptrue, b);
return Vectorized<c10::BFloat16>(svuzp1_bf16(x1, x2));
}

inline void load_fp32_from_bf16(const BFloat16* data, Vectorized<float>& out) {
__at_align__ float values[Vectorized<float>::size()];
__at_align__ float * values = new float[Vectorized<float>::size()];
for (const auto k : c10::irange(Vectorized<float>::size())) {
values[k] = data[k];
}
out = Vectorized<float>::loadu(values);
delete[] values;
}

inline void load_fp32_from_bf16(
@@ -308,8 +305,8 @@ Vectorized<c10::BFloat16> inline operator/(
}

inline Vectorized<BFloat16>::Vectorized() {
const short zero = 0;
values = svdup_n_bf16(c10::bit_cast<bfloat16_t>(zero));
auto vals_f = svdup_n_f32(0);
values = convert_float_bfloat16(vals_f, vals_f);
}

inline Vectorized<BFloat16>::Vectorized(int val) {

@@ -8,7 +8,7 @@
#include <ATen/cpu/vec/sve/sve_helper.h>
#include <ATen/cpu/vec/vec_base.h>

#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)
#include <ATen/cpu/vec/sve/vec_bfloat16.h>
#include <ATen/cpu/vec/sve/vec_double.h>
#include <ATen/cpu/vec/sve/vec_float.h>
@@ -27,7 +27,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define DEFINE_SVE_CAST(t1_t, t1_prefix, t2_t, t2_prefix) \
@@ -231,6 +231,5 @@ std::pair<
#endif // __ARM_FEATURE_BF16

#endif // defined(CPU_CAPABILITY_SVE)

} // namespace CPU_CAPABILITY
} // namespace at::vec
}

@@ -22,7 +22,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)

template <>
struct is_vec_specialized_for<double> : std::bool_constant<true> {};
@@ -55,10 +55,11 @@ class Vectorized<double> {
operator svfloat64_t() const {
return values;
}
template <uint64_t mask>
static Vectorized<double> blend(
const Vectorized<double>& a,
const Vectorized<double>& b) {
const Vectorized<double>& b,
int64_t mask
) {
// Build an array of flags: each element is 1 if the corresponding bit in
// 'mask' is set, 0 otherwise.
__at_align__ int64_t flag_arr[size()];

@@ -2,8 +2,10 @@

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/sve/sve_helper.h>
#include <ATen/cpu/vec/vec_base.h>

#include <algorithm>
#include <cmath>

#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
#include <sleef.h>
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
@@ -22,7 +24,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)

template <>
struct is_vec_specialized_for<float> : std::bool_constant<true> {};
@@ -30,52 +32,77 @@ struct is_vec_specialized_for<float> : std::bool_constant<true> {};
template <>
class Vectorized<float> {
private:
vls_float32_t values;

__at_align__ float values[2048 / sizeof(float)];
public:

using value_type = float;
using size_type = int;
static constexpr size_type size() {
return VECTOR_WIDTH / sizeof(float);
static inline size_type size() {
return svcntw();
}
Vectorized() {
values = svdup_n_f32(0);
inline Vectorized() {svst1_f32(ptrue, values, svdup_n_f32(0));}
inline Vectorized(const float val) {
svst1_f32(ptrue, values, svdup_n_f32(val));
}
Vectorized(svfloat32_t v) : values(v) {}
Vectorized(float val) {
values = svdup_n_f32(val);
inline Vectorized(const svfloat32_t val) {
svst1_f32(ptrue, values, val);
}
template <
typename... Args,
typename = std::enable_if_t<(sizeof...(Args) == size())>>
Vectorized(Args... vals) {
__at_align__ float buffer[size()] = {vals...};
values = svld1_f32(ptrue, buffer);
template<typename T,
typename = std::enable_if_t<std::is_pointer_v<T>>>
inline Vectorized(float * val) {
svst1_f32(ptrue, values, svld1_f32(ptrue, val));
}
operator svfloat32_t() const {
return values;
template<typename... Args,
typename = std::enable_if_t<(sizeof...(Args) == size())>>
inline Vectorized(Args... vals) {
values = { vals... };
}
template <uint64_t mask>
static Vectorized<float> blend(
const Vectorized<float>& a,
const Vectorized<float>& b) {
// Build an array of flags: each element is 1 if the corresponding bit in
// 'mask' is set, 0 otherwise.
__at_align__ int32_t flag_arr[size()];
inline operator svfloat32_t() const {
return svld1_f32(ptrue, values);
}
static inline Vectorized<float> from_ptr(const float * vs) {
Vectorized<float> v;
svst1_f32(ptrue, v.values, svld1_f32(ptrue, static_cast<const float *>(vs)));
return v;
}
static inline Vectorized<float> from_ptr(const float * vs, int count) {
Vectorized<float> v;
svst1_f32(ptrue, v.values, svld1_f32(svwhilelt_b32_s32(0, count), static_cast<const float *>(vs)));
return v;
}
inline void set_lane(int i, float value) {
values[i] = value;
}
inline Vectorized<float> map(float (*fn)(float)) const {
Vectorized<float> result;
for (int64_t i = 0; i < size(); ++i) {
result.set_lane(i, fn(values[i]));
}
return result;
}
inline Vectorized<float> map2(float (*fn)(float, float), const Vectorized<float> &b) const {
Vectorized<float> result;
for (int64_t i = 0; i < size(); ++i) {
result.set_lane(i, fn(values[i], b.values[i]));
}
return result;
}

static inline Vectorized<float> blend(const Vectorized<float>& a, const Vectorized<float>& b, const uint64_t mask) {
// Build an array of flags: each element is 1 if the corresponding bit in 'mask' is set, 0 otherwise.
__at_align__ int32_t * flag_arr = new int32_t[size()];
for (int i = 0; i < size(); i++) {
flag_arr[i] = (mask & (1ULL << i)) ? 1 : 0;
}
// Load the flag array into an SVE int32 vector.
svint32_t int_mask = svld1_s32(svptrue_b32(), flag_arr);
// Compare each lane of int_mask to 0; returns an svbool_t predicate where
// true indicates a nonzero flag.
svbool_t blend_mask = svcmpne_n_s32(svptrue_b32(), int_mask, 0);
// Use svsel to select elements from b where the predicate is true, else
// from a.
svfloat32_t result = svsel_f32(blend_mask, b.values, a.values);
return Vectorized<float>(result);
svint32_t int_mask = svld1_s32(ptrue, flag_arr);
delete[] flag_arr;
// Compare each lane of int_mask to 0; returns an svbool_t predicate where true indicates a nonzero flag.
svbool_t blend_mask = svcmpne_n_s32(ptrue, int_mask, 0);
// Use svsel to select elements from b where the predicate is true, else from a.
return svsel_f32(blend_mask, b, a);
}
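Both versions of blend() share one idea: expand an integer bitmask into a per-lane predicate, then select between two vectors lane by lane with svsel. A hedged scalar model of the same selection rule (plain C++, no SVE):

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

// Scalar model of the bitmask blend: lane i takes b[i] when bit i of
// 'mask' is set, otherwise a[i]. The SVE version does the same through
// an svbool_t predicate instead of an explicit loop.
template <size_t N>
std::array<float, N> blend(const std::array<float, N>& a,
                           const std::array<float, N>& b,
                           uint64_t mask) {
  std::array<float, N> out{};
  for (size_t i = 0; i < N; ++i) {
    out[i] = (mask & (uint64_t{1} << i)) ? b[i] : a[i];
  }
  return out;
}

int main() {
  std::array<float, 4> a{0, 1, 2, 3}, b{10, 11, 12, 13};
  auto r = blend(a, b, 0b0101);  // lanes 0 and 2 come from b
  std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);  // 10 1 12 3
  return 0;
}
```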
static Vectorized<float> blendv(
static inline Vectorized<float> blendv(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& mask_) {
@@ -84,16 +111,18 @@ class Vectorized<float> {
return svsel_f32(mask, b, a);
}
template <typename step_t>
static Vectorized<float> arange(
static inline Vectorized<float> arange(
float base = 0.f,
step_t step = static_cast<step_t>(1)) {
__at_align__ float buffer[size()];
__at_align__ float * buffer = new float[size()];
for (int64_t i = 0; i < size(); i++) {
buffer[i] = base + i * step;
}
return svld1_f32(ptrue, buffer);
auto tmp = Vectorized<float>::from_ptr(buffer);
delete[] buffer;
return tmp;
}
static Vectorized<float> set(
static inline Vectorized<float> set(
const Vectorized<float>& a,
const Vectorized<float>& b,
int64_t count = size()) {
@@ -169,271 +198,219 @@ class Vectorized<float> {
poly = svsel_f32(svcmpgt_f32(pg, x, max_input), inf, poly);
return poly;
}
static Vectorized<float> loadu(const void* ptr, int64_t count = size()) {
if (count == size())
return svld1_f32(ptrue, reinterpret_cast<const float*>(ptr));
svbool_t pg = svwhilelt_b32(0ull, count);
return svld1_f32(pg, reinterpret_cast<const float*>(ptr));
static inline Vectorized<float> loadu(const void* ptr) {
return Vectorized<float>::from_ptr(reinterpret_cast<const float *>(ptr));
}
void store(void* ptr, int64_t count = size()) const {
if (count == size()) {
svst1_f32(ptrue, reinterpret_cast<float*>(ptr), values);
} else {
svbool_t pg = svwhilelt_b32(0ull, count);
svst1_f32(pg, reinterpret_cast<float*>(ptr), values);
}
static inline Vectorized<float> loadu(const void* ptr, int64_t count) {
return Vectorized<float>::from_ptr(reinterpret_cast<const float *>(ptr), count);
}
const float& operator[](int idx) const = delete;
float& operator[](int idx) = delete;
int64_t zero_mask() const {
// returns an integer mask where all zero elements are translated to 1-bit
// and others are translated to 0-bit
inline void store(void* ptr) const {
svst1_f32(ptrue, static_cast<float *>(ptr), svld1_f32(ptrue, values));
}
inline void store(void* ptr, int count) const {
svst1_f32(svwhilelt_b32_s32(0, count), static_cast<float *>(ptr), svld1_f32(ptrue, values));
}
inline const float& operator[](int idx) const {
return values[idx];
};
inline float& operator[](int idx) {
return values[idx];
};
inline int64_t zero_mask() const {
// returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit
int64_t mask = 0;
__at_align__ int32_t mask_array[size()];
__at_align__ int32_t * mask_array = new int32_t[size()];

svbool_t svbool_mask = svcmpeq_f32(ptrue, values, ZERO_F32);
svst1_s32(
ptrue,
mask_array,
svsel_s32(svbool_mask, ALL_S32_TRUE_MASK, ALL_S32_FALSE_MASK));
for (int64_t i = 0; i < size(); ++i) {
if (mask_array[i])
mask |= (1ull << i);
svbool_t svbool_mask = svcmpeq_f32(ptrue, *this, ZERO_F32);
svst1_s32(ptrue, mask_array, svsel_s32(svbool_mask,
ALL_S32_TRUE_MASK,
ALL_S32_FALSE_MASK));
for (int64_t j = 0; j < size(); ++j) {
if (mask_array[j]) mask |= (1ull << j);
}
delete[] mask_array;
return mask;
}
Vectorized<float> isnan() const {
inline Vectorized<float> isnan() const {
// NaN check
svbool_t mask = svcmpuo_f32(ptrue, values, ZERO_F32);
auto mask = svcmpuo_f32(ptrue, *this, ZERO_F32);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
bool has_inf_nan() const {
return svptest_any(
ptrue,
svcmpuo_f32(ptrue, svsub_f32_x(ptrue, values, values), ZERO_F32));
inline bool has_inf_nan() const {
return svptest_any(ptrue, svcmpuo_f32(ptrue, svsub_f32_x(ptrue, *this, *this), ZERO_F32));
}
Vectorized<float> map(float (*f)(float)) const {
__at_align__ float tmp[size()];
store(tmp);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = f(tmp[i]);
}
return loadu(tmp);

inline Vectorized<float> abs() const {
return svabs_f32_x(ptrue, *this);
}
Vectorized<float> abs() const {
return svabs_f32_x(ptrue, values);
}
Vectorized<float> angle() const {
inline Vectorized<float> angle() const {
const auto nan_vec = svdup_n_f32(NAN);
const auto nan_mask = svcmpuo_f32(ptrue, values, ZERO_F32);
const auto nan_mask = svcmpuo_f32(ptrue, *this, ZERO_F32);
const auto pi = svdup_n_f32(c10::pi<float>);

const auto neg_mask = svcmplt_f32(ptrue, values, ZERO_F32);
const auto neg_mask = svcmplt_f32(ptrue, *this, ZERO_F32);
auto angle = svsel_f32(neg_mask, pi, ZERO_F32);
angle = svsel_f32(nan_mask, nan_vec, angle);
return angle;
return svsel_f32(nan_mask, nan_vec, angle);
}
Vectorized<float> real() const {
return values;
inline Vectorized<float> real() const {
return *this;
}
Vectorized<float> imag() const {
inline Vectorized<float> imag() const {
return Vectorized<float>(0.f);
}
Vectorized<float> conj() const {
return values;
inline Vectorized<float> conj() const {
return *this;
}
Vectorized<float> acos() const {
return USE_SLEEF(
Vectorized<float>(Sleef_acosfx_u10sve(values)), map(std::acos));
inline Vectorized<float> acos() const {
return USE_SLEEF(Sleef_acosfx_u10sve(*this), map(std::acos));
}
Vectorized<float> acosh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_acoshfx_u10sve(values)), map(std::acosh));
inline Vectorized<float> acosh() const {
return USE_SLEEF(Sleef_acoshfx_u10sve(*this), map(std::acosh));
}
Vectorized<float> asin() const {
return USE_SLEEF(
Vectorized<float>(Sleef_asinfx_u10sve(values)), map(std::asin));
inline Vectorized<float> asin() const {
return USE_SLEEF(Sleef_asinfx_u10sve(*this), map(std::asin));
}
Vectorized<float> asinh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_asinhfx_u10sve(values)), map(std::asinh));
inline Vectorized<float> asinh() const {
return USE_SLEEF(Sleef_asinhfx_u10sve(*this), map(std::asinh));
}
Vectorized<float> atan() const {
return USE_SLEEF(
Vectorized<float>(Sleef_atanfx_u10sve(values)), map(std::atan));
inline Vectorized<float> atan() const {
return USE_SLEEF(Sleef_atanfx_u10sve(*this), map(std::atan));
}
Vectorized<float> atanh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_atanhfx_u10sve(values)), map(std::atanh));
inline Vectorized<float> atanh() const {
return USE_SLEEF(Sleef_atanhfx_u10sve(*this), map(std::atanh));
}
Vectorized<float> atan2(const Vectorized<float>& b) const {USE_SLEEF(
{ return Vectorized<float>(Sleef_atan2fx_u10sve(values, b)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::atan2(tmp[i], tmp_b[i]);
}
return loadu(tmp);
})} Vectorized<float> copysign(const Vectorized<float>& sign) const {

USE_SLEEF(
{ return Vectorized<float>(Sleef_copysignfx_sve(values, sign)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_sign[size()];
store(tmp);
sign.store(tmp_sign);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = std::copysign(tmp[i], tmp_sign[i]);
}
return loadu(tmp);
})} Vectorized<float> erf() const {
return USE_SLEEF(
Vectorized<float>(Sleef_erffx_u10sve(values)), map(std::erf));
inline Vectorized<float> atan2(const Vectorized<float> &b) const {
return USE_SLEEF(Sleef_atan2fx_u10sve(*this, b), map2(std::atan2, b));
}
Vectorized<float> erfc() const {
return USE_SLEEF(
Vectorized<float>(Sleef_erfcfx_u15sve(values)), map(std::erfc));
inline Vectorized<float> copysign(const Vectorized<float> &sign) const {
return USE_SLEEF(Sleef_copysignfx_sve(*this, sign), map2(std::copysign, sign));
}
Vectorized<float> erfinv() const {
inline Vectorized<float> erf() const {
return USE_SLEEF(Sleef_erffx_u10sve(*this), map(std::erf));
}
inline Vectorized<float> erfc() const {
return USE_SLEEF(Sleef_erfcfx_u15sve(*this), map(std::erfc));
}
inline Vectorized<float> erfinv() const {
return map(calc_erfinv);
}
Vectorized<float> exp() const {
return USE_SLEEF(
Vectorized<float>(Sleef_expfx_u10sve(values)), map(std::exp));
inline Vectorized<float> exp() const {
return USE_SLEEF(Sleef_expfx_u10sve(*this), map(std::exp));
}
Vectorized<float> exp2() const {
return USE_SLEEF(
Vectorized<float>(Sleef_exp2fx_u10sve(values)), map(std::exp2));
inline Vectorized<float> exp2() const {
return USE_SLEEF(Sleef_exp2fx_u10sve(*this), map(std::exp2));
}
Vectorized<float> expm1() const {
return USE_SLEEF(
Vectorized<float>(Sleef_expm1fx_u10sve(values)), map(std::expm1));
inline Vectorized<float> expm1() const {
return USE_SLEEF(Sleef_expm1fx_u10sve(*this), map(std::expm1));
}
Vectorized<float> exp_u20() const {
return exp();
// Implementation copied from Arm Optimized Routines:
// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/sve/expf.c
inline Vectorized<float> exp_u20() {

// Load values into an SVE vector
svfloat32_t val_vec = svld1(svptrue_b32(), values); // 'values' is float*

// Check for special case: |x| >= 87.3...
svbool_t is_special_case = svacgt(svptrue_b32(), val_vec, 0x1.5d5e2ap+6f);
if (svptest_any(svptrue_b32(), is_special_case)) {
return exp(); // fallback to scalar exp() for special cases
}
Vectorized<float> fexp_u20() const {
return exp();

// Constants
const svfloat32_t ln2_hi = svdup_f32(0x1.62e4p-1f);
const svfloat32_t ln2_lo = svdup_f32(0x1.7f7d1cp-20f);
const svfloat32_t c1 = svdup_f32(0.5f);
const svfloat32_t inv_ln2 = svdup_f32(0x1.715476p+0f);
const svfloat32_t shift_vec = svdup_f32(0x1.803f8p17f); // scalar to vector

// n = round(x / ln2)
svfloat32_t z = svmad_x(svptrue_b32(), inv_ln2, val_vec, shift_vec);
svfloat32_t n = svsub_x(svptrue_b32(), z, shift_vec);

// r = x - n * ln2
svfloat32_t r = svsub_x(svptrue_b32(), val_vec, svmul_x(svptrue_b32(), n, ln2_hi));
r = svsub_x(svptrue_b32(), r, svmul_x(svptrue_b32(), n, ln2_lo));

// scale = 2^(n)
svfloat32_t scale = svexpa(svreinterpret_u32(z));

// poly(r) = exp(r) - 1 ≈ r + 0.5 * r^2
svfloat32_t r2 = svmul_x(svptrue_b32(), r, r);
svfloat32_t poly = svmla_x(svptrue_b32(), r, r2, c1);

// return scale * (1 + poly)
return svmla_x(svptrue_b32(), scale, scale, poly);
}
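The exp_u20 kernel above is standard base-2 range reduction: write x = n·ln2 + r with |r| ≤ ln2/2, obtain 2^n exactly from the exponent field (svexpa), and approximate e^r − 1 with a short polynomial. A hedged scalar model of the same decomposition (plain C++, no SVE; the two-term polynomial is kept deliberately short for clarity, so it is far less accurate than the real kernel):

```cpp
#include <cmath>
#include <cstdio>

// Scalar model of the range reduction used by the SVE expf kernel:
// exp(x) = 2^n * exp(r), with n = round(x / ln2) and r = x - n*ln2.
float expf_reduced(float x) {
  const float ln2 = 0.6931472f;
  float n = std::nearbyint(x / ln2);
  float r = x - n * ln2;                    // |r| <= ln2/2 ~= 0.3466
  float poly = r + 0.5f * r * r;            // exp(r) - 1 ~= r + r^2/2
  return std::ldexp(1.0f + poly, static_cast<int>(n));  // scale by 2^n
}

int main() {
  // Expect rough agreement only; the production kernel uses a tighter
  // polynomial and split-constant ln2 (ln2_hi + ln2_lo) for accuracy.
  std::printf("%f vs %f\n", expf_reduced(1.0f), std::exp(1.0f));
  return 0;
}
```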
Vectorized<float> fmod(const Vectorized<float>& q) const {USE_SLEEF(
{ return Vectorized<float>(Sleef_fmodfx_sve(values, q)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_q[size()];
store(tmp);
q.store(tmp_q);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = std::fmod(tmp[i], tmp_q[i]);
}
return loadu(tmp);
})} Vectorized<float> hypot(const Vectorized<float>& b) const {
USE_SLEEF(
{ return Vectorized<float>(Sleef_hypotfx_u05sve(values, b)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::hypot(tmp[i], tmp_b[i]);
}
return loadu(tmp);
})} Vectorized<float> i0() const {

inline Vectorized<float> fexp_u20() {
return exp_u20();
}
inline Vectorized<float> fmod(const Vectorized<float>& q) const {
return USE_SLEEF(Sleef_fmodfx_sve(*this, q), return map2(std::fmod, q));
}
inline Vectorized<float> hypot(const Vectorized<float> &b) const {
return USE_SLEEF(Sleef_hypotfx_u05sve(*this, b), map2(std::hypot, b));
}
inline Vectorized<float> i0() const {
return map(calc_i0);
}
Vectorized<float> i0e() const {
return map(calc_i0e);
inline Vectorized<float> i0e() const {
return map(calc_i0e<float>);
}
Vectorized<float> digamma() const {
inline Vectorized<float> digamma() const {
return map(calc_digamma);
}
Vectorized<float> igamma(const Vectorized<float>& x) const {
__at_align__ float tmp[size()];
__at_align__ float tmp_x[size()];
store(tmp);
x.store(tmp_x);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
}
return loadu(tmp);
inline Vectorized<float> igamma(const Vectorized<float> &x) const {
return map2(calc_igamma<float>, x);
}
Vectorized<float> igammac(const Vectorized<float>& x) const {
__at_align__ float tmp[size()];
__at_align__ float tmp_x[size()];
store(tmp);
x.store(tmp_x);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
}
return loadu(tmp);
inline Vectorized<float> igammac(const Vectorized<float> &x) const {
return map2(calc_igammac<float>, x);
}
Vectorized<float> nextafter(const Vectorized<float>& b) const {USE_SLEEF(
{ return Vectorized<float>(Sleef_nextafterfx_sve(values, b)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
}
return loadu(tmp);
})} Vectorized<float> log() const {
return USE_SLEEF(
Vectorized<float>(Sleef_logfx_u10sve(values)), map(std::log));
inline Vectorized<float> nextafter(const Vectorized<float> &b) const {
return USE_SLEEF(Sleef_nextafterfx_sve(*this, b), map2(std::nextafter, b));
}
Vectorized<float> log2() const {
return USE_SLEEF(
Vectorized<float>(Sleef_log2fx_u10sve(values)), map(std::log2));
inline Vectorized<float> log() const {
return USE_SLEEF(Sleef_logfx_u10sve(*this), map(std::log));
}
Vectorized<float> log10() const {
return USE_SLEEF(
Vectorized<float>(Sleef_log10fx_u10sve(values)), map(std::log10));
inline Vectorized<float> log2() const {
return USE_SLEEF(Sleef_log2fx_u10sve(*this), map(std::log2));
}
Vectorized<float> log1p() const {
return USE_SLEEF(
Vectorized<float>(Sleef_log1pfx_u10sve(values)), map(std::log1p));
inline Vectorized<float> log10() const {
return USE_SLEEF(Sleef_log10fx_u10sve(*this), map(std::log10));
}
Vectorized<float> frac() const;
Vectorized<float> sin() const {
return USE_SLEEF(
Vectorized<float>(Sleef_sinfx_u10sve(values)), map(std::sin));
inline Vectorized<float> log1p() const {
return USE_SLEEF(Sleef_log1pfx_u10sve(*this), map(std::log1p));
}
Vectorized<float> sinh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_sinhfx_u10sve(values)), map(std::sinh));
inline Vectorized<float> frac() const;
inline Vectorized<float> sin() const {
return USE_SLEEF(Sleef_sinfx_u10sve(*this), map(std::sin));
}
Vectorized<float> cos() const {
return USE_SLEEF(
Vectorized<float>(Sleef_cosfx_u10sve(values)), map(std::cos));
inline Vectorized<float> sinh() const {
return USE_SLEEF(Sleef_sinhfx_u10sve(*this), map(std::sinh));
}
Vectorized<float> cosh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_coshfx_u10sve(values)), map(std::cosh));
inline Vectorized<float> cos() const {
return USE_SLEEF(Sleef_cosfx_u10sve(*this), map(std::cos));
}
Vectorized<float> ceil() const {
return svrintp_f32_x(ptrue, values);
inline Vectorized<float> cosh() const {
return USE_SLEEF(Sleef_coshfx_u10sve(*this), map(std::cosh));
}
Vectorized<float> floor() const {
return svrintm_f32_x(ptrue, values);
inline Vectorized<float> ceil() const {
return svrintp_f32_x(ptrue, *this);
}
Vectorized<float> neg() const {
return svneg_f32_x(ptrue, values);
inline Vectorized<float> floor() const {
return svrintm_f32_x(ptrue, *this);
}
Vectorized<float> round() const {
return svrinti_f32_x(ptrue, values);
inline Vectorized<float> neg() const {
return svneg_f32_x(ptrue, *this);
}
Vectorized<float> tan() const {
return USE_SLEEF(
Vectorized<float>(Sleef_tanfx_u10sve(values)), map(std::tan));
inline Vectorized<float> round() const {
return svrinti_f32_x(ptrue, *this);
}
inline Vectorized<float> tan() const {
return USE_SLEEF(Sleef_tanfx_u10sve(*this), map(std::tan));
}
// Implementation is picked from
// https://github.com/ARM-software/ComputeLibrary/blob/v25.01/src/core/NEON/SVEMath.inl#L179
Vectorized<float> tanh() const {
inline Vectorized<float> tanh() const {
// Constants used for the tanh calculation.
const svfloat32_t CONST_1 =
svdup_n_f32(1.f); // Constant 1.0f for the tanh formula.
@@ -450,7 +427,7 @@ class Vectorized<float> {
// instability. svmax_f32_z ensures values are greater than -10, and
// svmin_f32_z ensures they are less than 10.
svfloat32_t x = svmin_f32_z(
ptrue, svmax_f32_z(ptrue, values, CONST_MIN_TANH), CONST_MAX_TANH);
ptrue, svmax_f32_z(ptrue, *this, CONST_MIN_TANH), CONST_MAX_TANH);

// Step 2: Calculate exp(2 * x), where x is the clamped value.
// svmul_f32_z computes 2 * x, and svexp_f32_z computes the exponential of
@@ -472,104 +449,85 @@ class Vectorized<float> {
// Return the calculated tanh values.
return tanh;
}
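The clamped-exponential tanh above computes tanh(x) = (e^{2x} − 1)/(e^{2x} + 1) after clamping x to roughly [−10, 10], which costs nothing because tanh saturates to ±1 well before that. A hedged scalar sketch of the same three steps:

```cpp
#include <cmath>
#include <cstdio>

// Scalar model of the SVE tanh: clamp, one exp, then a rational combine.
// tanh saturates to +/-1 around |x| ~ 9, so the clamp loses no accuracy.
float tanh_via_exp(float x) {
  const float lo = -10.f, hi = 10.f;
  x = std::fmin(std::fmax(x, lo), hi);   // step 1: clamp for stability
  float e2x = std::exp(2.0f * x);        // step 2: exp(2x)
  return (e2x - 1.0f) / (e2x + 1.0f);    // step 3: (e^2x - 1) / (e^2x + 1)
}

int main() {
  std::printf("%f vs %f\n", tanh_via_exp(0.5f), std::tanh(0.5f));
  return 0;
}
```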
Vectorized<float> trunc() const {
|
||||
return svrintz_f32_x(ptrue, values);
|
||||
inline Vectorized<float> trunc() const {
|
||||
return svrintz_f32_x(ptrue, *this);
|
||||
}
|
||||
Vectorized<float> lgamma() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_lgammafx_u10sve(values)), map(std::lgamma));
|
||||
inline Vectorized<float> lgamma() const {
|
||||
return USE_SLEEF(Sleef_lgammafx_u10sve(*this), map(std::lgamma));
|
||||
}
|
||||
Vectorized<float> sqrt() const {
|
||||
return svsqrt_f32_x(ptrue, values);
|
||||
inline Vectorized<float> sqrt() const {
|
||||
return svsqrt_f32_x(ptrue, *this);
|
||||
}
|
||||
Vectorized<float> reciprocal() const {
|
||||
return svdivr_f32_x(ptrue, values, ONE_F32);
|
||||
inline Vectorized<float> reciprocal() const {
|
||||
return svdivr_f32_x(ptrue, *this, svdup_n_f32(1.f));
|
||||
}
|
||||
Vectorized<float> rsqrt() const {
|
||||
return svdivr_f32_x(ptrue, svsqrt_f32_x(ptrue, values), ONE_F32);
|
||||
inline Vectorized<float> rsqrt() const {
|
||||
return svdivr_f32_x(ptrue, svsqrt_f32_x(ptrue, *this), ONE_F32);
|
||||
}
|
||||
-  Vectorized<float> pow(const Vectorized<float>& b) const {
-    USE_SLEEF(
-        { return Vectorized<float>(Sleef_powfx_u10sve(values, b)); },
-        {
-          __at_align__ float tmp[size()];
-          __at_align__ float tmp_b[size()];
-          store(tmp);
-          b.store(tmp_b);
-          for (int64_t i = 0; i < size(); i++) {
-            tmp[i] = std::pow(tmp[i], tmp_b[i]);
-          }
-          return loadu(tmp);
-        })
-  }
+  inline Vectorized<float> pow(const Vectorized<float>& b) const {
+    return USE_SLEEF(Sleef_powfx_u10sve(*this, b), map(std::pow, b));
+  }
   // Comparison using the _CMP_**_OQ predicate.
   //   `O`: get false if an operand is NaN
   //   `Q`: do not raise if an operand is NaN
-  Vectorized<float> operator==(const Vectorized<float>& other) const {
-    svbool_t mask = svcmpeq_f32(ptrue, values, other);
+  inline Vectorized<float> operator==(const Vectorized<float>& other) const {
+    svbool_t mask = svcmpeq_f32(ptrue, *this, other);
     return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
   }
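A scalar sketch of the ordered-quiet (OQ) semantics the comment above describes (illustrative, not part of the diff): any comparison with a NaN operand yields false, without raising a floating-point exception.

#include <cassert>
#include <cmath>

int main() {
  float nan = std::nanf("");
  assert(!(nan == nan)); // equality with NaN is false ("O": ordered)
  assert(!(nan < 1.0f)); // ordering with NaN is false, and quiet ("Q")
  // The SVE operator== above returns a per-lane bit mask instead:
  // ALL_F32_TRUE_MASK where equal, ALL_F32_FALSE_MASK where not (or NaN).
}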
-  Vectorized<float> operator!=(const Vectorized<float>& other) const {
-    svbool_t mask = svcmpne_f32(ptrue, values, other);
+  inline Vectorized<float> operator!=(const Vectorized<float>& other) const {
+    svbool_t mask = svcmpne_f32(ptrue, *this, other);
     return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
   }

-  Vectorized<float> operator<(const Vectorized<float>& other) const {
-    svbool_t mask = svcmplt_f32(ptrue, values, other);
+  inline Vectorized<float> operator<(const Vectorized<float>& other) const {
+    svbool_t mask = svcmplt_f32(ptrue, *this, other);
     return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
   }

-  Vectorized<float> operator<=(const Vectorized<float>& other) const {
-    svbool_t mask = svcmple_f32(ptrue, values, other);
+  inline Vectorized<float> operator<=(const Vectorized<float>& other) const {
+    svbool_t mask = svcmple_f32(ptrue, *this, other);
     return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
   }

-  Vectorized<float> operator>(const Vectorized<float>& other) const {
-    svbool_t mask = svcmpgt_f32(ptrue, values, other);
+  inline Vectorized<float> operator>(const Vectorized<float>& other) const {
+    svbool_t mask = svcmpgt_f32(ptrue, *this, other);
     return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
   }

-  Vectorized<float> operator>=(const Vectorized<float>& other) const {
-    svbool_t mask = svcmpge_f32(ptrue, values, other);
+  inline Vectorized<float> operator>=(const Vectorized<float>& other) const {
+    svbool_t mask = svcmpge_f32(ptrue, *this, other);
     return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
   }

-  Vectorized<float> eq(const Vectorized<float>& other) const;
-  Vectorized<float> ne(const Vectorized<float>& other) const;
-  Vectorized<float> gt(const Vectorized<float>& other) const;
-  Vectorized<float> ge(const Vectorized<float>& other) const;
-  Vectorized<float> lt(const Vectorized<float>& other) const;
-  Vectorized<float> le(const Vectorized<float>& other) const;
+  inline Vectorized<float> eq(const Vectorized<float>& other) const;
+  inline Vectorized<float> ne(const Vectorized<float>& other) const;
+  inline Vectorized<float> gt(const Vectorized<float>& other) const;
+  inline Vectorized<float> ge(const Vectorized<float>& other) const;
+  inline Vectorized<float> lt(const Vectorized<float>& other) const;
+  inline Vectorized<float> le(const Vectorized<float>& other) const;
 };
 template <>
-Vectorized<float> inline operator+(
-    const Vectorized<float>& a,
-    const Vectorized<float>& b) {
+inline Vectorized<float> operator+(const Vectorized<float>& a, const Vectorized<float>& b) {
   return svadd_f32_x(ptrue, a, b);
 }

 template <>
-Vectorized<float> inline operator-(
-    const Vectorized<float>& a,
-    const Vectorized<float>& b) {
+inline Vectorized<float> operator-(const Vectorized<float>& a, const Vectorized<float>& b) {
   return svsub_f32_x(ptrue, a, b);
 }

 template <>
-Vectorized<float> inline operator*(
-    const Vectorized<float>& a,
-    const Vectorized<float>& b) {
+inline Vectorized<float> operator*(const Vectorized<float>& a, const Vectorized<float>& b) {
   return svmul_f32_x(ptrue, a, b);
 }

 template <>
-Vectorized<float> inline operator/(
-    const Vectorized<float>& a,
-    const Vectorized<float>& b) {
+inline Vectorized<float> operator/(const Vectorized<float>& a, const Vectorized<float>& b) {
   return svdiv_f32_x(ptrue, a, b);
 }
 // frac. Implement this here so we can use subtraction
-Vectorized<float> inline Vectorized<float>::frac() const {
+inline Vectorized<float> Vectorized<float>::frac() const {
   return *this - this->trunc();
 }
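A scalar sketch of the identity frac() relies on (illustrative): truncation toward zero means the fractional part keeps the sign of the input.

#include <cassert>
#include <cmath>

// Scalar model of frac(): x - trunc(x).
inline float frac_ref(float x) { return x - std::trunc(x); }

int main() {
  assert(frac_ref(2.75f) == 0.75f);   // trunc(2.75) == 2
  assert(frac_ref(-2.75f) == -0.75f); // trunc(-2.75) == -2
}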
@@ -585,115 +543,91 @@ Vectorized<float> inline maximum(
 // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
 // either input is a NaN.
 template <>
-Vectorized<float> inline minimum(
-    const Vectorized<float>& a,
-    const Vectorized<float>& b) {
+inline Vectorized<float> minimum(const Vectorized<float>& a, const Vectorized<float>& b) {
   return svmin_f32_x(ptrue, a, b);
 }

 template <>
-Vectorized<float> inline clamp(
-    const Vectorized<float>& a,
-    const Vectorized<float>& min,
-    const Vectorized<float>& max) {
+inline Vectorized<float> clamp(const Vectorized<float>& a, const Vectorized<float>& min, const Vectorized<float>& max) {
   return svmin_f32_x(ptrue, max, svmax_f32_x(ptrue, min, a));
 }

 template <>
-Vectorized<float> inline clamp_max(
-    const Vectorized<float>& a,
-    const Vectorized<float>& max) {
+inline Vectorized<float> clamp_max(const Vectorized<float>& a, const Vectorized<float>& max) {
   return svmin_f32_x(ptrue, max, a);
 }

 template <>
-Vectorized<float> inline clamp_min(
-    const Vectorized<float>& a,
-    const Vectorized<float>& min) {
+inline Vectorized<float> clamp_min(const Vectorized<float>& a, const Vectorized<float>& min) {
   return svmax_f32_x(ptrue, min, a);
 }
 template <>
-Vectorized<float> inline operator&(
-    const Vectorized<float>& a,
-    const Vectorized<float>& b) {
-  return svreinterpret_f32_s32(
-      svand_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
+inline Vectorized<float> operator&(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return svreinterpret_f32_s32(svand_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
 }

 template <>
-Vectorized<float> inline operator|(
-    const Vectorized<float>& a,
-    const Vectorized<float>& b) {
-  return svreinterpret_f32_s32(
-      svorr_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
+inline Vectorized<float> operator|(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return svreinterpret_f32_s32(svorr_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
 }

 template <>
-Vectorized<float> inline operator^(
-    const Vectorized<float>& a,
-    const Vectorized<float>& b) {
-  return svreinterpret_f32_s32(
-      sveor_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
+inline Vectorized<float> operator^(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return svreinterpret_f32_s32(sveor_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
 }
-Vectorized<float> inline Vectorized<float>::eq(
-    const Vectorized<float>& other) const {
+inline Vectorized<float> Vectorized<float>::eq(const Vectorized<float>& other) const {
   return (*this == other) & Vectorized<float>(1.0f);
 }

-Vectorized<float> inline Vectorized<float>::ne(
-    const Vectorized<float>& other) const {
+inline Vectorized<float> Vectorized<float>::ne(const Vectorized<float>& other) const {
   return (*this != other) & Vectorized<float>(1.0f);
 }

-Vectorized<float> inline Vectorized<float>::gt(
-    const Vectorized<float>& other) const {
+inline Vectorized<float> Vectorized<float>::gt(const Vectorized<float>& other) const {
   return (*this > other) & Vectorized<float>(1.0f);
 }

-Vectorized<float> inline Vectorized<float>::ge(
-    const Vectorized<float>& other) const {
+inline Vectorized<float> Vectorized<float>::ge(const Vectorized<float>& other) const {
   return (*this >= other) & Vectorized<float>(1.0f);
 }

-Vectorized<float> inline Vectorized<float>::lt(
-    const Vectorized<float>& other) const {
+inline Vectorized<float> Vectorized<float>::lt(const Vectorized<float>& other) const {
   return (*this < other) & Vectorized<float>(1.0f);
 }

-Vectorized<float> inline Vectorized<float>::le(
-    const Vectorized<float>& other) const {
+inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) const {
   return (*this <= other) & Vectorized<float>(1.0f);
 }
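A scalar sketch of why eq()/ne() AND the compare result with 1.0f (illustrative): the comparison yields an all-ones or all-zeros lane mask, and ANDing those bits with the bits of 1.0f leaves exactly 1.0f or 0.0f.

#include <cstdint>
#include <cstring>

// Scalar model of eq(): numeric 0/1 from a bitwise lane mask.
inline float eq_ref(float a, float b) {
  uint32_t mask = (a == b) ? 0xFFFFFFFFu : 0u; // lane mask from the compare
  float one = 1.0f;
  uint32_t bits;
  std::memcpy(&bits, &one, sizeof(bits));
  bits &= mask;                                // keep 1.0f's bits or clear them
  float out;
  std::memcpy(&out, &bits, sizeof(out));
  return out;                                  // 1.0f if equal, 0.0f otherwise
}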
 template <>
 inline void convert(const float* src, float* dst, int64_t n) {
-  const int64_t fraction = n % Vectorized<float>::size();
+  const int64_t fraction = n % svcntw();
 #pragma unroll
-  for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
+  for (int64_t i = 0; i < n - fraction; i += svcntw()) {
     svst1_f32(ptrue, dst + i, svldnt1_f32(ptrue, src + i));
   }
 #pragma unroll
-  for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
+  for (int64_t i = n - fraction; i < n; i += svcntw()) {
     svbool_t pg = svwhilelt_b32(i, n);
     svst1_f32(pg, dst + i, svldnt1_f32(pg, src + i));
   }
 }
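The loop structure above is the usual SVE idiom: full-width iterations for the bulk, then svwhilelt builds a partial predicate for the tail. A scalar sketch of the same control flow, with vl standing in for svcntw() (illustrative only):

#include <cstdint>

// Scalar model of the predicated copy: full vectors of length vl,
// then one masked pass over the remaining n % vl elements.
void copy_ref(const float* src, float* dst, int64_t n, int64_t vl) {
  int64_t fraction = n % vl;
  for (int64_t i = 0; i < n - fraction; i += vl)
    for (int64_t j = 0; j < vl; ++j)                // ptrue: every lane active
      dst[i + j] = src[i + j];
  for (int64_t i = n - fraction; i < n; i += vl)
    for (int64_t j = 0; j < vl && i + j < n; ++j)   // whilelt(i, n) predicate
      dst[i + j] = src[i + j];
}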
 template <>
 inline void convert(const float* src, at::Half* dst, int64_t n) {
-  const int64_t fraction = n % Vectorized<float>::size();
-  svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized<float>::size());
-  svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
+  const int64_t fraction = n % svcntw();
+  svbool_t pg_16 = svwhilelt_b16(0ull, svcntw());
+  svbool_t pg_32 = svwhilelt_b32(0ull, svcntw());
 #pragma unroll
-  for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
-    svfloat16_t src_vec = svuzp1_f16(
-        svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)), ZERO_F16);
+  for (int64_t i = 0; i < n - fraction; i += svcntw()) {
+    svfloat16_t src_vec = svuzp1_f16(svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)),
+                                     ZERO_F16);
     svst1_f16(pg_16, reinterpret_cast<float16_t*>(dst) + i, src_vec);
   }
 #pragma unroll
-  for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
+  for (int64_t i = n - fraction; i < n; i += svcntw()) {
     pg_16 = svwhilelt_b16(i, n);
     pg_32 = svwhilelt_b32(i, n);
     svfloat16_t src_vec = svuzp1_f16(
@@ -703,19 +637,18 @@ inline void convert(const float* src, at::Half* dst, int64_t n) {
 }
 template <>
 inline void convert(const at::Half* src, float* dst, int64_t n) {
-  const int64_t fraction = n % Vectorized<float>::size();
-  svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized<float>::size());
-  svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
+  const int64_t fraction = n % svcntw();
+  svbool_t pg_16 = svwhilelt_b16(0ull, svcntw());
+  svbool_t pg_32 = svwhilelt_b32(0ull, svcntw());
 #pragma unroll
-  for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
-    svfloat16_t src_vec = svzip1_f16(
-        svldnt1_f16(pg_16, reinterpret_cast<const float16_t*>(src) + i),
-        ZERO_F16);
+  for (int64_t i = 0; i < n - fraction; i += svcntw()) {
+    svfloat16_t src_vec = svzip1_f16(svldnt1_f16(pg_16, reinterpret_cast<const float16_t*>(src) + i),
+                                     ZERO_F16);
     svst1_f32(pg_32, dst + i, svcvt_f32_f16_x(ptrue, src_vec));
   }
 #pragma unroll
-  for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
+  for (int64_t i = n - fraction; i < n; i += svcntw()) {
     pg_16 = svwhilelt_b16(i, n);
     pg_32 = svwhilelt_b32(i, n);
     svfloat16_t src_vec = svzip1_f16(
@@ -726,20 +659,19 @@ inline void convert(const at::Half* src, float* dst, int64_t n) {
 }
 template <>
 inline void convert(const bool* src, float* dst, int64_t n) {
-  const int64_t fraction = n % Vectorized<float>::size();
-  svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized<float>::size());
-  svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
+  const int64_t fraction = n % svcntw();
+  svbool_t pg_8 = svwhilelt_b8(0ull, svcntw());
+  svbool_t pg_32 = svwhilelt_b32(0ull, svcntw());
 #pragma unroll
-  for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
-    svuint8_t src_vec_u8 =
-        svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
+  for (int64_t i = 0; i < n - fraction; i += svcntw()) {
+    svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
     svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8));
     svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32);
     svst1_f32(pg_32, dst + i, svsel_f32(mask, ONE_F32, ZERO_F32));
   }
 #pragma unroll
-  for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
+  for (int64_t i = n - fraction; i < n; i += svcntw()) {
     pg_8 = svwhilelt_b8(i, n);
     pg_32 = svwhilelt_b32(i, n);
     svuint8_t src_vec_u8 =
@@ -751,10 +683,7 @@ inline void convert(const bool* src, float* dst, int64_t n) {
 }
 template <>
-Vectorized<float> inline fmadd(
-    const Vectorized<float>& a,
-    const Vectorized<float>& b,
-    const Vectorized<float>& c) {
+inline Vectorized<float> fmadd(const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& c) {
   return svmad_f32_x(ptrue, a, b, c);
 }
@@ -785,4 +714,4 @@ Vectorized<float> inline fnmsub(
 #endif // defined(CPU_CAPABILITY_SVE)

 } // namespace CPU_CAPABILITY
-} // namespace at::vec
+} // namespace at::vec
@@ -15,7 +15,7 @@ namespace at::vec {
 // accessed as `at::vec`.
 inline namespace CPU_CAPABILITY {

-#if defined(CPU_CAPABILITY_SVE)
+#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)

 #define VEC_INT_SVE_TEMPLATE(vl, bit) \
   template <> \
@@ -49,10 +49,11 @@ inline namespace CPU_CAPABILITY {
   operator svint##bit##_t() const { \
     return values; \
   } \
-  template <uint64_t mask> \
   static Vectorized<int##bit##_t> blend( \
       const Vectorized<int##bit##_t>& a, \
-      const Vectorized<int##bit##_t>& b) { \
+      const Vectorized<int##bit##_t>& b, \
+      uint64_t mask \
+  ) { \
    __at_align__ int##bit##_t flag_arr[size()]; \
    for (int i = 0; i < size(); ++i) { \
      flag_arr[i] = (i < 64 && (mask & (1ULL << i))) ? 1 : 0; \
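The macro body expands the (now runtime) bit mask into a per-lane flag array and selects between a and b lane by lane. A scalar sketch of that expansion (illustrative only):

#include <cstdint>
#include <vector>

// Scalar model of the blend above: bit i of `mask` decides whether lane i
// comes from b (bit set) or a (bit clear); lanes past 64 always take a.
std::vector<int32_t> blend_ref(const std::vector<int32_t>& a,
                               const std::vector<int32_t>& b,
                               uint64_t mask) {
  std::vector<int32_t> out(a.size());
  for (size_t i = 0; i < a.size(); ++i) {
    bool take_b = (i < 64) && (mask & (1ULL << i));
    out[i] = take_b ? b[i] : a[i];
  }
  return out;
}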
@@ -493,7 +494,7 @@ Vectorized<int8_t> inline operator>>(
   return svasr_s8_x(ptrue, a, svreinterpret_u8_s8(b));
 }
-#endif // defined(CPU_CAPABILITY_SVE)
+#endif // defined(CPU_CAPABILITY_SVE256)

 } // namespace CPU_CAPABILITY
 } // namespace at::vec
@@ -46,7 +46,7 @@ namespace at::vec {
 // accessed as `at::vec`.
 inline namespace CPU_CAPABILITY {

-#if defined(CPU_CAPABILITY_SVE)
+#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)

 // NOTE: These are low-performance implementations that we fall back on
 // if we are not building with SVE. This may not be an issue, because
@@ -100,12 +100,12 @@ struct VectorizedQuantizedConverter {
       Vectorized<float> zero_point,
       Vectorized<float> scale_zp_premul) const {
     float_vec_return_type rv;
-    float tmp_scale[Vectorized<float>::size()];
-    float tmp_zero_point[Vectorized<float>::size()];
+    float* tmp_scale = new float[Vectorized<float>::size()];
+    float* tmp_zero_point = new float[Vectorized<float>::size()];
     scale.store(tmp_scale);
     zero_point.store(tmp_zero_point);
     for (int i = 0; i < float_num_vecs(); ++i) {
-      float tmp_vals[Vectorized<float>::size()];
+      float* tmp_vals = new float[Vectorized<float>::size()];
       for (int j = 0; j < Vectorized<float>::size(); ++j) {
         tmp_vals[j] = at::native::dequantize_val<T>(
             tmp_scale[j],
@@ -113,6 +113,10 @@ struct VectorizedQuantizedConverter {
             T(vals[Vectorized<float>::size() * i + j]));
       }
       rv[i] = Vectorized<float>::loadu(tmp_vals);
+
+      delete[] tmp_scale;
+      delete[] tmp_zero_point;
+      delete[] tmp_vals;
     }
     return rv;
   }
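The switch from stack arrays to new[]/delete[] in these hunks follows from the branch's theme: once Vectorized<float>::size() stops being a compile-time constant (it ultimately tracks the hardware vector length), `float tmp[size()]` would be a variable-length array, which standard C++ forbids. A minimal illustration of that constraint, with a hypothetical runtime_lanes() standing in for the vector length (not part of the diff):

#include <memory>

// Stand-in for the hardware lane count (on SVE this would come from
// svcntw()); the point is that it is not a constant expression.
int runtime_lanes() { return 8; }

int main() {
  // float buf[runtime_lanes()];  // ill-formed: VLAs are not standard C++
  auto buf = std::make_unique<float[]>(runtime_lanes()); // heap buffer instead,
  buf[0] = 1.0f;                  // matching the new[]/delete[] in the diff
  return 0;
}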
@@ -121,12 +125,12 @@ struct VectorizedQuantizedConverter {
       Vectorized<float> scale,
       Vectorized<float> zero_point) const {
     float_vec_return_type rv;
-    float tmp_scale[Vectorized<float>::size()];
-    float tmp_zero_point[Vectorized<float>::size()];
+    float* tmp_scale = new float[Vectorized<float>::size()];
+    float* tmp_zero_point = new float[Vectorized<float>::size()];
     scale.store(tmp_scale);
     zero_point.store(tmp_zero_point);
     for (int i = 0; i < float_num_vecs(); ++i) {
-      float tmp_vals[Vectorized<float>::size()];
+      float* tmp_vals = new float[Vectorized<float>::size()];
       for (int j = 0; j < Vectorized<float>::size(); ++j) {
         tmp_vals[j] = at::native::dequantize_val<T>(
             tmp_scale[j],
@@ -134,6 +138,9 @@ struct VectorizedQuantizedConverter {
             T(vals[Vectorized<float>::size() * i + j]));
       }
       rv[i] = Vectorized<float>::loadu(tmp_vals);
+      delete[] tmp_scale;
+      delete[] tmp_zero_point;
+      delete[] tmp_vals;
     }
     return rv;
   }
@@ -205,7 +212,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
       int32_t zero_point,
       float inverse_scale) {
     std::array<value_type, size()> qvals;
-    std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
+    float* float_vals = new float[float_num_vecs() * Vectorized<float>::size()];

     for (int i = 0; i < float_num_vecs(); ++i) {
       rhs[i].store(
@@ -216,10 +223,11 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
     at::native::quantize_vec<c10::qint32, /*precision=*/32>(
         scale,
         zero_point,
-        float_vals.data(),
+        float_vals,
         (c10::qint32*)qvals.data(),
         Vectorized<float>::size() * float_num_vecs());

+    delete[] float_vals;
     return Vectorized<c10::qint32>::loadu(qvals.data());
   }
@@ -359,7 +367,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
       int32_t zero_point,
       float inverse_scale) {
     std::array<value_type, size()> qvals;
-    std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
+    float* float_vals = new float[float_num_vecs() * Vectorized<float>::size()];

     for (int i = 0; i < float_num_vecs(); ++i) {
       rhs[i].store(
@@ -370,10 +378,11 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
     at::native::quantize_vec<c10::qint8>(
         scale,
         zero_point,
-        float_vals.data(),
+        float_vals,
         (c10::qint8*)qvals.data(),
         Vectorized<float>::size() * float_num_vecs());

+    delete[] float_vals;
     return Vectorized<c10::qint8>::loadu(qvals.data());
   }

@@ -511,7 +520,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
       int32_t zero_point,
       float inverse_scale) {
     std::array<value_type, size()> qvals;
-    std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
+    float* float_vals = new float[float_num_vecs() * Vectorized<float>::size()];

     for (int i = 0; i < float_num_vecs(); ++i) {
       rhs[i].store(
@@ -522,10 +531,11 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
     at::native::quantize_vec<c10::quint8>(
         scale,
         zero_point,
-        float_vals.data(),
+        float_vals,
         (c10::quint8*)qvals.data(),
         Vectorized<float>::size() * float_num_vecs());

+    delete[] float_vals;
     return Vectorized<c10::quint8>::loadu(qvals.data());
   }
@@ -600,7 +610,7 @@ Vectorized<c10::quint8> inline maximum(
   return a.maximum(b);
 }

-#endif // defined(CPU_CAPABILITY_SVE)
+#endif // defined(CPU_CAPABILITY_SVE256)

 } // namespace CPU_CAPABILITY
 } // namespace at::vec
@@ -4,7 +4,9 @@
 #include <ATen/cpu/vec/intrinsics.h>

 #ifdef __aarch64__
-#if !defined(CPU_CAPABILITY_SVE)
+#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)
+#include <ATen/cpu/vec/sve/vec_common_sve.h>
+#else
 #include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
 #include <ATen/cpu/vec/vec128/vec128_float_neon.h>
 #include <ATen/cpu/vec/vec128/vec128_half_neon.h>
@@ -241,7 +241,7 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
   Vectorized() = default;

   Vectorized(c10::BFloat16 val)
-      : Vectorized16(at_vdupq_n_bf16(c10::bit_cast<at_bfloat16_t>(val.x))) {}
+      : Vectorized16(at_vdupq_n_bf16(val.x)) {}
   Vectorized(float val) : Vectorized(c10::BFloat16(val)) {}
   Vectorized(
       value_type val0,
@@ -253,14 +253,14 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
       value_type val6,
       value_type val7)
       : Vectorized16(at_bfloat16x8_t{
-            c10::bit_cast<at_bfloat16_t>(val0.x),
-            c10::bit_cast<at_bfloat16_t>(val1.x),
-            c10::bit_cast<at_bfloat16_t>(val2.x),
-            c10::bit_cast<at_bfloat16_t>(val3.x),
-            c10::bit_cast<at_bfloat16_t>(val4.x),
-            c10::bit_cast<at_bfloat16_t>(val5.x),
-            c10::bit_cast<at_bfloat16_t>(val6.x),
-            c10::bit_cast<at_bfloat16_t>(val7.x)}) {}
+            val0.x,
+            val1.x,
+            val2.x,
+            val3.x,
+            val4.x,
+            val5.x,
+            val6.x,
+            val7.x}) {}

   static Vectorized<c10::BFloat16> blendv(
       const Vectorized<c10::BFloat16>& a,
@@ -4,7 +4,7 @@

 namespace at::vec {
 inline namespace CPU_CAPABILITY {
-#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
+#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE))
 template <typename src_t>
 struct VecConvert<
     float,
@@ -41,32 +41,16 @@ inline namespace CPU_CAPABILITY {
 #define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
 #endif

-template <int index, bool mask_val>
-struct BlendRegs {
-  static float32x4_t impl(
-      const float32x4_t& a,
-      const float32x4_t& b,
-      float32x4_t& res);
-};
-
-template <int index>
-struct BlendRegs<index, true> {
-  static float32x4_t impl(
-      const float32x4_t& a,
-      const float32x4_t& b,
-      float32x4_t& res) {
-    return vsetq_lane_f32(vgetq_lane_f32(b, index), res, index);
-  }
-};
-
-template <int index>
-struct BlendRegs<index, false> {
-  static float32x4_t impl(
-      const float32x4_t& a,
-      const float32x4_t& b,
-      float32x4_t& res) {
-    return vsetq_lane_f32(vgetq_lane_f32(a, index), res, index);
-  }
-};
+template <int index>
+struct BlendRegs {
+  static float32x4_t impl(
+      const float32x4_t& a,
+      const float32x4_t& b,
+      float32x4_t& res,
+      bool mask_val) {
+    return vsetq_lane_f32(vgetq_lane_f32(mask_val ? b : a, index), res, index);
+  }
+};

 template <>
@@ -94,19 +78,15 @@ class Vectorized<float> {
   operator float32x4_t() const {
     return values;
   }
-  template <int64_t mask>
   static Vectorized<float> blend(
       const Vectorized<float>& a,
-      const Vectorized<float>& b) {
+      const Vectorized<float>& b,
+      int64_t mask) {
     Vectorized<float> vec;
-    vec.values = BlendRegs < 0,
-        (mask & 0x01) != 0 > ::impl(a.values, b.values, vec.values);
-    vec.values = BlendRegs < 1,
-        (mask & 0x02) != 0 > ::impl(a.values, b.values, vec.values);
-    vec.values = BlendRegs < 2,
-        (mask & 0x04) != 0 > ::impl(a.values, b.values, vec.values);
-    vec.values = BlendRegs < 3,
-        (mask & 0x08) != 0 > ::impl(a.values, b.values, vec.values);
+    vec.values = BlendRegs<0>::impl(a.values, b.values, vec.values, (mask & 0x01) != 0);
+    vec.values = BlendRegs<1>::impl(a.values, b.values, vec.values, (mask & 0x02) != 0);
+    vec.values = BlendRegs<2>::impl(a.values, b.values, vec.values, (mask & 0x04) != 0);
+    vec.values = BlendRegs<3>::impl(a.values, b.values, vec.values, (mask & 0x08) != 0);
     return vec;
   }
   static Vectorized<float> blendv(
@@ -307,11 +287,50 @@ class Vectorized<float> {
   DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp)
   DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp2)
   DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1)
+  // Implementation copied from Arm Optimized Routine
+  // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
   Vectorized<float> exp_u20() const {
-    return exp();
+    // bail out to sleef if it's a special case:
+    // i.e. there's an input s.t. |input| > 87.3...
+    const float32x4_t special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);
+    uint32x4_t cmp = vcagtq_f32(values, special_bound);
+    if (vpaddd_u64(vreinterpretq_u64_u32(cmp)) != 0) {
+      return exp();
+    }
+
+    const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f);
+    const float ln2_hi = 0x1.62e4p-1f;
+    const float ln2_lo = 0x1.7f7d1cp-20f;
+    const float c0 = 0x1.0e4020p-7f;
+    const float c2 = 0x1.555e66p-3f;
+    const float32x4_t ln2_c02 = {ln2_hi, ln2_lo, c0, c2};
+
+    const uint32x4_t exponent_bias = vdupq_n_u32(0x3f800000);
+    const float32x4_t c1 = vdupq_n_f32(0x1.573e2ep-5f);
+    const float32x4_t c3 = vdupq_n_f32(0x1.fffdb6p-2f);
+    const float32x4_t c4 = vdupq_n_f32(0x1.ffffecp-1f);
+
+    /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+       x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+
+    float32x4_t n = vrndaq_f32(vmulq_f32(values, inv_ln2));
+    float32x4_t r = vfmsq_laneq_f32(values, n, ln2_c02, 0);
+    r = vfmsq_laneq_f32(r, n, ln2_c02, 1);
+    uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(n)), 23);
+    float32x4_t scale = vreinterpretq_f32_u32(vaddq_u32(e, exponent_bias));
+
+    float32x4_t r2 = vmulq_f32(r, r);
+    float32x4_t p = vfmaq_laneq_f32(c1, r, ln2_c02, 2);
+    float32x4_t q = vfmaq_laneq_f32(c3, r, ln2_c02, 3);
+    q = vfmaq_f32(q, p, r2);
+    p = vmulq_f32(c4, r);
+    float32x4_t poly = vfmaq_f32(p, q, r2);
+
+    return vfmaq_f32(scale, poly, scale);
   }

   Vectorized<float> fexp_u20() const {
-    return exp();
+    return exp_u20();
   }
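A scalar sketch of the range reduction the kernel above uses (illustrative; std::ldexp stands in for the exponent-bias bit trick, and the polynomial below is a plain Taylor expansion rather than the tuned coefficients):

#include <cmath>

// Scalar model of exp_u20(): pick n = round(x / ln2) so that
// x = n*ln2 + r with r in [-ln2/2, ln2/2], evaluate a short polynomial
// approximation of e^r, then scale by 2^n.
inline float exp_ref(float x) {
  const float ln2 = 0.6931471805599453f;
  float n = std::nearbyint(x / ln2);   // vrndaq_f32(x * inv_ln2)
  float r = x - n * ln2;               // the two vfmsq steps (hi/lo split)
  // Degree-4 Taylor polynomial of e^r on the reduced range.
  float poly = 1.0f + r * (1.0f + r * (0.5f + r * (1.0f / 6 + r / 24)));
  return std::ldexp(poly, static_cast<int>(n)); // scale = 2^n
}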
   DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(
       fmod,
@@ -645,4 +664,4 @@ inline Vectorized<float> Vectorized<float>::erf() const {
 #endif /* defined(aarch64) */

 } // namespace CPU_CAPABILITY
-} // namespace at::vec
+} // namespace at::vec
@@ -813,11 +813,12 @@ static inline Vectorized<T> binary_op_as_fp32(
 #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
   inline void load_fp32_from_##name( \
       const type* data, Vectorized<float>& out) { \
-    __at_align__ float values[Vectorized<float>::size()]; \
+    __at_align__ float* values = new float[Vectorized<float>::size()]; \
     for (const auto k : c10::irange(Vectorized<float>::size())) { \
       values[k] = data[k]; \
     } \
     out = Vectorized<float>::loadu(values); \
+    delete[] values; \
   } \
 \
   inline void load_fp32_from_##name( \
@@ -269,12 +269,13 @@ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16)
 #else // defined(CPU_CAPABILITY_AVX2)

 #if !( \
-    defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
-    !defined(CPU_CAPABILITY_SVE256))
+    defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__))
 CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
 #endif

+#if !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
 LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16)
+#endif
 #endif // defined(CPU_CAPABILITY_AVX2)
 } // namespace CPU_CAPABILITY
 } // namespace at::vec
@@ -294,7 +294,7 @@ struct VecConvert<
 };
 #endif

-#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
+#if (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)) && defined(__ARM_FEATURE_BF16)

 template <>
 struct VecConvert<float, 1, BFloat16, 1> {
@@ -270,7 +270,7 @@ LOAD_FP32_VECTORIZED_INIT(Half, fp16)

 #if !( \
     defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
-    !defined(CPU_CAPABILITY_SVE256))
+    !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE))
 CONVERT_NON_VECTORIZED_INIT(Half, half)
 #endif
@@ -915,7 +915,7 @@ Vectorized<c10::quint8> inline maximum(
   return a.maximum(b);
 }

-#elif !defined(CPU_CAPABILITY_SVE256)
+#elif !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)

 // NOTE: These are low-performance implementations that we fall back on
 // if we are not building with AVX2. This may not be an issue, because
@@ -1374,11 +1374,11 @@ Vectorized<c10::quint8> inline maximum(

 #endif // if defined(CPU_CAPABILITY_AVX2)

-#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
-std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
-    at::vec::Vectorized<int8_t> src) {
+#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
+std::pair<Vectorized<float>, Vectorized<float>>
+inline convert_int8_to_float(at::vec::Vectorized<int8_t> src) {
   auto s8x8 = vld1_s8(src.operator const int8_t*());
   auto s16x8 = vmovl_s8(s8x8);

   auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8));
   auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8));
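The NEON path widens eight int8 lanes in two steps (s8 -> s16 -> s32) before the final int-to-float convert. A scalar sketch of the same widening chain (illustrative only):

#include <cstdint>

// Scalar model of convert_int8_to_float's widening: each step is a sign
// extension, so negative int8 values survive into the float result.
float widen_ref(int8_t v) {
  int16_t s16 = v;                // vmovl_s8
  int32_t s32 = s16;              // vmovl_s16
  return static_cast<float>(s32); // vcvtq_f32_s32
}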
@@ -68,7 +68,7 @@ Windows llvm will not have this definition.
 #define VECTOR_WIDTH 64
 #define int_vector __m512i
 #elif defined(__aarch64__) && \
-    !defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512
+    !defined(CPU_CAPABILITY_SVE) && !defined(CPU_CAPABILITY_SVE256) // CPU_CAPABILITY_AVX512
 // SVE code expects 256-vectors; leave that set for SVE?
 #if defined(__GNUC__)
 #define __at_align__ __attribute__((aligned(16)))
@@ -79,6 +79,18 @@ Windows llvm will not have this definition.
 #endif
 #define VECTOR_WIDTH 16
 #else // CPU_CAPABILITY_AVX512
+#if defined(CPU_CAPABILITY_SVE)
+#if defined(__GNUC__)
+#define __at_align__ __attribute__((aligned(16)))
+#elif defined(_WIN32)
+#define __at_align__ __declspec(align(16))
+#else
+#define __at_align__
+#endif
+#define VECTOR_WIDTH 16
+#define int_vector __m256i
+#else // CPU_CAPABILITY_SVE256 || CPU_CAPABILITY_SVE
 #if defined(CPU_CAPABILITY_SVE256)
 #if defined(__GNUC__)
 #define __at_align__ __attribute__((aligned(32)))
 #elif defined(_WIN32)
@@ -88,6 +100,18 @@ Windows llvm will not have this definition.
 #endif
 #define VECTOR_WIDTH 32
 #define int_vector __m256i
+#else // CPU_CAPABILITY_SVE
+#if defined(__GNUC__)
+#define __at_align__ __attribute__((aligned(16)))
+#elif defined(_WIN32)
+#define __at_align__ __declspec(align(16))
+#else
+#define __at_align__
+#endif
+#define VECTOR_WIDTH 16
+#define int_vector __m256i
-#endif // CPU_CAPABILITY_SVE256
+#endif // CPU_CAPABILITY_SVE256 || CPU_CAPABILITY_SVE
 #endif // CPU_CAPABILITY_AVX512
 namespace at::vec {
@@ -210,8 +234,7 @@ struct Vectorized {
   auto as_bytes() const -> const char* {
     return reinterpret_cast<const char*>(values);
   }
-  template <int64_t mask_>
-  static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b) {
+  static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b, const int64_t mask_) {
     int64_t mask = mask_;
     Vectorized vector;
     for (const auto i : c10::irange(size())) {
@@ -1312,7 +1335,7 @@ std::
     T const* base_addr,
     const Vectorized<int_same_size_t<T>>& vindex,
     Vectorized<T>& mask) {
-  static constexpr int size = Vectorized<T>::size();
+  static const int size = Vectorized<T>::size();
   T src_arr[size];
   int_same_size_t<T> mask_arr[size]; // use int type so we can logical and
   int_same_size_t<T> index_arr[size];
@@ -1405,7 +1428,7 @@ inline Vectorized<T> convert_to_fp_of_same_size(
 // clang-format on
 template <typename T>
 inline std::enable_if_t<
-    Vectorized<T>::size() % 2 == 0,
+    true,
     std::pair<Vectorized<T>, Vectorized<T>>>
 deinterleave2(const Vectorized<T>& a, const Vectorized<T>& b) {
   static constexpr int size = Vectorized<T>::size();
@@ -1444,7 +1467,7 @@ VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(deinterleave2)
 // clang-format on
 template <typename T>
 inline std::enable_if_t<
-    Vectorized<T>::size() % 2 == 0,
+    true,
     std::pair<Vectorized<T>, Vectorized<T>>>
 interleave2(const Vectorized<T>& a, const Vectorized<T>& b) {
   static constexpr int size = Vectorized<T>::size();
@@ -1486,7 +1509,7 @@ inline void convert(const src_T* src, dst_T* dst, int64_t n) {

 template <typename T>
 inline Vectorized<T> flip(const Vectorized<T>& data) {
-  static constexpr int size = Vectorized<T>::size();
+  static const int size = Vectorized<T>::size();
   T output[size];
   T buffer[size];
   data.store(static_cast<void*>(buffer));
@@ -15,7 +15,7 @@ template <
 struct VecConvert {
   static inline VectorizedN<dst_t, dst_n> apply(
       const VectorizedN<src_t, src_n>& src) {
-    constexpr int count = std::min(
+    const int count = std::min(
         VectorizedN<src_t, src_n>::size(), VectorizedN<dst_t, dst_n>::size());
     __at_align__ src_t src_buf[VectorizedN<src_t, src_n>::size()];
     src.store(src_buf);
@@ -2,6 +2,8 @@

 #include <ATen/cpu/vec/vec_base.h>
 #include <ATen/cpu/vec/vec_n.h>

+#include <cassert>
+
 namespace at::vec {
 inline namespace CPU_CAPABILITY {

@@ -38,9 +40,9 @@ struct VecMaskLoad {
   static inline VectorizedN<data_t, data_n> apply(
       const data_t* ptr,
       const VecMask<mask_t, mask_n>& vec_mask) {
-    constexpr typename VecMask<mask_t, mask_n>::size_type size =
+    const typename VecMask<mask_t, mask_n>::size_type size =
         VecMask<mask_t, mask_n>::size();
-    static_assert(VectorizedN<data_t, data_n>::size() >= size);
+    assert((VectorizedN<data_t, data_n>::size() >= size));
     __at_align__ data_t data[size];
     __at_align__ mask_t mask[size];
     auto mask_ = VectorizedN<mask_t, mask_n>(vec_mask);
@@ -134,7 +136,7 @@ class VecMask {
   template <typename U, int L>
   static VecMask<T, N> from(const VectorizedN<U, L>& b_vec) {
     __at_align__ U b_buf[size()];
-    if constexpr (size() >= VectorizedN<U, L>::size()) {
+    if (size() >= VectorizedN<U, L>::size()) {
       b_vec.store(b_buf);
       for (int i = VectorizedN<U, L>::size(); i < size(); i++) {
         b_buf[i] = static_cast<U>(0);
@@ -235,16 +237,18 @@ class VecMask {
   template <
       typename U,
       int L,
-      std::enable_if_t<L >= 2 && VectorizedN<U, L>::size() >= size(), int> = 0>
+      std::enable_if_t<L >= 2, int> = 0>
   VectorizedN<U, L> loadu(const U* ptr) const {
+    assert((VectorizedN<U, L>::size() >= size()));
     return VecMaskLoad<U, L, T, N>::apply(ptr, *this);
   }

   template <
       typename U,
       int L,
-      std::enable_if_t<L == 1 && Vectorized<U>::size() >= size(), int> = 0>
+      std::enable_if_t<L == 1, int> = 0>
   Vectorized<U> loadu(const U* ptr) const {
+    assert((Vectorized<U>::size() >= size()));
     return VecMaskLoad<U, L, T, N>::apply(ptr, *this);
   }
 };
@@ -28,7 +28,7 @@ class VectorizedN {
   using size_type = int;

   static constexpr size_type size_T = sizeof(T);
-  static constexpr size_type size() {
+  static size_type size() {
     return Vectorized<T>::size() * N;
  }
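Dropping constexpr from size() is what most of the hunks above react to: anything that used size() as a template argument, array bound, static_assert operand, or if-constexpr condition has to move to runtime checks and heap storage once the value depends on the hardware vector length. A compressed illustration with hypothetical Vec types (not part of the diff):

#include <cassert>

// With a constexpr size(), both forms below compile; once size() becomes
// a runtime value, only the runtime forms survive.
struct VecFixed { static constexpr int size() { return 8; } };
struct VecVLA   { static int size() { return 8; /* e.g. from svcntw() */ } };

int main() {
  static_assert(VecFixed::size() == 8);  // compile-time: fine
  // static_assert(VecVLA::size() == 8); // ill-formed: not a constant
  assert(VecVLA::size() == 8);           // the diff's runtime replacement
}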
@@ -832,7 +832,7 @@ void bgemm_internal<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16))
     bgemm_internal_cublas<at::BFloat16>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
   }
 }
-#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
+#if defined(USE_ROCM) && !defined(_MSC_VER)
 else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
   at::native::bgemm_internal_ck<at::BFloat16>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
 }
@@ -1273,7 +1273,7 @@ void gemm_internal<double>(CUDABLAS_GEMM_ARGTYPES(double))
     gemm_internal_cublaslt<double>(CUDABLAS_GEMM_ARGS(double));
 #endif
 }
-#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
+#if defined(USE_ROCM) && !defined(_MSC_VER)
 else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
   at::native::gemm_internal_ck<double>(CUDABLAS_GEMM_ARGS(double));
 }
@@ -1289,7 +1289,7 @@ void gemm_internal<float>(CUDABLAS_GEMM_ARGTYPES(float))
   if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
     gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
   }
-#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
+#if defined(USE_ROCM) && !defined(_MSC_VER)
   else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
     if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100
       gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
@@ -1341,7 +1341,7 @@ void gemm_internal<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half))
   if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
     gemm_internal_cublaslt<at::Half>(CUDABLAS_GEMM_ARGS(at::Half));
   }
-#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
+#if defined(USE_ROCM) && !defined(_MSC_VER)
   else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
     at::native::gemm_internal_ck<at::Half>(CUDABLAS_GEMM_ARGS(at::Half));
   }
@@ -1357,7 +1357,7 @@ void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16))
   if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
     gemm_internal_cublaslt<at::BFloat16>(CUDABLAS_GEMM_ARGS(at::BFloat16));
   }
-#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
+#if defined(USE_ROCM) && !defined(_MSC_VER)
   else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
     at::native::gemm_internal_ck<at::BFloat16>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
  }
@@ -2,6 +2,7 @@
 #include <ATen/cuda/CUDAGraph.h>
 #include <ATen/cuda/Exceptions.h>
+#include <ATen/Functions.h>
 #include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/cuda/CUDAFunctions.h>

 #include <cstddef>
@@ -2,7 +2,6 @@

 #include <ATen/Tensor.h>
 #include <c10/core/Device.h>
-#include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/cuda/CUDAGraphsC10Utils.h>
 #include <c10/cuda/CUDAStream.h>
 #include <c10/util/flat_hash_map.h>
@@ -207,27 +207,6 @@ bool CUDAHooks::hasCuBLASLt() const {
 #endif
 }

-bool CUDAHooks::hasCKSDPA() const {
-#if !defined(USE_ROCM)
-  return false;
-#elif defined(USE_ROCM) && defined(USE_ROCM_CK_SDPA)
-  return true;
-#else
-  return false;
-#endif
-}
-
-bool CUDAHooks::hasCKGEMM() const {
-#if !defined(USE_ROCM)
-  return false;
-#elif defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
-  return true;
-#else
-  return false;
-#endif
-}
-
 bool CUDAHooks::hasROCM() const {
   // Currently, this is same as `compiledWithMIOpen`.
   // But in future if there are ROCm builds without MIOpen,
@@ -31,8 +31,6 @@ struct CUDAHooks : public at::CUDAHooksInterface {
   bool hasCuSOLVER() const override;
   bool hasCuBLASLt() const override;
   bool hasROCM() const override;
-  bool hasCKSDPA() const override;
-  bool hasCKGEMM() const override;
   const at::cuda::NVRTC& nvrtc() const override;
   DeviceIndex current_device() const override;
   bool isBuilt() const override {return true;}
@@ -118,14 +118,6 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
     return false;
   }

-  virtual bool hasCKSDPA() const {
-    return false;
-  }
-
-  virtual bool hasCKGEMM() const {
-    return false;
-  }
-
   virtual const at::cuda::NVRTC& nvrtc() const {
     TORCH_CHECK(false, "NVRTC requires CUDA. ", CUDA_HELP);
   }
Some files were not shown because too many files have changed in this diff.