Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-22 06:11:27 +08:00)

Compare commits: cslpull91...dev/joona/ (1 commit)

Commit: b9caa336a0
@@ -286,23 +286,23 @@ case "$image" in
TRITON=yes
;;
pytorch-linux-focal-rocm-n-1-py3)
ANACONDA_PYTHON_VERSION=3.10
ANACONDA_PYTHON_VERSION=3.8
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=6.1
ROCM_VERSION=6.0
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-rocm-n-py3)
ANACONDA_PYTHON_VERSION=3.10
ANACONDA_PYTHON_VERSION=3.8
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=6.2
ROCM_VERSION=6.1
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
@@ -379,7 +379,6 @@ case "$image" in
GCC_VERSION=11
CONDA_CMAKE=yes
HALIDE=yes
TRITON=yes
;;
pytorch-linux-focal-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
@@ -7,7 +7,7 @@ PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/hea
GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py

# Python versions to be installed in /opt/$VERSION_NO
CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"}
CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0"}

function check_var {
if [ -z "$1" ]; then
@@ -22,13 +22,6 @@ function do_cpython_build {
check_var $py_ver
check_var $py_folder
tar -xzf Python-$py_ver.tgz

local additional_flags=""
if [ "$py_ver" == "3.13.0t" ]; then
additional_flags=" --disable-gil"
mv cpython-3.13/ cpython-3.13t/
fi

pushd $py_folder

local prefix="/opt/_internal/cpython-${py_ver}"
@@ -44,10 +37,8 @@ function do_cpython_build {
local openssl_flags="--with-openssl=${WITH_OPENSSL} --with-openssl-rpath=auto"
fi

# -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
CFLAGS="-Wformat" ./configure --prefix=${prefix} ${openssl_flags} ${shared_flags} ${additional_flags} > /dev/null
CFLAGS="-Wformat" ./configure --prefix=${prefix} ${openssl_flags} ${shared_flags} > /dev/null

make -j40 > /dev/null
make install > /dev/null
@@ -78,14 +69,7 @@ function build_cpython {
check_var $py_ver
check_var $PYTHON_DOWNLOAD_URL
local py_ver_folder=$py_ver

if [ "$py_ver" = "3.13.0t" ]; then
PY_VER_SHORT="3.13"
PYT_VER_SHORT="3.13t"
check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH
wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz
do_cpython_build $py_ver cpython-$PYT_VER_SHORT
elif [ "$py_ver" = "3.13.0" ]; then
if [ "$py_ver" = "3.13.0" ]; then
PY_VER_SHORT="3.13"
check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH
wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz
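The 3.13.0t lines above build CPython's free-threaded variant by passing --disable-gil to configure. A minimal sketch for verifying such a build from the shell, assuming the interpreter ends up on PATH as python3.13t (the name is an assumption, not taken from this diff):

# Py_GIL_DISABLED is 1 only for a free-threaded (no-GIL) CPython build
python3.13t -c "import sysconfig; print(sysconfig.get_config_var('Py_GIL_DISABLED'))"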
@@ -5,7 +5,7 @@ set -ex
# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && cd tmp_cusparselt

if [[ ${CUDA_VERSION:0:4} =~ ^12\.[2-6]$ ]]; then
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[2-4]$ ]]; then
arch_path='sbsa'
export TARGETARCH=${TARGETARCH:-$(uname -m)}
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
@@ -10,21 +10,6 @@ if [[ -z $ROCM_VERSION ]]; then
exit 1;
fi

IS_UBUNTU=0
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
ubuntu)
IS_UBUNTU=1
;;
centos)
IS_UBUNTU=0
;;
*)
echo "Unable to determine OS..."
exit 1
;;
esac

# To make version comparison easier, create an integer representation.
save_IFS="$IFS"
IFS=. ROCM_VERSION_ARRAY=(${ROCM_VERSION})
@@ -72,11 +57,9 @@ MIOPEN_CMAKE_COMMON_FLAGS="
-DMIOPEN_BUILD_DRIVER=OFF
"
# Pull MIOpen repo and set DMIOPEN_EMBED_DB based on ROCm version
if [[ $ROCM_INT -ge 60300 ]]; then
echo "ROCm 6.3+ MIOpen does not need any patches, do not build from source"
if [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60300 ]]; then
echo "ROCm 6.2 MIOpen does not need any patches, do not build from source"
exit 0
elif [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60300 ]]; then
MIOPEN_BRANCH="release/rocm-rel-6.2-staging"
elif [[ $ROCM_INT -ge 60100 ]] && [[ $ROCM_INT -lt 60200 ]]; then
echo "ROCm 6.1 MIOpen does not need any patches, do not build from source"
exit 0
@@ -110,21 +93,12 @@ else
exit 1
fi

if [[ ${IS_UBUNTU} == 1 ]]; then
apt-get remove -y miopen-hip
else
yum remove -y miopen-hip
fi
yum remove -y miopen-hip

git clone https://github.com/ROCm/MIOpen -b ${MIOPEN_BRANCH}
pushd MIOpen
# remove .git to save disk space since CI runner was running out
rm -rf .git
# Don't build CK to save docker build time
if [[ $ROCM_INT -ge 60200 ]]; then
sed -i '/composable_kernel/d' requirements.txt
fi
# Don't build MLIR to save docker build time
# since we are disabling MLIR backend for MIOpen anyway
if [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
@@ -137,15 +111,10 @@ cmake -P install_deps.cmake --minimum

# clean up since CI runner was running out of disk space
rm -rf /tmp/*
if [[ ${IS_UBUNTU} == 1 ]]; then
apt-get autoclean && apt-get clean
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
else
yum clean all
rm -rf /var/cache/yum
rm -rf /var/lib/yum/yumdb
rm -rf /var/lib/yum/history
fi
yum clean all
rm -rf /var/cache/yum
rm -rf /var/lib/yum/yumdb
rm -rf /var/lib/yum/history

## Build MIOpen
mkdir -p build
@@ -162,11 +131,7 @@ make -j $(nproc) package
# clean up since CI runner was running out of disk space
rm -rf /usr/local/cget

if [[ ${IS_UBUNTU} == 1 ]]; then
sudo dpkg -i miopen-hip*.deb
else
yum install -y miopen-*.rpm
fi
yum install -y miopen-*.rpm

popd
rm -rf MIOpen
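The $ROCM_INT checks above compare an integer form of $ROCM_VERSION built from the IFS=. split shown in the first hunk. A hedged sketch of that conversion pattern (my own illustration, not copied from the script), assuming a major.minor[.patch] version string:

ROCM_VERSION=6.2.1
IFS=. read -r major minor patch <<< "${ROCM_VERSION}"
# 6.2.1 -> 60201, so a test like "-ge 60200" matches ROCm 6.2 and newer
ROCM_INT=$((major * 10000 + minor * 100 + ${patch:-0}))
echo "$ROCM_INT"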
@@ -37,12 +37,6 @@ esac

(
set -x
# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
sudo systemctl daemon-reload
sudo systemctl restart docker

docker build \
--target final \
--progress plain \
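A quick way to confirm the LimitNOFILE override above was actually picked up after the daemon reload; this check is a sketch added here, not part of the diff:

# should print LimitNOFILE=1048576 once the edited unit file is loaded
systemctl show docker -p LimitNOFILE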
@@ -10,7 +10,6 @@ ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8

ARG DEVTOOLSET_VERSION=9

# Note: This is required patch since CentOS have reached EOL
# otherwise any yum install setp will fail
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
@@ -124,14 +124,7 @@ if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then
fi
(
set -x

# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
sudo systemctl daemon-reload
sudo systemctl restart docker

DOCKER_BUILDKIT=1 docker build \
DOCKER_BUILDKIT=1 docker build \
${DOCKER_GPU_BUILD_ARG} \
--build-arg "GPU_IMAGE=${GPU_IMAGE}" \
--target "${TARGET}" \
@@ -90,7 +90,7 @@ librosa>=0.6.2 ; python_version < "3.11"
#Pinned versions:
#test that import:

mypy==1.11.2
mypy==1.10.0
# Pin MyPy version because new errors are likely to appear with each release
#Description: linter
#Pinned versions: 1.10.0
@@ -68,8 +68,6 @@ RUN rm install_rocm.sh
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh
RUN rm install_rocm_magma.sh
ADD ./common/install_miopen.sh install_miopen.sh
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
ENV ROCM_PATH /opt/rocm
ENV PATH /opt/rocm/bin:$PATH
ENV PATH /opt/rocm/hcc/bin:$PATH
@@ -123,8 +121,5 @@ RUN bash ./install_cache.sh && rm install_cache.sh
ARG BUILD_ENVIRONMENT
ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}

# Install LLVM dev version (Defined in the pytorch/builder github repository)
COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm

USER jenkins
CMD ["bash"]
@@ -49,8 +49,13 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
fi

# Enable LLVM dependency for TensorExpr testing
export USE_LLVM=/opt/llvm
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
export USE_LLVM=/opt/rocm/llvm
export LLVM_DIR=/opt/rocm/llvm/lib/cmake/llvm
else
export USE_LLVM=/opt/llvm
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
fi

if [[ "$BUILD_ENVIRONMENT" == *executorch* ]]; then
# To build test_edge_op_registration
@@ -232,7 +237,7 @@ fi

# Do not change workspace permissions for ROCm CI jobs
# as it can leave workspace with bad permissions for cancelled jobs
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
cleanup_workspace() {
@@ -278,7 +283,6 @@ else
# set only when building other architectures
# or building non-XLA tests.
if [[ "$BUILD_ENVIRONMENT" != *rocm* &&
"$BUILD_ENVIRONMENT" != *s390x* &&
"$BUILD_ENVIRONMENT" != *xla* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then
# Install numpy-2.0.2 for builds which are backward compatible with 1.X
@@ -341,11 +345,11 @@ else
CUSTOM_OP_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/custom-op-build"
CUSTOM_OP_TEST="$PWD/test/custom_operator"
python --version
SITE_PACKAGES="$(python -c 'import site; print(";".join([x for x in site.getsitepackages()] + [x + "/torch" for x in site.getsitepackages()]))')"
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"

mkdir -p "$CUSTOM_OP_BUILD"
pushd "$CUSTOM_OP_BUILD"
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
@@ -355,10 +359,10 @@ else
JIT_HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build"
JIT_HOOK_TEST="$PWD/test/jit_hooks"
python --version
SITE_PACKAGES="$(python -c 'import site; print(";".join([x for x in site.getsitepackages()] + [x + "/torch" for x in site.getsitepackages()]))')"
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir -p "$JIT_HOOK_BUILD"
pushd "$JIT_HOOK_BUILD"
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
@@ -370,7 +374,7 @@ else
python --version
mkdir -p "$CUSTOM_BACKEND_BUILD"
pushd "$CUSTOM_BACKEND_BUILD"
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
@@ -403,6 +407,6 @@ fi

# snadampal: skipping it till sccache support added for aarch64
# https://github.com/pytorch/pytorch/issues/121559
if [[ "$BUILD_ENVIRONMENT" != *aarch64* && "$BUILD_ENVIRONMENT" != *s390x* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *aarch64* ]]; then
print_sccache_stats
fi
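The SITE_PACKAGES change above swaps the legacy distutils get_python_lib() (a single path) for site.getsitepackages(), which can return several directories. A hedged sketch of what the new one-liner feeds to CMAKE_PREFIX_PATH, with illustrative paths only:

# prints every site-packages dir plus a matching .../torch dir, ';'-joined for CMake, e.g.
# /usr/lib/python3.10/site-packages;/usr/lib/python3.10/site-packages/torch
python -c 'import site; print(";".join(site.getsitepackages() + [p + "/torch" for p in site.getsitepackages()]))'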
@@ -1,4 +1,4 @@
from datetime import datetime, timedelta, timezone
from datetime import datetime, timedelta
from tempfile import mkdtemp

from cryptography import x509
@@ -42,10 +42,10 @@ def create_cert(path, C, ST, L, O, key):
.issuer_name(issuer)
.public_key(key.public_key())
.serial_number(x509.random_serial_number())
.not_valid_before(datetime.now(timezone.utc))
.not_valid_before(datetime.utcnow())
.not_valid_after(
# Our certificate will be valid for 10 days
datetime.now(timezone.utc)
datetime.utcnow()
+ timedelta(days=10)
)
.add_extension(
@@ -88,10 +88,10 @@ def sign_certificate_request(path, csr_cert, ca_cert, private_ca_key):
.issuer_name(ca_cert.subject)
.public_key(csr_cert.public_key())
.serial_number(x509.random_serial_number())
.not_valid_before(datetime.now(timezone.utc))
.not_valid_before(datetime.utcnow())
.not_valid_after(
# Our certificate will be valid for 10 days
datetime.now(timezone.utc)
datetime.utcnow()
+ timedelta(days=10)
# Sign our certificate with our private key
)
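These hunks replace naive datetime.utcnow() with the timezone-aware datetime.now(timezone.utc); utcnow() is deprecated as of Python 3.12. A quick hedged check from the shell, assuming a 3.12+ interpreter on PATH:

# the deprecated call warns (and fails under -W error); the aware form does not
python -W error::DeprecationWarning -c "from datetime import datetime; datetime.utcnow()"
python -c "from datetime import datetime, timezone; print(datetime.now(timezone.utc).isoformat())"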
@@ -9,13 +9,15 @@ if [[ -n "$CONDA_ENV" ]]; then
export PATH="$CONDA_ENV/bin":$PATH
fi

# Test that OpenMP is enabled
pushd test
if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available()))") == "1" ]]; then
echo "Build should have OpenMP enabled, but torch.backends.openmp.is_available() is False"
exit 1
# Test that OpenMP is enabled for non-arm64 build
if [[ ${BUILD_ENVIRONMENT} != *arm64* ]]; then
pushd test
if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available()))") == "1" ]]; then
echo "Build should have OpenMP enabled, but torch.backends.openmp.is_available() is False"
exit 1
fi
popd
fi
popd

setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
@@ -25,9 +27,8 @@ setup_test_python() {
echo "Ninja version: $(ninja --version)"
echo "Python version: $(which python) ($(python --version))"

# Set the limit on open file handles to 16384
# might help with intermittent compiler test failures
ulimit -n 16384
# Increase default limit on open file handles from 256 to 1024
ulimit -n 1024
}

test_python_all() {
@@ -375,8 +375,9 @@ test_inductor_cpp_wrapper_abi_compatible() {
mkdir -p "$TEST_REPORTS_DIR"

echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
# cpu stack allocation causes segfault and needs more investigation
PYTORCH_TESTING_DEVICE_ONLY_FOR="" python test/run_test.py --include inductor/test_cpu_cpp_wrapper
python test/run_test.py --include inductor/test_cuda_cpp_wrapper inductor/test_cpu_repro
python test/run_test.py --include inductor/test_cuda_cpp_wrapper

TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
--training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
@@ -400,9 +401,9 @@ pr_time_benchmarks() {

TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks source benchmarks/dynamo/pr_time_benchmarks/benchmark_runner.sh "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv" "benchmarks/dynamo/pr_time_benchmarks/benchmarks"
PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks source benchmarks/dynamo/pr_time_benchmarks/benchmark_runner.sh "$TEST_REPORTS_DIR/pr_time_benchmarks_after.txt" "benchmarks/dynamo/pr_time_benchmarks/benchmarks"
echo "benchmark results on current PR: "
cat "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv"
cat "$TEST_REPORTS_DIR/pr_time_benchmarks_after.txt"

}

@@ -1382,16 +1383,14 @@ test_executorch() {
assert_git_not_dirty
}

test_linux_aarch64() {
test_linux_aarch64(){
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
test_transformers test_multiprocessing test_numpy_interop --verbose

# Dynamo tests
python test/run_test.py --include dynamo/test_compile dynamo/test_backends dynamo/test_comptime dynamo/test_config \
dynamo/test_functions dynamo/test_fx_passes_pre_grad dynamo/test_interop dynamo/test_model_output dynamo/test_modules \
dynamo/test_optimizers dynamo/test_recompile_ux dynamo/test_recompiles \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
dynamo/test_optimizers dynamo/test_recompile_ux dynamo/test_recompiles --verbose

# Inductor tests
python test/run_test.py --include inductor/test_torchinductor inductor/test_benchmark_fusion inductor/test_codecache \
@@ -1401,8 +1400,7 @@ test_linux_aarch64() {
inductor/test_max_autotune inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \
inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \
inductor/test_split_cat_fx_passes inductor/test_standalone_compile inductor/test_torchinductor \
inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes inductor/test_memory \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes --verbose
}

if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
.github/actionlint.yaml (24 changed lines, vendored)
@@ -32,6 +32,30 @@ self-hosted-runner:
- lf.linux.8xlarge.nvidia.gpu
- lf.linux.16xlarge.nvidia.gpu
- lf.linux.g5.4xlarge.nvidia.gpu
# Organization-wide AWS Linux Runners with new Amazon 2023 AMI
- amz2023.linux.large
- amz2023.linux.2xlarge
- amz2023.linux.4xlarge
- amz2023.linux.12xlarge
- amz2023.linux.24xlarge
- amz2023.linux.arm64.2xlarge
- amz2023.linux.arm64.m7g.4xlarge
- amz2023.linux.arm64.m7g.4xlarge.ephemeral
- amz2023.linux.4xlarge.nvidia.gpu
- amz2023.linux.8xlarge.nvidia.gpu
- amz2023.linux.16xlarge.nvidia.gpu
- amz2023.linux.g5.4xlarge.nvidia.gpu
# Pytorch/pytorch AWS Linux Runners with the new Amazon 2023 AMI on Linux Foundation account
- amz2023.lf.linux.large
- amz2023.lf.linux.2xlarge
- amz2023.lf.linux.4xlarge
- amz2023.lf.linux.12xlarge
- amz2023.lf.linux.24xlarge
- amz2023.lf.linux.arm64.2xlarge
- amz2023.lf.linux.4xlarge.nvidia.gpu
- amz2023.lf.linux.8xlarge.nvidia.gpu
- amz2023.lf.linux.16xlarge.nvidia.gpu
- amz2023.lf.linux.g5.4xlarge.nvidia.gpu
# Repo-specific IBM hosted S390x runner
- linux.s390x
# Organization wide AWS Windows runners

.github/ci_commit_pins/audio.txt (2 changed lines, vendored)
@@ -1 +1 @@
ba696ea3dfec4cbe693bf06a84c75dc196077f5b
97ed7b36b7a741253d4e41e4da3c901d83294503
.github/lf-canary-scale-config.yml (182 changed lines, vendored)
@@ -7,14 +7,10 @@
# runners. Runners listed here will be available as self hosted
# runners, configuration is directly pulled from the main branch.
#
# NOTE (Apr, 5, 2021): Linux runners are currently all an amazonlinux2
#
# NOTES:
#  - Linux runners are by default non-ephemeral to reduce the amount of CreateInstaces calls
#  to avoid RequestLimitExceeded issues
# - When updating this file, run the following command to validate the YAML and to generate
# corresponding versions of scale-config for the pytorch/pytorch repo and merge the
# pytorch/pytorch changes before merging these changes.
# `python .github/scripts/validate_scale_config.py --test-infra-repo-root [path_to_test-infra_root] --pytorch-repo-root [path_to_pytorch_root]`
# NOTE (Jan 5, 2021): Linux runners are all non-ephemeral to reduce the amount of CreateInstaces calls
# to avoid RequestLimitExceeded issues
#
# TODO: Add some documentation on how the auto-scaling works
#
@@ -35,36 +31,58 @@ runner_types:
@@ -73,140 +91,240 @@ runner_types:
Both hunks repeat the same per-entry change for every runner type: each entry's ami setting appears in two forms, the wildcard form (ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64, or the -arm64 equivalent) and the pinned form (ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 / -arm64), alongside a variants block listing amz2023 (the pinned al2023 AMI) and am2 (ami: amzn2-ami-hvm-2.0.20240306.2-x86_64-ebs, or amzn2-ami-hvm-2.0.20240306.2-arm64-gp2 for arm64 runners). The entries covered, with the settings shown unchanged in the diff:
(entry whose name sits above the first hunk): max_available 1000, non-ephemeral
lf.c.linux.10xlarge.avx2: m4.10xlarge, disk_size 200, max_available 450, non-ephemeral
lf.c.linux.24xl.spr-metal: c7i.metal-24xl, disk_size 200, max_available 150, non-ephemeral
lf.c.linux.16xlarge.spr: c7i.16xlarge, disk_size 200, max_available 150, non-ephemeral
lf.c.linux.9xlarge.ephemeral: c5.9xlarge, disk_size 200, max_available 50, ephemeral
lf.c.linux.12xlarge.ephemeral: max_available 300, ephemeral
lf.c.linux.16xlarge.nvidia.gpu: g3.16xlarge, disk_size 150, max_available 150, non-ephemeral
lf.c.linux.24xlarge: c5.24xlarge, disk_size 150, max_available 500, non-ephemeral
lf.c.linux.24xlarge.ephemeral: c5.24xlarge, disk_size 150, max_available 200, ephemeral
lf.c.linux.2xlarge: c5.2xlarge, disk_size 150, max_available 3120, non-ephemeral
lf.c.linux.4xlarge: c5.4xlarge, disk_size 150, max_available 1000, non-ephemeral
lf.c.linux.4xlarge.nvidia.gpu: g3.4xlarge, disk_size 150, max_available 1000, non-ephemeral
lf.c.linux.8xlarge.nvidia.gpu: g3.8xlarge, disk_size 150, max_available 400, non-ephemeral
lf.c.linux.g4dn.12xlarge.nvidia.gpu: g4dn.12xlarge, disk_size 150, max_available 250, non-ephemeral
lf.c.linux.g4dn.metal.nvidia.gpu: g4dn.metal, disk_size 150, max_available 300, non-ephemeral
lf.c.linux.g5.48xlarge.nvidia.gpu: g5.48xlarge, disk_size 150, max_available 200, non-ephemeral
lf.c.linux.g5.12xlarge.nvidia.gpu: g5.12xlarge, disk_size 150, max_available 150, non-ephemeral
lf.c.linux.g5.4xlarge.nvidia.gpu: g5.4xlarge, disk_size 150, max_available 2400, non-ephemeral
lf.c.linux.g6.4xlarge.experimental.nvidia.gpu: g6.4xlarge, disk_size 150, max_available 50, non-ephemeral
lf.c.linux.large: c5.large, disk_size 15, max_available 1200, non-ephemeral
lf.c.linux.arm64.2xlarge: t4g.2xlarge, disk_size 256, max_available 200, non-ephemeral
lf.c.linux.arm64.m7g.4xlarge: m7g.4xlarge, disk_size 256, max_available 200, non-ephemeral
lf.c.linux.arm64.2xlarge.ephemeral: t4g.2xlarge, disk_size 256, max_available 200, ephemeral
lf.c.linux.arm64.m7g.4xlarge.ephemeral: m7g.4xlarge, disk_size 256, max_available 200, ephemeral
lf.c.linux.arm64.m7g.metal: m7g.metal, disk_size 256, max_available 100, non-ephemeral
lf.c.windows.g4dn.xlarge: g4dn.xlarge, disk_size 256 (entry truncated in the diff view)
.github/lf-scale-config.yml (182 changed lines, vendored)
This diff mirrors the .github/lf-canary-scale-config.yml diff above line for line, except that the runner names use the lf. prefix instead of lf.c. (lf.linux.2xlarge, lf.linux.4xlarge.nvidia.gpu, lf.linux.arm64.m7g.metal, lf.windows.g4dn.xlarge, and so on): the same header-comment rewrite and the same per-entry AMI/variants change, applied to the same set of runner types with the same settings.
.github/merge_rules.yaml (1 changed line, vendored)
@@ -544,7 +544,6 @@
- anijain2305
- bdhirsh
- zou3519
- isuruf
mandatory_checks_name:
- EasyCLA
- Lint
@@ -412,8 +412,8 @@ def generate_wheels_matrix(
),
}
)
# Special build building to use on Colab. Python 3.11 for 12.1 CUDA
if python_version == "3.11" and arch_version == "12.1":
# Special build building to use on Colab. PyThon 3.10 for 12.1 CUDA
if python_version == "3.10" and arch_version == "12.1":
ret.append(
{
"python_version": python_version,
.github/scripts/generate_ci_workflows.py (5 changed lines, vendored)
@@ -79,6 +79,11 @@ class BinaryBuildWorkflow:
GITHUB_DIR
/ f"workflows/generated-{self.build_environment}-{self.branches}.yml"
)
if self.use_split_build:
output_file_path = (
GITHUB_DIR
/ f"workflows/generated-{self.build_environment}-{self.branches}"
)
with open(output_file_path, "w") as output_file:
GENERATED = "generated"  # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file
output_file.writelines([f"# @{GENERATED} DO NOT EDIT MANUALLY\n"])
.github/scripts/github_utils.py (8 changed lines, vendored)
@@ -168,14 +168,6 @@ def gh_post_commit_comment(
)


def gh_close_pr(org: str, repo: str, pr_num: int, dry_run: bool = False) -> None:
url = f"{GITHUB_API_URL}/repos/{org}/{repo}/pulls/{pr_num}"
if dry_run:
print(f"Dry run closing PR {pr_num}")
else:
gh_fetch_url(url, method="PATCH", data={"state": "closed"})


def gh_delete_comment(org: str, repo: str, comment_id: int) -> None:
url = f"{GITHUB_API_URL}/repos/{org}/{repo}/issues/comments/{comment_id}"
gh_fetch_url(url, method="DELETE")
.github/scripts/lintrunner.sh (7 changed lines, vendored)
@@ -17,11 +17,6 @@ if [[ -d "${CACHE_DIRECTORY}" ]]; then
cp -r "${CACHE_DIRECTORY}" . || true
fi

# if lintrunner is not installed, install it
if ! command -v lintrunner &> /dev/null; then
python3 -m pip install lintrunner==0.12.5
fi

# This has already been cached in the docker image
lintrunner init 2> /dev/null

@@ -38,7 +33,7 @@ python3 torch/utils/data/datapipes/gen_pyi.py

RC=0
# Run lintrunner on all files
if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then
if ! lintrunner --force-color --all-files --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then
echo ""
echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner -m origin/main\`. (If you don't get the same results, run \'lintrunner init\' to update your local linter)\e[0m"
echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m"
.github/scripts/sync_distributed_folder_prototype.sh (new executable file, 35 lines, vendored)
@@ -0,0 +1,35 @@
#!/bin/bash

set -eoux pipefail

SYNC_BRANCH=pytorch-stable-prototype

git config user.email "fake@example.com"
git config user.name "PyTorch Stable Bot"

git fetch origin main
git fetch origin "$SYNC_BRANCH"
git checkout "$SYNC_BRANCH"

# Using a hardcoded SHA here is a massive speedup as we can skip the entire history of the pytorch GitHub repo.
# This specific SHA was chosen as it was before the "branch point" of the stable branch
for SHA in $(git log ba3b05fdf37ddbc3c301294d6a560a816335e717..origin/main --pretty="%h" -- torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed)
do
# `git merge-base --is-ancestor` exits with code 0 if the given SHA is an ancestor, and non-0 otherwise
if git merge-base --is-ancestor $SHA HEAD || [[ $(git log --grep="(cherry picked from commit $SHA") ]]
then
echo "Skipping $SHA"
continue
fi
echo "Copying $SHA"
git cherry-pick -x "$SHA" -X theirs
git reset --soft HEAD~1
git add torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed
git checkout .
git commit --reuse-message=HEAD@{1}
git clean -f
done

if [[ "${WITH_PUSH}" == true ]]; then
git push
fi
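The skip condition in the loop above leans on git merge-base --is-ancestor returning exit code 0 only when the given commit is already reachable from HEAD. A standalone sketch of that check, with a placeholder SHA:

SHA=abc1234   # placeholder commit id, not from the diff
if git merge-base --is-ancestor "$SHA" HEAD; then
echo "$SHA is already contained in HEAD"
else
echo "$SHA still needs to be cherry-picked"
fi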
.github/scripts/trymerge.py (48 changed lines, vendored)
@@ -36,7 +36,6 @@ from warnings import warn

import yaml
from github_utils import (
gh_close_pr,
gh_fetch_json_list,
gh_fetch_merge_base,
gh_fetch_url,
@@ -1175,11 +1174,11 @@ class GitHubPR:
for pr in additional_merged_prs:
pr.add_numbered_label(MERGE_COMPLETE_LABEL, dry_run)

# When the merge process reaches this part, we can assume that the commit
# has been successfully pushed to trunk
merge_commit_sha = repo.rev_parse(name=self.default_branch())

if comment_id and self.pr_num:
# When the merge process reaches this part, we can assume that the commit
# has been successfully pushed to trunk
merge_commit_sha = repo.rev_parse(name=REMOTE_MAIN_BRANCH)

# Finally, upload the record to Rockset. The list of pending and failed
# checks are at the time of the merge
save_merge_record(
@@ -1204,17 +1203,6 @@ class GitHubPR:
else:
print("Missing comment ID or PR number, couldn't upload to Rockset")

# Usually Github will see that the commit has "resolves <pr_num>" in the
# commit message and close the PR, but sometimes it doesn't, leading to
# confusion. When it doesn't, we close it manually.
time.sleep(60) # Give Github some time to close the PR
manually_close_merged_pr(
pr=self,
additional_merged_prs=additional_merged_prs,
merge_commit_sha=merge_commit_sha,
dry_run=dry_run,
)

def merge_changes(
self,
repo: GitRepo,
@@ -1515,34 +1503,6 @@ def checks_to_markdown_bullets(
]


def manually_close_merged_pr(
pr: GitHubPR,
additional_merged_prs: List[GitHubPR],
merge_commit_sha: str,
dry_run: bool,
) -> None:
def _comment_and_close(pr: GitHubPR, comment: str) -> None:
pr = GitHubPR(pr.org, pr.project, pr.pr_num) # Refresh the PR
if not pr.is_closed():
gh_post_pr_comment(pr.org, pr.project, pr.pr_num, comment, dry_run)
gh_close_pr(pr.org, pr.project, pr.pr_num, dry_run)

message = (
f"This PR (#{pr.pr_num}) was merged in {merge_commit_sha} but it is still open, likely due to a Github bug, "
"so mergebot is closing it manually. If you think this is a mistake, please feel free to reopen and contact Dev Infra."
)
_comment_and_close(pr, message)
for additional_pr in additional_merged_prs:
message = (
f"This PR (#{additional_pr.pr_num}) was merged as part of PR #{pr.pr_num} in the stack under {merge_commit_sha} "
"but it is still open, likely due to a Github bug, so mergebot is closing it manually. "
"If you think this is a mistake, please feel free to reopen and contact Dev Infra."
)
_comment_and_close(additional_pr, message)

print(f"PR {pr.pr_num} and all additional PRs in the stack have been closed.")


@retries_decorator()
def save_merge_record(
comment_id: int,
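gh_close_pr above closes a pull request by PATCHing its state through the GitHub REST API. The equivalent call from the shell, sketched with placeholder org/repo/PR values and assuming GITHUB_TOKEN is already exported:

# PATCH /repos/{org}/{repo}/pulls/{number} with {"state": "closed"}
curl -X PATCH \
-H "Authorization: Bearer ${GITHUB_TOKEN}" \
-H "Accept: application/vnd.github+json" \
https://api.github.com/repos/pytorch/pytorch/pulls/12345 \
-d '{"state": "closed"}'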
64
.github/workflows/_linux-build.yml
vendored
64
.github/workflows/_linux-build.yml
vendored
@ -109,7 +109,6 @@ jobs:
|
||||
steps:
|
||||
- name: Setup SSH (Click me for login details)
|
||||
uses: pytorch/test-infra/.github/actions/setup-ssh@main
|
||||
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
|
||||
with:
|
||||
github-secret: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
@ -119,16 +118,13 @@ jobs:
|
||||
# checkout. In other cases you should prefer a local checkout.
|
||||
- name: Checkout PyTorch
|
||||
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
|
||||
with:
|
||||
no-sudo: ${{ inputs.build-environment == 'linux-s390x-binary-manywheel' }}
|
||||
|
||||
- name: Setup Linux
|
||||
uses: ./.github/actions/setup-linux
|
||||
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
|
||||
|
||||
- name: configure aws credentials
|
||||
uses: aws-actions/configure-aws-credentials@v3
|
||||
if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
|
||||
if: ${{ inputs.aws-role-to-assume != '' }}
|
||||
with:
|
||||
role-to-assume: ${{ inputs.aws-role-to-assume }}
|
||||
role-session-name: gha-linux-build
|
||||
@ -137,13 +133,11 @@ jobs:
|
||||
- name: Calculate docker image
|
||||
id: calculate-docker-image
|
||||
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
|
||||
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
|
||||
with:
|
||||
docker-image-name: ${{ inputs.docker-image-name }}
|
||||
|
||||
- name: Use following to pull public copy of the image
|
||||
id: print-ghcr-mirror
|
||||
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
|
||||
env:
|
||||
ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
shell: bash
|
||||
@ -153,7 +147,6 @@ jobs:
|
||||
|
||||
- name: Pull docker image
|
||||
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
|
||||
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
|
||||
with:
|
||||
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
|
||||
@ -181,7 +174,6 @@ jobs:
|
||||
- name: Download pytest cache
|
||||
uses: ./.github/actions/pytest-cache-download
|
||||
continue-on-error: true
|
||||
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
|
||||
with:
|
||||
cache_dir: .pytest_cache
|
||||
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
|
||||
@ -203,7 +195,6 @@ jobs:
|
||||
PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
|
||||
TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
|
||||
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
DOCKER_IMAGE_S390X: ${{ inputs.docker-image-name }}
|
||||
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
|
||||
DEBUG: ${{ inputs.build-with-debug && '1' || '0' }}
|
||||
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
|
||||
@ -211,21 +202,7 @@ jobs:
|
||||
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
run: |
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
JENKINS_USER=
USED_IMAGE="${DOCKER_IMAGE_S390X}"

# since some steps are skipped on s390x, if they are necessary, run them here
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
else
JENKINS_USER="--user jenkins"
USED_IMAGE="${DOCKER_IMAGE}"
fi

# detached container should get cleaned up by teardown_ec2_linux
# Used for JENKINS_USER, which can be empty
# shellcheck disable=SC2086
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS="$(nproc --ignore=2)" \
@ -248,10 +225,10 @@ jobs:
--cap-add=SYS_PTRACE \
--tty \
--detach \
${JENKINS_USER} \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${USED_IMAGE}"
"${DOCKER_IMAGE}"
)
docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh'

@ -262,7 +239,7 @@ jobs:

- name: Store PyTorch Build Artifacts on S3
uses: seemethere/upload-artifact-s3@v5
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel'
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build
with:
name: ${{ inputs.build-environment }}
retention-days: 14
@ -272,7 +249,7 @@ jobs:

- name: Store PyTorch Build Artifacts on S3 for split build
uses: seemethere/upload-artifact-s3@v5
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel'
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build
with:
name: ${{ inputs.build-environment }}-experimental-split-build
retention-days: 14
@ -280,26 +257,8 @@ jobs:
path: artifacts.zip
s3-bucket: ${{ inputs.s3-bucket }}

- name: Store PyTorch Build Artifacts for s390x
uses: actions/upload-artifact@v3
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel'
with:
name: ${{ inputs.build-environment }}
retention-days: 14
if-no-files-found: error
path: artifacts.zip

- name: Store PyTorch Build Artifacts for s390x for split build
uses: actions/upload-artifact@v3
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel'
with:
name: ${{ inputs.build-environment }}-experimental-split-build
retention-days: 14
if-no-files-found: error
path: artifacts.zip

- name: Upload sccache stats
if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
if: steps.build.outcome != 'skipped'
uses: seemethere/upload-artifact-s3@v5
with:
s3-prefix: |
@ -311,13 +270,4 @@ jobs:

- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel'

- name: Cleanup docker
if: always() && inputs.build-environment == 'linux-s390x-binary-manywheel'
shell: bash
run: |
# on s390x stop the container for clean worker stop
# ignore expansion of "docker ps -q" since it could be empty
# shellcheck disable=SC2046
docker stop $(docker ps -q) || true
if: always()
22
.github/workflows/_mac-test-mps.yml
vendored
@ -88,13 +88,6 @@ jobs:
|
||||
environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
|
||||
pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
|
||||
|
||||
- name: Get workflow job id
|
||||
id: get-job-id
|
||||
uses: ./.github/actions/get-workflow-job-id
|
||||
if: always()
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Install PyTorch and run MPS tests
|
||||
id: test
|
||||
env:
|
||||
@ -110,14 +103,6 @@ jobs:
|
||||
NO_TEST_TIMEOUT: ${{ needs.filter.outputs.ci-no-test-timeout }}
|
||||
NO_TD: ${{ needs.filter.outputs.ci-no-td }}
|
||||
PIP_REQUIREMENTS_FILE: .github/requirements/pip-requirements-${{ runner.os }}.txt
|
||||
GITHUB_REPOSITORY: ${{ github.repository }}
|
||||
GITHUB_WORKFLOW: ${{ github.workflow }}
|
||||
GITHUB_JOB: ${{ github.job }}
|
||||
GITHUB_RUN_ID: ${{ github.run_id }}
|
||||
GITHUB_RUN_NUMBER: ${{ github.run_number }}
|
||||
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
|
||||
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
|
||||
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
|
||||
REENABLED_ISSUES: ${{ needs.filter.outputs.reenabled-issues }}
|
||||
run: |
|
||||
# shellcheck disable=SC1090
|
||||
@ -159,6 +144,13 @@ jobs:
|
||||
run: |
|
||||
cat test/**/*_toprint.log || true
|
||||
|
||||
- name: Get workflow job id
|
||||
id: get-job-id
|
||||
uses: ./.github/actions/get-workflow-job-id
|
||||
if: always()
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Upload test artifacts
|
||||
uses: ./.github/actions/upload-test-artifacts
|
||||
if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
|
||||
|
2
.github/workflows/build-conda-images.yml
vendored
@ -32,7 +32,7 @@ concurrency:
|
||||
jobs:
|
||||
build-docker:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
runs-on: linux.9xlarge.ephemeral
|
||||
runs-on: am2.linux.9xlarge.ephemeral
|
||||
strategy:
|
||||
matrix:
|
||||
cuda_version: ["11.8", "12.1", "12.4", "cpu"]
|
||||
|
6
.github/workflows/build-manywheel-images.yml
vendored
@ -45,7 +45,7 @@ jobs:
|
||||
build-docker-cuda:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
|
||||
strategy:
|
||||
matrix:
|
||||
cuda_version: ["12.4", "12.1", "11.8"]
|
||||
@ -156,7 +156,7 @@ jobs:
|
||||
build-docker-rocm:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
|
||||
strategy:
|
||||
matrix:
|
||||
rocm_version: ["6.1", "6.2"]
|
||||
@ -192,7 +192,7 @@ jobs:
|
||||
build-docker-cpu:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
|
||||
steps:
|
||||
- name: Checkout PyTorch
|
||||
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
|
||||
|
3
.github/workflows/check-labels.yml
vendored
@ -30,9 +30,6 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
check-labels:
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
name: Check labels
|
||||
if: github.repository_owner == 'pytorch'
|
||||
runs-on: linux.20_04.4x
|
||||
|
140
.github/workflows/generated-linux-binary-manywheel-nightly.yml
generated
vendored
@ -1010,6 +1010,76 @@ jobs:
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_10-cuda12_1-full-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_1-full-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_10-cuda12_1-full-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_1-full-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_10-cuda12_1-full-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_10-cuda12_4-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -1696,76 +1766,6 @@ jobs:
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_11-cuda12_1-full-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_1-full-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_11-cuda12_1-full-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_1-full-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_11-cuda12_1-full-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda12_1-full
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_11-cuda12_4-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
|
@ -467,6 +467,76 @@ jobs:
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_10-cuda12_1-full-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel-split
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_1-full-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_10-cuda12_1-full-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_1-full-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_10-cuda12_1-full-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_10-cuda12_4-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -747,76 +817,6 @@ jobs:
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_11-cuda12_1-full-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel-split
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_1-full-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_11-cuda12_1-full-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_1-full-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_11-cuda12_1-full-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda12_1-full
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_11-cuda12_4-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
@ -5,7 +5,9 @@ on:
|
||||
# - cron: 0 7 * * 1-6
|
||||
# - cron: 0 7 * * 0
|
||||
# Does not perform max_autotune on CPU, so skip the weekly run setup
|
||||
- cron: 0 7 * * *
|
||||
# Run 6 times every day to see if perf instability can be reproduced
|
||||
# Will change this back
|
||||
- cron: 0 */4 * * *
|
||||
# NB: GitHub has an upper limit of 10 inputs here
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
@ -114,7 +116,7 @@ jobs:
|
||||
name: linux-jammy-aarch64-py3.10-inductor
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: linux-jammy-aarch64-py3_10-inductor-build
|
||||
if: github.event.schedule == '0 7 * * *'
|
||||
if: github.event.schedule == '0 */4 * * *'
|
||||
with:
|
||||
build-environment: linux-jammy-aarch64-py3.10
|
||||
# Turn off dynamic-shapes and aotinductor tests for now, to have faster iteration for debugging perf instability.
|
||||
|
18
.github/workflows/inductor-rocm.yml
vendored
@ -31,13 +31,13 @@ jobs:
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-focal-rocm6_2-py3_10-inductor-build:
|
||||
name: rocm6.2-py3.10-inductor
|
||||
linux-focal-rocm6_1-py3_8-inductor-build:
|
||||
name: rocm6.1-py3.8-inductor
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image-name: pytorch-linux-focal-rocm-n-py3
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -45,14 +45,14 @@ jobs:
|
||||
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2" },
|
||||
]}
|
||||
|
||||
linux-focal-rocm6_2-py3_10-inductor-test:
|
||||
linux-focal-rocm6_1-py3_8-inductor-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: rocm6.2-py3.10-inductor
|
||||
name: rocm6.1-py3.8-inductor
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs: linux-focal-rocm6_2-py3_10-inductor-build
|
||||
needs: linux-focal-rocm6_1-py3_8-inductor-build
|
||||
with:
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
docker-image: ${{ needs.linux-focal-rocm6_2-py3_10-inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_2-py3_10-inductor-build.outputs.test-matrix }}
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-inductor-build.outputs.test-matrix }}
|
||||
|
22
.github/workflows/inductor.yml
vendored
@ -58,7 +58,8 @@ jobs:
|
||||
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "inductor_cpp_wrapper_abi_compatible", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
]}
|
||||
secrets: inherit
|
||||
secrets:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
|
||||
linux-focal-cuda12_1-py3_10-gcc9-inductor-test:
|
||||
name: cuda12.1-py3.10-gcc9-sm86
|
||||
@ -68,7 +69,8 @@ jobs:
|
||||
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
|
||||
docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
secrets:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
|
||||
linux-focal-cuda12_1-py3_12-gcc9-inductor-build:
|
||||
name: cuda12.1-py3.12-gcc9-sm86
|
||||
@ -84,7 +86,6 @@ jobs:
|
||||
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-focal-cuda12_1-py3_12-gcc9-inductor-test:
|
||||
name: cuda12.1-py3.12-gcc9-sm86
|
||||
@ -94,7 +95,6 @@ jobs:
|
||||
build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86
|
||||
docker-image: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cpu-py3_12-inductor-halide-build:
|
||||
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
|
||||
@ -108,7 +108,6 @@ jobs:
|
||||
{ include: [
|
||||
{ config: "inductor-halide", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cpu-py3_12-inductor-halide-test:
|
||||
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
|
||||
@ -118,7 +117,6 @@ jobs:
|
||||
build-environment: linux-jammy-py3.12-gcc11
|
||||
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
|
||||
# Should be synced with the one in inductor-periodic.yml but this only runs inductor_timm
|
||||
@ -136,7 +134,8 @@ jobs:
|
||||
{ config: "inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
]}
|
||||
secrets: inherit
|
||||
secrets:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
|
||||
linux-focal-cuda12_4-py3_10-gcc9-inductor-test:
|
||||
name: cuda12.4-py3.10-gcc9-sm86
|
||||
@ -147,7 +146,8 @@ jobs:
|
||||
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
|
||||
docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
secrets:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
|
||||
linux-jammy-cpu-py3_9-gcc11-inductor-build:
|
||||
name: linux-jammy-cpu-py3.9-gcc11-inductor
|
||||
@ -201,7 +201,8 @@ jobs:
|
||||
{ config: "cpu_inductor_freezing_avx2_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" },
|
||||
{ config: "cpu_inductor_freezing_avx2_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" },
|
||||
]}
|
||||
secrets: inherit
|
||||
secrets:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
|
||||
linux-jammy-cpu-py3_9-gcc11-inductor-test:
|
||||
name: linux-jammy-cpu-py3.9-gcc11-inductor
|
||||
@ -211,4 +212,5 @@ jobs:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
secrets:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
|
44
.github/workflows/lint-autoformat.yml
vendored
@ -1,44 +0,0 @@
|
||||
name: Apply lint suggestions
|
||||
|
||||
on:
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
|
||||
jobs:
|
||||
lintrunner-autoformat:
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
runs-on: lf.linux.2xlarge
|
||||
continue-on-error: true
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
steps:
|
||||
- name: Checkout pytorch
|
||||
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
- name: Setup miniconda
|
||||
uses: pytorch/test-infra/.github/actions/setup-miniconda@main
|
||||
with:
|
||||
python-version: "3.10"
|
||||
- name: Run lintrunner (nonretryable)
|
||||
continue-on-error: true
|
||||
# we can't run all files here because only changes around where the diff are shown in the PR UI
|
||||
run: |
|
||||
export ADDITIONAL_LINTRUNNER_ARGS="format"
|
||||
bash .github/scripts/lintrunner.sh
|
||||
- name: Check for changes
|
||||
id: git-check
|
||||
run: |
|
||||
git diff --exit-code || echo "changes=true" >> "$GITHUB_OUTPUT"
|
||||
- name: Suggest changes
|
||||
if: steps.git-check.outputs.changes == 'true'
|
||||
uses: parkerbxyz/suggest-changes@v1
|
||||
with:
|
||||
comment: "Please commit the suggested changes from pytorch's linter."
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
6
.github/workflows/lint.yml
vendored
@ -36,7 +36,7 @@ jobs:
|
||||
submodules: true
|
||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
||||
script: |
|
||||
export ADDITIONAL_LINTRUNNER_ARGS="--take CLANGTIDY,CLANGFORMAT --all-files"
|
||||
export ADDITIONAL_LINTRUNNER_ARGS="--take CLANGTIDY,CLANGFORMAT"
|
||||
export CLANG=1
|
||||
.github/scripts/lintrunner.sh
|
||||
|
||||
@ -53,7 +53,7 @@ jobs:
|
||||
submodules: true
|
||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
||||
script: |
|
||||
export ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT --all-files"
|
||||
export ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT"
|
||||
.github/scripts/lintrunner.sh
|
||||
|
||||
quick-checks:
|
||||
@ -278,4 +278,4 @@ jobs:
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
||||
cancel-in-progress: true
|
||||
|
22
.github/workflows/periodic.yml
vendored
@ -218,9 +218,7 @@ jobs:
|
||||
# TODO: Figure out how to migrate this job to M1 runner
|
||||
ios-build-test:
|
||||
name: ios-build-test
|
||||
# Has been broken for a while, see https://github.com/pytorch/pytorch/issues/136284
|
||||
# if: github.event_name != 'schedule' || github.event.schedule == '45 0,8,16 * * 1-5' || github.event.schedule == '45 4 * * 0,6' || github.event.schedule == '29 8 * * *'
|
||||
if: false
|
||||
if: github.event_name != 'schedule' || github.event.schedule == '45 0,8,16 * * 1-5' || github.event.schedule == '45 4 * * 0,6' || github.event.schedule == '29 8 * * *'
|
||||
uses: ./.github/workflows/_ios-build-test.yml
|
||||
with:
|
||||
trigger-event: ${{ github.event_name }}
|
||||
@ -299,13 +297,13 @@ jobs:
|
||||
docker-image: ${{ needs.linux-vulkan-focal-py3_11-clang10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-vulkan-focal-py3_11-clang10-build.outputs.test-matrix }}
|
||||
|
||||
linux-focal-rocm6_2-py3_10-build:
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
linux-focal-rocm6_1-py3_8-build:
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image-name: pytorch-linux-focal-rocm-n-py3
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -314,19 +312,19 @@ jobs:
|
||||
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu" },
|
||||
]}
|
||||
|
||||
linux-focal-rocm6_2-py3_10-test:
|
||||
linux-focal-rocm6_1-py3_8-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-focal-rocm6_2-py3_10-build
|
||||
- linux-focal-rocm6_1-py3_8-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
docker-image: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.test-matrix }}
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
|
||||
|
||||
linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build:
|
||||
name: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build
|
||||
|
8
.github/workflows/pull.yml
vendored
@ -383,7 +383,7 @@ jobs:
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-py3.9-clang9-xla
|
||||
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
|
||||
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.1-lite
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
|
||||
@ -503,15 +503,15 @@ jobs:
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-focal-rocm6_2-py3_10-build:
|
||||
linux-focal-rocm6_1-py3_8-build:
|
||||
# don't run build twice on main
|
||||
if: github.event_name == 'pull_request'
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image-name: pytorch-linux-focal-rocm-n-py3
|
||||
sync-tag: rocm-build
|
||||
test-matrix: |
|
||||
|
18
.github/workflows/rocm.yml
vendored
@ -25,11 +25,11 @@ jobs:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
linux-focal-rocm6_2-py3_10-build:
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
linux-focal-rocm6_1-py3_8-build:
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
with:
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image-name: pytorch-linux-focal-rocm-n-py3
|
||||
sync-tag: rocm-build
|
||||
test-matrix: |
|
||||
@ -42,16 +42,16 @@ jobs:
|
||||
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.2" },
|
||||
]}
|
||||
|
||||
linux-focal-rocm6_2-py3_10-test:
|
||||
linux-focal-rocm6_1-py3_8-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-focal-rocm6_2-py3_10-build
|
||||
- linux-focal-rocm6_1-py3_8-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
docker-image: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.test-matrix }}
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
|
||||
|
18
.github/workflows/slow.yml
vendored
@ -130,13 +130,13 @@ jobs:
|
||||
docker-image: ${{ needs.linux-focal-py3_9-clang10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-py3_9-clang10-build.outputs.test-matrix }}
|
||||
|
||||
linux-focal-rocm6_2-py3_10-build:
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
linux-focal-rocm6_1-py3_8-build:
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image-name: pytorch-linux-focal-rocm-n-py3
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -144,19 +144,19 @@ jobs:
|
||||
{ config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
|
||||
]}
|
||||
|
||||
linux-focal-rocm6_2-py3_10-test:
|
||||
linux-focal-rocm6_1-py3_8-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-focal-rocm6_2-py3_10-build
|
||||
- linux-focal-rocm6_1-py3_8-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
docker-image: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.test-matrix }}
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
|
||||
|
||||
linux-jammy-py3_10-clang15-asan-build:
|
||||
name: linux-jammy-py3.10-clang15-asan
|
||||
|
30
.github/workflows/sync_distributed_folder_prototype.yml
vendored
Normal file
@ -0,0 +1,30 @@
|
||||
name: Sync Distributed Folder
|
||||
|
||||
on:
|
||||
#push:
|
||||
# branches:
|
||||
# - 'main'
|
||||
# paths:
|
||||
# - 'torch/distributed/**'
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
paths:
|
||||
- '.github/scripts/sync_distributed_folder_prototype.sh'
|
||||
- '.github/workflows/sync_distributed_folder_prototype.yml'
|
||||
|
||||
env:
|
||||
WITH_PUSH: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
sync:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- run: .github/scripts/sync_distributed_folder_prototype.sh
|
26
.github/workflows/trunk.yml
vendored
@ -223,13 +223,13 @@ jobs:
|
||||
cuda-version: "12.1"
|
||||
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
|
||||
|
||||
linux-focal-rocm6_2-py3_10-build:
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
linux-focal-rocm6_1-py3_8-build:
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image-name: pytorch-linux-focal-rocm-n-py3
|
||||
sync-tag: rocm-build
|
||||
test-matrix: |
|
||||
@ -240,19 +240,19 @@ jobs:
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-focal-rocm6_2-py3_10-test:
|
||||
linux-focal-rocm6_1-py3_8-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-focal-rocm6_2-py3_10-build
|
||||
- linux-focal-rocm6_1-py3_8-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
docker-image: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.test-matrix }}
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
|
||||
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
|
||||
|
||||
linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build:
|
||||
@ -316,11 +316,3 @@ jobs:
|
||||
build-environment: linux-focal-cuda11.8-py3.10-gcc9-experimental-split-build
|
||||
docker-image: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}
|
||||
|
||||
linux-manylinux-2_28-py3-cpu-s390x-build:
|
||||
name: linux-manylinux-2_28-py3-cpu-s390x
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
with:
|
||||
build-environment: linux-s390x-binary-manywheel
|
||||
docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x-main
|
||||
runner: linux.s390x
|
||||
|
24
.github/workflows/update-viablestrict.yml
vendored
@ -11,39 +11,15 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
do_update_viablestrict:
|
||||
permissions:
|
||||
id-token: write
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
runs-on: ubuntu-20.04
|
||||
environment: ${{ (github.event_name == 'schedule') && 'mergebot' || '' }}
|
||||
steps:
|
||||
- name: Update viable/strict
|
||||
uses: pytorch/test-infra/.github/actions/update-viablestrict@main
|
||||
id: update_viablestrict
|
||||
with:
|
||||
repository: pytorch/pytorch
|
||||
stable-branch: viable/strict
|
||||
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\"]'
|
||||
secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }}
|
||||
rockset-api-key: ${{ secrets.ROCKSET_API_KEY }}
|
||||
|
||||
- name: Authenticate to AWS with OIDC
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/upload_to_ossci_raw_job_status
|
||||
aws-region: us-east-1
|
||||
|
||||
- name: Print sha
|
||||
env:
|
||||
LATEST_SHA: ${{ steps.update_viablestrict.outputs.latest_viable_sha }}
|
||||
PUSH_RESULT: ${{ steps.update_viablestrict.outputs.push_result }}
|
||||
TIME: ${{ steps.update_viablestrict.outputs.time }}
|
||||
run: |
|
||||
echo "${PUSH_RESULT}"
|
||||
if [ "$PUSH_RESULT" = "Everything up-to-date" ]; then
|
||||
echo "No update pushed"
|
||||
else
|
||||
echo "{\"sha\": \"${LATEST_SHA}\", \"repository\":\"pytorch/pytorch\", \"timestamp\": ${TIME}}" > "/tmp/${LATEST_SHA}.json"
|
||||
pip install awscli==1.29.40
|
||||
aws s3 cp "/tmp/${LATEST_SHA}.json" "s3://ossci-raw-job-status/stable_pushes/pytorch/pytorch/${LATEST_SHA}.json"
|
||||
fi
|
||||
|
55
.github/workflows/upload-alerts.yml
vendored
Normal file
@ -0,0 +1,55 @@
|
||||
# upload alerts every 10 minutes
|
||||
|
||||
name: Upload Alerts to AWS/Rockset
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '*/10 * * * *'
|
||||
pull_request:
|
||||
paths:
|
||||
- 'tools/alerts/create_alerts.py'
|
||||
- '.github/workflows/upload-alerts.yml'
|
||||
|
||||
jobs:
|
||||
upload-alerts:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
runs-on: ubuntu-22.04
|
||||
environment: upload-stats
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 1
|
||||
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.11'
|
||||
cache: pip
|
||||
|
||||
- name: Install Python Packages
|
||||
run: |
|
||||
pip3 install rockset==1.0.3 boto3==1.19.12 requests==2.32.2
|
||||
|
||||
- name: Create alerts
|
||||
run: |
|
||||
output=$(PYTHONPATH=$PYTHONPATH:$(pwd) python3 "tools/alerts/create_alerts.py")
|
||||
echo "uploading following alerts"
|
||||
echo "$output"
|
||||
echo "script-output=$output" >> "$GITHUB_OUTPUT"
|
||||
id: alert_creation_step
|
||||
|
||||
- name: Upload alerts
|
||||
env:
|
||||
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
uses: pytorch/test-infra/.github/actions/upload-alerts@main
|
||||
with:
|
||||
alerts: '${{ steps.alert_creation_step.outputs.script-output }}'
|
||||
organization: "pytorch"
|
||||
repo: "pytorch"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
2
.github/workflows/upload-test-stats.yml
vendored
@ -96,7 +96,7 @@ jobs:
|
||||
python3 -m tools.stats.check_disabled_tests --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}"
|
||||
|
||||
- name: Upload gpt-fast benchmark results to Rockset
|
||||
if: steps.upload-s3.outcome && steps.upload-s3.outcome == 'success' && contains(github.event.workflow_run.name, 'inductor-micro-benchmark')
|
||||
if: steps.upload-s3.outcome && steps.upload-s3.outcome == 'success' && contains('inductor-micro-benchmark', github.event.workflow_run.name)
|
||||
env:
|
||||
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
|
||||
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
|
||||
|
@ -139,7 +139,7 @@ init_command = [
|
||||
'numpy==1.24.3 ; python_version == "3.8"',
|
||||
'numpy==1.26.0 ; python_version >= "3.9"',
|
||||
'expecttest==0.2.1',
|
||||
'mypy==1.11.2',
|
||||
'mypy==1.10.0',
|
||||
'sympy==1.12.1 ; python_version == "3.8"',
|
||||
'sympy==1.13.0 ; python_version >= "3.9"',
|
||||
'types-requests==2.27.25',
|
||||
@ -195,7 +195,6 @@ include_patterns = [
|
||||
# and excluding most sub-directories for now.
|
||||
'aten/src/ATen/*.h',
|
||||
'aten/src/ATen/*.cpp',
|
||||
'aten/src/ATen/cuda/*.cpp',
|
||||
'aten/src/ATen/cpu/*.h',
|
||||
'aten/src/ATen/cpu/*.cpp',
|
||||
'aten/src/ATen/core/*.h',
|
||||
@ -211,6 +210,8 @@ include_patterns = [
|
||||
'aten/src/ATen/native/nested/*.h',
|
||||
'c10/**/*.cpp',
|
||||
'c10/**/*.h',
|
||||
'caffe2/**/*.cc',
|
||||
'caffe2/**/*.h',
|
||||
'torch/*.h',
|
||||
'torch/csrc/*.h',
|
||||
'torch/csrc/*.cpp',
|
||||
@ -225,6 +226,7 @@ exclude_patterns = [
|
||||
# CUDA files are also excluded.
|
||||
'**/fb/**',
|
||||
'**/*pb.h',
|
||||
'aten/**/cuda/*pp',
|
||||
'c10/xpu/**/*.h',
|
||||
'c10/xpu/**/*.cpp',
|
||||
'c10/cuda/CUDAAlgorithm.h',
|
||||
@ -1585,27 +1587,6 @@ command = [
|
||||
]
|
||||
is_formatter = true
|
||||
|
||||
|
||||
[[linter]]
|
||||
code = 'META_NO_CREATE_UNBACKED'
|
||||
include_patterns = [
|
||||
"torch/_meta_registrations.py"
|
||||
]
|
||||
command = [
|
||||
'python3',
|
||||
'tools/linter/adapters/grep_linter.py',
|
||||
'--pattern=create_unbacked',
|
||||
'--linter-name=META_NO_CREATE_UNBACKED',
|
||||
'--error-name=no create_unbacked in meta registrations',
|
||||
"""--error-description=\
|
||||
Data-dependent operators should have their meta \
|
||||
registration in torch/_subclasses/fake_impls.py, \
|
||||
not torch/_meta_registrations.py
|
||||
""",
|
||||
'--',
|
||||
'@{{PATHSFILE}}'
|
||||
]
|
||||
|
||||
[[linter]]
|
||||
code = 'ATEN_CPU_GPU_AGNOSTIC'
|
||||
include_patterns = [
|
||||
|
1
BUCK.oss
@ -65,7 +65,6 @@ cxx_library(
|
||||
"caffe2/serialize/file_adapter.cc",
|
||||
"caffe2/serialize/inline_container.cc",
|
||||
"caffe2/serialize/istream_adapter.cc",
|
||||
"caffe2/serialize/read_adapter_interface.cc",
|
||||
],
|
||||
visibility = ["PUBLIC"],
|
||||
deps = [
|
||||
|
@ -473,7 +473,6 @@ filegroup(
|
||||
"caffe2/serialize/file_adapter.cc",
|
||||
"caffe2/serialize/inline_container.cc",
|
||||
"caffe2/serialize/istream_adapter.cc",
|
||||
"caffe2/serialize/read_adapter_interface.cc",
|
||||
],
|
||||
)
|
||||
|
||||
|
@ -305,6 +305,7 @@ if(NOT DEFINED USE_VULKAN)
|
||||
cmake_dependent_option(USE_VULKAN "Use Vulkan GPU backend" ON "ANDROID" OFF)
|
||||
endif()
|
||||
|
||||
option(USE_SLEEF_FOR_ARM_VEC256 "Use sleef for arm" OFF)
|
||||
option(USE_SOURCE_DEBUG_ON_MOBILE "Enable" ON)
|
||||
option(USE_LITE_INTERPRETER_PROFILER "Enable" ON)
|
||||
cmake_dependent_option(
|
||||
@ -368,7 +369,7 @@ cmake_dependent_option(
|
||||
USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF)
|
||||
cmake_dependent_option(
|
||||
USE_TENSORPIPE "Use TensorPipe. Only available if USE_DISTRIBUTED is on." ON
|
||||
"USE_DISTRIBUTED AND NOT WIN32" OFF)
|
||||
"USE_DISTRIBUTED" OFF)
|
||||
option(ONNX_ML "Enable traditional ONNX ML API." ON)
|
||||
option(HAVE_SOVERSION "Whether to add SOVERSION to the shared objects" OFF)
|
||||
option(BUILD_LIBTORCH_CPU_WITH_DEBUG
|
||||
@ -911,6 +912,11 @@ if(USE_PYTORCH_QNNPACK)
|
||||
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK")
|
||||
endif()
|
||||
|
||||
if(USE_SLEEF_FOR_ARM_VEC256)
|
||||
string(APPEND CMAKE_CXX_FLAGS " -DAT_BUILD_ARM_VEC256_WITH_SLEEF")
|
||||
add_definitions(-DAT_BUILD_ARM_VEC256_WITH_SLEEF)
|
||||
endif()
|
||||
|
||||
# Enable sleef on macOS with Apple silicon by default
|
||||
if((${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64"))
|
||||
message(STATUS "Running on macOS with Apple silicon")
|
||||
@ -918,14 +924,6 @@ if((${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") AND ("${CMAKE_SYSTEM_PROCESSOR}" STR
|
||||
add_definitions(-DAT_BUILD_ARM_VEC256_WITH_SLEEF)
|
||||
endif()
|
||||
|
||||
# Enable sleef on Arm(R) architecture by default (except Android)
|
||||
if((NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android")
|
||||
AND("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64"))
|
||||
string(APPEND CMAKE_CXX_FLAGS " -DAT_BUILD_ARM_VEC256_WITH_SLEEF")
|
||||
add_definitions(-DAT_BUILD_ARM_VEC256_WITH_SLEEF)
|
||||
endif()
|
||||
|
||||
|
||||
if(USE_XNNPACK)
|
||||
string(APPEND CMAKE_CXX_FLAGS " -DUSE_XNNPACK")
|
||||
endif()
|
||||
|
@ -98,10 +98,6 @@ test/test_type_promotion.py @mruberry
|
||||
test/functorch/test_ops.py @zou3519 @chillee @kshitij12345
|
||||
test/functorch/test_vmap.py @zou3519 @chillee @kshitij12345
|
||||
|
||||
# HOPs
|
||||
torch/_higher_order_ops/*.py @zou3519
|
||||
torch/_dynamo/variables/higher_order_ops.py @zou3519
|
||||
|
||||
# torch MPS
|
||||
test/test_mps.py @kulinseth @malfet
|
||||
aten/src/ATen/mps/ @kulinseth @malfet
|
||||
|
@ -50,6 +50,7 @@ aspects of contributing to PyTorch.
- [Windows development tips](#windows-development-tips)
- [Known MSVC (and MSVC with NVCC) bugs](#known-msvc-and-msvc-with-nvcc-bugs)
- [Building on legacy code and CUDA](#building-on-legacy-code-and-cuda)
- [Running clang-tidy](#running-clang-tidy)
- [Pre-commit tidy/linting hook](#pre-commit-tidylinting-hook)
- [Building PyTorch with ASAN](#building-pytorch-with-asan)
- [Getting `ccache` to work](#getting-ccache-to-work)
@ -1131,6 +1132,38 @@ CUDA, MSVC, and PyTorch versions are interdependent; please install matching ver

Note: There's a [compilation issue](https://github.com/oneapi-src/oneDNN/issues/812) in several Visual Studio 2019 versions since 16.7.1, so please make sure your Visual Studio 2019 version is not in 16.7.1 ~ 16.7.5

## Running clang-tidy

[Clang-Tidy](https://clang.llvm.org/extra/clang-tidy/index.html) is a C++
linter and static analysis tool based on the clang compiler. We run clang-tidy
in our CI to make sure that new C++ code is safe, sane and efficient. See the
[`clang-tidy` job in our GitHub Workflow's
lint.yml file](https://github.com/pytorch/pytorch/blob/main/.github/workflows/lint.yml)
for the simple commands we use for this.

To run clang-tidy locally, follow these steps:

1. Install clang-tidy.
We provide custom built binaries which have additional checks enabled. You can install it by running:
```bash
python3 -m tools.linter.clang_tidy.generate_build_files
```
We currently only support Linux and MacOS (x86).

2. Install clang-tidy driver script dependencies
```bash
pip3 install -r tools/linter/clang_tidy/requirements.txt
```

3. Run clang-tidy
```bash
# Run clang-tidy on the entire codebase
make clang-tidy
# Run clang-tidy only on your changes
make clang-tidy CHANGED_ONLY=--changed-only
```
This internally invokes our driver script and closely mimics how clang-tidy is run on CI.

## Pre-commit tidy/linting hook

We use clang-tidy to perform additional
59
README.md
@ -27,8 +27,8 @@ Our trunk health (Continuous Integration signals) can be found at [hud.pytorch.o
|
||||
- [NVIDIA CUDA Support](#nvidia-cuda-support)
|
||||
- [AMD ROCm Support](#amd-rocm-support)
|
||||
- [Intel GPU Support](#intel-gpu-support)
|
||||
- [Get the PyTorch Source](#get-the-pytorch-source)
|
||||
- [Install Dependencies](#install-dependencies)
|
||||
- [Get the PyTorch Source](#get-the-pytorch-source)
|
||||
- [Install PyTorch](#install-pytorch)
|
||||
- [Adjust Build Options (Optional)](#adjust-build-options-optional)
|
||||
- [Docker Image](#docker-image)
|
||||
@ -161,34 +161,9 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv)
|
||||
#### Prerequisites
|
||||
If you are installing from source, you will need:
|
||||
- Python 3.8 or later (for Linux, Python 3.8.1+ is needed)
|
||||
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux)
|
||||
- Visual Studio or Visual Studio Build Tool on Windows
|
||||
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required)
|
||||
|
||||
\* PyTorch CI uses Visual C++ BuildTools, which come with Visual Studio Enterprise,
|
||||
Professional, or Community Editions. You can also install the build tools from
|
||||
https://visualstudio.microsoft.com/visual-cpp-build-tools/. The build tools *do not*
|
||||
come with Visual Studio Code by default.
|
||||
|
||||
\* We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro.
|
||||
|
||||
An example of environment setup is shown below:
|
||||
|
||||
* Linux:
|
||||
|
||||
```bash
|
||||
$ source <CONDA_INSTALL_DIR>/bin/activate
|
||||
$ conda create -y -n <CONDA_NAME>
|
||||
$ conda activate <CONDA_NAME>
|
||||
```
|
||||
|
||||
* Windows:
|
||||
|
||||
```bash
|
||||
$ source <CONDA_INSTALL_DIR>\Scripts\activate.bat
|
||||
$ conda create -y -n <CONDA_NAME>
|
||||
$ conda activate <CONDA_NAME>
|
||||
$ call "C:\Program Files\Microsoft Visual Studio\<VERSION>\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
|
||||
```
|
||||
We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro.
|
||||
|
||||
##### NVIDIA CUDA Support
|
||||
If you want to compile with CUDA support, [select a supported version of CUDA from our support matrix](https://pytorch.org/get-started/locally/), then install the following:
|
||||
@ -219,23 +194,12 @@ If you want to compile with Intel GPU support, follow these
|
||||
If you want to disable Intel GPU support, export the environment variable `USE_XPU=0`.
|
||||
Other potentially useful environment variables may be found in `setup.py`.
|
||||
|
||||
#### Get the PyTorch Source
|
||||
```bash
|
||||
git clone --recursive https://github.com/pytorch/pytorch
|
||||
cd pytorch
|
||||
# if you are updating an existing checkout
|
||||
git submodule sync
|
||||
git submodule update --init --recursive
|
||||
```
|
||||
|
||||
#### Install Dependencies
|
||||
|
||||
**Common**
|
||||
|
||||
```bash
|
||||
conda install cmake ninja
|
||||
# Run this command on native Windows
|
||||
conda install rust
|
||||
# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section below
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
@ -271,6 +235,15 @@ pip install mkl-static mkl-include
conda install -c conda-forge libuv=1.39
```

#### Get the PyTorch Source
```bash
git clone --recursive https://github.com/pytorch/pytorch
cd pytorch
# if you are updating an existing checkout
git submodule sync
git submodule update --init --recursive
```

#### Install PyTorch
**On Linux**

@ -311,6 +284,13 @@ python3 setup.py develop

**On Windows**

Choose Correct Visual Studio Version.

PyTorch CI uses Visual C++ BuildTools, which come with Visual Studio Enterprise,
Professional, or Community Editions. You can also install the build tools from
https://visualstudio.microsoft.com/visual-cpp-build-tools/. The build tools *do not*
come with Visual Studio Code by default.

If you want to build legacy python code, please refer to [Building on legacy code and CUDA](https://github.com/pytorch/pytorch/blob/main/CONTRIBUTING.md#building-on-legacy-code-and-cuda)

**CPU-only builds**
@ -318,6 +298,7 @@ If you want to build legacy python code, please refer to [Building on legacy cod
In this mode PyTorch computations will run on your CPU, not your GPU

```cmd
conda activate
python setup.py develop
```
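
A quick sanity check after the build finishes (not part of the original instructions) is to import the freshly built package; for a CPU-only build, `torch.cuda.is_available()` is expected to report `False`:

```cmd
python -c "import torch; print(torch.__version__); print(torch.cuda.is_available())"
```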

RELEASE.md
@ -48,16 +48,16 @@

Following is the Release Compatibility Matrix for PyTorch releases:

| PyTorch version | Python | C++ | Stable CUDA | Experimental CUDA | Stable ROCm |
| --- | --- | --- | --- | --- | --- |
| 2.5 | >=3.9, <=3.12, (3.13 experimental) | C++17 | CUDA 11.8, CUDA 12.1, CUDA 12.4, CUDNN 9.1.0.70 | None | ROCm 6.2 |
| 2.4 | >=3.8, <=3.12 | C++17 | CUDA 11.8, CUDA 12.1, CUDNN 9.1.0.70 | CUDA 12.4, CUDNN 9.1.0.70 | ROCm 6.1 |
| 2.3 | >=3.8, <=3.11, (3.12 experimental) | C++17 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 6.0 |
| 2.2 | >=3.8, <=3.11, (3.12 experimental) | C++17 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.7 |
| 2.1 | >=3.8, <=3.11 | C++17 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.6 |
| 2.0 | >=3.8, <=3.11 | C++14 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84 | ROCm 5.4 |
| 1.13 | >=3.7, <=3.10 | C++14 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | ROCm 5.2 |
| 1.12 | >=3.7, <=3.10 | C++14 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 | ROCm 5.0 |
| PyTorch version | Python | Stable CUDA | Experimental CUDA | Stable ROCm |
| --- | --- | --- | --- | --- |
| 2.5 | >=3.9, <=3.12, (3.13 experimental) | CUDA 11.8, CUDA 12.1, CUDA 12.4, CUDNN 9.1.0.70 | None | ROCm 6.2 |
| 2.4 | >=3.8, <=3.12 | CUDA 11.8, CUDA 12.1, CUDNN 9.1.0.70 | CUDA 12.4, CUDNN 9.1.0.70 | ROCm 6.1 |
| 2.3 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 6.0 |
| 2.2 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.7 |
| 2.1 | >=3.8, <=3.11 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.6 |
| 2.0 | >=3.8, <=3.11 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84 | ROCm 5.4 |
| 1.13 | >=3.7, <=3.10 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | ROCm 5.2 |
| 1.12 | >=3.7, <=3.10 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 | ROCm 5.0 |

## Release Cadence

@ -234,7 +234,7 @@ Typically, within a release cycle fixes are necessary for regressions, test fixe
For fixes that are to go into a release after the release branch has been cut we typically employ the use of a cherry pick tracker.

An example of this would look like:
* https://github.com/pytorch/pytorch/issues/128436
* https://github.com/pytorch/pytorch/issues/51886

Please also make sure to add milestone target to the PR/issue, especially if it needs to be considered for inclusion into the dot release.

@ -243,9 +243,7 @@ Please also make sure to add milestone target to the PR/issue, especially if it
#### How to do Cherry Picking

You can now use `pytorchbot` to cherry pick a PyTorch PR that has been committed
to the main branch using `@pytorchbot cherry-pick` command as follows (make sure
that the cherry-pick tracker issue for the target release labelled as "release tracker" -
this will allow the bot to find it and post comments).
to the main branch using `@pytorchbot cherry-pick` command as follows.

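For instance, a cherry-pick request posted as a comment on the PR might look like the line below (the target branch, classification value, and issue link here are illustrative only; the accepted flags are listed in the usage summary that follows):

```
@pytorchbot cherry-pick --onto release/2.5 -c critical --fixes https://github.com/pytorch/pytorch/issues/128436
```
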
```
usage: @pytorchbot cherry-pick --onto ONTO [--fixes FIXES] -c
@ -382,7 +380,7 @@ Patch release process takes around 4-5 weeks to complete.
### Issue Tracker for Patch releases

For patch releases an issue tracker needs to be created. For patch releases, we require all cherry-pick changes to have links to either a high-priority GitHub issue or a CI failure from previous RC. An example of this would look like:
* https://github.com/pytorch/pytorch/issues/128436
* https://github.com/pytorch/pytorch/issues/51886

Only the following issues are accepted:
1. Fixes to regressions against previous major version (e.g. regressions introduced in 1.13.0 from 1.12.0 are pickable for 1.13.1)

@ -54,7 +54,7 @@ if(NOT BUILD_LITE_INTERPRETER)
endif()
EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS})

file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/vec256/vsx/*.h" "cpu/vec/vec256/zarch/*.h" "cpu/vec/sve/*.h" "cpu/vec/*.h" "quantized/*.h" "functorch/*.h")
file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/vec256/vsx/*.h" "cpu/vec/vec256/zarch/*.h" "cpu/vec/*.h" "quantized/*.h" "functorch/*.h")
file(GLOB base_cpp "*.cpp" "detail/*.cpp" "cpu/*.cpp" "functorch/*.cpp")
file(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh" "cuda/tunable/*.cuh" "cuda/tunable/*.h")
file(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp" "cuda/tunable/*.cpp")

@ -145,14 +145,6 @@ void Context::setSDPUseMath(bool e) {
  enabled_mathSDP = e;
}

bool Context::allowFP16BF16ReductionMathSDP() const {
  return allow_fp16_bf16_reduction_mathSDP;
}

void Context::setAllowFP16BF16ReductionMathSDP(bool e) {
  allow_fp16_bf16_reduction_mathSDP = e;
}

bool Context::userEnabledCuDNNSDP() const {
  return enabled_cudnnSDP;
}

@ -234,9 +234,6 @@ class TORCH_API Context {
  void setSDPUseCuDNN(bool);
  bool userEnabledCuDNNSDP() const;

  void setAllowFP16BF16ReductionMathSDP(bool);
  bool allowFP16BF16ReductionMathSDP() const;

  void setSDPUseOverrideable(bool);
  bool userEnabledOverrideableSDP() const;

@ -393,7 +390,6 @@ class TORCH_API Context {
  bool enabled_mathSDP = true;
  bool enabled_cudnnSDP = true;
  bool enabled_overrideable = true;
  bool allow_fp16_bf16_reduction_mathSDP = false;
#ifdef USE_ROCM
  bool benchmark_cudnn = true;
#else

@ -105,11 +105,6 @@ std::string get_cpu_capability() {
      return "DEFAULT";
    case native::CPUCapability::ZVECTOR:
      return "Z VECTOR";
#elif defined(HAVE_SVE_CPU_DEFINITION)
    case native::CPUCapability::DEFAULT:
      return "DEFAULT";
    case native::CPUCapability::SVE256:
      return "SVE256";
#else
    case native::CPUCapability::DEFAULT:
      return "NO AVX";

@ -336,7 +336,6 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) {
  KERNEL_CPU(linalg_vecdot, lower_precision_fp)
  KERNEL_CPU(baddbmm, lower_precision_fp)
  KERNEL_CPU(addmm, lower_precision_fp)
  KERNEL_CPU(_addmm_activation, lower_precision_fp)
  KERNEL_CPU(addbmm, lower_precision_fp)
  KERNEL_CPU(linear, lower_precision_fp)
  KERNEL_CPU(_convolution, deprecated, lower_precision_fp)

@ -1,6 +1,4 @@
|
||||
#include <c10/core/Allocator.h>
|
||||
#include <c10/core/thread_pool.h>
|
||||
#include <c10/util/CallOnce.h>
|
||||
#include <c10/util/flat_hash_map.h>
|
||||
#include <c10/util/llvmMathExtras.h>
|
||||
#include <optional>
|
||||
@ -111,17 +109,6 @@ template <
|
||||
typename E,
|
||||
typename B = HostBlock<S>>
|
||||
struct CachingHostAllocatorImpl {
|
||||
CachingHostAllocatorImpl() {
|
||||
// Launch the background thread and process events in a loop.
|
||||
if (pinned_use_background_threads()) {
|
||||
getBackgroundThreadPool()->run([&]() {
|
||||
while (true) {
|
||||
process_events();
|
||||
std::this_thread::sleep_for(std::chrono::microseconds(100));
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
virtual ~CachingHostAllocatorImpl() = default;
|
||||
|
||||
public:
|
||||
@ -131,34 +118,17 @@ struct CachingHostAllocatorImpl {
|
||||
return {nullptr, nullptr};
|
||||
}
|
||||
|
||||
// If we are using background threads, we can process events in the
|
||||
// background.
|
||||
if (!pinned_use_background_threads()) {
|
||||
process_events();
|
||||
process_events();
|
||||
|
||||
// First, try to allocate from the free list
|
||||
auto* block = get_free_block(size);
|
||||
if (block) {
|
||||
return {block->ptr_, reinterpret_cast<void*>(block)};
|
||||
}
|
||||
|
||||
// Round up the allocation to the nearest power of two to improve reuse.
|
||||
// These power of two sizes are also used to index into the free list.
|
||||
size_t roundSize = c10::llvm::PowerOf2Ceil(size);
|
||||
|
||||
// First, try to allocate from the free list
|
||||
auto* block = get_free_block(roundSize);
|
||||
if (block) {
|
||||
return {block->ptr_, reinterpret_cast<void*>(block)};
|
||||
}
|
||||
|
||||
// Check in the recently freed blocks with pending events to see if we
|
||||
// can reuse them. Call get_free_block again after processing events
|
||||
if (pinned_use_background_threads()) {
|
||||
process_events_for_specific_size(roundSize);
|
||||
block = get_free_block(roundSize);
|
||||
if (block) {
|
||||
return {block->ptr_, reinterpret_cast<void*>(block)};
|
||||
}
|
||||
}
|
||||
|
||||
// Slow path: if we can't allocate from the cached free list, we need
|
||||
// to create a new block.
|
||||
void* ptr = nullptr;
|
||||
allocate_host_memory(roundSize, &ptr);
|
||||
|
||||
@ -267,10 +237,6 @@ struct CachingHostAllocatorImpl {
|
||||
return c10::llvm::Log2_64_Ceil(size);
|
||||
}
|
||||
|
||||
virtual bool pinned_use_background_threads() {
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual void copy_data(void* dest [[maybe_unused]], const void* src [[maybe_unused]], std::size_t count [[maybe_unused]]) const {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for copy_data");
|
||||
}
|
||||
@ -295,21 +261,6 @@ struct CachingHostAllocatorImpl {
|
||||
}
|
||||
|
||||
virtual void process_events() {
|
||||
// process all events until the last unready event, not for specific size.
|
||||
process_events_for_specific_size(-1);
|
||||
}
|
||||
|
||||
// If size is -1, process all events from backwards until the last unready
|
||||
// event. Otherwise, process events for a specific size and on first ready block
|
||||
// is found, add it to the free list and return.
|
||||
virtual void process_events_for_specific_size(int64_t size) {
|
||||
size_t event_count = 0;
|
||||
size_t max_events = 0;
|
||||
{
|
||||
std::lock_guard<std::mutex> g(events_mutex_);
|
||||
max_events = events_.size();
|
||||
}
|
||||
|
||||
while (true) {
|
||||
// Avoid calling cudaEventDestroy while holding a mutex, so move
|
||||
// intermediate events out of the lock into this object.
|
||||
@ -327,25 +278,6 @@ struct CachingHostAllocatorImpl {
|
||||
return;
|
||||
}
|
||||
|
||||
if (size != -1) {
|
||||
if (event_count++ > max_events) {
|
||||
{
|
||||
std::lock_guard<std::mutex> g(events_mutex_);
|
||||
events_.push_front(std::move(*processed));
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (size != (int64_t)processed->second->size_) {
|
||||
// if we are processing a specific size, and the size of the block
|
||||
// doesn't match, we can't use it.
|
||||
{
|
||||
std::lock_guard<std::mutex> g(events_mutex_);
|
||||
events_.push_front(std::move(*processed));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// otherwise, query the event
|
||||
{
|
||||
// now, see if we can handle this element
|
||||
@ -354,14 +286,9 @@ struct CachingHostAllocatorImpl {
|
||||
// push the event onto the back if it's not ready.
|
||||
{
|
||||
std::lock_guard<std::mutex> g(events_mutex_);
|
||||
if (size == -1) {
|
||||
events_.push_back(std::move(*processed));
|
||||
return;
|
||||
} else {
|
||||
events_.push_front(std::move(*processed));
|
||||
continue;
|
||||
}
|
||||
events_.push_back(std::move(*processed));
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
@ -382,54 +309,46 @@ struct CachingHostAllocatorImpl {
|
||||
auto index = size_index(block->size_);
|
||||
std::lock_guard<std::mutex> g(free_list_[index].mutex_);
|
||||
free_list_[index].list_.push_back(block);
|
||||
if (size != -1) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TaskThreadPool* getBackgroundThreadPool() {
|
||||
static TaskThreadPool* pool = new TaskThreadPool(1);
|
||||
return pool;
|
||||
/* These following functions are runtime-related. */
|
||||
|
||||
// Allocate page-locked memory on the host.
|
||||
virtual void allocate_host_memory(size_t size, void** ptr) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false, "Not implemented for allocate_host_memory");
|
||||
}
|
||||
|
||||
/* These following functions are runtime-related. */
|
||||
// Free block and release the pointer contained in block.
|
||||
virtual void free_block(B* block) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for free_block");
|
||||
}
|
||||
|
||||
// Allocate page-locked memory on the host.
|
||||
virtual void allocate_host_memory(size_t size, void** ptr) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false, "Not implemented for allocate_host_memory");
|
||||
}
|
||||
// Record an event on stream and store event into events.
|
||||
virtual void record_stream(std::optional<std::vector<E>>& events, S stream) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for record_stream");
|
||||
}
|
||||
|
||||
// Free block and release the pointer contained in block.
|
||||
virtual void free_block(B* block) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for free_block");
|
||||
}
|
||||
// Query event if it is completed.
|
||||
virtual bool query_event(E& event) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event");
|
||||
}
|
||||
|
||||
// Record an event on stream and store event into events.
|
||||
virtual void record_stream(std::optional<std::vector<E>>& events, S stream) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for record_stream");
|
||||
}
|
||||
alignas(64) std::mutex blocks_mutex_;
|
||||
ska::flat_hash_set<B*> blocks_; // block list
|
||||
ska::flat_hash_map<void*, B*> ptr_to_block_;
|
||||
|
||||
// Query event if it is completed.
|
||||
virtual bool query_event(E& event) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event");
|
||||
}
|
||||
// We keep free list as a vector of free lists, one for each power of two
|
||||
// size. This allows us to quickly find a free block of the right size.
|
||||
// We use deque to store per size free list and guard the list with its own
|
||||
// mutex.
|
||||
alignas(64) std::vector<FreeBlockList<B>> free_list_ = std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
|
||||
|
||||
alignas(64) std::mutex blocks_mutex_;
|
||||
ska::flat_hash_set<B*> blocks_; // block list
|
||||
ska::flat_hash_map<void*, B*> ptr_to_block_;
|
||||
|
||||
// We keep free list as a vector of free lists, one for each power of two
|
||||
// size. This allows us to quickly find a free block of the right size.
|
||||
// We use deque to store per size free list and guard the list with its own
|
||||
// mutex.
|
||||
alignas(64) std::vector<FreeBlockList<B>> free_list_ = std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
|
||||
|
||||
alignas(64) std::mutex events_mutex_;
|
||||
std::deque<std::pair<E, B*>> events_; // event queue paired with block
|
||||
};
|
||||
alignas(64) std::mutex events_mutex_;
|
||||
std::deque<std::pair<E, B*>> events_; // event queue paired with block
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct CachingHostAllocatorInterface : public at::Allocator {
|
||||
|
@ -45,7 +45,7 @@ private:
|
||||
c10::impl::LocalDispatchKeySet saved_;
|
||||
};
|
||||
|
||||
void pythonFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
|
||||
void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
|
||||
TORCH_INTERNAL_ASSERT(tls_on_entry.has_value());
|
||||
// c10::impl::ForceDispatchKeyGuard dispatcher_guard(tls_on_entry.value());
|
||||
// StashTLSOnEntryGuard stash_guard;
|
||||
@ -68,20 +68,12 @@ void pythonFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_
|
||||
// we actually run dispatch(), we will take out PyObjects in the context
|
||||
// of that interpreter, and this will ensure that everyone is on the same
|
||||
// interpreter.
|
||||
bool tensors_with_python_key_present = false;
|
||||
c10::impl::PyInterpreter* interpreter = nullptr;
|
||||
for (const auto& ivalue : torch::jit::last(*stack, num_arguments)) {
|
||||
if (ivalue.isTensor()) {
|
||||
auto* t = ivalue.unsafeToTensorImpl();
|
||||
if (t->key_set().has(c10::DispatchKey::Python)) {
|
||||
tensors_with_python_key_present = true;
|
||||
}
|
||||
|
||||
if (!interpreter) {
|
||||
auto* t_interpreter = t->pyobj_slot()->pyobj_interpreter();
|
||||
if (t_interpreter) {
|
||||
interpreter = t_interpreter;
|
||||
}
|
||||
auto* interpreter = ivalue.unsafeToTensorImpl()->pyobj_slot()->pyobj_interpreter();
|
||||
if (interpreter) {
|
||||
(*interpreter)->dispatch(op, stack);
|
||||
return;
|
||||
}
|
||||
} else if (ivalue.isTensorList() || ivalue.isOptionalTensorList()) {
|
||||
// NB: use toListRef as it doesn't induce refcount bumps (toTensorListRef
|
||||
@ -90,43 +82,14 @@ void pythonFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_
|
||||
if (nv.isNone()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto* t = nv.unsafeToTensorImpl();
|
||||
if (t->key_set().has(c10::DispatchKey::Python)) {
|
||||
tensors_with_python_key_present = true;
|
||||
}
|
||||
|
||||
if (!interpreter) {
|
||||
auto* t_interpreter = t->pyobj_slot()->pyobj_interpreter();
|
||||
if (t_interpreter) {
|
||||
interpreter = t_interpreter;
|
||||
}
|
||||
auto* interpreter = nv.unsafeToTensorImpl()->pyobj_slot()->pyobj_interpreter();
|
||||
if (interpreter) {
|
||||
(*interpreter)->dispatch(op, stack);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (interpreter) {
|
||||
if (tensors_with_python_key_present) {
|
||||
(*interpreter)->dispatch(op, stack);
|
||||
} else {
|
||||
// At this point, there are no modes in the stack and no tensors with the python key.
|
||||
// so disable the python key before redispatching.
|
||||
// See https://github.com/pytorch/pytorch/issues/136565
|
||||
c10::DispatchKeySet keyset = dispatch_keys.remove(c10::DispatchKey::Python);
|
||||
|
||||
// Remove Python key from the included set as well (modes add it there).
|
||||
c10::impl::LocalDispatchKeySet local_keyset = c10::impl::tls_local_dispatch_key_set();
|
||||
c10::impl::ForceDispatchKeyGuard no_python_guard(
|
||||
local_keyset.included_.remove(c10::DispatchKey::Python),
|
||||
local_keyset.excluded_
|
||||
);
|
||||
|
||||
op.redispatchBoxed(keyset, stack);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
TORCH_INTERNAL_ASSERT(0, "Hit Python dispatch key but no arguments had PyInterpreter (no tensor args?)");
|
||||
}
|
||||
|
||||
|
@ -78,7 +78,7 @@ struct VecReduceAllSIMD<float, Op> {
|
||||
#endif // defined(CPU_CAPABILITY_AVX512)
|
||||
#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE)
|
||||
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && !defined(CPU_CAPABILITY_SVE)
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
|
||||
template <typename Op>
|
||||
struct VecReduceAllSIMD<float, Op> {
|
||||
static inline float apply(const Op& vec_fun, const Vectorized<float>& acc_vec) {
|
||||
|
@ -5,10 +5,6 @@
|
||||
#elif defined(__clang__) && (defined(__ARM_NEON__) || defined(__aarch64__))
|
||||
/* Clang-compatible compiler, targeting arm neon */
|
||||
#include <arm_neon.h>
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
/* CLANG-compatible compiler, targeting ARM with SVE */
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
#elif defined(_MSC_VER)
|
||||
/* Microsoft C/C++-compatible compiler */
|
||||
#include <intrin.h>
|
||||
@ -21,10 +17,6 @@
|
||||
#elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__))
|
||||
/* GCC-compatible compiler, targeting ARM with NEON */
|
||||
#include <arm_neon.h>
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
/* GCC-compatible compiler, targeting ARM with SVE */
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
#if defined (MISSING_ARM_VLD1)
|
||||
#include <ATen/cpu/vec/vec256/missing_vld1_neon.h>
|
||||
#elif defined (MISSING_ARM_VST1)
|
||||
|
@ -1,63 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
// Define the data type of VLS(vector-length specific).
|
||||
typedef svbool_t vls_pred_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
|
||||
typedef svint8_t vls_int8_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
|
||||
typedef svint16_t vls_int16_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
|
||||
typedef svint32_t vls_int32_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
|
||||
typedef svint64_t vls_int64_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
|
||||
typedef svuint8_t vls_uint8_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
|
||||
typedef svuint16_t vls_uint16_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
|
||||
typedef svuint32_t vls_uint32_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
|
||||
typedef svuint64_t vls_uint64_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
|
||||
typedef svfloat16_t vls_float16_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
|
||||
typedef svfloat32_t vls_float32_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
|
||||
typedef svfloat64_t vls_float64_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
|
||||
|
||||
#define ptrue svptrue_b8()
|
||||
#define ZERO_S8 svdup_n_s8(0)
|
||||
#define ZERO_S16 svdup_n_s16(0)
|
||||
#define ZERO_S32 svdup_n_s32(0)
|
||||
#define ZERO_S64 svdup_n_s64(0)
|
||||
#define ZERO_U8 svdup_n_u8(0)
|
||||
#define ZERO_U16 svdup_n_u16(0)
|
||||
#define ZERO_U32 svdup_n_u32(0)
|
||||
#define ZERO_U64 svdup_n_u64(0)
|
||||
#define ZERO_F16 svdup_n_f16(0.f)
|
||||
#define ZERO_F32 svdup_n_f32(0.f)
|
||||
#define ZERO_F64 svdup_n_f64(0.0)
|
||||
#define ONE_S8 svdup_n_s8(1)
|
||||
#define ONE_S16 svdup_n_s16(1)
|
||||
#define ONE_S32 svdup_n_s32(1)
|
||||
#define ONE_S64 svdup_n_s64(1)
|
||||
#define ONE_U8 svdup_n_u8(1)
|
||||
#define ONE_U16 svdup_n_u16(1)
|
||||
#define ONE_U32 svdup_n_u32(1)
|
||||
#define ONE_U64 svdup_n_u64(1)
|
||||
#define ONE_F16 svdup_n_f16(1.f)
|
||||
#define ONE_F32 svdup_n_f32(1.f)
|
||||
#define ONE_F64 svdup_n_f64(1.0)
|
||||
#define ALL_S8_TRUE_MASK svdup_n_s8(0xff)
|
||||
#define ALL_S8_FALSE_MASK svdup_n_s8(0x0)
|
||||
#define ALL_S16_TRUE_MASK svdup_n_s16(0xffff)
|
||||
#define ALL_S16_FALSE_MASK svdup_n_s16(0x0)
|
||||
#define ALL_S32_TRUE_MASK svdup_n_s32(0xffffffff)
|
||||
#define ALL_S32_FALSE_MASK svdup_n_s32(0x0)
|
||||
#define ALL_S64_TRUE_MASK svdup_n_s64(0xffffffffffffffff)
|
||||
#define ALL_S64_FALSE_MASK svdup_n_s64(0x0)
|
||||
#define ALL_U8_TRUE_MASK svdup_n_u8(0x01)
|
||||
#define ALL_U8_FALSE_MASK svdup_n_u8(0x00)
|
||||
#define ALL_F16_TRUE_MASK svreinterpret_f16_s16(ALL_S16_TRUE_MASK)
|
||||
#define ALL_F16_FALSE_MASK svreinterpret_f16_s16(ALL_S16_FALSE_MASK)
|
||||
#define ALL_F32_TRUE_MASK svreinterpret_f32_s32(ALL_S32_TRUE_MASK)
|
||||
#define ALL_F32_FALSE_MASK svreinterpret_f32_s32(ALL_S32_FALSE_MASK)
|
||||
#define ALL_F64_TRUE_MASK svreinterpret_f64_s64(ALL_S64_TRUE_MASK)
|
||||
#define ALL_F64_FALSE_MASK svreinterpret_f64_s64(ALL_S64_FALSE_MASK)
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE)
|
@ -1,176 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
// DO NOT DEFINE STATIC DATA IN THIS HEADER!
|
||||
// See Note [Do not compile initializers with SVE]
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
#include <ATen/cpu/vec/sve/vec_float.h>
|
||||
#include <ATen/cpu/vec/sve/vec_double.h>
|
||||
#include <ATen/cpu/vec/sve/vec_int.h>
|
||||
#include <ATen/cpu/vec/sve/vec_qint.h>
|
||||
#endif
|
||||
|
||||
namespace at {
|
||||
namespace vec {
|
||||
// Note [CPU_CAPABILITY namespace]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// This header, and all of its subheaders, will be compiled with
|
||||
// different architecture flags for each supported set of vector
|
||||
// intrinsics. So we need to make sure they aren't inadvertently
|
||||
// linked together. We do this by declaring objects in an `inline
|
||||
// namespace` which changes the name mangling, but can still be
|
||||
// accessed as `at::vec`.
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
template<>
|
||||
inline Vectorized<float> cast<float, double>(const Vectorized<double>& src) {
|
||||
return svreinterpret_f32_f64(src);
|
||||
}
|
||||
|
||||
template<>
|
||||
inline Vectorized<double> cast<double, float>(const Vectorized<float>& src) {
|
||||
return svreinterpret_f64_f32(src);
|
||||
}
|
||||
|
||||
#define DEFINE_FLOAT_INT_CAST(int_t, int_bit, float_t, float_bit) \
|
||||
template<> \
|
||||
inline Vectorized<int_t> cast<int_t, float_t>(const Vectorized<float_t>& src) { \
|
||||
return svreinterpret_s##int_bit##_f##float_bit(src); \
|
||||
} \
|
||||
template<> \
|
||||
inline Vectorized<float_t> cast<float_t, int_t>(const Vectorized<int_t>& src) { \
|
||||
return svreinterpret_f##float_bit##_s##int_bit(src); \
|
||||
}
|
||||
|
||||
DEFINE_FLOAT_INT_CAST(int64_t, 64, double, 64)
|
||||
DEFINE_FLOAT_INT_CAST(int32_t, 32, double, 64)
|
||||
DEFINE_FLOAT_INT_CAST(int16_t, 16, double, 64)
|
||||
DEFINE_FLOAT_INT_CAST(int64_t, 64, float, 32)
|
||||
DEFINE_FLOAT_INT_CAST(int32_t, 32, float, 32)
|
||||
DEFINE_FLOAT_INT_CAST(int16_t, 16, float, 32)
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
template<int64_t scale = 1>
|
||||
std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
|
||||
inline gather(const double* base_addr, const Vectorized<int64_t>& vindex_) {
|
||||
svint64_t vindex = svasrd_n_s64_x(ptrue, svmul_s64_x(ptrue, vindex_, svdup_n_s64(scale)), 3);
|
||||
return svld1_gather_s64index_f64(ptrue, base_addr, vindex);
|
||||
}
|
||||
|
||||
template<int64_t scale = 1>
|
||||
std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<float>>
|
||||
inline gather(const float* base_addr, const Vectorized<int32_t>& vindex_) {
|
||||
svint32_t vindex = svasrd_n_s32_x(ptrue, svmul_s32_x(ptrue, vindex_, svdup_n_s32(scale)), 2);
|
||||
return svld1_gather_s32index_f32(ptrue, base_addr, vindex);
|
||||
}
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
template<int64_t scale = 1>
|
||||
std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
|
||||
inline mask_gather(const Vectorized<double>& src, const double* base_addr,
|
||||
const Vectorized<int64_t>& vindex_, const Vectorized<double>& mask_) {
|
||||
svbool_t mask = svcmpeq_s64(ptrue, svreinterpret_s64_f64(mask_),
|
||||
ALL_S64_TRUE_MASK);
|
||||
svint64_t vindex = svasrd_n_s64_x(ptrue, svmul_s64_x(ptrue, vindex_, svdup_n_s64(scale)), 3);
|
||||
return svsel_f64(mask, svld1_gather_s64index_f64(mask, base_addr, vindex), src);
|
||||
}
|
||||
|
||||
template<int64_t scale = 1>
|
||||
std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<float>>
|
||||
inline mask_gather(const Vectorized<float>& src, const float* base_addr,
|
||||
const Vectorized<int32_t>& vindex_, const Vectorized<float>& mask_) {
|
||||
svbool_t mask = svcmpeq_s32(ptrue, svreinterpret_s32_f32(mask_),
|
||||
ALL_S32_TRUE_MASK);
|
||||
svint32_t vindex = svasrd_n_s32_x(ptrue, svmul_s32_x(ptrue, vindex_, svdup_n_s32(scale)), 2);
|
||||
return svsel_f32(mask, svld1_gather_s32index_f32(mask, base_addr, vindex), src);
|
||||
}
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
// Only works for inputs in the range: [-2^51, 2^51]
|
||||
// From: https://stackoverflow.com/a/41148578
|
||||
template<>
|
||||
Vectorized<int64_t>
|
||||
inline convert_to_int_of_same_size<double>(const Vectorized<double> &src) {
|
||||
svfloat64_t x = svadd_f64_x(ptrue, src, svdup_n_f64(0x0018000000000000));
|
||||
return svsub_s64_x(ptrue,
|
||||
svreinterpret_s64_f64(x),
|
||||
svreinterpret_s64_f64(svdup_n_f64(0x0018000000000000)));
|
||||
}
|
||||
|
||||
template<>
|
||||
Vectorized<int32_t>
|
||||
inline convert_to_int_of_same_size<float>(const Vectorized<float> &src) {
|
||||
return svcvt_s32_f32_x(ptrue, src);
|
||||
}
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
template <>
|
||||
std::pair<Vectorized<double>, Vectorized<double>>
|
||||
inline interleave2<double>(const Vectorized<double>& a, const Vectorized<double>& b) {
|
||||
// inputs:
|
||||
// a = {a0, a1, a3, a3}
|
||||
// b = {b0, b1, b2, b3}
|
||||
// group cols crossing lanes:
|
||||
// return {a0, b0, a1, b1}
|
||||
// {a2, b2, a3, b3}
|
||||
return std::make_pair(Vectorized<double>(svzip1_f64(a, b)),
|
||||
Vectorized<double>(svzip2_f64(a, b)));
|
||||
}
|
||||
|
||||
template <>
|
||||
std::pair<Vectorized<float>, Vectorized<float>>
|
||||
inline interleave2<float>(const Vectorized<float>& a, const Vectorized<float>& b) {
|
||||
// inputs:
|
||||
// a = {a0, a1, a2, a3, a4, a5, a6, a7}
|
||||
// b = {b0, b1, b2, b3, b4, b5, b6, b7}
|
||||
// group cols crossing lanes:
|
||||
// return {a0, b0, a1, b1, a2, b2, a3, b3}
|
||||
// {a4, b4, a5, b5, a6, b6, a7, b7}
|
||||
return std::make_pair(Vectorized<float>(svzip1_f32(a, b)),
|
||||
Vectorized<float>(svzip2_f32(a, b)));
|
||||
}
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
template <>
|
||||
std::pair<Vectorized<double>, Vectorized<double>>
|
||||
inline deinterleave2<double>(const Vectorized<double>& a, const Vectorized<double>& b) {
|
||||
// inputs:
|
||||
// a = {a0, b0, a1, b1}
|
||||
// b = {a2, b2, a3, b3}
|
||||
// swap lanes:
|
||||
// return {a0, a1, a2, a3}
|
||||
// {b0, b1, b2, b3}
|
||||
return std::make_pair(Vectorized<double>(svuzp1_f64(a, b)),
|
||||
Vectorized<double>(svuzp2_f64(a, b)));
|
||||
}
|
||||
|
||||
template <>
|
||||
std::pair<Vectorized<float>, Vectorized<float>>
|
||||
inline deinterleave2<float>(const Vectorized<float>& a, const Vectorized<float>& b) {
|
||||
// inputs:
|
||||
// a = {a0, b0, a1, b1, a2, b2, a3, b3}
|
||||
// b = {a4, b4, a5, b5, a6, b6, a7, b7}
|
||||
// swap lanes:
|
||||
// return {a0, a1, a2, a3, a4, a5, a6, a7}
|
||||
// {b0, b1, b2, b3, b4, b5, b6, b7}
|
||||
return std::make_pair(Vectorized<float>(svuzp1_f32(a, b)),
|
||||
Vectorized<float>(svuzp2_f32(a, b)));
|
||||
}
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
}}}
|
@ -1,505 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <cmath>
|
||||
#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
|
||||
#include <sleef.h>
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
|
||||
#else
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
|
||||
#endif
|
||||
namespace at {
|
||||
namespace vec {
|
||||
// Note [CPU_CAPABILITY namespace]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// This header, and all of its subheaders, will be compiled with
|
||||
// different architecture flags for each supported set of vector
|
||||
// intrinsics. So we need to make sure they aren't inadvertently
|
||||
// linked together. We do this by declaring objects in an `inline
|
||||
// namespace` which changes the name mangling, but can still be
|
||||
// accessed as `at::vec`.
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
template <> class Vectorized<double> {
|
||||
private:
|
||||
vls_float64_t values;
|
||||
public:
|
||||
using value_type = double;
|
||||
using size_type = int;
|
||||
static constexpr size_type size() {
|
||||
return VECTOR_WIDTH / sizeof(double);
|
||||
}
|
||||
Vectorized() {}
|
||||
Vectorized(svfloat64_t v) : values(v) {}
|
||||
Vectorized(double val) {
|
||||
values = svdup_n_f64(val);
|
||||
}
|
||||
template<typename... Args,
|
||||
typename = std::enable_if_t<(sizeof...(Args) == size())>>
|
||||
Vectorized(Args... vals) {
|
||||
__at_align__ double buffer[size()] = { vals... };
|
||||
values = svld1_f64(ptrue, buffer);
|
||||
}
|
||||
operator svfloat64_t() const {
|
||||
return values;
|
||||
}
|
||||
static Vectorized<double> blendv(const Vectorized<double>& a, const Vectorized<double>& b,
|
||||
const Vectorized<double>& mask_) {
|
||||
svbool_t mask = svcmpeq_s64(ptrue, svreinterpret_s64_f64(mask_),
|
||||
ALL_S64_TRUE_MASK);
|
||||
return svsel_f64(mask, b, a);
|
||||
}
|
||||
template<typename step_t>
|
||||
static Vectorized<double> arange(double base = 0., step_t step = static_cast<step_t>(1)) {
|
||||
__at_align__ double buffer[size()];
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
buffer[i] = base + i * step;
|
||||
}
|
||||
return svld1_f64(ptrue, buffer);
|
||||
}
|
||||
static Vectorized<double> set(const Vectorized<double>& a, const Vectorized<double>& b,
|
||||
int64_t count = size()) {
|
||||
if (count == 0) {
|
||||
return a;
|
||||
} else if (count < size()) {
|
||||
return svsel_f64(svwhilelt_b64(0ull, count), b, a);
|
||||
}
|
||||
return b;
|
||||
}
|
||||
static Vectorized<double> loadu(const void* ptr, int64_t count = size()) {
|
||||
if (count == size())
|
||||
return svld1_f64(ptrue, reinterpret_cast<const double*>(ptr));
|
||||
svbool_t pg = svwhilelt_b64(0ull, count);
|
||||
return svld1_f64(pg, reinterpret_cast<const double*>(ptr));
|
||||
}
|
||||
void store(void* ptr, int64_t count = size()) const {
|
||||
if (count == size()) {
|
||||
svst1_f64(ptrue, reinterpret_cast<double*>(ptr), values);
|
||||
} else {
|
||||
svbool_t pg = svwhilelt_b64(0ull, count);
|
||||
svst1_f64(pg, reinterpret_cast<double*>(ptr), values);
|
||||
}
|
||||
}
|
||||
const double& operator[](int idx) const = delete;
|
||||
double& operator[](int idx) = delete;
|
||||
int64_t zero_mask() const {
|
||||
// returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit
|
||||
int64_t mask = 0;
|
||||
__at_align__ int64_t mask_array[size()];
|
||||
|
||||
svbool_t svbool_mask = svcmpeq_f64(ptrue, values, ZERO_F64);
|
||||
svst1_s64(ptrue, mask_array, svsel_s64(svbool_mask,
|
||||
ALL_S64_TRUE_MASK,
|
||||
ALL_S64_FALSE_MASK));
|
||||
for (int64_t i = 0; i < size(); ++i) {
|
||||
if (mask_array[i]) mask |= (1ull << i);
|
||||
}
|
||||
return mask;
|
||||
}
|
||||
Vectorized<double> isnan() const {
|
||||
// NaN check
|
||||
svbool_t mask = svcmpuo_f64(ptrue, values, ZERO_F64);
|
||||
return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK);
|
||||
}
|
||||
bool has_inf_nan() const {
|
||||
return svptest_any(ptrue, svcmpuo_f64(ptrue, svsub_f64_x(ptrue, values, values), ZERO_F64));
|
||||
}
|
||||
Vectorized<double> map(double (*f)(double)) const {
|
||||
__at_align__ double tmp[size()];
|
||||
store(tmp);
|
||||
for (int64_t i = 0; i < size(); ++i) {
|
||||
tmp[i] = f(tmp[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
}
|
||||
Vectorized<double> abs() const {
|
||||
return svabs_f64_x(ptrue, values);
|
||||
}
|
||||
Vectorized<double> angle() const {
|
||||
const auto nan_vec = svdup_n_f64(NAN);
|
||||
const auto nan_mask = svcmpuo_f64(ptrue, values, ZERO_F64);
|
||||
const auto pi = svdup_n_f64(c10::pi<double>);
|
||||
|
||||
const auto neg_mask = svcmplt_f64(ptrue, values, ZERO_F64);
|
||||
auto angle = svsel_f64(neg_mask, pi, ZERO_F64);
|
||||
angle = svsel_f64(nan_mask, nan_vec, angle);
|
||||
return angle;
|
||||
}
|
||||
Vectorized<double> real() const {
|
||||
return *this;
|
||||
}
|
||||
Vectorized<double> imag() const {
|
||||
return Vectorized<double>(0.0);
|
||||
}
|
||||
Vectorized<double> conj() const {
|
||||
return *this;
|
||||
}
|
||||
Vectorized<double> acos() const {
|
||||
return USE_SLEEF(Vectorized<double>(Sleef_acosdx_u10sve(values)),map(std::acos));
|
||||
}
|
||||
Vectorized<double> acosh() const {
|
||||
return USE_SLEEF( Vectorized<double>(Sleef_acoshdx_u10sve(values)),map(std::acosh));
|
||||
}
|
||||
Vectorized<double> asin() const {
|
||||
return USE_SLEEF(Vectorized<double>(Sleef_asindx_u10sve(values)),map(std::asin));
|
||||
}
|
||||
Vectorized<double> atan() const {
|
||||
return USE_SLEEF(Vectorized<double>(Sleef_atandx_u10sve(values)),map(std::atan));
|
||||
}
|
||||
Vectorized<double> atanh() const {
|
||||
return USE_SLEEF(Vectorized<double>(Sleef_atanhdx_u10sve(values)),map(std::atanh));
|
||||
}
|
||||
Vectorized<double> atan2(const Vectorized<double> &b) const {
|
||||
USE_SLEEF({return Vectorized<double>(Sleef_atan2dx_u10sve(values, b));},
|
||||
{
|
||||
__at_align__ double tmp[size()];
|
||||
__at_align__ double tmp_b[size()];
|
||||
store(tmp);
|
||||
b.store(tmp_b);
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = std::atan2(tmp[i], tmp_b[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
}
|
||||
)
|
||||
}
|
||||
Vectorized<double> copysign(const Vectorized<double> &sign) const {
|
||||
USE_SLEEF( {return Vectorized<double>(Sleef_copysigndx_sve(values, sign));},
|
||||
{
|
||||
__at_align__ double tmp[size()];
|
||||
__at_align__ double tmp_sign[size()];
|
||||
store(tmp);
|
||||
sign.store(tmp_sign);
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = std::copysign(tmp[i], tmp_sign[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
}
|
||||
)
|
||||
}
|
||||
Vectorized<double> erf() const {
|
||||
return USE_SLEEF(Vectorized<double>(Sleef_erfdx_u10sve(values)),map(std::erf));
|
||||
}
|
||||
Vectorized<double> erfc() const {
|
||||
return USE_SLEEF(Vectorized<double>(Sleef_erfcdx_u15sve(values)),map(std::erfc));
|
||||
}
|
||||
Vectorized<double> erfinv() const {
|
||||
return map(calc_erfinv);
|
||||
}
|
||||
Vectorized<double> exp() const {
|
||||
return USE_SLEEF(Vectorized<double>(Sleef_expdx_u10sve(values)),map(std::exp));
|
||||
}
|
||||
Vectorized<double> exp2() const {
|
||||
return USE_SLEEF(Vectorized<double>(Sleef_exp2dx_u10sve(values)),map(std::exp2));
|
||||
}
|
||||
Vectorized<double> expm1() const {
|
||||
return USE_SLEEF(Vectorized<double>(Sleef_expm1dx_u10sve(values)),map(std::expm1));
|
||||
}
|
||||
Vectorized<double> exp_u20() const {
|
||||
return exp();
|
||||
}
|
||||
Vectorized<double> fmod(const Vectorized<double>& q) const {
|
||||
USE_SLEEF({return Vectorized<double>(Sleef_fmoddx_sve(values, q));},
|
||||
{
|
||||
__at_align__ double tmp[size()];
|
||||
__at_align__ double tmp_q[size()];
|
||||
store(tmp);
|
||||
q.store(tmp_q);
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = std::fmod(tmp[i], tmp_q[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
}
|
||||
)
|
||||
}
|
||||
Vectorized<double> hypot(const Vectorized<double> &b) const {
|
||||
USE_SLEEF({return Vectorized<double>(Sleef_hypotdx_u05sve(values, b));},
|
||||
{
|
||||
__at_align__ double tmp[size()];
|
||||
__at_align__ double tmp_b[size()];
|
||||
store(tmp);
|
||||
b.store(tmp_b);
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = std::hypot(tmp[i], tmp_b[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
})
|
||||
}
|
||||
Vectorized<double> i0() const {
|
||||
return map(calc_i0);
|
||||
}
|
||||
Vectorized<double> i0e() const {
|
||||
return map(calc_i0e);
|
||||
}
|
||||
Vectorized<double> digamma() const {
|
||||
return map(calc_digamma);
|
||||
}
|
||||
Vectorized<double> igamma(const Vectorized<double> &x) const {
|
||||
__at_align__ double tmp[size()];
|
||||
__at_align__ double tmp_x[size()];
|
||||
store(tmp);
|
||||
x.store(tmp_x);
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
}
|
||||
Vectorized<double> igammac(const Vectorized<double> &x) const {
|
||||
__at_align__ double tmp[size()];
|
||||
__at_align__ double tmp_x[size()];
|
||||
store(tmp);
|
||||
x.store(tmp_x);
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
}
|
||||
Vectorized<double> nextafter(const Vectorized<double> &b) const {
|
||||
USE_SLEEF(
|
||||
{
|
||||
return Vectorized<double>(Sleef_nextafterdx_sve(values, b));
|
||||
},
|
||||
{
|
||||
__at_align__ double tmp[size()];
|
||||
__at_align__ double tmp_b[size()];
|
||||
store(tmp);
|
||||
b.store(tmp_b);
|
||||
for (int64_t i = 0; i < size(); ++i) {
|
||||
tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
}
|
||||
)
|
||||
}
|
||||
Vectorized<double> log() const {
|
||||
return USE_SLEEF(Vectorized<double>(Sleef_logdx_u10sve(values)),map(std::log));
|
||||
}
|
||||
Vectorized<double> log2() const {
|
||||
return USE_SLEEF(Vectorized<double>(Sleef_log2dx_u10sve(values)),map(std::log2));
|
||||
}
|
||||
Vectorized<double> log10() const {
|
||||
return USE_SLEEF(Vectorized<double>(Sleef_log10dx_u10sve(values)),map(std::log10));
|
||||
}
|
||||
Vectorized<double> log1p() const {
|
||||
return USE_SLEEF(Vectorized<double>(Sleef_log1pdx_u10sve(values)),map(std::log1p));
|
||||
}
|
||||
Vectorized<double> frac() const;
|
||||
Vectorized<double> sin() const {
|
||||
return USE_SLEEF( Vectorized<double>(Sleef_sindx_u10sve(values)),map(std::sin));
|
||||
}
|
||||
Vectorized<double> sinh() const {
|
||||
return USE_SLEEF(Vectorized<double>(Sleef_sinhdx_u10sve(values)),map(std::sinh));
|
||||
}
|
||||
Vectorized<double> cos() const {
|
||||
return USE_SLEEF(Vectorized<double>(Sleef_cosdx_u10sve(values)),map(std::cos));
|
||||
}
|
||||
Vectorized<double> cosh() const {
|
||||
return USE_SLEEF( Vectorized<double>(Sleef_coshdx_u10sve(values)),map(std::cosh));
|
||||
}
|
||||
Vectorized<double> ceil() const {
|
||||
return svrintp_f64_x(ptrue, values);
|
||||
}
|
||||
Vectorized<double> floor() const {
|
||||
return svrintm_f64_x(ptrue, values);
|
||||
}
|
||||
Vectorized<double> neg() const {
|
||||
return svneg_f64_x(ptrue, values);
|
||||
}
|
||||
Vectorized<double> round() const {
|
||||
return svrinti_f64_x(ptrue, values);
|
||||
}
|
||||
Vectorized<double> tan() const {
|
||||
return USE_SLEEF( Vectorized<double>(Sleef_tandx_u10sve(values)),map(std::tan));
|
||||
}
|
||||
Vectorized<double> tanh() const {
|
||||
return USE_SLEEF( Vectorized<double>(Sleef_tanhdx_u10sve(values)),map(std::tanh));
|
||||
}
|
||||
Vectorized<double> trunc() const {
|
||||
return svrintz_f64_x(ptrue, values);
|
||||
}
|
||||
Vectorized<double> lgamma() const {
|
||||
return USE_SLEEF( Vectorized<double>(Sleef_lgammadx_u10sve(values)),map(std::lgamma));
|
||||
}
|
||||
Vectorized<double> sqrt() const {
|
||||
return svsqrt_f64_x(ptrue, values);
|
||||
}
|
||||
Vectorized<double> reciprocal() const {
|
||||
return svdivr_f64_x(ptrue, values, ONE_F64);
|
||||
}
|
||||
Vectorized<double> rsqrt() const {
|
||||
return svdivr_f64_x(ptrue, svsqrt_f64_x(ptrue, values), ONE_F64);
|
||||
}
|
||||
Vectorized<double> pow(const Vectorized<double> &b) const {
|
||||
USE_SLEEF( {return Vectorized<double>(Sleef_powdx_u10sve(values, b));},
|
||||
{
|
||||
__at_align__ double tmp[size()];
|
||||
__at_align__ double tmp_b[size()];
|
||||
store(tmp);
|
||||
b.store(tmp_b);
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = std::pow(tmp[i], tmp_b[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
}
|
||||
)
|
||||
}
|
||||
// Comparison using the _CMP_**_OQ predicate.
|
||||
// `O`: get false if an operand is NaN
|
||||
// `Q`: do not raise if an operand is NaN
|
||||
Vectorized<double> operator==(const Vectorized<double>& other) const {
|
||||
svbool_t mask = svcmpeq_f64(ptrue, values, other);
|
||||
return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK);
|
||||
}
|
||||
|
||||
Vectorized<double> operator!=(const Vectorized<double>& other) const {
|
||||
svbool_t mask = svcmpne_f64(ptrue, values, other);
|
||||
return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK);
|
||||
}
|
||||
|
||||
Vectorized<double> operator<(const Vectorized<double>& other) const {
|
||||
svbool_t mask = svcmplt_f64(ptrue, values, other);
|
||||
return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK);
|
||||
}
|
||||
|
||||
Vectorized<double> operator<=(const Vectorized<double>& other) const {
|
||||
svbool_t mask = svcmple_f64(ptrue, values, other);
|
||||
return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK);
|
||||
}
|
||||
|
||||
Vectorized<double> operator>(const Vectorized<double>& other) const {
|
||||
svbool_t mask = svcmpgt_f64(ptrue, values, other);
|
||||
return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK);
|
||||
}
|
||||
|
||||
Vectorized<double> operator>=(const Vectorized<double>& other) const {
|
||||
svbool_t mask = svcmpge_f64(ptrue, values, other);
|
||||
return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK);
|
||||
}
|
||||
|
||||
Vectorized<double> eq(const Vectorized<double>& other) const;
|
||||
Vectorized<double> ne(const Vectorized<double>& other) const;
|
||||
Vectorized<double> gt(const Vectorized<double>& other) const;
|
||||
Vectorized<double> ge(const Vectorized<double>& other) const;
|
||||
Vectorized<double> lt(const Vectorized<double>& other) const;
|
||||
Vectorized<double> le(const Vectorized<double>& other) const;
|
||||
};
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline operator+(const Vectorized<double>& a, const Vectorized<double>& b) {
|
||||
return svadd_f64_x(ptrue, a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline operator-(const Vectorized<double>& a, const Vectorized<double>& b) {
|
||||
return svsub_f64_x(ptrue, a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline operator*(const Vectorized<double>& a, const Vectorized<double>& b) {
|
||||
return svmul_f64_x(ptrue, a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline operator/(const Vectorized<double>& a, const Vectorized<double>& b) {
|
||||
return svdiv_f64_x(ptrue, a, b);
|
||||
}
|
||||
|
||||
// frac. Implement this here so we can use subtraction
|
||||
Vectorized<double> inline Vectorized<double>::frac() const {
|
||||
return *this - this->trunc();
|
||||
}
|
||||
|
||||
// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
|
||||
// either input is a NaN.
|
||||
template <>
|
||||
Vectorized<double> inline maximum(const Vectorized<double>& a, const Vectorized<double>& b) {
|
||||
return svmax_f64_x(ptrue, a, b);
|
||||
}
|
||||
|
||||
// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
|
||||
// either input is a NaN.
|
||||
template <>
|
||||
Vectorized<double> inline minimum(const Vectorized<double>& a, const Vectorized<double>& b) {
|
||||
return svmin_f64_x(ptrue, a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline clamp(const Vectorized<double>& a, const Vectorized<double>& min, const Vectorized<double>& max) {
|
||||
return svmin_f64_x(ptrue, max, svmax_f64_x(ptrue, min, a));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline clamp_max(const Vectorized<double>& a, const Vectorized<double>& max) {
|
||||
return svmin_f64_x(ptrue, max, a);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline clamp_min(const Vectorized<double>& a, const Vectorized<double>& min) {
|
||||
return svmax_f64_x(ptrue, min, a);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline operator&(const Vectorized<double>& a, const Vectorized<double>& b) {
|
||||
return svreinterpret_f64_s64(svand_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b)));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline operator|(const Vectorized<double>& a, const Vectorized<double>& b) {
|
||||
return svreinterpret_f64_s64(svorr_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b)));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline operator^(const Vectorized<double>& a, const Vectorized<double>& b) {
|
||||
return svreinterpret_f64_s64(sveor_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b)));
|
||||
}
|
||||
|
||||
Vectorized<double> inline Vectorized<double>::eq(const Vectorized<double>& other) const {
|
||||
return (*this == other) & Vectorized<double>(1.0);
|
||||
}
|
||||
|
||||
Vectorized<double> inline Vectorized<double>::ne(const Vectorized<double>& other) const {
|
||||
return (*this != other) & Vectorized<double>(1.0);
|
||||
}
|
||||
|
||||
Vectorized<double> inline Vectorized<double>::gt(const Vectorized<double>& other) const {
|
||||
return (*this > other) & Vectorized<double>(1.0);
|
||||
}
|
||||
|
||||
Vectorized<double> inline Vectorized<double>::ge(const Vectorized<double>& other) const {
|
||||
return (*this >= other) & Vectorized<double>(1.0);
|
||||
}
|
||||
|
||||
Vectorized<double> inline Vectorized<double>::lt(const Vectorized<double>& other) const {
|
||||
return (*this < other) & Vectorized<double>(1.0);
|
||||
}
|
||||
|
||||
Vectorized<double> inline Vectorized<double>::le(const Vectorized<double>& other) const {
|
||||
return (*this <= other) & Vectorized<double>(1.0);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const double* src, double* dst, int64_t n) {
|
||||
const int64_t fraction = n % Vectorized<double>::size();
|
||||
#pragma unroll
|
||||
for (int64_t i = 0; i < n - fraction; i += Vectorized<double>::size()) {
|
||||
svst1_f64(ptrue, dst + i, svldnt1_f64(ptrue, src + i));
|
||||
}
|
||||
#pragma unroll
|
||||
for (int64_t i = n - fraction; i < n; i += Vectorized<double>::size()) {
|
||||
svbool_t pg = svwhilelt_b64(i, n);
|
||||
svst1_f64(pg, dst + i, svldnt1_f64(pg, src + i));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline fmadd(const Vectorized<double>& a, const Vectorized<double>& b, const Vectorized<double>& c) {
|
||||
return svmad_f64_x(ptrue, a, b, c);
|
||||
}
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
}}}
|
@ -1,570 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <cmath>
|
||||
#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
|
||||
#include <sleef.h>
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
|
||||
#else
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
|
||||
#endif
|
||||
namespace at {
|
||||
namespace vec {
|
||||
// Note [CPU_CAPABILITY namespace]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// This header, and all of its subheaders, will be compiled with
|
||||
// different architecture flags for each supported set of vector
|
||||
// intrinsics. So we need to make sure they aren't inadvertently
|
||||
// linked together. We do this by declaring objects in an `inline
|
||||
// namespace` which changes the name mangling, but can still be
|
||||
// accessed as `at::vec`.
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
template <> class Vectorized<float> {
|
||||
private:
|
||||
vls_float32_t values;
|
||||
public:
|
||||
using value_type = float;
|
||||
using size_type = int;
|
||||
static constexpr size_type size() {
|
||||
return VECTOR_WIDTH / sizeof(float);
|
||||
}
|
||||
Vectorized() {}
|
||||
Vectorized(svfloat32_t v) : values(v) {}
|
||||
Vectorized(float val) {
|
||||
values = svdup_n_f32(val);
|
||||
}
|
||||
  template<typename... Args,
           typename = std::enable_if_t<(sizeof...(Args) == size())>>
  Vectorized(Args... vals) {
    __at_align__ float buffer[size()] = { vals... };
    values = svld1_f32(ptrue, buffer);
  }
  operator svfloat32_t() const {
    return values;
  }
  static Vectorized<float> blendv(const Vectorized<float>& a, const Vectorized<float>& b,
                                  const Vectorized<float>& mask_) {
    svbool_t mask = svcmpeq_s32(ptrue, svreinterpret_s32_f32(mask_),
                                ALL_S32_TRUE_MASK);
    return svsel_f32(mask, b, a);
  }
  template<typename step_t>
  static Vectorized<float> arange(float base = 0.f, step_t step = static_cast<step_t>(1)) {
    __at_align__ float buffer[size()];
    for (int64_t i = 0; i < size(); i++) {
      buffer[i] = base + i * step;
    }
    return svld1_f32(ptrue, buffer);
  }
  static Vectorized<float> set(const Vectorized<float>& a, const Vectorized<float>& b,
                               int64_t count = size()) {
    if (count == 0) {
      return a;
    } else if (count < size()) {
      return svsel_f32(svwhilelt_b32(0ull, count), b, a);
    }
    return b;
  }
  static Vectorized<float> loadu(const void* ptr, int64_t count = size()) {
    if (count == size())
      return svld1_f32(ptrue, reinterpret_cast<const float*>(ptr));
    svbool_t pg = svwhilelt_b32(0ull, count);
    return svld1_f32(pg, reinterpret_cast<const float*>(ptr));
  }
  void store(void* ptr, int64_t count = size()) const {
    if (count == size()) {
      svst1_f32(ptrue, reinterpret_cast<float*>(ptr), values);
    } else {
      svbool_t pg = svwhilelt_b32(0ull, count);
      svst1_f32(pg, reinterpret_cast<float*>(ptr), values);
    }
  }
  const float& operator[](int idx) const = delete;
  float& operator[](int idx) = delete;
  int64_t zero_mask() const {
    // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit
    int64_t mask = 0;
    __at_align__ int32_t mask_array[size()];

    svbool_t svbool_mask = svcmpeq_f32(ptrue, values, ZERO_F32);
    svst1_s32(ptrue, mask_array, svsel_s32(svbool_mask,
                                           ALL_S32_TRUE_MASK,
                                           ALL_S32_FALSE_MASK));
    for (int64_t i = 0; i < size(); ++i) {
      if (mask_array[i]) mask |= (1ull << i);
    }
    return mask;
  }
  Vectorized<float> isnan() const {
    // NaN check
    svbool_t mask = svcmpuo_f32(ptrue, values, ZERO_F32);
    return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
  }
  bool has_inf_nan() const {
    return svptest_any(ptrue, svcmpuo_f32(ptrue, svsub_f32_x(ptrue, values, values), ZERO_F32));
  }
  Vectorized<float> map(float (*f)(float)) const {
    __at_align__ float tmp[size()];
    store(tmp);
    for (int64_t i = 0; i < size(); ++i) {
      tmp[i] = f(tmp[i]);
    }
    return loadu(tmp);
  }
  Vectorized<float> abs() const {
    return svabs_f32_x(ptrue, values);
  }
  Vectorized<float> angle() const {
    const auto nan_vec = svdup_n_f32(NAN);
    const auto nan_mask = svcmpuo_f32(ptrue, values, ZERO_F32);
    const auto pi = svdup_n_f32(c10::pi<float>);

    const auto neg_mask = svcmplt_f32(ptrue, values, ZERO_F32);
    auto angle = svsel_f32(neg_mask, pi, ZERO_F32);
    angle = svsel_f32(nan_mask, nan_vec, angle);
    return angle;
  }
  Vectorized<float> real() const {
    return values;
  }
  Vectorized<float> imag() const {
    return Vectorized<float>(0.f);
  }
  Vectorized<float> conj() const {
    return values;
  }
  Vectorized<float> acos() const {
    return USE_SLEEF(Vectorized<float>(Sleef_acosfx_u10sve(values)), map(std::acos));
  }
  Vectorized<float> acosh() const {
    return USE_SLEEF(Vectorized<float>(Sleef_acoshfx_u10sve(values)), map(std::acosh));
  }
  Vectorized<float> asin() const {
    return USE_SLEEF(Vectorized<float>(Sleef_asinfx_u10sve(values)), map(std::asin));
  }
  Vectorized<float> atan() const {
    return USE_SLEEF(Vectorized<float>(Sleef_atanfx_u10sve(values)), map(std::atan));
  }
  Vectorized<float> atanh() const {
    return USE_SLEEF(Vectorized<float>(Sleef_atanhfx_u10sve(values)), map(std::atanh));
  }
  Vectorized<float> atan2(const Vectorized<float> &b) const {
    USE_SLEEF({ return Vectorized<float>(Sleef_atan2fx_u10sve(values, b)); },
              {
                __at_align__ float tmp[size()];
                __at_align__ float tmp_b[size()];
                store(tmp);
                b.store(tmp_b);
                for (int64_t i = 0; i < size(); i++) {
                  tmp[i] = std::atan2(tmp[i], tmp_b[i]);
                }
                return loadu(tmp);
              })
  }
  Vectorized<float> copysign(const Vectorized<float> &sign) const {
    USE_SLEEF({ return Vectorized<float>(Sleef_copysignfx_sve(values, sign)); },
              {
                __at_align__ float tmp[size()];
                __at_align__ float tmp_sign[size()];
                store(tmp);
                sign.store(tmp_sign);
                for (int64_t i = 0; i < size(); ++i) {
                  tmp[i] = std::copysign(tmp[i], tmp_sign[i]);
                }
                return loadu(tmp);
              })
  }
  Vectorized<float> erf() const {
    return USE_SLEEF(Vectorized<float>(Sleef_erffx_u10sve(values)), map(std::erf));
  }
  Vectorized<float> erfc() const {
    return USE_SLEEF(Vectorized<float>(Sleef_erfcfx_u15sve(values)), map(std::erfc));
  }
  Vectorized<float> erfinv() const {
    return map(calc_erfinv);
  }
  Vectorized<float> exp() const {
    return USE_SLEEF(Vectorized<float>(Sleef_expfx_u10sve(values)), map(std::exp));
  }
  Vectorized<float> exp2() const {
    return USE_SLEEF(Vectorized<float>(Sleef_exp2fx_u10sve(values)), map(std::exp2));
  }
  Vectorized<float> expm1() const {
    return USE_SLEEF(Vectorized<float>(Sleef_expm1fx_u10sve(values)), map(std::expm1));
  }
  Vectorized<float> exp_u20() const {
    return exp();
  }
  Vectorized<float> fmod(const Vectorized<float>& q) const {
    USE_SLEEF({ return Vectorized<float>(Sleef_fmodfx_sve(values, q)); },
              {
                __at_align__ float tmp[size()];
                __at_align__ float tmp_q[size()];
                store(tmp);
                q.store(tmp_q);
                for (int64_t i = 0; i < size(); ++i) {
                  tmp[i] = std::fmod(tmp[i], tmp_q[i]);
                }
                return loadu(tmp);
              })
  }
  Vectorized<float> hypot(const Vectorized<float> &b) const {
    USE_SLEEF({ return Vectorized<float>(Sleef_hypotfx_u05sve(values, b)); },
              {
                __at_align__ float tmp[size()];
                __at_align__ float tmp_b[size()];
                store(tmp);
                b.store(tmp_b);
                for (int64_t i = 0; i < size(); i++) {
                  tmp[i] = std::hypot(tmp[i], tmp_b[i]);
                }
                return loadu(tmp);
              })
  }
  Vectorized<float> i0() const {
    return map(calc_i0);
  }
  Vectorized<float> i0e() const {
    return map(calc_i0e);
  }
  Vectorized<float> digamma() const {
    return map(calc_digamma);
  }
  Vectorized<float> igamma(const Vectorized<float> &x) const {
    __at_align__ float tmp[size()];
    __at_align__ float tmp_x[size()];
    store(tmp);
    x.store(tmp_x);
    for (int64_t i = 0; i < size(); i++) {
      tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
    }
    return loadu(tmp);
  }
  Vectorized<float> igammac(const Vectorized<float> &x) const {
    __at_align__ float tmp[size()];
    __at_align__ float tmp_x[size()];
    store(tmp);
    x.store(tmp_x);
    for (int64_t i = 0; i < size(); i++) {
      tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
    }
    return loadu(tmp);
  }
  Vectorized<float> nextafter(const Vectorized<float> &b) const {
    USE_SLEEF({ return Vectorized<float>(Sleef_nextafterfx_sve(values, b)); },
              {
                __at_align__ float tmp[size()];
                __at_align__ float tmp_b[size()];
                store(tmp);
                b.store(tmp_b);
                for (int64_t i = 0; i < size(); ++i) {
                  tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
                }
                return loadu(tmp);
              })
  }
  Vectorized<float> log() const {
    return USE_SLEEF(Vectorized<float>(Sleef_logfx_u10sve(values)), map(std::log));
  }
  Vectorized<float> log2() const {
    return USE_SLEEF(Vectorized<float>(Sleef_log2fx_u10sve(values)), map(std::log2));
  }
  Vectorized<float> log10() const {
    return USE_SLEEF(Vectorized<float>(Sleef_log10fx_u10sve(values)), map(std::log10));
  }
  Vectorized<float> log1p() const {
    return USE_SLEEF(Vectorized<float>(Sleef_log1pfx_u10sve(values)), map(std::log1p));
  }
  Vectorized<float> frac() const;
  Vectorized<float> sin() const {
    return USE_SLEEF(Vectorized<float>(Sleef_sinfx_u10sve(values)), map(std::sin));
  }
  Vectorized<float> sinh() const {
    return USE_SLEEF(Vectorized<float>(Sleef_sinhfx_u10sve(values)), map(std::sinh));
  }
  Vectorized<float> cos() const {
    return USE_SLEEF(Vectorized<float>(Sleef_cosfx_u10sve(values)), map(std::cos));
  }
  Vectorized<float> cosh() const {
    return USE_SLEEF(Vectorized<float>(Sleef_coshfx_u10sve(values)), map(std::cosh));
  }
  Vectorized<float> ceil() const {
    return svrintp_f32_x(ptrue, values);
  }
  Vectorized<float> floor() const {
    return svrintm_f32_x(ptrue, values);
  }
  Vectorized<float> neg() const {
    return svneg_f32_x(ptrue, values);
  }
  Vectorized<float> round() const {
    return svrinti_f32_x(ptrue, values);
  }
  Vectorized<float> tan() const {
    return USE_SLEEF(Vectorized<float>(Sleef_tanfx_u10sve(values)), map(std::tan));
  }
  Vectorized<float> tanh() const {
    return USE_SLEEF(Vectorized<float>(Sleef_tanhfx_u10sve(values)), map(std::tanh));
  }
  Vectorized<float> trunc() const {
    return svrintz_f32_x(ptrue, values);
  }
  Vectorized<float> lgamma() const {
    return USE_SLEEF(Vectorized<float>(Sleef_lgammafx_u10sve(values)), map(std::lgamma));
  }
  Vectorized<float> sqrt() const {
    return svsqrt_f32_x(ptrue, values);
  }
  Vectorized<float> reciprocal() const {
    return svdivr_f32_x(ptrue, values, ONE_F32);
  }
  Vectorized<float> rsqrt() const {
    return svdivr_f32_x(ptrue, svsqrt_f32_x(ptrue, values), ONE_F32);
  }
  Vectorized<float> pow(const Vectorized<float> &b) const {
    USE_SLEEF({ return Vectorized<float>(Sleef_powfx_u10sve(values, b)); },
              {
                __at_align__ float tmp[size()];
                __at_align__ float tmp_b[size()];
                store(tmp);
                b.store(tmp_b);
                for (int64_t i = 0; i < size(); i++) {
                  tmp[i] = std::pow(tmp[i], tmp_b[i]);
                }
                return loadu(tmp);
              })
  }
  // Comparison using the _CMP_**_OQ predicate.
  //   `O`: get false if an operand is NaN
  //   `Q`: do not raise if an operand is NaN
  Vectorized<float> operator==(const Vectorized<float>& other) const {
    svbool_t mask = svcmpeq_f32(ptrue, values, other);
    return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
  }

  Vectorized<float> operator!=(const Vectorized<float>& other) const {
    svbool_t mask = svcmpne_f32(ptrue, values, other);
    return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
  }

  Vectorized<float> operator<(const Vectorized<float>& other) const {
    svbool_t mask = svcmplt_f32(ptrue, values, other);
    return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
  }

  Vectorized<float> operator<=(const Vectorized<float>& other) const {
    svbool_t mask = svcmple_f32(ptrue, values, other);
    return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
  }

  Vectorized<float> operator>(const Vectorized<float>& other) const {
    svbool_t mask = svcmpgt_f32(ptrue, values, other);
    return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
  }

  Vectorized<float> operator>=(const Vectorized<float>& other) const {
    svbool_t mask = svcmpge_f32(ptrue, values, other);
    return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
  }

  Vectorized<float> eq(const Vectorized<float>& other) const;
  Vectorized<float> ne(const Vectorized<float>& other) const;
  Vectorized<float> gt(const Vectorized<float>& other) const;
  Vectorized<float> ge(const Vectorized<float>& other) const;
  Vectorized<float> lt(const Vectorized<float>& other) const;
  Vectorized<float> le(const Vectorized<float>& other) const;
};

template <>
Vectorized<float> inline operator+(const Vectorized<float>& a, const Vectorized<float>& b) {
  return svadd_f32_x(ptrue, a, b);
}

template <>
Vectorized<float> inline operator-(const Vectorized<float>& a, const Vectorized<float>& b) {
  return svsub_f32_x(ptrue, a, b);
}

template <>
Vectorized<float> inline operator*(const Vectorized<float>& a, const Vectorized<float>& b) {
  return svmul_f32_x(ptrue, a, b);
}

template <>
Vectorized<float> inline operator/(const Vectorized<float>& a, const Vectorized<float>& b) {
  return svdiv_f32_x(ptrue, a, b);
}

// frac. Implement this here so we can use subtraction
Vectorized<float> inline Vectorized<float>::frac() const {
  return *this - this->trunc();
}

// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
// either input is a NaN.
template <>
Vectorized<float> inline maximum(const Vectorized<float>& a, const Vectorized<float>& b) {
  return svmax_f32_x(ptrue, a, b);
}

// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
// either input is a NaN.
template <>
Vectorized<float> inline minimum(const Vectorized<float>& a, const Vectorized<float>& b) {
  return svmin_f32_x(ptrue, a, b);
}

template <>
Vectorized<float> inline clamp(const Vectorized<float>& a, const Vectorized<float>& min, const Vectorized<float>& max) {
  return svmin_f32_x(ptrue, max, svmax_f32_x(ptrue, min, a));
}

template <>
Vectorized<float> inline clamp_max(const Vectorized<float>& a, const Vectorized<float>& max) {
  return svmin_f32_x(ptrue, max, a);
}

template <>
Vectorized<float> inline clamp_min(const Vectorized<float>& a, const Vectorized<float>& min) {
  return svmax_f32_x(ptrue, min, a);
}

template <>
Vectorized<float> inline operator&(const Vectorized<float>& a, const Vectorized<float>& b) {
  return svreinterpret_f32_s32(svand_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
}

template <>
Vectorized<float> inline operator|(const Vectorized<float>& a, const Vectorized<float>& b) {
  return svreinterpret_f32_s32(svorr_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
}

template <>
Vectorized<float> inline operator^(const Vectorized<float>& a, const Vectorized<float>& b) {
  return svreinterpret_f32_s32(sveor_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
}

Vectorized<float> inline Vectorized<float>::eq(const Vectorized<float>& other) const {
  return (*this == other) & Vectorized<float>(1.0f);
}

Vectorized<float> inline Vectorized<float>::ne(const Vectorized<float>& other) const {
  return (*this != other) & Vectorized<float>(1.0f);
}

Vectorized<float> inline Vectorized<float>::gt(const Vectorized<float>& other) const {
  return (*this > other) & Vectorized<float>(1.0f);
}

Vectorized<float> inline Vectorized<float>::ge(const Vectorized<float>& other) const {
  return (*this >= other) & Vectorized<float>(1.0f);
}

Vectorized<float> inline Vectorized<float>::lt(const Vectorized<float>& other) const {
  return (*this < other) & Vectorized<float>(1.0f);
}

Vectorized<float> inline Vectorized<float>::le(const Vectorized<float>& other) const {
  return (*this <= other) & Vectorized<float>(1.0f);
}

template <>
inline void convert(const float* src, float* dst, int64_t n) {
  const int64_t fraction = n % Vectorized<float>::size();
#pragma unroll
  for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
    svst1_f32(ptrue, dst + i, svldnt1_f32(ptrue, src + i));
  }
#pragma unroll
  for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
    svbool_t pg = svwhilelt_b32(i, n);
    svst1_f32(pg, dst + i, svldnt1_f32(pg, src + i));
  }
}

template <>
inline void convert(const float *src, at::Half *dst, int64_t n) {
  const int64_t fraction = n % Vectorized<float>::size();
  svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized<float>::size());
  svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
#pragma unroll
  for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
    svfloat16_t src_vec = svuzp1_f16(svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)),
                                     ZERO_F16);
    svst1_f16(pg_16, reinterpret_cast<float16_t*>(dst) + i, src_vec);
  }
#pragma unroll
  for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
    pg_16 = svwhilelt_b16(i, n);
    pg_32 = svwhilelt_b32(i, n);
    svfloat16_t src_vec = svuzp1_f16(svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)),
                                     ZERO_F16);
    svst1_f16(pg_16, reinterpret_cast<float16_t*>(dst) + i, src_vec);
  }
}

template <>
inline void convert(const at::Half *src, float *dst, int64_t n) {
  const int64_t fraction = n % Vectorized<float>::size();
  svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized<float>::size());
  svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
#pragma unroll
  for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
    svfloat16_t src_vec = svzip1_f16(svldnt1_f16(pg_16, reinterpret_cast<const float16_t*>(src) + i),
                                     ZERO_F16);
    svst1_f32(pg_32, dst + i, svcvt_f32_f16_x(ptrue, src_vec));
  }
#pragma unroll
  for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
    pg_16 = svwhilelt_b16(i, n);
    pg_32 = svwhilelt_b32(i, n);
    svfloat16_t src_vec = svzip1_f16(svldnt1_f16(pg_16, reinterpret_cast<const float16_t*>(src) + i),
                                     ZERO_F16);
    svst1_f32(pg_32, dst + i, svcvt_f32_f16_x(ptrue, src_vec));
  }
}

template <>
inline void convert(const bool *src, float *dst, int64_t n) {
  const int64_t fraction = n % Vectorized<float>::size();
  svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized<float>::size());
  svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
#pragma unroll
  for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
    svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
    svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8));
    svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32);
    svst1_f32(pg_32, dst + i, svsel_f32(mask, ONE_F32, ZERO_F32));
  }
#pragma unroll
  for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
    pg_8 = svwhilelt_b8(i, n);
    pg_32 = svwhilelt_b32(i, n);
    svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
    svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8));
    svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32);
    svst1_f32(pg_32, dst + i, svsel_f32(mask, ONE_F32, ZERO_F32));
  }
}

template <>
Vectorized<float> inline fmadd(const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& c) {
  return svmad_f32_x(ptrue, a, b, c);
}

#endif // defined(CPU_CAPABILITY_SVE)

}}}
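The partial-count overloads of loadu/store above are what let the convert() helpers handle a ragged tail without a scalar epilogue. The following is a minimal usage sketch, assuming an SVE-enabled build; the function and buffer names are illustrative only and are not part of the headers above:

// Sketch: scale n floats, using a predicate to cover the partial final vector.
void scale_buffer(const float* src, float* dst, int64_t n, float factor) {
  using Vec = at::vec::Vectorized<float>;
  const Vec vfactor(factor);
  int64_t i = 0;
  for (; i + Vec::size() <= n; i += Vec::size()) {
    (Vec::loadu(src + i) * vfactor).store(dst + i);
  }
  if (i < n) {
    // Passing count < size() makes loadu/store build an svwhilelt predicate,
    // so only the remaining n - i lanes are read and written.
    (Vec::loadu(src + i, n - i) * vfactor).store(dst + i, n - i);
  }
}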
@ -1,410 +0,0 @@
#pragma once

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/cpu/vec/sve/sve_helper.h>

namespace at {
namespace vec {
// Note [CPU_CAPABILITY namespace]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// This header, and all of its subheaders, will be compiled with
// different architecture flags for each supported set of vector
// intrinsics. So we need to make sure they aren't inadvertently
// linked together. We do this by declaring objects in an `inline
// namespace` which changes the name mangling, but can still be
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_SVE)

#define VEC_INT_SVE_TEMPLATE(vl, bit) \
|
||||
template <> class Vectorized<int##bit##_t> { \
|
||||
private: \
|
||||
vls_int##bit##_t values; \
|
||||
public: \
|
||||
using value_type = int##bit##_t; \
|
||||
using size_type = int; \
|
||||
static constexpr size_type size() { \
|
||||
return vl; \
|
||||
} \
|
||||
Vectorized() {} \
|
||||
Vectorized(svint##bit##_t v) : values(v) {} \
|
||||
Vectorized(int##bit##_t val) { \
|
||||
values = svdup_n_s##bit(val); \
|
||||
} \
|
||||
template<typename... Args, \
|
||||
typename = std::enable_if_t<(sizeof...(Args) == size())>> \
|
||||
Vectorized(Args... vals) { \
|
||||
__at_align__ int##bit##_t buffer[size()] = { vals... }; \
|
||||
values = svld1_s##bit(ptrue, buffer); \
|
||||
} \
|
||||
operator svint##bit##_t() const { \
|
||||
return values; \
|
||||
} \
|
||||
static Vectorized<int##bit##_t> blendv(const Vectorized<int##bit##_t>& a, \
|
||||
const Vectorized<int##bit##_t>& b, \
|
||||
const Vectorized<int##bit##_t>& mask_) { \
|
||||
svbool_t mask = svcmpeq_s##bit(ptrue, mask_, ALL_S##bit##_TRUE_MASK); \
|
||||
return svsel_s##bit(mask, b, a); \
|
||||
} \
|
||||
/* step sometimes requires a higher precision type (e.g., T=int, step_t=double) */ \
|
||||
template <typename step_t> \
|
||||
static Vectorized<int##bit##_t> arange(int##bit##_t base = 0, step_t step = static_cast<step_t>(1)) { \
|
||||
__at_align__ int##bit##_t buffer[size()]; \
|
||||
for (int64_t i = 0; i < size(); i++) { \
|
||||
buffer[i] = base + i * step; \
|
||||
} \
|
||||
return svld1_s##bit(ptrue, buffer); \
|
||||
} \
|
||||
static Vectorized<int##bit##_t> set(const Vectorized<int##bit##_t>& a, \
|
||||
const Vectorized<int##bit##_t>& b, \
|
||||
int##bit##_t count = size()) { \
|
||||
if (count == 0) { \
|
||||
return a; \
|
||||
} else if (count < size()) { \
|
||||
return svsel_s##bit(svwhilelt_b##bit(0ull, count), b, a); \
|
||||
} \
|
||||
return b; \
|
||||
} \
|
||||
static Vectorized<int##bit##_t> loadu(const void* ptr, int64_t count = size()) { \
|
||||
if (count == size()) \
|
||||
return svld1_s##bit(ptrue, reinterpret_cast<const int##bit##_t*>(ptr)); \
|
||||
svbool_t pg = svwhilelt_b##bit(0ull, count); \
|
||||
return svld1_s##bit(pg, reinterpret_cast<const int##bit##_t*>(ptr)); \
|
||||
} \
|
||||
void store(void* ptr, int64_t count = size()) const { \
|
||||
if (count == size()) { \
|
||||
svst1_s##bit(ptrue, reinterpret_cast<int##bit##_t*>(ptr), values); \
|
||||
} else { \
|
||||
svbool_t pg = svwhilelt_b##bit(0ull, count); \
|
||||
svst1_s##bit(pg, reinterpret_cast<int##bit##_t*>(ptr), values); \
|
||||
} \
|
||||
} \
|
||||
const int##bit##_t& operator[](int idx) const = delete; \
|
||||
int##bit##_t& operator[](int idx) = delete; \
|
||||
Vectorized<int##bit##_t> abs() const { \
|
||||
return svabs_s##bit##_x(ptrue, values); \
|
||||
} \
|
||||
Vectorized<int##bit##_t> real() const { \
|
||||
return values; \
|
||||
} \
|
||||
Vectorized<int##bit##_t> imag() const { \
|
||||
return svdup_n_s##bit(0); \
|
||||
} \
|
||||
Vectorized<int##bit##_t> conj() const { \
|
||||
return values; \
|
||||
} \
|
||||
Vectorized<int##bit##_t> frac() const; \
|
||||
Vectorized<int##bit##_t> neg() const { \
|
||||
return svneg_s##bit##_x(ptrue, values); \
|
||||
} \
|
||||
Vectorized<int##bit##_t> operator==(const Vectorized<int##bit##_t>& other) const { \
|
||||
svbool_t mask = svcmpeq_s##bit(ptrue, values, other); \
|
||||
return svsel_s##bit(mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \
|
||||
} \
|
||||
Vectorized<int##bit##_t> operator!=(const Vectorized<int##bit##_t>& other) const { \
|
||||
svbool_t mask = svcmpne_s##bit(ptrue, values, other); \
|
||||
return svsel_s##bit(mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \
|
||||
} \
|
||||
Vectorized<int##bit##_t> operator<(const Vectorized<int##bit##_t>& other) const { \
|
||||
svbool_t mask = svcmplt_s##bit(ptrue, values, other); \
|
||||
return svsel_s##bit(mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \
|
||||
} \
|
||||
Vectorized<int##bit##_t> operator<=(const Vectorized<int##bit##_t>& other) const { \
|
||||
svbool_t mask = svcmple_s##bit(ptrue, values, other); \
|
||||
return svsel_s##bit(mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \
|
||||
} \
|
||||
Vectorized<int##bit##_t> operator>(const Vectorized<int##bit##_t>& other) const { \
|
||||
svbool_t mask = svcmpgt_s##bit(ptrue, values, other); \
|
||||
return svsel_s##bit(mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \
|
||||
} \
|
||||
Vectorized<int##bit##_t> operator>=(const Vectorized<int##bit##_t>& other) const { \
|
||||
svbool_t mask = svcmpge_s##bit(ptrue, values, other); \
|
||||
return svsel_s##bit(mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \
|
||||
} \
|
||||
Vectorized<int##bit##_t> eq(const Vectorized<int##bit##_t>& other) const; \
|
||||
Vectorized<int##bit##_t> ne(const Vectorized<int##bit##_t>& other) const; \
|
||||
Vectorized<int##bit##_t> gt(const Vectorized<int##bit##_t>& other) const; \
|
||||
Vectorized<int##bit##_t> ge(const Vectorized<int##bit##_t>& other) const; \
|
||||
Vectorized<int##bit##_t> lt(const Vectorized<int##bit##_t>& other) const; \
|
||||
Vectorized<int##bit##_t> le(const Vectorized<int##bit##_t>& other) const; \
|
||||
}; \
|
||||
template <> \
|
||||
Vectorized<int##bit##_t> inline operator+(const Vectorized<int##bit##_t>& a, \
|
||||
const Vectorized<int##bit##_t>& b) { \
|
||||
return svadd_s##bit##_x(ptrue, a, b); \
|
||||
} \
|
||||
template <> \
|
||||
Vectorized<int##bit##_t> inline operator-(const Vectorized<int##bit##_t>& a, \
|
||||
const Vectorized<int##bit##_t>& b) { \
|
||||
return svsub_s##bit##_x(ptrue, a, b); \
|
||||
} \
|
||||
template <> \
|
||||
Vectorized<int##bit##_t> inline operator*(const Vectorized<int##bit##_t>& a, \
|
||||
const Vectorized<int##bit##_t>& b) { \
|
||||
return svmul_s##bit##_x(ptrue, a, b); \
|
||||
} \
|
||||
template <> \
|
||||
Vectorized<int##bit##_t> inline maximum(const Vectorized<int##bit##_t>& a, \
|
||||
const Vectorized<int##bit##_t>& b) { \
|
||||
return svmax_s##bit##_x(ptrue, a, b); \
|
||||
} \
|
||||
template <> \
|
||||
Vectorized<int##bit##_t> inline minimum(const Vectorized<int##bit##_t>& a, \
|
||||
const Vectorized<int##bit##_t>& b) { \
|
||||
return svmin_s##bit##_x(ptrue, a, b); \
|
||||
} \
|
||||
template <> \
|
||||
Vectorized<int##bit##_t> inline clamp(const Vectorized<int##bit##_t>& a, \
|
||||
const Vectorized<int##bit##_t>& min, \
|
||||
const Vectorized<int##bit##_t>& max) { \
|
||||
return svmin_s##bit##_x(ptrue, max, svmax_s##bit##_x(ptrue, min, a)); \
|
||||
} \
|
||||
template <> \
|
||||
Vectorized<int##bit##_t> inline clamp_max(const Vectorized<int##bit##_t>& a, \
|
||||
const Vectorized<int##bit##_t>& max) { \
|
||||
return svmin_s##bit##_x(ptrue, max, a); \
|
||||
} \
|
||||
template <> \
|
||||
Vectorized<int##bit##_t> inline clamp_min(const Vectorized<int##bit##_t>& a, \
|
||||
const Vectorized<int##bit##_t>& min) { \
|
||||
return svmax_s##bit##_x(ptrue, min, a); \
|
||||
} \
|
||||
template <> \
|
||||
Vectorized<int##bit##_t> inline operator&(const Vectorized<int##bit##_t>& a, \
|
||||
const Vectorized<int##bit##_t>& b) { \
|
||||
return svand_s##bit##_x(ptrue, a, b); \
|
||||
} \
|
||||
template <> \
|
||||
Vectorized<int##bit##_t> inline operator|(const Vectorized<int##bit##_t>& a, \
|
||||
const Vectorized<int##bit##_t>& b) { \
|
||||
return svorr_s##bit##_x(ptrue, a, b); \
|
||||
} \
|
||||
template <> \
|
||||
Vectorized<int##bit##_t> inline operator^(const Vectorized<int##bit##_t>& a, \
|
||||
const Vectorized<int##bit##_t>& b) { \
|
||||
return sveor_s##bit##_x(ptrue, a, b); \
|
||||
} \
|
||||
template <> \
|
||||
inline Vectorized<int##bit##_t> operator~(const Vectorized<int##bit##_t>& a) { \
|
||||
return sveor_s##bit##_x(ptrue, a, svdup_n_s##bit(-1)); \
|
||||
} \
|
||||
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::eq(const Vectorized<int##bit##_t>& other) const { \
|
||||
return (*this == other) & Vectorized<int##bit##_t>(1); \
|
||||
} \
|
||||
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::ne(const Vectorized<int##bit##_t>& other) const { \
|
||||
return (*this != other) & Vectorized<int##bit##_t>(1); \
|
||||
} \
|
||||
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::gt(const Vectorized<int##bit##_t>& other) const { \
|
||||
return (*this > other) & Vectorized<int##bit##_t>(1); \
|
||||
} \
|
||||
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::ge(const Vectorized<int##bit##_t>& other) const { \
|
||||
return (*this >= other) & Vectorized<int##bit##_t>(1); \
|
||||
} \
|
||||
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::lt(const Vectorized<int##bit##_t>& other) const { \
|
||||
return (*this < other) & Vectorized<int##bit##_t>(1); \
|
||||
} \
|
||||
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::le(const Vectorized<int##bit##_t>& other) const { \
|
||||
return (*this <= other) & Vectorized<int##bit##_t>(1); \
|
||||
}

VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int64_t), 64)
VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int32_t), 32)
VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int16_t), 16)
VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int8_t), 8)

template <typename T>
Vectorized<T> inline intdiv_nosve(const Vectorized<T>& a, const Vectorized<T>& b) {
  T values_a[Vectorized<T>::size()];
  T values_b[Vectorized<T>::size()];
  a.store(values_a);
  b.store(values_b);
  for (int i = 0; i != Vectorized<T>::size(); i++) {
    values_a[i] /= values_b[i];
  }
  return Vectorized<T>::loadu(values_a);
}

template <>
Vectorized<int64_t> inline operator/(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
  return svdiv_s64_x(ptrue, a, b);
}

template <>
Vectorized<int32_t> inline operator/(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
  return svdiv_s32_x(ptrue, a, b);
}

template <>
Vectorized<int16_t> inline operator/(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
  return intdiv_nosve(a, b);
}

template <>
Vectorized<int8_t> inline operator/(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
  return intdiv_nosve(a, b);
}

template <>
|
||||
inline void convert(const int32_t *src, int64_t *dst, int64_t n) {
|
||||
const int64_t fraction = n % Vectorized<int64_t>::size();
|
||||
svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<int64_t>::size());
|
||||
svbool_t pg_64 = svwhilelt_b64(0ull, Vectorized<int64_t>::size());
|
||||
#pragma unroll
|
||||
for (int64_t i = 0; i < n - fraction; i += Vectorized<int64_t>::size())
|
||||
svst1_s64(pg_64, dst + i, svunpklo_s64(svldnt1_s32(pg_32, src + i)));
|
||||
#pragma unroll
|
||||
for (int64_t i = n - fraction; i < n; i += Vectorized<int64_t>::size()) {
|
||||
pg_32 = svwhilelt_b32(i, n);
|
||||
pg_64 = svwhilelt_b64(i, n);
|
||||
svst1_s64(pg_64, dst + i, svunpklo_s64(svldnt1_s32(pg_32, src + i)));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const int64_t *src, float *dst, int64_t n) {
|
||||
const int64_t fraction = n % Vectorized<int64_t>::size();
|
||||
svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<int64_t>::size());
|
||||
svbool_t pg_64 = svwhilelt_b64(0ull, Vectorized<int64_t>::size());
|
||||
#pragma unroll
|
||||
for (int64_t i = 0; i < n - fraction; i += Vectorized<int64_t>::size()) {
|
||||
svint64_t src_vec_s64 = svldnt1_s64(pg_64, src + i);
|
||||
svfloat32_t src_vec_f32 = svuzp1_f32(svcvt_f32_s64_x(pg_64, src_vec_s64), ZERO_F32);
|
||||
svst1_f32(pg_32, dst + i, src_vec_f32);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int64_t i = n - fraction; i < n; i += Vectorized<int64_t>::size()) {
|
||||
pg_32 = svwhilelt_b32(i, n);
|
||||
pg_64 = svwhilelt_b64(i, n);
|
||||
svint64_t src_vec_s64 = svldnt1_s64(pg_64, src + i);
|
||||
svfloat32_t src_vec_f32 = svuzp1_f32(svcvt_f32_s64_x(pg_64, src_vec_s64), ZERO_F32);
|
||||
svst1_f32(pg_32, dst + i, src_vec_f32);
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const int32_t *src, float *dst, int64_t n) {
|
||||
const int64_t fraction = n % Vectorized<int32_t>::size();
|
||||
svbool_t pg = svwhilelt_b32(0ull, Vectorized<int32_t>::size());
|
||||
#pragma unroll
|
||||
for (int64_t i = 0; i < n - fraction; i += Vectorized<int32_t>::size()) {
|
||||
svint32_t src_vec = svldnt1_s32(pg, src + i);
|
||||
svst1_f32(pg, dst + i, svcvt_f32_s32_x(pg, src_vec));
|
||||
}
|
||||
#pragma unroll
|
||||
for (int64_t i = n - fraction; i < n; i += Vectorized<int32_t>::size()) {
|
||||
pg = svwhilelt_b32(i, n);
|
||||
svint32_t src_vec = svldnt1_s32(pg, src + i);
|
||||
svst1_f32(pg, dst + i, svcvt_f32_s32_x(pg, src_vec));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const bool *src, int64_t *dst, int64_t n) {
|
||||
const int64_t fraction = n % Vectorized<int64_t>::size();
|
||||
svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized<int64_t>::size());
|
||||
svbool_t pg_64 = svwhilelt_b64(0ull, Vectorized<int64_t>::size());
|
||||
#pragma unroll
|
||||
for (int64_t i = 0; i < n - fraction; i += Vectorized<int64_t>::size()) {
|
||||
svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
|
||||
svuint64_t src_vec_u64 = svunpklo_u64(svunpklo_u32(svunpklo_u16(src_vec_u8)));
|
||||
svbool_t mask = svcmpne_u64(pg_64, src_vec_u64, ZERO_U64);
|
||||
svst1_s64(pg_64, dst + i, svsel_s64(mask, ONE_S64, ZERO_S64));
|
||||
}
|
||||
#pragma unroll
|
||||
for (int64_t i = n - fraction; i < n; i += Vectorized<int64_t>::size()) {
|
||||
pg_8 = svwhilelt_b8(i, n);
|
||||
pg_64 = svwhilelt_b64(i, n);
|
||||
svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
|
||||
svuint64_t src_vec_u64 = svunpklo_u64(svunpklo_u32(svunpklo_u16(src_vec_u8)));
|
||||
svbool_t mask = svcmpne_u64(pg_64, src_vec_u64, ZERO_U64);
|
||||
svst1_s64(pg_64, dst + i, svsel_s64(mask, ONE_S64, ZERO_S64));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const bool *src, int32_t *dst, int64_t n) {
|
||||
const int64_t fraction = n % Vectorized<int32_t>::size();
|
||||
svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized<int32_t>::size());
|
||||
svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<int32_t>::size());
|
||||
#pragma unroll
|
||||
for (int64_t i = 0; i < n - fraction; i += Vectorized<int32_t>::size()) {
|
||||
svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
|
||||
svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8));
|
||||
svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32);
|
||||
svst1_s32(pg_32, dst + i, svsel_s32(mask, ONE_S32, ZERO_S32));
|
||||
}
|
||||
#pragma unroll
|
||||
for (int64_t i = n - fraction; i < n; i += Vectorized<int32_t>::size()) {
|
||||
pg_8 = svwhilelt_b8(i, n);
|
||||
pg_32 = svwhilelt_b32(i, n);
|
||||
svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
|
||||
svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8));
|
||||
svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32);
|
||||
svst1_s32(pg_32, dst + i, svsel_s32(mask, ONE_S32, ZERO_S32));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const uint8_t *src, bool *dst, int64_t n) {
|
||||
const int64_t fraction = n % Vectorized<uint8_t>::size();
|
||||
svbool_t pg = svwhilelt_b8(0ull, Vectorized<uint8_t>::size());
|
||||
#pragma unroll
|
||||
for (int64_t i = 0; i < n - fraction; i += Vectorized<uint8_t>::size()) {
|
||||
svbool_t mask = svcmpne_u8(pg, svldnt1_u8(pg, src + i), ZERO_U8);
|
||||
svst1_u8(pg, reinterpret_cast<uint8_t*>(dst) + i,
|
||||
svsel_u8(mask, ALL_U8_TRUE_MASK, ALL_U8_FALSE_MASK));
|
||||
}
|
||||
#pragma unroll
|
||||
for (int64_t i = n - fraction; i < n; i += Vectorized<uint8_t>::size()) {
|
||||
pg = svwhilelt_b8(i, n);
|
||||
svbool_t mask = svcmpne_u8(pg, svldnt1_u8(pg, src + i), ZERO_U8);
|
||||
svst1_u8(pg, reinterpret_cast<uint8_t*>(dst) + i,
|
||||
svsel_u8(mask, ALL_U8_TRUE_MASK, ALL_U8_FALSE_MASK));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<int64_t> inline operator<<(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
|
||||
return svlsl_s64_x(ptrue, a, svreinterpret_u64_s64(b));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<int32_t> inline operator<<(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
|
||||
return svlsl_s32_x(ptrue, a, svreinterpret_u32_s32(b));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<int16_t> inline operator<<(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
|
||||
return svlsl_s16_x(ptrue, a, svreinterpret_u16_s16(b));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<int8_t> inline operator<<(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
|
||||
return svlsl_s8_x(ptrue, a, svreinterpret_u8_s8(b));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<int64_t> inline operator>>(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
|
||||
return svasr_s64_x(ptrue, a, svreinterpret_u64_s64(b));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<int32_t> inline operator>>(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
|
||||
return svasr_s32_x(ptrue, a, svreinterpret_u32_s32(b));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<int16_t> inline operator>>(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
|
||||
return svasr_s16_x(ptrue, a, svreinterpret_u16_s16(b));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<int8_t> inline operator>>(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
|
||||
return svasr_s8_x(ptrue, a, svreinterpret_u8_s8(b));
|
||||
}
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
}}}
|
@ -1,567 +0,0 @@
#pragma once

// DO NOT DEFINE STATIC DATA IN THIS HEADER!
// See Note [Do not compile initializers with SVE]

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/native/quantized/AffineQuantizerBase.h>
#include <c10/util/qint32.h>
#include <c10/util/qint8.h>
#include <c10/util/quint8.h>

#include <array>

// This file defines Vectorized<> for the quantized types.
//
// Currently, we simply use these classes as efficient converters between
// the quantized types and Vectorized<float>, usually in bandwidth-bound cases
// where doing the arithmetic in full-precision is acceptable (e.g.
// elementwise operators).
//
// Conversions are as follows:
//  Vectorized<qint8>  -> 4x Vectorized<float>
//  Vectorized<quint8> -> 4x Vectorized<float>
//  Vectorized<qint32> -> 1x Vectorized<float>
//
// The size of the returned float vector is specified by the special
// constexpr function float_num_vecs. The type of the value returned
// from dequantize (and expected as an argument to quantize) is
// specified by float_vec_return_type.
//
// When writing kernels with these vectors, it is expected that floating-
// point operations will be carried out in a loop over Vectorized<T>::float_num_vecs
// iterations.

namespace at {
namespace vec {
// Note [CPU_CAPABILITY namespace]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// This header, and all of its subheaders, will be compiled with
// different architecture flags for each supported set of vector
// intrinsics. So we need to make sure they aren't inadvertently
// linked together. We do this by declaring objects in an `inline
// namespace` which changes the name mangling, but can still be
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_SVE)

// NOTE: These are low-performance, scalar reference implementations.
// This may not be an issue, because currently for quantization we assume
// the user has at least SVE available, so these can simply act as a
// reference implementation.
//
// If in the future we relax this requirement (SVE+), we should probably
// revisit these implementations.

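// An illustrative sketch (not part of the original header) of the kernel
// pattern the comment block above describes; the variable names and the
// squaring op are hypothetical placeholders for any float-domain computation:
//
//   auto qvec  = Vectorized<c10::qint8>::loadu(src);
//   auto fvecs = qvec.dequantize(scale_vec, zp_vec, scale_zp_premul);  // 4x Vectorized<float>
//   for (int i = 0; i < Vectorized<c10::qint8>::float_num_vecs(); ++i) {
//     fvecs[i] = fvecs[i] * fvecs[i];
//   }
//   Vectorized<c10::qint8>::quantize(fvecs, scale, zero_point, inverse_scale).store(dst);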
template <
|
||||
typename T,
|
||||
typename float_vec_return_type_,
|
||||
typename int_vec_return_type_,
|
||||
int size_>
|
||||
struct VectorizedQuantizedConverter {
|
||||
using size_type = int;
|
||||
static constexpr size_type size() {
|
||||
return size_;
|
||||
}
|
||||
|
||||
static constexpr int float_num_vecs() {
|
||||
return size() / Vectorized<float>::size();
|
||||
}
|
||||
|
||||
static constexpr int int_num_vecs() {
|
||||
return size() / Vectorized<int32_t>::size();
|
||||
}
|
||||
|
||||
using float_vec_return_type = float_vec_return_type_;
|
||||
using int_vec_return_type = int_vec_return_type_;
|
||||
|
||||
using value_type = typename T::underlying;
|
||||
std::array<value_type, size_> vals;
|
||||
|
||||
VectorizedQuantizedConverter(T val) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
vals[i] = val.val_;
|
||||
}
|
||||
}
|
||||
|
||||
VectorizedQuantizedConverter(const void* ptr) {
|
||||
memcpy(vals.data(), ptr, sizeof(value_type) * size());
|
||||
}
|
||||
|
||||
void store(void* ptr, int count = size()) const {
|
||||
memcpy(ptr, vals.data(), count * sizeof(value_type));
|
||||
}
|
||||
|
||||
float_vec_return_type dequantize(
|
||||
Vectorized<float> scale,
|
||||
Vectorized<float> zero_point,
|
||||
Vectorized<float> scale_zp_premul) const {
|
||||
float_vec_return_type rv;
|
||||
float tmp_scale[Vectorized<float>::size()];
|
||||
float tmp_zero_point[Vectorized<float>::size()];
|
||||
scale.store(tmp_scale);
|
||||
zero_point.store(tmp_zero_point);
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
float tmp_vals[Vectorized<float>::size()];
|
||||
for (int j = 0; j < Vectorized<float>::size(); ++j) {
|
||||
tmp_vals[j] =
|
||||
at::native::dequantize_val<T>(tmp_scale[j], tmp_zero_point[j], T(vals[Vectorized<float>::size() * i + j]));
|
||||
}
|
||||
rv[i] = Vectorized<float>::loadu(tmp_vals);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
float_vec_return_type dequantize(
|
||||
Vectorized<float> scale,
|
||||
Vectorized<float> zero_point) const {
|
||||
float_vec_return_type rv;
|
||||
float tmp_scale[Vectorized<float>::size()];
|
||||
float tmp_zero_point[Vectorized<float>::size()];
|
||||
scale.store(tmp_scale);
|
||||
zero_point.store(tmp_zero_point);
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
float tmp_vals[Vectorized<float>::size()];
|
||||
for (int j = 0; j < Vectorized<float>::size(); ++j) {
|
||||
tmp_vals[j] =
|
||||
at::native::dequantize_val<T>(tmp_scale[j], tmp_zero_point[j], T(vals[Vectorized<float>::size() * i + j]));
|
||||
}
|
||||
rv[i] = Vectorized<float>::loadu(tmp_vals);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
protected:
|
||||
VectorizedQuantizedConverter() {}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
|
||||
c10::qint32,
|
||||
std::array<Vectorized<float>, 1>,
|
||||
std::array<Vectorized<c10::qint32>, 1>,
|
||||
VECTOR_WIDTH / 4> {
|
||||
Vectorized()
|
||||
: VectorizedQuantizedConverter<
|
||||
c10::qint32,
|
||||
std::array<Vectorized<float>, 1>,
|
||||
std::array<Vectorized<c10::qint32>, 1>,
|
||||
VECTOR_WIDTH / 4>() {}
|
||||
Vectorized(c10::qint32 val)
|
||||
: VectorizedQuantizedConverter<
|
||||
c10::qint32,
|
||||
std::array<Vectorized<float>, 1>,
|
||||
std::array<Vectorized<c10::qint32>, 1>,
|
||||
VECTOR_WIDTH / 4>(val) {}
|
||||
Vectorized(const void* ptr)
|
||||
: VectorizedQuantizedConverter<
|
||||
c10::qint32,
|
||||
std::array<Vectorized<float>, 1>,
|
||||
std::array<Vectorized<c10::qint32>, 1>,
|
||||
VECTOR_WIDTH / 4>(ptr) {}
|
||||
#if 1
|
||||
static Vectorized<c10::qint32> loadu(const void* ptr) {
|
||||
return Vectorized<c10::qint32>(ptr);
|
||||
}
|
||||
|
||||
static Vectorized<c10::qint32> loadu(const void* ptr, int64_t count) {
|
||||
__at_align__ value_type tmp_values[size()];
|
||||
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
|
||||
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
|
||||
// instructions while a loop would be compiled to one instruction.
|
||||
for (const auto i : c10::irange(size())) {
|
||||
tmp_values[i] = 0;
|
||||
}
|
||||
std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
|
||||
return loadu(tmp_values);
|
||||
}
|
||||
#else
|
||||
static Vectorized<c10::qint32> loadu(const void* ptr, int64_t count = size()) {
|
||||
if (count == size())
|
||||
return svld1_s32(ptrue, reinterpret_cast<const int32_t*>(ptr));
|
||||
svbool_t pg = svwhilelt_b32(0ull, count);
|
||||
return svld1_s32(pg, reinterpret_cast<const int32_t*>(ptr));
|
||||
}
|
||||
#endif
|
||||
static Vectorized<c10::qint32> quantize(
|
||||
const float_vec_return_type& rhs,
|
||||
float scale,
|
||||
int32_t zero_point,
|
||||
float inverse_scale) {
|
||||
std::array<value_type, size()> qvals;
|
||||
std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
|
||||
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
rhs[i].store(&float_vals[i * Vectorized<float>::size()], Vectorized<float>::size());
|
||||
}
|
||||
|
||||
at::native::quantize_vec<c10::qint32, /*precision=*/32>(
|
||||
scale,
|
||||
zero_point,
|
||||
float_vals.data(),
|
||||
(c10::qint32*)qvals.data(),
|
||||
Vectorized<float>::size() * float_num_vecs());
|
||||
|
||||
return Vectorized<c10::qint32>::loadu(qvals.data());
|
||||
}
|
||||
|
||||
Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
|
||||
Vectorized<c10::qint32> retval;
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
|
||||
Vectorized<c10::qint32> retval;
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const {
|
||||
return maximum(zero_point);
|
||||
}
|
||||
|
||||
|
||||
Vectorized<c10::qint32> relu6(
|
||||
Vectorized<c10::qint32> zero_point,
|
||||
Vectorized<c10::qint32> q_six) {
|
||||
Vectorized<c10::qint32> retval;
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(
|
||||
std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
|
||||
int_vec_return_type retval;
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval[0].vals[i] = vals[i] - b.vals[i];
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
static Vectorized<c10::qint32> requantize_from_int(
|
||||
const int_vec_return_type& inp,
|
||||
float multiplier,
|
||||
int32_t zero_point) {
|
||||
Vectorized<c10::qint32> retval;
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] =
|
||||
nearbyint(static_cast<float>(inp[0].vals[i]) * multiplier) +
|
||||
zero_point;
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
Vectorized<c10::qint32> inline maximum(const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
|
||||
return a.maximum(b);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::qint32> inline operator*(
|
||||
const Vectorized<c10::qint32>& a,
|
||||
const Vectorized<c10::qint32>& b) {
|
||||
Vectorized<c10::qint32> retval;
|
||||
for (size_t i = 0; i < std::decay_t<decltype(a)>::size(); ++i) {
|
||||
retval.vals[i] = a.vals[i] * b.vals[i];
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::qint32> inline operator+(
|
||||
const Vectorized<c10::qint32>& a,
|
||||
const Vectorized<c10::qint32>& b) {
|
||||
Vectorized<c10::qint32> retval;
|
||||
for (size_t i = 0; i < std::decay_t<decltype(a)>::size(); ++i) {
|
||||
retval.vals[i] = a.vals[i] + b.vals[i];
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
template <>
|
||||
struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
|
||||
c10::qint8,
|
||||
std::array<Vectorized<float>, 4>,
|
||||
std::array<Vectorized<c10::qint32>, 4>,
|
||||
VECTOR_WIDTH> {
|
||||
Vectorized()
|
||||
: VectorizedQuantizedConverter<
|
||||
c10::qint8,
|
||||
std::array<Vectorized<float>, 4>,
|
||||
std::array<Vectorized<c10::qint32>, 4>,
|
||||
VECTOR_WIDTH>() {}
|
||||
Vectorized(c10::qint8 val)
|
||||
: VectorizedQuantizedConverter<
|
||||
c10::qint8,
|
||||
std::array<Vectorized<float>, 4>,
|
||||
std::array<Vectorized<c10::qint32>, 4>,
|
||||
VECTOR_WIDTH>(val) {}
|
||||
Vectorized(const void* ptr)
|
||||
: VectorizedQuantizedConverter<
|
||||
c10::qint8,
|
||||
std::array<Vectorized<float>, 4>,
|
||||
std::array<Vectorized<c10::qint32>, 4>,
|
||||
VECTOR_WIDTH>(ptr) {}
|
||||
|
||||
static Vectorized<c10::qint8> loadu(const void* ptr) {
|
||||
return Vectorized<c10::qint8>(ptr);
|
||||
}
|
||||
|
||||
static Vectorized<c10::qint8> loadu(const void* ptr, int64_t count) {
|
||||
__at_align__ value_type tmp_values[size()];
|
||||
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
|
||||
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
|
||||
// instructions while a loop would be compiled to one instruction.
|
||||
for (const auto i : c10::irange(size())) {
|
||||
tmp_values[i] = 0;
|
||||
}
|
||||
std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
|
||||
return loadu(tmp_values);
|
||||
}
|
||||
|
||||
static Vectorized<c10::qint8> quantize(
|
||||
const float_vec_return_type& rhs,
|
||||
float scale,
|
||||
int32_t zero_point,
|
||||
float inverse_scale) {
|
||||
std::array<value_type, size()> qvals;
|
||||
std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
|
||||
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
rhs[i].store(&float_vals[i * Vectorized<float>::size()], Vectorized<float>::size());
|
||||
}
|
||||
|
||||
at::native::quantize_vec<c10::qint8>(
|
||||
scale,
|
||||
zero_point,
|
||||
float_vals.data(),
|
||||
(c10::qint8*)qvals.data(),
|
||||
Vectorized<float>::size() * float_num_vecs());
|
||||
|
||||
return Vectorized<c10::qint8>::loadu(qvals.data());
|
||||
}
|
||||
|
||||
Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
|
||||
Vectorized<c10::qint8> retval;
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
|
||||
Vectorized<c10::qint8> retval;
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
Vectorized<c10::qint8> relu(Vectorized<c10::qint8> zero_point) const {
|
||||
return maximum(zero_point);
|
||||
}
|
||||
|
||||
Vectorized<c10::qint8> relu6(
|
||||
Vectorized<c10::qint8> zero_point,
|
||||
Vectorized<c10::qint8> q_six) {
|
||||
Vectorized<c10::qint8> retval;
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(
|
||||
std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
|
||||
int_vec_return_type retval;
|
||||
constexpr int elem_per_int_vec = size() / int_num_vecs();
|
||||
for (size_t i = 0; i < int_num_vecs(); ++i) {
|
||||
for (size_t j = 0; j < elem_per_int_vec; ++j) {
|
||||
retval[i].vals[j] =
|
||||
static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
|
||||
static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
|
||||
}
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
static Vectorized<c10::qint8> requantize_from_int(
|
||||
const int_vec_return_type& inp,
|
||||
float multiplier,
|
||||
int32_t zero_point) {
|
||||
constexpr int elem_per_int_vec = size() / int_num_vecs();
|
||||
constexpr auto min_val = std::numeric_limits<value_type>::min();
|
||||
constexpr auto max_val = std::numeric_limits<value_type>::max();
|
||||
Vectorized<c10::qint8> retval;
|
||||
for (size_t i = 0; i < int_num_vecs(); ++i) {
|
||||
for (size_t j = 0; j < elem_per_int_vec; ++j) {
|
||||
int32_t rounded =
|
||||
nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
|
||||
zero_point;
|
||||
retval.vals[i * elem_per_int_vec + j] =
|
||||
std::min<int32_t>(std::max<int32_t>(rounded, min_val), max_val);
|
||||
}
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
Vectorized<c10::qint8> inline maximum(const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
|
||||
return a.maximum(b);
|
||||
}
|
||||
|
||||
template <>
|
||||
struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
|
||||
c10::quint8,
|
||||
std::array<Vectorized<float>, 4>,
|
||||
std::array<Vectorized<c10::qint32>, 4>,
|
||||
VECTOR_WIDTH> {
|
||||
Vectorized()
|
||||
: VectorizedQuantizedConverter<
|
||||
c10::quint8,
|
||||
std::array<Vectorized<float>, 4>,
|
||||
std::array<Vectorized<c10::qint32>, 4>,
|
||||
VECTOR_WIDTH>() {}
|
||||
Vectorized(c10::quint8 val)
|
||||
: VectorizedQuantizedConverter<
|
||||
c10::quint8,
|
||||
std::array<Vectorized<float>, 4>,
|
||||
std::array<Vectorized<c10::qint32>, 4>,
|
||||
VECTOR_WIDTH>(val) {}
|
||||
Vectorized(const void* ptr)
|
||||
: VectorizedQuantizedConverter<
|
||||
c10::quint8,
|
||||
std::array<Vectorized<float>, 4>,
|
||||
std::array<Vectorized<c10::qint32>, 4>,
|
||||
VECTOR_WIDTH>(ptr) {}
|
||||
#if 1
|
||||
static Vectorized<c10::quint8> loadu(const void* ptr) {
|
||||
return Vectorized<c10::quint8>(ptr);
|
||||
}
|
||||
|
||||
static Vectorized<c10::quint8> loadu(const void* ptr, int64_t count) {
|
||||
__at_align__ value_type tmp_values[size()];
|
||||
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
|
||||
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
|
||||
// instructions while a loop would be compiled to one instruction.
|
||||
for (const auto i : c10::irange(size())) {
|
||||
tmp_values[i] = 0;
|
||||
}
|
||||
std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
|
||||
return loadu(tmp_values);
|
||||
}
|
||||
#else
|
||||
static Vectorized<c10::quint8> loadu(const void* ptr, int64_t count = size()) {
|
||||
if (count == size())
|
||||
return svld1_u8(ptrue, reinterpret_cast<const uint8_t*>(ptr));
|
||||
svbool_t pg = svwhilelt_b8(0ull, count);
|
||||
return svld1_u8(pg, reinterpret_cast<const uint8_t*>(ptr));
|
||||
}
|
||||
#endif
|
||||
static Vectorized<c10::quint8> quantize(
|
||||
const float_vec_return_type& rhs,
|
||||
float scale,
|
||||
int32_t zero_point,
|
||||
float inverse_scale) {
|
||||
std::array<value_type, size()> qvals;
|
||||
std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
|
||||
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
rhs[i].store(&float_vals[i * Vectorized<float>::size()], Vectorized<float>::size());
|
||||
}
|
||||
|
||||
at::native::quantize_vec<c10::quint8>(
|
||||
scale,
|
||||
zero_point,
|
||||
float_vals.data(),
|
||||
(c10::quint8*)qvals.data(),
|
||||
Vectorized<float>::size() * float_num_vecs());
|
||||
|
||||
return Vectorized<c10::quint8>::loadu(qvals.data());
|
||||
}
|
||||
|
||||
Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
|
||||
Vectorized<c10::quint8> retval;
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
|
||||
Vectorized<c10::quint8> retval;
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
Vectorized<c10::quint8> relu(Vectorized<c10::quint8> zero_point) const {
|
||||
return maximum(zero_point);
|
||||
}
|
||||
|
||||
|
||||
Vectorized<c10::quint8> relu6(
|
||||
Vectorized<c10::quint8> zero_point,
|
||||
Vectorized<c10::quint8> q_six) {
|
||||
Vectorized<c10::quint8> retval;
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(
|
||||
std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
|
||||
int_vec_return_type retval;
|
||||
constexpr int elem_per_int_vec = size() / int_num_vecs();
|
||||
for (size_t i = 0; i < int_num_vecs(); ++i) {
|
||||
for (size_t j = 0; j < elem_per_int_vec; ++j) {
|
||||
retval[i].vals[j] =
|
||||
static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
|
||||
static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
|
||||
}
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
static Vectorized<c10::quint8> requantize_from_int(
|
||||
const int_vec_return_type& inp,
|
||||
float multiplier,
|
||||
int32_t zero_point) {
|
||||
constexpr int elem_per_int_vec = size() / int_num_vecs();
|
||||
constexpr auto min_val = std::numeric_limits<value_type>::min();
|
||||
constexpr auto max_val = std::numeric_limits<value_type>::max();
|
||||
Vectorized<c10::quint8> retval;
|
||||
for (size_t i = 0; i < int_num_vecs(); ++i) {
|
||||
for (size_t j = 0; j < elem_per_int_vec; ++j) {
|
||||
int32_t rounded =
|
||||
nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
|
||||
zero_point;
|
||||
retval.vals[i * elem_per_int_vec + j] =
|
||||
std::min<int32_t>(std::max<int32_t>(rounded, min_val), max_val);
|
||||
}
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
Vectorized<c10::quint8> inline maximum(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
|
||||
return a.maximum(b);
|
||||
}
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
}}}
|
@ -7,13 +7,9 @@

#include <ATen/cpu/vec/vec_base.h>
#if !(defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || defined(CPU_CAPABILITY_ZVECTOR))
#if defined(CPU_CAPABILITY_SVE256)
#include <ATen/cpu/vec/sve/vec_common_sve.h>
#else
#include <ATen/cpu/vec/vec256/vec256_float.h>
#include <ATen/cpu/vec/vec256/vec256_float_neon.h>
#include <ATen/cpu/vec/vec256/vec256_half_neon.h>
#endif
#include <ATen/cpu/vec/vec256/vec256_float.h>
#include <ATen/cpu/vec/vec256/vec256_bfloat16.h>
#include <ATen/cpu/vec/vec256/vec256_double.h>
#include <ATen/cpu/vec/vec256/vec256_int.h>
@ -1097,7 +1097,7 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
  return Vectorized<type>::loadu(arr2); \
}
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16);
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && !defined(CPU_CAPABILITY_SVE256)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
inline std::tuple<Vectorized<float>, Vectorized<float>> convert_half_float(const Vectorized<Half>& a) {
  static_assert(Vectorized<Half>::size() == 2 * Vectorized<float>::size());
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
@ -208,27 +208,8 @@ struct VecConvert<
|
||||
(is_reduced_floating_point_v<src_t> && is_8bit_integer_v<dst_t>),
|
||||
void>> {
|
||||
static inline VectorizedN<dst_t, 1> apply(const VectorizedN<src_t, 1>& src) {
|
||||
VectorizedN<float, 2> tmp_fp32 = VecConvert<float, 2, src_t, 1>::apply(src);
|
||||
return VecConvert<dst_t, 1, float, 2>::apply(tmp_fp32);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename dst_t>
|
||||
struct VecConvert<
|
||||
dst_t,
|
||||
1,
|
||||
float,
|
||||
2,
|
||||
typename std::enable_if_t<is_8bit_integer_v<dst_t>,
|
||||
void>> {
|
||||
static inline VectorizedN<dst_t, 1> apply(const VectorizedN<float, 2>& src) {
|
||||
at::vec::Vectorized<dst_t> vec1 = convert_float_to_int8<dst_t>(src[0]);
|
||||
at::vec::Vectorized<dst_t> vec2 = convert_float_to_int8<dst_t>(src[1]);
|
||||
__m128 lane2 = _mm256_castps256_ps128(_mm256_castsi256_ps(vec2));
|
||||
__m256 combined = _mm256_insertf128_ps(_mm256_castsi256_ps(vec1), lane2, 1);
|
||||
// Shuffle [191:128] bit from combined in to [127:64] bit of result
|
||||
__m256i result = _mm256_permute4x64_epi64(_mm256_castps_si256(combined), 0b11011000);
|
||||
return at::vec::Vectorized<dst_t>(result);
|
||||
VectorizedN<float, 1> tmp_fp32 = VecConvert<float, 1, src_t, 1>::apply(src);
|
||||
return VecConvert<dst_t, 1, float, 1>::apply(tmp_fp32);
|
||||
}
|
||||
};
|
||||
|
||||
@ -245,25 +226,6 @@ struct VecConvert<
|
||||
}
|
||||
};
|
||||
|
||||
template <typename src_t>
|
||||
struct VecConvert<
|
||||
float,
|
||||
2,
|
||||
src_t,
|
||||
1,
|
||||
typename std::enable_if_t<is_8bit_integer_v<src_t>,
|
||||
void>> {
|
||||
static inline VectorizedN<float, 2> apply(const VectorizedN<src_t, 1>& src) {
|
||||
// Shuffle [127:64] bit from src[0] in to [191:128] bit of shuffled
|
||||
__m256i shuffled = _mm256_permute4x64_epi64(src[0], 0b11011000);
|
||||
__m256i src2 = _mm256_castsi128_si256(
|
||||
_mm_castps_si128(
|
||||
_mm256_extractf128_ps(_mm256_castsi256_ps(shuffled), 1) // Extract the second 128-bit lane
|
||||
)
|
||||
);
|
||||
return VectorizedN<float, 2>(convert_int8_to_float<src_t>(src[0]), convert_int8_to_float<src_t>(src2));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename dst_t>
|
||||
struct VecConvert<
|
||||
|
@ -843,7 +843,7 @@ Vectorized<c10::quint8> inline maximum(const Vectorized<c10::quint8>& a, const V
|
||||
return a.maximum(b);
|
||||
}
|
||||
|
||||
#elif !defined(CPU_CAPABILITY_SVE256)
|
||||
#else
|
||||
|
||||
// NOTE: These are low-performance implementations that we fall back on
|
||||
// if we are not building with AVX2. This may not be an issue, because
|
||||
|
@ -209,25 +209,8 @@ struct VecConvert<
|
||||
(is_reduced_floating_point_v<src_t> && is_8bit_integer_v<dst_t>),
|
||||
void>> {
|
||||
static inline VectorizedN<dst_t, 1> apply(const VectorizedN<src_t, 1>& src) {
|
||||
VectorizedN<float, 2> tmp_fp32 = VecConvert<float, 2, src_t, 1>::apply(src);
|
||||
return VecConvert<dst_t, 1, float, 2>::apply(tmp_fp32);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename dst_t>
|
||||
struct VecConvert<
|
||||
dst_t,
|
||||
1,
|
||||
float,
|
||||
2,
|
||||
typename std::enable_if_t<is_8bit_integer_v<dst_t>,
|
||||
void>> {
|
||||
static inline VectorizedN<dst_t, 1> apply(const VectorizedN<float, 2>& src) {
|
||||
at::vec::Vectorized<dst_t> vec1 = convert_float_to_int8<dst_t>(src[0]);
|
||||
at::vec::Vectorized<dst_t> vec2 = convert_float_to_int8<dst_t>(src[1]);
|
||||
__m128 lane2 = _mm512_castps512_ps128(_mm512_castsi512_ps(vec2));
|
||||
__m512 result = _mm512_insertf32x4(_mm512_castsi512_ps(vec1), lane2, 1); // Insert lane2 into the second 128-bit lane
|
||||
return at::vec::Vectorized<dst_t>(_mm512_castps_si512(result));
|
||||
VectorizedN<float, 1> tmp_fp32 = VecConvert<float, 1, src_t, 1>::apply(src);
|
||||
return VecConvert<dst_t, 1, float, 1>::apply(tmp_fp32);
|
||||
}
|
||||
};
|
||||
|
||||
@ -244,24 +227,6 @@ struct VecConvert<
|
||||
}
|
||||
};
|
||||
|
||||
template <typename src_t>
|
||||
struct VecConvert<
|
||||
float,
|
||||
2,
|
||||
src_t,
|
||||
1,
|
||||
typename std::enable_if_t<is_8bit_integer_v<src_t>,
|
||||
void>> {
|
||||
static inline VectorizedN<float, 2> apply(const VectorizedN<src_t, 1>& src) {
|
||||
__m512i src2 = _mm512_castsi128_si512(
|
||||
_mm_castps_si128(
|
||||
_mm512_extractf32x4_ps(_mm512_castsi512_ps(src[0]), 1) // Extract the second 128-bit lane
|
||||
)
|
||||
);
|
||||
return VectorizedN<float, 2>(convert_int8_to_float<src_t>(src[0]), convert_int8_to_float<src_t>(src2));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename src_t>
|
||||
struct VecConvert<
|
||||
float,
|
||||
|
@ -990,7 +990,7 @@ inline mask_gather(const Vectorized<T>& src, T const* base_addr,
      buffer[i] = src_arr[i];
    }
  }
  mask = Vectorized<T>(static_cast<T>(0)); // "zero out" mask
  mask = Vectorized<T>(); // "zero out" mask
  return Vectorized<T>::loadu(static_cast<void*>(buffer));
}

@ -160,7 +160,7 @@ void CUDAGraph::capture_end() {

  c10::cuda::CUDACachingAllocator::endAllocateToPool(capture_dev_, mempool_id_);

  TORCH_CHECK(graph_ != nullptr, "Invalid capture.");
  TORCH_CHECK(graph_ != NULL, "Invalid capture.");
  has_graph_ = true;

  // In typical graph usage some tensors (e.g. the tensors used for graph IO) are not freed
@ -175,7 +175,7 @@ void CUDAGraph::capture_end() {
  // cudaGraphInstantiateWithFlags
  // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11040)
  int version = 0;
  int version;
  AT_CUDA_CHECK(cudaDriverGetVersion(&version));
  if (version < 11040) {
#endif
@ -203,7 +203,7 @@ void CUDAGraph::capture_end() {
  }

  size_t numCUDAGraphNodes = 0;
  AT_CUDA_CHECK(cudaGraphGetNodes(graph_, nullptr, &numCUDAGraphNodes));
  AT_CUDA_CHECK(cudaGraphGetNodes(graph_, NULL, &numCUDAGraphNodes));
  if (numCUDAGraphNodes == 0) {
    TORCH_WARN("The CUDA Graph is empty. This usually means that the graph was ",
               "attempted to be captured on wrong device or stream.");
@ -233,7 +233,7 @@ void CUDAGraph::replay() {
  // graph_exec_ may be replayed in any stream.
  AT_CUDA_CHECK(cudaGraphLaunch(graph_exec_, at::cuda::getCurrentCUDAStream()));

  int version = 0;
  int version;
  AT_CUDA_CHECK(cudaDriverGetVersion(&version));
  if (version < 11040) {
    // Workaround for bug in libcuda.so that causes replayed graphs with
@ -82,7 +82,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
  // in a capture to run on the same device, but this is a limitation of CUDAGraph,
  // not CUDA itself. We can straightforwardly modify CUDAGraph to support multi-device
  // captures if needed.
  int capture_dev_{};
  int capture_dev_;
};

} // namespace cuda
@ -123,11 +123,6 @@ struct CUDACachingHostAllocatorImpl
    return true;
  }

  bool pinned_use_background_threads() override {
    return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
        pinned_use_background_threads();
  }

  EventPool::Event create_event_internal(DeviceIndex idx) {
    // Leak the event pool to avoid shutdown issue.
    static auto* event_pool = new EventPool();
@ -50,7 +50,7 @@ void radix_sort_keys(
    int64_t begin_bit, \
    int64_t end_bit);

AT_FORALL_SCALAR_TYPES_AND3(Bool, BFloat16, Half, AT_INSTATIATE_CUB_TEMPLATES)
AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, AT_INSTATIATE_CUB_TEMPLATES)
AT_INSTATIATE_CUB_TEMPLATES(uint16_t, UInt16)
AT_INSTATIATE_CUB_TEMPLATES(uint32_t, UInt32)
AT_INSTATIATE_CUB_TEMPLATES(uint64_t, UInt64)
@ -278,7 +278,7 @@ class TunableOp {
};

struct OpParams {
  OpParams() = default;
  OpParams() {}
  virtual ~OpParams() = default;
  virtual std::string Signature() const = 0;
};
@ -104,11 +104,6 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
    FAIL_MTIAHOOKS_FUNC(__func__);
    return nullptr;
  }

  virtual PyObject* getDeviceCapability(DeviceIndex device) const {
    FAIL_MTIAHOOKS_FUNC(__func__);
    return nullptr;
  }
};

struct TORCH_API MTIAHooksArgs {};
@ -230,7 +230,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
|
||||
m.impl("reshape", native::reshape_symint);
|
||||
OP_DECOMPOSE(resolve_conj);
|
||||
OP_DECOMPOSE(resolve_neg);
|
||||
m.impl("rms_norm", native::rms_norm_symint);
|
||||
OP_DECOMPOSE(rms_norm);
|
||||
OP_DECOMPOSE(row_stack);
|
||||
OP_DECOMPOSE(rrelu);
|
||||
OP_DECOMPOSE(rrelu_);
|
||||
|
@ -779,28 +779,6 @@ std::tuple<Tensor, std::optional<int64_t>> scatter_reduce_batch_rule(
|
||||
self, self_bdim, dim, index, index_bdim, src, src_bdim, reduce);
|
||||
}
|
||||
|
||||
std::tuple<Tensor, std::optional<int64_t>> scatter_reduce_two_batch_rule(
|
||||
const Tensor& self, std::optional<int64_t> self_bdim,
|
||||
int64_t dim,
|
||||
const Tensor& index, std::optional<int64_t> index_bdim,
|
||||
const Tensor& src, std::optional<int64_t> src_bdim,
|
||||
const c10::string_view reduce,
|
||||
bool include_self) {
|
||||
return scatter_batch_rule(ATEN_FN2(scatter_reduce, two),
|
||||
self, self_bdim, dim, index, index_bdim, src, src_bdim, reduce, include_self);
|
||||
}
|
||||
|
||||
std::tuple<Tensor, std::optional<int64_t>> scatter_reduce__two_batch_rule(
|
||||
const Tensor& self, std::optional<int64_t> self_bdim,
|
||||
int64_t dim,
|
||||
const Tensor& index, std::optional<int64_t> index_bdim,
|
||||
const Tensor& src, std::optional<int64_t> src_bdim,
|
||||
const c10::string_view reduce,
|
||||
bool include_self) {
|
||||
return scatter_batch_rule(ATEN_FN2(scatter_reduce_, two),
|
||||
self, self_bdim, dim, index, index_bdim, src, src_bdim, reduce, include_self);
|
||||
}
|
||||
|
||||
std::tuple<Tensor, std::optional<int64_t>> scatter_value_reduce_batch_rule(
|
||||
const Tensor& self, std::optional<int64_t> self_bdim,
|
||||
int64_t dim,
|
||||
@ -1272,8 +1250,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
|
||||
VMAP_SUPPORT(scatter_add, scatter_add_batch_rule);
|
||||
VMAP_SUPPORT2(scatter, reduce, scatter_reduce_batch_rule);
|
||||
VMAP_SUPPORT2(scatter, value_reduce, scatter_value_reduce_batch_rule);
|
||||
VMAP_SUPPORT2(scatter_reduce, two, scatter_reduce_two_batch_rule);
|
||||
VMAP_SUPPORT2(scatter_reduce_, two, scatter_reduce__two_batch_rule);
|
||||
// as_strided_scatter does not work with the for-loop fallback today,
|
||||
// because as_strided_scatter will return an output that matches
|
||||
// the strides/storage_offset of its input.
|
||||
|
@ -88,6 +88,7 @@ TORCH_LIBRARY_IMPL(aten, MPS, m) {
|
||||
m.impl("embedding_renorm_", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
|
||||
m.impl("linalg_svd", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
|
||||
m.impl("linalg_svd.U", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
|
||||
m.impl("im2col", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); // Used in preprocessing by nn.Unfold
|
||||
m.impl("col2im", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
|
||||
m.impl("_slow_conv2d_forward", slow_conv2d_forward_mps);
|
||||
m.impl("upsample_nearest3d.vec", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
|
||||
|
@ -28,6 +28,7 @@ MPSStream::MPSStream(Stream stream) : _stream(stream) {
|
||||
_executionDescriptor.enableCommitAndContinue = _enableCommitAndContinue;
|
||||
|
||||
// Choose level which optimizes for GPU
|
||||
[_compilationDescriptor disableTypeInference];
|
||||
_compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0;
|
||||
_executionDescriptor.compilationDescriptor = _compilationDescriptor;
|
||||
}
|
||||
|
@ -297,7 +297,7 @@ TORCH_IMPL_FUNC(adaptive_max_pool3d_out_cpu)
|
||||
int64_t osizeW = output_size[2];
|
||||
|
||||
if (input.ndimension() == 4) {
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf,
|
||||
AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16,
|
||||
input.scalar_type(), "adaptive_max_pool3d_cpu", [&] {
|
||||
auto input_data = input.const_data_ptr<scalar_t>();
|
||||
auto output_data = output.data_ptr<scalar_t>();
|
||||
@ -320,7 +320,7 @@ TORCH_IMPL_FUNC(adaptive_max_pool3d_out_cpu)
|
||||
istrideW);
|
||||
});
|
||||
} else {
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf,
|
||||
AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16,
|
||||
input.scalar_type(), "adaptive_max_pool3d_cpu", [&] {
|
||||
auto input_data = input.const_data_ptr<scalar_t>();
|
||||
auto output_data = output.data_ptr<scalar_t>();
|
||||
@ -390,7 +390,7 @@ TORCH_IMPL_FUNC(adaptive_max_pool3d_backward_out_cpu)
|
||||
|
||||
/* backprop */
|
||||
if (input.ndimension() == 4) {
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf,
|
||||
AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16,
|
||||
input.scalar_type(), "adaptive_max_pool3d_backward", [&] {
|
||||
/* get raw pointers */
|
||||
scalar_t* gradInput_data = gradInput.data_ptr<scalar_t>();
|
||||
@ -410,7 +410,7 @@ TORCH_IMPL_FUNC(adaptive_max_pool3d_backward_out_cpu)
|
||||
osizeW);
|
||||
});
|
||||
} else {
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf,
|
||||
AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16,
|
||||
input.scalar_type(), "adaptive_max_pool3d_backward", [&] {
|
||||
/* get raw pointers */
|
||||
scalar_t* gradInput_data = gradInput.data_ptr<scalar_t>();
|
||||
|
@ -1140,103 +1140,87 @@ REGISTER_AVX512_DISPATCH(cholesky_stub, &cholesky_kernel);
|
||||
REGISTER_AVX2_DISPATCH(cholesky_stub, &cholesky_kernel);
|
||||
REGISTER_VSX_DISPATCH(cholesky_stub, &cholesky_kernel);
|
||||
REGISTER_ZVECTOR_DISPATCH(cholesky_stub, &cholesky_kernel);
|
||||
REGISTER_SVE256_DISPATCH(cholesky_stub, &cholesky_kernel);
|
||||
|
||||
REGISTER_ARCH_DISPATCH(cholesky_inverse_stub, DEFAULT, &cholesky_inverse_kernel_impl);
|
||||
REGISTER_AVX512_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl);
|
||||
REGISTER_AVX2_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl);
|
||||
REGISTER_VSX_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl);
|
||||
REGISTER_ZVECTOR_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl);
|
||||
REGISTER_SVE256_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl);
|
||||
|
||||
REGISTER_ARCH_DISPATCH(linalg_eig_stub, DEFAULT, &linalg_eig_kernel);
|
||||
REGISTER_AVX512_DISPATCH(linalg_eig_stub, &linalg_eig_kernel);
|
||||
REGISTER_AVX2_DISPATCH(linalg_eig_stub, &linalg_eig_kernel);
|
||||
REGISTER_VSX_DISPATCH(linalg_eig_stub, &linalg_eig_kernel);
|
||||
REGISTER_ZVECTOR_DISPATCH(linalg_eig_stub, &linalg_eig_kernel);
|
||||
REGISTER_SVE256_DISPATCH(linalg_eig_stub, &linalg_eig_kernel);
|
||||
|
||||
REGISTER_ARCH_DISPATCH(linalg_eigh_stub, DEFAULT, &linalg_eigh_kernel);
|
||||
REGISTER_AVX512_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel);
|
||||
REGISTER_AVX2_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel);
|
||||
REGISTER_VSX_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel);
|
||||
REGISTER_ZVECTOR_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel);
|
||||
REGISTER_SVE256_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel);
|
||||
|
||||
REGISTER_ARCH_DISPATCH(geqrf_stub, DEFAULT, &geqrf_kernel);
|
||||
REGISTER_AVX512_DISPATCH(geqrf_stub, &geqrf_kernel);
|
||||
REGISTER_AVX2_DISPATCH(geqrf_stub, &geqrf_kernel);
|
||||
REGISTER_VSX_DISPATCH(geqrf_stub, &geqrf_kernel);
|
||||
REGISTER_ZVECTOR_DISPATCH(geqrf_stub, &geqrf_kernel);
|
||||
REGISTER_SVE256_DISPATCH(geqrf_stub, &geqrf_kernel);
|
||||
|
||||
REGISTER_ARCH_DISPATCH(orgqr_stub, DEFAULT, &orgqr_kernel_impl);
|
||||
REGISTER_AVX512_DISPATCH(orgqr_stub, &orgqr_kernel_impl);
|
||||
REGISTER_AVX2_DISPATCH(orgqr_stub, &orgqr_kernel_impl);
|
||||
REGISTER_VSX_DISPATCH(orgqr_stub, &orgqr_kernel_impl);
|
||||
REGISTER_ZVECTOR_DISPATCH(orgqr_stub, &orgqr_kernel_impl);
|
||||
REGISTER_SVE256_DISPATCH(orgqr_stub, &orgqr_kernel_impl);
|
||||
|
||||
REGISTER_ARCH_DISPATCH(ormqr_stub, DEFAULT, &ormqr_kernel);
|
||||
REGISTER_AVX512_DISPATCH(ormqr_stub, &ormqr_kernel);
|
||||
REGISTER_AVX2_DISPATCH(ormqr_stub, &ormqr_kernel);
|
||||
REGISTER_VSX_DISPATCH(ormqr_stub, &ormqr_kernel);
|
||||
REGISTER_ZVECTOR_DISPATCH(ormqr_stub, &ormqr_kernel);
|
||||
REGISTER_SVE256_DISPATCH(ormqr_stub, &ormqr_kernel);
|
||||
|
||||
REGISTER_ARCH_DISPATCH(lstsq_stub, DEFAULT, &lstsq_kernel);
|
||||
REGISTER_AVX512_DISPATCH(lstsq_stub, &lstsq_kernel);
|
||||
REGISTER_AVX2_DISPATCH(lstsq_stub, &lstsq_kernel);
|
||||
REGISTER_VSX_DISPATCH(lstsq_stub, &lstsq_kernel);
|
||||
REGISTER_ZVECTOR_DISPATCH(lstsq_stub, &lstsq_kernel);
|
||||
REGISTER_SVE256_DISPATCH(lstsq_stub, &lstsq_kernel);
|
||||
|
||||
REGISTER_ARCH_DISPATCH(triangular_solve_stub, DEFAULT, &triangular_solve_kernel);
|
||||
REGISTER_AVX512_DISPATCH(triangular_solve_stub, &triangular_solve_kernel);
|
||||
REGISTER_AVX2_DISPATCH(triangular_solve_stub, &triangular_solve_kernel);
|
||||
REGISTER_VSX_DISPATCH(triangular_solve_stub, &triangular_solve_kernel);
|
||||
REGISTER_ZVECTOR_DISPATCH(triangular_solve_stub, &triangular_solve_kernel);
|
||||
REGISTER_SVE256_DISPATCH(triangular_solve_stub, &triangular_solve_kernel);
|
||||
|
||||
REGISTER_ARCH_DISPATCH(lu_factor_stub, DEFAULT, &lu_factor_kernel);
|
||||
REGISTER_AVX512_DISPATCH(lu_factor_stub, &lu_factor_kernel);
|
||||
REGISTER_AVX2_DISPATCH(lu_factor_stub, &lu_factor_kernel);
|
||||
REGISTER_VSX_DISPATCH(lu_factor_stub, &lu_factor_kernel);
|
||||
REGISTER_ZVECTOR_DISPATCH(lu_factor_stub, &lu_factor_kernel);
|
||||
REGISTER_SVE256_DISPATCH(lu_factor_stub, &lu_factor_kernel);
|
||||
|
||||
REGISTER_ARCH_DISPATCH(ldl_factor_stub, DEFAULT, &ldl_factor_kernel);
|
||||
REGISTER_AVX512_DISPATCH(ldl_factor_stub, &ldl_factor_kernel);
|
||||
REGISTER_AVX2_DISPATCH(ldl_factor_stub, &ldl_factor_kernel);
|
||||
REGISTER_VSX_DISPATCH(ldl_factor_stub, &ldl_factor_kernel);
|
||||
REGISTER_ZVECTOR_DISPATCH(ldl_factor_stub, &ldl_factor_kernel);
|
||||
REGISTER_SVE256_DISPATCH(ldl_factor_stub, &ldl_factor_kernel);
|
||||
|
||||
REGISTER_ARCH_DISPATCH(ldl_solve_stub, DEFAULT, &ldl_solve_kernel);
|
||||
REGISTER_AVX512_DISPATCH(ldl_solve_stub, &ldl_solve_kernel);
|
||||
REGISTER_AVX2_DISPATCH(ldl_solve_stub, &ldl_solve_kernel);
|
||||
REGISTER_VSX_DISPATCH(ldl_solve_stub, &ldl_solve_kernel);
|
||||
REGISTER_ZVECTOR_DISPATCH(ldl_solve_stub, &ldl_solve_kernel);
|
||||
REGISTER_SVE256_DISPATCH(ldl_solve_stub, &ldl_solve_kernel);
|
||||
|
||||
REGISTER_ARCH_DISPATCH(lu_solve_stub, DEFAULT, &lu_solve_kernel);
|
||||
REGISTER_AVX512_DISPATCH(lu_solve_stub, &lu_solve_kernel);
|
||||
REGISTER_AVX2_DISPATCH(lu_solve_stub, &lu_solve_kernel);
|
||||
REGISTER_VSX_DISPATCH(lu_solve_stub, &lu_solve_kernel);
|
||||
REGISTER_ZVECTOR_DISPATCH(lu_solve_stub, &lu_solve_kernel);
|
||||
REGISTER_SVE256_DISPATCH(lu_solve_stub, &lu_solve_kernel);
|
||||
|
||||
REGISTER_ARCH_DISPATCH(svd_stub, DEFAULT, &svd_kernel);
|
||||
REGISTER_AVX512_DISPATCH(svd_stub, &svd_kernel);
|
||||
REGISTER_AVX2_DISPATCH(svd_stub, &svd_kernel);
|
||||
REGISTER_VSX_DISPATCH(svd_stub, &svd_kernel);
|
||||
REGISTER_ZVECTOR_DISPATCH(svd_stub, &svd_kernel);
|
||||
REGISTER_SVE256_DISPATCH(svd_stub, &svd_kernel);
|
||||
|
||||
REGISTER_ARCH_DISPATCH(unpack_pivots_stub, DEFAULT, &unpack_pivots_cpu_kernel);
|
||||
REGISTER_AVX512_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel);
|
||||
REGISTER_AVX2_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel);
|
||||
REGISTER_VSX_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel);
|
||||
REGISTER_ZVECTOR_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel);
|
||||
REGISTER_SVE256_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel);
|
||||
} // namespace at::native
|
||||
|
@ -1663,7 +1663,13 @@ at::Tensor _convolution(
|
||||
break;
|
||||
case ConvBackend::Mps:
|
||||
#ifdef USE_MPS
|
||||
check_input_same_type_as_parameters(input, weight, bias);
|
||||
TORCH_CHECK(input.options().type_equal(weight.options()),
|
||||
"Input type (", input.toString(), ") and weight type (", weight.toString(),
|
||||
") should be the same");
|
||||
TORCH_CHECK(!bias.defined() || (input.options().type_equal(bias.options())),
|
||||
"Input type (", input.toString(), ") and bias type (", bias.toString(),
|
||||
") should be the same");
|
||||
|
||||
output = at::_mps_convolution(input, weight, bias.defined() ? bias.contiguous() : bias,
|
||||
params.padding, params.stride, params.dilation,
|
||||
params.groups);
|
||||
@ -1673,7 +1679,12 @@ at::Tensor _convolution(
|
||||
break;
|
||||
case ConvBackend::MpsTranspose:
|
||||
#ifdef USE_MPS
|
||||
check_input_same_type_as_parameters(input, weight, bias);
|
||||
TORCH_CHECK(input.options().type_equal(weight.options()),
|
||||
"Input type (", input.toString(), ") and weight type (", weight.toString(),
|
||||
") should be the same");
|
||||
TORCH_CHECK(!bias.defined() || (input.options().type_equal(bias.options())),
|
||||
"Input type (", input.toString(), ") and bias type (", bias.toString(),
|
||||
") should be the same");
|
||||
output = at::_mps_convolution_transpose(
|
||||
input.contiguous(backend_memory_format), weight,
|
||||
params.padding, params.output_padding,
|
||||
|
@ -34,17 +34,6 @@ static CPUCapability compute_cpu_capability() {
|
||||
if (strcmp(envar, "zvector") == 0) {
|
||||
return CPUCapability::ZVECTOR;
|
||||
}
|
||||
#elif defined(HAVE_SVE_CPU_DEFINITION)
|
||||
int sve_vl = cpuinfo_get_max_arm_sve_length(); //Returns maximum SVE VL supported by your HW.
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
if (strcmp(envar, "sve256") == 0) {
|
||||
if (sve_vl == 256) {
|
||||
return CPUCapability::SVE256;
|
||||
}
|
||||
TORCH_WARN("SVE256 capability not available on hardware. Falling back to DEFAULT");
|
||||
return CPUCapability::DEFAULT;
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
#ifdef HAVE_AVX512_CPU_DEFINITION
|
||||
if (strcmp(envar, "avx512") == 0) {
|
||||
@ -63,7 +52,7 @@ static CPUCapability compute_cpu_capability() {
|
||||
TORCH_WARN("ignoring invalid value for ATEN_CPU_CAPABILITY: ", envar);
|
||||
}
|
||||
|
||||
#if !defined(__powerpc__) && !defined(__s390x__) && !defined(HAVE_SVE_CPU_DEFINITION)
|
||||
#if !defined(__powerpc__) && !defined(__s390x__)
|
||||
if (cpuinfo_initialize()) {
|
||||
#if defined(HAVE_AVX512_CPU_DEFINITION)
|
||||
// GCC supports some AVX512 intrinsics such as _mm512_set_epi16 only in
|
||||
@ -90,23 +79,6 @@ static CPUCapability compute_cpu_capability() {
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__linux__) && defined(HAVE_SVE_CPU_DEFINITION)
|
||||
if (cpuinfo_initialize() && cpuinfo_has_arm_sve()) {
|
||||
int sve_vl = cpuinfo_get_max_arm_sve_length(); //Returns maximum SVE VL supported by your HW.
|
||||
if (sve_vl <= 0) {
|
||||
// SVE is not supported on this system.
|
||||
// Return the default CPU capability.
|
||||
return CPUCapability::DEFAULT;
|
||||
}
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
if (sve_vl == 256) { // Check for SVE256
|
||||
return CPUCapability::SVE256;
|
||||
}
|
||||
#endif
|
||||
// Return the default CPU capability.
|
||||
return CPUCapability::DEFAULT;
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_VSX_CPU_DEFINITION
|
||||
return CPUCapability::VSX;
|
||||
#else
|
||||
@ -134,9 +106,6 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
|
||||
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
|
||||
, void *ZVECTOR
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
) {
|
||||
constexpr auto supported_devices = c10::array_of<c10::DeviceType>(
|
||||
c10::DeviceType::CPU,
|
||||
@ -170,9 +139,6 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
|
||||
#endif
|
||||
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
|
||||
, ZVECTOR
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, SVE256
|
||||
#endif
|
||||
);
|
||||
if (!std::holds_alternative<ErrorType>(result)) {
|
||||
@ -225,9 +191,6 @@ void* DispatchStubImpl::get_call_ptr(
|
||||
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
|
||||
, void *ZVECTOR
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
) {
|
||||
|
||||
auto result = try_get_call_ptr(
|
||||
@ -248,10 +211,6 @@ void* DispatchStubImpl::get_call_ptr(
|
||||
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
|
||||
,
|
||||
ZVECTOR
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
,
|
||||
SVE256
|
||||
#endif
|
||||
);
|
||||
if (std::holds_alternative<ErrorType>(result)) {
|
||||
@ -283,9 +242,6 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
|
||||
#endif
|
||||
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
|
||||
, void *ZVECTOR
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
){
|
||||
|
||||
@ -318,16 +274,6 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
|
||||
if (capability >= static_cast<int>(CPUCapability::ZVECTOR)) {
|
||||
return ZVECTOR != nullptr ? DispatchResult(ZVECTOR) : ErrorType::MissingDeviceKernel;
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
if (capability >= static_cast<int>(CPUCapability::SVE256)) {
|
||||
if (C10_UNLIKELY(!SVE256)) {
|
||||
// dispatch to DEFAULT, since the SVE kernel is missing
|
||||
return DEFAULT != nullptr ? DispatchResult(DEFAULT) : ErrorType::MissingDeviceKernel;
|
||||
} else {
|
||||
return DispatchResult(SVE256);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return DEFAULT != nullptr ? DispatchResult(DEFAULT) : ErrorType::MissingDeviceKernel;
|
||||
}
|
||||
@ -346,9 +292,6 @@ void* DispatchStubImpl::choose_cpu_impl(
|
||||
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
|
||||
, void *ZVECTOR
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
) {
|
||||
auto capability = static_cast<int>(get_cpu_capability());
|
||||
(void)capability;
|
||||
@ -383,17 +326,6 @@ void* DispatchStubImpl::choose_cpu_impl(
|
||||
TORCH_INTERNAL_ASSERT(ZVECTOR, "DispatchStub: missing ZVECTOR kernel");
|
||||
return ZVECTOR;
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
if (capability >= static_cast<int>(CPUCapability::SVE256)) {
|
||||
if (C10_UNLIKELY(!SVE256)) {
|
||||
// dispatch to DEFAULT, since the SVE kernel is missing
|
||||
TORCH_INTERNAL_ASSERT(DEFAULT, "DispatchStub: missing default kernel");
|
||||
return DEFAULT;
|
||||
} else {
|
||||
return SVE256;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
TORCH_INTERNAL_ASSERT(DEFAULT, "DispatchStub: missing default kernel");
|
||||
return DEFAULT;
|
||||
|
@ -64,8 +64,6 @@ enum class CPUCapability {
|
||||
VSX = 1,
|
||||
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
|
||||
ZVECTOR = 1,
|
||||
#elif defined(HAVE_SVE_CPU_DEFINITION)
|
||||
SVE256 = 1,
|
||||
#else
|
||||
AVX2 = 1,
|
||||
AVX512 = 2,
|
||||
@ -114,9 +112,6 @@ struct TORCH_API DispatchStubImpl {
|
||||
#endif
|
||||
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
|
||||
, void *ZVECTOR
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
);
|
||||
|
||||
@ -135,9 +130,6 @@ struct TORCH_API DispatchStubImpl {
|
||||
#endif
|
||||
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
|
||||
, void *ZVECTOR
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
);
|
||||
|
||||
@ -156,9 +148,6 @@ struct TORCH_API DispatchStubImpl {
|
||||
#endif
|
||||
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
|
||||
, void *ZVECTOR
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
);
|
||||
|
||||
@ -180,9 +169,6 @@ struct TORCH_API DispatchStubImpl {
|
||||
#endif
|
||||
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
|
||||
, void *ZVECTOR
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
);
|
||||
|
||||
@ -235,9 +221,6 @@ private:
|
||||
#endif
|
||||
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
|
||||
, reinterpret_cast<void*>(ZVECTOR)
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, reinterpret_cast<void*>(SVE256)
|
||||
#endif
|
||||
)
|
||||
);
|
||||
@ -292,9 +275,6 @@ public:
|
||||
#endif
|
||||
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
|
||||
, reinterpret_cast<void*>(ZVECTOR)
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, reinterpret_cast<void*>(SVE256)
|
||||
#endif
|
||||
);
|
||||
if (std::holds_alternative<ErrorType>(result)){
|
||||
@ -316,9 +296,6 @@ public:
|
||||
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
|
||||
static TORCH_API FnPtr ZVECTOR;
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
static TORCH_API FnPtr SVE256;
|
||||
#endif
|
||||
private:
|
||||
DispatchStubImpl impl;
|
||||
};
|
||||
@ -410,12 +387,6 @@ struct RegisterPRIVATEUSE1Dispatch {
|
||||
#define REGISTER_ZVECTOR_DISPATCH(name, fn)
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
#define REGISTER_SVE256_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, SVE256, fn)
|
||||
#else
|
||||
#define REGISTER_SVE256_DISPATCH(name, fn)
|
||||
#endif
|
||||
|
||||
// Macro to register the same kernel for all CPU arch types. This is useful
|
||||
// if a kernel does not benefit from being recompiled across different arch types.
|
||||
#define REGISTER_ALL_CPU_DISPATCH(name, fn) \
|
||||
@ -423,8 +394,7 @@ struct RegisterPRIVATEUSE1Dispatch {
|
||||
REGISTER_AVX512_DISPATCH(name, fn) \
|
||||
REGISTER_AVX2_DISPATCH(name, fn) \
|
||||
REGISTER_VSX_DISPATCH(name, fn) \
|
||||
REGISTER_ZVECTOR_DISPATCH(name, fn) \
|
||||
REGISTER_SVE256_DISPATCH(name, fn)
|
||||
REGISTER_ZVECTOR_DISPATCH(name, fn)
|
||||
|
||||
#define REGISTER_NO_CPU_DISPATCH(name) \
|
||||
REGISTER_ALL_CPU_DISPATCH(name, nullptr)
|
||||
@ -462,14 +432,12 @@ struct RegisterPRIVATEUSE1Dispatch {
|
||||
#elif defined(CPU_CAPABILITY)
|
||||
// REGISTER_DISPATCH now dispatches an AVX512 kernel to nullptr but registers other dispatches.
|
||||
// ALSO_REGISTER_AVX512_DISPATCH should be used for ensuring AVX512 dispatch, among others.
|
||||
// ALSO_REGISTER_SVE256_DISPATCH should be used for ensuring SVE256 dispatch, among others.
|
||||
#ifdef CPU_CAPABILITY_AVX512
|
||||
#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, ((void*)(fn) ? nullptr : nullptr))
|
||||
#else
|
||||
#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
|
||||
#endif
|
||||
#define ALSO_REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
|
||||
#define ALSO_REGISTER_SVE256_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
|
||||
#endif
|
||||
} // namespace at::native
|
||||
|
||||
|
@ -128,26 +128,10 @@ inline bool _check_tensors_share_device_and_dtype(
|
||||
// corresponding tensors in tensor lists have the same sizes and strides.
|
||||
inline bool _check_tensors_share_sizes_and_strides(
|
||||
ArrayRef<TensorList> tensorLists) {
|
||||
auto is_diff_stride = [](const IntArrayRef& size,
|
||||
const IntArrayRef& left_stride,
|
||||
const IntArrayRef& right_stride) -> bool {
|
||||
const size_t size_size = size.size();
|
||||
for (const auto dim : c10::irange(size_size)) {
|
||||
if (size[dim] == 1)
|
||||
continue;
|
||||
if (left_stride[dim] != right_stride[dim]) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
for (const auto i : c10::irange(1, tensorLists.size())) {
|
||||
for (const auto j : c10::irange(tensorLists[0].size())) {
|
||||
if (tensorLists[0][j].sizes() != tensorLists[i][j].sizes() ||
|
||||
is_diff_stride(
|
||||
tensorLists[0][j].sizes(),
|
||||
tensorLists[0][j].strides(),
|
||||
tensorLists[i][j].strides())) {
|
||||
tensorLists[0][j].strides() != tensorLists[i][j].strides()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -1366,6 +1366,7 @@ TORCH_IMPL_FUNC(mean_out)
|
||||
dim_prod *= self.size(d);
|
||||
}
|
||||
}
|
||||
auto& result_mut = const_cast<Tensor&>(result);
|
||||
// For accuracy reasons, BF16/FP16 mean should be computed via the
|
||||
// following approach:
|
||||
// cast_fp32 -> sum -> div -> cast_bf16_or_fp16
|
||||
@ -1377,7 +1378,7 @@ TORCH_IMPL_FUNC(mean_out)
|
||||
// which, in turn, does not produce as accurate results.
|
||||
bool is_half_type = (dtype == kHalf || dtype == kBFloat16);
|
||||
auto sum_out_dtype = is_half_type ? ScalarType::Float : dtype;
|
||||
auto result_temp = is_half_type ? result.to(sum_out_dtype) : result;
|
||||
result_mut = is_half_type ? result_mut.to(sum_out_dtype) : result_mut;
|
||||
// If dtype is FP16 or BF16, self (input tensor) will initially be cast to
|
||||
// FP32 in sum_out. This results in having to read that FP32 tensor again,
|
||||
// but maybe in the future, we could revise the implementation to not
|
||||
@ -1385,14 +1386,9 @@ TORCH_IMPL_FUNC(mean_out)
|
||||
// require some modifications in binary_kernel_reduce_vec(),
|
||||
// TensorIteratorBase::for_each(), and
|
||||
// TensorIteratorBase::serial_for_each(), apart from sum kernel for CPU.
|
||||
at::sum_out(result_temp, self, opt_dim, keepdim, sum_out_dtype).div_(dim_prod);
|
||||
// After sum & div, cast result_temp back to BF16 or FP16, if required.
|
||||
// It cannot be avoided copy_() if we promotion the out of sum op, because of
|
||||
// the result needs to be update and the storage of result tensor cannot be reused
|
||||
// by sum op. We do not need explicit call to(dtype) func as copy_() do it.
|
||||
if (is_half_type) {
|
||||
result.copy_(result_temp);
|
||||
}
|
||||
at::sum_out(result_mut, self, opt_dim, keepdim, sum_out_dtype).div_(dim_prod);
|
||||
// After sum & div, cast result_mut back to BF16 or FP16, if required.
|
||||
result_mut = is_half_type ? result_mut.to(dtype) : result_mut;
|
||||
} else {
|
||||
// device is not CPU
|
||||
auto iter = at::meta::make_reduction_from_out_ty(
|
||||
|
@ -466,7 +466,6 @@ REGISTER_AVX2_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cp
|
||||
REGISTER_AVX512_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel);
|
||||
REGISTER_VSX_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel);
|
||||
REGISTER_ZVECTOR_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel);
|
||||
REGISTER_SVE256_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel);
|
||||
|
||||
// offsets dispatches
|
||||
REGISTER_ARCH_DISPATCH(
|
||||
@ -477,7 +476,6 @@ REGISTER_AVX2_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cp
|
||||
REGISTER_AVX512_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel);
|
||||
REGISTER_VSX_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel);
|
||||
REGISTER_ZVECTOR_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel);
|
||||
REGISTER_SVE256_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel);
|
||||
|
||||
// Currently some computation is being duplicated across forward and backward.
|
||||
// TODO: Cache indices in forward pass to re-use in backward
|
||||
@ -548,9 +546,6 @@ REGISTER_VSX_DISPATCH(
|
||||
REGISTER_ZVECTOR_DISPATCH(
|
||||
_segment_reduce_lengths_backward_stub,
|
||||
&_segment_reduce_cpu_lengths_backward_kernel);
|
||||
REGISTER_SVE256_DISPATCH(
|
||||
_segment_reduce_lengths_backward_stub,
|
||||
&_segment_reduce_cpu_lengths_backward_kernel);
|
||||
|
||||
REGISTER_ARCH_DISPATCH(
|
||||
_segment_reduce_offsets_backward_stub,
|
||||
@ -568,8 +563,5 @@ REGISTER_VSX_DISPATCH(
|
||||
REGISTER_ZVECTOR_DISPATCH(
|
||||
_segment_reduce_offsets_backward_stub,
|
||||
&_segment_reduce_cpu_offsets_backward_kernel);
|
||||
REGISTER_SVE256_DISPATCH(
|
||||
_segment_reduce_offsets_backward_stub,
|
||||
&_segment_reduce_cpu_offsets_backward_kernel);
|
||||
|
||||
} // namespace at::native
|
||||
|
@ -82,6 +82,7 @@ namespace at::meta {
|
||||
static inline void check_for_unsupported_isin_dtype(const ScalarType type) {
|
||||
// Bail out for dtypes unsupported by the sorting algorithm to keep the interface consistent.
|
||||
TORCH_CHECK(type != ScalarType::Bool &&
|
||||
type != ScalarType::BFloat16 &&
|
||||
type != ScalarType::ComplexFloat &&
|
||||
type != ScalarType::ComplexDouble,
|
||||
"Unsupported input type encountered for isin(): ", type);
|
||||
|
@ -772,6 +772,9 @@ inline SymDimVector compute_strides_for_view_dtype_upsize(SymIntArrayRef old_str
|
||||
}
|
||||
|
||||
Tensor view_dtype(const Tensor& self, ScalarType dtype) {
|
||||
if (self.scalar_type() == dtype) {
|
||||
return self;
|
||||
}
|
||||
const auto type_meta = c10::scalarTypeToTypeMeta(dtype);
|
||||
TORCH_CHECK(!self.is_conj(),
|
||||
"torch.Tensor.view is not supported for conjugate view tensors when converting to a different dtype.");
|
||||
|
@ -341,8 +341,8 @@ void gemm_notrans_(
|
||||
at::Half* c,
|
||||
int64_t ldc) {
|
||||
// c += alpha * (a @ b)
|
||||
if (n == 1 && beta == 0.0 && alpha == 1.0) {
|
||||
at::native::blas_impl::fp16_gemv_notrans(m, k, 1.0, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(b), 1, 0.0, reinterpret_cast<float16_t*>(c), 1);
|
||||
if (n == 1 && beta == 0.0) {
|
||||
at::native::blas_impl::fp16_gemv_notrans(m, k, alpha, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(b), 1, beta, reinterpret_cast<float16_t*>(c), 1);
|
||||
return;
|
||||
}
|
||||
for (const auto i : c10::irange(m)) {
|
||||
@ -388,8 +388,8 @@ void gemm_transa_(
|
||||
float beta,
|
||||
at::Half *c, int64_t ldc) {
|
||||
// c = alpha * (a.T @ b) + beta * c
|
||||
if (n == 1 && beta == 0.0 && alpha == 1.0) {
|
||||
at::native::blas_impl::fp16_gemv_trans(k, m, 1.0, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(b), 1, 0.0, reinterpret_cast<float16_t*>(c), 1);
|
||||
if (n == 1 && beta == 0.0) {
|
||||
at::native::blas_impl::fp16_gemv_trans(k, m, alpha, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(b), 1, beta, reinterpret_cast<float16_t*>(c), 1);
|
||||
return;
|
||||
}
|
||||
parallel_for(0, m, 1, [&](int64_t begin, int64_t end) {
|
||||
|
@ -23,7 +23,7 @@ namespace {
|
||||
// out = val * a + b
|
||||
// is_b_stride_zero: If the stride of b is 0 (mask broadcasting case),
|
||||
// take b as a scalar pointer.
|
||||
#if __GNUC__ == 11 && defined(__ARM_FEATURE_SVE)
|
||||
#if __GNUC__ == 11 && __GNUC_MINOR__ >= 4 && defined(__ARM_FEATURE_SVE)
|
||||
template <typename T1, typename T2>
|
||||
inline void _scale_attn_mask_fusion_kernel(
|
||||
T1* a,
|
||||
@ -51,7 +51,7 @@ inline void _scale_attn_mask_fusion_kernel(
|
||||
for (; i < size - (size % vec_size2); i += vec_size2) {
|
||||
auto a_n = at::vec::VectorizedN<T1, T1_n>::loadu(a + i);
|
||||
at::vec::VectorizedN<T2, T2_n> b_n;
|
||||
#if __GNUC__ == 11 && defined(__ARM_FEATURE_SVE)
|
||||
#if __GNUC__ == 11 && __GNUC_MINOR__ >= 4 && defined(__ARM_FEATURE_SVE)
|
||||
if (is_b_stride_zero) {
|
||||
#else
|
||||
if constexpr(is_b_stride_zero) {
|
||||
@ -67,7 +67,7 @@ inline void _scale_attn_mask_fusion_kernel(
|
||||
for (; i < size; i++) {
|
||||
auto tmp0 = a[i];
|
||||
T1 tmp1;
|
||||
#if __GNUC__ == 11 && defined(__ARM_FEATURE_SVE)
|
||||
#if __GNUC__ == 11 && __GNUC_MINOR__ >= 4 && defined(__ARM_FEATURE_SVE)
|
||||
if (is_b_stride_zero) {
|
||||
#else
|
||||
if constexpr(is_b_stride_zero) {
|
||||
@ -473,7 +473,8 @@ void cpu_flash_attention(
|
||||
scalar_t* transpose_buffer_ptr = transpose_buffer.get();
|
||||
std::unique_ptr<scalar_t[]> v_copy_buffer = std::make_unique<scalar_t[]>(ekvSplitSize * packb_size);
|
||||
scalar_t* v_copy_buffer_ptr = v_copy_buffer.get();
|
||||
for (C10_UNUSED auto z : c10::irange(begin, end)) {
|
||||
for (const auto z : c10::irange(begin, end)) {
|
||||
(void)z; // Suppress unused variable
|
||||
n = l * kvSplitSize;
|
||||
int64_t kvBlockSize = std::min(kvSplitSize, kvSize - n);
|
||||
int64_t ekvBlockSize = kvBlockSize % 2 == 0 ? kvBlockSize : kvBlockSize + 1;
|
||||
@ -566,7 +567,8 @@ void cpu_flash_attention(
|
||||
? query_padding_ptr + ompIdx * qSplitSize * eheadSize
|
||||
: nullptr;
|
||||
|
||||
for (C10_UNUSED auto z : c10::irange(begin, end)) {
|
||||
for (const auto z : c10::irange(begin, end)) {
|
||||
(void)z; // Suppress unused variable
|
||||
int64_t m = k * qSplitSize;
|
||||
int64_t qBlockSize = std::min(qSplitSize, qSize - m);
|
||||
// Initialize max and sum
|
||||
@ -646,7 +648,7 @@ void cpu_flash_attention(
|
||||
// qk <- qk * scaling + attn_mask
|
||||
if (has_attn_mask) {
|
||||
for (int64_t row = 0; row < qBlockSize; ++row) {
|
||||
#if __GNUC__ == 11 && defined(__ARM_FEATURE_SVE)
|
||||
#if __GNUC__ == 11 && __GNUC_MINOR__ >= 4 && defined(__ARM_FEATURE_SVE)
|
||||
_scale_attn_mask_fusion_kernel(
|
||||
qk_data + row * rkvBlockSize,
|
||||
mask_data + i * mStrideB + j * mStrideH +
|
||||
@ -931,7 +933,8 @@ void cpu_flash_attention_backward(
|
||||
|
||||
at::Tensor dsum = at::empty({qSplitSize}, query.options().dtype(accumulate_dtype));
|
||||
accum_t* dsum_data = dsum.data_ptr<accum_t>();
|
||||
for (C10_UNUSED auto z : c10::irange(begin, end)) {
|
||||
for (const auto z : c10::irange(begin, end)) {
|
||||
(void)z; // Suppress unused variable
|
||||
// rowsum of grad_out * out
|
||||
for (int64_t m = 0; m < qSize; m += qSplitSize) {
|
||||
int64_t qBlockSize = std::min(qSplitSize, qSize - m);
|
||||
@ -968,7 +971,7 @@ void cpu_flash_attention_backward(
|
||||
if (has_attn_mask) {
|
||||
accum_t one = accum_t(1);
|
||||
for (const auto row : c10::irange(qBlockSize)) {
|
||||
#if __GNUC__ == 11 && defined(__ARM_FEATURE_SVE)
|
||||
#if __GNUC__ == 11 && __GNUC_MINOR__ >= 4 && defined(__ARM_FEATURE_SVE)
|
||||
_scale_attn_mask_fusion_kernel(
|
||||
attn_data + row * kvBlockSize,
|
||||
mask_data + i * mStrideB + j * mStrideH +
|
||||
|
@ -19,7 +19,7 @@ Vectorized<scalar_t> is_lerp_weight_small(Vectorized<scalar_t> weight) {
|
||||
// is_lerp_weight_small doesn't work for complex because z.abs() returns a
|
||||
// complex vector which can't be compared. Either implement it with z.abs_2_(),
|
||||
// or fallback to the scalar function.
|
||||
#if !(defined(CPU_CAPABILITY_DEFAULT) || defined(_MSC_VER) || defined(CPU_CAPABILITY_SVE))
|
||||
#if !(defined(CPU_CAPABILITY_DEFAULT) || defined(_MSC_VER))
|
||||
template <typename value_t>
|
||||
Vectorized<c10::complex<value_t>> is_lerp_weight_small(Vectorized<c10::complex<value_t>> weight) {
|
||||
using vec_reg_t = decltype(weight.abs_2_());
|
||||
|
@ -486,7 +486,7 @@ void reflection_pad1d_kernel_impl(const Tensor& output, const Tensor& input, Int
|
||||
cpu_padding<scalar_t, ReflectionPad>(output, input, param);
|
||||
});
|
||||
} else {
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, input.scalar_type(),
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, input.scalar_type(),
|
||||
"reflection_pad1d", [&] {
|
||||
cpu_padding<scalar_t, ReflectionPad>(output, input, param);
|
||||
});
|
||||
@ -496,7 +496,7 @@ void reflection_pad1d_kernel_impl(const Tensor& output, const Tensor& input, Int
|
||||
void reflection_pad1d_backward_kernel_impl(
|
||||
const Tensor& grad_input, const Tensor& grad_output, IntArrayRef padding) {
|
||||
PaddingParams param{grad_input, grad_output, padding};
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(),
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, grad_output.scalar_type(),
|
||||
"reflection_pad1d_backward", [&] {
|
||||
cpu_padding_backward<scalar_t, ReflectionPad>(grad_input, grad_output, param);
|
||||
});
|
||||
@ -513,14 +513,14 @@ void reflection_pad2d_kernel_impl(const Tensor& output, const Tensor& input, Int
|
||||
} else {
|
||||
switch (input.suggest_memory_format()) {
|
||||
case at::MemoryFormat::Contiguous: {
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, input.scalar_type(),
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, input.scalar_type(),
|
||||
"reflection_pad2d", [&] {
|
||||
cpu_padding<scalar_t, ReflectionPad>(output, input, param);
|
||||
});
|
||||
break;
|
||||
}
|
||||
case at::MemoryFormat::ChannelsLast: {
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, input.scalar_type(),
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, input.scalar_type(),
|
||||
"reflection_pad2d_channels_last", [&]{
|
||||
cpu_padding_channels_last<scalar_t, ReflectionPad>(output, input, param);
|
||||
});
|
||||
@ -537,14 +537,14 @@ void reflection_pad2d_backward_kernel_impl(
|
||||
PaddingParams param{grad_input, grad_output, padding};
|
||||
switch (grad_output.suggest_memory_format()) {
|
||||
case at::MemoryFormat::Contiguous: {
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(),
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, grad_output.scalar_type(),
|
||||
"reflection_pad2d_backward", [&] {
|
||||
cpu_padding_backward<scalar_t, ReflectionPad>(grad_input, grad_output, param);
|
||||
});
|
||||
break;
|
||||
}
|
||||
case at::MemoryFormat::ChannelsLast: {
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(),
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, grad_output.scalar_type(),
|
||||
"reflection_pad2d_backward_channels_last", [&]{
|
||||
cpu_padding_backward_channels_last<scalar_t, ReflectionPad>(grad_input, grad_output, param);
|
||||
});
|
||||
@ -603,7 +603,7 @@ void reflection_pad3d_backward_kernel_impl(
|
||||
// replication padding
|
||||
void replication_pad1d_kernel_impl(const Tensor& output, const Tensor& input, IntArrayRef padding) {
|
||||
PaddingParams param{input, output, padding};
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf,input.scalar_type(),
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, input.scalar_type(),
|
||||
"replication_pad1d", [&] {
|
||||
cpu_padding<scalar_t, ReplicationPad>(output, input, param);
|
||||
});
|
||||
@ -612,7 +612,7 @@ void replication_pad1d_kernel_impl(const Tensor& output, const Tensor& input, In
|
||||
void replication_pad1d_backward_kernel_impl(
|
||||
const Tensor& grad_input, const Tensor& grad_output, IntArrayRef padding) {
|
||||
PaddingParams param{grad_input, grad_output, padding};
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(),
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, grad_output.scalar_type(),
|
||||
"replication_pad1d_backward", [&] {
|
||||
cpu_padding_backward<scalar_t, ReplicationPad>(grad_input, grad_output, param);
|
||||
});
|
||||
@ -622,14 +622,14 @@ void replication_pad2d_kernel_impl(const Tensor& output, const Tensor& input, In
|
||||
PaddingParams param{input, output, padding};
|
||||
switch (input.suggest_memory_format()) {
|
||||
case at::MemoryFormat::Contiguous: {
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, input.scalar_type(),
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, input.scalar_type(),
|
||||
"replication_pad2d", [&] {
|
||||
cpu_padding<scalar_t, ReplicationPad>(output, input, param);
|
||||
});
|
||||
break;
|
||||
}
|
||||
case at::MemoryFormat::ChannelsLast: {
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, input.scalar_type(),
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, input.scalar_type(),
|
||||
"replication_pad2d_channels_last", [&]{
|
||||
cpu_padding_channels_last<scalar_t, ReplicationPad>(output, input, param);
|
||||
});
|
||||
@ -645,14 +645,14 @@ void replication_pad2d_backward_kernel_impl(
|
||||
PaddingParams param{grad_input, grad_output, padding};
|
||||
switch (grad_output.suggest_memory_format()) {
|
||||
case at::MemoryFormat::Contiguous: {
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(),
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, grad_output.scalar_type(),
|
||||
"replication_pad2d_backward", [&] {
|
||||
cpu_padding_backward<scalar_t, ReplicationPad>(grad_input, grad_output, param);
|
||||
});
|
||||
break;
|
||||
}
|
||||
case at::MemoryFormat::ChannelsLast: {
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(),
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, grad_output.scalar_type(),
|
||||
"replication_pad2d_backward_channels_last", [&]{
|
||||
cpu_padding_backward_channels_last<scalar_t, ReplicationPad>(grad_input, grad_output, param);
|
||||
});
|
||||
@ -667,14 +667,14 @@ void replication_pad3d_kernel_impl(const Tensor& output, const Tensor& input, In
|
||||
PaddingParams param{input, output, padding};
|
||||
switch (padding_memory_format_3d(input)) {
|
||||
case at::MemoryFormat::Contiguous: {
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, input.scalar_type(),
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, input.scalar_type(),
|
||||
"replication_pad3d", [&] {
|
||||
cpu_padding<scalar_t, ReplicationPad>(output, input, param);
|
||||
});
|
||||
break;
|
||||
}
|
||||
case at::MemoryFormat::ChannelsLast3d: {
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, input.scalar_type(),
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, input.scalar_type(),
|
||||
"replication_pad3d_channels_last", [&]{
|
||||
cpu_padding_channels_last<scalar_t, ReplicationPad>(output, input, param);
|
||||
});
|
||||
@ -690,14 +690,14 @@ void replication_pad3d_backward_kernel_impl(
|
||||
PaddingParams param{grad_input, grad_output, padding};
|
||||
switch (padding_memory_format_3d(grad_output)) {
|
||||
case at::MemoryFormat::Contiguous: {
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(),
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, grad_output.scalar_type(),
|
||||
"replication_pad3d_backward", [&] {
|
||||
cpu_padding_backward<scalar_t, ReplicationPad>(grad_input, grad_output, param);
|
||||
});
|
||||
break;
|
||||
}
|
||||
case at::MemoryFormat::ChannelsLast3d: {
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(),
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, grad_output.scalar_type(),
|
||||
"replication_pad3d_backward_channels_last", [&]{
|
||||
cpu_padding_backward_channels_last<scalar_t, ReplicationPad>(grad_input, grad_output, param);
|
||||
});
|
||||
|
@ -239,38 +239,22 @@ static void norm_kernel_tensor_iterator_impl(
|
||||
|
||||
using Vec = Vectorized<scalar_t>;
|
||||
using fVec = Vectorized<acc_t>;
|
||||
fVec acc_vec{acc_t(0)};
|
||||
acc_t buffer[fVec::size()];
|
||||
auto inner_reduction = [&buffer](scalar_t* inner_self_data, int64_t inner_size) -> acc_t {
|
||||
fVec acc_vec{acc_t(0)};
|
||||
int64_t d = 0;
|
||||
for (; d < inner_size - (inner_size % Vec::size()); d += Vec::size()) {
|
||||
Vec data_vec = Vec::loadu(inner_self_data + d);
|
||||
norm_two_reduce_step(acc_vec, data_vec);
|
||||
}
|
||||
acc_vec.store(buffer);
|
||||
for (int j = 1; j < fVec::size(); j++) {
|
||||
buffer[0] = buffer[0] + buffer[j];
|
||||
}
|
||||
for (; d < inner_size; d++) {
|
||||
acc_t data_val = acc_t(inner_self_data[d]);
|
||||
buffer[0] += data_val * data_val;
|
||||
}
|
||||
return buffer[0];
|
||||
};
|
||||
|
||||
// Use group reduction to avoid overflow.
|
||||
// See https://github.com/pytorch/pytorch/pull/123416
|
||||
int64_t group_size = 32768L;
|
||||
int64_t group_n = (size + group_size - 1) / group_size;
|
||||
scalar_t* inner_self_data = self_data;
|
||||
int64_t inner_size = group_size;
|
||||
double result = 0;
|
||||
for (int64_t g = 0; g < group_n; g++) {
|
||||
inner_size = (g * inner_size + group_size) > size ? (size - g * inner_size) : group_size;
|
||||
result += inner_reduction(inner_self_data, inner_size);
|
||||
inner_self_data += inner_size;
|
||||
int64_t d = 0;
|
||||
for (; d < size - (size % Vec::size()); d += Vec::size()) {
|
||||
Vec data_vec = Vec::loadu(self_data + d);
|
||||
norm_two_reduce_step(acc_vec, data_vec);
|
||||
}
|
||||
result_data[0] = scalar_t(std::sqrt(result));
|
||||
acc_vec.store(buffer);
|
||||
for (int j = 1; j < fVec::size(); j++) {
|
||||
buffer[0] = buffer[0] + buffer[j];
|
||||
}
|
||||
for (; d < size; d++) {
|
||||
acc_t data_val = acc_t(self_data[d]);
|
||||
buffer[0] += data_val * data_val;
|
||||
}
|
||||
result_data[0] = scalar_t(std::sqrt(buffer[0]));
|
||||
});
|
||||
});
|
||||
} else {
|
||||
|
Some files were not shown because too many files have changed in this diff.