mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-14 22:25:03 +08:00
Update
[ghstack-poisoned]
This commit is contained in:
@ -4,7 +4,7 @@ set -eux -o pipefail
|
|||||||
GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
|
GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
|
||||||
|
|
||||||
if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
|
if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
|
||||||
export TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0"
|
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
|
SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
|
||||||
|
|||||||
@ -52,6 +52,8 @@ fi
|
|||||||
|
|
||||||
if [[ "$image" == *-jammy* ]]; then
|
if [[ "$image" == *-jammy* ]]; then
|
||||||
UBUNTU_VERSION=22.04
|
UBUNTU_VERSION=22.04
|
||||||
|
elif [[ "$image" == *-noble* ]]; then
|
||||||
|
UBUNTU_VERSION=24.04
|
||||||
elif [[ "$image" == *ubuntu* ]]; then
|
elif [[ "$image" == *ubuntu* ]]; then
|
||||||
extract_version_from_image_name ubuntu UBUNTU_VERSION
|
extract_version_from_image_name ubuntu UBUNTU_VERSION
|
||||||
fi
|
fi
|
||||||
@ -230,8 +232,12 @@ case "$tag" in
|
|||||||
UCC_COMMIT=${_UCC_COMMIT}
|
UCC_COMMIT=${_UCC_COMMIT}
|
||||||
INDUCTOR_BENCHMARKS=yes
|
INDUCTOR_BENCHMARKS=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-jammy-rocm-n-py3)
|
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3)
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
if [[ $tag =~ "jammy" ]]; then
|
||||||
|
ANACONDA_PYTHON_VERSION=3.10
|
||||||
|
else
|
||||||
|
ANACONDA_PYTHON_VERSION=3.12
|
||||||
|
fi
|
||||||
GCC_VERSION=11
|
GCC_VERSION=11
|
||||||
VISION=yes
|
VISION=yes
|
||||||
ROCM_VERSION=6.4
|
ROCM_VERSION=6.4
|
||||||
@ -322,6 +328,8 @@ case "$tag" in
|
|||||||
GCC_VERSION=11
|
GCC_VERSION=11
|
||||||
ACL=yes
|
ACL=yes
|
||||||
VISION=yes
|
VISION=yes
|
||||||
|
CONDA_CMAKE=yes
|
||||||
|
OPENBLAS=yes
|
||||||
# snadampal: skipping llvm src build install because the current version
|
# snadampal: skipping llvm src build install because the current version
|
||||||
# from pytorch/llvm:9.0.1 is x86 specific
|
# from pytorch/llvm:9.0.1 is x86 specific
|
||||||
SKIP_LLVM_SRC_BUILD_INSTALL=yes
|
SKIP_LLVM_SRC_BUILD_INSTALL=yes
|
||||||
@ -331,6 +339,8 @@ case "$tag" in
|
|||||||
GCC_VERSION=11
|
GCC_VERSION=11
|
||||||
ACL=yes
|
ACL=yes
|
||||||
VISION=yes
|
VISION=yes
|
||||||
|
CONDA_CMAKE=yes
|
||||||
|
OPENBLAS=yes
|
||||||
# snadampal: skipping llvm src build install because the current version
|
# snadampal: skipping llvm src build install because the current version
|
||||||
# from pytorch/llvm:9.0.1 is x86 specific
|
# from pytorch/llvm:9.0.1 is x86 specific
|
||||||
SKIP_LLVM_SRC_BUILD_INSTALL=yes
|
SKIP_LLVM_SRC_BUILD_INSTALL=yes
|
||||||
@ -417,6 +427,7 @@ docker build \
|
|||||||
--build-arg "XPU_VERSION=${XPU_VERSION}" \
|
--build-arg "XPU_VERSION=${XPU_VERSION}" \
|
||||||
--build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
|
--build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
|
||||||
--build-arg "ACL=${ACL:-}" \
|
--build-arg "ACL=${ACL:-}" \
|
||||||
|
--build-arg "OPENBLAS=${OPENBLAS:-}" \
|
||||||
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
|
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
|
||||||
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
|
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
|
||||||
-f $(dirname ${DOCKERFILE})/Dockerfile \
|
-f $(dirname ${DOCKERFILE})/Dockerfile \
|
||||||
|
|||||||
@ -23,6 +23,10 @@ conda_install() {
|
|||||||
as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
|
as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
|
||||||
}
|
}
|
||||||
|
|
||||||
|
conda_install_through_forge() {
|
||||||
|
as_jenkins conda install -c conda-forge -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
|
||||||
|
}
|
||||||
|
|
||||||
conda_run() {
|
conda_run() {
|
||||||
as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output $*
|
as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output $*
|
||||||
}
|
}
|
||||||
|
|||||||
@ -15,6 +15,9 @@ install_ubuntu() {
|
|||||||
elif [[ "$UBUNTU_VERSION" == "22.04"* ]]; then
|
elif [[ "$UBUNTU_VERSION" == "22.04"* ]]; then
|
||||||
cmake3="cmake=3.22*"
|
cmake3="cmake=3.22*"
|
||||||
maybe_libiomp_dev=""
|
maybe_libiomp_dev=""
|
||||||
|
elif [[ "$UBUNTU_VERSION" == "24.04"* ]]; then
|
||||||
|
cmake3="cmake=3.28*"
|
||||||
|
maybe_libiomp_dev=""
|
||||||
else
|
else
|
||||||
cmake3="cmake=3.5*"
|
cmake3="cmake=3.5*"
|
||||||
maybe_libiomp_dev="libiomp-dev"
|
maybe_libiomp_dev="libiomp-dev"
|
||||||
|
|||||||
@ -70,10 +70,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
|
# Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
|
||||||
if [[ $(uname -m) == "aarch64" ]]; then
|
if [[ $(uname -m) != "aarch64" ]]; then
|
||||||
conda_install "openblas==0.3.29=*openmp*"
|
pip_install mkl==2024.2.0
|
||||||
else
|
pip_install mkl-static==2024.2.0
|
||||||
conda_install "mkl=2021.4.0 mkl-include=2021.4.0"
|
pip_install mkl-include==2024.2.0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
|
# Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
|
||||||
@ -87,6 +87,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
|
|||||||
conda_run ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION})
|
conda_run ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION})
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [[ "$UBUNTU_VERSION" == "24.04"* ]] ; then
|
||||||
|
conda_install_through_forge libstdcxx-ng=14
|
||||||
|
fi
|
||||||
|
|
||||||
# Install some other packages, including those needed for Python test reporting
|
# Install some other packages, including those needed for Python test reporting
|
||||||
pip_install -r /opt/conda/requirements-ci.txt
|
pip_install -r /opt/conda/requirements-ci.txt
|
||||||
|
|
||||||
|
|||||||
@ -4,8 +4,9 @@
|
|||||||
set -ex
|
set -ex
|
||||||
|
|
||||||
cd /
|
cd /
|
||||||
git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.29}" --depth 1 --shallow-submodules
|
git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules
|
||||||
|
|
||||||
|
OPENBLAS_CHECKOUT_DIR="OpenBLAS"
|
||||||
OPENBLAS_BUILD_FLAGS="
|
OPENBLAS_BUILD_FLAGS="
|
||||||
NUM_THREADS=128
|
NUM_THREADS=128
|
||||||
USE_OPENMP=1
|
USE_OPENMP=1
|
||||||
@ -13,9 +14,8 @@ NO_SHARED=0
|
|||||||
DYNAMIC_ARCH=1
|
DYNAMIC_ARCH=1
|
||||||
TARGET=ARMV8
|
TARGET=ARMV8
|
||||||
CFLAGS=-O3
|
CFLAGS=-O3
|
||||||
|
BUILD_BFLOAT16=1
|
||||||
"
|
"
|
||||||
|
|
||||||
OPENBLAS_CHECKOUT_DIR="OpenBLAS"
|
|
||||||
|
|
||||||
make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
|
make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
|
||||||
make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}
|
make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}
|
||||||
|
|||||||
@ -8,9 +8,11 @@ ver() {
|
|||||||
|
|
||||||
install_ubuntu() {
|
install_ubuntu() {
|
||||||
apt-get update
|
apt-get update
|
||||||
if [[ $UBUNTU_VERSION == 20.04 ]]; then
|
# gpg-agent is not available by default
|
||||||
# gpg-agent is not available by default on 20.04
|
apt-get install -y --no-install-recommends gpg-agent
|
||||||
apt-get install -y --no-install-recommends gpg-agent
|
if [[ $(ver $UBUNTU_VERSION) -ge $(ver 22.04) ]]; then
|
||||||
|
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \
|
||||||
|
| sudo tee /etc/apt/preferences.d/rocm-pin-600
|
||||||
fi
|
fi
|
||||||
apt-get install -y kmod
|
apt-get install -y kmod
|
||||||
apt-get install -y wget
|
apt-get install -y wget
|
||||||
@ -85,13 +87,14 @@ EOF
|
|||||||
VER_STR=6.3
|
VER_STR=6.3
|
||||||
fi
|
fi
|
||||||
# clr build needs CppHeaderParser but can only find it using conda's python
|
# clr build needs CppHeaderParser but can only find it using conda's python
|
||||||
/opt/conda/bin/python -m pip install CppHeaderParser
|
python -m pip install CppHeaderParser
|
||||||
git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
|
git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
|
||||||
HIP_COMMON_DIR=$(readlink -f HIP)
|
HIP_COMMON_DIR=$(readlink -f HIP)
|
||||||
git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}${VER_PATCH}-statco-hotfix
|
git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}${VER_PATCH}-statco-hotfix
|
||||||
mkdir -p clr/build
|
mkdir -p clr/build
|
||||||
pushd clr/build
|
pushd clr/build
|
||||||
cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
|
# Need to point CMake to the correct python installation to find CppHeaderParser
|
||||||
|
cmake .. -DPython3_EXECUTABLE=/opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}/bin/python3 -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
|
||||||
make -j
|
make -j
|
||||||
cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.*
|
cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.*
|
||||||
popd
|
popd
|
||||||
|
|||||||
@ -41,7 +41,7 @@ case ${image} in
|
|||||||
GPU_IMAGE=arm64v8/almalinux:8
|
GPU_IMAGE=arm64v8/almalinux:8
|
||||||
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1"
|
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1"
|
||||||
MANY_LINUX_VERSION="2_28_aarch64"
|
MANY_LINUX_VERSION="2_28_aarch64"
|
||||||
OPENBLAS_VERSION="v0.3.29"
|
OPENBLAS_VERSION="v0.3.30"
|
||||||
;;
|
;;
|
||||||
manylinuxcxx11-abi-builder:cpu-cxx11-abi)
|
manylinuxcxx11-abi-builder:cpu-cxx11-abi)
|
||||||
TARGET=final
|
TARGET=final
|
||||||
|
|||||||
@ -16,6 +16,7 @@ click
|
|||||||
#test that import:
|
#test that import:
|
||||||
|
|
||||||
coremltools==5.0b5 ; python_version < "3.12"
|
coremltools==5.0b5 ; python_version < "3.12"
|
||||||
|
coremltools==8.3 ; python_version == "3.12"
|
||||||
#Description: Apple framework for ML integration
|
#Description: Apple framework for ML integration
|
||||||
#Pinned versions: 5.0b5
|
#Pinned versions: 5.0b5
|
||||||
#test that import:
|
#test that import:
|
||||||
@ -63,6 +64,7 @@ lark==0.12.0
|
|||||||
#test that import:
|
#test that import:
|
||||||
|
|
||||||
librosa>=0.6.2 ; python_version < "3.11"
|
librosa>=0.6.2 ; python_version < "3.11"
|
||||||
|
librosa==0.10.2 ; python_version == "3.12"
|
||||||
#Description: A python package for music and audio analysis
|
#Description: A python package for music and audio analysis
|
||||||
#Pinned versions: >=0.6.2
|
#Pinned versions: >=0.6.2
|
||||||
#test that import: test_spectral_ops.py
|
#test that import: test_spectral_ops.py
|
||||||
@ -111,6 +113,7 @@ ninja==1.11.1.3
|
|||||||
numba==0.49.0 ; python_version < "3.9"
|
numba==0.49.0 ; python_version < "3.9"
|
||||||
numba==0.55.2 ; python_version == "3.9"
|
numba==0.55.2 ; python_version == "3.9"
|
||||||
numba==0.55.2 ; python_version == "3.10"
|
numba==0.55.2 ; python_version == "3.10"
|
||||||
|
numba==0.60.0 ; python_version == "3.12"
|
||||||
#Description: Just-In-Time Compiler for Numerical Functions
|
#Description: Just-In-Time Compiler for Numerical Functions
|
||||||
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
|
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
|
||||||
#test that import: test_numba_integration.py
|
#test that import: test_numba_integration.py
|
||||||
@ -360,10 +363,11 @@ pwlf==2.2.1
|
|||||||
|
|
||||||
|
|
||||||
# To build PyTorch itself
|
# To build PyTorch itself
|
||||||
astunparse
|
pyyaml
|
||||||
PyYAML
|
|
||||||
pyzstd
|
pyzstd
|
||||||
setuptools
|
setuptools
|
||||||
|
six
|
||||||
|
wheel
|
||||||
|
|
||||||
scons==4.5.2 ; platform_machine == "aarch64"
|
scons==4.5.2 ; platform_machine == "aarch64"
|
||||||
|
|
||||||
|
|||||||
@ -5,7 +5,7 @@ sphinx==5.3.0
|
|||||||
|
|
||||||
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
|
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
|
||||||
# but it doesn't seem to work and hangs around idly. The initial thought is probably
|
# but it doesn't seem to work and hangs around idly. The initial thought is probably
|
||||||
# something related to Docker setup. We can investigate this later
|
# something related to Docker setup. We can investigate this later.
|
||||||
|
|
||||||
sphinxcontrib.katex==0.8.6
|
sphinxcontrib.katex==0.8.6
|
||||||
#Description: This is used to generate PyTorch docs
|
#Description: This is used to generate PyTorch docs
|
||||||
|
|||||||
@ -147,6 +147,12 @@ RUN if [ -n "${ACL}" ]; then bash ./install_acl.sh; fi
|
|||||||
RUN rm install_acl.sh
|
RUN rm install_acl.sh
|
||||||
ENV INSTALLED_ACL ${ACL}
|
ENV INSTALLED_ACL ${ACL}
|
||||||
|
|
||||||
|
ARG OPENBLAS
|
||||||
|
COPY ./common/install_openblas.sh install_openblas.sh
|
||||||
|
RUN if [ -n "${OPENBLAS}" ]; then bash ./install_openblas.sh; fi
|
||||||
|
RUN rm install_openblas.sh
|
||||||
|
ENV INSTALLED_OPENBLAS ${OPENBLAS}
|
||||||
|
|
||||||
# Install ccache/sccache (do this last, so we get priority in PATH)
|
# Install ccache/sccache (do this last, so we get priority in PATH)
|
||||||
ARG SKIP_SCCACHE_INSTALL
|
ARG SKIP_SCCACHE_INSTALL
|
||||||
COPY ./common/install_cache.sh install_cache.sh
|
COPY ./common/install_cache.sh install_cache.sh
|
||||||
|
|||||||
@ -104,7 +104,7 @@ if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
|
|||||||
export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr
|
export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Calling setup.py install at $(date)"
|
echo "Calling 'python -m pip install .' at $(date)"
|
||||||
|
|
||||||
if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
|
if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
|
||||||
STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"
|
STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"
|
||||||
@ -120,7 +120,7 @@ fi
|
|||||||
# TODO: Remove this flag once https://github.com/pytorch/pytorch/issues/55952 is closed
|
# TODO: Remove this flag once https://github.com/pytorch/pytorch/issues/55952 is closed
|
||||||
CFLAGS='-Wno-deprecated-declarations' \
|
CFLAGS='-Wno-deprecated-declarations' \
|
||||||
BUILD_LIBTORCH_CPU_WITH_DEBUG=1 \
|
BUILD_LIBTORCH_CPU_WITH_DEBUG=1 \
|
||||||
python setup.py install
|
python -m pip install --no-build-isolation -v .
|
||||||
|
|
||||||
mkdir -p libtorch/{lib,bin,include,share}
|
mkdir -p libtorch/{lib,bin,include,share}
|
||||||
|
|
||||||
|
|||||||
@ -185,7 +185,7 @@ torchbench_setup_macos() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pip_benchmark_deps() {
|
pip_benchmark_deps() {
|
||||||
python -mpip install --no-input astunparse requests cython scikit-learn
|
python -mpip install --no-input requests cython scikit-learn six
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -436,11 +436,11 @@ test_inductor_aoti() {
|
|||||||
python3 tools/amd_build/build_amd.py
|
python3 tools/amd_build/build_amd.py
|
||||||
fi
|
fi
|
||||||
if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
|
||||||
BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop)
|
BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
|
||||||
# TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
|
# TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
|
||||||
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
|
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
|
||||||
else
|
else
|
||||||
BUILD_COMMAND=(python setup.py develop)
|
BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
|
||||||
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
|
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -1579,7 +1579,7 @@ test_operator_benchmark() {
|
|||||||
test_inductor_set_cpu_affinity
|
test_inductor_set_cpu_affinity
|
||||||
|
|
||||||
cd benchmarks/operator_benchmark/pt_extension
|
cd benchmarks/operator_benchmark/pt_extension
|
||||||
python setup.py install
|
python -m pip install .
|
||||||
|
|
||||||
cd "${TEST_DIR}"/benchmarks/operator_benchmark
|
cd "${TEST_DIR}"/benchmarks/operator_benchmark
|
||||||
$TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \
|
$TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \
|
||||||
|
|||||||
@ -42,7 +42,7 @@ call choco upgrade -y cmake --no-progress --installargs 'ADD_CMAKE_TO_PATH=Syste
|
|||||||
if errorlevel 1 goto fail
|
if errorlevel 1 goto fail
|
||||||
if not errorlevel 0 goto fail
|
if not errorlevel 0 goto fail
|
||||||
|
|
||||||
call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0
|
call pip install mkl==2024.2.0 mkl-static==2024.2.0 mkl-include==2024.2.0
|
||||||
if errorlevel 1 goto fail
|
if errorlevel 1 goto fail
|
||||||
if not errorlevel 0 goto fail
|
if not errorlevel 0 goto fail
|
||||||
|
|
||||||
|
|||||||
@ -61,8 +61,8 @@ You are now all set to start developing with PyTorch in a DevContainer environme
|
|||||||
## Step 8: Build PyTorch
|
## Step 8: Build PyTorch
|
||||||
|
|
||||||
To build pytorch from source, simply run:
|
To build pytorch from source, simply run:
|
||||||
```
|
```bash
|
||||||
python setup.py develop
|
python -m pip install --no-build-isolation -v -e .
|
||||||
```
|
```
|
||||||
|
|
||||||
The process involves compiling thousands of files, and would take a long time. Fortunately, the compiled objects can be useful for your next build. When you modify some files, you only need to compile the changed files the next time.
|
The process involves compiling thousands of files, and would take a long time. Fortunately, the compiled objects can be useful for your next build. When you modify some files, you only need to compile the changed files the next time.
|
||||||
|
|||||||
2
.github/ci_commit_pins/audio.txt
vendored
2
.github/ci_commit_pins/audio.txt
vendored
@ -1 +1 @@
|
|||||||
70caf76066ef2c1054d6128b11769dc816a779e7
|
6c57850358f34c47802db216b0746e4e9d08a95a
|
||||||
|
|||||||
2
.github/scripts/td_llm_indexer.sh
vendored
2
.github/scripts/td_llm_indexer.sh
vendored
@ -6,7 +6,7 @@ set -euxo pipefail
|
|||||||
cd llm-target-determinator
|
cd llm-target-determinator
|
||||||
pip install -q -r requirements.txt
|
pip install -q -r requirements.txt
|
||||||
cd ../codellama
|
cd ../codellama
|
||||||
pip install -e .
|
pip install --no-build-isolation -v -e .
|
||||||
pip install numpy==1.26.0
|
pip install numpy==1.26.0
|
||||||
|
|
||||||
# Run indexer
|
# Run indexer
|
||||||
|
|||||||
3
.github/workflows/_linux-build.yml
vendored
3
.github/workflows/_linux-build.yml
vendored
@ -131,6 +131,9 @@ jobs:
|
|||||||
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
|
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
|
||||||
with:
|
with:
|
||||||
github-secret: ${{ secrets.GITHUB_TOKEN }}
|
github-secret: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
instructions: |
|
||||||
|
Build is done inside the container, to start an interactive session run:
|
||||||
|
docker exec -it $(docker container ps --format '{{.ID}}') bash
|
||||||
|
|
||||||
# [pytorch repo ref]
|
# [pytorch repo ref]
|
||||||
# Use a pytorch/pytorch reference instead of a reference to the local
|
# Use a pytorch/pytorch reference instead of a reference to the local
|
||||||
|
|||||||
16
.github/workflows/_mac-test.yml
vendored
16
.github/workflows/_mac-test.yml
vendored
@ -88,6 +88,14 @@ jobs:
|
|||||||
pkill "${PROCESS}" || true
|
pkill "${PROCESS}" || true
|
||||||
done
|
done
|
||||||
|
|
||||||
|
- name: Clean up brew miniconda, if installed
|
||||||
|
continue-on-error: true
|
||||||
|
run: |
|
||||||
|
if brew list miniconda; then
|
||||||
|
brew uninstall miniconda
|
||||||
|
echo "REINSTALL_BREW_MINICONDA=1" >> "${GITHUB_ENV}"
|
||||||
|
fi
|
||||||
|
|
||||||
- name: Clean up leftover local python3 site-packages on MacOS pet runner
|
- name: Clean up leftover local python3 site-packages on MacOS pet runner
|
||||||
continue-on-error: true
|
continue-on-error: true
|
||||||
run: |
|
run: |
|
||||||
@ -268,6 +276,14 @@ jobs:
|
|||||||
workflow_attempt: ${{github.run_attempt}}
|
workflow_attempt: ${{github.run_attempt}}
|
||||||
local_path: usage_log.txt
|
local_path: usage_log.txt
|
||||||
|
|
||||||
|
- name: Reinstall brew miniconda, if was installed
|
||||||
|
if: always()
|
||||||
|
continue-on-error: true
|
||||||
|
run: |
|
||||||
|
if [[ -n "$REINSTALL_BREW_MINICONDA" ]]; then
|
||||||
|
brew install miniconda
|
||||||
|
fi
|
||||||
|
|
||||||
- name: Clean up disk space
|
- name: Clean up disk space
|
||||||
if: always()
|
if: always()
|
||||||
continue-on-error: true
|
continue-on-error: true
|
||||||
|
|||||||
1
.github/workflows/docker-builds.yml
vendored
1
.github/workflows/docker-builds.yml
vendored
@ -63,6 +63,7 @@ jobs:
|
|||||||
pytorch-linux-jammy-py3.13-clang12,
|
pytorch-linux-jammy-py3.13-clang12,
|
||||||
pytorch-linux-jammy-rocm-n-1-py3,
|
pytorch-linux-jammy-rocm-n-1-py3,
|
||||||
pytorch-linux-jammy-rocm-n-py3,
|
pytorch-linux-jammy-rocm-n-py3,
|
||||||
|
pytorch-linux-noble-rocm-n-py3,
|
||||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
|
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
|
||||||
pytorch-linux-jammy-py3.9-gcc11,
|
pytorch-linux-jammy-py3.9-gcc11,
|
||||||
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,
|
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,
|
||||||
|
|||||||
20
.github/workflows/rocm-mi300.yml
vendored
20
.github/workflows/rocm-mi300.yml
vendored
@ -36,15 +36,15 @@ jobs:
|
|||||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||||
curr_ref_type: ${{ github.ref_type }}
|
curr_ref_type: ${{ github.ref_type }}
|
||||||
|
|
||||||
linux-jammy-rocm-py3_10-build:
|
linux-noble-rocm-py3_12-build:
|
||||||
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
|
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
|
||||||
name: linux-jammy-rocm-py3.10-mi300
|
name: linux-noble-rocm-py3.12-mi300
|
||||||
uses: ./.github/workflows/_linux-build.yml
|
uses: ./.github/workflows/_linux-build.yml
|
||||||
needs: get-label-type
|
needs: get-label-type
|
||||||
with:
|
with:
|
||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
build-environment: linux-jammy-rocm-py3.10-mi300
|
build-environment: linux-noble-rocm-py3.12-mi300
|
||||||
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
|
docker-image-name: ci-image:pytorch-linux-noble-rocm-n-py3
|
||||||
sync-tag: rocm-build
|
sync-tag: rocm-build
|
||||||
test-matrix: |
|
test-matrix: |
|
||||||
{ include: [
|
{ include: [
|
||||||
@ -57,17 +57,17 @@ jobs:
|
|||||||
]}
|
]}
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
linux-jammy-rocm-py3_10-test:
|
linux-noble-rocm-py3_12-test:
|
||||||
permissions:
|
permissions:
|
||||||
id-token: write
|
id-token: write
|
||||||
contents: read
|
contents: read
|
||||||
name: linux-jammy-rocm-py3.10-mi300
|
name: linux-noble-rocm-py3.12-mi300
|
||||||
uses: ./.github/workflows/_rocm-test.yml
|
uses: ./.github/workflows/_rocm-test.yml
|
||||||
needs:
|
needs:
|
||||||
- linux-jammy-rocm-py3_10-build
|
- linux-noble-rocm-py3_12-build
|
||||||
- target-determination
|
- target-determination
|
||||||
with:
|
with:
|
||||||
build-environment: linux-jammy-rocm-py3.10-mi300
|
build-environment: linux-noble-rocm-py3.12-mi300
|
||||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
|
||||||
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|||||||
@ -231,7 +231,8 @@ include_patterns = [
|
|||||||
'c10/**/*.cpp',
|
'c10/**/*.cpp',
|
||||||
'c10/**/*.h',
|
'c10/**/*.h',
|
||||||
'torch/*.h',
|
'torch/*.h',
|
||||||
'torch/_inductor/codegen/aoti_runtime/interface.cpp',
|
'torch/_inductor/codegen/aoti_runtime/*.h',
|
||||||
|
'torch/_inductor/codegen/aoti_runtime/*.cpp',
|
||||||
'torch/csrc/*.h',
|
'torch/csrc/*.h',
|
||||||
'torch/csrc/*.cpp',
|
'torch/csrc/*.cpp',
|
||||||
'torch/csrc/**/*.h',
|
'torch/csrc/**/*.h',
|
||||||
@ -1476,6 +1477,31 @@ init_command = [
|
|||||||
'tomli==2.2.1 ; python_version < "3.11"',
|
'tomli==2.2.1 ; python_version < "3.11"',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[linter]]
|
||||||
|
code = 'CMAKE_MINIMUM_REQUIRED'
|
||||||
|
command = [
|
||||||
|
'python3',
|
||||||
|
'tools/linter/adapters/cmake_minimum_required_linter.py',
|
||||||
|
'--',
|
||||||
|
'@{{PATHSFILE}}'
|
||||||
|
]
|
||||||
|
include_patterns = [
|
||||||
|
"**/pyproject.toml",
|
||||||
|
"**/CMakeLists.txt",
|
||||||
|
"**/CMakeLists.txt.in",
|
||||||
|
"**/*.cmake",
|
||||||
|
"**/*.cmake.in",
|
||||||
|
"**/*requirements*.txt",
|
||||||
|
"**/*requirements*.in",
|
||||||
|
]
|
||||||
|
init_command = [
|
||||||
|
'python3',
|
||||||
|
'tools/linter/adapters/pip_init.py',
|
||||||
|
'--dry-run={{DRYRUN}}',
|
||||||
|
'packaging==25.0',
|
||||||
|
'tomli==2.2.1 ; python_version < "3.11"',
|
||||||
|
]
|
||||||
|
|
||||||
[[linter]]
|
[[linter]]
|
||||||
code = 'COPYRIGHT'
|
code = 'COPYRIGHT'
|
||||||
include_patterns = ['**']
|
include_patterns = ['**']
|
||||||
|
|||||||
@ -88,20 +88,19 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
|
|||||||
|
|
||||||
* If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below.
|
* If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below.
|
||||||
|
|
||||||
* When installing with `python setup.py develop` (in contrast to `python setup.py install`) Python runtime will use
|
* When installing with `python -m pip install -e .` (in contrast to `python -m pip install .`) Python runtime will use
|
||||||
the current local source-tree when importing `torch` package. (This is done by creating [`.egg-link`](https://wiki.python.org/moin/PythonPackagingTerminology#egg-link) file in `site-packages` folder)
|
the current local source-tree when importing `torch` package. (This is done by creating [`.egg-link`](https://wiki.python.org/moin/PythonPackagingTerminology#egg-link) file in `site-packages` folder)
|
||||||
This way you do not need to repeatedly install after modifying Python files (`.py`).
|
This way you do not need to repeatedly install after modifying Python files (`.py`).
|
||||||
However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or
|
However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...).
|
||||||
non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...).
|
|
||||||
|
|
||||||
|
|
||||||
One way to avoid running `python setup.py develop` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac,
|
One way to avoid running `python -m pip install -e .` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac,
|
||||||
is to create a symbolic link from `build` folder to `torch/lib`, for example, by issuing following:
|
is to create a symbolic link from `build` folder to `torch/lib`, for example, by issuing following:
|
||||||
```bash
|
```bash
|
||||||
pushd torch/lib; sh -c "ln -sf ../../build/lib/libtorch_cpu.* ."; popd
|
pushd torch/lib; sh -c "ln -sf ../../build/lib/libtorch_cpu.* ."; popd
|
||||||
```
|
```
|
||||||
Afterwards rebuilding a library (for example to rebuild `libtorch_cpu.so` issue `ninja torch_cpu` from `build` folder),
|
Afterwards rebuilding a library (for example to rebuild `libtorch_cpu.so` issue `ninja torch_cpu` from `build` folder),
|
||||||
would be sufficient to make change visible in `torch` package.
|
would be sufficient to make change visible in `torch` package.
|
||||||
|
|
||||||
|
|
||||||
To reinstall, first uninstall all existing PyTorch installs. You may need to run `pip
|
To reinstall, first uninstall all existing PyTorch installs. You may need to run `pip
|
||||||
@ -115,9 +114,9 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
|
|||||||
pip uninstall torch
|
pip uninstall torch
|
||||||
```
|
```
|
||||||
|
|
||||||
Next run `python setup.py clean`. After that, you can install in `develop` mode again.
|
Next run `python setup.py clean`. After that, you can install in editable mode again.
|
||||||
|
|
||||||
* If you run into errors when running `python setup.py develop`, here are some debugging steps:
|
* If you run into errors when running `python -m pip install -e .`, here are some debugging steps:
|
||||||
1. Run `printf '#include <stdio.h>\nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure
|
1. Run `printf '#include <stdio.h>\nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure
|
||||||
your CMake works and can compile this simple Hello World program without errors.
|
your CMake works and can compile this simple Hello World program without errors.
|
||||||
2. Nuke your `build` directory. The `setup.py` script compiles binaries into the `build` folder and caches many
|
2. Nuke your `build` directory. The `setup.py` script compiles binaries into the `build` folder and caches many
|
||||||
@ -130,13 +129,20 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
|
|||||||
git clean -xdf
|
git clean -xdf
|
||||||
python setup.py clean
|
python setup.py clean
|
||||||
git submodule update --init --recursive
|
git submodule update --init --recursive
|
||||||
python setup.py develop
|
python -m pip install -r requirements.txt
|
||||||
|
python -m pip install --no-build-isolation -v -e .
|
||||||
```
|
```
|
||||||
4. The main step within `python setup.py develop` is running `make` from the `build` directory. If you want to
|
4. The main step within `python -m pip install -e .` is running `cmake --build build` from the `build` directory. If you want to
|
||||||
experiment with some environment variables, you can pass them into the command:
|
experiment with some environment variables, you can pass them into the command:
|
||||||
```bash
|
```bash
|
||||||
ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* python setup.py develop
|
ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* CMAKE_FRESH=1 python -m pip install --no-build-isolation -v -e .
|
||||||
```
|
```
|
||||||
|
5. Try installing PyTorch without build isolation by adding `--no-build-isolation` to the `pip install` command.
|
||||||
|
This will use the current environment's packages instead of creating a new isolated environment for the build.
|
||||||
|
```bash
|
||||||
|
python -m pip install --no-build-isolation -v -e .
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
* If you run into issue running `git submodule update --init --recursive`. Please try the following:
|
* If you run into issue running `git submodule update --init --recursive`. Please try the following:
|
||||||
- If you encounter an error such as
|
- If you encounter an error such as
|
||||||
@ -639,9 +645,9 @@ can be selected interactively with your mouse to zoom in on a particular part of
|
|||||||
the program execution timeline. The `--native` command-line option tells
|
the program execution timeline. The `--native` command-line option tells
|
||||||
`py-spy` to record stack frame entries for PyTorch C++ code. To get line numbers
|
`py-spy` to record stack frame entries for PyTorch C++ code. To get line numbers
|
||||||
for C++ code it may be necessary to compile PyTorch in debug mode by prepending
|
for C++ code it may be necessary to compile PyTorch in debug mode by prepending
|
||||||
your `setup.py develop` call to compile PyTorch with `DEBUG=1`. Depending on
|
`DEBUG=1` to your `python -m pip install -e .` call.
|
||||||
your operating system it may also be necessary to run `py-spy` with root
|
Depending on your operating system it may also be necessary to run `py-spy` with
|
||||||
privileges.
|
root privileges.
|
||||||
|
|
||||||
`py-spy` can also work in an `htop`-like "live profiling" mode and can be
|
`py-spy` can also work in an `htop`-like "live profiling" mode and can be
|
||||||
tweaked to adjust the stack sampling rate, see the `py-spy` readme for more
|
tweaked to adjust the stack sampling rate, see the `py-spy` readme for more
|
||||||
@ -649,7 +655,7 @@ details.
|
|||||||
|
|
||||||
## Managing multiple build trees
|
## Managing multiple build trees
|
||||||
|
|
||||||
One downside to using `python setup.py develop` is that your development
|
One downside to using `python -m pip install -e .` is that your development
|
||||||
version of PyTorch will be installed globally on your account (e.g., if
|
version of PyTorch will be installed globally on your account (e.g., if
|
||||||
you run `import torch` anywhere else, the development version will be
|
you run `import torch` anywhere else, the development version will be
|
||||||
used).
|
used).
|
||||||
@ -663,7 +669,7 @@ specific build of PyTorch. To set one up:
|
|||||||
python -m venv pytorch-myfeature
|
python -m venv pytorch-myfeature
|
||||||
source pytorch-myfeature/bin/activate # or `& .\pytorch-myfeature\Scripts\Activate.ps1` on Windows
|
source pytorch-myfeature/bin/activate # or `& .\pytorch-myfeature\Scripts\Activate.ps1` on Windows
|
||||||
# if you run python now, torch will NOT be installed
|
# if you run python now, torch will NOT be installed
|
||||||
python setup.py develop
|
python -m pip install --no-build-isolation -v -e .
|
||||||
```
|
```
|
||||||
|
|
||||||
## C++ development tips
|
## C++ development tips
|
||||||
@ -701,7 +707,9 @@ variables `DEBUG`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_CUDA`, `USE_FLASH_ATTEN
|
|||||||
For example:
|
For example:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
DEBUG=1 USE_DISTRIBUTED=0 USE_MKLDNN=0 USE_CUDA=0 BUILD_TEST=0 USE_FBGEMM=0 USE_NNPACK=0 USE_QNNPACK=0 USE_XNNPACK=0 python setup.py develop
|
DEBUG=1 USE_DISTRIBUTED=0 USE_MKLDNN=0 USE_CUDA=0 BUILD_TEST=0 \
|
||||||
|
USE_FBGEMM=0 USE_NNPACK=0 USE_QNNPACK=0 USE_XNNPACK=0 \
|
||||||
|
python -m pip install --no-build-isolation -v -e .
|
||||||
```
|
```
|
||||||
|
|
||||||
For subsequent builds (i.e., when `build/CMakeCache.txt` exists), the build
|
For subsequent builds (i.e., when `build/CMakeCache.txt` exists), the build
|
||||||
@ -711,7 +719,7 @@ options.
|
|||||||
|
|
||||||
### Code completion and IDE support
|
### Code completion and IDE support
|
||||||
|
|
||||||
When using `python setup.py develop`, PyTorch will generate
|
When using `python -m pip install -e .`, PyTorch will generate
|
||||||
a `compile_commands.json` file that can be used by many editors
|
a `compile_commands.json` file that can be used by many editors
|
||||||
to provide command completion and error highlighting for PyTorch's
|
to provide command completion and error highlighting for PyTorch's
|
||||||
C++ code. You need to `pip install ninja` to generate accurate
|
C++ code. You need to `pip install ninja` to generate accurate
|
||||||
@ -772,7 +780,7 @@ If not, you can define these variables on the command line before invoking `setu
|
|||||||
export CMAKE_C_COMPILER_LAUNCHER=ccache
|
export CMAKE_C_COMPILER_LAUNCHER=ccache
|
||||||
export CMAKE_CXX_COMPILER_LAUNCHER=ccache
|
export CMAKE_CXX_COMPILER_LAUNCHER=ccache
|
||||||
export CMAKE_CUDA_COMPILER_LAUNCHER=ccache
|
export CMAKE_CUDA_COMPILER_LAUNCHER=ccache
|
||||||
python setup.py develop
|
python -m pip install --no-build-isolation -v -e .
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Use a faster linker
|
#### Use a faster linker
|
||||||
@ -785,7 +793,7 @@ If you are editing a single file and rebuilding in a tight loop, the time spent
|
|||||||
Starting with CMake 3.29, you can specify the linker type using the [`CMAKE_LINKER_TYPE`](https://cmake.org/cmake/help/latest/variable/CMAKE_LINKER_TYPE.html) variable. For example, with `mold` installed:
|
Starting with CMake 3.29, you can specify the linker type using the [`CMAKE_LINKER_TYPE`](https://cmake.org/cmake/help/latest/variable/CMAKE_LINKER_TYPE.html) variable. For example, with `mold` installed:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
CMAKE_LINKER_TYPE=MOLD python setup.py develop
|
CMAKE_LINKER_TYPE=MOLD python -m pip install --no-build-isolation -v -e .
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Use pre-compiled headers
|
#### Use pre-compiled headers
|
||||||
@ -797,7 +805,7 @@ setting `USE_PRECOMPILED_HEADERS=1` either on first setup, or in the
|
|||||||
`CMakeCache.txt` file.
|
`CMakeCache.txt` file.
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
USE_PRECOMPILED_HEADERS=1 python setup.py develop
|
USE_PRECOMPILED_HEADERS=1 python -m pip install --no-build-isolation -v -e .
|
||||||
```
|
```
|
||||||
|
|
||||||
This adds a build step where the compiler takes `<ATen/ATen.h>` and essentially
|
This adds a build step where the compiler takes `<ATen/ATen.h>` and essentially
|
||||||
@ -820,7 +828,7 @@ A compiler-wrapper to fix this is provided in `tools/nvcc_fix_deps.py`. You can
|
|||||||
this as a compiler launcher, similar to `ccache`
|
this as a compiler launcher, similar to `ccache`
|
||||||
```bash
|
```bash
|
||||||
export CMAKE_CUDA_COMPILER_LAUNCHER="python;`pwd`/tools/nvcc_fix_deps.py;ccache"
|
export CMAKE_CUDA_COMPILER_LAUNCHER="python;`pwd`/tools/nvcc_fix_deps.py;ccache"
|
||||||
python setup.py develop
|
python -m pip install --no-build-isolation -v -e .
|
||||||
```
|
```
|
||||||
|
|
||||||
### Rebuild few files with debug information
|
### Rebuild few files with debug information
|
||||||
@ -1171,7 +1179,7 @@ build_with_asan()
|
|||||||
CFLAGS="-fsanitize=address -fno-sanitize-recover=all -shared-libasan -pthread" \
|
CFLAGS="-fsanitize=address -fno-sanitize-recover=all -shared-libasan -pthread" \
|
||||||
CXX_FLAGS="-pthread" \
|
CXX_FLAGS="-pthread" \
|
||||||
USE_CUDA=0 USE_OPENMP=0 USE_DISTRIBUTED=0 DEBUG=1 \
|
USE_CUDA=0 USE_OPENMP=0 USE_DISTRIBUTED=0 DEBUG=1 \
|
||||||
python setup.py develop
|
python -m pip install --no-build-isolation -v -e .
|
||||||
}
|
}
|
||||||
|
|
||||||
run_with_asan()
|
run_with_asan()
|
||||||
|
|||||||
@ -57,7 +57,7 @@ RUN --mount=type=cache,target=/opt/ccache \
|
|||||||
export eval ${CMAKE_VARS} && \
|
export eval ${CMAKE_VARS} && \
|
||||||
TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
|
TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
|
||||||
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
|
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
|
||||||
python setup.py install
|
python -m pip install --no-build-isolation -v .
|
||||||
|
|
||||||
FROM conda as conda-installs
|
FROM conda as conda-installs
|
||||||
ARG PYTHON_VERSION=3.11
|
ARG PYTHON_VERSION=3.11
|
||||||
|
|||||||
20
README.md
20
README.md
@ -228,6 +228,7 @@ If you want to disable Intel GPU support, export the environment variable `USE_X
|
|||||||
Other potentially useful environment variables may be found in `setup.py`.
|
Other potentially useful environment variables may be found in `setup.py`.
|
||||||
|
|
||||||
#### Get the PyTorch Source
|
#### Get the PyTorch Source
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/pytorch/pytorch
|
git clone https://github.com/pytorch/pytorch
|
||||||
cd pytorch
|
cd pytorch
|
||||||
@ -279,24 +280,29 @@ conda install -c conda-forge libuv=1.39
|
|||||||
```
|
```
|
||||||
|
|
||||||
#### Install PyTorch
|
#### Install PyTorch
|
||||||
|
|
||||||
**On Linux**
|
**On Linux**
|
||||||
|
|
||||||
If you're compiling for AMD ROCm then first run this command:
|
If you're compiling for AMD ROCm then first run this command:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Only run this if you're compiling for ROCm
|
# Only run this if you're compiling for ROCm
|
||||||
python tools/amd_build/build_amd.py
|
python tools/amd_build/build_amd.py
|
||||||
```
|
```
|
||||||
|
|
||||||
Install PyTorch
|
Install PyTorch
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
|
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
|
||||||
python setup.py develop
|
python -m pip install -r requirements.txt
|
||||||
|
python -m pip install --no-build-isolation -v -e .
|
||||||
```
|
```
|
||||||
|
|
||||||
**On macOS**
|
**On macOS**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 setup.py develop
|
python -m pip install -r requirements.txt
|
||||||
|
python -m pip install --no-build-isolation -v -e .
|
||||||
```
|
```
|
||||||
|
|
||||||
**On Windows**
|
**On Windows**
|
||||||
@ -308,7 +314,7 @@ If you want to build legacy python code, please refer to [Building on legacy cod
|
|||||||
In this mode PyTorch computations will run on your CPU, not your GPU.
|
In this mode PyTorch computations will run on your CPU, not your GPU.
|
||||||
|
|
||||||
```cmd
|
```cmd
|
||||||
python setup.py develop
|
python -m pip install --no-build-isolation -v -e .
|
||||||
```
|
```
|
||||||
|
|
||||||
Note on OpenMP: The desired OpenMP implementation is Intel OpenMP (iomp). In order to link against iomp, you'll need to manually download the library and set up the building environment by tweaking `CMAKE_INCLUDE_PATH` and `LIB`. The instruction [here](https://github.com/pytorch/pytorch/blob/main/docs/source/notes/windows.rst#building-from-source) is an example for setting up both MKL and Intel OpenMP. Without these configurations for CMake, Microsoft Visual C OpenMP runtime (vcomp) will be used.
|
Note on OpenMP: The desired OpenMP implementation is Intel OpenMP (iomp). In order to link against iomp, you'll need to manually download the library and set up the building environment by tweaking `CMAKE_INCLUDE_PATH` and `LIB`. The instruction [here](https://github.com/pytorch/pytorch/blob/main/docs/source/notes/windows.rst#building-from-source) is an example for setting up both MKL and Intel OpenMP. Without these configurations for CMake, Microsoft Visual C OpenMP runtime (vcomp) will be used.
|
||||||
@ -329,7 +335,6 @@ Additional libraries such as
|
|||||||
|
|
||||||
You can refer to the [build_pytorch.bat](https://github.com/pytorch/pytorch/blob/main/.ci/pytorch/win-test-helpers/build_pytorch.bat) script for some other environment variables configurations
|
You can refer to the [build_pytorch.bat](https://github.com/pytorch/pytorch/blob/main/.ci/pytorch/win-test-helpers/build_pytorch.bat) script for some other environment variables configurations
|
||||||
|
|
||||||
|
|
||||||
```cmd
|
```cmd
|
||||||
cmd
|
cmd
|
||||||
|
|
||||||
@ -349,8 +354,7 @@ for /f "usebackq tokens=*" %i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\
|
|||||||
:: [Optional] If you want to override the CUDA host compiler
|
:: [Optional] If you want to override the CUDA host compiler
|
||||||
set CUDAHOSTCXX=C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\HostX64\x64\cl.exe
|
set CUDAHOSTCXX=C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\HostX64\x64\cl.exe
|
||||||
|
|
||||||
python setup.py develop
|
python -m pip install --no-build-isolation -v -e .
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Intel GPU builds**
|
**Intel GPU builds**
|
||||||
@ -372,7 +376,7 @@ if defined CMAKE_PREFIX_PATH (
|
|||||||
set "CMAKE_PREFIX_PATH=%CONDA_PREFIX%\Library"
|
set "CMAKE_PREFIX_PATH=%CONDA_PREFIX%\Library"
|
||||||
)
|
)
|
||||||
|
|
||||||
python setup.py develop
|
python -m pip install --no-build-isolation -v -e .
|
||||||
```
|
```
|
||||||
|
|
||||||
##### Adjust Build Options (Optional)
|
##### Adjust Build Options (Optional)
|
||||||
@ -382,6 +386,7 @@ the following. For example, adjusting the pre-detected directories for CuDNN or
|
|||||||
with such a step.
|
with such a step.
|
||||||
|
|
||||||
On Linux
|
On Linux
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
|
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
|
||||||
CMAKE_ONLY=1 python setup.py build
|
CMAKE_ONLY=1 python setup.py build
|
||||||
@ -389,6 +394,7 @@ ccmake build # or cmake-gui build
|
|||||||
```
|
```
|
||||||
|
|
||||||
On macOS
|
On macOS
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
|
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
|
||||||
MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ CMAKE_ONLY=1 python setup.py build
|
MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ CMAKE_ONLY=1 python setup.py build
|
||||||
|
|||||||
@ -131,69 +131,25 @@ uint64_t CPUGeneratorImpl::seed() {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets the internal state of CPUGeneratorImpl. The new internal state
|
* Sets the internal state of CPUGeneratorImpl. The new internal state
|
||||||
* must be a strided CPU byte tensor and of the same size as either
|
* must be a strided CPU byte tensor and of the same size as CPUGeneratorImplState.
|
||||||
* CPUGeneratorImplStateLegacy (for legacy CPU generator state) or
|
|
||||||
* CPUGeneratorImplState (for new state).
|
|
||||||
*
|
|
||||||
* FIXME: Remove support of the legacy state in the future?
|
|
||||||
*/
|
*/
|
||||||
void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
|
void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
|
||||||
using detail::CPUGeneratorImplState;
|
using detail::CPUGeneratorImplState;
|
||||||
using detail::CPUGeneratorImplStateLegacy;
|
using detail::CPUGeneratorImplStateLegacy;
|
||||||
|
|
||||||
static_assert(std::is_standard_layout_v<CPUGeneratorImplStateLegacy>, "CPUGeneratorImplStateLegacy is not a PODType");
|
|
||||||
static_assert(std::is_standard_layout_v<CPUGeneratorImplState>, "CPUGeneratorImplState is not a PODType");
|
static_assert(std::is_standard_layout_v<CPUGeneratorImplState>, "CPUGeneratorImplState is not a PODType");
|
||||||
|
constexpr size_t size = sizeof(CPUGeneratorImplState);
|
||||||
static const size_t size_legacy = sizeof(CPUGeneratorImplStateLegacy);
|
|
||||||
static const size_t size_current = sizeof(CPUGeneratorImplState);
|
|
||||||
static_assert(size_legacy != size_current, "CPUGeneratorImplStateLegacy and CPUGeneratorImplState can't be of the same size");
|
|
||||||
|
|
||||||
detail::check_rng_state(new_state);
|
detail::check_rng_state(new_state);
|
||||||
|
|
||||||
at::mt19937 engine;
|
at::mt19937 engine;
|
||||||
auto float_normal_sample = std::optional<float>();
|
|
||||||
auto double_normal_sample = std::optional<double>();
|
|
||||||
|
|
||||||
// Construct the state of at::CPUGeneratorImpl based on input byte tensor size.
|
|
||||||
CPUGeneratorImplStateLegacy* legacy_pod{nullptr};
|
|
||||||
auto new_state_size = new_state.numel();
|
auto new_state_size = new_state.numel();
|
||||||
if (new_state_size == size_legacy) {
|
|
||||||
legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data();
|
|
||||||
// Note that in CPUGeneratorImplStateLegacy, we didn't have float version
|
|
||||||
// of normal sample and hence we leave the std::optional<float> as is
|
|
||||||
|
|
||||||
// Update next_double_normal_sample.
|
TORCH_CHECK(new_state_size == size, "Expected a CPUGeneratorImplState of size ", size,
|
||||||
// Note that CPUGeneratorImplStateLegacy stores two uniform values (normal_x, normal_y)
|
" but found the input RNG state size to be ", new_state_size);
|
||||||
// and a rho value (normal_rho). These three values were redundant and in the new
|
|
||||||
// DistributionsHelper.h, we store the actual extra normal sample, rather than three
|
|
||||||
// intermediate values.
|
|
||||||
if (legacy_pod->normal_is_valid) {
|
|
||||||
auto r = legacy_pod->normal_rho;
|
|
||||||
auto theta = 2.0 * c10::pi<double> * legacy_pod->normal_x;
|
|
||||||
// we return the sin version of the normal sample when in caching mode
|
|
||||||
double_normal_sample = std::optional<double>(r * ::sin(theta));
|
|
||||||
}
|
|
||||||
} else if (new_state_size == size_current) {
|
|
||||||
auto rng_state = (CPUGeneratorImplState*)new_state.data();
|
|
||||||
legacy_pod = &rng_state->legacy_pod;
|
|
||||||
// update next_float_normal_sample
|
|
||||||
if (rng_state->is_next_float_normal_sample_valid) {
|
|
||||||
float_normal_sample = std::optional<float>(rng_state->next_float_normal_sample);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update next_double_normal_sample.
|
|
||||||
// Note that in getRNGState, we now return the actual normal sample in normal_y
|
|
||||||
// and if it's valid in normal_is_valid. The redundant normal_x and normal_rho
|
|
||||||
// are squashed to 0.0.
|
|
||||||
if (legacy_pod->normal_is_valid) {
|
|
||||||
double_normal_sample = std::optional<double>(legacy_pod->normal_y);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
TORCH_CHECK(false, "Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy,
|
|
||||||
" or a CPUGeneratorImplState of size ", size_current,
|
|
||||||
" but found the input RNG state size to be ", new_state_size);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
auto rng_state = new_state.data_ptr_impl<CPUGeneratorImplState>();
|
||||||
|
auto legacy_pod = &(rng_state->legacy_pod);
|
||||||
// construct engine_
|
// construct engine_
|
||||||
// Note that CPUGeneratorImplStateLegacy stored a state array of 64 bit uints, whereas in our
|
// Note that CPUGeneratorImplStateLegacy stored a state array of 64 bit uints, whereas in our
|
||||||
// redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are
|
// redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are
|
||||||
@ -207,8 +163,12 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
|
|||||||
engine.set_data(rng_data);
|
engine.set_data(rng_data);
|
||||||
TORCH_CHECK(engine.is_valid(), "Invalid mt19937 state");
|
TORCH_CHECK(engine.is_valid(), "Invalid mt19937 state");
|
||||||
this->engine_ = engine;
|
this->engine_ = engine;
|
||||||
this->next_float_normal_sample_ = float_normal_sample;
|
this->next_float_normal_sample_ = rng_state->is_next_float_normal_sample_valid
|
||||||
this->next_double_normal_sample_ = double_normal_sample;
|
? std::optional<float>(rng_state->next_float_normal_sample)
|
||||||
|
: std::optional<float>();
|
||||||
|
this->next_double_normal_sample_ = legacy_pod->normal_is_valid
|
||||||
|
? std::optional<double>(legacy_pod->normal_y)
|
||||||
|
: std::optional<double>();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -431,7 +431,8 @@ class TORCH_API Context {
|
|||||||
at::SDPBackend::flash_attention,
|
at::SDPBackend::flash_attention,
|
||||||
at::SDPBackend::efficient_attention,
|
at::SDPBackend::efficient_attention,
|
||||||
at::SDPBackend::math,
|
at::SDPBackend::math,
|
||||||
at::SDPBackend::cudnn_attention};
|
at::SDPBackend::cudnn_attention,
|
||||||
|
at::SDPBackend::overrideable};
|
||||||
bool enabled_flashSDP = true;
|
bool enabled_flashSDP = true;
|
||||||
bool enabled_mem_efficientSDP = true;
|
bool enabled_mem_efficientSDP = true;
|
||||||
bool enabled_mathSDP = true;
|
bool enabled_mathSDP = true;
|
||||||
|
|||||||
@ -26,9 +26,7 @@ inline void infer_size_impl(
|
|||||||
std::optional<int64_t> infer_dim;
|
std::optional<int64_t> infer_dim;
|
||||||
for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) {
|
for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) {
|
||||||
if (TORCH_GUARD_OR_FALSE(sym_eq(shape[dim], -1))) {
|
if (TORCH_GUARD_OR_FALSE(sym_eq(shape[dim], -1))) {
|
||||||
if (infer_dim) {
|
TORCH_CHECK(!infer_dim, "only one dimension can be inferred");
|
||||||
throw std::runtime_error("only one dimension can be inferred");
|
|
||||||
}
|
|
||||||
infer_dim = dim;
|
infer_dim = dim;
|
||||||
} else {
|
} else {
|
||||||
// in case of unbacked shape[dim] we assume it's not -1 and add a runtime
|
// in case of unbacked shape[dim] we assume it's not -1 and add a runtime
|
||||||
|
|||||||
@ -214,7 +214,7 @@ inline Tensor applySlice(
|
|||||||
"step must be greater than zero");
|
"step must be greater than zero");
|
||||||
|
|
||||||
// See NOTE [nested tensor size for indexing]
|
// See NOTE [nested tensor size for indexing]
|
||||||
if (self_sizes.has_value()) {
|
if (self_sizes.has_value() && self_sizes.value().size() > 0) {
|
||||||
// Skip this optimization if we are tracing, as the trace may be polymorphic
|
// Skip this optimization if we are tracing, as the trace may be polymorphic
|
||||||
// over the shape of the `self` tensor, and we still want to record
|
// over the shape of the `self` tensor, and we still want to record
|
||||||
// the slice.
|
// the slice.
|
||||||
@ -223,7 +223,7 @@ inline Tensor applySlice(
|
|||||||
: self.sym_size(dim);
|
: self.sym_size(dim);
|
||||||
if (!disable_slice_optimization &&
|
if (!disable_slice_optimization &&
|
||||||
TORCH_STATICALLY_KNOWN_TRUE(start.sym_eq(0)) &&
|
TORCH_STATICALLY_KNOWN_TRUE(start.sym_eq(0)) &&
|
||||||
TORCH_STATICALLY_KNOWN_TRUE(length.sym_eq(stop)) && step == 1) {
|
TORCH_STATICALLY_KNOWN_TRUE(length.sym_le(stop)) && step == 1) {
|
||||||
return self;
|
return self;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -59,9 +59,7 @@ struct TORCH_API Generator {
|
|||||||
|
|
||||||
explicit Generator(c10::intrusive_ptr<c10::GeneratorImpl> gen_impl)
|
explicit Generator(c10::intrusive_ptr<c10::GeneratorImpl> gen_impl)
|
||||||
: impl_(std::move(gen_impl)) {
|
: impl_(std::move(gen_impl)) {
|
||||||
if (impl_.get() == nullptr) {
|
TORCH_CHECK(impl_, "GeneratorImpl with nullptr is not supported");
|
||||||
throw std::runtime_error("GeneratorImpl with nullptr is not supported");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool operator==(const Generator& rhs) const {
|
bool operator==(const Generator& rhs) const {
|
||||||
|
|||||||
@ -98,9 +98,7 @@ class TORCH_API TensorBase {
|
|||||||
explicit TensorBase(
|
explicit TensorBase(
|
||||||
c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
|
c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
|
||||||
: impl_(std::move(tensor_impl)) {
|
: impl_(std::move(tensor_impl)) {
|
||||||
if (impl_.get() == nullptr) {
|
TORCH_CHECK(impl_, "TensorImpl with nullptr is not supported");
|
||||||
throw std::runtime_error("TensorImpl with nullptr is not supported");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
TensorBase(const TensorBase&) = default;
|
TensorBase(const TensorBase&) = default;
|
||||||
TensorBase(TensorBase&&) noexcept = default;
|
TensorBase(TensorBase&&) noexcept = default;
|
||||||
|
|||||||
@ -68,11 +68,10 @@ Symbol InternedStrings::_symbol(const std::string& s) {
|
|||||||
return it->second;
|
return it->second;
|
||||||
|
|
||||||
auto pos = s.find("::");
|
auto pos = s.find("::");
|
||||||
if (pos == std::string::npos) {
|
TORCH_CHECK(
|
||||||
std::stringstream ss;
|
pos != std::string::npos,
|
||||||
ss << "all symbols must have a namespace, <namespace>::<string>, but found: " << s;
|
"all symbols must have a namespace, <namespace>::<string>, but found: ",
|
||||||
throw std::runtime_error(ss.str());
|
s);
|
||||||
}
|
|
||||||
Symbol ns = _symbol("namespaces::" + s.substr(0, pos));
|
Symbol ns = _symbol("namespaces::" + s.substr(0, pos));
|
||||||
|
|
||||||
Symbol sym(sym_to_info_.size());
|
Symbol sym(sym_to_info_.size());
|
||||||
@ -121,12 +120,11 @@ std::string Symbol::domainString() const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Symbol Symbol::fromDomainAndUnqualString(const std::string & d, const std::string & s) {
|
Symbol Symbol::fromDomainAndUnqualString(const std::string & d, const std::string & s) {
|
||||||
if (d.compare(0, domain_prefix().size(), domain_prefix()) != 0) {
|
TORCH_CHECK(
|
||||||
std::ostringstream ss;
|
d.compare(0, domain_prefix().size(), domain_prefix()) == 0,
|
||||||
ss << "Symbol: domain string is expected to be prefixed with '"
|
"Symbol: domain string is expected to be prefixed with '",
|
||||||
<< domain_prefix() << "', e.g. 'org.pytorch.aten'";
|
domain_prefix(),
|
||||||
throw std::runtime_error(ss.str());
|
"', e.g. 'org.pytorch.aten'");
|
||||||
}
|
|
||||||
std::string qualString = d.substr(domain_prefix().size()) + "::" + s;
|
std::string qualString = d.substr(domain_prefix().size()) + "::" + s;
|
||||||
return fromQualString(qualString);
|
return fromQualString(qualString);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -406,8 +406,7 @@ size_t IValue::hash(const IValue& v) {
|
|||||||
case Tag::Enum:
|
case Tag::Enum:
|
||||||
case Tag::Stream:
|
case Tag::Stream:
|
||||||
case Tag::Uninitialized:
|
case Tag::Uninitialized:
|
||||||
throw std::runtime_error(
|
TORCH_CHECK(false, "unhashable type: '" + v.type()->repr_str() + "'");
|
||||||
"unhashable type: '" + v.type()->repr_str() + "'");
|
|
||||||
}
|
}
|
||||||
// the above switch should be exhaustive
|
// the above switch should be exhaustive
|
||||||
TORCH_INTERNAL_ASSERT(false, "we should never reach here")
|
TORCH_INTERNAL_ASSERT(false, "we should never reach here")
|
||||||
|
|||||||
@ -116,10 +116,9 @@ struct SingleElementType : public SharedType {
|
|||||||
|
|
||||||
protected:
|
protected:
|
||||||
SingleElementType(TypePtr elem) : SharedType(Kind), elem(std::move(elem)) {
|
SingleElementType(TypePtr elem) : SharedType(Kind), elem(std::move(elem)) {
|
||||||
if (!this->elem) {
|
TORCH_CHECK(
|
||||||
throw std::runtime_error(c10::str(
|
this->elem,
|
||||||
"Can not create ", typeKindToString(Kind), " with None type"));
|
c10::str("Can not create ", typeKindToString(Kind), " with None type"));
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -416,16 +415,12 @@ struct TORCH_API SymbolicShape {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ShapeSymbol operator[](size_t i) const {
|
ShapeSymbol operator[](size_t i) const {
|
||||||
if (!dims_) {
|
TORCH_CHECK(dims_, "Rank isn't fixed");
|
||||||
throw std::runtime_error("Rank isn't fixed");
|
|
||||||
}
|
|
||||||
return (*dims_).at(i);
|
return (*dims_).at(i);
|
||||||
}
|
}
|
||||||
|
|
||||||
ShapeSymbol at(size_t i) const {
|
ShapeSymbol at(size_t i) const {
|
||||||
if (!dims_) {
|
TORCH_CHECK(dims_, "Rank isn't fixed");
|
||||||
throw std::runtime_error("Rank isn't fixed");
|
|
||||||
}
|
|
||||||
return (*dims_).at(i);
|
return (*dims_).at(i);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -520,9 +515,7 @@ struct VaryingShape {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const std::optional<T> &operator[](size_t i) const {
|
const std::optional<T> &operator[](size_t i) const {
|
||||||
if (!dims_) {
|
TORCH_CHECK(dims_, "Rank isn't fixed");
|
||||||
throw std::runtime_error("Rank isn't fixed");
|
|
||||||
}
|
|
||||||
return (*dims_).at(i);
|
return (*dims_).at(i);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -957,9 +950,7 @@ struct TORCH_API DictType : public SharedType {
|
|||||||
|
|
||||||
TypePtr createWithContained(
|
TypePtr createWithContained(
|
||||||
std::vector<TypePtr> contained_types) const override {
|
std::vector<TypePtr> contained_types) const override {
|
||||||
if (contained_types.size() != 2) {
|
TORCH_CHECK(contained_types.size() == 2, "Expected 2 contained types");
|
||||||
throw std::runtime_error("Expected 2 contained types");
|
|
||||||
}
|
|
||||||
return create(std::move(contained_types.at(0)), std::move(contained_types.at(1)));
|
return create(std::move(contained_types.at(0)), std::move(contained_types.at(1)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -826,9 +826,7 @@ TupleType::TupleType(
|
|||||||
: NamedType(TypeKind::TupleType, std::move(name)),
|
: NamedType(TypeKind::TupleType, std::move(name)),
|
||||||
elements_(std::move(elements)),
|
elements_(std::move(elements)),
|
||||||
has_free_variables_(std::any_of(elements_.begin(), elements_.end(), [](const TypePtr& v) {
|
has_free_variables_(std::any_of(elements_.begin(), elements_.end(), [](const TypePtr& v) {
|
||||||
if (!v) {
|
TORCH_CHECK(v, "Can not create tuple with None type");
|
||||||
throw std::runtime_error("Can not create tuple with None type");
|
|
||||||
}
|
|
||||||
return v->hasFreeVariables();
|
return v->hasFreeVariables();
|
||||||
})), schema_(std::move(schema)) {
|
})), schema_(std::move(schema)) {
|
||||||
|
|
||||||
|
|||||||
@ -163,6 +163,9 @@ class Vectorized<BFloat16> {
|
|||||||
Vectorized<BFloat16> exp_u20() const {
|
Vectorized<BFloat16> exp_u20() const {
|
||||||
return exp();
|
return exp();
|
||||||
}
|
}
|
||||||
|
Vectorized<BFloat16> fexp_u20() const {
|
||||||
|
return exp();
|
||||||
|
}
|
||||||
Vectorized<BFloat16> fmod(const Vectorized<BFloat16>& q) const;
|
Vectorized<BFloat16> fmod(const Vectorized<BFloat16>& q) const;
|
||||||
Vectorized<BFloat16> hypot(const Vectorized<BFloat16>& b) const;
|
Vectorized<BFloat16> hypot(const Vectorized<BFloat16>& b) const;
|
||||||
Vectorized<BFloat16> i0() const;
|
Vectorized<BFloat16> i0() const;
|
||||||
|
|||||||
@ -249,6 +249,9 @@ class Vectorized<double> {
|
|||||||
Vectorized<double> exp_u20() const {
|
Vectorized<double> exp_u20() const {
|
||||||
return exp();
|
return exp();
|
||||||
}
|
}
|
||||||
|
Vectorized<double> fexp_u20() const {
|
||||||
|
return exp();
|
||||||
|
}
|
||||||
Vectorized<double> fmod(const Vectorized<double>& q) const {USE_SLEEF(
|
Vectorized<double> fmod(const Vectorized<double>& q) const {USE_SLEEF(
|
||||||
{ return Vectorized<double>(Sleef_fmoddx_sve(values, q)); },
|
{ return Vectorized<double>(Sleef_fmoddx_sve(values, q)); },
|
||||||
{
|
{
|
||||||
|
|||||||
@ -314,6 +314,9 @@ class Vectorized<float> {
|
|||||||
Vectorized<float> exp_u20() const {
|
Vectorized<float> exp_u20() const {
|
||||||
return exp();
|
return exp();
|
||||||
}
|
}
|
||||||
|
Vectorized<float> fexp_u20() const {
|
||||||
|
return exp();
|
||||||
|
}
|
||||||
Vectorized<float> fmod(const Vectorized<float>& q) const {USE_SLEEF(
|
Vectorized<float> fmod(const Vectorized<float>& q) const {USE_SLEEF(
|
||||||
{ return Vectorized<float>(Sleef_fmodfx_sve(values, q)); },
|
{ return Vectorized<float>(Sleef_fmodfx_sve(values, q)); },
|
||||||
{
|
{
|
||||||
|
|||||||
@ -308,6 +308,9 @@ class Vectorized<float> {
|
|||||||
Vectorized<float> exp_u20() const {
|
Vectorized<float> exp_u20() const {
|
||||||
return exp();
|
return exp();
|
||||||
}
|
}
|
||||||
|
Vectorized<float> fexp_u20() const {
|
||||||
|
return exp();
|
||||||
|
}
|
||||||
DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(
|
DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(
|
||||||
fmod,
|
fmod,
|
||||||
Sleef_fmodf4)
|
Sleef_fmodf4)
|
||||||
|
|||||||
@ -206,6 +206,10 @@ struct Vectorized16 {
|
|||||||
return static_cast<const Derived*>(this)->map_with_vec_float_method(
|
return static_cast<const Derived*>(this)->map_with_vec_float_method(
|
||||||
&Vectorized<float>::exp_u20);
|
&Vectorized<float>::exp_u20);
|
||||||
}
|
}
|
||||||
|
Derived fexp_u20() const {
|
||||||
|
return static_cast<const Derived*>(this)->map_with_vec_float_method(
|
||||||
|
&Vectorized<float>::exp_u20);
|
||||||
|
}
|
||||||
Derived fmod(const Derived& q) const {
|
Derived fmod(const Derived& q) const {
|
||||||
// This function is questionable with a conversion, so we use map2
|
// This function is questionable with a conversion, so we use map2
|
||||||
return map2(q, std::fmod);
|
return map2(q, std::fmod);
|
||||||
|
|||||||
@ -488,6 +488,9 @@ class Vectorized16 {
|
|||||||
Vectorized<T> expm1() const {
|
Vectorized<T> expm1() const {
|
||||||
return map(Sleef_expm1f8_u10);
|
return map(Sleef_expm1f8_u10);
|
||||||
}
|
}
|
||||||
|
Vectorized<T> fexp_u20() const {
|
||||||
|
return exp();
|
||||||
|
}
|
||||||
Vectorized<T> exp_u20() const {
|
Vectorized<T> exp_u20() const {
|
||||||
return exp();
|
return exp();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -198,6 +198,9 @@ class Vectorized<double> {
|
|||||||
Vectorized<double> exp_u20() const {
|
Vectorized<double> exp_u20() const {
|
||||||
return exp();
|
return exp();
|
||||||
}
|
}
|
||||||
|
Vectorized<double> fexp_u20() const {
|
||||||
|
return exp();
|
||||||
|
}
|
||||||
Vectorized<double> fmod(const Vectorized<double>& q) const {
|
Vectorized<double> fmod(const Vectorized<double>& q) const {
|
||||||
return Vectorized<double>(Sleef_fmodd4(values, q));
|
return Vectorized<double>(Sleef_fmodd4(values, q));
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,4 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
// DO NOT DEFINE STATIC DATA IN THIS HEADER!
|
// DO NOT DEFINE STATIC DATA IN THIS HEADER!
|
||||||
// See Note [Do not compile initializers with AVX]
|
// See Note [Do not compile initializers with AVX]
|
||||||
|
|
||||||
@ -256,6 +255,63 @@ class Vectorized<float> {
|
|||||||
Vectorized<float> expm1() const {
|
Vectorized<float> expm1() const {
|
||||||
return Vectorized<float>(Sleef_expm1f8_u10(values));
|
return Vectorized<float>(Sleef_expm1f8_u10(values));
|
||||||
}
|
}
|
||||||
|
Vectorized<float> fexp_u20() const {
|
||||||
|
const __m256 vec_c0 = _mm256_set1_ps(0.00010703434948458272f);
|
||||||
|
const __m256 vec_c1 = _mm256_set1_ps(0.30354260500649682f);
|
||||||
|
const __m256 vec_c2 = _mm256_set1_ps(-0.22433836478672356);
|
||||||
|
const __m256 vec_c3 = _mm256_set1_ps(-0.079204240219773236);
|
||||||
|
|
||||||
|
const __m256 vec_exp_log2ef =
|
||||||
|
_mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e)
|
||||||
|
|
||||||
|
const __m256 vec_a = _mm256_set1_ps(std::pow(2, 23) / std::log2(2));
|
||||||
|
const __m256 vec_b = _mm256_set1_ps(std::pow(2, 23) * 127.f);
|
||||||
|
|
||||||
|
const __m256 vec_ln_flt_min =
|
||||||
|
_mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50));
|
||||||
|
const __m256 vec_ln_flt_max =
|
||||||
|
_mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218));
|
||||||
|
const __m256 vec_inf = _mm256_set1_ps(INFINITY);
|
||||||
|
const __m256 zero = _mm256_setzero_ps();
|
||||||
|
|
||||||
|
// exp(x) = 2**(x * log2(e))
|
||||||
|
// = 2**xi * 2**xf - TIP: we are using the IEEE floating point
|
||||||
|
// representation with identification to the exponent and the
|
||||||
|
// mantissa
|
||||||
|
// 2**xf will be approximated to a polynomial of degree 3 computed with
|
||||||
|
// Horner method
|
||||||
|
// compute the min/max for the mask
|
||||||
|
// Masks
|
||||||
|
__m256 mask_too_small =
|
||||||
|
_mm256_cmp_ps(values, vec_ln_flt_min, _CMP_LT_OS); // x < min
|
||||||
|
__m256 mask_too_large =
|
||||||
|
_mm256_cmp_ps(values, vec_ln_flt_max, _CMP_GT_OS); // x > max
|
||||||
|
|
||||||
|
// transformation with log2(e)
|
||||||
|
auto vec_src = _mm256_mul_ps(values, vec_exp_log2ef);
|
||||||
|
auto vec_fractional = _mm256_sub_ps(vec_src, _mm256_floor_ps(vec_src));
|
||||||
|
|
||||||
|
// compute polynomial using Horner Scheme
|
||||||
|
auto vec_res = _mm256_fmadd_ps(vec_fractional, vec_c3, vec_c2);
|
||||||
|
vec_res = _mm256_fmadd_ps(vec_fractional, vec_res, vec_c1);
|
||||||
|
vec_res = _mm256_fmadd_ps(vec_fractional, vec_res, vec_c0);
|
||||||
|
|
||||||
|
vec_src = _mm256_sub_ps(vec_src, vec_res);
|
||||||
|
// the trick is here, headache in perspective
|
||||||
|
auto tmp = _mm256_fmadd_ps(vec_a, vec_src, vec_b);
|
||||||
|
// headache bis
|
||||||
|
__m256i casted_integer = _mm256_cvttps_epi32(tmp);
|
||||||
|
// bitwise to float for the final transformation
|
||||||
|
auto result = _mm256_castsi256_ps(casted_integer);
|
||||||
|
// boundary condition
|
||||||
|
// Set to 0 where x < ln(FLT_MIN)
|
||||||
|
result = _mm256_blendv_ps(result, zero, mask_too_small);
|
||||||
|
// Set to +inf where x > ln(FLT_MAX)
|
||||||
|
result = _mm256_blendv_ps(result, vec_inf, mask_too_large);
|
||||||
|
// final interpretation to float
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
Vectorized<float> exp_u20() const {
|
Vectorized<float> exp_u20() const {
|
||||||
// A faster version of exp with ULP=20
|
// A faster version of exp with ULP=20
|
||||||
const __m256 vec_factorial_1 =
|
const __m256 vec_factorial_1 =
|
||||||
|
|||||||
@ -121,27 +121,52 @@ typename std::enable_if_t<
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
typename std::enable_if_t<
|
at::vec::Vectorized<T> inline convert_float_to_int8(
|
||||||
std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
|
at::vec::Vectorized<float> src);
|
||||||
at::vec::Vectorized<
|
|
||||||
T>> inline convert_float_to_int8(at::vec::Vectorized<float> src) {
|
template <>
|
||||||
|
at::vec::Vectorized<int8_t> inline convert_float_to_int8(
|
||||||
|
at::vec::Vectorized<float> src) {
|
||||||
// Convert from float32 to int32 with truncation
|
// Convert from float32 to int32 with truncation
|
||||||
__m256i x_values_int32 = _mm256_cvttps_epi32(src);
|
__m256i x_values_int32 = _mm256_cvttps_epi32(src);
|
||||||
|
|
||||||
// Convert from int32 to int16 using signed saturation
|
// Convert from int32 to int16 using signed saturation
|
||||||
__m256i xy_packed_v = _mm256_packs_epi32(x_values_int32, x_values_int32);
|
__m256i xy_packed_v = _mm256_packs_epi32(x_values_int32, x_values_int32);
|
||||||
|
|
||||||
constexpr auto min_val = std::numeric_limits<T>::min();
|
constexpr auto min_val = std::numeric_limits<int8_t>::min();
|
||||||
constexpr auto max_val = std::numeric_limits<T>::max();
|
constexpr auto max_val = std::numeric_limits<int8_t>::max();
|
||||||
|
|
||||||
// Convert from int16 to uint8/int8 using unsigned saturation
|
// Convert from int16 to int8 using unsigned saturation
|
||||||
__m256i xyzw_clamped_v =
|
__m256i xyzw_clamped_v = pack_saturate_and_clamp<int8_t>(
|
||||||
pack_saturate_and_clamp<T>(xy_packed_v, xy_packed_v, min_val, max_val);
|
xy_packed_v, xy_packed_v, min_val, max_val);
|
||||||
__m256i permute_mask_v =
|
__m256i permute_mask_v =
|
||||||
_mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
|
_mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
|
||||||
return _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v);
|
return _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
at::vec::Vectorized<uint8_t> inline convert_float_to_int8(
|
||||||
|
at::vec::Vectorized<float> src) {
|
||||||
|
// The type of *_val should be int32_t to ensure correct clamping behavior.
|
||||||
|
constexpr auto min_val = std::numeric_limits<int32_t>::min();
|
||||||
|
constexpr auto max_val = std::numeric_limits<int32_t>::max();
|
||||||
|
__m256 float32_min_val = _mm256_set1_ps(float(min_val));
|
||||||
|
__m256 float32_max_val = _mm256_set1_ps(float(max_val));
|
||||||
|
__m256 float32_src = _mm256_max_ps(src, float32_min_val);
|
||||||
|
float32_src = _mm256_min_ps(float32_src, float32_max_val);
|
||||||
|
__m256i truncated_src = _mm256_cvttps_epi32(float32_src);
|
||||||
|
|
||||||
|
__m128i r1 = _mm256_castsi256_si128(truncated_src);
|
||||||
|
__m128i mask = _mm_setr_epi8(
|
||||||
|
0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
|
||||||
|
__m128i r1_shuffled = _mm_shuffle_epi8(r1, mask);
|
||||||
|
__m128i r2 = _mm256_extractf128_si256(truncated_src, 1);
|
||||||
|
__m128i r2_shuffled = _mm_shuffle_epi8(r2, mask);
|
||||||
|
__m128i result = _mm_unpacklo_epi32(r1_shuffled, r2_shuffled);
|
||||||
|
|
||||||
|
return _mm256_castsi128_si256(result);
|
||||||
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__FORCE_INLINE void QuantizeAvx2(
|
__FORCE_INLINE void QuantizeAvx2(
|
||||||
const float* src,
|
const float* src,
|
||||||
|
|||||||
@ -273,6 +273,9 @@ class Vectorized<double> {
|
|||||||
Vectorized<double> C10_ALWAYS_INLINE exp_u20() const {
|
Vectorized<double> C10_ALWAYS_INLINE exp_u20() const {
|
||||||
return exp();
|
return exp();
|
||||||
}
|
}
|
||||||
|
Vectorized<double> C10_ALWAYS_INLINE fexp_u20() const {
|
||||||
|
return exp();
|
||||||
|
}
|
||||||
|
|
||||||
Vectorized<double> lgamma() const __ubsan_ignore_undefined__ {
|
Vectorized<double> lgamma() const __ubsan_ignore_undefined__ {
|
||||||
return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)};
|
return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)};
|
||||||
|
|||||||
@ -352,6 +352,9 @@ class Vectorized<float> {
|
|||||||
Vectorized<float> C10_ALWAYS_INLINE exp_u20() const {
|
Vectorized<float> C10_ALWAYS_INLINE exp_u20() const {
|
||||||
return exp();
|
return exp();
|
||||||
}
|
}
|
||||||
|
Vectorized<float> C10_ALWAYS_INLINE fexp_u20() const {
|
||||||
|
return exp();
|
||||||
|
}
|
||||||
|
|
||||||
Vectorized<float> C10_ALWAYS_INLINE log() const {
|
Vectorized<float> C10_ALWAYS_INLINE log() const {
|
||||||
return {Sleef_logf4_u10(_vec0), Sleef_logf4_u10(_vec1)};
|
return {Sleef_logf4_u10(_vec0), Sleef_logf4_u10(_vec1)};
|
||||||
|
|||||||
@ -1023,6 +1023,9 @@ struct Vectorized<T, std::enable_if_t<is_zarch_implemented<T>()>> {
|
|||||||
Vectorized<T> exp_u20() const {
|
Vectorized<T> exp_u20() const {
|
||||||
return exp();
|
return exp();
|
||||||
}
|
}
|
||||||
|
Vectorized<T> fexp_u20() const {
|
||||||
|
return exp();
|
||||||
|
}
|
||||||
|
|
||||||
Vectorized<T> log() const {
|
Vectorized<T> log() const {
|
||||||
return mapSleef(Sleef_logf4_u10, Sleef_logd2_u10);
|
return mapSleef(Sleef_logf4_u10, Sleef_logd2_u10);
|
||||||
|
|||||||
@ -535,6 +535,9 @@ class Vectorized16 {
|
|||||||
Vectorized<T> expm1() const {
|
Vectorized<T> expm1() const {
|
||||||
return map(Sleef_expm1f16_u10);
|
return map(Sleef_expm1f16_u10);
|
||||||
}
|
}
|
||||||
|
Vectorized<T> fexp_u20() const {
|
||||||
|
return exp();
|
||||||
|
}
|
||||||
Vectorized<T> exp_u20() const {
|
Vectorized<T> exp_u20() const {
|
||||||
return exp();
|
return exp();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -221,6 +221,9 @@ class Vectorized<double> {
|
|||||||
Vectorized<double> exp_u20() const {
|
Vectorized<double> exp_u20() const {
|
||||||
return exp();
|
return exp();
|
||||||
}
|
}
|
||||||
|
Vectorized<double> fexp_u20() const {
|
||||||
|
return exp();
|
||||||
|
}
|
||||||
Vectorized<double> fmod(const Vectorized<double>& q) const {
|
Vectorized<double> fmod(const Vectorized<double>& q) const {
|
||||||
return Vectorized<double>(Sleef_fmodd8(values, q));
|
return Vectorized<double>(Sleef_fmodd8(values, q));
|
||||||
}
|
}
|
||||||
|
|||||||
@ -310,6 +310,60 @@ class Vectorized<float> {
|
|||||||
Vectorized<float> expm1() const {
|
Vectorized<float> expm1() const {
|
||||||
return Vectorized<float>(Sleef_expm1f16_u10(values));
|
return Vectorized<float>(Sleef_expm1f16_u10(values));
|
||||||
}
|
}
|
||||||
|
Vectorized<float> fexp_u20() const {
|
||||||
|
const __m512 vec_c0 = _mm512_set1_ps(0.00010703434948458272f);
|
||||||
|
const __m512 vec_c1 = _mm512_set1_ps(0.30354260500649682f);
|
||||||
|
const __m512 vec_c2 = _mm512_set1_ps(-0.22433836478672356);
|
||||||
|
const __m512 vec_c3 = _mm512_set1_ps(-0.079204240219773236);
|
||||||
|
|
||||||
|
const __m512 vec_exp_log2ef =
|
||||||
|
_mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e)
|
||||||
|
|
||||||
|
const __m512 vec_a = _mm512_set1_ps(std::pow(2, 23) / std::log2(2));
|
||||||
|
const __m512 vec_b = _mm512_set1_ps(std::pow(2, 23) * 127.f);
|
||||||
|
|
||||||
|
const __m512 vec_ln_flt_min =
|
||||||
|
_mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
|
||||||
|
const __m512 vec_ln_flt_max =
|
||||||
|
_mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
|
||||||
|
__m512i vec_infinity = _mm512_set1_epi32(0x7F800000);
|
||||||
|
__m512i vec_zero = _mm512_setzero_epi32();
|
||||||
|
|
||||||
|
// Fast Exponential Computation on SIMD Architectures
|
||||||
|
// A. Cristiano I. Malossi, Yves Ineichen, Costas Bekas, and Alessandro
|
||||||
|
// Curioni exp(x) = 2**(x * log2(e))
|
||||||
|
// = 2**xi * 2**xf - TIP: we are using the IEEE floating point
|
||||||
|
// representation with identification to the exponent and the
|
||||||
|
// mantissa
|
||||||
|
// 2**xf will be approximated to a polynomial of degree 3 computed with
|
||||||
|
// Horner method
|
||||||
|
// mask for the boundary condition
|
||||||
|
auto min_mask = _mm512_cmp_ps_mask(values, vec_ln_flt_min, _CMP_LT_OS);
|
||||||
|
auto max_mask = _mm512_cmp_ps_mask(values, vec_ln_flt_max, _CMP_GT_OS);
|
||||||
|
|
||||||
|
// transformation with log2(e)
|
||||||
|
auto vec_src = _mm512_mul_ps(values, vec_exp_log2ef);
|
||||||
|
auto vec_fractional = _mm512_sub_ps(vec_src, _mm512_floor_ps(vec_src));
|
||||||
|
|
||||||
|
// compute polynomial using Horner Scheme, for superscalar processor
|
||||||
|
auto vec_res = _mm512_fmadd_ps(vec_fractional, vec_c3, vec_c2);
|
||||||
|
vec_res = _mm512_fmadd_ps(vec_fractional, vec_res, vec_c1);
|
||||||
|
vec_res = _mm512_fmadd_ps(vec_fractional, vec_res, vec_c0);
|
||||||
|
|
||||||
|
vec_src = _mm512_sub_ps(vec_src, vec_res);
|
||||||
|
// the trick is here, headache in perspective
|
||||||
|
auto tmp = _mm512_fmadd_ps(vec_a, vec_src, vec_b);
|
||||||
|
// headache bis - we lose precision with the cast but it "fits", but ok
|
||||||
|
// after f32 -> f16 later
|
||||||
|
__m512i casted_integer = _mm512_cvttps_epi32(tmp);
|
||||||
|
// boundary condition, lower than the min -> 0
|
||||||
|
casted_integer = _mm512_mask_mov_epi32(casted_integer, min_mask, vec_zero);
|
||||||
|
// boundary condition, larger than the max -> +oo
|
||||||
|
casted_integer =
|
||||||
|
_mm512_mask_mov_epi32(casted_integer, max_mask, vec_infinity);
|
||||||
|
// final interpretation to float
|
||||||
|
return _mm512_castsi512_ps(casted_integer);
|
||||||
|
}
|
||||||
Vectorized<float> exp_u20() const {
|
Vectorized<float> exp_u20() const {
|
||||||
// A faster version of exp with ULP=20
|
// A faster version of exp with ULP=20
|
||||||
const __m512 vec_factorial_1 =
|
const __m512 vec_factorial_1 =
|
||||||
|
|||||||
@ -123,22 +123,24 @@ typename std::enable_if_t<
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
typename std::enable_if_t<
|
at::vec::Vectorized<T> inline convert_float_to_int8(
|
||||||
std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
|
at::vec::Vectorized<float> src);
|
||||||
at::vec::Vectorized<
|
|
||||||
T>> inline convert_float_to_int8(at::vec::Vectorized<float> src) {
|
template <>
|
||||||
|
at::vec::Vectorized<int8_t> inline convert_float_to_int8(
|
||||||
|
at::vec::Vectorized<float> src) {
|
||||||
// Convert from float32 to int32 with truncation
|
// Convert from float32 to int32 with truncation
|
||||||
__m512i x_values_int32 = _mm512_cvttps_epi32(src);
|
__m512i x_values_int32 = _mm512_cvttps_epi32(src);
|
||||||
|
|
||||||
// Convert from int32 to int16 using signed saturation
|
// Convert from int32 to int16 using signed saturation
|
||||||
__m512i xy_packed_v = _mm512_packs_epi32(x_values_int32, x_values_int32);
|
__m512i xy_packed_v = _mm512_packs_epi32(x_values_int32, x_values_int32);
|
||||||
|
|
||||||
constexpr auto min_val = std::numeric_limits<T>::min();
|
constexpr auto min_val = std::numeric_limits<int8_t>::min();
|
||||||
constexpr auto max_val = std::numeric_limits<T>::max();
|
constexpr auto max_val = std::numeric_limits<int8_t>::max();
|
||||||
|
|
||||||
// Convert from int16 to uint8/int8 using unsigned saturation
|
// Convert from int16 to int8 using unsigned saturation
|
||||||
__m512i xyzw_clamped_v =
|
__m512i xyzw_clamped_v = pack_saturate_and_clamp<int8_t>(
|
||||||
pack_saturate_and_clamp<T>(xy_packed_v, xy_packed_v, min_val, max_val);
|
xy_packed_v, xy_packed_v, min_val, max_val);
|
||||||
__m512i permute_mask_v = _mm512_set_epi32(
|
__m512i permute_mask_v = _mm512_set_epi32(
|
||||||
0x0f,
|
0x0f,
|
||||||
0x0b,
|
0x0b,
|
||||||
@ -159,6 +161,21 @@ typename std::enable_if_t<
|
|||||||
return _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v);
|
return _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
at::vec::Vectorized<uint8_t> inline convert_float_to_int8(
|
||||||
|
at::vec::Vectorized<float> src) {
|
||||||
|
// The type of *_val should be int32_t to ensure correct clamping behavior.
|
||||||
|
constexpr auto min_val = std::numeric_limits<int32_t>::min();
|
||||||
|
constexpr auto max_val = std::numeric_limits<int32_t>::max();
|
||||||
|
__m512 float32_min_val = _mm512_set1_ps(float(min_val));
|
||||||
|
__m512 float32_max_val = _mm512_set1_ps(float(max_val));
|
||||||
|
__m512 float32_src = _mm512_max_ps(src, float32_min_val);
|
||||||
|
float32_src = _mm512_min_ps(float32_src, float32_max_val);
|
||||||
|
__m512i int32_src_clamped = _mm512_cvttps_epi32(float32_src);
|
||||||
|
__m128i int8_src = _mm512_cvtepi32_epi8(int32_src_clamped);
|
||||||
|
return _mm512_castsi128_si512(int8_src);
|
||||||
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__FORCE_INLINE void QuantizeAvx512(
|
__FORCE_INLINE void QuantizeAvx512(
|
||||||
const float* src,
|
const float* src,
|
||||||
|
|||||||
@ -238,9 +238,6 @@ struct Vectorized {
|
|||||||
Vectorized vector;
|
Vectorized vector;
|
||||||
int_same_size_t<T> buffer[size()];
|
int_same_size_t<T> buffer[size()];
|
||||||
mask.store(buffer);
|
mask.store(buffer);
|
||||||
#if defined(__clang__) && __ARM_FEATURE_SVE
|
|
||||||
#pragma clang loop vectorize(disable)
|
|
||||||
#endif
|
|
||||||
for (const auto i : c10::irange(size())) {
|
for (const auto i : c10::irange(size())) {
|
||||||
if (buffer[i] & 0x01) {
|
if (buffer[i] & 0x01) {
|
||||||
vector[i] = b[i];
|
vector[i] = b[i];
|
||||||
@ -547,6 +544,9 @@ struct Vectorized {
|
|||||||
Vectorized<T> exp_u20() const {
|
Vectorized<T> exp_u20() const {
|
||||||
return map(std::exp);
|
return map(std::exp);
|
||||||
}
|
}
|
||||||
|
Vectorized<T> fexp_u20() const {
|
||||||
|
return map(std::exp);
|
||||||
|
}
|
||||||
Vectorized<T> frac() const {
|
Vectorized<T> frac() const {
|
||||||
return *this - this->trunc();
|
return *this - this->trunc();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -263,6 +263,7 @@ class VectorizedN {
|
|||||||
VECTORIZEDN_DEFINE_UNARY_OP(exp2)
|
VECTORIZEDN_DEFINE_UNARY_OP(exp2)
|
||||||
VECTORIZEDN_DEFINE_UNARY_OP(expm1)
|
VECTORIZEDN_DEFINE_UNARY_OP(expm1)
|
||||||
VECTORIZEDN_DEFINE_UNARY_OP(exp_u20)
|
VECTORIZEDN_DEFINE_UNARY_OP(exp_u20)
|
||||||
|
VECTORIZEDN_DEFINE_UNARY_OP(fexp_u20)
|
||||||
VECTORIZEDN_DEFINE_UNARY_OP(frac)
|
VECTORIZEDN_DEFINE_UNARY_OP(frac)
|
||||||
VECTORIZEDN_DEFINE_BINARY_OP(fmod)
|
VECTORIZEDN_DEFINE_BINARY_OP(fmod)
|
||||||
VECTORIZEDN_DEFINE_UNARY_OP(log)
|
VECTORIZEDN_DEFINE_UNARY_OP(log)
|
||||||
|
|||||||
@ -94,9 +94,10 @@ static std::vector<std::optional<Tensor>> batchIndices(
|
|||||||
if (index.has_value() && index->sym_numel() != 0) {
|
if (index.has_value() && index->sym_numel() != 0) {
|
||||||
const auto idx_bdim = indices_bdims[i];
|
const auto idx_bdim = indices_bdims[i];
|
||||||
indices_.emplace_back(maybePadToLogicalRank(moveBatchDimToFront(index.value(), idx_bdim), idx_bdim, maxLogicalRank));
|
indices_.emplace_back(maybePadToLogicalRank(moveBatchDimToFront(index.value(), idx_bdim), idx_bdim, maxLogicalRank));
|
||||||
if (index.value().dtype() == kBool && indices_bdims[i].has_value()) {
|
TORCH_CHECK(
|
||||||
throw std::runtime_error("vmap: We do not support batching operators that can support dynamic shape. Attempting to batch over indexing with a boolean mask.");
|
index.value().dtype() != kBool || !indices_bdims[i].has_value(),
|
||||||
}
|
"vmap: We do not support batching operators that can support ",
|
||||||
|
"dynamic shape. Attempting to batch over indexing with a boolean mask.");
|
||||||
} else {
|
} else {
|
||||||
indices_.push_back(index);
|
indices_.push_back(index);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -16,11 +16,14 @@ template<typename O, typename C>
|
|||||||
static void _assert_match(const O& original, const C& compared, const std::string& name) {
|
static void _assert_match(const O& original, const C& compared, const std::string& name) {
|
||||||
if (compared) {
|
if (compared) {
|
||||||
bool equal = (original == compared.value());
|
bool equal = (original == compared.value());
|
||||||
if (!equal) {
|
TORCH_CHECK(
|
||||||
std::stringstream msg;
|
equal,
|
||||||
msg << "Tensor " << name << " mismatch! Expected: " << compared.value() << ", Got: " << original;
|
"Tensor ",
|
||||||
throw std::runtime_error(msg.str());
|
name,
|
||||||
}
|
" mismatch! Expected: ",
|
||||||
|
compared.value(),
|
||||||
|
", Got: ",
|
||||||
|
original);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -424,6 +424,14 @@ Tensor _dirichlet_grad_cpu(const Tensor& x, const Tensor& alpha, const Tensor& t
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, std::optional<Generator> gen) {
|
Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, std::optional<Generator> gen) {
|
||||||
|
TORCH_CHECK_VALUE(
|
||||||
|
at::isFloatingType(count.scalar_type()),
|
||||||
|
"binomial only supports floating-point dtypes for count, got: ",
|
||||||
|
count.scalar_type());
|
||||||
|
TORCH_CHECK_VALUE(
|
||||||
|
at::isFloatingType(prob.scalar_type()),
|
||||||
|
"binomial only supports floating-point dtypes for prob, got: ",
|
||||||
|
prob.scalar_type());
|
||||||
Tensor ret = at::zeros(count.sizes(), count.options());
|
Tensor ret = at::zeros(count.sizes(), count.options());
|
||||||
auto iter = TensorIteratorConfig()
|
auto iter = TensorIteratorConfig()
|
||||||
.add_output(ret)
|
.add_output(ret)
|
||||||
|
|||||||
@ -180,9 +180,7 @@ TORCH_IMPL_FUNC(triu_cpu)(const Tensor& self, int64_t k, const Tensor &result) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Tensor trace_backward_symint(const Tensor& grad, c10::SymIntArrayRef sizes) {
|
Tensor trace_backward_symint(const Tensor& grad, c10::SymIntArrayRef sizes) {
|
||||||
if (sizes.size() != 2) {
|
TORCH_CHECK(sizes.size() == 2, "expected matrix input");
|
||||||
throw std::runtime_error("expected matrix input");
|
|
||||||
}
|
|
||||||
|
|
||||||
auto grad_input = at::zeros_symint(sizes[0] * sizes[1], grad.options());
|
auto grad_input = at::zeros_symint(sizes[0] * sizes[1], grad.options());
|
||||||
auto indices = at::arange(0, grad_input.numel(), sizes[1] + 1, grad.options().dtype(at::kLong));
|
auto indices = at::arange(0, grad_input.numel(), sizes[1] + 1, grad.options().dtype(at::kLong));
|
||||||
|
|||||||
@ -62,7 +62,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
|
|||||||
virtual std::optional<at::Tensor> bias() = 0;
|
virtual std::optional<at::Tensor> bias() = 0;
|
||||||
|
|
||||||
virtual void set_bias(const std::optional<at::Tensor>& bias) {
|
virtual void set_bias(const std::optional<at::Tensor>& bias) {
|
||||||
throw std::runtime_error(
|
TORCH_CHECK(
|
||||||
|
false,
|
||||||
"set_bias is not implemented for this packed "
|
"set_bias is not implemented for this packed "
|
||||||
"parameter type");
|
"parameter type");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -96,7 +96,14 @@ inline void _exp_reduce_sum_fusion_kernel(
|
|||||||
for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) {
|
for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) {
|
||||||
auto tmp0 = vec::Vectorized<T1>::loadu(a + i);
|
auto tmp0 = vec::Vectorized<T1>::loadu(a + i);
|
||||||
auto tmp1 = tmp0 - vec_max;
|
auto tmp1 = tmp0 - vec_max;
|
||||||
auto tmp2 = tmp1.exp_u20();
|
Vectorized<T1> tmp2;
|
||||||
|
if constexpr (std::is_same_v<T1, float> &&
|
||||||
|
(std::is_same_v<T2, at::BFloat16> || std::is_same_v<T2, at::Half>))
|
||||||
|
{
|
||||||
|
tmp2 = tmp1.fexp_u20();
|
||||||
|
} else {
|
||||||
|
tmp2 = tmp1.exp_u20();
|
||||||
|
}
|
||||||
vec_tmp_sum += tmp2;
|
vec_tmp_sum += tmp2;
|
||||||
_store(out + i, tmp2);
|
_store(out + i, tmp2);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -48,12 +48,7 @@ __global__ void prepare_grouped_gemm_data(
|
|||||||
int32_t start = tid == 0 ? 0 : offs[tid - 1];
|
int32_t start = tid == 0 ? 0 : offs[tid - 1];
|
||||||
delta = offs[tid] - start;
|
delta = offs[tid] - start;
|
||||||
if (K < 0) {
|
if (K < 0) {
|
||||||
if (!a_row_major && b_row_major) {
|
CUDA_KERNEL_ASSERT(delta >=0 && "expected offsets to be greater or equal 0\n");
|
||||||
CUDA_KERNEL_ASSERT(delta >=0 && "expected offsets to be greater or equal 0\n");
|
|
||||||
} else {
|
|
||||||
// CUTLASS cannot handle delta=0 here.
|
|
||||||
CUDA_KERNEL_ASSERT(delta >0 && "expected offsets to be greater than 0\n");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// TMA transfers require global memory tensor addresses to be
|
// TMA transfers require global memory tensor addresses to be
|
||||||
|
|||||||
@ -337,6 +337,7 @@ Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization,
|
|||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
#include <mkl_dfti.h>
|
#include <mkl_dfti.h>
|
||||||
|
#include <mkl_version.h>
|
||||||
#include <ATen/mkl/Exceptions.h>
|
#include <ATen/mkl/Exceptions.h>
|
||||||
#include <ATen/mkl/Descriptors.h>
|
#include <ATen/mkl/Descriptors.h>
|
||||||
#include <ATen/mkl/Limits.h>
|
#include <ATen/mkl/Limits.h>
|
||||||
@ -479,6 +480,19 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes,
|
|||||||
const auto value_type = c10::toRealValueType(input.scalar_type());
|
const auto value_type = c10::toRealValueType(input.scalar_type());
|
||||||
out.resize_(batched_out_sizes, MemoryFormat::Contiguous);
|
out.resize_(batched_out_sizes, MemoryFormat::Contiguous);
|
||||||
|
|
||||||
|
// fix mkl issue
|
||||||
|
// https://github.com/pytorch/pytorch/issues/154477
|
||||||
|
#ifdef INTEL_MKL_VERSION
|
||||||
|
#if INTEL_MKL_VERSION > 20210400L
|
||||||
|
for (const auto& stride : input.strides()) {
|
||||||
|
if (stride == 0) {
|
||||||
|
input = input.clone(MemoryFormat::Contiguous);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
auto descriptor = _plan_mkl_fft(
|
auto descriptor = _plan_mkl_fft(
|
||||||
input.strides(), out.strides(), signal_size, input.is_complex(),
|
input.strides(), out.strides(), signal_size, input.is_complex(),
|
||||||
out.is_complex(), normalization, forward, value_type);
|
out.is_complex(), normalization, forward, value_type);
|
||||||
|
|||||||
@ -79,14 +79,16 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) {
|
|||||||
// 2. Math fallback
|
// 2. Math fallback
|
||||||
auto& ctx = at::globalContext();
|
auto& ctx = at::globalContext();
|
||||||
// use overrideable linked to onednn as overrideable implementation
|
// use overrideable linked to onednn as overrideable implementation
|
||||||
if (!ctx.userEnabledMathSDP() && !ctx.userEnabledOverrideableSDP()) {
|
if (!ctx.userEnabledMathSDP() && !ctx.userEnabledOverrideableSDP() &&
|
||||||
|
!ctx.userEnabledFlashSDP()) {
|
||||||
return sdp::SDPBackend::error;
|
return sdp::SDPBackend::error;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get ideal kernel ordering
|
// Get ideal kernel ordering
|
||||||
const std::array<sdp::SDPBackend, 2> priority_order{
|
const std::array<sdp::SDPBackend, 3> priority_order{
|
||||||
sdp::SDPBackend::overrideable,
|
sdp::SDPBackend::overrideable,
|
||||||
sdp::SDPBackend::math,
|
sdp::SDPBackend::math,
|
||||||
|
sdp::SDPBackend::flash_attention,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Because TORCHCHECK checks if condition is true we negate debug so that
|
// Because TORCHCHECK checks if condition is true we negate debug so that
|
||||||
@ -105,6 +107,14 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) {
|
|||||||
return sdp::SDPBackend::math;
|
return sdp::SDPBackend::math;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case sdp::SDPBackend::flash_attention:
|
||||||
|
if (ctx.userEnabledFlashSDP() &&
|
||||||
|
use_overrideable_xpu(kernel_params, print_debug)) {
|
||||||
|
TORCH_WARN(
|
||||||
|
"Flash Attention is not supported on XPU, falling back to overrideable kernel.");
|
||||||
|
return sdp::SDPBackend::overrideable;
|
||||||
|
}
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
TORCH_CHECK(false, "Invalid backend");
|
TORCH_CHECK(false, "Invalid backend");
|
||||||
}
|
}
|
||||||
@ -141,7 +151,7 @@ int64_t _fused_sdp_choice_xpu(
|
|||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
false,
|
false,
|
||||||
"No viable backend for scaled_dot_product_attention was found. ",
|
"No viable backend for scaled_dot_product_attention was found. ",
|
||||||
"This is likely due to turning off both the math kernel and the fused kernels.");
|
"This is likely due to turning off both the math kernel and the overrideable kernels.");
|
||||||
}
|
}
|
||||||
return static_cast<int64_t>(backend);
|
return static_cast<int64_t>(backend);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,8 +0,0 @@
|
|||||||
// Copyright © 2022 Apple Inc.
|
|
||||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
|
||||||
#include <ATen/native/mps/OperationUtils.h>
|
|
||||||
#include <ATen/native/mps/TensorFactory.h>
|
|
||||||
#include <c10/core/ScalarType.h>
|
|
||||||
#include <unordered_map>
|
|
||||||
|
|
||||||
using namespace at::mps;
|
|
||||||
@ -18,8 +18,6 @@
|
|||||||
#include <ATen/native/Resize.h>
|
#include <ATen/native/Resize.h>
|
||||||
#include <ATen/native/TensorAdvancedIndexing.h>
|
#include <ATen/native/TensorAdvancedIndexing.h>
|
||||||
#include <ATen/native/mps/MPSGraphVenturaOps.h>
|
#include <ATen/native/mps/MPSGraphVenturaOps.h>
|
||||||
#include <ATen/native/mps/operations/Indexing.h>
|
|
||||||
#include <c10/core/QScheme.h>
|
|
||||||
#include <c10/util/SmallVector.h>
|
#include <c10/util/SmallVector.h>
|
||||||
#include <c10/util/irange.h>
|
#include <c10/util/irange.h>
|
||||||
#include <fmt/format.h>
|
#include <fmt/format.h>
|
||||||
|
|||||||
@ -746,7 +746,7 @@ inline std::tuple<bool, Tensor, Tensor> NestedTensor_compute_size_stride(
|
|||||||
}
|
}
|
||||||
else if (size_reshaped == -1) {
|
else if (size_reshaped == -1) {
|
||||||
if (infer_index > -1) {
|
if (infer_index > -1) {
|
||||||
throw std::runtime_error("only one dimension can be inferred");
|
TORCH_CHECK(false, "only one dimension can be inferred");
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
infer_index = idim;
|
infer_index = idim;
|
||||||
|
|||||||
@ -19,7 +19,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
|
|||||||
double /*output_scale*/,
|
double /*output_scale*/,
|
||||||
int64_t /*output_zero_point*/,
|
int64_t /*output_zero_point*/,
|
||||||
at::Tensor& output) {
|
at::Tensor& output) {
|
||||||
throw std::runtime_error(
|
TORCH_CHECK(
|
||||||
|
false,
|
||||||
"apply_out is not implemented for this packed "
|
"apply_out is not implemented for this packed "
|
||||||
"parameter type");
|
"parameter type");
|
||||||
return output;
|
return output;
|
||||||
@ -30,7 +31,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
|
|||||||
double /*output_scale*/,
|
double /*output_scale*/,
|
||||||
int64_t /*output_zero_point*/,
|
int64_t /*output_zero_point*/,
|
||||||
at::Tensor& output) {
|
at::Tensor& output) {
|
||||||
throw std::runtime_error(
|
TORCH_CHECK(
|
||||||
|
false,
|
||||||
"apply_relu_out is not implemented for this packed "
|
"apply_relu_out is not implemented for this packed "
|
||||||
"parameter type");
|
"parameter type");
|
||||||
return output;
|
return output;
|
||||||
@ -55,7 +57,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
|
|||||||
at::Tensor input,
|
at::Tensor input,
|
||||||
double input_scale,
|
double input_scale,
|
||||||
int64_t input_zero_point) {
|
int64_t input_zero_point) {
|
||||||
throw std::runtime_error(
|
TORCH_CHECK(
|
||||||
|
false,
|
||||||
"apply_with_input_q_dq_qweight_dq_output_fp32 is not implemented for this packed "
|
"apply_with_input_q_dq_qweight_dq_output_fp32 is not implemented for this packed "
|
||||||
"parameter type");
|
"parameter type");
|
||||||
return {};
|
return {};
|
||||||
@ -79,7 +82,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
|
|||||||
at::Tensor input,
|
at::Tensor input,
|
||||||
double input_scale,
|
double input_scale,
|
||||||
int64_t input_zero_point) {
|
int64_t input_zero_point) {
|
||||||
throw std::runtime_error(
|
TORCH_CHECK(
|
||||||
|
false,
|
||||||
"apply_with_input_q_dq_qweight_dq_relu_output_fp32 is not implemented for this packed "
|
"apply_with_input_q_dq_qweight_dq_relu_output_fp32 is not implemented for this packed "
|
||||||
"parameter type");
|
"parameter type");
|
||||||
return {};
|
return {};
|
||||||
@ -96,7 +100,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
|
|||||||
const at::Tensor& /* input */,
|
const at::Tensor& /* input */,
|
||||||
at::Tensor& output,
|
at::Tensor& output,
|
||||||
bool /* reduce_range */) {
|
bool /* reduce_range */) {
|
||||||
throw std::runtime_error(
|
TORCH_CHECK(
|
||||||
|
false,
|
||||||
"apply_dynamic_out is not implemented for this packed "
|
"apply_dynamic_out is not implemented for this packed "
|
||||||
"parameter type");
|
"parameter type");
|
||||||
return output;
|
return output;
|
||||||
@ -105,7 +110,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
|
|||||||
const at::Tensor& /* input */,
|
const at::Tensor& /* input */,
|
||||||
at::Tensor& output,
|
at::Tensor& output,
|
||||||
bool /* reduce_range */) {
|
bool /* reduce_range */) {
|
||||||
throw std::runtime_error(
|
TORCH_CHECK(
|
||||||
|
false,
|
||||||
"apply_dynamic_relu_out is not implemented for this packed "
|
"apply_dynamic_relu_out is not implemented for this packed "
|
||||||
"parameter type");
|
"parameter type");
|
||||||
return output;
|
return output;
|
||||||
@ -116,7 +122,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
|
|||||||
virtual std::optional<at::Tensor> bias() = 0;
|
virtual std::optional<at::Tensor> bias() = 0;
|
||||||
|
|
||||||
virtual void set_bias(std::optional<at::Tensor> /*bias*/) {
|
virtual void set_bias(std::optional<at::Tensor> /*bias*/) {
|
||||||
throw std::runtime_error(
|
TORCH_CHECK(
|
||||||
|
false,
|
||||||
"set_bias is not implemented for this packed "
|
"set_bias is not implemented for this packed "
|
||||||
"parameter type");
|
"parameter type");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -843,6 +843,11 @@ SDPBackend select_sdp_backend(sdp_params const& kernel_params) {
|
|||||||
return SDPBackend::math;
|
return SDPBackend::math;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case SDPBackend::overrideable:
|
||||||
|
if (ctx.userEnabledOverrideableSDP()) {
|
||||||
|
TORCH_CHECK(false, "Invalid backend");
|
||||||
|
}
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
TORCH_CHECK(false, "Invalid backend");
|
TORCH_CHECK(false, "Invalid backend");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -8,12 +8,12 @@ It also provides mechanisms to compare PyTorch with other frameworks.
|
|||||||
Make sure you're on a machine with CUDA, torchvision, and pytorch installed. Install in the following order:
|
Make sure you're on a machine with CUDA, torchvision, and pytorch installed. Install in the following order:
|
||||||
```
|
```
|
||||||
# Install torchvision. It comes with the pytorch stable release binary
|
# Install torchvision. It comes with the pytorch stable release binary
|
||||||
pip3 install torch torchvision
|
python -m pip install torch torchvision
|
||||||
|
|
||||||
# Install the latest pytorch master from source.
|
# Install the latest pytorch master from source.
|
||||||
# It should supersede the installation from the release binary.
|
# It should supersede the installation from the release binary.
|
||||||
cd $PYTORCH_HOME
|
cd $PYTORCH_HOME
|
||||||
python setup.py build develop
|
python -m pip install --no-build-isolation -v -e .
|
||||||
|
|
||||||
# Check the pytorch installation version
|
# Check the pytorch installation version
|
||||||
python -c "import torch; print(torch.__version__)"
|
python -c "import torch; print(torch.__version__)"
|
||||||
|
|||||||
@ -27,7 +27,7 @@ pull-deps: clone-deps
|
|||||||
(cd ../../../torchbenchmark && git fetch && git checkout "$$(cat ../pytorch/.github/ci_commit_pins/torchbench.txt)" && git submodule update --init --recursive)
|
(cd ../../../torchbenchmark && git fetch && git checkout "$$(cat ../pytorch/.github/ci_commit_pins/torchbench.txt)" && git submodule update --init --recursive)
|
||||||
|
|
||||||
build-deps: clone-deps
|
build-deps: clone-deps
|
||||||
uv pip install astunparse numpy scipy ninja pyyaml mkl mkl-include setuptools cmake \
|
uv pip install numpy scipy ninja pyyaml six mkl mkl-include setuptools wheel cmake \
|
||||||
typing-extensions requests protobuf numba cython scikit-learn librosa
|
typing-extensions requests protobuf numba cython scikit-learn librosa
|
||||||
(cd ../../../torchvision && uv pip install -e . --no-build-isolation)
|
(cd ../../../torchvision && uv pip install -e . --no-build-isolation)
|
||||||
(cd ../../../torchdata && uv pip install -e .)
|
(cd ../../../torchdata && uv pip install -e .)
|
||||||
|
|||||||
@ -210,7 +210,7 @@ mobilenet_v2,pass,0
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
mobilenet_v2_quantized_qat,pass,2
|
mobilenet_v2_quantized_qat,pass,3
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -274,7 +274,7 @@ resnet50,pass,0
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
resnet50_quantized_qat,pass,2
|
resnet50_quantized_qat,pass,3
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
|
@ -210,7 +210,7 @@ mobilenet_v2,pass,0
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
mobilenet_v2_quantized_qat,pass,2
|
mobilenet_v2_quantized_qat,pass,3
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -274,7 +274,7 @@ resnet50,pass,0
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
resnet50_quantized_qat,pass,2
|
resnet50_quantized_qat,pass,3
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
|
@ -210,7 +210,7 @@ mobilenet_v2,pass,0
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
mobilenet_v2_quantized_qat,pass,2
|
mobilenet_v2_quantized_qat,pass,3
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -274,7 +274,7 @@ resnet50,pass,0
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
resnet50_quantized_qat,pass,2
|
resnet50_quantized_qat,pass,3
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
|
@ -194,7 +194,7 @@ mobilenet_v2,pass,0
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
mobilenet_v2_quantized_qat,pass,2
|
mobilenet_v2_quantized_qat,pass,3
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -258,7 +258,7 @@ resnet50,pass,0
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
resnet50_quantized_qat,pass,2
|
resnet50_quantized_qat,pass,3
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
|
@ -210,7 +210,7 @@ mobilenet_v2,pass,0
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
mobilenet_v2_quantized_qat,pass,2
|
mobilenet_v2_quantized_qat,pass,3
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -274,7 +274,7 @@ resnet50,pass,0
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
resnet50_quantized_qat,pass,2
|
resnet50_quantized_qat,pass,3
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
|
@ -17,8 +17,8 @@ export DEBUG=0
|
|||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
|
|
||||||
# Compile pytorch with the base revision
|
# Compile pytorch with the base revision
|
||||||
git checkout master
|
git checkout main
|
||||||
python setup.py develop
|
python -m pip install --no-build-isolation -v -e .
|
||||||
|
|
||||||
# Install dependencies:
|
# Install dependencies:
|
||||||
# Scipy is required by detr
|
# Scipy is required by detr
|
||||||
@ -32,7 +32,7 @@ python functional_autograd_benchmark.py --output before.txt
|
|||||||
# Compile pytorch with your change
|
# Compile pytorch with your change
|
||||||
popd
|
popd
|
||||||
git checkout your_feature_branch
|
git checkout your_feature_branch
|
||||||
python setup.py develop
|
python -m pip install --no-build-isolation -v -e .
|
||||||
|
|
||||||
# Run the benchmark for the new version
|
# Run the benchmark for the new version
|
||||||
pushd benchmarks/functional_autograd_benchmark
|
pushd benchmarks/functional_autograd_benchmark
|
||||||
|
|||||||
@ -20,7 +20,7 @@ Key Features:
|
|||||||
The instruction below installs a cpp\_extension for PyTorch and it is required to run the benchmark suite.
|
The instruction below installs a cpp\_extension for PyTorch and it is required to run the benchmark suite.
|
||||||
```bash
|
```bash
|
||||||
cd pt_extension
|
cd pt_extension
|
||||||
python setup.py install
|
python -m pip install .
|
||||||
```
|
```
|
||||||
|
|
||||||
## How to run the benchmarks:
|
## How to run the benchmarks:
|
||||||
|
|||||||
@ -11,7 +11,7 @@ export USE_MKL=1
|
|||||||
CMAKE_ONLY=1 python setup.py build
|
CMAKE_ONLY=1 python setup.py build
|
||||||
ccmake build # or cmake-gui build
|
ccmake build # or cmake-gui build
|
||||||
|
|
||||||
python setup.py install
|
python -m pip install --no-build-isolation -v .
|
||||||
|
|
||||||
cd benchmarks
|
cd benchmarks
|
||||||
echo "!! SPARSE SPMM TIME BENCHMARK!! " >> $OUTFILE
|
echo "!! SPARSE SPMM TIME BENCHMARK!! " >> $OUTFILE
|
||||||
@ -28,7 +28,7 @@ echo "----- USE_MKL=0 ------" >> $OUTFILE
|
|||||||
rm -rf build
|
rm -rf build
|
||||||
|
|
||||||
export USE_MKL=0
|
export USE_MKL=0
|
||||||
python setup.py install
|
python -m pip install --no-build-isolation -v .
|
||||||
|
|
||||||
cd benchmarks
|
cd benchmarks
|
||||||
for dim0 in 1000 5000 10000; do
|
for dim0 in 1000 5000 10000; do
|
||||||
|
|||||||
233
c10/core/AllocatorConfig.cpp
Normal file
233
c10/core/AllocatorConfig.cpp
Normal file
@ -0,0 +1,233 @@
|
|||||||
|
#include <c10/core/AllocatorConfig.h>
|
||||||
|
#include <c10/core/DeviceType.h>
|
||||||
|
#include <c10/util/env.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
|
namespace c10::CachingAllocator {
|
||||||
|
|
||||||
|
namespace {
// Number of bytes in one MB.
constexpr size_t kMB = 1024ul * 1024;
// Smallest size covered by the power-of-two roundup intervals.
constexpr size_t kRoundUpPowerOfTwoStart = kMB; // 1MB
// Largest size covered by the power-of-two roundup intervals.
constexpr size_t kRoundUpPowerOfTwoEnd = 64 * 1024ul * kMB; // 64GB
// Number of power-of-two intervals between the start and end sizes above;
// also the length of the roundup_power2_divisions_ table.
constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
} // anonymous namespace
|
||||||
|
|
||||||
|
// Returns the process-wide configuration singleton.
//
// On the first call the configuration is seeded from the environment. The
// variables are probed in priority order and only the first one that is set is
// parsed; the remaining ones are not looked at.
AcceleratorAllocatorConfig& AcceleratorAllocatorConfig::instance() {
  static AcceleratorAllocatorConfig instance;
  static bool env_flag [[maybe_unused]] = []() {
    // Parses `name` if it is set in the environment and reports whether it
    // was. Deprecated variables additionally emit a one-time warning.
    const auto parse_if_set = [](const char* name, bool deprecated) {
      const auto value = c10::utils::get_env(name);
      if (!value.has_value()) {
        return false;
      }
      if (deprecated) {
        TORCH_WARN_ONCE(
            name, " is deprecated, use PYTORCH_ALLOC_CONF instead");
      }
      instance.parseArgs(value.value());
      return true;
    };
    // The last two are kept for backwards compatibility.
    return parse_if_set("PYTORCH_ALLOC_CONF", /*deprecated=*/false) ||
        parse_if_set("PYTORCH_CUDA_ALLOC_CONF", /*deprecated=*/true) ||
        parse_if_set("PYTORCH_HIP_ALLOC_CONF", /*deprecated=*/true);
  }();
  return instance;
}
|
||||||
|
|
||||||
|
// Starts with every roundup interval set to 0 divisions.
AcceleratorAllocatorConfig::AcceleratorAllocatorConfig()
    : roundup_power2_divisions_(kRoundUpPowerOfTwoIntervals, 0) {}
|
||||||
|
|
||||||
|
size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) {
|
||||||
|
size_t log_size = (63 - llvm::countLeadingZeros(size));
|
||||||
|
|
||||||
|
// Our intervals start at 1MB and end at 64GB
|
||||||
|
const size_t interval_start =
|
||||||
|
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart);
|
||||||
|
const size_t interval_end =
|
||||||
|
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd);
|
||||||
|
TORCH_CHECK(
|
||||||
|
interval_end - interval_start == kRoundUpPowerOfTwoIntervals,
|
||||||
|
"kRoundUpPowerOfTwoIntervals mismatch");
|
||||||
|
|
||||||
|
size_t index =
|
||||||
|
(log_size > interval_start) ? (log_size - interval_start) : 0ul;
|
||||||
|
index = std::min(index, kRoundUpPowerOfTwoIntervals - 1);
|
||||||
|
return instance().roundup_power2_divisions_[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t AcceleratorAllocatorConfig::parseMaxSplitSize(
|
||||||
|
const ConfigTokenizer& tokenizer,
|
||||||
|
size_t i) {
|
||||||
|
tokenizer.checkToken(++i, ":");
|
||||||
|
constexpr size_t min_allowed_split_size_mb = kLargeBuffer / kMB;
|
||||||
|
constexpr size_t max_allowed_split_size_mb =
|
||||||
|
std::numeric_limits<size_t>::max() / kMB;
|
||||||
|
|
||||||
|
size_t val_env = tokenizer.toSizeT(++i);
|
||||||
|
TORCH_CHECK(
|
||||||
|
val_env >= min_allowed_split_size_mb,
|
||||||
|
"CachingAllocator option max_split_size_mb too small, must be >= ",
|
||||||
|
min_allowed_split_size_mb);
|
||||||
|
val_env = std::min(val_env, max_allowed_split_size_mb);
|
||||||
|
max_split_size_ = val_env * kMB;
|
||||||
|
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize(
|
||||||
|
const ConfigTokenizer& tokenizer,
|
||||||
|
size_t i) {
|
||||||
|
tokenizer.checkToken(++i, ":");
|
||||||
|
constexpr size_t min_allowed_split_size_mb = kLargeBuffer / kMB;
|
||||||
|
constexpr size_t max_allowed_split_size_mb =
|
||||||
|
std::numeric_limits<size_t>::max() / kMB;
|
||||||
|
|
||||||
|
size_t val_env = tokenizer.toSizeT(++i);
|
||||||
|
TORCH_CHECK(
|
||||||
|
val_env >= min_allowed_split_size_mb,
|
||||||
|
"CachingAllocator option max_non_split_rounding_mb too small, must be >= ",
|
||||||
|
min_allowed_split_size_mb);
|
||||||
|
val_env = std::min(val_env, max_allowed_split_size_mb);
|
||||||
|
max_non_split_rounding_size_ = val_env * kMB;
|
||||||
|
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold(
|
||||||
|
const ConfigTokenizer& tokenizer,
|
||||||
|
size_t i) {
|
||||||
|
tokenizer.checkToken(++i, ":");
|
||||||
|
double val_env = tokenizer.toDouble(++i);
|
||||||
|
TORCH_CHECK(
|
||||||
|
val_env > 0 && val_env < 1.0,
|
||||||
|
"garbage_collect_threshold is invalid, set it in (0.0, 1.0)");
|
||||||
|
garbage_collection_threshold_ = val_env;
|
||||||
|
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
|
||||||
|
const ConfigTokenizer& tokenizer,
|
||||||
|
size_t i) {
|
||||||
|
tokenizer.checkToken(++i, ":");
|
||||||
|
bool first_value = true;
|
||||||
|
|
||||||
|
if (tokenizer[++i] == "[") {
|
||||||
|
size_t last_index = 0;
|
||||||
|
// NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
|
||||||
|
while (++i < tokenizer.size() && tokenizer[i] != "]") {
|
||||||
|
size_t value_index = i;
|
||||||
|
tokenizer.checkToken(++i, ":");
|
||||||
|
size_t value = tokenizer.toSizeT(++i);
|
||||||
|
TORCH_CHECK(
|
||||||
|
value == 0 || llvm::isPowerOf2_64(value),
|
||||||
|
"For roundups, the divisions has to be power of 2 or 0 to disable roundup ");
|
||||||
|
|
||||||
|
if (tokenizer[value_index] == ">") {
|
||||||
|
std::fill(
|
||||||
|
std::next(
|
||||||
|
roundup_power2_divisions_.begin(),
|
||||||
|
static_cast<std::vector<size_t>::difference_type>(
|
||||||
|
last_index + 1)),
|
||||||
|
roundup_power2_divisions_.end(),
|
||||||
|
value);
|
||||||
|
} else {
|
||||||
|
size_t boundary = tokenizer.toSizeT(value_index);
|
||||||
|
TORCH_CHECK(
|
||||||
|
llvm::isPowerOf2_64(boundary),
|
||||||
|
"For roundups, the intervals have to be power of 2 ");
|
||||||
|
|
||||||
|
size_t index = 63 - llvm::countLeadingZeros(boundary);
|
||||||
|
index =
|
||||||
|
std::clamp(index, size_t{0}, roundup_power2_divisions_.size() - 1);
|
||||||
|
|
||||||
|
if (first_value) {
|
||||||
|
std::fill(
|
||||||
|
roundup_power2_divisions_.begin(),
|
||||||
|
std::next(
|
||||||
|
roundup_power2_divisions_.begin(),
|
||||||
|
static_cast<std::vector<size_t>::difference_type>(index)),
|
||||||
|
value);
|
||||||
|
first_value = false;
|
||||||
|
}
|
||||||
|
roundup_power2_divisions_[index] = value;
|
||||||
|
last_index = index;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tokenizer[i + 1] != "]") {
|
||||||
|
tokenizer.checkToken(++i, ",");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
TORCH_INTERNAL_ASSERT(
|
||||||
|
i < tokenizer.size(),
|
||||||
|
"Expected closing bracket ']' in ConfigTokenizer but reached end of config");
|
||||||
|
} else { // Keep this for backwards compatibility
|
||||||
|
size_t value = tokenizer.toSizeT(i);
|
||||||
|
TORCH_CHECK(
|
||||||
|
llvm::isPowerOf2_64(value),
|
||||||
|
"For roundups, the divisions has to be power of 2 ");
|
||||||
|
std::fill(
|
||||||
|
roundup_power2_divisions_.begin(),
|
||||||
|
roundup_power2_divisions_.end(),
|
||||||
|
value);
|
||||||
|
}
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t AcceleratorAllocatorConfig::parseExpandableSegments(
|
||||||
|
const ConfigTokenizer& tokenizer,
|
||||||
|
size_t i) {
|
||||||
|
tokenizer.checkToken(++i, ":");
|
||||||
|
use_expandable_segments_ = tokenizer.toBool(++i);
|
||||||
|
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t AcceleratorAllocatorConfig::parsePinnedUseBackgroundThreads(
|
||||||
|
const ConfigTokenizer& tokenizer,
|
||||||
|
size_t i) {
|
||||||
|
tokenizer.checkToken(++i, ":");
|
||||||
|
pinned_use_background_threads_ = tokenizer.toBool(++i);
|
||||||
|
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
void AcceleratorAllocatorConfig::parseArgs(const std::string& env) {
|
||||||
|
// The following option will be reset to its default value if not explicitly
|
||||||
|
// set each time.
|
||||||
|
max_split_size_ = std::numeric_limits<size_t>::max();
|
||||||
|
roundup_power2_divisions_.assign(kRoundUpPowerOfTwoIntervals, 0);
|
||||||
|
garbage_collection_threshold_ = 0;
|
||||||
|
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lock(last_allocator_settings_mutex_);
|
||||||
|
last_allocator_settings_ = env;
|
||||||
|
}
|
||||||
|
|
||||||
|
ConfigTokenizer tokenizer(env);
|
||||||
|
for (size_t i = 0; i < tokenizer.size(); i++) {
|
||||||
|
const auto& key = tokenizer[i];
|
||||||
|
if (key == "max_split_size_mb") {
|
||||||
|
i = parseMaxSplitSize(tokenizer, i);
|
||||||
|
} else if (key == "max_non_split_rounding_mb") {
|
||||||
|
i = parseMaxNonSplitRoundingSize(tokenizer, i);
|
||||||
|
} else if (key == "garbage_collection_threshold") {
|
||||||
|
i = parseGarbageCollectionThreshold(tokenizer, i);
|
||||||
|
} else if (key == "roundup_power2_divisions") {
|
||||||
|
i = parseRoundUpPower2Divisions(tokenizer, i);
|
||||||
|
} else if (key == "expandable_segments") {
|
||||||
|
i = parseExpandableSegments(tokenizer, i);
|
||||||
|
} else if (key == "pinned_use_background_threads") {
|
||||||
|
i = parsePinnedUseBackgroundThreads(tokenizer, i);
|
||||||
|
} else {
|
||||||
|
i = tokenizer.skipKey(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i + 1 < tokenizer.size()) {
|
||||||
|
tokenizer.checkToken(++i, ",");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace c10::CachingAllocator
|
||||||
337
c10/core/AllocatorConfig.h
Normal file
337
c10/core/AllocatorConfig.h
Normal file
@ -0,0 +1,337 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <c10/core/DeviceType.h>
|
||||||
|
#include <c10/util/Exception.h>
|
||||||
|
#include <c10/util/llvmMathExtras.h>
|
||||||
|
|
||||||
|
#include <atomic>
|
||||||
|
#include <mutex>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace c10::CachingAllocator {
|
||||||
|
|
||||||
|
// "large" allocations may be packed in 20 MiB blocks
|
||||||
|
const size_t kLargeBuffer = 20971520;
|
||||||
|
|
||||||
|
// A utility class for tokenizing allocator configuration strings into discrete
|
||||||
|
// parts. For example, the config string:
|
||||||
|
// "key1:val1,key2:[val2,val3]"
|
||||||
|
// is tokenized into:
|
||||||
|
// "key1", ":", "val1", ",", "key2", ":", "[", "val2", ",", "val3", "]",
|
||||||
|
//
|
||||||
|
// Tokens include keys, values, and special characters (':', ',', '[', ']').
|
||||||
|
// Whitespace is ignored.
|
||||||
|
class ConfigTokenizer {
|
||||||
|
public:
|
||||||
|
explicit ConfigTokenizer(const std::string& env) {
|
||||||
|
std::string buffer;
|
||||||
|
for (char ch : env) {
|
||||||
|
if (ch == ',' || ch == ':' || ch == '[' || ch == ']') {
|
||||||
|
if (!buffer.empty()) {
|
||||||
|
config_.emplace_back(std::move(buffer));
|
||||||
|
buffer.clear();
|
||||||
|
}
|
||||||
|
config_.emplace_back(1, ch);
|
||||||
|
} else if (!std::isspace(static_cast<unsigned char>(ch))) {
|
||||||
|
buffer += ch;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!buffer.empty()) {
|
||||||
|
config_.emplace_back(std::move(buffer));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string& operator[](size_t i) const {
|
||||||
|
TORCH_INTERNAL_ASSERT(
|
||||||
|
i < config_.size(), "Index out of bounds in ConfigTokenizer");
|
||||||
|
return config_[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t size() const {
|
||||||
|
return config_.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool checkToken(size_t i, const std::string& token) const {
|
||||||
|
checkIndex(i);
|
||||||
|
return config_[i] == token;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t toSizeT(size_t i) const {
|
||||||
|
checkIndex(i);
|
||||||
|
return std::stoull(config_[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
double toDouble(size_t i) const {
|
||||||
|
checkIndex(i);
|
||||||
|
return std::stod(config_[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool toBool(size_t i) const {
|
||||||
|
checkIndex(i);
|
||||||
|
const auto& token = config_[i];
|
||||||
|
if (token == "True") {
|
||||||
|
return true;
|
||||||
|
} else if (token == "False") {
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
TORCH_CHECK(
|
||||||
|
false,
|
||||||
|
"Expected 'True' or 'False' at index ",
|
||||||
|
i,
|
||||||
|
" in ConfigTokenizer but got '",
|
||||||
|
token,
|
||||||
|
"'");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skips the current token group and returns the index of the value token.
|
||||||
|
// Assumes the current index `i` points to a key name in a key-value pair.
|
||||||
|
size_t skipKey(size_t i) const {
|
||||||
|
// Expect a colon after the key
|
||||||
|
checkToken(++i, ":");
|
||||||
|
|
||||||
|
++i; // Move to the value
|
||||||
|
checkIndex(i);
|
||||||
|
if (config_[i] != "[") {
|
||||||
|
// Value is a single token (not a list) -> return its index
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skip tokens inside the list until matching ']'
|
||||||
|
// NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
|
||||||
|
while (++i < config_.size() && config_[i] != "]") {
|
||||||
|
}
|
||||||
|
|
||||||
|
TORCH_INTERNAL_ASSERT(
|
||||||
|
i < config_.size(),
|
||||||
|
"Expected closing bracket ']' in ConfigTokenizer but reached end of config");
|
||||||
|
|
||||||
|
return i; // Return the index of the closing ']'
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
void checkIndex(size_t i) const {
|
||||||
|
TORCH_INTERNAL_ASSERT(
|
||||||
|
i < config_.size(), "Index out of bounds in ConfigTokenizer");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> config_;
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Note [AcceleratorAllocatorConfig design]
|
||||||
|
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
* This class configures memory allocation for both device and host memory. A
|
||||||
|
* single `AcceleratorAllocatorConfig` instance is shared across all accelerator
|
||||||
|
* backends, such as CUDA and XPU, under the assumption that relevant
|
||||||
|
* environment variables apply uniformly to all accelerators. Device-specific
|
||||||
|
* configuration extensions are supported via hooks (see
|
||||||
|
* `registerDeviceConfigParserHook`).
|
||||||
|
*
|
||||||
|
* Recommended design:
|
||||||
|
* - Place common configurations in `AcceleratorAllocatorConfig`.
|
||||||
|
* - Extend backend-specific configurations in corresponding device-specific
|
||||||
|
* classes, such as `CUDAAllocatorConfig`, etc.
|
||||||
|
*
|
||||||
|
* Scope:
|
||||||
|
* - Configuration options must be environment-variable driven.
|
||||||
|
*
|
||||||
|
* Naming Convention:
|
||||||
|
* - Public API names in `AcceleratorAllocatorConfig` should be device-generic.
|
||||||
|
* - Members prefixed with `pinned_` are specific to the host/pinned allocator.
|
||||||
|
* - Environment variable names should be generic across backends.
|
||||||
|
* - Comma-separated key-value pairs in the format: `key:value`. Use square
|
||||||
|
* brackets `[]` for list values Example: `key1:123, key2:[val1,val2]`
|
||||||
|
*
|
||||||
|
* Environment Variables:
|
||||||
|
* - The primary environment variable for configuration is `PYTORCH_ALLOC_CONF`.
|
||||||
|
* - For backward compatibility, `PYTORCH_CUDA_ALLOC_CONF` is also supported
|
||||||
|
* with lower priority.
|
||||||
|
*/
|
||||||
|
|
||||||
|
class C10_API AcceleratorAllocatorConfig {
|
||||||
|
public:
|
||||||
|
static AcceleratorAllocatorConfig& instance();
|
||||||
|
|
||||||
|
C10_DISABLE_COPY_AND_ASSIGN(AcceleratorAllocatorConfig);
|
||||||
|
AcceleratorAllocatorConfig(AcceleratorAllocatorConfig&&) = delete;
|
||||||
|
AcceleratorAllocatorConfig& operator=(AcceleratorAllocatorConfig&&) = delete;
|
||||||
|
~AcceleratorAllocatorConfig() = default;
|
||||||
|
|
||||||
|
/* Device allocator settings */
|
||||||
|
|
||||||
|
// Returns the maximum block size (in MB) that is allowed to be split. The
|
||||||
|
// default is unlimited (all blocks can be split).
|
||||||
|
static size_t max_split_size() {
|
||||||
|
return instance().max_split_size_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the maximum block size (in MB) that is allowed to be rounded up
|
||||||
|
// without requiring splitting when searching for a free block. The default is
|
||||||
|
// 20 MiB.
|
||||||
|
static size_t max_non_split_rounding_size() {
|
||||||
|
return instance().max_non_split_rounding_size_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return the number of divisions used when rounding up allocation sizes (in
|
||||||
|
// MB) to the nearest power-of-2 boundary.
|
||||||
|
static size_t roundup_power2_divisions(size_t size);
|
||||||
|
|
||||||
|
// Returns the vector of division factors used for rounding up allocation
|
||||||
|
// sizes. These divisions apply to size intervals between 1MB and 64GB.
|
||||||
|
static std::vector<size_t> roundup_power2_divisions() {
|
||||||
|
return instance().roundup_power2_divisions_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the threshold that triggers garbage collection when the ratio of
|
||||||
|
// used memory to maximum allowed memory exceeds this value. The default is 0,
|
||||||
|
// meaning no garbage collection is triggered. The value should be in the
|
||||||
|
// range (0.0, 1.0).
|
||||||
|
static double garbage_collection_threshold() {
|
||||||
|
return instance().garbage_collection_threshold_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns whether the expandable segment feature is enabled. This allows the
|
||||||
|
// allocator to start with one segment that grows as needed, rather than
|
||||||
|
// creating a new segment for each allocation. Default is false (expandable
|
||||||
|
// segments disabled).
|
||||||
|
static bool use_expandable_segments() {
|
||||||
|
return instance().use_expandable_segments_;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Host allocator settings */
|
||||||
|
|
||||||
|
// Returns whether the pinned host allocator uses background threads for
|
||||||
|
// processing events. This is useful for improving performance in scenarios
|
||||||
|
// where many small allocations are made. Default is false (background threads
|
||||||
|
// disabled).
|
||||||
|
static bool pinned_use_background_threads() {
|
||||||
|
return instance().pinned_use_background_threads_;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Settings for both device and host allocator */
|
||||||
|
|
||||||
|
// Returns the current allocator settings as a string. This string is useful
|
||||||
|
// to expand device-specific allocator configurations
|
||||||
|
static std::string last_allocator_settings() {
|
||||||
|
std::lock_guard<std::mutex> lock(instance().last_allocator_settings_mutex_);
|
||||||
|
return instance().last_allocator_settings_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parses the environment variable `env` to update the allocator settings.
|
||||||
|
// If the environment variable is not set, it does nothing.
|
||||||
|
// The configuration string should be a comma-separated list of key-value
|
||||||
|
// pairs, where each key is a configuration option and the value is the
|
||||||
|
// corresponding setting. For example:
|
||||||
|
// "max_split_size_mb:100,max_non_split_rounding_mb:20,garbage_collection_threshold:0.5,roundup_power2_divisions:[64:8,256:4,1024:4,>:1],expandable_segments:true,pinned_use_background_threads:true"
|
||||||
|
void parseArgs(const std::string& env);
|
||||||
|
|
||||||
|
// Registers a device-specific configuration parser hook. This allows
|
||||||
|
// backends to parse additional device-specific configuration options from the
|
||||||
|
// environment variable. The hook should be a function that takes a string
|
||||||
|
// (the environment variable value) and parses it to set device-specific
|
||||||
|
// configuration options.
|
||||||
|
// The hook will be called when the environment variable is parsed.
|
||||||
|
// If a hook is already registered, it will be replaced with the new one.
|
||||||
|
void registerDeviceConfigParserHook(
|
||||||
|
std::function<void(const std::string&)> hook) {
|
||||||
|
device_config_parser_hook_ = std::move(hook);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calls the registered device-specific configuration parser hook with the
|
||||||
|
// provided environment string. This allows backends to parse additional
|
||||||
|
// device-specific configuration options from the environment variable.
|
||||||
|
// If no hook is registered, this function does nothing.
|
||||||
|
void callDeviceConfigParserHook(const std::string& env) const {
|
||||||
|
if (device_config_parser_hook_) {
|
||||||
|
device_config_parser_hook_(env);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
AcceleratorAllocatorConfig();
|
||||||
|
|
||||||
|
/* Internal functions for device allocator */
|
||||||
|
|
||||||
|
// Parse `max_split_size_mb` from environment variable.
|
||||||
|
size_t parseMaxSplitSize(const ConfigTokenizer& tokenizer, size_t i);
|
||||||
|
// Parse `max_non_split_rounding_mb` from environment variable.
|
||||||
|
size_t parseMaxNonSplitRoundingSize(
|
||||||
|
const ConfigTokenizer& tokenizer,
|
||||||
|
size_t i);
|
||||||
|
// Parse `garbage_collection_threshold` from environment variable.
|
||||||
|
size_t parseGarbageCollectionThreshold(
|
||||||
|
const ConfigTokenizer& tokenizer,
|
||||||
|
size_t i);
|
||||||
|
// Parse `roundup_power2_divisions` from environment variable.
|
||||||
|
size_t parseRoundUpPower2Divisions(
|
||||||
|
const ConfigTokenizer& tokenizer,
|
||||||
|
size_t i);
|
||||||
|
// Parse `expandable_segments` from environment variable.
|
||||||
|
size_t parseExpandableSegments(const ConfigTokenizer& tokenizer, size_t i);
|
||||||
|
|
||||||
|
/* Internal functions for host allocator */
|
||||||
|
|
||||||
|
// Parse `pinned_use_background_threads` from environment variable.
|
||||||
|
size_t parsePinnedUseBackgroundThreads(
|
||||||
|
const ConfigTokenizer& tokenizer,
|
||||||
|
size_t i);
|
||||||
|
|
||||||
|
/* The following members are specifically used for the device allocator. */
|
||||||
|
|
||||||
|
// The maximum block size that is allowed to be split.
|
||||||
|
std::atomic<size_t> max_split_size_{std::numeric_limits<size_t>::max()};
|
||||||
|
// The maximum allowable extra size of a memory block without requiring
|
||||||
|
// splitting when searching for a free block.
|
||||||
|
std::atomic<size_t> max_non_split_rounding_size_{kLargeBuffer};
|
||||||
|
// Used to store how memory allocations of different sizes should be rounded
|
||||||
|
// up to the nearest power of 2 divisions.
|
||||||
|
std::vector<size_t> roundup_power2_divisions_;
|
||||||
|
// The threshold that triggers garbage collection when the ratio of used
|
||||||
|
// memory to maximum allowed memory exceeds this value.
|
||||||
|
std::atomic<double> garbage_collection_threshold_{0};
|
||||||
|
// A flag to enable expandable segments feature.
|
||||||
|
std::atomic<bool> use_expandable_segments_{false};
|
||||||
|
|
||||||
|
/* The following members are specifically used for the host allocator. */
|
||||||
|
|
||||||
|
// A flag to enable background thread for processing events.
|
||||||
|
std::atomic<bool> pinned_use_background_threads_{false};
|
||||||
|
|
||||||
|
/* The following members are used for both device and host allocator. */
|
||||||
|
|
||||||
|
// Record the last allocator config environment setting.
|
||||||
|
std::mutex last_allocator_settings_mutex_;
|
||||||
|
std::string last_allocator_settings_;
|
||||||
|
|
||||||
|
// Optional hook for parsing additional device-specific allocator settings.
|
||||||
|
// This allows backends (e.g., CUDA, XPU) to register a custom parser for
|
||||||
|
// their own environment configuration extensions.
|
||||||
|
std::function<void(const std::string&)> device_config_parser_hook_{nullptr};
|
||||||
|
};
|
||||||
|
|
||||||
|
C10_API inline void setAllocatorSettings(const std::string& env) {
|
||||||
|
AcceleratorAllocatorConfig::instance().parseArgs(env);
|
||||||
|
AcceleratorAllocatorConfig::instance().callDeviceConfigParserHook(env);
|
||||||
|
}
|
||||||
|
|
||||||
|
C10_API inline std::string getAllocatorSettings() {
|
||||||
|
return AcceleratorAllocatorConfig::instance().last_allocator_settings();
|
||||||
|
}
|
||||||
|
|
||||||
|
struct DeviceConfigParserHookRegistry {
|
||||||
|
explicit DeviceConfigParserHookRegistry(
|
||||||
|
std::function<void(const std::string&)> hook) {
|
||||||
|
AcceleratorAllocatorConfig::instance().registerDeviceConfigParserHook(
|
||||||
|
std::move(hook));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#define REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(hook) \
|
||||||
|
namespace { \
|
||||||
|
static at::CachingAllocator::DeviceConfigParserHookRegistry \
|
||||||
|
g_device_config_parse_hook_registry_instance(hook); \
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace c10::CachingAllocator
|
||||||
@ -1,30 +1,27 @@
|
|||||||
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
|
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
|
||||||
|
#include <c10/cuda/CUDAException.h>
|
||||||
#include <c10/cuda/driver_api.h>
|
#include <c10/cuda/driver_api.h>
|
||||||
#include <c10/util/CallOnce.h>
|
#include <c10/util/CallOnce.h>
|
||||||
#include <c10/util/Exception.h>
|
#include <c10/util/Exception.h>
|
||||||
|
#include <c10/util/Logging.h>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
#include <dlfcn.h>
|
#include <dlfcn.h>
|
||||||
|
|
||||||
namespace c10::cuda {
|
namespace c10::cuda {
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
void* get_symbol(const char* name, int version);
|
||||||
|
|
||||||
DriverAPI create_driver_api() {
|
DriverAPI create_driver_api() {
|
||||||
void* handle_0 = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_NOLOAD);
|
|
||||||
TORCH_CHECK(handle_0, "Can't open libcuda.so.1: ", dlerror());
|
|
||||||
void* handle_1 = DriverAPI::get_nvml_handle();
|
void* handle_1 = DriverAPI::get_nvml_handle();
|
||||||
DriverAPI r{};
|
DriverAPI r{};
|
||||||
|
|
||||||
#define LOOKUP_LIBCUDA_ENTRY(name) \
|
#define LOOKUP_LIBCUDA_ENTRY_WITH_VERSION(name, version) \
|
||||||
r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \
|
r.name##_ = reinterpret_cast<decltype(&name)>(get_symbol(#name, version)); \
|
||||||
TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name, ": ", dlerror())
|
TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name)
|
||||||
C10_LIBCUDA_DRIVER_API(LOOKUP_LIBCUDA_ENTRY)
|
C10_LIBCUDA_DRIVER_API(LOOKUP_LIBCUDA_ENTRY_WITH_VERSION)
|
||||||
#undef LOOKUP_LIBCUDA_ENTRY
|
#undef LOOKUP_LIBCUDA_ENTRY_WITH_VERSION
|
||||||
|
|
||||||
#define LOOKUP_LIBCUDA_ENTRY(name) \
|
|
||||||
r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \
|
|
||||||
dlerror();
|
|
||||||
C10_LIBCUDA_DRIVER_API_12030(LOOKUP_LIBCUDA_ENTRY)
|
|
||||||
#undef LOOKUP_LIBCUDA_ENTRY
|
|
||||||
|
|
||||||
if (handle_1) {
|
if (handle_1) {
|
||||||
#define LOOKUP_NVML_ENTRY(name) \
|
#define LOOKUP_NVML_ENTRY(name) \
|
||||||
@ -35,6 +32,32 @@ DriverAPI create_driver_api() {
|
|||||||
}
|
}
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void* get_symbol(const char* name, int version) {
|
||||||
|
void* out = nullptr;
|
||||||
|
cudaDriverEntryPointQueryResult qres{};
|
||||||
|
|
||||||
|
// CUDA 12.5+ supports version-based lookup
|
||||||
|
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12050)
|
||||||
|
if (auto st = cudaGetDriverEntryPointByVersion(
|
||||||
|
name, &out, version, cudaEnableDefault, &qres);
|
||||||
|
st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// This fallback to the old API to try getting the symbol again.
|
||||||
|
if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres);
|
||||||
|
st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the symbol cannot be resolved, report and return nullptr;
|
||||||
|
// the caller is responsible for checking the pointer.
|
||||||
|
LOG(INFO) << "Failed to resolve symbol " << name;
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
void* DriverAPI::get_nvml_handle() {
|
void* DriverAPI::get_nvml_handle() {
|
||||||
|
|||||||
@ -20,30 +20,24 @@
|
|||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define C10_LIBCUDA_DRIVER_API(_) \
|
#define C10_LIBCUDA_DRIVER_API(_) \
|
||||||
_(cuDeviceGetAttribute) \
|
_(cuDeviceGetAttribute, 12000) \
|
||||||
_(cuMemAddressReserve) \
|
_(cuMemAddressReserve, 12000) \
|
||||||
_(cuMemRelease) \
|
_(cuMemRelease, 12000) \
|
||||||
_(cuMemMap) \
|
_(cuMemMap, 12000) \
|
||||||
_(cuMemAddressFree) \
|
_(cuMemAddressFree, 12000) \
|
||||||
_(cuMemSetAccess) \
|
_(cuMemSetAccess, 12000) \
|
||||||
_(cuMemUnmap) \
|
_(cuMemUnmap, 12000) \
|
||||||
_(cuMemCreate) \
|
_(cuMemCreate, 12000) \
|
||||||
_(cuMemGetAllocationGranularity) \
|
_(cuMemGetAllocationGranularity, 12000) \
|
||||||
_(cuMemExportToShareableHandle) \
|
_(cuMemExportToShareableHandle, 12000) \
|
||||||
_(cuMemImportFromShareableHandle) \
|
_(cuMemImportFromShareableHandle, 12000) \
|
||||||
_(cuMemsetD32Async) \
|
_(cuMemsetD32Async, 12000) \
|
||||||
_(cuStreamWriteValue32) \
|
_(cuStreamWriteValue32, 12000) \
|
||||||
_(cuGetErrorString)
|
_(cuGetErrorString, 12000) \
|
||||||
|
_(cuMulticastAddDevice, 12030) \
|
||||||
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030)
|
_(cuMulticastBindMem, 12030) \
|
||||||
#define C10_LIBCUDA_DRIVER_API_12030(_) \
|
_(cuMulticastCreate, 12030)
|
||||||
_(cuMulticastAddDevice) \
|
|
||||||
_(cuMulticastBindMem) \
|
|
||||||
_(cuMulticastCreate)
|
|
||||||
#else
|
|
||||||
#define C10_LIBCUDA_DRIVER_API_12030(_)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define C10_NVML_DRIVER_API(_) \
|
#define C10_NVML_DRIVER_API(_) \
|
||||||
_(nvmlInit_v2) \
|
_(nvmlInit_v2) \
|
||||||
@ -56,11 +50,13 @@
|
|||||||
namespace c10::cuda {
|
namespace c10::cuda {
|
||||||
|
|
||||||
struct DriverAPI {
|
struct DriverAPI {
|
||||||
|
#define CREATE_MEMBER_VERSIONED(name, version) decltype(&name) name##_;
|
||||||
#define CREATE_MEMBER(name) decltype(&name) name##_;
|
#define CREATE_MEMBER(name) decltype(&name) name##_;
|
||||||
C10_LIBCUDA_DRIVER_API(CREATE_MEMBER)
|
C10_LIBCUDA_DRIVER_API(CREATE_MEMBER_VERSIONED)
|
||||||
C10_LIBCUDA_DRIVER_API_12030(CREATE_MEMBER)
|
|
||||||
C10_NVML_DRIVER_API(CREATE_MEMBER)
|
C10_NVML_DRIVER_API(CREATE_MEMBER)
|
||||||
|
#undef CREATE_MEMBER_VERSIONED
|
||||||
#undef CREATE_MEMBER
|
#undef CREATE_MEMBER
|
||||||
|
|
||||||
static DriverAPI* get();
|
static DriverAPI* get();
|
||||||
static void* get_nvml_handle();
|
static void* get_nvml_handle();
|
||||||
};
|
};
|
||||||
|
|||||||
@ -63,7 +63,6 @@ def define_c10_ovrsource(name, is_mobile):
|
|||||||
"core/impl/*.h",
|
"core/impl/*.h",
|
||||||
]),
|
]),
|
||||||
reexport_all_header_dependencies = False,
|
reexport_all_header_dependencies = False,
|
||||||
# tests = C10_CPU_TEST_TARGETS,
|
|
||||||
visibility = [
|
visibility = [
|
||||||
"//xplat/caffe2/c10:c10_ovrsource",
|
"//xplat/caffe2/c10:c10_ovrsource",
|
||||||
],
|
],
|
||||||
@ -84,25 +83,6 @@ def define_c10_ovrsource(name, is_mobile):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def define_ovrsource_targets():
|
def define_ovrsource_targets():
|
||||||
# C10_CPU_TEST_FILES = native.glob([
|
|
||||||
# "test/core/*.cpp",
|
|
||||||
# "test/util/*.cpp",
|
|
||||||
# ])
|
|
||||||
|
|
||||||
# C10_GPU_TEST_FILES = native.glob([
|
|
||||||
# "cuda/test/**/*.cpp",
|
|
||||||
# ])
|
|
||||||
|
|
||||||
# C10_CPU_TEST_TARGETS = [
|
|
||||||
# ":" + paths.basename(test)[:-len(".cpp")] + "_ovrsource"
|
|
||||||
# for test in C10_CPU_TEST_FILES
|
|
||||||
# ]
|
|
||||||
|
|
||||||
# C10_GPU_TEST_TARGETS = [
|
|
||||||
# ":" + paths.basename(test)[:-len(".cpp")] + "_ovrsource"
|
|
||||||
# for test in C10_GPU_TEST_FILES
|
|
||||||
# ]
|
|
||||||
|
|
||||||
common_c10_cmake_defines = [
|
common_c10_cmake_defines = [
|
||||||
("#cmakedefine C10_BUILD_SHARED_LIBS", ""),
|
("#cmakedefine C10_BUILD_SHARED_LIBS", ""),
|
||||||
("#cmakedefine C10_USE_NUMA", ""),
|
("#cmakedefine C10_USE_NUMA", ""),
|
||||||
@ -207,7 +187,6 @@ def define_ovrsource_targets():
|
|||||||
"cuda/impl/*.h",
|
"cuda/impl/*.h",
|
||||||
]),
|
]),
|
||||||
reexport_all_header_dependencies = False,
|
reexport_all_header_dependencies = False,
|
||||||
# tests = C10_GPU_TEST_TARGETS,
|
|
||||||
visibility = ["PUBLIC"],
|
visibility = ["PUBLIC"],
|
||||||
deps = [
|
deps = [
|
||||||
"//third-party/cuda:libcuda",
|
"//third-party/cuda:libcuda",
|
||||||
@ -217,64 +196,3 @@ def define_ovrsource_targets():
|
|||||||
":c10_ovrsource",
|
":c10_ovrsource",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
# [
|
|
||||||
# oxx_test(
|
|
||||||
# name = paths.basename(test)[:-len(".cpp")] + "_ovrsource",
|
|
||||||
# srcs = [test],
|
|
||||||
# compatible_with = cpu_supported_platforms,
|
|
||||||
# compiler_flags = select({
|
|
||||||
# "DEFAULT": [],
|
|
||||||
# "ovr_config//compiler:cl": [
|
|
||||||
# "/w",
|
|
||||||
# ],
|
|
||||||
# "ovr_config//compiler:clang": [
|
|
||||||
# "-Wno-error",
|
|
||||||
# "-Wno-self-assign-overloaded",
|
|
||||||
# "-Wno-self-move",
|
|
||||||
# "-Wno-shadow",
|
|
||||||
# "-Wno-undef",
|
|
||||||
# "-Wno-unused-function",
|
|
||||||
# "-Wno-unused-variable",
|
|
||||||
# ],
|
|
||||||
# }),
|
|
||||||
# framework = "gtest",
|
|
||||||
# oncall = "ovrsource_pytorch",
|
|
||||||
# raw_headers = native.glob([
|
|
||||||
# "test/**/*.h",
|
|
||||||
# ]),
|
|
||||||
# deps = [
|
|
||||||
# ":c10_ovrsource",
|
|
||||||
# ],
|
|
||||||
# )
|
|
||||||
# for test in C10_CPU_TEST_FILES
|
|
||||||
# ]
|
|
||||||
|
|
||||||
# [
|
|
||||||
# oxx_test(
|
|
||||||
# name = paths.basename(test)[:-len(".cpp")] + "_ovrsource",
|
|
||||||
# srcs = [test],
|
|
||||||
# compatible_with = cuda_supported_platforms,
|
|
||||||
# compiler_flags = select({
|
|
||||||
# "DEFAULT": [],
|
|
||||||
# "ovr_config//compiler:cl": [
|
|
||||||
# "/w",
|
|
||||||
# ],
|
|
||||||
# "ovr_config//compiler:clang": [
|
|
||||||
# "-Wno-error",
|
|
||||||
# ],
|
|
||||||
# }),
|
|
||||||
# framework = "gtest",
|
|
||||||
# oncall = "ovrsource_pytorch",
|
|
||||||
# raw_headers = native.glob([
|
|
||||||
# "test/**/*.h",
|
|
||||||
# ]),
|
|
||||||
# runtime_shared_libraries = [
|
|
||||||
# "//third-party/cuda:cudart",
|
|
||||||
# ],
|
|
||||||
# deps = [
|
|
||||||
# ":c10_cuda_ovrsource",
|
|
||||||
# ],
|
|
||||||
# )
|
|
||||||
# for test in C10_GPU_TEST_FILES
|
|
||||||
# ]
|
|
||||||
|
|||||||
123
c10/test/core/AllocatorConfig_test.cpp
Normal file
123
c10/test/core/AllocatorConfig_test.cpp
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
#include <c10/core/AllocatorConfig.h>
|
||||||
|
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
using namespace c10::CachingAllocator;
|
||||||
|
constexpr size_t kMB = 1024 * 1024ul;
|
||||||
|
|
||||||
|
struct ExtendedAllocatorConfig {
|
||||||
|
static ExtendedAllocatorConfig& instance() {
|
||||||
|
static ExtendedAllocatorConfig instance;
|
||||||
|
return instance;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the device-specific option value in bytes.
|
||||||
|
static size_t device_specific_option() {
|
||||||
|
return instance().device_specific_option_;
|
||||||
|
}
|
||||||
|
|
||||||
|
void parseArgs(const std::string& env) {
|
||||||
|
// Parse device-specific options from the environment variable
|
||||||
|
ConfigTokenizer tokenizer(env);
|
||||||
|
for (size_t i = 0; i < tokenizer.size(); i++) {
|
||||||
|
const auto& key = tokenizer[i];
|
||||||
|
if (key == "device_specific_option_mb") {
|
||||||
|
tokenizer.checkToken(++i, ":");
|
||||||
|
device_specific_option_ = tokenizer.toSizeT(++i) * kMB;
|
||||||
|
} else {
|
||||||
|
i = tokenizer.skipKey(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i + 1 < tokenizer.size()) {
|
||||||
|
tokenizer.checkToken(++i, ",");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Device-specific option, e.g., memory limit for a specific device.
|
||||||
|
std::atomic<size_t> device_specific_option_{0};
|
||||||
|
};
|
||||||
|
|
||||||
|
REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK([](const std::string& env) {
|
||||||
|
ExtendedAllocatorConfig::instance().parseArgs(env);
|
||||||
|
})
|
||||||
|
|
||||||
|
TEST(AllocatorConfigTest, allocator_config_test) {
|
||||||
|
std::string env =
|
||||||
|
"max_split_size_mb:40,"
|
||||||
|
"max_non_split_rounding_mb:30,"
|
||||||
|
"garbage_collection_threshold:0.5,"
|
||||||
|
"roundup_power2_divisions:[64:8,128:2,256:4,512:2,1024:4,>:1],"
|
||||||
|
"expandable_segments:True,"
|
||||||
|
"pinned_use_background_threads:True,"
|
||||||
|
"device_specific_option_mb:64";
|
||||||
|
c10::CachingAllocator::setAllocatorSettings(env);
|
||||||
|
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::max_split_size(), 40 * kMB);
|
||||||
|
EXPECT_EQ(
|
||||||
|
AcceleratorAllocatorConfig::max_non_split_rounding_size(), 30 * kMB);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::garbage_collection_threshold(), 0.5);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(32 * kMB), 8);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(64 * kMB), 8);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(128 * kMB), 2);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 4);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 2);
|
||||||
|
EXPECT_EQ(
|
||||||
|
AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 4);
|
||||||
|
EXPECT_EQ(
|
||||||
|
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 1);
|
||||||
|
EXPECT_EQ(
|
||||||
|
AcceleratorAllocatorConfig::roundup_power2_divisions(4096 * kMB), 1);
|
||||||
|
EXPECT_EQ(
|
||||||
|
AcceleratorAllocatorConfig::roundup_power2_divisions(8192 * kMB), 1);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::use_expandable_segments(), true);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::pinned_use_background_threads(), true);
|
||||||
|
EXPECT_EQ(ExtendedAllocatorConfig::device_specific_option(), 64 * kMB);
|
||||||
|
|
||||||
|
env =
|
||||||
|
"max_split_size_mb:20,"
|
||||||
|
"max_non_split_rounding_mb:40,"
|
||||||
|
"garbage_collection_threshold:0.8";
|
||||||
|
c10::CachingAllocator::setAllocatorSettings(env);
|
||||||
|
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::max_split_size(), 20 * kMB);
|
||||||
|
EXPECT_EQ(
|
||||||
|
AcceleratorAllocatorConfig::max_non_split_rounding_size(), 40 * kMB);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::garbage_collection_threshold(), 0.8);
|
||||||
|
|
||||||
|
// roundup_power2_divisions knob array syntax
|
||||||
|
env = "roundup_power2_divisions:[128:8,256:16,512:1,2048:8,>:2]";
|
||||||
|
c10::CachingAllocator::setAllocatorSettings(env);
|
||||||
|
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(64 * kMB), 8);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(128 * kMB), 8);
|
||||||
|
EXPECT_EQ(
|
||||||
|
AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 16);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 1);
|
||||||
|
EXPECT_EQ(
|
||||||
|
AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 0);
|
||||||
|
EXPECT_EQ(
|
||||||
|
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 8);
|
||||||
|
EXPECT_EQ(
|
||||||
|
AcceleratorAllocatorConfig::roundup_power2_divisions(4096 * kMB), 2);
|
||||||
|
|
||||||
|
// roundup_power2_divisions single value syntax for backward compatibility
|
||||||
|
env = "roundup_power2_divisions:4";
|
||||||
|
c10::CachingAllocator::setAllocatorSettings(env);
|
||||||
|
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(64 * kMB), 4);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 4);
|
||||||
|
EXPECT_EQ(
|
||||||
|
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 4);
|
||||||
|
|
||||||
|
env = "expandable_segments:False,";
|
||||||
|
c10::CachingAllocator::setAllocatorSettings(env);
|
||||||
|
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::use_expandable_segments(), false);
|
||||||
|
|
||||||
|
env = "pinned_use_background_threads:False";
|
||||||
|
c10::CachingAllocator::setAllocatorSettings(env);
|
||||||
|
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
|
||||||
|
EXPECT_EQ(AcceleratorAllocatorConfig::pinned_use_background_threads(), false);
|
||||||
|
}
|
||||||
@ -4,6 +4,7 @@
|
|||||||
// 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa.
|
// 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa.
|
||||||
|
|
||||||
#include <c10/macros/Macros.h>
|
#include <c10/macros/Macros.h>
|
||||||
|
#include <c10/util/bit_cast.h>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
@ -67,13 +68,7 @@ inline C10_HOST_DEVICE uint16_t round_to_nearest_even(float src) {
|
|||||||
#endif
|
#endif
|
||||||
return UINT16_C(0x7FC0);
|
return UINT16_C(0x7FC0);
|
||||||
} else {
|
} else {
|
||||||
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
|
const uint32_t U32 = c10::bit_cast<uint32_t>(src);
|
||||||
union {
|
|
||||||
uint32_t U32; // NOLINT(facebook-hte-BadMemberName)
|
|
||||||
float F32; // NOLINT(facebook-hte-BadMemberName)
|
|
||||||
};
|
|
||||||
|
|
||||||
F32 = src;
|
|
||||||
uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
|
uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
|
||||||
return static_cast<uint16_t>((U32 + rounding_bias) >> 16);
|
return static_cast<uint16_t>((U32 + rounding_bias) >> 16);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -3,6 +3,8 @@
|
|||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
|
|
||||||
|
#include <c10/macros/Macros.h>
|
||||||
|
|
||||||
#if __has_include(<bit>) && (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L)
|
#if __has_include(<bit>) && (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L)
|
||||||
#include <bit>
|
#include <bit>
|
||||||
#define C10_HAVE_STD_BIT_CAST 1
|
#define C10_HAVE_STD_BIT_CAST 1
|
||||||
@ -23,7 +25,7 @@ using std::bit_cast;
|
|||||||
// See https://en.cppreference.com/w/cpp/numeric/bit_cast for more
|
// See https://en.cppreference.com/w/cpp/numeric/bit_cast for more
|
||||||
// information as well as the source of our implementations.
|
// information as well as the source of our implementations.
|
||||||
template <class To, class From>
|
template <class To, class From>
|
||||||
std::enable_if_t<
|
C10_HOST_DEVICE std::enable_if_t<
|
||||||
sizeof(To) == sizeof(From) && std::is_trivially_copyable_v<From> &&
|
sizeof(To) == sizeof(From) && std::is_trivially_copyable_v<From> &&
|
||||||
std::is_trivially_copyable_v<To>,
|
std::is_trivially_copyable_v<To>,
|
||||||
To>
|
To>
|
||||||
|
|||||||
@ -58,6 +58,9 @@ def define_targets(rules):
|
|||||||
name = "bit_cast",
|
name = "bit_cast",
|
||||||
hdrs = ["bit_cast.h"],
|
hdrs = ["bit_cast.h"],
|
||||||
visibility = ["//:__subpackages__"],
|
visibility = ["//:__subpackages__"],
|
||||||
|
deps = [
|
||||||
|
"//c10/macros",
|
||||||
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
rules.cc_library(
|
rules.cc_library(
|
||||||
|
|||||||
18
docs/source/distributed._dist2.md
Normal file
18
docs/source/distributed._dist2.md
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
```{eval-rst}
|
||||||
|
.. role:: hidden
|
||||||
|
:class: hidden-section
|
||||||
|
```
|
||||||
|
|
||||||
|
```{eval-rst}
|
||||||
|
.. automodule:: torch.distributed._dist2
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
```
|
||||||
|
|
||||||
|
```{eval-rst}
|
||||||
|
.. autoclass:: torch.distributed.ProcessGroup
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
```
|
||||||
@ -224,6 +224,10 @@ inconsistent 'UUID' assignment across ranks, and to prevent races during initial
|
|||||||
.. autofunction:: is_torchelastic_launched
|
.. autofunction:: is_torchelastic_launched
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```{eval-rst}
|
||||||
|
.. autofunction:: get_default_backend_for_device
|
||||||
|
```
|
||||||
|
|
||||||
______________________________________________________________________
|
______________________________________________________________________
|
||||||
|
|
||||||
Currently three initialization methods are supported:
|
Currently three initialization methods are supported:
|
||||||
@ -1471,3 +1475,9 @@ If you are running single node training, it may be convenient to interactively b
|
|||||||
```{eval-rst}
|
```{eval-rst}
|
||||||
.. py:module:: torch.distributed.checkpoint.state_dict
|
.. py:module:: torch.distributed.checkpoint.state_dict
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```{toctree}
|
||||||
|
:hidden:
|
||||||
|
|
||||||
|
distributed._dist2
|
||||||
|
```
|
||||||
|
|||||||
@ -8,16 +8,14 @@ higher-level API to automatically differentiate models split across several
|
|||||||
machines.
|
machines.
|
||||||
|
|
||||||
```{warning}
|
```{warning}
|
||||||
APIs in the RPC package are stable. There are multiple ongoing work items
|
APIs in the RPC package are stable and in maintenance mode.
|
||||||
to improve performance and error handling, which will ship in future releases.
|
|
||||||
```
|
```
|
||||||
|
|
||||||
```{warning}
|
```{warning}
|
||||||
CUDA support was introduced in PyTorch 1.9 and is still a **beta** feature.
|
CUDA support is a **beta** feature.
|
||||||
Not all features of the RPC package are yet compatible with CUDA support and
|
Not all features of the RPC package are yet compatible with CUDA support and
|
||||||
thus their use is discouraged. These unsupported features include: RRefs,
|
thus their use is discouraged. These unsupported features include: RRefs,
|
||||||
JIT compatibility, dist autograd and dist optimizer, and profiling. These
|
JIT compatibility, dist autograd and dist optimizer, and profiling.
|
||||||
shortcomings will be addressed in future releases.
|
|
||||||
```
|
```
|
||||||
|
|
||||||
```{note}
|
```{note}
|
||||||
@ -102,13 +100,6 @@ device lists on source and destination workers do not match. In such cases,
|
|||||||
applications can always explicitly move the input tensors to CPU on the caller
|
applications can always explicitly move the input tensors to CPU on the caller
|
||||||
and move it to the desired devices on the callee if necessary.
|
and move it to the desired devices on the callee if necessary.
|
||||||
|
|
||||||
```{warning}
|
|
||||||
TorchScript support in RPC is a prototype feature and subject to change. Since
|
|
||||||
v1.5.0, ``torch.distributed.rpc`` supports calling TorchScript functions as
|
|
||||||
RPC target functions, and this will help improve parallelism on the callee
|
|
||||||
side as executing TorchScript functions does not require GIL.
|
|
||||||
```
|
|
||||||
|
|
||||||
```{eval-rst}
|
```{eval-rst}
|
||||||
.. autofunction:: rpc_sync
|
.. autofunction:: rpc_sync
|
||||||
.. autofunction:: rpc_async
|
.. autofunction:: rpc_async
|
||||||
@ -159,9 +150,7 @@ multiple different transports (TCP, of course, but also shared memory, NVLink,
|
|||||||
InfiniBand, ...) and can automatically detect their availability and negotiate
|
InfiniBand, ...) and can automatically detect their availability and negotiate
|
||||||
the best transport to use for each pipe.
|
the best transport to use for each pipe.
|
||||||
|
|
||||||
The TensorPipe backend has been introduced in PyTorch v1.6 and is being actively
|
The TensorPipe backend comes with a TCP-based transport, just like Gloo. It is also able to
|
||||||
developed. At the moment, it only supports CPU tensors, with GPU support coming
|
|
||||||
soon. It comes with a TCP-based transport, just like Gloo. It is also able to
|
|
||||||
automatically chunk and multiplex large tensors over multiple sockets and
|
automatically chunk and multiplex large tensors over multiple sockets and
|
||||||
threads in order to achieve very high bandwidths. The agent will be able to pick
|
threads in order to achieve very high bandwidths. The agent will be able to pick
|
||||||
the best transport on its own, with no intervention required.
|
the best transport on its own, with no intervention required.
|
||||||
@ -301,6 +290,4 @@ to use [the profiler](https://pytorch.org/docs/stable/autograd.html#profiler) to
|
|||||||
- [Getting started with Distributed RPC Framework](https://pytorch.org/tutorials/intermediate/rpc_tutorial.html)
|
- [Getting started with Distributed RPC Framework](https://pytorch.org/tutorials/intermediate/rpc_tutorial.html)
|
||||||
- [Implementing a Parameter Server using Distributed RPC Framework](https://pytorch.org/tutorials/intermediate/rpc_param_server_tutorial.html)
|
- [Implementing a Parameter Server using Distributed RPC Framework](https://pytorch.org/tutorials/intermediate/rpc_param_server_tutorial.html)
|
||||||
- [Combining Distributed DataParallel with Distributed RPC Framework](https://pytorch.org/tutorials/advanced/rpc_ddp_tutorial.html) (covers **RemoteModule** as well)
|
- [Combining Distributed DataParallel with Distributed RPC Framework](https://pytorch.org/tutorials/advanced/rpc_ddp_tutorial.html) (covers **RemoteModule** as well)
|
||||||
- [Profiling RPC-based Workloads](https://pytorch.org/tutorials/recipes/distributed_rpc_profiling.html)
|
|
||||||
- [Implementing batch RPC processing](https://pytorch.org/tutorials/intermediate/rpc_async_execution.html)
|
- [Implementing batch RPC processing](https://pytorch.org/tutorials/intermediate/rpc_async_execution.html)
|
||||||
- [Distributed Pipeline Parallel](https://pytorch.org/tutorials/intermediate/dist_pipeline_parallel_tutorial.html)
|
|
||||||
|
|||||||
@ -9,13 +9,13 @@ requires = [
|
|||||||
# 77.0.0: min version for SPDX expression support for project.license
|
# 77.0.0: min version for SPDX expression support for project.license
|
||||||
"setuptools>=62.3.0,<80.0",
|
"setuptools>=62.3.0,<80.0",
|
||||||
"wheel",
|
"wheel",
|
||||||
"astunparse",
|
|
||||||
"cmake>=3.27",
|
"cmake>=3.27",
|
||||||
"ninja",
|
"ninja",
|
||||||
"numpy",
|
"numpy",
|
||||||
"packaging",
|
"packaging",
|
||||||
"pyyaml",
|
"pyyaml",
|
||||||
"requests",
|
"requests",
|
||||||
|
"six", # dependency chain: NNPACK -> PeachPy -> six
|
||||||
"typing-extensions>=4.10.0",
|
"typing-extensions>=4.10.0",
|
||||||
]
|
]
|
||||||
build-backend = "setuptools.build_meta"
|
build-backend = "setuptools.build_meta"
|
||||||
|
|||||||
@ -1,5 +1,4 @@
|
|||||||
# Python dependencies required for development
|
# Python dependencies required for development
|
||||||
astunparse
|
|
||||||
build[uv] # for building sdist and wheel
|
build[uv] # for building sdist and wheel
|
||||||
cmake>=3.27
|
cmake>=3.27
|
||||||
expecttest>=0.3.0
|
expecttest>=0.3.0
|
||||||
@ -18,6 +17,8 @@ pyyaml
|
|||||||
requests
|
requests
|
||||||
# setuptools develop deprecated on 80.0
|
# setuptools develop deprecated on 80.0
|
||||||
setuptools>=62.3.0,<80.0
|
setuptools>=62.3.0,<80.0
|
||||||
|
six # dependency chain: NNPACK -> PeachPy -> six
|
||||||
sympy>=1.13.3
|
sympy>=1.13.3
|
||||||
types-dataclasses
|
types-dataclasses
|
||||||
typing-extensions>=4.13.2
|
typing-extensions>=4.13.2
|
||||||
|
wheel
|
||||||
|
|||||||
@ -15,4 +15,4 @@ pip install --no-use-pep517 -e "$tp2_dir/onnx"
|
|||||||
# Install caffe2 and pytorch
|
# Install caffe2 and pytorch
|
||||||
pip install -r "$top_dir/caffe2/requirements.txt"
|
pip install -r "$top_dir/caffe2/requirements.txt"
|
||||||
pip install -r "$top_dir/requirements.txt"
|
pip install -r "$top_dir/requirements.txt"
|
||||||
python setup.py develop
|
python -m pip install --no-build-isolation -v -e .
|
||||||
|
|||||||
@ -35,4 +35,4 @@ _pip_install -b "$BUILD_DIR/onnx" "file://$tp2_dir/onnx#egg=onnx"
|
|||||||
# Install caffe2 and pytorch
|
# Install caffe2 and pytorch
|
||||||
pip install -r "$top_dir/caffe2/requirements.txt"
|
pip install -r "$top_dir/caffe2/requirements.txt"
|
||||||
pip install -r "$top_dir/requirements.txt"
|
pip install -r "$top_dir/requirements.txt"
|
||||||
python setup.py install
|
python -m pip install --no-build-isolation -v .
|
||||||
|
|||||||
34
setup.py
34
setup.py
@ -263,6 +263,7 @@ import json
|
|||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import sysconfig
|
import sysconfig
|
||||||
|
import textwrap
|
||||||
import time
|
import time
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -601,7 +602,7 @@ def build_deps() -> None:
|
|||||||
report(
|
report(
|
||||||
'Finished running cmake. Run "ccmake build" or '
|
'Finished running cmake. Run "ccmake build" or '
|
||||||
'"cmake-gui build" to adjust build options and '
|
'"cmake-gui build" to adjust build options and '
|
||||||
'"python setup.py install" to build.'
|
'"python -m pip install --no-build-isolation -v ." to build.'
|
||||||
)
|
)
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
@ -1207,24 +1208,25 @@ def configure_extension_build() -> tuple[
|
|||||||
|
|
||||||
# post run, warnings, printed at the end to make them more visible
|
# post run, warnings, printed at the end to make them more visible
|
||||||
build_update_message = """
|
build_update_message = """
|
||||||
It is no longer necessary to use the 'build' or 'rebuild' targets
|
It is no longer necessary to use the 'build' or 'rebuild' targets
|
||||||
|
|
||||||
To install:
|
To install:
|
||||||
$ python setup.py install
|
$ python -m pip install --no-build-isolation -v .
|
||||||
To develop locally:
|
To develop locally:
|
||||||
$ python setup.py develop
|
$ python -m pip install --no-build-isolation -v -e .
|
||||||
To force cmake to re-generate native build files (off by default):
|
To force cmake to re-generate native build files (off by default):
|
||||||
$ CMAKE_FRESH=1 python setup.py develop
|
$ CMAKE_FRESH=1 python -m pip install --no-build-isolation -v -e .
|
||||||
"""
|
""".strip()
|
||||||
|
|
||||||
|
|
||||||
def print_box(msg: str) -> None:
|
def print_box(msg: str) -> None:
|
||||||
lines = msg.split("\n")
|
msg = textwrap.dedent(msg).strip()
|
||||||
size = max(len(l) + 1 for l in lines)
|
lines = ["", *msg.split("\n"), ""]
|
||||||
print("-" * (size + 2))
|
max_width = max(len(l) for l in lines)
|
||||||
for l in lines:
|
print("+" + "-" * (max_width + 4) + "+", file=sys.stderr, flush=True)
|
||||||
print("|{}{}|".format(l, " " * (size - len(l))))
|
for line in lines:
|
||||||
print("-" * (size + 2))
|
print(f"| {line:<{max_width}s} |", file=sys.stderr, flush=True)
|
||||||
|
print("+" + "-" * (max_width + 4) + "+", file=sys.stderr, flush=True)
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
@ -1308,7 +1310,9 @@ def main() -> None:
|
|||||||
"include/**/*.hpp",
|
"include/**/*.hpp",
|
||||||
"include/*.cuh",
|
"include/*.cuh",
|
||||||
"include/**/*.cuh",
|
"include/**/*.cuh",
|
||||||
|
"csrc/inductor/aoti_runtime/model.h",
|
||||||
"_inductor/codegen/*.h",
|
"_inductor/codegen/*.h",
|
||||||
|
"_inductor/codegen/aoti_runtime/*.h",
|
||||||
"_inductor/codegen/aoti_runtime/*.cpp",
|
"_inductor/codegen/aoti_runtime/*.cpp",
|
||||||
"_inductor/script.ld",
|
"_inductor/script.ld",
|
||||||
"_export/serde/*.yaml",
|
"_export/serde/*.yaml",
|
||||||
|
|||||||
@ -36,7 +36,7 @@ The following commands assume you are in PyTorch root.
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# ... Build PyTorch from source, e.g.
|
# ... Build PyTorch from source, e.g.
|
||||||
python setup.py develop
|
python -m pip install --no-build-isolation -v -e .
|
||||||
# (re)build just the binary
|
# (re)build just the binary
|
||||||
ninja -C build bin/test_jit
|
ninja -C build bin/test_jit
|
||||||
# run tests
|
# run tests
|
||||||
|
|||||||
@ -4,8 +4,8 @@ This folder contains a self-contained example of a PyTorch out-of-tree backend l
|
|||||||
|
|
||||||
## How to use
|
## How to use
|
||||||
|
|
||||||
Install as standalone with `python setup.py develop` (or install) from this folder.
|
Install as standalone with `python -m pip install -e .` (or `python -m pip install .`)
|
||||||
You can run test via `python {PYTORCH_ROOT_PATH}/test/test_openreg.py`.
|
from this folder. You can run test via `python {PYTORCH_ROOT_PATH}/test/test_openreg.py`.
|
||||||
|
|
||||||
## Design principles
|
## Design principles
|
||||||
|
|
||||||
|
|||||||
@ -1570,5 +1570,54 @@ class TestFullyShardForceSumReduction(FSDPTest):
|
|||||||
self.assertRegex(logs, all_reduce_sum_re)
|
self.assertRegex(logs, all_reduce_sum_re)
|
||||||
|
|
||||||
|
|
||||||
|
class TestFullyShardReduceOpWorldSize1(FSDPTest):
|
||||||
|
@property
|
||||||
|
def world_size(self) -> int:
|
||||||
|
return 1
|
||||||
|
|
||||||
|
def test_size1_reduceop(self):
|
||||||
|
from torch.distributed.distributed_c10d import ReduceOp
|
||||||
|
|
||||||
|
model = nn.Linear(1024, 1025)
|
||||||
|
ref_model = copy.deepcopy(model).to(device_type)
|
||||||
|
ref_optim = torch.optim.Adam(ref_model.parameters())
|
||||||
|
fully_shard(
|
||||||
|
model,
|
||||||
|
mesh=init_device_mesh(device_type.type, (1,)),
|
||||||
|
reshard_after_forward=False,
|
||||||
|
)
|
||||||
|
optim = torch.optim.Adam(model.parameters())
|
||||||
|
|
||||||
|
inp = torch.randn(1025, 1024, device=device_type.type)
|
||||||
|
for _ in range(3):
|
||||||
|
ref_optim.zero_grad()
|
||||||
|
ref_loss = ref_model(inp).sum()
|
||||||
|
ref_loss.backward()
|
||||||
|
for param in ref_model.parameters():
|
||||||
|
dist.all_reduce(param.grad, op=dist.ReduceOp.SUM)
|
||||||
|
ref_optim.step()
|
||||||
|
|
||||||
|
optim.zero_grad()
|
||||||
|
loss = model(inp).sum()
|
||||||
|
loss.backward()
|
||||||
|
optim.step()
|
||||||
|
self.assertEqual(loss, ref_loss)
|
||||||
|
self.assertEqual(
|
||||||
|
model.bias.grad._local_tensor,
|
||||||
|
ref_model.bias.grad,
|
||||||
|
)
|
||||||
|
|
||||||
|
state = model._get_fsdp_state()
|
||||||
|
fsdp_param_group = state._fsdp_param_group
|
||||||
|
group = fsdp_param_group.mesh_info.shard_process_group
|
||||||
|
(
|
||||||
|
_,
|
||||||
|
_,
|
||||||
|
_,
|
||||||
|
all_reduce_op,
|
||||||
|
) = _get_gradient_divide_factors(group, None, torch.float32)
|
||||||
|
self.assertEqual(all_reduce_op, ReduceOp.SUM)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
run_tests()
|
run_tests()
|
||||||
|
|||||||
@ -554,21 +554,6 @@ class TestNew2dParallelTraining(DTensorTestBase):
|
|||||||
p2 = p2.redistribute(p2.device_mesh, [Replicate()]).to_local()
|
p2 = p2.redistribute(p2.device_mesh, [Replicate()]).to_local()
|
||||||
self.assertTrue(torch.allclose(p1, p2), f"{p1} vs {p2}")
|
self.assertTrue(torch.allclose(p1, p2), f"{p1} vs {p2}")
|
||||||
|
|
||||||
@with_comms
|
|
||||||
@skip_if_lt_x_gpu(4)
|
|
||||||
def test_raise_invalid_tp_composition(self):
|
|
||||||
with self.assertRaisesRegex(
|
|
||||||
RuntimeError, r"Found TP device_mesh on the \d dimension of its parent mesh"
|
|
||||||
):
|
|
||||||
mesh_2d = init_device_mesh(
|
|
||||||
self.device_type, (2, self.world_size // 2), mesh_dim_names=("tp", "dp")
|
|
||||||
)
|
|
||||||
parallelize_plan = {
|
|
||||||
"net1": ColwiseParallel(),
|
|
||||||
"net2": RowwiseParallel(),
|
|
||||||
}
|
|
||||||
parallelize_module(SimpleModel().cuda(), mesh_2d["tp"], parallelize_plan)
|
|
||||||
|
|
||||||
@with_comms
|
@with_comms
|
||||||
@skip_if_lt_x_gpu(4)
|
@skip_if_lt_x_gpu(4)
|
||||||
def test_2d_fsdp_state_enable_extension(self):
|
def test_2d_fsdp_state_enable_extension(self):
|
||||||
|
|||||||
@ -3182,7 +3182,7 @@ class NcclRegistrationTest(MultiProcessTestCase):
|
|||||||
|
|
||||||
# Use NCCL memory allocator
|
# Use NCCL memory allocator
|
||||||
# enable symmetric memory usage in NCCL
|
# enable symmetric memory usage in NCCL
|
||||||
pool = torch.cuda.MemPool(backend.mem_allocator, symm_mem=True)
|
pool = torch.cuda.MemPool(backend.mem_allocator, symmetric=True)
|
||||||
|
|
||||||
# allocate memory with ncclMemAlloc
|
# allocate memory with ncclMemAlloc
|
||||||
# note: symmetric kernels are not available for dtypes like torch.int64
|
# note: symmetric kernels are not available for dtypes like torch.int64
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user