Update
[ghstack-poisoned]
@@ -4,7 +4,7 @@ set -eux -o pipefail

 GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}

 if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0"
+    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
 fi

 SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
@@ -52,6 +52,8 @@ fi

 if [[ "$image" == *-jammy* ]]; then
   UBUNTU_VERSION=22.04
+elif [[ "$image" == *-noble* ]]; then
+  UBUNTU_VERSION=24.04
 elif [[ "$image" == *ubuntu* ]]; then
   extract_version_from_image_name ubuntu UBUNTU_VERSION
 fi
@@ -230,8 +232,12 @@ case "$tag" in
     UCC_COMMIT=${_UCC_COMMIT}
     INDUCTOR_BENCHMARKS=yes
     ;;
-  pytorch-linux-jammy-rocm-n-py3)
-    ANACONDA_PYTHON_VERSION=3.10
+  pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3)
+    if [[ $tag =~ "jammy" ]]; then
+      ANACONDA_PYTHON_VERSION=3.10
+    else
+      ANACONDA_PYTHON_VERSION=3.12
+    fi
     GCC_VERSION=11
     VISION=yes
     ROCM_VERSION=6.4
@@ -322,6 +328,8 @@ case "$tag" in
     GCC_VERSION=11
     ACL=yes
     VISION=yes
     CONDA_CMAKE=yes
+    OPENBLAS=yes
     # snadampal: skipping llvm src build install because the current version
     # from pytorch/llvm:9.0.1 is x86 specific
     SKIP_LLVM_SRC_BUILD_INSTALL=yes
@@ -331,6 +339,8 @@ case "$tag" in
     GCC_VERSION=11
     ACL=yes
     VISION=yes
     CONDA_CMAKE=yes
+    OPENBLAS=yes
     # snadampal: skipping llvm src build install because the current version
     # from pytorch/llvm:9.0.1 is x86 specific
     SKIP_LLVM_SRC_BUILD_INSTALL=yes
@@ -417,6 +427,7 @@ docker build \
   --build-arg "XPU_VERSION=${XPU_VERSION}" \
   --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
   --build-arg "ACL=${ACL:-}" \
+  --build-arg "OPENBLAS=${OPENBLAS:-}" \
   --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
   --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
   -f $(dirname ${DOCKERFILE})/Dockerfile \
@@ -23,6 +23,10 @@ conda_install() {
   as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
 }

+conda_install_through_forge() {
+  as_jenkins conda install -c conda-forge -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
+}
+
 conda_run() {
   as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output $*
 }
@@ -15,6 +15,9 @@ install_ubuntu() {
   elif [[ "$UBUNTU_VERSION" == "22.04"* ]]; then
     cmake3="cmake=3.22*"
     maybe_libiomp_dev=""
+  elif [[ "$UBUNTU_VERSION" == "24.04"* ]]; then
+    cmake3="cmake=3.28*"
+    maybe_libiomp_dev=""
   else
     cmake3="cmake=3.5*"
     maybe_libiomp_dev="libiomp-dev"
@@ -70,10 +70,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
   fi

   # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
-  if [[ $(uname -m) == "aarch64" ]]; then
-    conda_install "openblas==0.3.29=*openmp*"
-  else
-    conda_install "mkl=2021.4.0 mkl-include=2021.4.0"
+  if [[ $(uname -m) != "aarch64" ]]; then
+    pip_install mkl==2024.2.0
+    pip_install mkl-static==2024.2.0
+    pip_install mkl-include==2024.2.0
   fi

   # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
@@ -87,6 +87,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
     conda_run ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION})
   fi

+  if [[ "$UBUNTU_VERSION" == "24.04"* ]] ; then
+    conda_install_through_forge libstdcxx-ng=14
+  fi
+
   # Install some other packages, including those needed for Python test reporting
   pip_install -r /opt/conda/requirements-ci.txt
@@ -4,8 +4,9 @@
 set -ex

 cd /
-git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.29}" --depth 1 --shallow-submodules
+git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules

+OPENBLAS_CHECKOUT_DIR="OpenBLAS"
 OPENBLAS_BUILD_FLAGS="
 NUM_THREADS=128
 USE_OPENMP=1
@@ -13,9 +14,8 @@ NO_SHARED=0
 DYNAMIC_ARCH=1
 TARGET=ARMV8
 CFLAGS=-O3
+BUILD_BFLOAT16=1
 "
-
-OPENBLAS_CHECKOUT_DIR="OpenBLAS"

 make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
 make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}
@@ -8,9 +8,11 @@ ver() {

 install_ubuntu() {
   apt-get update
-  if [[ $UBUNTU_VERSION == 20.04 ]]; then
-    # gpg-agent is not available by default on 20.04
-    apt-get install -y --no-install-recommends gpg-agent
+  # gpg-agent is not available by default
+  apt-get install -y --no-install-recommends gpg-agent
+  if [[ $(ver $UBUNTU_VERSION) -ge $(ver 22.04) ]]; then
     echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \
       | sudo tee /etc/apt/preferences.d/rocm-pin-600
   fi
   apt-get install -y kmod
   apt-get install -y wget
@@ -85,13 +87,14 @@ EOF
     VER_STR=6.3
   fi
   # clr build needs CppHeaderParser but can only find it using conda's python
-  /opt/conda/bin/python -m pip install CppHeaderParser
+  python -m pip install CppHeaderParser
   git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
   HIP_COMMON_DIR=$(readlink -f HIP)
   git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}${VER_PATCH}-statco-hotfix
   mkdir -p clr/build
   pushd clr/build
-  cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
+  # Need to point CMake to the correct python installation to find CppHeaderParser
+  cmake .. -DPython3_EXECUTABLE=/opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}/bin/python3 -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
   make -j
   cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.*
   popd
@@ -41,7 +41,7 @@ case ${image} in
     GPU_IMAGE=arm64v8/almalinux:8
     DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1"
     MANY_LINUX_VERSION="2_28_aarch64"
-    OPENBLAS_VERSION="v0.3.29"
+    OPENBLAS_VERSION="v0.3.30"
     ;;
 manylinuxcxx11-abi-builder:cpu-cxx11-abi)
     TARGET=final
@@ -16,6 +16,7 @@ click
 #test that import:

 coremltools==5.0b5 ; python_version < "3.12"
+coremltools==8.3 ; python_version == "3.12"
 #Description: Apple framework for ML integration
 #Pinned versions: 5.0b5
 #test that import:
@@ -63,6 +64,7 @@ lark==0.12.0
 #test that import:

 librosa>=0.6.2 ; python_version < "3.11"
+librosa==0.10.2 ; python_version == "3.12"
 #Description: A python package for music and audio analysis
 #Pinned versions: >=0.6.2
 #test that import: test_spectral_ops.py
@@ -111,6 +113,7 @@ ninja==1.11.1.3
 numba==0.49.0 ; python_version < "3.9"
 numba==0.55.2 ; python_version == "3.9"
 numba==0.55.2 ; python_version == "3.10"
+numba==0.60.0 ; python_version == "3.12"
 #Description: Just-In-Time Compiler for Numerical Functions
 #Pinned versions: 0.54.1, 0.49.0, <=0.49.1
 #test that import: test_numba_integration.py
@@ -360,10 +363,11 @@ pwlf==2.2.1

 # To build PyTorch itself
 astunparse
-PyYAML
+pyyaml
+pyzstd
 setuptools
 six
 wheel

 scons==4.5.2 ; platform_machine == "aarch64"
@@ -5,7 +5,7 @@ sphinx==5.3.0

 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought is probably
-# something related to Docker setup. We can investigate this later
+# something related to Docker setup. We can investigate this later.

 sphinxcontrib.katex==0.8.6
 #Description: This is used to generate PyTorch docs
@@ -147,6 +147,12 @@ RUN if [ -n "${ACL}" ]; then bash ./install_acl.sh; fi
 RUN rm install_acl.sh
 ENV INSTALLED_ACL ${ACL}

+ARG OPENBLAS
+COPY ./common/install_openblas.sh install_openblas.sh
+RUN if [ -n "${OPENBLAS}" ]; then bash ./install_openblas.sh; fi
+RUN rm install_openblas.sh
+ENV INSTALLED_OPENBLAS ${OPENBLAS}
+
 # Install ccache/sccache (do this last, so we get priority in PATH)
 ARG SKIP_SCCACHE_INSTALL
 COPY ./common/install_cache.sh install_cache.sh
@@ -104,7 +104,7 @@ if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
   export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr
 fi

-echo "Calling setup.py install at $(date)"
+echo "Calling 'python -m pip install .' at $(date)"

 if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
   STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"
@@ -120,7 +120,7 @@ fi
 # TODO: Remove this flag once https://github.com/pytorch/pytorch/issues/55952 is closed
 CFLAGS='-Wno-deprecated-declarations' \
 BUILD_LIBTORCH_CPU_WITH_DEBUG=1 \
-python setup.py install
+python -m pip install --no-build-isolation -v .

 mkdir -p libtorch/{lib,bin,include,share}
@@ -185,7 +185,7 @@ torchbench_setup_macos() {
 }

 pip_benchmark_deps() {
-  python -mpip install --no-input astunparse requests cython scikit-learn
+  python -mpip install --no-input requests cython scikit-learn six
 }
@@ -436,11 +436,11 @@ test_inductor_aoti() {
     python3 tools/amd_build/build_amd.py
   fi
   if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
-    BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop)
+    BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
     # TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
     TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
   else
-    BUILD_COMMAND=(python setup.py develop)
+    BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
     TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
   fi
@@ -1579,7 +1579,7 @@ test_operator_benchmark() {
   test_inductor_set_cpu_affinity

   cd benchmarks/operator_benchmark/pt_extension
-  python setup.py install
+  python -m pip install .

   cd "${TEST_DIR}"/benchmarks/operator_benchmark
   $TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \
@@ -42,7 +42,7 @@ call choco upgrade -y cmake --no-progress --installargs 'ADD_CMAKE_TO_PATH=Syste
 if errorlevel 1 goto fail
 if not errorlevel 0 goto fail

-call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0
+call pip install mkl==2024.2.0 mkl-static==2024.2.0 mkl-include==2024.2.0
 if errorlevel 1 goto fail
 if not errorlevel 0 goto fail
@@ -61,8 +61,8 @@ You are now all set to start developing with PyTorch in a DevContainer environme
 ## Step 8: Build PyTorch

 To build pytorch from source, simply run:
-```
-python setup.py develop
+```bash
+python -m pip install --no-build-isolation -v -e .
 ```

 The process involves compiling thousands of files, and would take a long time. Fortunately, the compiled objects can be useful for your next build. When you modify some files, you only need to compile the changed files the next time.
.github/ci_commit_pins/audio.txt
@@ -1 +1 @@
-70caf76066ef2c1054d6128b11769dc816a779e7
+6c57850358f34c47802db216b0746e4e9d08a95a
.github/scripts/td_llm_indexer.sh
@@ -6,7 +6,7 @@ set -euxo pipefail
 cd llm-target-determinator
 pip install -q -r requirements.txt
 cd ../codellama
-pip install -e .
+pip install --no-build-isolation -v -e .
 pip install numpy==1.26.0

 # Run indexer
.github/workflows/_linux-build.yml
@@ -131,6 +131,9 @@ jobs:
       if: inputs.build-environment != 'linux-s390x-binary-manywheel'
       with:
         github-secret: ${{ secrets.GITHUB_TOKEN }}
+        instructions: |
+          Build is done inside the container, to start an interactive session run:
+            docker exec -it $(docker container ps --format '{{.ID}}') bash

   # [pytorch repo ref]
   # Use a pytorch/pytorch reference instead of a reference to the local
.github/workflows/_mac-test.yml
@@ -88,6 +88,14 @@ jobs:
           pkill "${PROCESS}" || true
         done

+    - name: Clean up brew miniconda, if installed
+      continue-on-error: true
+      run: |
+        if brew list miniconda; then
+          brew uninstall miniconda
+          echo "REINSTALL_BREW_MINICONDA=1" >> "${GITHUB_ENV}"
+        fi
+
     - name: Clean up leftover local python3 site-packages on MacOS pet runner
       continue-on-error: true
       run: |
@@ -268,6 +276,14 @@ jobs:
         workflow_attempt: ${{github.run_attempt}}
         local_path: usage_log.txt

+    - name: Reinstall brew miniconda, if was installed
+      if: always()
+      continue-on-error: true
+      run: |
+        if [[ -n "$REINSTALL_BREW_MINICONDA" ]]; then
+          brew install miniconda
+        fi
+
     - name: Clean up disk space
       if: always()
       continue-on-error: true
.github/workflows/docker-builds.yml
@@ -63,6 +63,7 @@ jobs:
           pytorch-linux-jammy-py3.13-clang12,
           pytorch-linux-jammy-rocm-n-1-py3,
           pytorch-linux-jammy-rocm-n-py3,
+          pytorch-linux-noble-rocm-n-py3,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
           pytorch-linux-jammy-py3.9-gcc11,
           pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,
.github/workflows/rocm-mi300.yml
@@ -36,15 +36,15 @@ jobs:
       curr_branch: ${{ github.head_ref || github.ref_name }}
       curr_ref_type: ${{ github.ref_type }}

-  linux-jammy-rocm-py3_10-build:
+  linux-noble-rocm-py3_12-build:
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-    name: linux-jammy-rocm-py3.10-mi300
+    name: linux-noble-rocm-py3.12-mi300
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10-mi300
-      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
+      build-environment: linux-noble-rocm-py3.12-mi300
+      docker-image-name: ci-image:pytorch-linux-noble-rocm-n-py3
       sync-tag: rocm-build
       test-matrix: |
         { include: [
@@ -57,17 +57,17 @@ jobs:
         ]}
     secrets: inherit

-  linux-jammy-rocm-py3_10-test:
+  linux-noble-rocm-py3_12-test:
     permissions:
       id-token: write
       contents: read
-    name: linux-jammy-rocm-py3.10-mi300
+    name: linux-noble-rocm-py3.12-mi300
     uses: ./.github/workflows/_rocm-test.yml
     needs:
-      - linux-jammy-rocm-py3_10-build
+      - linux-noble-rocm-py3_12-build
       - target-determination
     with:
-      build-environment: linux-jammy-rocm-py3.10-mi300
-      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
+      build-environment: linux-noble-rocm-py3.12-mi300
+      docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
     secrets: inherit
@@ -231,7 +231,8 @@ include_patterns = [
     'c10/**/*.cpp',
     'c10/**/*.h',
     'torch/*.h',
-    'torch/_inductor/codegen/aoti_runtime/interface.cpp',
+    'torch/_inductor/codegen/aoti_runtime/*.h',
+    'torch/_inductor/codegen/aoti_runtime/*.cpp',
     'torch/csrc/*.h',
     'torch/csrc/*.cpp',
     'torch/csrc/**/*.h',
@@ -1476,6 +1477,31 @@ init_command = [
     'tomli==2.2.1 ; python_version < "3.11"',
 ]

+[[linter]]
+code = 'CMAKE_MINIMUM_REQUIRED'
+command = [
+    'python3',
+    'tools/linter/adapters/cmake_minimum_required_linter.py',
+    '--',
+    '@{{PATHSFILE}}'
+]
+include_patterns = [
+    "**/pyproject.toml",
+    "**/CMakeLists.txt",
+    "**/CMakeLists.txt.in",
+    "**/*.cmake",
+    "**/*.cmake.in",
+    "**/*requirements*.txt",
+    "**/*requirements*.in",
+]
+init_command = [
+    'python3',
+    'tools/linter/adapters/pip_init.py',
+    '--dry-run={{DRYRUN}}',
+    'packaging==25.0',
+    'tomli==2.2.1 ; python_version < "3.11"',
+]
+
 [[linter]]
 code = 'COPYRIGHT'
 include_patterns = ['**']
@@ -88,20 +88,19 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows

 * If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below.

-* When installing with `python setup.py develop` (in contrast to `python setup.py install`) Python runtime will use
+* When installing with `python -m pip install -e .` (in contrast to `python -m pip install .`) Python runtime will use
   the current local source-tree when importing `torch` package. (This is done by creating [`.egg-link`](https://wiki.python.org/moin/PythonPackagingTerminology#egg-link) file in `site-packages` folder)
   This way you do not need to repeatedly install after modifying Python files (`.py`).
-  However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or
-  non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...).
+  However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...).

-One way to avoid running `python setup.py develop` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac,
+One way to avoid running `python -m pip install -e .` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac,
 is to create a symbolic link from `build` folder to `torch/lib`, for example, by issuing following:
 ```bash
 pushd torch/lib; sh -c "ln -sf ../../build/lib/libtorch_cpu.* ."; popd
 ```
 Afterwards rebuilding a library (for example to rebuild `libtorch_cpu.so` issue `ninja torch_cpu` from `build` folder),
 would be sufficient to make change visible in `torch` package.

 To reinstall, first uninstall all existing PyTorch installs. You may need to run `pip
@@ -115,9 +114,9 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
 pip uninstall torch
 ```

-Next run `python setup.py clean`. After that, you can install in `develop` mode again.
+Next run `python setup.py clean`. After that, you can install in editable mode again.

-* If you run into errors when running `python setup.py develop`, here are some debugging steps:
+* If you run into errors when running `python -m pip install -e .`, here are some debugging steps:
   1. Run `printf '#include <stdio.h>\nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure
      your CMake works and can compile this simple Hello World program without errors.
   2. Nuke your `build` directory. The `setup.py` script compiles binaries into the `build` folder and caches many
@@ -130,13 +129,20 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
    git clean -xdf
    python setup.py clean
    git submodule update --init --recursive
-   python setup.py develop
+   python -m pip install -r requirements.txt
+   python -m pip install --no-build-isolation -v -e .
    ```
-4. The main step within `python setup.py develop` is running `make` from the `build` directory. If you want to
+4. The main step within `python -m pip install -e .` is running `cmake --build build` from the `build` directory. If you want to
    experiment with some environment variables, you can pass them into the command:
    ```bash
-   ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* python setup.py develop
+   ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* CMAKE_FRESH=1 python -m pip install --no-build-isolation -v -e .
    ```
+5. Try installing PyTorch without build isolation by adding `--no-build-isolation` to the `pip install` command.
+   This will use the current environment's packages instead of creating a new isolated environment for the build.
+   ```bash
+   python -m pip install --no-build-isolation -v -e .
+   ```

 * If you run into issue running `git submodule update --init --recursive`. Please try the following:
   - If you encounter an error such as
@@ -639,9 +645,9 @@ can be selected interactively with your mouse to zoom in on a particular part of
 the program execution timeline. The `--native` command-line option tells
 `py-spy` to record stack frame entries for PyTorch C++ code. To get line numbers
 for C++ code it may be necessary to compile PyTorch in debug mode by prepending
-your `setup.py develop` call to compile PyTorch with `DEBUG=1`. Depending on
-your operating system it may also be necessary to run `py-spy` with root
-privileges.
+your `python -m pip install -e .` call to compile PyTorch with `DEBUG=1`.
+Depending on your operating system it may also be necessary to run `py-spy` with
+root privileges.

 `py-spy` can also work in an `htop`-like "live profiling" mode and can be
 tweaked to adjust the stack sampling rate, see the `py-spy` readme for more
@@ -649,7 +655,7 @@ details.

 ## Managing multiple build trees

-One downside to using `python setup.py develop` is that your development
+One downside to using `python -m pip install -e .` is that your development
 version of PyTorch will be installed globally on your account (e.g., if
 you run `import torch` anywhere else, the development version will be
 used).
@@ -663,7 +669,7 @@ specific build of PyTorch. To set one up:
 python -m venv pytorch-myfeature
 source pytorch-myfeature/bin/activate # or `& .\pytorch-myfeature\Scripts\Activate.ps1` on Windows
 # if you run python now, torch will NOT be installed
-python setup.py develop
+python -m pip install --no-build-isolation -v -e .
 ```

 ## C++ development tips
@@ -701,7 +707,9 @@ variables `DEBUG`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_CUDA`, `USE_FLASH_ATTEN
 For example:

 ```bash
-DEBUG=1 USE_DISTRIBUTED=0 USE_MKLDNN=0 USE_CUDA=0 BUILD_TEST=0 USE_FBGEMM=0 USE_NNPACK=0 USE_QNNPACK=0 USE_XNNPACK=0 python setup.py develop
+DEBUG=1 USE_DISTRIBUTED=0 USE_MKLDNN=0 USE_CUDA=0 BUILD_TEST=0 \
+  USE_FBGEMM=0 USE_NNPACK=0 USE_QNNPACK=0 USE_XNNPACK=0 \
+  python -m pip install --no-build-isolation -v -e .
 ```

 For subsequent builds (i.e., when `build/CMakeCache.txt` exists), the build
@@ -711,7 +719,7 @@ options.

 ### Code completion and IDE support

-When using `python setup.py develop`, PyTorch will generate
+When using `python -m pip install -e .`, PyTorch will generate
 a `compile_commands.json` file that can be used by many editors
 to provide command completion and error highlighting for PyTorch's
 C++ code. You need to `pip install ninja` to generate accurate
@@ -772,7 +780,7 @@ If not, you can define these variables on the command line before invoking `setu
 export CMAKE_C_COMPILER_LAUNCHER=ccache
 export CMAKE_CXX_COMPILER_LAUNCHER=ccache
 export CMAKE_CUDA_COMPILER_LAUNCHER=ccache
-python setup.py develop
+python -m pip install --no-build-isolation -v -e .
 ```

 #### Use a faster linker
@@ -785,7 +793,7 @@ If you are editing a single file and rebuilding in a tight loop, the time spent
 Starting with CMake 3.29, you can specify the linker type using the [`CMAKE_LINKER_TYPE`](https://cmake.org/cmake/help/latest/variable/CMAKE_LINKER_TYPE.html) variable. For example, with `mold` installed:

 ```sh
-CMAKE_LINKER_TYPE=MOLD python setup.py develop
+CMAKE_LINKER_TYPE=MOLD python -m pip install --no-build-isolation -v -e .
 ```

 #### Use pre-compiled headers
@@ -797,7 +805,7 @@ setting `USE_PRECOMPILED_HEADERS=1` either on first setup, or in the
 `CMakeCache.txt` file.

 ```sh
-USE_PRECOMPILED_HEADERS=1 python setup.py develop
+USE_PRECOMPILED_HEADERS=1 python -m pip install --no-build-isolation -v -e .
 ```

 This adds a build step where the compiler takes `<ATen/ATen.h>` and essentially
@@ -820,7 +828,7 @@ A compiler-wrapper to fix this is provided in `tools/nvcc_fix_deps.py`. You can
 this as a compiler launcher, similar to `ccache`
 ```bash
 export CMAKE_CUDA_COMPILER_LAUNCHER="python;`pwd`/tools/nvcc_fix_deps.py;ccache"
-python setup.py develop
+python -m pip install --no-build-isolation -v -e .
 ```

 ### Rebuild few files with debug information
@@ -1171,7 +1179,7 @@ build_with_asan()
   CFLAGS="-fsanitize=address -fno-sanitize-recover=all -shared-libasan -pthread" \
   CXX_FLAGS="-pthread" \
   USE_CUDA=0 USE_OPENMP=0 USE_DISTRIBUTED=0 DEBUG=1 \
-  python setup.py develop
+  python -m pip install --no-build-isolation -v -e .
 }

 run_with_asan()
@@ -57,7 +57,7 @@ RUN --mount=type=cache,target=/opt/ccache \
     export eval ${CMAKE_VARS} && \
     TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
     CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
-    python setup.py install
+    python -m pip install --no-build-isolation -v .

 FROM conda as conda-installs
 ARG PYTHON_VERSION=3.11
README.md
@@ -228,6 +228,7 @@ If you want to disable Intel GPU support, export the environment variable `USE_X
 Other potentially useful environment variables may be found in `setup.py`.

 #### Get the PyTorch Source

 ```bash
 git clone https://github.com/pytorch/pytorch
 cd pytorch
@@ -279,24 +280,29 @@ conda install -c conda-forge libuv=1.39
 ```

 #### Install PyTorch

 **On Linux**

 If you're compiling for AMD ROCm then first run this command:

 ```bash
 # Only run this if you're compiling for ROCm
 python tools/amd_build/build_amd.py
 ```

 Install PyTorch

 ```bash
 export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
-python setup.py develop
+python -m pip install -r requirements.txt
+python -m pip install --no-build-isolation -v -e .
 ```

 **On macOS**

 ```bash
-python3 setup.py develop
+python -m pip install -r requirements.txt
+python -m pip install --no-build-isolation -v -e .
 ```

 **On Windows**
@@ -308,7 +314,7 @@ If you want to build legacy python code, please refer to [Building on legacy cod
 In this mode PyTorch computations will run on your CPU, not your GPU.

 ```cmd
-python setup.py develop
+python -m pip install --no-build-isolation -v -e .
 ```

 Note on OpenMP: The desired OpenMP implementation is Intel OpenMP (iomp). In order to link against iomp, you'll need to manually download the library and set up the building environment by tweaking `CMAKE_INCLUDE_PATH` and `LIB`. The instruction [here](https://github.com/pytorch/pytorch/blob/main/docs/source/notes/windows.rst#building-from-source) is an example for setting up both MKL and Intel OpenMP. Without these configurations for CMake, Microsoft Visual C OpenMP runtime (vcomp) will be used.
@@ -329,7 +335,6 @@ Additional libraries such as

 You can refer to the [build_pytorch.bat](https://github.com/pytorch/pytorch/blob/main/.ci/pytorch/win-test-helpers/build_pytorch.bat) script for some other environment variables configurations

-
 ```cmd
 cmd
@@ -349,8 +354,7 @@ for /f "usebackq tokens=*" %i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\
 :: [Optional] If you want to override the CUDA host compiler
 set CUDAHOSTCXX=C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\HostX64\x64\cl.exe

-python setup.py develop
-
+python -m pip install --no-build-isolation -v -e .
 ```

 **Intel GPU builds**
@@ -372,7 +376,7 @@ if defined CMAKE_PREFIX_PATH (
   set "CMAKE_PREFIX_PATH=%CONDA_PREFIX%\Library"
 )

-python setup.py develop
+python -m pip install --no-build-isolation -v -e .
 ```

 ##### Adjust Build Options (Optional)
@@ -382,6 +386,7 @@ the following. For example, adjusting the pre-detected directories for CuDNN or
 with such a step.

 On Linux

 ```bash
 export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
 CMAKE_ONLY=1 python setup.py build
@@ -389,6 +394,7 @@ ccmake build # or cmake-gui build
 ```

 On macOS

 ```bash
 export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
 MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ CMAKE_ONLY=1 python setup.py build
@@ -131,69 +131,25 @@ uint64_t CPUGeneratorImpl::seed() {

 /**
  * Sets the internal state of CPUGeneratorImpl. The new internal state
- * must be a strided CPU byte tensor and of the same size as either
- * CPUGeneratorImplStateLegacy (for legacy CPU generator state) or
- * CPUGeneratorImplState (for new state).
- *
- * FIXME: Remove support of the legacy state in the future?
+ * must be a strided CPU byte tensor and of the same size as CPUGeneratorImplState.
  */
 void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
   using detail::CPUGeneratorImplState;
-  using detail::CPUGeneratorImplStateLegacy;

-  static_assert(std::is_standard_layout_v<CPUGeneratorImplStateLegacy>, "CPUGeneratorImplStateLegacy is not a PODType");
   static_assert(std::is_standard_layout_v<CPUGeneratorImplState>, "CPUGeneratorImplState is not a PODType");

-  static const size_t size_legacy = sizeof(CPUGeneratorImplStateLegacy);
-  static const size_t size_current = sizeof(CPUGeneratorImplState);
-  static_assert(size_legacy != size_current, "CPUGeneratorImplStateLegacy and CPUGeneratorImplState can't be of the same size");
+  constexpr size_t size = sizeof(CPUGeneratorImplState);

   detail::check_rng_state(new_state);

   at::mt19937 engine;
-  auto float_normal_sample = std::optional<float>();
-  auto double_normal_sample = std::optional<double>();

-  // Construct the state of at::CPUGeneratorImpl based on input byte tensor size.
-  CPUGeneratorImplStateLegacy* legacy_pod{nullptr};
   auto new_state_size = new_state.numel();
-  if (new_state_size == size_legacy) {
-    legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data();
-    // Note that in CPUGeneratorImplStateLegacy, we didn't have float version
-    // of normal sample and hence we leave the std::optional<float> as is
-
-    // Update next_double_normal_sample.
-    // Note that CPUGeneratorImplStateLegacy stores two uniform values (normal_x, normal_y)
-    // and a rho value (normal_rho). These three values were redundant and in the new
-    // DistributionsHelper.h, we store the actual extra normal sample, rather than three
-    // intermediate values.
-    if (legacy_pod->normal_is_valid) {
-      auto r = legacy_pod->normal_rho;
-      auto theta = 2.0 * c10::pi<double> * legacy_pod->normal_x;
-      // we return the sin version of the normal sample when in caching mode
-      double_normal_sample = std::optional<double>(r * ::sin(theta));
-    }
-  } else if (new_state_size == size_current) {
-    auto rng_state = (CPUGeneratorImplState*)new_state.data();
-    legacy_pod = &rng_state->legacy_pod;
-    // update next_float_normal_sample
-    if (rng_state->is_next_float_normal_sample_valid) {
-      float_normal_sample = std::optional<float>(rng_state->next_float_normal_sample);
-    }
-
-    // Update next_double_normal_sample.
-    // Note that in getRNGState, we now return the actual normal sample in normal_y
-    // and if it's valid in normal_is_valid. The redundant normal_x and normal_rho
-    // are squashed to 0.0.
-    if (legacy_pod->normal_is_valid) {
-      double_normal_sample = std::optional<double>(legacy_pod->normal_y);
-    }
-  } else {
-    TORCH_CHECK(false, "Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy,
-                " or a CPUGeneratorImplState of size ", size_current,
-                " but found the input RNG state size to be ", new_state_size);
-  }
+  TORCH_CHECK(new_state_size == size, "Expected a CPUGeneratorImplState of size ", size,
+              " but found the input RNG state size to be ", new_state_size);

+  auto rng_state = new_state.data_ptr_impl<CPUGeneratorImplState>();
+  auto legacy_pod = &(rng_state->legacy_pod);
   // construct engine_
   // Note that CPUGeneratorImplStateLegacy stored a state array of 64 bit uints, whereas in our
   // redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are
@@ -207,8 +163,12 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
   engine.set_data(rng_data);
   TORCH_CHECK(engine.is_valid(), "Invalid mt19937 state");
   this->engine_ = engine;
-  this->next_float_normal_sample_ = float_normal_sample;
-  this->next_double_normal_sample_ = double_normal_sample;
+  this->next_float_normal_sample_ = rng_state->is_next_float_normal_sample_valid
+      ? std::optional<float>(rng_state->next_float_normal_sample)
+      : std::optional<float>();
+  this->next_double_normal_sample_ = legacy_pod->normal_is_valid
+      ? std::optional<double>(legacy_pod->normal_y)
+      : std::optional<double>();
 }

 /**
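Background for the normal-sample fields this hunk consolidates: Box-Muller produces normals in pairs, so the generator state caches the unused second sample (`normal_y` / `next_float_normal_sample`). A self-contained sketch of that caching pattern; the names here are invented, and this is not the ATen implementation:

```cpp
#include <cmath>
#include <optional>
#include <random>

// Toy stand-in for the cached-normal-sample state in CPUGeneratorImplState.
struct ToyNormalSource {
  std::mt19937 engine{42};
  std::optional<double> cached;  // the "extra" sample the RNG state stores

  double next() {
    if (cached) {
      double v = *cached;
      cached.reset();
      return v;
    }
    std::uniform_real_distribution<double> uni(0.0, 1.0);
    const double two_pi = 6.283185307179586;
    double u1 = uni(engine), u2 = uni(engine);
    double r = std::sqrt(-2.0 * std::log1p(-u1));  // log1p avoids log(0)
    double theta = two_pi * u2;
    cached = r * std::sin(theta);  // second of the Box-Muller pair
    return r * std::cos(theta);
  }
};
```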
@@ -431,7 +431,8 @@ class TORCH_API Context {
       at::SDPBackend::flash_attention,
       at::SDPBackend::efficient_attention,
       at::SDPBackend::math,
-      at::SDPBackend::cudnn_attention};
+      at::SDPBackend::cudnn_attention,
+      at::SDPBackend::overrideable};
   bool enabled_flashSDP = true;
   bool enabled_mem_efficientSDP = true;
   bool enabled_mathSDP = true;
@@ -26,9 +26,7 @@ inline void infer_size_impl(
   std::optional<int64_t> infer_dim;
   for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) {
     if (TORCH_GUARD_OR_FALSE(sym_eq(shape[dim], -1))) {
-      if (infer_dim) {
-        throw std::runtime_error("only one dimension can be inferred");
-      }
+      TORCH_CHECK(!infer_dim, "only one dimension can be inferred");
       infer_dim = dim;
     } else {
       // in case of unbacked shape[dim] we assume it's not -1 and add a runtime
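For context, the rule this check enforces, sketched with plain integers (illustrative only; the real code works on SymInts and adds runtime guards for unbacked dims):

```cpp
#include <cstdint>
#include <stdexcept>
#include <vector>

// Fill in the single -1 entry of a requested shape so the element counts
// match, rejecting shapes with more than one inferred dimension.
std::vector<int64_t> infer_size(std::vector<int64_t> shape, int64_t numel) {
  int64_t known = 1;
  int infer_dim = -1;
  for (int i = 0; i < static_cast<int>(shape.size()); ++i) {
    if (shape[i] == -1) {
      if (infer_dim != -1) {
        throw std::runtime_error("only one dimension can be inferred");
      }
      infer_dim = i;
    } else {
      known *= shape[i];
    }
  }
  if (infer_dim != -1 && known != 0 && numel % known == 0) {
    shape[infer_dim] = numel / known;  // e.g. {2, -1} with numel=6 -> {2, 3}
  }
  return shape;
}
```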
@@ -214,7 +214,7 @@ inline Tensor applySlice(
       "step must be greater than zero");

   // See NOTE [nested tensor size for indexing]
-  if (self_sizes.has_value()) {
+  if (self_sizes.has_value() && self_sizes.value().size() > 0) {
     // Skip this optimization if we are tracing, as the trace may be polymorphic
     // over the shape of the `self` tensor, and we still want to record
     // the slice.
@@ -223,7 +223,7 @@ inline Tensor applySlice(
         : self.sym_size(dim);
     if (!disable_slice_optimization &&
         TORCH_STATICALLY_KNOWN_TRUE(start.sym_eq(0)) &&
-        TORCH_STATICALLY_KNOWN_TRUE(length.sym_eq(stop)) && step == 1) {
+        TORCH_STATICALLY_KNOWN_TRUE(length.sym_le(stop)) && step == 1) {
       return self;
     }
   }
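The relaxed condition above can be stated with plain integers; a minimal sketch (not the SymInt-aware implementation):

```cpp
#include <cstdint>

// A slice [0:stop:1] leaves a dimension unchanged whenever stop covers the
// whole dimension, i.e. stop >= dim_size, not only when stop == dim_size.
bool slice_is_noop(int64_t start, int64_t stop, int64_t step, int64_t dim_size) {
  return start == 0 && step == 1 && dim_size <= stop;
}
```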
@@ -59,9 +59,7 @@ struct TORCH_API Generator {

   explicit Generator(c10::intrusive_ptr<c10::GeneratorImpl> gen_impl)
       : impl_(std::move(gen_impl)) {
-    if (impl_.get() == nullptr) {
-      throw std::runtime_error("GeneratorImpl with nullptr is not supported");
-    }
+    TORCH_CHECK(impl_, "GeneratorImpl with nullptr is not supported");
   }

   bool operator==(const Generator& rhs) const {
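This commit repeatedly swaps ad-hoc `throw std::runtime_error(...)` blocks for single `TORCH_CHECK(cond, ...)` calls. A minimal sketch of the shape of such a check helper; `TORCH_CHECK` itself lives in c10 and also records file/line context, which this stand-in does not:

```cpp
#include <sstream>
#include <stdexcept>

// Variadic condition check: folds all message pieces into one string and
// throws only when the condition fails.
template <typename... Args>
void check_or_throw(bool cond, Args&&... parts) {
  if (!cond) {
    std::ostringstream ss;
    (ss << ... << parts);  // C++17 fold over stream insertion
    throw std::runtime_error(ss.str());
  }
}
// usage: check_or_throw(impl != nullptr, "Impl with nullptr is not supported");
```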
@@ -98,9 +98,7 @@ class TORCH_API TensorBase {
   explicit TensorBase(
       c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
       : impl_(std::move(tensor_impl)) {
-    if (impl_.get() == nullptr) {
-      throw std::runtime_error("TensorImpl with nullptr is not supported");
-    }
+    TORCH_CHECK(impl_, "TensorImpl with nullptr is not supported");
   }
   TensorBase(const TensorBase&) = default;
   TensorBase(TensorBase&&) noexcept = default;
@@ -68,11 +68,10 @@ Symbol InternedStrings::_symbol(const std::string& s) {
     return it->second;

   auto pos = s.find("::");
-  if (pos == std::string::npos) {
-    std::stringstream ss;
-    ss << "all symbols must have a namespace, <namespace>::<string>, but found: " << s;
-    throw std::runtime_error(ss.str());
-  }
+  TORCH_CHECK(
+      pos != std::string::npos,
+      "all symbols must have a namespace, <namespace>::<string>, but found: ",
+      s);
   Symbol ns = _symbol("namespaces::" + s.substr(0, pos));

   Symbol sym(sym_to_info_.size());
@@ -121,12 +120,11 @@ std::string Symbol::domainString() const {
 }

 Symbol Symbol::fromDomainAndUnqualString(const std::string & d, const std::string & s) {
-  if (d.compare(0, domain_prefix().size(), domain_prefix()) != 0) {
-    std::ostringstream ss;
-    ss << "Symbol: domain string is expected to be prefixed with '"
-       << domain_prefix() << "', e.g. 'org.pytorch.aten'";
-    throw std::runtime_error(ss.str());
-  }
+  TORCH_CHECK(
+      d.compare(0, domain_prefix().size(), domain_prefix()) == 0,
+      "Symbol: domain string is expected to be prefixed with '",
+      domain_prefix(),
+      "', e.g. 'org.pytorch.aten'");
   std::string qualString = d.substr(domain_prefix().size()) + "::" + s;
   return fromQualString(qualString);
 }
@@ -406,8 +406,7 @@ size_t IValue::hash(const IValue& v) {
     case Tag::Enum:
     case Tag::Stream:
     case Tag::Uninitialized:
-      throw std::runtime_error(
-          "unhashable type: '" + v.type()->repr_str() + "'");
+      TORCH_CHECK(false, "unhashable type: '" + v.type()->repr_str() + "'");
   }
   // the above switch should be exhaustive
   TORCH_INTERNAL_ASSERT(false, "we should never reach here")
@@ -116,10 +116,9 @@ struct SingleElementType : public SharedType {

 protected:
   SingleElementType(TypePtr elem) : SharedType(Kind), elem(std::move(elem)) {
-    if (!this->elem) {
-      throw std::runtime_error(c10::str(
-            "Can not create ", typeKindToString(Kind), " with None type"));
-    }
+    TORCH_CHECK(
+        this->elem,
+        c10::str("Can not create ", typeKindToString(Kind), " with None type"));
   }

 private:
@@ -416,16 +415,12 @@ struct TORCH_API SymbolicShape {
   }

   ShapeSymbol operator[](size_t i) const {
-    if (!dims_) {
-      throw std::runtime_error("Rank isn't fixed");
-    }
+    TORCH_CHECK(dims_, "Rank isn't fixed");
     return (*dims_).at(i);
   }

   ShapeSymbol at(size_t i) const {
-    if (!dims_) {
-      throw std::runtime_error("Rank isn't fixed");
-    }
+    TORCH_CHECK(dims_, "Rank isn't fixed");
     return (*dims_).at(i);
   }
@@ -520,9 +515,7 @@ struct VaryingShape {
   }

   const std::optional<T> &operator[](size_t i) const {
-    if (!dims_) {
-      throw std::runtime_error("Rank isn't fixed");
-    }
+    TORCH_CHECK(dims_, "Rank isn't fixed");
     return (*dims_).at(i);
   }
@@ -957,9 +950,7 @@ struct TORCH_API DictType : public SharedType {

   TypePtr createWithContained(
       std::vector<TypePtr> contained_types) const override {
-    if (contained_types.size() != 2) {
-      throw std::runtime_error("Expected 2 contained types");
-    }
+    TORCH_CHECK(contained_types.size() == 2, "Expected 2 contained types");
     return create(std::move(contained_types.at(0)), std::move(contained_types.at(1)));
   }
@@ -826,9 +826,7 @@ TupleType::TupleType(
     : NamedType(TypeKind::TupleType, std::move(name)),
       elements_(std::move(elements)),
       has_free_variables_(std::any_of(elements_.begin(), elements_.end(), [](const TypePtr& v) {
-        if (!v) {
-          throw std::runtime_error("Can not create tuple with None type");
-        }
+        TORCH_CHECK(v, "Can not create tuple with None type");
         return v->hasFreeVariables();
       })), schema_(std::move(schema)) {
@@ -163,6 +163,9 @@ class Vectorized<BFloat16> {
   Vectorized<BFloat16> exp_u20() const {
     return exp();
   }
+  Vectorized<BFloat16> fexp_u20() const {
+    return exp();
+  }
   Vectorized<BFloat16> fmod(const Vectorized<BFloat16>& q) const;
   Vectorized<BFloat16> hypot(const Vectorized<BFloat16>& b) const;
   Vectorized<BFloat16> i0() const;
@@ -249,6 +249,9 @@ class Vectorized<double> {
   Vectorized<double> exp_u20() const {
     return exp();
   }
+  Vectorized<double> fexp_u20() const {
+    return exp();
+  }
   Vectorized<double> fmod(const Vectorized<double>& q) const {USE_SLEEF(
       { return Vectorized<double>(Sleef_fmoddx_sve(values, q)); },
       {
@@ -314,6 +314,9 @@ class Vectorized<float> {
   Vectorized<float> exp_u20() const {
     return exp();
   }
+  Vectorized<float> fexp_u20() const {
+    return exp();
+  }
   Vectorized<float> fmod(const Vectorized<float>& q) const {USE_SLEEF(
       { return Vectorized<float>(Sleef_fmodfx_sve(values, q)); },
       {
@@ -308,6 +308,9 @@ class Vectorized<float> {
   Vectorized<float> exp_u20() const {
     return exp();
   }
+  Vectorized<float> fexp_u20() const {
+    return exp();
+  }
   DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(
       fmod,
       Sleef_fmodf4)
@@ -206,6 +206,10 @@ struct Vectorized16 {
     return static_cast<const Derived*>(this)->map_with_vec_float_method(
         &Vectorized<float>::exp_u20);
   }
+  Derived fexp_u20() const {
+    return static_cast<const Derived*>(this)->map_with_vec_float_method(
+        &Vectorized<float>::exp_u20);
+  }
   Derived fmod(const Derived& q) const {
     // This function is questionable with a conversion, so we use map2
     return map2(q, std::fmod);
@@ -488,6 +488,9 @@ class Vectorized16 {
   Vectorized<T> expm1() const {
     return map(Sleef_expm1f8_u10);
   }
+  Vectorized<T> fexp_u20() const {
+    return exp();
+  }
   Vectorized<T> exp_u20() const {
     return exp();
   }
@@ -198,6 +198,9 @@ class Vectorized<double> {
   Vectorized<double> exp_u20() const {
     return exp();
   }
+  Vectorized<double> fexp_u20() const {
+    return exp();
+  }
   Vectorized<double> fmod(const Vectorized<double>& q) const {
     return Vectorized<double>(Sleef_fmodd4(values, q));
   }
@@ -1,5 +1,4 @@
 #pragma once
-
 // DO NOT DEFINE STATIC DATA IN THIS HEADER!
 // See Note [Do not compile initializers with AVX]

@@ -256,6 +255,63 @@ class Vectorized<float> {
   Vectorized<float> expm1() const {
     return Vectorized<float>(Sleef_expm1f8_u10(values));
   }
+  Vectorized<float> fexp_u20() const {
+    const __m256 vec_c0 = _mm256_set1_ps(0.00010703434948458272f);
+    const __m256 vec_c1 = _mm256_set1_ps(0.30354260500649682f);
+    const __m256 vec_c2 = _mm256_set1_ps(-0.22433836478672356);
+    const __m256 vec_c3 = _mm256_set1_ps(-0.079204240219773236);
+
+    const __m256 vec_exp_log2ef =
+        _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e)
+
+    const __m256 vec_a = _mm256_set1_ps(std::pow(2, 23) / std::log2(2));
+    const __m256 vec_b = _mm256_set1_ps(std::pow(2, 23) * 127.f);
+
+    const __m256 vec_ln_flt_min =
+        _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50));
+    const __m256 vec_ln_flt_max =
+        _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218));
+    const __m256 vec_inf = _mm256_set1_ps(INFINITY);
+    const __m256 zero = _mm256_setzero_ps();
+
+    // exp(x) = 2**(x * log2(e))
+    //        = 2**xi * 2**xf - TIPS we are using the EEEE floating point
+    //        representation with identification to the exponent and the
+    //        mentissa
+    // 2**xf will be approximated to a polynomial of degree 3 computed with
+    // Horner method
+    // compute the min/max for the mask
+    // Masks
+    __m256 mask_too_small =
+        _mm256_cmp_ps(values, vec_ln_flt_min, _CMP_LT_OS); // x < min
+    __m256 mask_too_large =
+        _mm256_cmp_ps(values, vec_ln_flt_max, _CMP_GT_OS); // x > max
+
+    // transformation with log2(e)
+    auto vec_src = _mm256_mul_ps(values, vec_exp_log2ef);
+    auto vec_fractional = _mm256_sub_ps(vec_src, _mm256_floor_ps(vec_src));
+
+    // compute polynomial using Horner Scheme
+    auto vec_res = _mm256_fmadd_ps(vec_fractional, vec_c3, vec_c2);
+    vec_res = _mm256_fmadd_ps(vec_fractional, vec_res, vec_c1);
+    vec_res = _mm256_fmadd_ps(vec_fractional, vec_res, vec_c0);
+
+    vec_src = _mm256_sub_ps(vec_src, vec_res);
+    // // the tips is here, headache in perspective
+    auto tmp = _mm256_fmadd_ps(vec_a, vec_src, vec_b);
+    // headache bis
+    __m256i casted_integer = _mm256_cvttps_epi32(tmp);
+    // bitwise to float for the final transformation
+    auto result = _mm256_castsi256_ps(casted_integer);
+    // boundary condition
+    // Set to 0 where x < ln(FLT_MIN)
+    result = _mm256_blendv_ps(result, zero, mask_too_small);
+    // Set to +inf where x > ln(FLT_MAX)
+    result = _mm256_blendv_ps(result, vec_inf, mask_too_large);
+    // final interpretation to float
+    return result;
+  }

   Vectorized<float> exp_u20() const {
     // A faster version of exp with ULP=20
     const __m256 vec_factorial_1 =
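For readers of the new `fexp_u20`: a scalar port of the same approximation (a Schraudolph-style exponent-bit trick with a degree-3 correction polynomial, per the Malossi et al. reference cited in the AVX-512 variant). This is an illustrative sketch with constants copied from the block above, not PyTorch code, and it is deliberately low-precision:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

float fexp_scalar(float x) {
  const float c0 = 0.00010703434948458272f;
  const float c1 = 0.30354260500649682f;
  const float c2 = -0.22433836478672356f;
  const float c3 = -0.079204240219773236f;
  const float ln_flt_min = -87.336546f;  // ~ln(FLT_MIN), matches 0xc2aeac50
  const float ln_flt_max = 88.722839f;   // ~ln(FLT_MAX), matches 0x42b17218
  if (x < ln_flt_min) return 0.0f;       // underflow -> 0
  if (x > ln_flt_max) return INFINITY;   // overflow -> +inf

  float t = x * 1.442695041f;            // x * log2(e), so exp(x) = 2**t
  float f = t - std::floor(t);           // fractional part of the exponent
  // degree-3 Horner polynomial correcting the mantissa approximation
  float p = ((c3 * f + c2) * f + c1) * f + c0;
  // write t - p into the IEEE-754 bit pattern: scale by 2^23 into the
  // exponent field and add the bias 127 * 2^23, then reinterpret as float
  int32_t bits = (int32_t)((t - p) * 8388608.0f + 8388608.0f * 127.0f);
  float r;
  std::memcpy(&r, &bits, sizeof r);
  return r;
}

int main() {
  for (float x : {-2.0f, 0.0f, 1.0f, 5.0f}) {
    std::printf("x=%g approx=%g exact=%g\n", x, fexp_scalar(x), std::exp(x));
  }
}
```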
@@ -121,27 +121,52 @@ typename std::enable_if_t<
 }

 template <typename T>
-typename std::enable_if_t<
-    std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
-    at::vec::Vectorized<
-        T>> inline convert_float_to_int8(at::vec::Vectorized<float> src) {
+at::vec::Vectorized<T> inline convert_float_to_int8(
+    at::vec::Vectorized<float> src);
+
+template <>
+at::vec::Vectorized<int8_t> inline convert_float_to_int8(
+    at::vec::Vectorized<float> src) {
   // Convert from float32 to int32 with truncation
   __m256i x_values_int32 = _mm256_cvttps_epi32(src);

   // Convert from int32 to int16 using signed saturation
   __m256i xy_packed_v = _mm256_packs_epi32(x_values_int32, x_values_int32);

-  constexpr auto min_val = std::numeric_limits<T>::min();
-  constexpr auto max_val = std::numeric_limits<T>::max();
+  constexpr auto min_val = std::numeric_limits<int8_t>::min();
+  constexpr auto max_val = std::numeric_limits<int8_t>::max();

-  // Convert from int16 to uint8/int8 using unsigned saturation
-  __m256i xyzw_clamped_v =
-      pack_saturate_and_clamp<T>(xy_packed_v, xy_packed_v, min_val, max_val);
+  // Convert from int16 to int8 using unsigned saturation
+  __m256i xyzw_clamped_v = pack_saturate_and_clamp<int8_t>(
+      xy_packed_v, xy_packed_v, min_val, max_val);
   __m256i permute_mask_v =
       _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
   return _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v);
 }

+template <>
+at::vec::Vectorized<uint8_t> inline convert_float_to_int8(
+    at::vec::Vectorized<float> src) {
+  // The type of *_val should be int32_t to ensure correct clamping behavior.
+  constexpr auto min_val = std::numeric_limits<int32_t>::min();
+  constexpr auto max_val = std::numeric_limits<int32_t>::max();
+  __m256 float32_min_val = _mm256_set1_ps(float(min_val));
+  __m256 float32_max_val = _mm256_set1_ps(float(max_val));
+  __m256 float32_src = _mm256_max_ps(src, float32_min_val);
+  float32_src = _mm256_min_ps(float32_src, float32_max_val);
+  __m256i truncated_src = _mm256_cvttps_epi32(float32_src);
+
+  __m128i r1 = _mm256_castsi256_si128(truncated_src);
+  __m128i mask = _mm_setr_epi8(
+      0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+  __m128i r1_shuffled = _mm_shuffle_epi8(r1, mask);
+  __m128i r2 = _mm256_extractf128_si256(truncated_src, 1);
+  __m128i r2_shuffled = _mm_shuffle_epi8(r2, mask);
+  __m128i result = _mm_unpacklo_epi32(r1_shuffled, r2_shuffled);
+
+  return _mm256_castsi128_si256(result);
+}
+
 template <typename T>
 __FORCE_INLINE void QuantizeAvx2(
     const float* src,
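A scalar approximation of the `int8_t` specialization above, for orientation. This is an illustrative sketch only: the SIMD path reaches the same range via `cvttps` plus two packing steps, so behavior at float extremes (where `cvttps` yields the INT32_MIN sentinel) can differ from this version:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

int8_t convert_one_to_int8(float x) {
  // truncate toward zero, then saturate to [-128, 127];
  // clamping in float before the cast avoids undefined behavior
  float t = std::trunc(x);
  t = std::min(std::max(t, -128.0f), 127.0f);
  return static_cast<int8_t>(t);
}
```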
@@ -273,6 +273,9 @@ class Vectorized<double> {
   Vectorized<double> C10_ALWAYS_INLINE exp_u20() const {
     return exp();
   }
+  Vectorized<double> C10_ALWAYS_INLINE fexp_u20() const {
+    return exp();
+  }

   Vectorized<double> lgamma() const __ubsan_ignore_undefined__ {
     return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)};
@@ -352,6 +352,9 @@ class Vectorized<float> {
   Vectorized<float> C10_ALWAYS_INLINE exp_u20() const {
     return exp();
   }
+  Vectorized<float> C10_ALWAYS_INLINE fexp_u20() const {
+    return exp();
+  }

   Vectorized<float> C10_ALWAYS_INLINE log() const {
     return {Sleef_logf4_u10(_vec0), Sleef_logf4_u10(_vec1)};
@@ -1023,6 +1023,9 @@ struct Vectorized<T, std::enable_if_t<is_zarch_implemented<T>()>> {
   Vectorized<T> exp_u20() const {
     return exp();
   }
+  Vectorized<T> fexp_u20() const {
+    return exp();
+  }

   Vectorized<T> log() const {
     return mapSleef(Sleef_logf4_u10, Sleef_logd2_u10);
@@ -535,6 +535,9 @@ class Vectorized16 {
   Vectorized<T> expm1() const {
     return map(Sleef_expm1f16_u10);
   }
+  Vectorized<T> fexp_u20() const {
+    return exp();
+  }
   Vectorized<T> exp_u20() const {
     return exp();
   }
@@ -221,6 +221,9 @@ class Vectorized<double> {
   Vectorized<double> exp_u20() const {
     return exp();
   }
+  Vectorized<double> fexp_u20() const {
+    return exp();
+  }
   Vectorized<double> fmod(const Vectorized<double>& q) const {
     return Vectorized<double>(Sleef_fmodd8(values, q));
   }
@@ -310,6 +310,60 @@ class Vectorized<float> {
   Vectorized<float> expm1() const {
     return Vectorized<float>(Sleef_expm1f16_u10(values));
   }
+  Vectorized<float> fexp_u20() const {
+    const __m512 vec_c0 = _mm512_set1_ps(0.00010703434948458272f);
+    const __m512 vec_c1 = _mm512_set1_ps(0.30354260500649682f);
+    const __m512 vec_c2 = _mm512_set1_ps(-0.22433836478672356);
+    const __m512 vec_c3 = _mm512_set1_ps(-0.079204240219773236);
+
+    const __m512 vec_exp_log2ef =
+        _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e)
+
+    const __m512 vec_a = _mm512_set1_ps(std::pow(2, 23) / std::log2(2));
+    const __m512 vec_b = _mm512_set1_ps(std::pow(2, 23) * 127.f);
+
+    const __m512 vec_ln_flt_min =
+        _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
+    const __m512 vec_ln_flt_max =
+        _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
+    __m512i vec_infinity = _mm512_set1_epi32(0x7F800000);
+    __m512i vec_zero = _mm512_setzero_epi32();
+
+    // Fast Exponential Computation on SIMD Architectures
+    // A. Cristiano I. Malossi, Yves Ineichen, Costas Bekas, and Alessandro
+    // Curioni exp(x) = 2**(x * log2(e))
+    //        = 2**xi * 2**xf - TIPS we are using the EEEE floating point
+    //        representation with identification to the exponent and the
+    //        mentissa
+    // 2**xf will be approximated to a polynomial of degree 3 computed with
+    // Horner method
+    // mask for the boundary condition
+    auto min_mask = _mm512_cmp_ps_mask(values, vec_ln_flt_min, _CMP_LT_OS);
+    auto max_mask = _mm512_cmp_ps_mask(values, vec_ln_flt_max, _CMP_GT_OS);
+
+    // transformation with log2(e)
+    auto vec_src = _mm512_mul_ps(values, vec_exp_log2ef);
+    auto vec_fractional = _mm512_sub_ps(vec_src, _mm512_floor_ps(vec_src));
+
+    // compute polynomial using Horner Scheme, for superscalar processor
+    auto vec_res = _mm512_fmadd_ps(vec_fractional, vec_c3, vec_c2);
+    vec_res = _mm512_fmadd_ps(vec_fractional, vec_res, vec_c1);
+    vec_res = _mm512_fmadd_ps(vec_fractional, vec_res, vec_c0);
+
+    vec_src = _mm512_sub_ps(vec_src, vec_res);
+    // the tips is here, headache in perspective
+    auto tmp = _mm512_fmadd_ps(vec_a, vec_src, vec_b);
+    // headache bis - we loose precision with the cast but it "fits", but ok
+    // after f32 -> f16 later
+    __m512i casted_integer = _mm512_cvttps_epi32(tmp);
+    // boundary condition, lower than the min -> 0
+    casted_integer = _mm512_mask_mov_epi32(casted_integer, min_mask, vec_zero);
+    // boundary condition, larger than the max -> +oo
+    casted_integer =
+        _mm512_mask_mov_epi32(casted_integer, max_mask, vec_infinity);
+    // final interpretation to float
+    return _mm512_castsi512_ps(casted_integer);
+  }
   Vectorized<float> exp_u20() const {
     // A faster version of exp with ULP=20
     const __m512 vec_factorial_1 =
@ -123,22 +123,24 @@ typename std::enable_if_t<
}

template <typename T>
typename std::enable_if_t<
    std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
    at::vec::Vectorized<
        T>> inline convert_float_to_int8(at::vec::Vectorized<float> src) {
at::vec::Vectorized<T> inline convert_float_to_int8(
    at::vec::Vectorized<float> src);

template <>
at::vec::Vectorized<int8_t> inline convert_float_to_int8(
    at::vec::Vectorized<float> src) {
  // Convert from float32 to int32 with truncation
  __m512i x_values_int32 = _mm512_cvttps_epi32(src);

  // Convert from int32 to int16 using signed saturation
  __m512i xy_packed_v = _mm512_packs_epi32(x_values_int32, x_values_int32);

  constexpr auto min_val = std::numeric_limits<T>::min();
  constexpr auto max_val = std::numeric_limits<T>::max();
  constexpr auto min_val = std::numeric_limits<int8_t>::min();
  constexpr auto max_val = std::numeric_limits<int8_t>::max();

  // Convert from int16 to uint8/int8 using unsigned saturation
  __m512i xyzw_clamped_v =
      pack_saturate_and_clamp<T>(xy_packed_v, xy_packed_v, min_val, max_val);
  // Convert from int16 to int8 using unsigned saturation
  __m512i xyzw_clamped_v = pack_saturate_and_clamp<int8_t>(
      xy_packed_v, xy_packed_v, min_val, max_val);
  __m512i permute_mask_v = _mm512_set_epi32(
      0x0f,
      0x0b,
@ -159,6 +161,21 @@ typename std::enable_if_t<
  return _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v);
}

template <>
at::vec::Vectorized<uint8_t> inline convert_float_to_int8(
    at::vec::Vectorized<float> src) {
  // The type of *_val should be int32_t to ensure correct clamping behavior.
  constexpr auto min_val = std::numeric_limits<int32_t>::min();
  constexpr auto max_val = std::numeric_limits<int32_t>::max();
  __m512 float32_min_val = _mm512_set1_ps(float(min_val));
  __m512 float32_max_val = _mm512_set1_ps(float(max_val));
  __m512 float32_src = _mm512_max_ps(src, float32_min_val);
  float32_src = _mm512_min_ps(float32_src, float32_max_val);
  __m512i int32_src_clamped = _mm512_cvttps_epi32(float32_src);
  __m128i int8_src = _mm512_cvtepi32_epi8(int32_src_clamped);
  return _mm512_castsi128_si512(int8_src);
}

template <typename T>
__FORCE_INLINE void QuantizeAvx512(
    const float* src,

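As a side note on the new `uint8_t` path: a scalar rendering of the clamp-then-truncate sequence may make the intent clearer. This is an editorial sketch; the helper name is invented, and it models `_mm512_cvtepi32_epi8`'s keep-the-low-byte truncation for a single lane.

```cpp
// Scalar sketch of the uint8_t conversion path above (illustrative only).
#include <algorithm>
#include <cstdint>
#include <limits>

uint8_t convert_float_to_uint8_sketch(float v) {
  // Clamp in float so the float->int32 conversion cannot overflow (UB).
  const float lo = static_cast<float>(std::numeric_limits<int32_t>::min());
  const float hi = static_cast<float>(std::numeric_limits<int32_t>::max());
  float clamped = std::min(std::max(v, lo), hi);
  int32_t as_i32 = static_cast<int32_t>(clamped);  // truncate toward zero
  return static_cast<uint8_t>(as_i32);             // keep the low 8 bits
}
```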
@ -238,9 +238,6 @@ struct Vectorized {
  Vectorized vector;
  int_same_size_t<T> buffer[size()];
  mask.store(buffer);
#if defined(__clang__) && __ARM_FEATURE_SVE
#pragma clang loop vectorize(disable)
#endif
  for (const auto i : c10::irange(size())) {
    if (buffer[i] & 0x01) {
      vector[i] = b[i];
@ -547,6 +544,9 @@ struct Vectorized {
  Vectorized<T> exp_u20() const {
    return map(std::exp);
  }
  Vectorized<T> fexp_u20() const {
    return map(std::exp);
  }
  Vectorized<T> frac() const {
    return *this - this->trunc();
  }

@ -263,6 +263,7 @@ class VectorizedN {
  VECTORIZEDN_DEFINE_UNARY_OP(exp2)
  VECTORIZEDN_DEFINE_UNARY_OP(expm1)
  VECTORIZEDN_DEFINE_UNARY_OP(exp_u20)
  VECTORIZEDN_DEFINE_UNARY_OP(fexp_u20)
  VECTORIZEDN_DEFINE_UNARY_OP(frac)
  VECTORIZEDN_DEFINE_BINARY_OP(fmod)
  VECTORIZEDN_DEFINE_UNARY_OP(log)

@ -94,9 +94,10 @@ static std::vector<std::optional<Tensor>> batchIndices(
    if (index.has_value() && index->sym_numel() != 0) {
      const auto idx_bdim = indices_bdims[i];
      indices_.emplace_back(maybePadToLogicalRank(moveBatchDimToFront(index.value(), idx_bdim), idx_bdim, maxLogicalRank));
      if (index.value().dtype() == kBool && indices_bdims[i].has_value()) {
        throw std::runtime_error("vmap: We do not support batching operators that can support dynamic shape. Attempting to batch over indexing with a boolean mask.");
      }
      TORCH_CHECK(
          index.value().dtype() != kBool || !indices_bdims[i].has_value(),
          "vmap: We do not support batching operators that can support ",
          "dynamic shape. Attempting to batch over indexing with a boolean mask.");
    } else {
      indices_.push_back(index);
    }

@ -16,11 +16,14 @@ template<typename O, typename C>
static void _assert_match(const O& original, const C& compared, const std::string& name) {
  if (compared) {
    bool equal = (original == compared.value());
    if (!equal) {
      std::stringstream msg;
      msg << "Tensor " << name << " mismatch! Expected: " << compared.value() << ", Got: " << original;
      throw std::runtime_error(msg.str());
    }
    TORCH_CHECK(
        equal,
        "Tensor ",
        name,
        " mismatch! Expected: ",
        compared.value(),
        ", Got: ",
        original);
  }
}

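Several hunks in this commit perform the same migration from a manual `throw std::runtime_error(...)` to `TORCH_CHECK(...)`. Condensed into one self-contained sketch (editorial; it mirrors the hunks in this commit rather than adding new behavior):

```cpp
// The recurring migration, condensed. TORCH_CHECK raises c10::Error with the
// source location attached and only stringifies its variadic message
// arguments when the check actually fails.
#include <c10/util/Exception.h>
#include <cstdint>
#include <stdexcept>
#include <vector>

void check_matrix(const std::vector<int64_t>& sizes) {
  // Before: manual test plus a bare std::runtime_error with a preformatted
  // message and no source location.
  if (sizes.size() != 2) {
    throw std::runtime_error("expected matrix input");
  }
  // After: one macro, same condition inverted.
  TORCH_CHECK(sizes.size() == 2, "expected matrix input");
}
```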
@ -424,6 +424,14 @@ Tensor _dirichlet_grad_cpu(const Tensor& x, const Tensor& alpha, const Tensor& t
*/

Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, std::optional<Generator> gen) {
  TORCH_CHECK_VALUE(
      at::isFloatingType(count.scalar_type()),
      "binomial only supports floating-point dtypes for count, got: ",
      count.scalar_type());
  TORCH_CHECK_VALUE(
      at::isFloatingType(prob.scalar_type()),
      "binomial only supports floating-point dtypes for prob, got: ",
      prob.scalar_type());
  Tensor ret = at::zeros(count.sizes(), count.options());
  auto iter = TensorIteratorConfig()
      .add_output(ret)

@ -180,9 +180,7 @@ TORCH_IMPL_FUNC(triu_cpu)(const Tensor& self, int64_t k, const Tensor &result) {
}

Tensor trace_backward_symint(const Tensor& grad, c10::SymIntArrayRef sizes) {
  if (sizes.size() != 2) {
    throw std::runtime_error("expected matrix input");
  }
  TORCH_CHECK(sizes.size() == 2, "expected matrix input");

  auto grad_input = at::zeros_symint(sizes[0] * sizes[1], grad.options());
  auto indices = at::arange(0, grad_input.numel(), sizes[1] + 1, grad.options().dtype(at::kLong));

@ -62,7 +62,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
  virtual std::optional<at::Tensor> bias() = 0;

  virtual void set_bias(const std::optional<at::Tensor>& bias) {
    throw std::runtime_error(
    TORCH_CHECK(
        false,
        "set_bias is not implemented for this packed "
        "parameter type");
  }

@ -96,7 +96,14 @@ inline void _exp_reduce_sum_fusion_kernel(
  for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) {
    auto tmp0 = vec::Vectorized<T1>::loadu(a + i);
    auto tmp1 = tmp0 - vec_max;
    auto tmp2 = tmp1.exp_u20();
    Vectorized<T1> tmp2;
    if constexpr (std::is_same_v<T1, float> &&
        (std::is_same_v<T2, at::BFloat16> || std::is_same_v<T2, at::Half>)) {
      tmp2 = tmp1.fexp_u20();
    } else {
      tmp2 = tmp1.exp_u20();
    }
    vec_tmp_sum += tmp2;
    _store(out + i, tmp2);
  }

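The kernel now chooses the cheaper exponential only when it accumulates in float but stores to a 16-bit type, where the extra error is hidden by the narrowing store. A stripped-down model of that compile-time dispatch (editorial sketch; `fast_exp`/`accurate_exp` stand in for `fexp_u20`/`exp_u20`, and `uint16_t` stands in for `at::Half`/`at::BFloat16`):

```cpp
// Minimal model of the if-constexpr dispatch above (illustrative only).
#include <cmath>
#include <cstdint>
#include <type_traits>

float fast_exp(float x) { return std::exp(x); }  // stand-in for fexp_u20
template <typename T>
T accurate_exp(T x) { return std::exp(x); }      // stand-in for exp_u20

template <typename Acc, typename Out>
Acc exp_dispatch(Acc x) {
  if constexpr (std::is_same_v<Acc, float> && sizeof(Out) == 2) {
    return fast_exp(x);      // cheaper path; error absorbed by f32 -> f16 store
  } else {
    return accurate_exp(x);  // full-precision path
  }
}

// Usage: exp_dispatch<float, uint16_t>(1.0f) takes the fast path,
//        exp_dispatch<float, float>(1.0f) takes the accurate one.
```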
@ -48,12 +48,7 @@ __global__ void prepare_grouped_gemm_data(
    int32_t start = tid == 0 ? 0 : offs[tid - 1];
    delta = offs[tid] - start;
    if (K < 0) {
      if (!a_row_major && b_row_major) {
        CUDA_KERNEL_ASSERT(delta >= 0 && "expected offsets to be greater or equal 0\n");
      } else {
        // CUTLASS cannot handle delta=0 here.
        CUDA_KERNEL_ASSERT(delta > 0 && "expected offsets to be greater than 0\n");
      }
      CUDA_KERNEL_ASSERT(delta >= 0 && "expected offsets to be greater or equal 0\n");
    }

    // TMA transfers require global memory tensor addresses to be

@ -337,6 +337,7 @@ Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization,
#include <cmath>

#include <mkl_dfti.h>
#include <mkl_version.h>
#include <ATen/mkl/Exceptions.h>
#include <ATen/mkl/Descriptors.h>
#include <ATen/mkl/Limits.h>
@ -479,6 +480,19 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes,
  const auto value_type = c10::toRealValueType(input.scalar_type());
  out.resize_(batched_out_sizes, MemoryFormat::Contiguous);

  // fix mkl issue
  // https://github.com/pytorch/pytorch/issues/154477
#ifdef INTEL_MKL_VERSION
#if INTEL_MKL_VERSION > 20210400L
  for (const auto& stride : input.strides()) {
    if (stride == 0) {
      input = input.clone(MemoryFormat::Contiguous);
      break;
    }
  }
#endif
#endif

  auto descriptor = _plan_mkl_fft(
      input.strides(), out.strides(), signal_size, input.is_complex(),
      out.is_complex(), normalization, forward, value_type);

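The clone guards against zero-stride inputs, which typically come from broadcasting views. A small ATen snippet showing how such a stride arises (editorial sketch, not part of the diff):

```cpp
// How a zero stride reaches the FFT path (sketch; uses ATen's C++ API).
#include <ATen/ATen.h>

void zero_stride_demo() {
  at::Tensor base = at::randn({1, 8});
  at::Tensor broadcast = base.expand({4, 8});  // no copy; stride 0 on dim 0
  // broadcast.strides() == {0, 1}; newer MKL DFTI versions mishandle such
  // descriptors, hence the clone to a contiguous buffer before planning.
}
```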
@ -79,14 +79,16 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) {
  // 2. Math fallback
  auto& ctx = at::globalContext();
  // use overrideable linked to onednn as overrideable implementation
  if (!ctx.userEnabledMathSDP() && !ctx.userEnabledOverrideableSDP()) {
  if (!ctx.userEnabledMathSDP() && !ctx.userEnabledOverrideableSDP() &&
      !ctx.userEnabledFlashSDP()) {
    return sdp::SDPBackend::error;
  }

  // Get ideal kernel ordering
  const std::array<sdp::SDPBackend, 2> priority_order{
  const std::array<sdp::SDPBackend, 3> priority_order{
      sdp::SDPBackend::overrideable,
      sdp::SDPBackend::math,
      sdp::SDPBackend::flash_attention,
  };

  // Because TORCH_CHECK checks if condition is true we negate debug so that
@ -105,6 +107,14 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) {
        return sdp::SDPBackend::math;
      }
      break;
    case sdp::SDPBackend::flash_attention:
      if (ctx.userEnabledFlashSDP() &&
          use_overrideable_xpu(kernel_params, print_debug)) {
        TORCH_WARN(
            "Flash Attention is not supported on XPU, falling back to overrideable kernel.");
        return sdp::SDPBackend::overrideable;
      }
      break;
    default:
      TORCH_CHECK(false, "Invalid backend");
  }
@ -141,7 +151,7 @@ int64_t _fused_sdp_choice_xpu(
  TORCH_CHECK(
      false,
      "No viable backend for scaled_dot_product_attention was found. ",
      "This is likely due to turning off both the math kernel and the fused kernels.");
      "This is likely due to turning off both the math kernel and the overrideable kernels.");
  }
  return static_cast<int64_t>(backend);
}

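The surrounding selection loop sits outside this hunk. As an editorial aid, here is a self-contained mini-model of the XPU fallback policy the cases above implement; the names are illustrative, not the real `sdp::` API:

```cpp
// Mini-model of the XPU backend-selection policy (illustrative names).
#include <array>

enum class Backend { overrideable, math, flash_attention, error };

Backend select(bool math_ok, bool overrideable_ok, bool flash_ok) {
  if (!math_ok && !overrideable_ok && !flash_ok) {
    return Backend::error;  // every user-facing knob is disabled
  }
  const std::array<Backend, 3> priority_order{
      Backend::overrideable, Backend::math, Backend::flash_attention};
  for (Backend b : priority_order) {
    switch (b) {
      case Backend::overrideable:
        if (overrideable_ok) return b;  // oneDNN-backed kernel first
        break;
      case Backend::math:
        if (math_ok) return b;          // then the math fallback
        break;
      case Backend::flash_attention:
        // XPU has no flash kernel; such requests are served by the
        // overrideable (oneDNN) implementation instead.
        if (flash_ok) return Backend::overrideable;
        break;
      default:
        return Backend::error;
    }
  }
  return Backend::error;
}
```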
@ -1,8 +0,0 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/TensorFactory.h>
#include <c10/core/ScalarType.h>
#include <unordered_map>

using namespace at::mps;
@ -18,8 +18,6 @@
#include <ATen/native/Resize.h>
#include <ATen/native/TensorAdvancedIndexing.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/operations/Indexing.h>
#include <c10/core/QScheme.h>
#include <c10/util/SmallVector.h>
#include <c10/util/irange.h>
#include <fmt/format.h>

@ -746,7 +746,7 @@ inline std::tuple<bool, Tensor, Tensor> NestedTensor_compute_size_stride(
    }
    else if (size_reshaped == -1) {
      if (infer_index > -1) {
        throw std::runtime_error("only one dimension can be inferred");
        TORCH_CHECK(false, "only one dimension can be inferred");
      }
      else {
        infer_index = idim;

@ -19,7 +19,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
      double /*output_scale*/,
      int64_t /*output_zero_point*/,
      at::Tensor& output) {
    throw std::runtime_error(
    TORCH_CHECK(
        false,
        "apply_out is not implemented for this packed "
        "parameter type");
    return output;
@ -30,7 +31,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
      double /*output_scale*/,
      int64_t /*output_zero_point*/,
      at::Tensor& output) {
    throw std::runtime_error(
    TORCH_CHECK(
        false,
        "apply_relu_out is not implemented for this packed "
        "parameter type");
    return output;
@ -55,7 +57,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
      at::Tensor input,
      double input_scale,
      int64_t input_zero_point) {
    throw std::runtime_error(
    TORCH_CHECK(
        false,
        "apply_with_input_q_dq_qweight_dq_output_fp32 is not implemented for this packed "
        "parameter type");
    return {};
@ -79,7 +82,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
      at::Tensor input,
      double input_scale,
      int64_t input_zero_point) {
    throw std::runtime_error(
    TORCH_CHECK(
        false,
        "apply_with_input_q_dq_qweight_dq_relu_output_fp32 is not implemented for this packed "
        "parameter type");
    return {};
@ -96,7 +100,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
      const at::Tensor& /* input */,
      at::Tensor& output,
      bool /* reduce_range */) {
    throw std::runtime_error(
    TORCH_CHECK(
        false,
        "apply_dynamic_out is not implemented for this packed "
        "parameter type");
    return output;
@ -105,7 +110,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
      const at::Tensor& /* input */,
      at::Tensor& output,
      bool /* reduce_range */) {
    throw std::runtime_error(
    TORCH_CHECK(
        false,
        "apply_dynamic_relu_out is not implemented for this packed "
        "parameter type");
    return output;
@ -116,7 +122,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
  virtual std::optional<at::Tensor> bias() = 0;

  virtual void set_bias(std::optional<at::Tensor> /*bias*/) {
    throw std::runtime_error(
    TORCH_CHECK(
        false,
        "set_bias is not implemented for this packed "
        "parameter type");
  }

@ -843,6 +843,11 @@ SDPBackend select_sdp_backend(sdp_params const& kernel_params) {
        return SDPBackend::math;
      }
      break;
    case SDPBackend::overrideable:
      if (ctx.userEnabledOverrideableSDP()) {
        TORCH_CHECK(false, "Invalid backend");
      }
      break;
    default:
      TORCH_CHECK(false, "Invalid backend");
  }

@ -8,12 +8,12 @@ It also provides mechanisms to compare PyTorch with other frameworks.
Make sure you're on a machine with CUDA, torchvision, and pytorch installed. Install in the following order:
```
# Install torchvision. It comes with the pytorch stable release binary
pip3 install torch torchvision
python -m pip install torch torchvision

# Install the latest pytorch master from source.
# It should supersede the installation from the release binary.
cd $PYTORCH_HOME
python setup.py build develop
python -m pip install --no-build-isolation -v -e .

# Check the pytorch installation version
python -c "import torch; print(torch.__version__)"

@ -27,7 +27,7 @@ pull-deps: clone-deps
	(cd ../../../torchbenchmark && git fetch && git checkout "$$(cat ../pytorch/.github/ci_commit_pins/torchbench.txt)" && git submodule update --init --recursive)

build-deps: clone-deps
	uv pip install astunparse numpy scipy ninja pyyaml mkl mkl-include setuptools cmake \
	uv pip install numpy scipy ninja pyyaml six mkl mkl-include setuptools wheel cmake \
	  typing-extensions requests protobuf numba cython scikit-learn librosa
	(cd ../../../torchvision && uv pip install -e . --no-build-isolation)
	(cd ../../../torchdata && uv pip install -e .)

@ -210,7 +210,7 @@ mobilenet_v2,pass,0




mobilenet_v2_quantized_qat,pass,2
mobilenet_v2_quantized_qat,pass,3




@ -274,7 +274,7 @@ resnet50,pass,0




resnet50_quantized_qat,pass,2
resnet50_quantized_qat,pass,3




@ -210,7 +210,7 @@ mobilenet_v2,pass,0




mobilenet_v2_quantized_qat,pass,2
mobilenet_v2_quantized_qat,pass,3




@ -274,7 +274,7 @@ resnet50,pass,0




resnet50_quantized_qat,pass,2
resnet50_quantized_qat,pass,3




@ -210,7 +210,7 @@ mobilenet_v2,pass,0




mobilenet_v2_quantized_qat,pass,2
mobilenet_v2_quantized_qat,pass,3




@ -274,7 +274,7 @@ resnet50,pass,0




resnet50_quantized_qat,pass,2
resnet50_quantized_qat,pass,3




@ -194,7 +194,7 @@ mobilenet_v2,pass,0




mobilenet_v2_quantized_qat,pass,2
mobilenet_v2_quantized_qat,pass,3




@ -258,7 +258,7 @@ resnet50,pass,0




resnet50_quantized_qat,pass,2
resnet50_quantized_qat,pass,3




@ -210,7 +210,7 @@ mobilenet_v2,pass,0




mobilenet_v2_quantized_qat,pass,2
mobilenet_v2_quantized_qat,pass,3




@ -274,7 +274,7 @@ resnet50,pass,0




resnet50_quantized_qat,pass,2
resnet50_quantized_qat,pass,3




@ -17,8 +17,8 @@ export DEBUG=0
export OMP_NUM_THREADS=10

# Compile pytorch with the base revision
git checkout master
python setup.py develop
git checkout main
python -m pip install --no-build-isolation -v -e .

# Install dependencies:
# Scipy is required by detr
@ -32,7 +32,7 @@ python functional_autograd_benchmark.py --output before.txt
# Compile pytorch with your change
popd
git checkout your_feature_branch
python setup.py develop
python -m pip install --no-build-isolation -v -e .

# Run the benchmark for the new version
pushd benchmarks/functional_autograd_benchmark

@ -20,7 +20,7 @@ Key Features:
The instructions below install a cpp\_extension for PyTorch, which is required to run the benchmark suite.
```bash
cd pt_extension
python setup.py install
python -m pip install .
```

## How to run the benchmarks:

@ -11,7 +11,7 @@ export USE_MKL=1
CMAKE_ONLY=1 python setup.py build
ccmake build # or cmake-gui build

python setup.py install
python -m pip install --no-build-isolation -v .

cd benchmarks
echo "!! SPARSE SPMM TIME BENCHMARK!! " >> $OUTFILE
@ -28,7 +28,7 @@ echo "----- USE_MKL=0 ------" >> $OUTFILE
rm -rf build

export USE_MKL=0
python setup.py install
python -m pip install --no-build-isolation -v .

cd benchmarks
for dim0 in 1000 5000 10000; do

233  c10/core/AllocatorConfig.cpp  Normal file
@ -0,0 +1,233 @@
#include <c10/core/AllocatorConfig.h>
#include <c10/core/DeviceType.h>
#include <c10/util/env.h>
#include <c10/util/irange.h>

namespace c10::CachingAllocator {

namespace {
constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
constexpr size_t kMB = 1024 * 1024ul;
constexpr size_t kRoundUpPowerOfTwoStart = 1 * kMB; // 1MB
constexpr size_t kRoundUpPowerOfTwoEnd = 64 * 1024ul * kMB; // 64GB
} // anonymous namespace

AcceleratorAllocatorConfig& AcceleratorAllocatorConfig::instance() {
  static AcceleratorAllocatorConfig instance;
#define C10_ALLOCATOR_CONFIG_PARSE_ENV(env, deprecated)                       \
  auto env##_name = c10::utils::get_env(#env);                                \
  if (env##_name.has_value()) {                                              \
    if (deprecated) {                                                        \
      TORCH_WARN_ONCE(#env " is deprecated, use PYTORCH_ALLOC_CONF instead"); \
    }                                                                         \
    instance.parseArgs(env##_name.value());                                   \
    return true;                                                              \
  }
  static bool env_flag [[maybe_unused]] = []() {
    C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_ALLOC_CONF, false)
    // Keep this for backwards compatibility
    C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_CUDA_ALLOC_CONF, /*deprecated=*/true)
    C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_HIP_ALLOC_CONF, /*deprecated=*/true)
    return false;
  }();
#undef C10_ALLOCATOR_CONFIG_PARSE_ENV
  return instance;
}

AcceleratorAllocatorConfig::AcceleratorAllocatorConfig() {
  roundup_power2_divisions_.assign(kRoundUpPowerOfTwoIntervals, 0);
}

size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) {
  size_t log_size = (63 - llvm::countLeadingZeros(size));

  // Our intervals start at 1MB and end at 64GB
  const size_t interval_start =
      63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart);
  const size_t interval_end =
      63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd);
  TORCH_CHECK(
      interval_end - interval_start == kRoundUpPowerOfTwoIntervals,
      "kRoundUpPowerOfTwoIntervals mismatch");

  size_t index =
      (log_size > interval_start) ? (log_size - interval_start) : 0ul;
  index = std::min(index, kRoundUpPowerOfTwoIntervals - 1);
  return instance().roundup_power2_divisions_[index];
}

size_t AcceleratorAllocatorConfig::parseMaxSplitSize(
    const ConfigTokenizer& tokenizer,
    size_t i) {
  tokenizer.checkToken(++i, ":");
  constexpr size_t min_allowed_split_size_mb = kLargeBuffer / kMB;
  constexpr size_t max_allowed_split_size_mb =
      std::numeric_limits<size_t>::max() / kMB;

  size_t val_env = tokenizer.toSizeT(++i);
  TORCH_CHECK(
      val_env >= min_allowed_split_size_mb,
      "CachingAllocator option max_split_size_mb too small, must be >= ",
      min_allowed_split_size_mb);
  val_env = std::min(val_env, max_allowed_split_size_mb);
  max_split_size_ = val_env * kMB;

  return i;
}

size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize(
    const ConfigTokenizer& tokenizer,
    size_t i) {
  tokenizer.checkToken(++i, ":");
  constexpr size_t min_allowed_split_size_mb = kLargeBuffer / kMB;
  constexpr size_t max_allowed_split_size_mb =
      std::numeric_limits<size_t>::max() / kMB;

  size_t val_env = tokenizer.toSizeT(++i);
  TORCH_CHECK(
      val_env >= min_allowed_split_size_mb,
      "CachingAllocator option max_non_split_rounding_mb too small, must be >= ",
      min_allowed_split_size_mb);
  val_env = std::min(val_env, max_allowed_split_size_mb);
  max_non_split_rounding_size_ = val_env * kMB;

  return i;
}

size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold(
    const ConfigTokenizer& tokenizer,
    size_t i) {
  tokenizer.checkToken(++i, ":");
  double val_env = tokenizer.toDouble(++i);
  TORCH_CHECK(
      val_env > 0 && val_env < 1.0,
      "garbage_collect_threshold is invalid, set it in (0.0, 1.0)");
  garbage_collection_threshold_ = val_env;

  return i;
}

size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
    const ConfigTokenizer& tokenizer,
    size_t i) {
  tokenizer.checkToken(++i, ":");
  bool first_value = true;

  if (tokenizer[++i] == "[") {
    size_t last_index = 0;
    // NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
    while (++i < tokenizer.size() && tokenizer[i] != "]") {
      size_t value_index = i;
      tokenizer.checkToken(++i, ":");
      size_t value = tokenizer.toSizeT(++i);
      TORCH_CHECK(
          value == 0 || llvm::isPowerOf2_64(value),
          "For roundups, the divisions has to be power of 2 or 0 to disable roundup ");

      if (tokenizer[value_index] == ">") {
        std::fill(
            std::next(
                roundup_power2_divisions_.begin(),
                static_cast<std::vector<size_t>::difference_type>(
                    last_index + 1)),
            roundup_power2_divisions_.end(),
            value);
      } else {
        size_t boundary = tokenizer.toSizeT(value_index);
        TORCH_CHECK(
            llvm::isPowerOf2_64(boundary),
            "For roundups, the intervals have to be power of 2 ");

        size_t index = 63 - llvm::countLeadingZeros(boundary);
        index =
            std::clamp(index, size_t{0}, roundup_power2_divisions_.size() - 1);

        if (first_value) {
          std::fill(
              roundup_power2_divisions_.begin(),
              std::next(
                  roundup_power2_divisions_.begin(),
                  static_cast<std::vector<size_t>::difference_type>(index)),
              value);
          first_value = false;
        }
        roundup_power2_divisions_[index] = value;
        last_index = index;
      }

      if (tokenizer[i + 1] != "]") {
        tokenizer.checkToken(++i, ",");
      }
    }
    TORCH_INTERNAL_ASSERT(
        i < tokenizer.size(),
        "Expected closing bracket ']' in ConfigTokenizer but reached end of config");
  } else { // Keep this for backwards compatibility
    size_t value = tokenizer.toSizeT(i);
    TORCH_CHECK(
        llvm::isPowerOf2_64(value),
        "For roundups, the divisions has to be power of 2 ");
    std::fill(
        roundup_power2_divisions_.begin(),
        roundup_power2_divisions_.end(),
        value);
  }
  return i;
}

size_t AcceleratorAllocatorConfig::parseExpandableSegments(
    const ConfigTokenizer& tokenizer,
    size_t i) {
  tokenizer.checkToken(++i, ":");
  use_expandable_segments_ = tokenizer.toBool(++i);

  return i;
}

size_t AcceleratorAllocatorConfig::parsePinnedUseBackgroundThreads(
    const ConfigTokenizer& tokenizer,
    size_t i) {
  tokenizer.checkToken(++i, ":");
  pinned_use_background_threads_ = tokenizer.toBool(++i);

  return i;
}

void AcceleratorAllocatorConfig::parseArgs(const std::string& env) {
  // The following option will be reset to its default value if not explicitly
  // set each time.
  max_split_size_ = std::numeric_limits<size_t>::max();
  roundup_power2_divisions_.assign(kRoundUpPowerOfTwoIntervals, 0);
  garbage_collection_threshold_ = 0;

  {
    std::lock_guard<std::mutex> lock(last_allocator_settings_mutex_);
    last_allocator_settings_ = env;
  }

  ConfigTokenizer tokenizer(env);
  for (size_t i = 0; i < tokenizer.size(); i++) {
    const auto& key = tokenizer[i];
    if (key == "max_split_size_mb") {
      i = parseMaxSplitSize(tokenizer, i);
    } else if (key == "max_non_split_rounding_mb") {
      i = parseMaxNonSplitRoundingSize(tokenizer, i);
    } else if (key == "garbage_collection_threshold") {
      i = parseGarbageCollectionThreshold(tokenizer, i);
    } else if (key == "roundup_power2_divisions") {
      i = parseRoundUpPower2Divisions(tokenizer, i);
    } else if (key == "expandable_segments") {
      i = parseExpandableSegments(tokenizer, i);
    } else if (key == "pinned_use_background_threads") {
      i = parsePinnedUseBackgroundThreads(tokenizer, i);
    } else {
      i = tokenizer.skipKey(i);
    }

    if (i + 1 < tokenizer.size()) {
      tokenizer.checkToken(++i, ",");
    }
  }
}

} // namespace c10::CachingAllocator
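A minimal usage sketch for the entry points defined above (editorial, not part of the diff; the same configuration string may also be supplied via the `PYTORCH_ALLOC_CONF` environment variable):

```cpp
// Applying allocator settings at runtime; only uses the API defined above.
#include <c10/core/AllocatorConfig.h>
#include <iostream>

int main() {
  c10::CachingAllocator::setAllocatorSettings(
      "max_split_size_mb:100,garbage_collection_threshold:0.6");
  // Echo back the raw string the allocator recorded.
  std::cout << c10::CachingAllocator::getAllocatorSettings() << "\n";
  // max_split_size() reports bytes: 100 * 1024 * 1024.
  std::cout << c10::CachingAllocator::AcceleratorAllocatorConfig::
                   max_split_size()
            << "\n";
  return 0;
}
```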
337  c10/core/AllocatorConfig.h  Normal file
@ -0,0 +1,337 @@
#pragma once

#include <c10/core/DeviceType.h>
#include <c10/util/Exception.h>
#include <c10/util/llvmMathExtras.h>

#include <atomic>
#include <mutex>
#include <string>
#include <vector>

namespace c10::CachingAllocator {

// "large" allocations may be packed in 20 MiB blocks
const size_t kLargeBuffer = 20971520;

// A utility class for tokenizing allocator configuration strings into discrete
// parts. For example, the config string:
//   "key1:val1,key2:[val2,val3]"
// is tokenized into:
//   "key1", ":", "val1", ",", "key2", ":", "[", "val2", ",", "val3", "]",
//
// Tokens include keys, values, and special characters (':', ',', '[', ']').
// Whitespace is ignored.
class ConfigTokenizer {
 public:
  explicit ConfigTokenizer(const std::string& env) {
    std::string buffer;
    for (char ch : env) {
      if (ch == ',' || ch == ':' || ch == '[' || ch == ']') {
        if (!buffer.empty()) {
          config_.emplace_back(std::move(buffer));
          buffer.clear();
        }
        config_.emplace_back(1, ch);
      } else if (!std::isspace(static_cast<unsigned char>(ch))) {
        buffer += ch;
      }
    }
    if (!buffer.empty()) {
      config_.emplace_back(std::move(buffer));
    }
  }

  const std::string& operator[](size_t i) const {
    TORCH_INTERNAL_ASSERT(
        i < config_.size(), "Index out of bounds in ConfigTokenizer");
    return config_[i];
  }

  size_t size() const {
    return config_.size();
  }

  bool checkToken(size_t i, const std::string& token) const {
    checkIndex(i);
    return config_[i] == token;
  }

  size_t toSizeT(size_t i) const {
    checkIndex(i);
    return std::stoull(config_[i]);
  }

  double toDouble(size_t i) const {
    checkIndex(i);
    return std::stod(config_[i]);
  }

  bool toBool(size_t i) const {
    checkIndex(i);
    const auto& token = config_[i];
    if (token == "True") {
      return true;
    } else if (token == "False") {
      return false;
    } else {
      TORCH_CHECK(
          false,
          "Expected 'True' or 'False' at index ",
          i,
          " in ConfigTokenizer but got '",
          token,
          "'");
    }
  }

  // Skips the current token group and returns the index of the value token.
  // Assumes the current index `i` points to a key name in a key-value pair.
  size_t skipKey(size_t i) const {
    // Expect a colon after the key
    checkToken(++i, ":");

    ++i; // Move to the value
    checkIndex(i);
    if (config_[i] != "[") {
      // Value is a single token (not a list) -> return its index
      return i;
    }

    // Skip tokens inside the list until matching ']'
    // NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
    while (++i < config_.size() && config_[i] != "]") {
    }

    TORCH_INTERNAL_ASSERT(
        i < config_.size(),
        "Expected closing bracket ']' in ConfigTokenizer but reached end of config");

    return i; // Return the index of the closing ']'
  }

 private:
  void checkIndex(size_t i) const {
    TORCH_INTERNAL_ASSERT(
        i < config_.size(), "Index out of bounds in ConfigTokenizer");
  }

  std::vector<std::string> config_;
};

/**
 * Note [AcceleratorAllocatorConfig design]
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * This class configures memory allocation for both device and host memory. A
 * single `AcceleratorAllocatorConfig` instance is shared across all accelerator
 * backends, such as CUDA and XPU, under the assumption that relevant
 * environment variables apply uniformly to all accelerators. Device-specific
 * configuration extensions are supported via hooks (see
 * `registerDeviceConfigParserHook`).
 *
 * Recommended design:
 * - Place common configurations in `AcceleratorAllocatorConfig`.
 * - Extend backend-specific configurations in corresponding device-specific
 *   classes, such as `CUDAAllocatorConfig`, etc.
 *
 * Scope:
 * - Configuration options must be environment-variable driven.
 *
 * Naming Convention:
 * - Public API names in `AcceleratorAllocatorConfig` should be device-generic.
 * - Members prefixed with `pinned_` are specific to the host/pinned allocator.
 * - Environment variable names should be generic across backends.
 * - Comma-separated key-value pairs in the format: `key:value`. Use square
 *   brackets `[]` for list values. Example: `key1:123, key2:[val1,val2]`
 *
 * Environment Variables:
 * - The primary environment variable for configuration is `PYTORCH_ALLOC_CONF`.
 * - For backward compatibility, `PYTORCH_CUDA_ALLOC_CONF` is also supported
 *   with lower priority.
 */

class C10_API AcceleratorAllocatorConfig {
 public:
  static AcceleratorAllocatorConfig& instance();

  C10_DISABLE_COPY_AND_ASSIGN(AcceleratorAllocatorConfig);
  AcceleratorAllocatorConfig(AcceleratorAllocatorConfig&&) = delete;
  AcceleratorAllocatorConfig& operator=(AcceleratorAllocatorConfig&&) = delete;
  ~AcceleratorAllocatorConfig() = default;

  /* Device allocator settings */

  // Returns the maximum block size (in MB) that is allowed to be split. The
  // default is unlimited (all blocks can be split).
  static size_t max_split_size() {
    return instance().max_split_size_;
  }

  // Returns the maximum block size (in MB) that is allowed to be rounded up
  // without requiring splitting when searching for a free block. The default is
  // 20 MiB.
  static size_t max_non_split_rounding_size() {
    return instance().max_non_split_rounding_size_;
  }

  // Return the number of divisions used when rounding up allocation sizes (in
  // MB) to the nearest power-of-2 boundary.
  static size_t roundup_power2_divisions(size_t size);

  // Returns the vector of division factors used for rounding up allocation
  // sizes. These divisions apply to size intervals between 1MB and 64GB.
  static std::vector<size_t> roundup_power2_divisions() {
    return instance().roundup_power2_divisions_;
  }

  // Returns the threshold that triggers garbage collection when the ratio of
  // used memory to maximum allowed memory exceeds this value. The default is 0,
  // meaning no garbage collection is triggered. The value should be in the
  // range (0.0, 1.0).
  static double garbage_collection_threshold() {
    return instance().garbage_collection_threshold_;
  }

  // Returns whether the expandable segment feature is enabled. This allows the
  // allocator to start with one segment that grows as needed, rather than
  // creating a new segment for each allocation. Default is false (expandable
  // segments disabled).
  static bool use_expandable_segments() {
    return instance().use_expandable_segments_;
  }

  /* Host allocator settings */

  // Returns whether the pinned host allocator uses background threads for
  // processing events. This is useful for improving performance in scenarios
  // where many small allocations are made. Default is false (background threads
  // disabled).
  static bool pinned_use_background_threads() {
    return instance().pinned_use_background_threads_;
  }

  /* Settings for both device and host allocator */

  // Returns the current allocator settings as a string. This string is useful
  // to expand device-specific allocator configurations.
  static std::string last_allocator_settings() {
    std::lock_guard<std::mutex> lock(instance().last_allocator_settings_mutex_);
    return instance().last_allocator_settings_;
  }

  // Parses the environment variable `env` to update the allocator settings.
  // If the environment variable is not set, it does nothing.
  // The configuration string should be a comma-separated list of key-value
  // pairs, where each key is a configuration option and the value is the
  // corresponding setting. For example:
  // "max_split_size_mb:100,max_non_split_rounding_mb:20,garbage_collection_threshold:0.5,roundup_power2_divisions:[64:8,256:4,1024:4,>:1],expandable_segments:true,pinned_use_background_threads:true"
  void parseArgs(const std::string& env);

  // Registers a device-specific configuration parser hook. This allows
  // backends to parse additional device-specific configuration options from the
  // environment variable. The hook should be a function that takes a string
  // (the environment variable value) and parses it to set device-specific
  // configuration options.
  // The hook will be called when the environment variable is parsed.
  // If a hook is already registered, it will be replaced with the new one.
  void registerDeviceConfigParserHook(
      std::function<void(const std::string&)> hook) {
    device_config_parser_hook_ = std::move(hook);
  }

  // Calls the registered device-specific configuration parser hook with the
  // provided environment string. This allows backends to parse additional
  // device-specific configuration options from the environment variable.
  // If no hook is registered, this function does nothing.
  void callDeviceConfigParserHook(const std::string& env) const {
    if (device_config_parser_hook_) {
      device_config_parser_hook_(env);
    }
  }

 private:
  AcceleratorAllocatorConfig();

  /* Internal functions for device allocator */

  // Parse `max_split_size_mb` from environment variable.
  size_t parseMaxSplitSize(const ConfigTokenizer& tokenizer, size_t i);
  // Parse `max_non_split_rounding_mb` from environment variable.
  size_t parseMaxNonSplitRoundingSize(
      const ConfigTokenizer& tokenizer,
      size_t i);
  // Parse `garbage_collection_threshold` from environment variable.
  size_t parseGarbageCollectionThreshold(
      const ConfigTokenizer& tokenizer,
      size_t i);
  // Parse `roundup_power2_divisions` from environment variable.
  size_t parseRoundUpPower2Divisions(
      const ConfigTokenizer& tokenizer,
      size_t i);
  // Parse `expandable_segments` from environment variable.
  size_t parseExpandableSegments(const ConfigTokenizer& tokenizer, size_t i);

  /* Internal functions for host allocator */

  // Parse `pinned_use_background_threads` from environment variable.
  size_t parsePinnedUseBackgroundThreads(
      const ConfigTokenizer& tokenizer,
      size_t i);

  /* The following members are specifically used for the device allocator. */

  // The maximum block size that is allowed to be split.
  std::atomic<size_t> max_split_size_{std::numeric_limits<size_t>::max()};
  // The maximum allowable extra size of a memory block without requiring
  // splitting when searching for a free block.
  std::atomic<size_t> max_non_split_rounding_size_{kLargeBuffer};
  // Used to store how memory allocations of different sizes should be rounded
  // up to the nearest power of 2 divisions.
  std::vector<size_t> roundup_power2_divisions_;
  // The threshold that triggers garbage collection when the ratio of used
  // memory to maximum allowed memory exceeds this value.
  std::atomic<double> garbage_collection_threshold_{0};
  // A flag to enable expandable segments feature.
  std::atomic<bool> use_expandable_segments_{false};

  /* The following members are specifically used for the host allocator. */

  // A flag to enable background thread for processing events.
  std::atomic<bool> pinned_use_background_threads_{false};

  /* The following members are used for both device and host allocator. */

  // Record the last allocator config environment setting.
  std::mutex last_allocator_settings_mutex_;
  std::string last_allocator_settings_;

  // Optional hook for parsing additional device-specific allocator settings.
  // This allows backends (e.g., CUDA, XPU) to register a custom parser for
  // their own environment configuration extensions.
  std::function<void(const std::string&)> device_config_parser_hook_{nullptr};
};

C10_API inline void setAllocatorSettings(const std::string& env) {
  AcceleratorAllocatorConfig::instance().parseArgs(env);
  AcceleratorAllocatorConfig::instance().callDeviceConfigParserHook(env);
}

C10_API inline std::string getAllocatorSettings() {
  return AcceleratorAllocatorConfig::instance().last_allocator_settings();
}

struct DeviceConfigParserHookRegistry {
  explicit DeviceConfigParserHookRegistry(
      std::function<void(const std::string&)> hook) {
    AcceleratorAllocatorConfig::instance().registerDeviceConfigParserHook(
        std::move(hook));
  }
};

#define REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(hook)           \
  namespace {                                                \
  static at::CachingAllocator::DeviceConfigParserHookRegistry \
      g_device_config_parse_hook_registry_instance(hook);    \
  }

} // namespace c10::CachingAllocator
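To see the tokenizer's behavior concretely, here is a small driver matching the class comment above (editorial sketch, not part of the diff):

```cpp
// Prints the tokens ConfigTokenizer produces for a config string with a
// list value; whitespace is skipped, punctuation becomes its own token.
#include <c10/core/AllocatorConfig.h>
#include <iostream>

int main() {
  c10::CachingAllocator::ConfigTokenizer tok("key1:val1, key2:[val2, val3]");
  for (size_t i = 0; i < tok.size(); ++i) {
    std::cout << "'" << tok[i] << "' ";
  }
  // Prints: 'key1' ':' 'val1' ',' 'key2' ':' '[' 'val2' ',' 'val3' ']'
  std::cout << "\n";
  return 0;
}
```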
@ -1,30 +1,27 @@
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/driver_api.h>
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <c10/util/Logging.h>
#include <cuda_runtime.h>
#include <dlfcn.h>

namespace c10::cuda {

namespace {

void* get_symbol(const char* name, int version);

DriverAPI create_driver_api() {
  void* handle_0 = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_NOLOAD);
  TORCH_CHECK(handle_0, "Can't open libcuda.so.1: ", dlerror());
  void* handle_1 = DriverAPI::get_nvml_handle();
  DriverAPI r{};

#define LOOKUP_LIBCUDA_ENTRY(name)                       \
  r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \
  TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name, ": ", dlerror())
  C10_LIBCUDA_DRIVER_API(LOOKUP_LIBCUDA_ENTRY)
#undef LOOKUP_LIBCUDA_ENTRY

#define LOOKUP_LIBCUDA_ENTRY(name)                       \
  r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \
  dlerror();
  C10_LIBCUDA_DRIVER_API_12030(LOOKUP_LIBCUDA_ENTRY)
#undef LOOKUP_LIBCUDA_ENTRY
#define LOOKUP_LIBCUDA_ENTRY_WITH_VERSION(name, version)                      \
  r.name##_ = reinterpret_cast<decltype(&name)>(get_symbol(#name, version)); \
  TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name)
  C10_LIBCUDA_DRIVER_API(LOOKUP_LIBCUDA_ENTRY_WITH_VERSION)
#undef LOOKUP_LIBCUDA_ENTRY_WITH_VERSION

  if (handle_1) {
#define LOOKUP_NVML_ENTRY(name) \
@ -35,6 +32,32 @@ DriverAPI create_driver_api() {
  }
  return r;
}

void* get_symbol(const char* name, int version) {
  void* out = nullptr;
  cudaDriverEntryPointQueryResult qres{};

  // CUDA 12.5+ supports version-based lookup
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12050)
  if (auto st = cudaGetDriverEntryPointByVersion(
          name, &out, version, cudaEnableDefault, &qres);
      st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
    return out;
  }
#endif

  // Fall back to the old API to try getting the symbol again.
  if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres);
      st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
    return out;
  }

  // If the symbol cannot be resolved, report and return nullptr;
  // the caller is responsible for checking the pointer.
  LOG(INFO) << "Failed to resolve symbol " << name;
  return nullptr;
}

} // namespace

void* DriverAPI::get_nvml_handle() {

@ -20,30 +20,24 @@
  } \
} while (0)

#define C10_LIBCUDA_DRIVER_API(_) \
  _(cuDeviceGetAttribute)         \
  _(cuMemAddressReserve)          \
  _(cuMemRelease)                 \
  _(cuMemMap)                     \
  _(cuMemAddressFree)             \
  _(cuMemSetAccess)               \
  _(cuMemUnmap)                   \
  _(cuMemCreate)                  \
  _(cuMemGetAllocationGranularity) \
  _(cuMemExportToShareableHandle) \
  _(cuMemImportFromShareableHandle) \
  _(cuMemsetD32Async)             \
  _(cuStreamWriteValue32)         \
  _(cuGetErrorString)

#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030)
#define C10_LIBCUDA_DRIVER_API_12030(_) \
  _(cuMulticastAddDevice)               \
  _(cuMulticastBindMem)                 \
  _(cuMulticastCreate)
#else
#define C10_LIBCUDA_DRIVER_API_12030(_)
#endif
#define C10_LIBCUDA_DRIVER_API(_)          \
  _(cuDeviceGetAttribute, 12000)           \
  _(cuMemAddressReserve, 12000)            \
  _(cuMemRelease, 12000)                   \
  _(cuMemMap, 12000)                       \
  _(cuMemAddressFree, 12000)               \
  _(cuMemSetAccess, 12000)                 \
  _(cuMemUnmap, 12000)                     \
  _(cuMemCreate, 12000)                    \
  _(cuMemGetAllocationGranularity, 12000)  \
  _(cuMemExportToShareableHandle, 12000)   \
  _(cuMemImportFromShareableHandle, 12000) \
  _(cuMemsetD32Async, 12000)               \
  _(cuStreamWriteValue32, 12000)           \
  _(cuGetErrorString, 12000)               \
  _(cuMulticastAddDevice, 12030)           \
  _(cuMulticastBindMem, 12030)             \
  _(cuMulticastCreate, 12030)

#define C10_NVML_DRIVER_API(_) \
  _(nvmlInit_v2)               \
@ -56,11 +50,13 @@
namespace c10::cuda {

struct DriverAPI {
#define CREATE_MEMBER_VERSIONED(name, version) decltype(&name) name##_;
#define CREATE_MEMBER(name) decltype(&name) name##_;
  C10_LIBCUDA_DRIVER_API(CREATE_MEMBER)
  C10_LIBCUDA_DRIVER_API_12030(CREATE_MEMBER)
  C10_LIBCUDA_DRIVER_API(CREATE_MEMBER_VERSIONED)
  C10_NVML_DRIVER_API(CREATE_MEMBER)
#undef CREATE_MEMBER_VERSIONED
#undef CREATE_MEMBER

  static DriverAPI* get();
  static void* get_nvml_handle();
};
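For clarity on what the versioned X-macro change does, here is the (editorial) expansion of `CREATE_MEMBER_VERSIONED` for a single entry; the version argument is ignored at declaration time and only consumed later, when `get_symbol()` resolves the pointer:

```cpp
// _(cuMemCreate, 12000) expanded through CREATE_MEMBER_VERSIONED yields a
// plain function-pointer member:
decltype(&cuMemCreate) cuMemCreate_;
// The `12000` is dropped here; it is consumed by
// get_symbol("cuMemCreate", 12000) in driver_api.cpp at lookup time.
```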
@ -63,7 +63,6 @@ def define_c10_ovrsource(name, is_mobile):
            "core/impl/*.h",
        ]),
        reexport_all_header_dependencies = False,
        # tests = C10_CPU_TEST_TARGETS,
        visibility = [
            "//xplat/caffe2/c10:c10_ovrsource",
        ],
@ -84,25 +83,6 @@ def define_c10_ovrsource(name, is_mobile):
    )

def define_ovrsource_targets():
    # C10_CPU_TEST_FILES = native.glob([
    #     "test/core/*.cpp",
    #     "test/util/*.cpp",
    # ])

    # C10_GPU_TEST_FILES = native.glob([
    #     "cuda/test/**/*.cpp",
    # ])

    # C10_CPU_TEST_TARGETS = [
    #     ":" + paths.basename(test)[:-len(".cpp")] + "_ovrsource"
    #     for test in C10_CPU_TEST_FILES
    # ]

    # C10_GPU_TEST_TARGETS = [
    #     ":" + paths.basename(test)[:-len(".cpp")] + "_ovrsource"
    #     for test in C10_GPU_TEST_FILES
    # ]

    common_c10_cmake_defines = [
        ("#cmakedefine C10_BUILD_SHARED_LIBS", ""),
        ("#cmakedefine C10_USE_NUMA", ""),
@ -207,7 +187,6 @@ def define_ovrsource_targets():
            "cuda/impl/*.h",
        ]),
        reexport_all_header_dependencies = False,
        # tests = C10_GPU_TEST_TARGETS,
        visibility = ["PUBLIC"],
        deps = [
            "//third-party/cuda:libcuda",
@ -217,64 +196,3 @@ def define_ovrsource_targets():
            ":c10_ovrsource",
        ],
    )

    # [
    #     oxx_test(
    #         name = paths.basename(test)[:-len(".cpp")] + "_ovrsource",
    #         srcs = [test],
    #         compatible_with = cpu_supported_platforms,
    #         compiler_flags = select({
    #             "DEFAULT": [],
    #             "ovr_config//compiler:cl": [
    #                 "/w",
    #             ],
    #             "ovr_config//compiler:clang": [
    #                 "-Wno-error",
    #                 "-Wno-self-assign-overloaded",
    #                 "-Wno-self-move",
    #                 "-Wno-shadow",
    #                 "-Wno-undef",
    #                 "-Wno-unused-function",
    #                 "-Wno-unused-variable",
    #             ],
    #         }),
    #         framework = "gtest",
    #         oncall = "ovrsource_pytorch",
    #         raw_headers = native.glob([
    #             "test/**/*.h",
    #         ]),
    #         deps = [
    #             ":c10_ovrsource",
    #         ],
    #     )
    #     for test in C10_CPU_TEST_FILES
    # ]

    # [
    #     oxx_test(
    #         name = paths.basename(test)[:-len(".cpp")] + "_ovrsource",
    #         srcs = [test],
    #         compatible_with = cuda_supported_platforms,
    #         compiler_flags = select({
    #             "DEFAULT": [],
    #             "ovr_config//compiler:cl": [
    #                 "/w",
    #             ],
    #             "ovr_config//compiler:clang": [
    #                 "-Wno-error",
    #             ],
    #         }),
    #         framework = "gtest",
    #         oncall = "ovrsource_pytorch",
    #         raw_headers = native.glob([
    #             "test/**/*.h",
    #         ]),
    #         runtime_shared_libraries = [
    #             "//third-party/cuda:cudart",
    #         ],
    #         deps = [
    #             ":c10_cuda_ovrsource",
    #         ],
    #     )
    #     for test in C10_GPU_TEST_FILES
    # ]

123  c10/test/core/AllocatorConfig_test.cpp  Normal file
@ -0,0 +1,123 @@
#include <c10/core/AllocatorConfig.h>

#include <gtest/gtest.h>

using namespace c10::CachingAllocator;
constexpr size_t kMB = 1024 * 1024ul;

struct ExtendedAllocatorConfig {
  static ExtendedAllocatorConfig& instance() {
    static ExtendedAllocatorConfig instance;
    return instance;
  }

  // Returns the device-specific option value in bytes.
  static size_t device_specific_option() {
    return instance().device_specific_option_;
  }

  void parseArgs(const std::string& env) {
    // Parse device-specific options from the environment variable
    ConfigTokenizer tokenizer(env);
    for (size_t i = 0; i < tokenizer.size(); i++) {
      const auto& key = tokenizer[i];
      if (key == "device_specific_option_mb") {
        tokenizer.checkToken(++i, ":");
        device_specific_option_ = tokenizer.toSizeT(++i) * kMB;
      } else {
        i = tokenizer.skipKey(i);
      }

      if (i + 1 < tokenizer.size()) {
        tokenizer.checkToken(++i, ",");
      }
    }
  }

 private:
  // Device-specific option, e.g., memory limit for a specific device.
  std::atomic<size_t> device_specific_option_{0};
};

REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK([](const std::string& env) {
  ExtendedAllocatorConfig::instance().parseArgs(env);
})

TEST(AllocatorConfigTest, allocator_config_test) {
  std::string env =
      "max_split_size_mb:40,"
      "max_non_split_rounding_mb:30,"
      "garbage_collection_threshold:0.5,"
      "roundup_power2_divisions:[64:8,128:2,256:4,512:2,1024:4,>:1],"
      "expandable_segments:True,"
      "pinned_use_background_threads:True,"
      "device_specific_option_mb:64";
  c10::CachingAllocator::setAllocatorSettings(env);
  EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
  EXPECT_EQ(AcceleratorAllocatorConfig::max_split_size(), 40 * kMB);
  EXPECT_EQ(
      AcceleratorAllocatorConfig::max_non_split_rounding_size(), 30 * kMB);
  EXPECT_EQ(AcceleratorAllocatorConfig::garbage_collection_threshold(), 0.5);
  EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(32 * kMB), 8);
  EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(64 * kMB), 8);
  EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(128 * kMB), 2);
  EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 4);
  EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 2);
  EXPECT_EQ(
      AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 4);
  EXPECT_EQ(
      AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 1);
  EXPECT_EQ(
      AcceleratorAllocatorConfig::roundup_power2_divisions(4096 * kMB), 1);
  EXPECT_EQ(
      AcceleratorAllocatorConfig::roundup_power2_divisions(8192 * kMB), 1);
  EXPECT_EQ(AcceleratorAllocatorConfig::use_expandable_segments(), true);
  EXPECT_EQ(AcceleratorAllocatorConfig::pinned_use_background_threads(), true);
  EXPECT_EQ(ExtendedAllocatorConfig::device_specific_option(), 64 * kMB);

  env =
      "max_split_size_mb:20,"
      "max_non_split_rounding_mb:40,"
      "garbage_collection_threshold:0.8";
  c10::CachingAllocator::setAllocatorSettings(env);
  EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
  EXPECT_EQ(AcceleratorAllocatorConfig::max_split_size(), 20 * kMB);
  EXPECT_EQ(
      AcceleratorAllocatorConfig::max_non_split_rounding_size(), 40 * kMB);
  EXPECT_EQ(AcceleratorAllocatorConfig::garbage_collection_threshold(), 0.8);

  // roundup_power2_divisions knob array syntax
  env = "roundup_power2_divisions:[128:8,256:16,512:1,2048:8,>:2]";
  c10::CachingAllocator::setAllocatorSettings(env);
  EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
  EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(64 * kMB), 8);
  EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(128 * kMB), 8);
  EXPECT_EQ(
      AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 16);
  EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 1);
  EXPECT_EQ(
      AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 0);
  EXPECT_EQ(
      AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 8);
  EXPECT_EQ(
      AcceleratorAllocatorConfig::roundup_power2_divisions(4096 * kMB), 2);

  // roundup_power2_divisions single value syntax for backward compatibility
  env = "roundup_power2_divisions:4";
  c10::CachingAllocator::setAllocatorSettings(env);
  EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
  EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(64 * kMB), 4);
  EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 4);
  EXPECT_EQ(
      AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 4);

  env = "expandable_segments:False,";
  c10::CachingAllocator::setAllocatorSettings(env);
  EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
  EXPECT_EQ(AcceleratorAllocatorConfig::use_expandable_segments(), false);

  env = "pinned_use_background_threads:False";
  c10::CachingAllocator::setAllocatorSettings(env);
  EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
  EXPECT_EQ(AcceleratorAllocatorConfig::pinned_use_background_threads(), false);
}
@ -4,6 +4,7 @@
// 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa.

#include <c10/macros/Macros.h>
#include <c10/util/bit_cast.h>
#include <cmath>
#include <cstdint>
#include <cstring>
@ -67,13 +68,7 @@ inline C10_HOST_DEVICE uint16_t round_to_nearest_even(float src) {
#endif
    return UINT16_C(0x7FC0);
  } else {
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
    union {
      uint32_t U32; // NOLINT(facebook-hte-BadMemberName)
      float F32; // NOLINT(facebook-hte-BadMemberName)
    };

    F32 = src;
    const uint32_t U32 = c10::bit_cast<uint32_t>(src);
    uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
    return static_cast<uint16_t>((U32 + rounding_bias) >> 16);
  }

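The union-based type punning removed here is formally undefined behavior in C++; `bit_cast` expresses the same reinterpretation legally. A standalone check of the rounding (editorial sketch using C++20 `std::bit_cast` in place of `c10::bit_cast`):

```cpp
// Standalone check of bfloat16 round-to-nearest-even via bit_cast (C++20).
#include <bit>
#include <cstdint>
#include <iostream>

uint16_t round_to_nearest_even_sketch(float src) {
  const uint32_t U32 = std::bit_cast<uint32_t>(src);
  // Add 0x7FFF plus the current LSB of the would-be result, so exact ties
  // round toward the value with an even low bit; then keep the top 16 bits.
  uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
  return static_cast<uint16_t>((U32 + rounding_bias) >> 16);
}

int main() {
  // 1.0f is 0x3F800000; its bfloat16 encoding is the top half, 0x3F80.
  std::cout << std::hex << round_to_nearest_even_sketch(1.0f) << "\n";
  return 0;
}
```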
@ -3,6 +3,8 @@
#include <cstring>
#include <type_traits>

#include <c10/macros/Macros.h>

#if __has_include(<bit>) && (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L)
#include <bit>
#define C10_HAVE_STD_BIT_CAST 1
@ -23,7 +25,7 @@ using std::bit_cast;
// See https://en.cppreference.com/w/cpp/numeric/bit_cast for more
// information as well as the source of our implementations.
template <class To, class From>
std::enable_if_t<
C10_HOST_DEVICE std::enable_if_t<
    sizeof(To) == sizeof(From) && std::is_trivially_copyable_v<From> &&
        std::is_trivially_copyable_v<To>,
    To>
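The hunk above only adds `C10_HOST_DEVICE` so the memcpy fallback is usable in device code; the reinterpretation itself is also observable from Python via `Tensor.view(dtype)`. A tiny sketch:

```python
import torch

# Reinterpret float32 bits as int32 -- same storage, no conversion.
f = torch.tensor([1.0], dtype=torch.float32)
i = f.view(torch.int32)
print(hex(i.item()))  # 0x3f800000, the IEEE-754 encoding of 1.0f
```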
@ -58,6 +58,9 @@ def define_targets(rules):
        name = "bit_cast",
        hdrs = ["bit_cast.h"],
        visibility = ["//:__subpackages__"],
        deps = [
            "//c10/macros",
        ],
    )

    rules.cc_library(
docs/source/distributed._dist2.md (new file)
@ -0,0 +1,18 @@
```{eval-rst}
.. role:: hidden
    :class: hidden-section
```

```{eval-rst}
.. automodule:: torch.distributed._dist2
    :members:
    :undoc-members:
    :show-inheritance:
```

```{eval-rst}
.. autoclass:: torch.distributed.ProcessGroup
    :members:
    :undoc-members:
    :show-inheritance:
```
@ -224,6 +224,10 @@ inconsistent 'UUID' assignment across ranks, and to prevent races during initial
.. autofunction:: is_torchelastic_launched
```

```{eval-rst}
.. autofunction:: get_default_backend_for_device
```
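A quick, hedged example of the helper documented above (the backend names assume a standard build with NCCL and Gloo available):

```python
import torch.distributed as dist

# Returns the backend init_process_group would pick for a device type.
print(dist.get_default_backend_for_device("cuda"))  # typically "nccl"
print(dist.get_default_backend_for_device("cpu"))   # typically "gloo"
```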
______________________________________________________________________

Currently three initialization methods are supported:
@ -1471,3 +1475,9 @@ If you are running single node training, it may be convenient to interactively b
```{eval-rst}
.. py:module:: torch.distributed.checkpoint.state_dict
```

```{toctree}
:hidden:

distributed._dist2
```
@ -8,16 +8,14 @@ higher-level API to automatically differentiate models split across several
machines.

```{warning}
APIs in the RPC package are stable. There are multiple ongoing work items
to improve performance and error handling, which will ship in future releases.
APIs in the RPC package are stable and in maintenance mode.
```

```{warning}
CUDA support was introduced in PyTorch 1.9 and is still a **beta** feature.
CUDA support is a **beta** feature.
Not all features of the RPC package are yet compatible with CUDA support and
thus their use is discouraged. These unsupported features include: RRefs,
JIT compatibility, dist autograd and dist optimizer, and profiling. These
shortcomings will be addressed in future releases.
JIT compatibility, dist autograd and dist optimizer, and profiling.
```

```{note}
@ -102,13 +100,6 @@ device lists on source and destination workers do not match. In such cases,
applications can always explicitly move the input tensors to CPU on the caller
and move it to the desired devices on the callee if necessary.

```{warning}
TorchScript support in RPC is a prototype feature and subject to change. Since
v1.5.0, ``torch.distributed.rpc`` supports calling TorchScript functions as
RPC target functions, and this will help improve parallelism on the callee
side as executing TorchScript functions does not require GIL.
```

```{eval-rst}
.. autofunction:: rpc_sync
.. autofunction:: rpc_async
@ -159,9 +150,7 @@ multiple different transports (TCP, of course, but also shared memory, NVLink,
InfiniBand, ...) and can automatically detect their availability and negotiate
the best transport to use for each pipe.

The TensorPipe backend has been introduced in PyTorch v1.6 and is being actively
developed. At the moment, it only supports CPU tensors, with GPU support coming
soon. It comes with a TCP-based transport, just like Gloo. It is also able to
The TensorPipe backend comes with a TCP-based transport, just like Gloo. It is also able to
automatically chunk and multiplex large tensors over multiple sockets and
threads in order to achieve very high bandwidths. The agent will be able to pick
the best transport on its own, with no intervention required.
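To orient readers around `rpc_sync`/`rpc_async` and the TensorPipe agent described above, a minimal two-process sketch (TensorPipe is the default backend, so none is passed; the rendezvous address and the `RANK` variable are assumptions of this demo, not part of the diff):

```python
import os

import torch
import torch.distributed.rpc as rpc

# Rendezvous assumptions for a single-machine demo.
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")

# Launch this script twice: once with RANK=0 and once with RANK=1.
rank = int(os.environ["RANK"])
rpc.init_rpc(f"worker{rank}", rank=rank, world_size=2)

if rank == 0:
    # Synchronously run torch.add on worker1 and fetch the result.
    ret = rpc.rpc_sync("worker1", torch.add, args=(torch.ones(2), 1))
    print(ret)  # tensor([2., 2.])

rpc.shutdown()
```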
@ -301,6 +290,4 @@ to use [the profiler](https://pytorch.org/docs/stable/autograd.html#profiler) to
- [Getting started with Distributed RPC Framework](https://pytorch.org/tutorials/intermediate/rpc_tutorial.html)
- [Implementing a Parameter Server using Distributed RPC Framework](https://pytorch.org/tutorials/intermediate/rpc_param_server_tutorial.html)
- [Combining Distributed DataParallel with Distributed RPC Framework](https://pytorch.org/tutorials/advanced/rpc_ddp_tutorial.html) (covers **RemoteModule** as well)
- [Profiling RPC-based Workloads](https://pytorch.org/tutorials/recipes/distributed_rpc_profiling.html)
- [Implementing batch RPC processing](https://pytorch.org/tutorials/intermediate/rpc_async_execution.html)
- [Distributed Pipeline Parallel](https://pytorch.org/tutorials/intermediate/dist_pipeline_parallel_tutorial.html)
@ -9,13 +9,13 @@ requires = [
    # 77.0.0: min version for SPDX expression support for project.license
    "setuptools>=62.3.0,<80.0",
    "wheel",
    "astunparse",
    "cmake>=3.27",
    "ninja",
    "numpy",
    "packaging",
    "pyyaml",
    "requests",
    "six", # dependency chain: NNPACK -> PeachPy -> six
    "typing-extensions>=4.10.0",
]
build-backend = "setuptools.build_meta"
@ -1,5 +1,4 @@
# Python dependencies required for development
astunparse
build[uv] # for building sdist and wheel
cmake>=3.27
expecttest>=0.3.0
@ -18,6 +17,8 @@ pyyaml
requests
# setuptools develop deprecated on 80.0
setuptools>=62.3.0,<80.0
six # dependency chain: NNPACK -> PeachPy -> six
sympy>=1.13.3
types-dataclasses
typing-extensions>=4.13.2
wheel
@ -15,4 +15,4 @@ pip install --no-use-pep517 -e "$tp2_dir/onnx"
# Install caffe2 and pytorch
pip install -r "$top_dir/caffe2/requirements.txt"
pip install -r "$top_dir/requirements.txt"
python setup.py develop
python -m pip install --no-build-isolation -v -e .

@ -35,4 +35,4 @@ _pip_install -b "$BUILD_DIR/onnx" "file://$tp2_dir/onnx#egg=onnx"
# Install caffe2 and pytorch
pip install -r "$top_dir/caffe2/requirements.txt"
pip install -r "$top_dir/requirements.txt"
python setup.py install
python -m pip install --no-build-isolation -v .
setup.py
@ -263,6 +263,7 @@ import json
import shutil
import subprocess
import sysconfig
import textwrap
import time
from collections import defaultdict
from pathlib import Path
@ -601,7 +602,7 @@ def build_deps() -> None:
        report(
            'Finished running cmake. Run "ccmake build" or '
            '"cmake-gui build" to adjust build options and '
            '"python setup.py install" to build.'
            '"python -m pip install --no-build-isolation -v ." to build.'
        )
        sys.exit()

@ -1207,24 +1208,25 @@ def configure_extension_build() -> tuple[

    # post run, warnings, printed at the end to make them more visible
    build_update_message = """
It is no longer necessary to use the 'build' or 'rebuild' targets

To install:
  $ python setup.py install
To develop locally:
  $ python setup.py develop
To force cmake to re-generate native build files (off by default):
  $ CMAKE_FRESH=1 python setup.py develop
"""
To install:
  $ python -m pip install --no-build-isolation -v .
To develop locally:
  $ python -m pip install --no-build-isolation -v -e .
To force cmake to re-generate native build files (off by default):
  $ CMAKE_FRESH=1 python -m pip install --no-build-isolation -v -e .
""".strip()


def print_box(msg: str) -> None:
    lines = msg.split("\n")
    size = max(len(l) + 1 for l in lines)
    print("-" * (size + 2))
    for l in lines:
        print("|{}{}|".format(l, " " * (size - len(l))))
    print("-" * (size + 2))
    msg = textwrap.dedent(msg).strip()
    lines = ["", *msg.split("\n"), ""]
    max_width = max(len(l) for l in lines)
    print("+" + "-" * (max_width + 4) + "+", file=sys.stderr, flush=True)
    for line in lines:
        print(f"| {line:<{max_width}s} |", file=sys.stderr, flush=True)
    print("+" + "-" * (max_width + 4) + "+", file=sys.stderr, flush=True)


def main() -> None:
@ -1308,7 +1310,9 @@ def main() -> None:
            "include/**/*.hpp",
            "include/*.cuh",
            "include/**/*.cuh",
            "csrc/inductor/aoti_runtime/model.h",
            "_inductor/codegen/*.h",
            "_inductor/codegen/aoti_runtime/*.h",
            "_inductor/codegen/aoti_runtime/*.cpp",
            "_inductor/script.ld",
            "_export/serde/*.yaml",
@ -36,7 +36,7 @@ The following commands assume you are in PyTorch root.

```bash
# ... Build PyTorch from source, e.g.
python setup.py develop
python -m pip install --no-build-isolation -v -e .
# (re)build just the binary
ninja -C build bin/test_jit
# run tests
@ -4,8 +4,8 @@ This folder contains a self-contained example of a PyTorch out-of-tree backend l

## How to use

Install as standalone with `python setup.py develop` (or install) from this folder.
You can run test via `python {PYTORCH_ROOT_PATH}/test/test_openreg.py`.
Install as standalone with `python -m pip install -e .` (or `python -m pip install .`)
from this folder. You can run test via `python {PYTORCH_ROOT_PATH}/test/test_openreg.py`.

## Design principles
@ -1570,5 +1570,54 @@ class TestFullyShardForceSumReduction(FSDPTest):
        self.assertRegex(logs, all_reduce_sum_re)


class TestFullyShardReduceOpWorldSize1(FSDPTest):
    @property
    def world_size(self) -> int:
        return 1

    def test_size1_reduceop(self):
        from torch.distributed.distributed_c10d import ReduceOp

        model = nn.Linear(1024, 1025)
        ref_model = copy.deepcopy(model).to(device_type)
        ref_optim = torch.optim.Adam(ref_model.parameters())
        fully_shard(
            model,
            mesh=init_device_mesh(device_type.type, (1,)),
            reshard_after_forward=False,
        )
        optim = torch.optim.Adam(model.parameters())

        inp = torch.randn(1025, 1024, device=device_type.type)
        for _ in range(3):
            ref_optim.zero_grad()
            ref_loss = ref_model(inp).sum()
            ref_loss.backward()
            for param in ref_model.parameters():
                dist.all_reduce(param.grad, op=dist.ReduceOp.SUM)
            ref_optim.step()

            optim.zero_grad()
            loss = model(inp).sum()
            loss.backward()
            optim.step()
            self.assertEqual(loss, ref_loss)
            self.assertEqual(
                model.bias.grad._local_tensor,
                ref_model.bias.grad,
            )

        state = model._get_fsdp_state()
        fsdp_param_group = state._fsdp_param_group
        group = fsdp_param_group.mesh_info.shard_process_group
        (
            _,
            _,
            _,
            all_reduce_op,
        ) = _get_gradient_divide_factors(group, None, torch.float32)
        self.assertEqual(all_reduce_op, ReduceOp.SUM)


if __name__ == "__main__":
    run_tests()
@ -554,21 +554,6 @@ class TestNew2dParallelTraining(DTensorTestBase):
            p2 = p2.redistribute(p2.device_mesh, [Replicate()]).to_local()
            self.assertTrue(torch.allclose(p1, p2), f"{p1} vs {p2}")

    @with_comms
    @skip_if_lt_x_gpu(4)
    def test_raise_invalid_tp_composition(self):
        with self.assertRaisesRegex(
            RuntimeError, r"Found TP device_mesh on the \d dimension of its parent mesh"
        ):
            mesh_2d = init_device_mesh(
                self.device_type, (2, self.world_size // 2), mesh_dim_names=("tp", "dp")
            )
            parallelize_plan = {
                "net1": ColwiseParallel(),
                "net2": RowwiseParallel(),
            }
            parallelize_module(SimpleModel().cuda(), mesh_2d["tp"], parallelize_plan)

    @with_comms
    @skip_if_lt_x_gpu(4)
    def test_2d_fsdp_state_enable_extension(self):
@ -3182,7 +3182,7 @@ class NcclRegistrationTest(MultiProcessTestCase):

        # Use NCCL memory allocator
        # enable symmetric memory usage in NCCL
        pool = torch.cuda.MemPool(backend.mem_allocator, symm_mem=True)
        pool = torch.cuda.MemPool(backend.mem_allocator, symmetric=True)

        # allocate memory with ncclMemAlloc
        # note: symmetric kernels are not available for dtypes like torch.int64
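For context, a hedged sketch of how such a pool is used; the private accessors here mirror what the test class does elsewhere and are assumptions, not public API:

```python
import torch
import torch.distributed as dist

# Assumes an initialized NCCL process group on this rank.
pg = dist.group.WORLD
backend = pg._get_backend(torch.device("cuda"))  # private accessor (assumption)

# Route allocations through ncclMemAlloc with symmetric registration.
pool = torch.cuda.MemPool(backend.mem_allocator, symmetric=True)
with torch.cuda.use_mem_pool(pool):
    t = torch.zeros(1024, device="cuda")  # allocated from the NCCL pool
```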
Some files were not shown because too many files have changed in this diff.