[ghstack-poisoned]
Xuehai Pan
2025-07-11 15:05:13 +08:00
229 changed files with 5089 additions and 1410 deletions

View File

@ -4,7 +4,7 @@ set -eux -o pipefail
GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
export TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0"
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
fi
SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"

View File

@ -52,6 +52,8 @@ fi
if [[ "$image" == *-jammy* ]]; then
UBUNTU_VERSION=22.04
elif [[ "$image" == *-noble* ]]; then
UBUNTU_VERSION=24.04
elif [[ "$image" == *ubuntu* ]]; then
extract_version_from_image_name ubuntu UBUNTU_VERSION
fi
@ -230,8 +232,12 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-rocm-n-py3)
ANACONDA_PYTHON_VERSION=3.10
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3)
if [[ $tag =~ "jammy" ]]; then
ANACONDA_PYTHON_VERSION=3.10
else
ANACONDA_PYTHON_VERSION=3.12
fi
GCC_VERSION=11
VISION=yes
ROCM_VERSION=6.4
@ -322,6 +328,8 @@ case "$tag" in
GCC_VERSION=11
ACL=yes
VISION=yes
CONDA_CMAKE=yes
OPENBLAS=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
@ -331,6 +339,8 @@ case "$tag" in
GCC_VERSION=11
ACL=yes
VISION=yes
CONDA_CMAKE=yes
OPENBLAS=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
@ -417,6 +427,7 @@ docker build \
--build-arg "XPU_VERSION=${XPU_VERSION}" \
--build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
--build-arg "ACL=${ACL:-}" \
--build-arg "OPENBLAS=${OPENBLAS:-}" \
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \

View File

@ -23,6 +23,10 @@ conda_install() {
as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
}
conda_install_through_forge() {
as_jenkins conda install -c conda-forge -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
}
conda_run() {
as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output $*
}

View File

@ -15,6 +15,9 @@ install_ubuntu() {
elif [[ "$UBUNTU_VERSION" == "22.04"* ]]; then
cmake3="cmake=3.22*"
maybe_libiomp_dev=""
elif [[ "$UBUNTU_VERSION" == "24.04"* ]]; then
cmake3="cmake=3.28*"
maybe_libiomp_dev=""
else
cmake3="cmake=3.5*"
maybe_libiomp_dev="libiomp-dev"

View File

@ -70,10 +70,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
fi
# Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
if [[ $(uname -m) == "aarch64" ]]; then
conda_install "openblas==0.3.29=*openmp*"
else
conda_install "mkl=2021.4.0 mkl-include=2021.4.0"
if [[ $(uname -m) != "aarch64" ]]; then
pip_install mkl==2024.2.0
pip_install mkl-static==2024.2.0
pip_install mkl-include==2024.2.0
fi
# Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
@ -87,6 +87,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
conda_run ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION})
fi
if [[ "$UBUNTU_VERSION" == "24.04"* ]] ; then
conda_install_through_forge libstdcxx-ng=14
fi
# Install some other packages, including those needed for Python test reporting
pip_install -r /opt/conda/requirements-ci.txt

View File

@ -4,8 +4,9 @@
set -ex
cd /
git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.29}" --depth 1 --shallow-submodules
git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules
OPENBLAS_CHECKOUT_DIR="OpenBLAS"
OPENBLAS_BUILD_FLAGS="
NUM_THREADS=128
USE_OPENMP=1
@ -13,9 +14,8 @@ NO_SHARED=0
DYNAMIC_ARCH=1
TARGET=ARMV8
CFLAGS=-O3
BUILD_BFLOAT16=1
"
OPENBLAS_CHECKOUT_DIR="OpenBLAS"
make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}

View File

@ -8,9 +8,11 @@ ver() {
install_ubuntu() {
apt-get update
if [[ $UBUNTU_VERSION == 20.04 ]]; then
# gpg-agent is not available by default on 20.04
apt-get install -y --no-install-recommends gpg-agent
# gpg-agent is not available by default
apt-get install -y --no-install-recommends gpg-agent
if [[ $(ver $UBUNTU_VERSION) -ge $(ver 22.04) ]]; then
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \
| sudo tee /etc/apt/preferences.d/rocm-pin-600
fi
apt-get install -y kmod
apt-get install -y wget
@ -85,13 +87,14 @@ EOF
VER_STR=6.3
fi
# clr build needs CppHeaderParser but can only find it using conda's python
/opt/conda/bin/python -m pip install CppHeaderParser
python -m pip install CppHeaderParser
git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
HIP_COMMON_DIR=$(readlink -f HIP)
git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}${VER_PATCH}-statco-hotfix
mkdir -p clr/build
pushd clr/build
cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
# Need to point CMake to the correct python installation to find CppHeaderParser
cmake .. -DPython3_EXECUTABLE=/opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}/bin/python3 -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
make -j
cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.*
popd

View File

@ -41,7 +41,7 @@ case ${image} in
GPU_IMAGE=arm64v8/almalinux:8
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1"
MANY_LINUX_VERSION="2_28_aarch64"
OPENBLAS_VERSION="v0.3.29"
OPENBLAS_VERSION="v0.3.30"
;;
manylinuxcxx11-abi-builder:cpu-cxx11-abi)
TARGET=final

View File

@ -16,6 +16,7 @@ click
#test that import:
coremltools==5.0b5 ; python_version < "3.12"
coremltools==8.3 ; python_version == "3.12"
#Description: Apple framework for ML integration
#Pinned versions: 5.0b5
#test that import:
@ -63,6 +64,7 @@ lark==0.12.0
#test that import:
librosa>=0.6.2 ; python_version < "3.11"
librosa==0.10.2 ; python_version == "3.12"
#Description: A python package for music and audio analysis
#Pinned versions: >=0.6.2
#test that import: test_spectral_ops.py
@ -111,6 +113,7 @@ ninja==1.11.1.3
numba==0.49.0 ; python_version < "3.9"
numba==0.55.2 ; python_version == "3.9"
numba==0.55.2 ; python_version == "3.10"
numba==0.60.0 ; python_version == "3.12"
#Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
#test that import: test_numba_integration.py
@ -360,10 +363,11 @@ pwlf==2.2.1
# To build PyTorch itself
astunparse
PyYAML
pyyaml
pyzstd
setuptools
six
wheel
scons==4.5.2 ; platform_machine == "aarch64"

View File

@ -5,7 +5,7 @@ sphinx==5.3.0
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought is probably
# something related to Docker setup. We can investigate this later
# something related to Docker setup. We can investigate this later.
sphinxcontrib.katex==0.8.6
#Description: This is used to generate PyTorch docs

View File

@ -147,6 +147,12 @@ RUN if [ -n "${ACL}" ]; then bash ./install_acl.sh; fi
RUN rm install_acl.sh
ENV INSTALLED_ACL ${ACL}
ARG OPENBLAS
COPY ./common/install_openblas.sh install_openblas.sh
RUN if [ -n "${OPENBLAS}" ]; then bash ./install_openblas.sh; fi
RUN rm install_openblas.sh
ENV INSTALLED_OPENBLAS ${OPENBLAS}
# Install ccache/sccache (do this last, so we get priority in PATH)
ARG SKIP_SCCACHE_INSTALL
COPY ./common/install_cache.sh install_cache.sh

View File

@ -104,7 +104,7 @@ if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr
fi
echo "Calling setup.py install at $(date)"
echo "Calling 'python -m pip install .' at $(date)"
if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"
@ -120,7 +120,7 @@ fi
# TODO: Remove this flag once https://github.com/pytorch/pytorch/issues/55952 is closed
CFLAGS='-Wno-deprecated-declarations' \
BUILD_LIBTORCH_CPU_WITH_DEBUG=1 \
python setup.py install
python -m pip install --no-build-isolation -v .
mkdir -p libtorch/{lib,bin,include,share}

View File

@ -185,7 +185,7 @@ torchbench_setup_macos() {
}
pip_benchmark_deps() {
python -mpip install --no-input astunparse requests cython scikit-learn
python -mpip install --no-input requests cython scikit-learn six
}

View File

@ -436,11 +436,11 @@ test_inductor_aoti() {
python3 tools/amd_build/build_amd.py
fi
if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop)
BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
# TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
else
BUILD_COMMAND=(python setup.py develop)
BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
fi
@ -1579,7 +1579,7 @@ test_operator_benchmark() {
test_inductor_set_cpu_affinity
cd benchmarks/operator_benchmark/pt_extension
python setup.py install
python -m pip install .
cd "${TEST_DIR}"/benchmarks/operator_benchmark
$TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \

View File

@ -42,7 +42,7 @@ call choco upgrade -y cmake --no-progress --installargs 'ADD_CMAKE_TO_PATH=Syste
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0
call pip install mkl==2024.2.0 mkl-static==2024.2.0 mkl-include==2024.2.0
if errorlevel 1 goto fail
if not errorlevel 0 goto fail

View File

@ -61,8 +61,8 @@ You are now all set to start developing with PyTorch in a DevContainer environme
## Step 8: Build PyTorch
To build pytorch from source, simply run:
```
python setup.py develop
```bash
python -m pip install --no-build-isolation -v -e .
```
The process involves compiling thousands of files and can take a long time. Fortunately, the compiled objects are reused in later builds: when you modify some files, only the changed files need to be recompiled the next time.

View File

@ -1 +1 @@
70caf76066ef2c1054d6128b11769dc816a779e7
6c57850358f34c47802db216b0746e4e9d08a95a

View File

@ -6,7 +6,7 @@ set -euxo pipefail
cd llm-target-determinator
pip install -q -r requirements.txt
cd ../codellama
pip install -e .
pip install --no-build-isolation -v -e .
pip install numpy==1.26.0
# Run indexer

View File

@ -131,6 +131,9 @@ jobs:
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
instructions: |
Build is done inside the container; to start an interactive session, run:
docker exec -it $(docker container ps --format '{{.ID}}') bash
# [pytorch repo ref]
# Use a pytorch/pytorch reference instead of a reference to the local

View File

@ -88,6 +88,14 @@ jobs:
pkill "${PROCESS}" || true
done
- name: Clean up brew miniconda, if installed
continue-on-error: true
run: |
if brew list miniconda; then
brew uninstall miniconda
echo "REINSTALL_BREW_MINICONDA=1" >> "${GITHUB_ENV}"
fi
- name: Clean up leftover local python3 site-packages on MacOS pet runner
continue-on-error: true
run: |
@ -268,6 +276,14 @@ jobs:
workflow_attempt: ${{github.run_attempt}}
local_path: usage_log.txt
- name: Reinstall brew miniconda, if was installed
if: always()
continue-on-error: true
run: |
if [[ -n "$REINSTALL_BREW_MINICONDA" ]]; then
brew install miniconda
fi
- name: Clean up disk space
if: always()
continue-on-error: true

View File

@ -63,6 +63,7 @@ jobs:
pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-rocm-n-1-py3,
pytorch-linux-jammy-rocm-n-py3,
pytorch-linux-noble-rocm-n-py3,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
pytorch-linux-jammy-py3.9-gcc11,
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,

View File

@ -36,15 +36,15 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-rocm-py3_10-build:
linux-noble-rocm-py3_12-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: linux-jammy-rocm-py3.10-mi300
name: linux-noble-rocm-py3.12-mi300
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10-mi300
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
build-environment: linux-noble-rocm-py3.12-mi300
docker-image-name: ci-image:pytorch-linux-noble-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
@ -57,17 +57,17 @@ jobs:
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
linux-noble-rocm-py3_12-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10-mi300
name: linux-noble-rocm-py3.12-mi300
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- linux-noble-rocm-py3_12-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10-mi300
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
build-environment: linux-noble-rocm-py3.12-mi300
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
secrets: inherit

View File

@ -231,7 +231,8 @@ include_patterns = [
'c10/**/*.cpp',
'c10/**/*.h',
'torch/*.h',
'torch/_inductor/codegen/aoti_runtime/interface.cpp',
'torch/_inductor/codegen/aoti_runtime/*.h',
'torch/_inductor/codegen/aoti_runtime/*.cpp',
'torch/csrc/*.h',
'torch/csrc/*.cpp',
'torch/csrc/**/*.h',
@ -1476,6 +1477,31 @@ init_command = [
'tomli==2.2.1 ; python_version < "3.11"',
]
[[linter]]
code = 'CMAKE_MINIMUM_REQUIRED'
command = [
'python3',
'tools/linter/adapters/cmake_minimum_required_linter.py',
'--',
'@{{PATHSFILE}}'
]
include_patterns = [
"**/pyproject.toml",
"**/CMakeLists.txt",
"**/CMakeLists.txt.in",
"**/*.cmake",
"**/*.cmake.in",
"**/*requirements*.txt",
"**/*requirements*.in",
]
init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'packaging==25.0',
'tomli==2.2.1 ; python_version < "3.11"',
]
[[linter]]
code = 'COPYRIGHT'
include_patterns = ['**']

View File

@ -88,20 +88,19 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
* If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below.
* When installing with `python setup.py develop` (in contrast to `python setup.py install`) Python runtime will use
* When installing with `python -m pip install -e .` (in contrast to `python -m pip install .`) Python runtime will use
the current local source-tree when importing `torch` package. (This is done by creating [`.egg-link`](https://wiki.python.org/moin/PythonPackagingTerminology#egg-link) file in `site-packages` folder)
This way you do not need to repeatedly install after modifying Python files (`.py`).
However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or
non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...).
However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...).
One way to avoid running `python setup.py develop` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac,
One way to avoid running `python -m pip install -e .` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac,
is to create a symbolic link from the `build` folder to `torch/lib`, for example, by issuing the following:
```bash
pushd torch/lib; sh -c "ln -sf ../../build/lib/libtorch_cpu.* ."; popd
pushd torch/lib; sh -c "ln -sf ../../build/lib/libtorch_cpu.* ."; popd
```
Afterwards rebuilding a library (for example to rebuild `libtorch_cpu.so` issue `ninja torch_cpu` from `build` folder),
would be sufficient to make change visible in `torch` package.
Afterwards rebuilding a library (for example to rebuild `libtorch_cpu.so` issue `ninja torch_cpu` from `build` folder),
would be sufficient to make change visible in `torch` package.
To reinstall, first uninstall all existing PyTorch installs. You may need to run `pip
@ -115,9 +114,9 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
pip uninstall torch
```
Next run `python setup.py clean`. After that, you can install in `develop` mode again.
Next run `python setup.py clean`. After that, you can install in editable mode again.
* If you run into errors when running `python setup.py develop`, here are some debugging steps:
* If you run into errors when running `python -m pip install -e .`, here are some debugging steps:
1. Run `printf '#include <stdio.h>\nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure
your CMake works and can compile this simple Hello World program without errors.
2. Nuke your `build` directory. The `setup.py` script compiles binaries into the `build` folder and caches many
@ -130,13 +129,20 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
git clean -xdf
python setup.py clean
git submodule update --init --recursive
python setup.py develop
python -m pip install -r requirements.txt
python -m pip install --no-build-isolation -v -e .
```
4. The main step within `python setup.py develop` is running `make` from the `build` directory. If you want to
4. The main step within `python -m pip install -e .` is running `cmake --build build` from the `build` directory. If you want to
experiment with some environment variables, you can pass them into the command:
```bash
ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* python setup.py develop
ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* CMAKE_FRESH=1 python -m pip install --no-build-isolation -v -e .
```
5. Try installing PyTorch without build isolation by adding `--no-build-isolation` to the `pip install` command.
This will use the current environment's packages instead of creating a new isolated environment for the build.
```bash
python -m pip install --no-build-isolation -v -e .
```
* If you run into issue running `git submodule update --init --recursive`. Please try the following:
- If you encounter an error such as
@ -639,9 +645,9 @@ can be selected interactively with your mouse to zoom in on a particular part of
the program execution timeline. The `--native` command-line option tells
`py-spy` to record stack frame entries for PyTorch C++ code. To get line numbers
for C++ code it may be necessary to compile PyTorch in debug mode by prepending
your `setup.py develop` call to compile PyTorch with `DEBUG=1`. Depending on
your operating system it may also be necessary to run `py-spy` with root
privileges.
your `python -m pip install -e .` call to compile PyTorch with `DEBUG=1`.
Depending on your operating system it may also be necessary to run `py-spy` with
root privileges.
`py-spy` can also work in an `htop`-like "live profiling" mode and can be
tweaked to adjust the stack sampling rate, see the `py-spy` readme for more
@ -649,7 +655,7 @@ details.
## Managing multiple build trees
One downside to using `python setup.py develop` is that your development
One downside to using `python -m pip install -e .` is that your development
version of PyTorch will be installed globally on your account (e.g., if
you run `import torch` anywhere else, the development version will be
used).
@ -663,7 +669,7 @@ specific build of PyTorch. To set one up:
python -m venv pytorch-myfeature
source pytorch-myfeature/bin/activate # or `& .\pytorch-myfeature\Scripts\Activate.ps1` on Windows
# if you run python now, torch will NOT be installed
python setup.py develop
python -m pip install --no-build-isolation -v -e .
```
## C++ development tips
@ -701,7 +707,9 @@ variables `DEBUG`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_CUDA`, `USE_FLASH_ATTEN
For example:
```bash
DEBUG=1 USE_DISTRIBUTED=0 USE_MKLDNN=0 USE_CUDA=0 BUILD_TEST=0 USE_FBGEMM=0 USE_NNPACK=0 USE_QNNPACK=0 USE_XNNPACK=0 python setup.py develop
DEBUG=1 USE_DISTRIBUTED=0 USE_MKLDNN=0 USE_CUDA=0 BUILD_TEST=0 \
USE_FBGEMM=0 USE_NNPACK=0 USE_QNNPACK=0 USE_XNNPACK=0 \
python -m pip install --no-build-isolation -v -e .
```
For subsequent builds (i.e., when `build/CMakeCache.txt` exists), the build
@ -711,7 +719,7 @@ options.
### Code completion and IDE support
When using `python setup.py develop`, PyTorch will generate
When using `python -m pip install -e .`, PyTorch will generate
a `compile_commands.json` file that can be used by many editors
to provide command completion and error highlighting for PyTorch's
C++ code. You need to `pip install ninja` to generate accurate
@ -772,7 +780,7 @@ If not, you can define these variables on the command line before invoking `setu
export CMAKE_C_COMPILER_LAUNCHER=ccache
export CMAKE_CXX_COMPILER_LAUNCHER=ccache
export CMAKE_CUDA_COMPILER_LAUNCHER=ccache
python setup.py develop
python -m pip install --no-build-isolation -v -e .
```
#### Use a faster linker
@ -785,7 +793,7 @@ If you are editing a single file and rebuilding in a tight loop, the time spent
Starting with CMake 3.29, you can specify the linker type using the [`CMAKE_LINKER_TYPE`](https://cmake.org/cmake/help/latest/variable/CMAKE_LINKER_TYPE.html) variable. For example, with `mold` installed:
```sh
CMAKE_LINKER_TYPE=MOLD python setup.py develop
CMAKE_LINKER_TYPE=MOLD python -m pip install --no-build-isolation -v -e .
```
#### Use pre-compiled headers
@ -797,7 +805,7 @@ setting `USE_PRECOMPILED_HEADERS=1` either on first setup, or in the
`CMakeCache.txt` file.
```sh
USE_PRECOMPILED_HEADERS=1 python setup.py develop
USE_PRECOMPILED_HEADERS=1 python -m pip install --no-build-isolation -v -e .
```
This adds a build step where the compiler takes `<ATen/ATen.h>` and essentially
@ -820,7 +828,7 @@ A compiler-wrapper to fix this is provided in `tools/nvcc_fix_deps.py`. You can
this as a compiler launcher, similar to `ccache`
```bash
export CMAKE_CUDA_COMPILER_LAUNCHER="python;`pwd`/tools/nvcc_fix_deps.py;ccache"
python setup.py develop
python -m pip install --no-build-isolation -v -e .
```
### Rebuild few files with debug information
@ -1171,7 +1179,7 @@ build_with_asan()
CFLAGS="-fsanitize=address -fno-sanitize-recover=all -shared-libasan -pthread" \
CXX_FLAGS="-pthread" \
USE_CUDA=0 USE_OPENMP=0 USE_DISTRIBUTED=0 DEBUG=1 \
python setup.py develop
python -m pip install --no-build-isolation -v -e .
}
run_with_asan()

View File

@ -57,7 +57,7 @@ RUN --mount=type=cache,target=/opt/ccache \
export eval ${CMAKE_VARS} && \
TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
python setup.py install
python -m pip install --no-build-isolation -v .
FROM conda as conda-installs
ARG PYTHON_VERSION=3.11

View File

@ -228,6 +228,7 @@ If you want to disable Intel GPU support, export the environment variable `USE_X
Other potentially useful environment variables may be found in `setup.py`.
#### Get the PyTorch Source
```bash
git clone https://github.com/pytorch/pytorch
cd pytorch
@ -279,24 +280,29 @@ conda install -c conda-forge libuv=1.39
```
#### Install PyTorch
**On Linux**
If you're compiling for AMD ROCm then first run this command:
```bash
# Only run this if you're compiling for ROCm
python tools/amd_build/build_amd.py
```
Install PyTorch
```bash
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
python setup.py develop
python -m pip install -r requirements.txt
python -m pip install --no-build-isolation -v -e .
```
**On macOS**
```bash
python3 setup.py develop
python -m pip install -r requirements.txt
python -m pip install --no-build-isolation -v -e .
```
**On Windows**
@ -308,7 +314,7 @@ If you want to build legacy python code, please refer to [Building on legacy cod
In this mode PyTorch computations will run on your CPU, not your GPU.
```cmd
python setup.py develop
python -m pip install --no-build-isolation -v -e .
```
Note on OpenMP: The desired OpenMP implementation is Intel OpenMP (iomp). In order to link against iomp, you'll need to manually download the library and set up the building environment by tweaking `CMAKE_INCLUDE_PATH` and `LIB`. The instruction [here](https://github.com/pytorch/pytorch/blob/main/docs/source/notes/windows.rst#building-from-source) is an example for setting up both MKL and Intel OpenMP. Without these configurations for CMake, Microsoft Visual C OpenMP runtime (vcomp) will be used.
@ -329,7 +335,6 @@ Additional libraries such as
You can refer to the [build_pytorch.bat](https://github.com/pytorch/pytorch/blob/main/.ci/pytorch/win-test-helpers/build_pytorch.bat) script for some other environment variables configurations
```cmd
cmd
@ -349,8 +354,7 @@ for /f "usebackq tokens=*" %i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\
:: [Optional] If you want to override the CUDA host compiler
set CUDAHOSTCXX=C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\HostX64\x64\cl.exe
python setup.py develop
python -m pip install --no-build-isolation -v -e .
```
**Intel GPU builds**
@ -372,7 +376,7 @@ if defined CMAKE_PREFIX_PATH (
set "CMAKE_PREFIX_PATH=%CONDA_PREFIX%\Library"
)
python setup.py develop
python -m pip install --no-build-isolation -v -e .
```
##### Adjust Build Options (Optional)
@ -382,6 +386,7 @@ the following. For example, adjusting the pre-detected directories for CuDNN or
with such a step.
On Linux
```bash
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
CMAKE_ONLY=1 python setup.py build
@ -389,6 +394,7 @@ ccmake build # or cmake-gui build
```
On macOS
```bash
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ CMAKE_ONLY=1 python setup.py build

View File

@ -131,69 +131,25 @@ uint64_t CPUGeneratorImpl::seed() {
/**
* Sets the internal state of CPUGeneratorImpl. The new internal state
* must be a strided CPU byte tensor and of the same size as either
* CPUGeneratorImplStateLegacy (for legacy CPU generator state) or
* CPUGeneratorImplState (for new state).
*
* FIXME: Remove support of the legacy state in the future?
* must be a strided CPU byte tensor and of the same size as CPUGeneratorImplState.
*/
void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
using detail::CPUGeneratorImplState;
using detail::CPUGeneratorImplStateLegacy;
static_assert(std::is_standard_layout_v<CPUGeneratorImplStateLegacy>, "CPUGeneratorImplStateLegacy is not a PODType");
static_assert(std::is_standard_layout_v<CPUGeneratorImplState>, "CPUGeneratorImplState is not a PODType");
static const size_t size_legacy = sizeof(CPUGeneratorImplStateLegacy);
static const size_t size_current = sizeof(CPUGeneratorImplState);
static_assert(size_legacy != size_current, "CPUGeneratorImplStateLegacy and CPUGeneratorImplState can't be of the same size");
constexpr size_t size = sizeof(CPUGeneratorImplState);
detail::check_rng_state(new_state);
at::mt19937 engine;
auto float_normal_sample = std::optional<float>();
auto double_normal_sample = std::optional<double>();
// Construct the state of at::CPUGeneratorImpl based on input byte tensor size.
CPUGeneratorImplStateLegacy* legacy_pod{nullptr};
auto new_state_size = new_state.numel();
if (new_state_size == size_legacy) {
legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data();
// Note that in CPUGeneratorImplStateLegacy, we didn't have float version
// of normal sample and hence we leave the std::optional<float> as is
// Update next_double_normal_sample.
// Note that CPUGeneratorImplStateLegacy stores two uniform values (normal_x, normal_y)
// and a rho value (normal_rho). These three values were redundant and in the new
// DistributionsHelper.h, we store the actual extra normal sample, rather than three
// intermediate values.
if (legacy_pod->normal_is_valid) {
auto r = legacy_pod->normal_rho;
auto theta = 2.0 * c10::pi<double> * legacy_pod->normal_x;
// we return the sin version of the normal sample when in caching mode
double_normal_sample = std::optional<double>(r * ::sin(theta));
}
} else if (new_state_size == size_current) {
auto rng_state = (CPUGeneratorImplState*)new_state.data();
legacy_pod = &rng_state->legacy_pod;
// update next_float_normal_sample
if (rng_state->is_next_float_normal_sample_valid) {
float_normal_sample = std::optional<float>(rng_state->next_float_normal_sample);
}
// Update next_double_normal_sample.
// Note that in getRNGState, we now return the actual normal sample in normal_y
// and if it's valid in normal_is_valid. The redundant normal_x and normal_rho
// are squashed to 0.0.
if (legacy_pod->normal_is_valid) {
double_normal_sample = std::optional<double>(legacy_pod->normal_y);
}
} else {
TORCH_CHECK(false, "Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy,
" or a CPUGeneratorImplState of size ", size_current,
" but found the input RNG state size to be ", new_state_size);
}
TORCH_CHECK(new_state_size == size, "Expected a CPUGeneratorImplState of size ", size,
" but found the input RNG state size to be ", new_state_size);
auto rng_state = new_state.data_ptr_impl<CPUGeneratorImplState>();
auto legacy_pod = &(rng_state->legacy_pod);
// construct engine_
// Note that CPUGeneratorImplStateLegacy stored a state array of 64 bit uints, whereas in our
// redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are
@ -207,8 +163,12 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
engine.set_data(rng_data);
TORCH_CHECK(engine.is_valid(), "Invalid mt19937 state");
this->engine_ = engine;
this->next_float_normal_sample_ = float_normal_sample;
this->next_double_normal_sample_ = double_normal_sample;
this->next_float_normal_sample_ = rng_state->is_next_float_normal_sample_valid
? std::optional<float>(rng_state->next_float_normal_sample)
: std::optional<float>();
this->next_double_normal_sample_ = legacy_pod->normal_is_valid
? std::optional<double>(legacy_pod->normal_y)
: std::optional<double>();
}
/**
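For orientation, the Python-level generator API goes through this code path. Below is a small hedged sketch of the round trip: the state tensor is the byte view of `CPUGeneratorImplState`, and with the legacy-size path removed only states of the current size are accepted.
```python
import torch

g = torch.Generator()            # CPU generator backed by CPUGeneratorImpl
state = g.get_state()            # strided CPU uint8 tensor holding the state bytes
assert state.dtype == torch.uint8

_ = torch.rand(3, generator=g)   # advance the engine
g.set_state(state)               # restore; the tensor size must match exactly
```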

View File

@ -431,7 +431,8 @@ class TORCH_API Context {
at::SDPBackend::flash_attention,
at::SDPBackend::efficient_attention,
at::SDPBackend::math,
at::SDPBackend::cudnn_attention};
at::SDPBackend::cudnn_attention,
at::SDPBackend::overrideable};
bool enabled_flashSDP = true;
bool enabled_mem_efficientSDP = true;
bool enabled_mathSDP = true;

View File

@ -26,9 +26,7 @@ inline void infer_size_impl(
std::optional<int64_t> infer_dim;
for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) {
if (TORCH_GUARD_OR_FALSE(sym_eq(shape[dim], -1))) {
if (infer_dim) {
throw std::runtime_error("only one dimension can be inferred");
}
TORCH_CHECK(!infer_dim, "only one dimension can be inferred");
infer_dim = dim;
} else {
// in case of unbacked shape[dim] we assume it's not -1 and add a runtime
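For reference, a small Python sketch of the user-visible behavior this check backs (the error text comes from the `TORCH_CHECK` above):
```python
import torch

x = torch.arange(12)
print(x.view(3, -1).shape)   # torch.Size([3, 4]); the single -1 is inferred

try:
    x.view(-1, -1)           # two inferred dimensions are rejected
except RuntimeError as e:
    print(e)                 # "only one dimension can be inferred"
```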

View File

@ -214,7 +214,7 @@ inline Tensor applySlice(
"step must be greater than zero");
// See NOTE [nested tensor size for indexing]
if (self_sizes.has_value()) {
if (self_sizes.has_value() && self_sizes.value().size() > 0) {
// Skip this optimization if we are tracing, as the trace may be polymorphic
// over the shape of the `self` tensor, and we still want to record
// the slice.
@ -223,7 +223,7 @@ inline Tensor applySlice(
: self.sym_size(dim);
if (!disable_slice_optimization &&
TORCH_STATICALLY_KNOWN_TRUE(start.sym_eq(0)) &&
TORCH_STATICALLY_KNOWN_TRUE(length.sym_eq(stop)) && step == 1) {
TORCH_STATICALLY_KNOWN_TRUE(length.sym_le(stop)) && step == 1) {
return self;
}
}

View File

@ -59,9 +59,7 @@ struct TORCH_API Generator {
explicit Generator(c10::intrusive_ptr<c10::GeneratorImpl> gen_impl)
: impl_(std::move(gen_impl)) {
if (impl_.get() == nullptr) {
throw std::runtime_error("GeneratorImpl with nullptr is not supported");
}
TORCH_CHECK(impl_, "GeneratorImpl with nullptr is not supported");
}
bool operator==(const Generator& rhs) const {

View File

@ -98,9 +98,7 @@ class TORCH_API TensorBase {
explicit TensorBase(
c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
: impl_(std::move(tensor_impl)) {
if (impl_.get() == nullptr) {
throw std::runtime_error("TensorImpl with nullptr is not supported");
}
TORCH_CHECK(impl_, "TensorImpl with nullptr is not supported");
}
TensorBase(const TensorBase&) = default;
TensorBase(TensorBase&&) noexcept = default;

View File

@ -68,11 +68,10 @@ Symbol InternedStrings::_symbol(const std::string& s) {
return it->second;
auto pos = s.find("::");
if (pos == std::string::npos) {
std::stringstream ss;
ss << "all symbols must have a namespace, <namespace>::<string>, but found: " << s;
throw std::runtime_error(ss.str());
}
TORCH_CHECK(
pos != std::string::npos,
"all symbols must have a namespace, <namespace>::<string>, but found: ",
s);
Symbol ns = _symbol("namespaces::" + s.substr(0, pos));
Symbol sym(sym_to_info_.size());
@ -121,12 +120,11 @@ std::string Symbol::domainString() const {
}
Symbol Symbol::fromDomainAndUnqualString(const std::string & d, const std::string & s) {
if (d.compare(0, domain_prefix().size(), domain_prefix()) != 0) {
std::ostringstream ss;
ss << "Symbol: domain string is expected to be prefixed with '"
<< domain_prefix() << "', e.g. 'org.pytorch.aten'";
throw std::runtime_error(ss.str());
}
TORCH_CHECK(
d.compare(0, domain_prefix().size(), domain_prefix()) == 0,
"Symbol: domain string is expected to be prefixed with '",
domain_prefix(),
"', e.g. 'org.pytorch.aten'");
std::string qualString = d.substr(domain_prefix().size()) + "::" + s;
return fromQualString(qualString);
}

View File

@ -406,8 +406,7 @@ size_t IValue::hash(const IValue& v) {
case Tag::Enum:
case Tag::Stream:
case Tag::Uninitialized:
throw std::runtime_error(
"unhashable type: '" + v.type()->repr_str() + "'");
TORCH_CHECK(false, "unhashable type: '" + v.type()->repr_str() + "'");
}
// the above switch should be exhaustive
TORCH_INTERNAL_ASSERT(false, "we should never reach here")

View File

@ -116,10 +116,9 @@ struct SingleElementType : public SharedType {
protected:
SingleElementType(TypePtr elem) : SharedType(Kind), elem(std::move(elem)) {
if (!this->elem) {
throw std::runtime_error(c10::str(
"Can not create ", typeKindToString(Kind), " with None type"));
}
TORCH_CHECK(
this->elem,
c10::str("Can not create ", typeKindToString(Kind), " with None type"));
}
private:
@ -416,16 +415,12 @@ struct TORCH_API SymbolicShape {
}
ShapeSymbol operator[](size_t i) const {
if (!dims_) {
throw std::runtime_error("Rank isn't fixed");
}
TORCH_CHECK(dims_, "Rank isn't fixed");
return (*dims_).at(i);
}
ShapeSymbol at(size_t i) const {
if (!dims_) {
throw std::runtime_error("Rank isn't fixed");
}
TORCH_CHECK(dims_, "Rank isn't fixed");
return (*dims_).at(i);
}
@ -520,9 +515,7 @@ struct VaryingShape {
}
const std::optional<T> &operator[](size_t i) const {
if (!dims_) {
throw std::runtime_error("Rank isn't fixed");
}
TORCH_CHECK(dims_, "Rank isn't fixed");
return (*dims_).at(i);
}
@ -957,9 +950,7 @@ struct TORCH_API DictType : public SharedType {
TypePtr createWithContained(
std::vector<TypePtr> contained_types) const override {
if (contained_types.size() != 2) {
throw std::runtime_error("Expected 2 contained types");
}
TORCH_CHECK(contained_types.size() == 2, "Expected 2 contained types");
return create(std::move(contained_types.at(0)), std::move(contained_types.at(1)));
}

View File

@ -826,9 +826,7 @@ TupleType::TupleType(
: NamedType(TypeKind::TupleType, std::move(name)),
elements_(std::move(elements)),
has_free_variables_(std::any_of(elements_.begin(), elements_.end(), [](const TypePtr& v) {
if (!v) {
throw std::runtime_error("Can not create tuple with None type");
}
TORCH_CHECK(v, "Can not create tuple with None type");
return v->hasFreeVariables();
})), schema_(std::move(schema)) {

View File

@ -163,6 +163,9 @@ class Vectorized<BFloat16> {
Vectorized<BFloat16> exp_u20() const {
return exp();
}
Vectorized<BFloat16> fexp_u20() const {
return exp();
}
Vectorized<BFloat16> fmod(const Vectorized<BFloat16>& q) const;
Vectorized<BFloat16> hypot(const Vectorized<BFloat16>& b) const;
Vectorized<BFloat16> i0() const;

View File

@ -249,6 +249,9 @@ class Vectorized<double> {
Vectorized<double> exp_u20() const {
return exp();
}
Vectorized<double> fexp_u20() const {
return exp();
}
Vectorized<double> fmod(const Vectorized<double>& q) const {USE_SLEEF(
{ return Vectorized<double>(Sleef_fmoddx_sve(values, q)); },
{

View File

@ -314,6 +314,9 @@ class Vectorized<float> {
Vectorized<float> exp_u20() const {
return exp();
}
Vectorized<float> fexp_u20() const {
return exp();
}
Vectorized<float> fmod(const Vectorized<float>& q) const {USE_SLEEF(
{ return Vectorized<float>(Sleef_fmodfx_sve(values, q)); },
{

View File

@ -308,6 +308,9 @@ class Vectorized<float> {
Vectorized<float> exp_u20() const {
return exp();
}
Vectorized<float> fexp_u20() const {
return exp();
}
DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(
fmod,
Sleef_fmodf4)

View File

@ -206,6 +206,10 @@ struct Vectorized16 {
return static_cast<const Derived*>(this)->map_with_vec_float_method(
&Vectorized<float>::exp_u20);
}
Derived fexp_u20() const {
return static_cast<const Derived*>(this)->map_with_vec_float_method(
&Vectorized<float>::exp_u20);
}
Derived fmod(const Derived& q) const {
// This function is questionable with a conversion, so we use map2
return map2(q, std::fmod);

View File

@ -488,6 +488,9 @@ class Vectorized16 {
Vectorized<T> expm1() const {
return map(Sleef_expm1f8_u10);
}
Vectorized<T> fexp_u20() const {
return exp();
}
Vectorized<T> exp_u20() const {
return exp();
}

View File

@ -198,6 +198,9 @@ class Vectorized<double> {
Vectorized<double> exp_u20() const {
return exp();
}
Vectorized<double> fexp_u20() const {
return exp();
}
Vectorized<double> fmod(const Vectorized<double>& q) const {
return Vectorized<double>(Sleef_fmodd4(values, q));
}

View File

@ -1,5 +1,4 @@
#pragma once
// DO NOT DEFINE STATIC DATA IN THIS HEADER!
// See Note [Do not compile initializers with AVX]
@ -256,6 +255,63 @@ class Vectorized<float> {
Vectorized<float> expm1() const {
return Vectorized<float>(Sleef_expm1f8_u10(values));
}
Vectorized<float> fexp_u20() const {
const __m256 vec_c0 = _mm256_set1_ps(0.00010703434948458272f);
const __m256 vec_c1 = _mm256_set1_ps(0.30354260500649682f);
const __m256 vec_c2 = _mm256_set1_ps(-0.22433836478672356);
const __m256 vec_c3 = _mm256_set1_ps(-0.079204240219773236);
const __m256 vec_exp_log2ef =
_mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e)
const __m256 vec_a = _mm256_set1_ps(std::pow(2, 23) / std::log2(2));
const __m256 vec_b = _mm256_set1_ps(std::pow(2, 23) * 127.f);
const __m256 vec_ln_flt_min =
_mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50));
const __m256 vec_ln_flt_max =
_mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218));
const __m256 vec_inf = _mm256_set1_ps(INFINITY);
const __m256 zero = _mm256_setzero_ps();
// exp(x) = 2**(x * log2(e))
// = 2**xi * 2**xf - TIP: we use the IEEE floating-point
// representation, splitting the result into the exponent and the
// mantissa
// 2**xf will be approximated by a polynomial of degree 3 computed with
// the Horner method
// compute the min/max for the mask
// Masks
__m256 mask_too_small =
_mm256_cmp_ps(values, vec_ln_flt_min, _CMP_LT_OS); // x < min
__m256 mask_too_large =
_mm256_cmp_ps(values, vec_ln_flt_max, _CMP_GT_OS); // x > max
// transformation with log2(e)
auto vec_src = _mm256_mul_ps(values, vec_exp_log2ef);
auto vec_fractional = _mm256_sub_ps(vec_src, _mm256_floor_ps(vec_src));
// compute polynomial using Horner Scheme
auto vec_res = _mm256_fmadd_ps(vec_fractional, vec_c3, vec_c2);
vec_res = _mm256_fmadd_ps(vec_fractional, vec_res, vec_c1);
vec_res = _mm256_fmadd_ps(vec_fractional, vec_res, vec_c0);
vec_src = _mm256_sub_ps(vec_src, vec_res);
// the trick is here, headache in perspective
auto tmp = _mm256_fmadd_ps(vec_a, vec_src, vec_b);
// headache bis
__m256i casted_integer = _mm256_cvttps_epi32(tmp);
// bitwise to float for the final transformation
auto result = _mm256_castsi256_ps(casted_integer);
// boundary condition
// Set to 0 where x < ln(FLT_MIN)
result = _mm256_blendv_ps(result, zero, mask_too_small);
// Set to +inf where x > ln(FLT_MAX)
result = _mm256_blendv_ps(result, vec_inf, mask_too_large);
// final interpretation to float
return result;
}
Vectorized<float> exp_u20() const {
// A faster version of exp with ULP=20
const __m256 vec_factorial_1 =
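To make the bit-level trick in `fexp_u20` above easier to follow, here is a rough scalar model in Python. It is only an illustration, assuming nothing beyond the constants visible in the kernel above, and is not the code that ships; results differ slightly from both the SIMD kernel and `math.exp`.
```python
import math
import struct

# Coefficients and bounds copied from the vectorized kernel above.
C0 = 0.00010703434948458272
C1 = 0.30354260500649682
C2 = -0.22433836478672356
C3 = -0.079204240219773236
LOG2E = math.log2(math.e)                      # 0x3fb8aa3b
LN_FLT_MIN = math.log(2.0 ** -126)             # 0xc2aeac50
LN_FLT_MAX = math.log(3.4028234663852886e38)   # 0x42b17218

def fexp_u20_scalar(x: float) -> float:
    """Scalar illustration of the reduced-precision fast exp."""
    if x < LN_FLT_MIN:
        return 0.0
    if x > LN_FLT_MAX:
        return math.inf
    t = x * LOG2E                          # exp(x) = 2**t
    frac = t - math.floor(t)               # fractional part of the exponent
    # Degree-3 Horner polynomial correcting 2**frac.
    poly = ((C3 * frac + C2) * frac + C1) * frac + C0
    t -= poly
    # Write t straight into the float's exponent/mantissa bits:
    # bits = 2**23 * t + 2**23 * 127, truncated, then reinterpreted as float.
    bits = int(2.0 ** 23 * t + 2.0 ** 23 * 127.0)
    return struct.unpack("f", struct.pack("I", bits))[0]

print(fexp_u20_scalar(1.0), math.exp(1.0))  # ~2.718 vs 2.71828...; accuracy traded for speed
```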

View File

@ -121,27 +121,52 @@ typename std::enable_if_t<
}
template <typename T>
typename std::enable_if_t<
std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
at::vec::Vectorized<
T>> inline convert_float_to_int8(at::vec::Vectorized<float> src) {
at::vec::Vectorized<T> inline convert_float_to_int8(
at::vec::Vectorized<float> src);
template <>
at::vec::Vectorized<int8_t> inline convert_float_to_int8(
at::vec::Vectorized<float> src) {
// Convert from float32 to int32 with truncation
__m256i x_values_int32 = _mm256_cvttps_epi32(src);
// Convert from int32 to int16 using signed saturation
__m256i xy_packed_v = _mm256_packs_epi32(x_values_int32, x_values_int32);
constexpr auto min_val = std::numeric_limits<T>::min();
constexpr auto max_val = std::numeric_limits<T>::max();
constexpr auto min_val = std::numeric_limits<int8_t>::min();
constexpr auto max_val = std::numeric_limits<int8_t>::max();
// Convert from int16 to uint8/int8 using unsigned saturation
__m256i xyzw_clamped_v =
pack_saturate_and_clamp<T>(xy_packed_v, xy_packed_v, min_val, max_val);
// Convert from int16 to int8 using unsigned saturation
__m256i xyzw_clamped_v = pack_saturate_and_clamp<int8_t>(
xy_packed_v, xy_packed_v, min_val, max_val);
__m256i permute_mask_v =
_mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
return _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v);
}
template <>
at::vec::Vectorized<uint8_t> inline convert_float_to_int8(
at::vec::Vectorized<float> src) {
// The type of *_val should be int32_t to ensure correct clamping behavior.
constexpr auto min_val = std::numeric_limits<int32_t>::min();
constexpr auto max_val = std::numeric_limits<int32_t>::max();
__m256 float32_min_val = _mm256_set1_ps(float(min_val));
__m256 float32_max_val = _mm256_set1_ps(float(max_val));
__m256 float32_src = _mm256_max_ps(src, float32_min_val);
float32_src = _mm256_min_ps(float32_src, float32_max_val);
__m256i truncated_src = _mm256_cvttps_epi32(float32_src);
__m128i r1 = _mm256_castsi256_si128(truncated_src);
__m128i mask = _mm_setr_epi8(
0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m128i r1_shuffled = _mm_shuffle_epi8(r1, mask);
__m128i r2 = _mm256_extractf128_si256(truncated_src, 1);
__m128i r2_shuffled = _mm_shuffle_epi8(r2, mask);
__m128i result = _mm_unpacklo_epi32(r1_shuffled, r2_shuffled);
return _mm256_castsi128_si256(result);
}
template <typename T>
__FORCE_INLINE void QuantizeAvx2(
const float* src,

View File

@ -273,6 +273,9 @@ class Vectorized<double> {
Vectorized<double> C10_ALWAYS_INLINE exp_u20() const {
return exp();
}
Vectorized<double> C10_ALWAYS_INLINE fexp_u20() const {
return exp();
}
Vectorized<double> lgamma() const __ubsan_ignore_undefined__ {
return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)};

View File

@ -352,6 +352,9 @@ class Vectorized<float> {
Vectorized<float> C10_ALWAYS_INLINE exp_u20() const {
return exp();
}
Vectorized<float> C10_ALWAYS_INLINE fexp_u20() const {
return exp();
}
Vectorized<float> C10_ALWAYS_INLINE log() const {
return {Sleef_logf4_u10(_vec0), Sleef_logf4_u10(_vec1)};

View File

@ -1023,6 +1023,9 @@ struct Vectorized<T, std::enable_if_t<is_zarch_implemented<T>()>> {
Vectorized<T> exp_u20() const {
return exp();
}
Vectorized<T> fexp_u20() const {
return exp();
}
Vectorized<T> log() const {
return mapSleef(Sleef_logf4_u10, Sleef_logd2_u10);

View File

@ -535,6 +535,9 @@ class Vectorized16 {
Vectorized<T> expm1() const {
return map(Sleef_expm1f16_u10);
}
Vectorized<T> fexp_u20() const {
return exp();
}
Vectorized<T> exp_u20() const {
return exp();
}

View File

@ -221,6 +221,9 @@ class Vectorized<double> {
Vectorized<double> exp_u20() const {
return exp();
}
Vectorized<double> fexp_u20() const {
return exp();
}
Vectorized<double> fmod(const Vectorized<double>& q) const {
return Vectorized<double>(Sleef_fmodd8(values, q));
}

View File

@ -310,6 +310,60 @@ class Vectorized<float> {
Vectorized<float> expm1() const {
return Vectorized<float>(Sleef_expm1f16_u10(values));
}
Vectorized<float> fexp_u20() const {
const __m512 vec_c0 = _mm512_set1_ps(0.00010703434948458272f);
const __m512 vec_c1 = _mm512_set1_ps(0.30354260500649682f);
const __m512 vec_c2 = _mm512_set1_ps(-0.22433836478672356);
const __m512 vec_c3 = _mm512_set1_ps(-0.079204240219773236);
const __m512 vec_exp_log2ef =
_mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e)
const __m512 vec_a = _mm512_set1_ps(std::pow(2, 23) / std::log2(2));
const __m512 vec_b = _mm512_set1_ps(std::pow(2, 23) * 127.f);
const __m512 vec_ln_flt_min =
_mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
const __m512 vec_ln_flt_max =
_mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
__m512i vec_infinity = _mm512_set1_epi32(0x7F800000);
__m512i vec_zero = _mm512_setzero_epi32();
// Fast Exponential Computation on SIMD Architectures
// A. Cristiano I. Malossi, Yves Ineichen, Costas Bekas, and Alessandro Curioni
// exp(x) = 2**(x * log2(e))
// = 2**xi * 2**xf - TIP: we use the IEEE floating-point
// representation, splitting the result into the exponent and the
// mantissa
// 2**xf will be approximated by a polynomial of degree 3 computed with
// the Horner method
// mask for the boundary condition
auto min_mask = _mm512_cmp_ps_mask(values, vec_ln_flt_min, _CMP_LT_OS);
auto max_mask = _mm512_cmp_ps_mask(values, vec_ln_flt_max, _CMP_GT_OS);
// transformation with log2(e)
auto vec_src = _mm512_mul_ps(values, vec_exp_log2ef);
auto vec_fractional = _mm512_sub_ps(vec_src, _mm512_floor_ps(vec_src));
// compute polynomial using Horner Scheme, for superscalar processor
auto vec_res = _mm512_fmadd_ps(vec_fractional, vec_c3, vec_c2);
vec_res = _mm512_fmadd_ps(vec_fractional, vec_res, vec_c1);
vec_res = _mm512_fmadd_ps(vec_fractional, vec_res, vec_c0);
vec_src = _mm512_sub_ps(vec_src, vec_res);
// the trick is here, headache in perspective
auto tmp = _mm512_fmadd_ps(vec_a, vec_src, vec_b);
// headache bis - we lose precision with the cast but it "fits", which is
// fine after the later f32 -> f16 conversion
__m512i casted_integer = _mm512_cvttps_epi32(tmp);
// boundary condition, lower than the min -> 0
casted_integer = _mm512_mask_mov_epi32(casted_integer, min_mask, vec_zero);
// boundary condition, larger than the max -> +oo
casted_integer =
_mm512_mask_mov_epi32(casted_integer, max_mask, vec_infinity);
// final interpretation to float
return _mm512_castsi512_ps(casted_integer);
}
Vectorized<float> exp_u20() const {
// A faster version of exp with ULP=20
const __m512 vec_factorial_1 =

View File

@ -123,22 +123,24 @@ typename std::enable_if_t<
}
template <typename T>
typename std::enable_if_t<
std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
at::vec::Vectorized<
T>> inline convert_float_to_int8(at::vec::Vectorized<float> src) {
at::vec::Vectorized<T> inline convert_float_to_int8(
at::vec::Vectorized<float> src);
template <>
at::vec::Vectorized<int8_t> inline convert_float_to_int8(
at::vec::Vectorized<float> src) {
// Convert from float32 to int32 with truncation
__m512i x_values_int32 = _mm512_cvttps_epi32(src);
// Convert from int32 to int16 using signed saturation
__m512i xy_packed_v = _mm512_packs_epi32(x_values_int32, x_values_int32);
constexpr auto min_val = std::numeric_limits<T>::min();
constexpr auto max_val = std::numeric_limits<T>::max();
constexpr auto min_val = std::numeric_limits<int8_t>::min();
constexpr auto max_val = std::numeric_limits<int8_t>::max();
// Convert from int16 to uint8/int8 using unsigned saturation
__m512i xyzw_clamped_v =
pack_saturate_and_clamp<T>(xy_packed_v, xy_packed_v, min_val, max_val);
// Convert from int16 to int8 using unsigned saturation
__m512i xyzw_clamped_v = pack_saturate_and_clamp<int8_t>(
xy_packed_v, xy_packed_v, min_val, max_val);
__m512i permute_mask_v = _mm512_set_epi32(
0x0f,
0x0b,
@ -159,6 +161,21 @@ typename std::enable_if_t<
return _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v);
}
template <>
at::vec::Vectorized<uint8_t> inline convert_float_to_int8(
at::vec::Vectorized<float> src) {
// The type of *_val should be int32_t to ensure correct clamping behavior.
constexpr auto min_val = std::numeric_limits<int32_t>::min();
constexpr auto max_val = std::numeric_limits<int32_t>::max();
__m512 float32_min_val = _mm512_set1_ps(float(min_val));
__m512 float32_max_val = _mm512_set1_ps(float(max_val));
__m512 float32_src = _mm512_max_ps(src, float32_min_val);
float32_src = _mm512_min_ps(float32_src, float32_max_val);
__m512i int32_src_clamped = _mm512_cvttps_epi32(float32_src);
__m128i int8_src = _mm512_cvtepi32_epi8(int32_src_clamped);
return _mm512_castsi128_si512(int8_src);
}
template <typename T>
__FORCE_INLINE void QuantizeAvx512(
const float* src,

View File

@ -238,9 +238,6 @@ struct Vectorized {
Vectorized vector;
int_same_size_t<T> buffer[size()];
mask.store(buffer);
#if defined(__clang__) && __ARM_FEATURE_SVE
#pragma clang loop vectorize(disable)
#endif
for (const auto i : c10::irange(size())) {
if (buffer[i] & 0x01) {
vector[i] = b[i];
@ -547,6 +544,9 @@ struct Vectorized {
Vectorized<T> exp_u20() const {
return map(std::exp);
}
Vectorized<T> fexp_u20() const {
return map(std::exp);
}
Vectorized<T> frac() const {
return *this - this->trunc();
}

View File

@ -263,6 +263,7 @@ class VectorizedN {
VECTORIZEDN_DEFINE_UNARY_OP(exp2)
VECTORIZEDN_DEFINE_UNARY_OP(expm1)
VECTORIZEDN_DEFINE_UNARY_OP(exp_u20)
VECTORIZEDN_DEFINE_UNARY_OP(fexp_u20)
VECTORIZEDN_DEFINE_UNARY_OP(frac)
VECTORIZEDN_DEFINE_BINARY_OP(fmod)
VECTORIZEDN_DEFINE_UNARY_OP(log)

View File

@ -94,9 +94,10 @@ static std::vector<std::optional<Tensor>> batchIndices(
if (index.has_value() && index->sym_numel() != 0) {
const auto idx_bdim = indices_bdims[i];
indices_.emplace_back(maybePadToLogicalRank(moveBatchDimToFront(index.value(), idx_bdim), idx_bdim, maxLogicalRank));
if (index.value().dtype() == kBool && indices_bdims[i].has_value()) {
throw std::runtime_error("vmap: We do not support batching operators that can support dynamic shape. Attempting to batch over indexing with a boolean mask.");
}
TORCH_CHECK(
index.value().dtype() != kBool || !indices_bdims[i].has_value(),
"vmap: We do not support batching operators that can support ",
"dynamic shape. Attempting to batch over indexing with a boolean mask.");
} else {
indices_.push_back(index);
}

View File

@ -16,11 +16,14 @@ template<typename O, typename C>
static void _assert_match(const O& original, const C& compared, const std::string& name) {
if (compared) {
bool equal = (original == compared.value());
if (!equal) {
std::stringstream msg;
msg << "Tensor " << name << " mismatch! Expected: " << compared.value() << ", Got: " << original;
throw std::runtime_error(msg.str());
}
TORCH_CHECK(
equal,
"Tensor ",
name,
" mismatch! Expected: ",
compared.value(),
", Got: ",
original);
}
}

View File

@ -424,6 +424,14 @@ Tensor _dirichlet_grad_cpu(const Tensor& x, const Tensor& alpha, const Tensor& t
*/
Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, std::optional<Generator> gen) {
TORCH_CHECK_VALUE(
at::isFloatingType(count.scalar_type()),
"binomial only supports floating-point dtypes for count, got: ",
count.scalar_type());
TORCH_CHECK_VALUE(
at::isFloatingType(prob.scalar_type()),
"binomial only supports floating-point dtypes for prob, got: ",
prob.scalar_type());
Tensor ret = at::zeros(count.sizes(), count.options());
auto iter = TensorIteratorConfig()
.add_output(ret)
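A hedged sketch of the user-facing effect in Python; the exact error type and text depend on how `TORCH_CHECK_VALUE` surfaces (typically as `ValueError`).
```python
import torch

count = torch.tensor([10.0, 20.0])   # floating point, as required
prob = torch.tensor([0.5, 0.25])
print(torch.binomial(count, prob))

try:
    torch.binomial(torch.tensor([10, 20]), prob)  # integer count is now rejected up front
except (ValueError, RuntimeError) as e:
    print(e)
```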

View File

@ -180,9 +180,7 @@ TORCH_IMPL_FUNC(triu_cpu)(const Tensor& self, int64_t k, const Tensor &result) {
}
Tensor trace_backward_symint(const Tensor& grad, c10::SymIntArrayRef sizes) {
if (sizes.size() != 2) {
throw std::runtime_error("expected matrix input");
}
TORCH_CHECK(sizes.size() == 2, "expected matrix input");
auto grad_input = at::zeros_symint(sizes[0] * sizes[1], grad.options());
auto indices = at::arange(0, grad_input.numel(), sizes[1] + 1, grad.options().dtype(at::kLong));
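For orientation, a rough Python mirror of this backward construction (the helper name is hypothetical; it simply scatters the incoming gradient onto the diagonal, as the C++ above does):
```python
import torch

def trace_backward_ref(grad, sizes):
    # Hypothetical reference: scatter `grad` onto the diagonal of a
    # zero matrix of the given 2-D shape.
    assert len(sizes) == 2, "expected matrix input"
    rows, cols = sizes
    grad_input = torch.zeros(rows * cols, dtype=grad.dtype)
    idx = torch.arange(0, grad_input.numel(), cols + 1)
    grad_input[idx] = grad
    return grad_input.view(rows, cols)

x = torch.eye(3, requires_grad=True)
x.trace().backward()
print(torch.allclose(x.grad, trace_backward_ref(torch.tensor(1.0), x.shape)))  # True
```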

View File

@ -62,7 +62,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
virtual std::optional<at::Tensor> bias() = 0;
virtual void set_bias(const std::optional<at::Tensor>& bias) {
throw std::runtime_error(
TORCH_CHECK(
false,
"set_bias is not implemented for this packed "
"parameter type");
}

View File

@ -96,7 +96,14 @@ inline void _exp_reduce_sum_fusion_kernel(
for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) {
auto tmp0 = vec::Vectorized<T1>::loadu(a + i);
auto tmp1 = tmp0 - vec_max;
auto tmp2 = tmp1.exp_u20();
Vectorized<T1> tmp2;
if constexpr (std::is_same_v<T1, float> &&
(std::is_same_v<T2, at::BFloat16> || std::is_same_v<T2, at::Half>))
{
tmp2 = tmp1.fexp_u20();
} else {
tmp2 = tmp1.exp_u20();
}
vec_tmp_sum += tmp2;
_store(out + i, tmp2);
}

View File

@ -48,12 +48,7 @@ __global__ void prepare_grouped_gemm_data(
int32_t start = tid == 0 ? 0 : offs[tid - 1];
delta = offs[tid] - start;
if (K < 0) {
if (!a_row_major && b_row_major) {
CUDA_KERNEL_ASSERT(delta >=0 && "expected offsets to be greater or equal 0\n");
} else {
// CUTLASS cannot handle delta=0 here.
CUDA_KERNEL_ASSERT(delta >0 && "expected offsets to be greater than 0\n");
}
CUDA_KERNEL_ASSERT(delta >=0 && "expected offsets to be greater or equal 0\n");
}
// TMA transfers require global memory tensor addresses to be

View File

@ -337,6 +337,7 @@ Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization,
#include <cmath>
#include <mkl_dfti.h>
#include <mkl_version.h>
#include <ATen/mkl/Exceptions.h>
#include <ATen/mkl/Descriptors.h>
#include <ATen/mkl/Limits.h>
@ -479,6 +480,19 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes,
const auto value_type = c10::toRealValueType(input.scalar_type());
out.resize_(batched_out_sizes, MemoryFormat::Contiguous);
// fix mkl issue
// https://github.com/pytorch/pytorch/issues/154477
#ifdef INTEL_MKL_VERSION
#if INTEL_MKL_VERSION > 20210400L
for (const auto& stride : input.strides()) {
if (stride == 0) {
input = input.clone(MemoryFormat::Contiguous);
break;
}
}
#endif
#endif
auto descriptor = _plan_mkl_fft(
input.strides(), out.strides(), signal_size, input.is_complex(),
out.is_complex(), normalization, forward, value_type);
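For context, a hedged sketch of the kind of zero-stride input this workaround targets (see pytorch/pytorch#154477); with the clone above, the MKL descriptor no longer sees a zero stride.
```python
import torch

x = torch.randn(8, 1).expand(8, 16)  # expanded view: stride (1, 0)
print(x.stride())
y = torch.fft.fft(x, dim=-1)         # previously could trip newer MKL versions
print(y.shape)
```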

View File

@ -79,14 +79,16 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) {
// 2. Math fallback
auto& ctx = at::globalContext();
// use overrideable linked to onednn as overrideable implementation
if (!ctx.userEnabledMathSDP() && !ctx.userEnabledOverrideableSDP()) {
if (!ctx.userEnabledMathSDP() && !ctx.userEnabledOverrideableSDP() &&
!ctx.userEnabledFlashSDP()) {
return sdp::SDPBackend::error;
}
// Get ideal kernel ordering
const std::array<sdp::SDPBackend, 2> priority_order{
const std::array<sdp::SDPBackend, 3> priority_order{
sdp::SDPBackend::overrideable,
sdp::SDPBackend::math,
sdp::SDPBackend::flash_attention,
};
// Because TORCH_CHECK checks if the condition is true, we negate debug so that
@ -105,6 +107,14 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) {
return sdp::SDPBackend::math;
}
break;
case sdp::SDPBackend::flash_attention:
if (ctx.userEnabledFlashSDP() &&
use_overrideable_xpu(kernel_params, print_debug)) {
TORCH_WARN(
"Flash Attention is not supported on XPU, falling back to overrideable kernel.");
return sdp::SDPBackend::overrideable;
}
break;
default:
TORCH_CHECK(false, "Invalid backend");
}
@ -141,7 +151,7 @@ int64_t _fused_sdp_choice_xpu(
TORCH_CHECK(
false,
"No viable backend for scaled_dot_product_attention was found. ",
"This is likely due to turning off both the math kernel and the fused kernels.");
"This is likely due to turning off both the math kernel and the overrideable kernels.");
}
return static_cast<int64_t>(backend);
}
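A hedged usage sketch of the new fallback behavior, assuming an XPU-enabled build with a visible device (backend names from `torch.nn.attention`):
```python
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

q = k = v = torch.randn(2, 4, 128, 64, device="xpu", dtype=torch.float16)

# Requesting only flash attention on XPU now warns and falls back to the
# overrideable (oneDNN) kernel instead of erroring out.
with sdpa_kernel([SDPBackend.FLASH_ATTENTION]):
    out = F.scaled_dot_product_attention(q, k, v)
```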

View File

@ -1,8 +0,0 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/TensorFactory.h>
#include <c10/core/ScalarType.h>
#include <unordered_map>
using namespace at::mps;

View File

@ -18,8 +18,6 @@
#include <ATen/native/Resize.h>
#include <ATen/native/TensorAdvancedIndexing.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/operations/Indexing.h>
#include <c10/core/QScheme.h>
#include <c10/util/SmallVector.h>
#include <c10/util/irange.h>
#include <fmt/format.h>

View File

@ -746,7 +746,7 @@ inline std::tuple<bool, Tensor, Tensor> NestedTensor_compute_size_stride(
}
else if (size_reshaped == -1) {
if (infer_index > -1) {
throw std::runtime_error("only one dimension can be inferred");
TORCH_CHECK(false, "only one dimension can be inferred");
}
else {
infer_index = idim;

View File

@ -19,7 +19,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
double /*output_scale*/,
int64_t /*output_zero_point*/,
at::Tensor& output) {
throw std::runtime_error(
TORCH_CHECK(
false,
"apply_out is not implemented for this packed "
"parameter type");
return output;
@ -30,7 +31,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
double /*output_scale*/,
int64_t /*output_zero_point*/,
at::Tensor& output) {
throw std::runtime_error(
TORCH_CHECK(
false,
"apply_relu_out is not implemented for this packed "
"parameter type");
return output;
@ -55,7 +57,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
at::Tensor input,
double input_scale,
int64_t input_zero_point) {
throw std::runtime_error(
TORCH_CHECK(
false,
"apply_with_input_q_dq_qweight_dq_output_fp32 is not implemented for this packed "
"parameter type");
return {};
@ -79,7 +82,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
at::Tensor input,
double input_scale,
int64_t input_zero_point) {
throw std::runtime_error(
TORCH_CHECK(
false,
"apply_with_input_q_dq_qweight_dq_relu_output_fp32 is not implemented for this packed "
"parameter type");
return {};
@ -96,7 +100,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
const at::Tensor& /* input */,
at::Tensor& output,
bool /* reduce_range */) {
throw std::runtime_error(
TORCH_CHECK(
false,
"apply_dynamic_out is not implemented for this packed "
"parameter type");
return output;
@ -105,7 +110,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
const at::Tensor& /* input */,
at::Tensor& output,
bool /* reduce_range */) {
throw std::runtime_error(
TORCH_CHECK(
false,
"apply_dynamic_relu_out is not implemented for this packed "
"parameter type");
return output;
@ -116,7 +122,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
virtual std::optional<at::Tensor> bias() = 0;
virtual void set_bias(std::optional<at::Tensor> /*bias*/) {
throw std::runtime_error(
TORCH_CHECK(
false,
"set_bias is not implemented for this packed "
"parameter type");
}

View File

@ -843,6 +843,11 @@ SDPBackend select_sdp_backend(sdp_params const& kernel_params) {
return SDPBackend::math;
}
break;
case SDPBackend::overrideable:
if (ctx.userEnabledOverrideableSDP()) {
TORCH_CHECK(false, "Invalid backend");
}
break;
default:
TORCH_CHECK(false, "Invalid backend");
}

View File

@ -8,12 +8,12 @@ It also provides mechanisms to compare PyTorch with other frameworks.
Make sure you're on a machine with CUDA, torchvision, and pytorch installed. Install in the following order:
```
# Install torchvision. It comes with the pytorch stable release binary
pip3 install torch torchvision
python -m pip install torch torchvision
# Install the latest pytorch master from source.
# It should supersede the installation from the release binary.
cd $PYTORCH_HOME
python setup.py build develop
python -m pip install --no-build-isolation -v -e .
# Check the pytorch installation version
python -c "import torch; print(torch.__version__)"

View File

@ -27,7 +27,7 @@ pull-deps: clone-deps
(cd ../../../torchbenchmark && git fetch && git checkout "$$(cat ../pytorch/.github/ci_commit_pins/torchbench.txt)" && git submodule update --init --recursive)
build-deps: clone-deps
uv pip install astunparse numpy scipy ninja pyyaml mkl mkl-include setuptools cmake \
uv pip install numpy scipy ninja pyyaml six mkl mkl-include setuptools wheel cmake \
typing-extensions requests protobuf numba cython scikit-learn librosa
(cd ../../../torchvision && uv pip install -e . --no-build-isolation)
(cd ../../../torchdata && uv pip install -e .)

View File

@ -210,7 +210,7 @@ mobilenet_v2,pass,0
mobilenet_v2_quantized_qat,pass,2
mobilenet_v2_quantized_qat,pass,3
@ -274,7 +274,7 @@ resnet50,pass,0
resnet50_quantized_qat,pass,2
resnet50_quantized_qat,pass,3


View File

@ -210,7 +210,7 @@ mobilenet_v2,pass,0
mobilenet_v2_quantized_qat,pass,2
mobilenet_v2_quantized_qat,pass,3
@ -274,7 +274,7 @@ resnet50,pass,0
resnet50_quantized_qat,pass,2
resnet50_quantized_qat,pass,3


View File

@ -210,7 +210,7 @@ mobilenet_v2,pass,0
mobilenet_v2_quantized_qat,pass,2
mobilenet_v2_quantized_qat,pass,3
@ -274,7 +274,7 @@ resnet50,pass,0
resnet50_quantized_qat,pass,2
resnet50_quantized_qat,pass,3


View File

@ -194,7 +194,7 @@ mobilenet_v2,pass,0
mobilenet_v2_quantized_qat,pass,2
mobilenet_v2_quantized_qat,pass,3
@ -258,7 +258,7 @@ resnet50,pass,0
resnet50_quantized_qat,pass,2
resnet50_quantized_qat,pass,3


View File

@ -210,7 +210,7 @@ mobilenet_v2,pass,0
mobilenet_v2_quantized_qat,pass,2
mobilenet_v2_quantized_qat,pass,3
@ -274,7 +274,7 @@ resnet50,pass,0
resnet50_quantized_qat,pass,2
resnet50_quantized_qat,pass,3


View File

@ -17,8 +17,8 @@ export DEBUG=0
export OMP_NUM_THREADS=10
# Compile pytorch with the base revision
git checkout master
python setup.py develop
git checkout main
python -m pip install --no-build-isolation -v -e .
# Install dependencies:
# Scipy is required by detr
@ -32,7 +32,7 @@ python functional_autograd_benchmark.py --output before.txt
# Compile pytorch with your change
popd
git checkout your_feature_branch
python setup.py develop
python -m pip install --no-build-isolation -v -e .
# Run the benchmark for the new version
pushd benchmarks/functional_autograd_benchmark

View File

@ -20,7 +20,7 @@ Key Features:
The instruction below installs a cpp\_extension for PyTorch and it is required to run the benchmark suite.
```bash
cd pt_extension
python setup.py install
python -m pip install .
```
## How to run the benchmarks:

View File

@ -11,7 +11,7 @@ export USE_MKL=1
CMAKE_ONLY=1 python setup.py build
ccmake build # or cmake-gui build
python setup.py install
python -m pip install --no-build-isolation -v .
cd benchmarks
echo "!! SPARSE SPMM TIME BENCHMARK!! " >> $OUTFILE
@ -28,7 +28,7 @@ echo "----- USE_MKL=0 ------" >> $OUTFILE
rm -rf build
export USE_MKL=0
python setup.py install
python -m pip install --no-build-isolation -v .
cd benchmarks
for dim0 in 1000 5000 10000; do

View File

@ -0,0 +1,233 @@
#include <c10/core/AllocatorConfig.h>
#include <c10/core/DeviceType.h>
#include <c10/util/env.h>
#include <c10/util/irange.h>
namespace c10::CachingAllocator {
namespace {
constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
constexpr size_t kMB = 1024 * 1024ul;
constexpr size_t kRoundUpPowerOfTwoStart = 1 * kMB; // 1MB
constexpr size_t kRoundUpPowerOfTwoEnd = 64 * 1024ul * kMB; // 64GB
} // anonymous namespace
AcceleratorAllocatorConfig& AcceleratorAllocatorConfig::instance() {
static AcceleratorAllocatorConfig instance;
#define C10_ALLOCATOR_CONFIG_PARSE_ENV(env, deprecated) \
auto env##_name = c10::utils::get_env(#env); \
if (env##_name.has_value()) { \
if (deprecated) { \
TORCH_WARN_ONCE(#env " is deprecated, use PYTORCH_ALLOC_CONF instead"); \
} \
instance.parseArgs(env##_name.value()); \
return true; \
}
static bool env_flag [[maybe_unused]] = []() {
C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_ALLOC_CONF, false)
// Keep this for backwards compatibility
C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_CUDA_ALLOC_CONF, /*deprecated=*/true)
C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_HIP_ALLOC_CONF, /*deprecated=*/true)
return false;
}();
#undef C10_ALLOCATOR_CONFIG_PARSE_ENV
return instance;
}
AcceleratorAllocatorConfig::AcceleratorAllocatorConfig() {
roundup_power2_divisions_.assign(kRoundUpPowerOfTwoIntervals, 0);
}
size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) {
size_t log_size = (63 - llvm::countLeadingZeros(size));
// Our intervals start at 1MB and end at 64GB
const size_t interval_start =
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart);
const size_t interval_end =
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd);
TORCH_CHECK(
interval_end - interval_start == kRoundUpPowerOfTwoIntervals,
"kRoundUpPowerOfTwoIntervals mismatch");
size_t index =
(log_size > interval_start) ? (log_size - interval_start) : 0ul;
index = std::min(index, kRoundUpPowerOfTwoIntervals - 1);
return instance().roundup_power2_divisions_[index];
}
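The lookup above reduces a size to floor(log2(size)), offsets it by log2(1 MiB), and clamps it into the 16 available buckets, so for example a 32 MiB request lands in bucket 5. A small self-contained sketch of the same arithmetic (hypothetical helper name, assumes a 64-bit `size_t` and C++20 `<bit>` in place of `llvm::countLeadingZeros`):

```cpp
#include <algorithm>
#include <bit>
#include <cstddef>
#include <iostream>

// Mirrors the bucket selection above: floor(log2(size)) relative to 1 MiB,
// clamped to the 16 intervals between 1 MiB and 64 GiB.
size_t bucket_index(size_t size) {
  const size_t log_size = 63 - std::countl_zero(size);
  const size_t interval_start = 20;  // log2(1 MiB)
  const size_t index =
      (log_size > interval_start) ? (log_size - interval_start) : 0;
  return std::min(index, size_t{15});
}

int main() {
  std::cout << bucket_index(size_t{32} << 20) << '\n';  // 32 MiB -> bucket 5
  std::cout << bucket_index(512) << '\n';               // tiny alloc -> bucket 0
}
```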
size_t AcceleratorAllocatorConfig::parseMaxSplitSize(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
constexpr size_t min_allowed_split_size_mb = kLargeBuffer / kMB;
constexpr size_t max_allowed_split_size_mb =
std::numeric_limits<size_t>::max() / kMB;
size_t val_env = tokenizer.toSizeT(++i);
TORCH_CHECK(
val_env >= min_allowed_split_size_mb,
"CachingAllocator option max_split_size_mb too small, must be >= ",
min_allowed_split_size_mb);
val_env = std::min(val_env, max_allowed_split_size_mb);
max_split_size_ = val_env * kMB;
return i;
}
size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
constexpr size_t min_allowed_split_size_mb = kLargeBuffer / kMB;
constexpr size_t max_allowed_split_size_mb =
std::numeric_limits<size_t>::max() / kMB;
size_t val_env = tokenizer.toSizeT(++i);
TORCH_CHECK(
val_env >= min_allowed_split_size_mb,
"CachingAllocator option max_non_split_rounding_mb too small, must be >= ",
min_allowed_split_size_mb);
val_env = std::min(val_env, max_allowed_split_size_mb);
max_non_split_rounding_size_ = val_env * kMB;
return i;
}
size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
double val_env = tokenizer.toDouble(++i);
TORCH_CHECK(
val_env > 0 && val_env < 1.0,
"garbage_collect_threshold is invalid, set it in (0.0, 1.0)");
garbage_collection_threshold_ = val_env;
return i;
}
size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
bool first_value = true;
if (tokenizer[++i] == "[") {
size_t last_index = 0;
// NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
while (++i < tokenizer.size() && tokenizer[i] != "]") {
size_t value_index = i;
tokenizer.checkToken(++i, ":");
size_t value = tokenizer.toSizeT(++i);
TORCH_CHECK(
value == 0 || llvm::isPowerOf2_64(value),
"For roundups, the divisions has to be power of 2 or 0 to disable roundup ");
if (tokenizer[value_index] == ">") {
std::fill(
std::next(
roundup_power2_divisions_.begin(),
static_cast<std::vector<size_t>::difference_type>(
last_index + 1)),
roundup_power2_divisions_.end(),
value);
} else {
size_t boundary = tokenizer.toSizeT(value_index);
TORCH_CHECK(
llvm::isPowerOf2_64(boundary),
"For roundups, the intervals have to be power of 2 ");
size_t index = 63 - llvm::countLeadingZeros(boundary);
index =
std::clamp(index, size_t{0}, roundup_power2_divisions_.size() - 1);
if (first_value) {
std::fill(
roundup_power2_divisions_.begin(),
std::next(
roundup_power2_divisions_.begin(),
static_cast<std::vector<size_t>::difference_type>(index)),
value);
first_value = false;
}
roundup_power2_divisions_[index] = value;
last_index = index;
}
if (tokenizer[i + 1] != "]") {
tokenizer.checkToken(++i, ",");
}
}
TORCH_INTERNAL_ASSERT(
i < tokenizer.size(),
"Expected closing bracket ']' in ConfigTokenizer but reached end of config");
} else { // Keep this for backwards compatibility
size_t value = tokenizer.toSizeT(i);
TORCH_CHECK(
llvm::isPowerOf2_64(value),
"For roundups, the divisions has to be power of 2 ");
std::fill(
roundup_power2_divisions_.begin(),
roundup_power2_divisions_.end(),
value);
}
return i;
}
size_t AcceleratorAllocatorConfig::parseExpandableSegments(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
use_expandable_segments_ = tokenizer.toBool(++i);
return i;
}
size_t AcceleratorAllocatorConfig::parsePinnedUseBackgroundThreads(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
pinned_use_background_threads_ = tokenizer.toBool(++i);
return i;
}
void AcceleratorAllocatorConfig::parseArgs(const std::string& env) {
// The following options are reset to their default values each time, unless
// explicitly set.
max_split_size_ = std::numeric_limits<size_t>::max();
roundup_power2_divisions_.assign(kRoundUpPowerOfTwoIntervals, 0);
garbage_collection_threshold_ = 0;
{
std::lock_guard<std::mutex> lock(last_allocator_settings_mutex_);
last_allocator_settings_ = env;
}
ConfigTokenizer tokenizer(env);
for (size_t i = 0; i < tokenizer.size(); i++) {
const auto& key = tokenizer[i];
if (key == "max_split_size_mb") {
i = parseMaxSplitSize(tokenizer, i);
} else if (key == "max_non_split_rounding_mb") {
i = parseMaxNonSplitRoundingSize(tokenizer, i);
} else if (key == "garbage_collection_threshold") {
i = parseGarbageCollectionThreshold(tokenizer, i);
} else if (key == "roundup_power2_divisions") {
i = parseRoundUpPower2Divisions(tokenizer, i);
} else if (key == "expandable_segments") {
i = parseExpandableSegments(tokenizer, i);
} else if (key == "pinned_use_background_threads") {
i = parsePinnedUseBackgroundThreads(tokenizer, i);
} else {
i = tokenizer.skipKey(i);
}
if (i + 1 < tokenizer.size()) {
tokenizer.checkToken(++i, ",");
}
}
}
} // namespace c10::CachingAllocator

c10/core/AllocatorConfig.h (new file, 337 lines)
View File

@ -0,0 +1,337 @@
#pragma once
#include <c10/core/DeviceType.h>
#include <c10/util/Exception.h>
#include <c10/util/llvmMathExtras.h>
#include <atomic>
#include <mutex>
#include <string>
#include <vector>
namespace c10::CachingAllocator {
// "large" allocations may be packed in 20 MiB blocks
const size_t kLargeBuffer = 20971520;
// A utility class for tokenizing allocator configuration strings into discrete
// parts. For example, the config string:
// "key1:val1,key2:[val2,val3]"
// is tokenized into:
// "key1", ":", "val1", ",", "key2", ":", "[", "val2", ",", "val3", "]",
//
// Tokens include keys, values, and special characters (':', ',', '[', ']').
// Whitespace is ignored.
class ConfigTokenizer {
public:
explicit ConfigTokenizer(const std::string& env) {
std::string buffer;
for (char ch : env) {
if (ch == ',' || ch == ':' || ch == '[' || ch == ']') {
if (!buffer.empty()) {
config_.emplace_back(std::move(buffer));
buffer.clear();
}
config_.emplace_back(1, ch);
} else if (!std::isspace(static_cast<unsigned char>(ch))) {
buffer += ch;
}
}
if (!buffer.empty()) {
config_.emplace_back(std::move(buffer));
}
}
const std::string& operator[](size_t i) const {
TORCH_INTERNAL_ASSERT(
i < config_.size(), "Index out of bounds in ConfigTokenizer");
return config_[i];
}
size_t size() const {
return config_.size();
}
bool checkToken(size_t i, const std::string& token) const {
checkIndex(i);
return config_[i] == token;
}
size_t toSizeT(size_t i) const {
checkIndex(i);
return std::stoull(config_[i]);
}
double toDouble(size_t i) const {
checkIndex(i);
return std::stod(config_[i]);
}
bool toBool(size_t i) const {
checkIndex(i);
const auto& token = config_[i];
if (token == "True") {
return true;
} else if (token == "False") {
return false;
} else {
TORCH_CHECK(
false,
"Expected 'True' or 'False' at index ",
i,
" in ConfigTokenizer but got '",
token,
"'");
}
}
// Skips the current token group and returns the index of the value token.
// Assumes the current index `i` points to a key name in a key-value pair.
size_t skipKey(size_t i) const {
// Expect a colon after the key
checkToken(++i, ":");
++i; // Move to the value
checkIndex(i);
if (config_[i] != "[") {
// Value is a single token (not a list) -> return its index
return i;
}
// Skip tokens inside the list until matching ']'
// NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
while (++i < config_.size() && config_[i] != "]") {
}
TORCH_INTERNAL_ASSERT(
i < config_.size(),
"Expected closing bracket ']' in ConfigTokenizer but reached end of config");
return i; // Return the index of the closing ']'
}
private:
void checkIndex(size_t i) const {
TORCH_INTERNAL_ASSERT(
i < config_.size(), "Index out of bounds in ConfigTokenizer");
}
std::vector<std::string> config_;
};
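To make the tokenization concrete, here is a minimal usage sketch of the class above; it assumes the header is included as `<c10/core/AllocatorConfig.h>`, the same way the companion `.cpp` file does, and is not part of the header itself.

```cpp
#include <c10/core/AllocatorConfig.h>
#include <iostream>

int main() {
  using c10::CachingAllocator::ConfigTokenizer;
  // Whitespace is dropped; keys, values and ':', ',', '[', ']' each become one token.
  ConfigTokenizer tok("key1:val1, key2:[val2,val3]");
  for (size_t i = 0; i < tok.size(); ++i) {
    std::cout << tok[i] << ' ';
  }
  std::cout << '\n';  // key1 : val1 , key2 : [ val2 , val3 ]
}
```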
/**
* Note [AcceleratorAllocatorConfig design]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* This class configures memory allocation for both device and host memory. A
* single `AcceleratorAllocatorConfig` instance is shared across all accelerator
* backends, such as CUDA and XPU, under the assumption that relevant
* environment variables apply uniformly to all accelerators. Device-specific
* configuration extensions are supported via hooks (see
* `registerDeviceConfigParserHook`).
*
* Recommended design:
* - Place common configurations in `AcceleratorAllocatorConfig`.
* - Extend backend-specific configurations in corresponding device-specific
* classes, such as `CUDAAllocatorConfig`, etc.
*
* Scope:
* - Configuration options must be environment-variable driven.
*
* Naming Convention:
* - Public API names in `AcceleratorAllocatorConfig` should be device-generic.
* - Members prefixed with `pinned_` are specific to the host/pinned allocator.
* - Environment variable names should be generic across backends.
* - Comma-separated key-value pairs in the format: `key:value`. Use square
* brackets `[]` for list values Example: `key1:123, key2:[val1,val2]`
*
* Environment Variables:
* - The primary environment variable for configuration is `PYTORCH_ALLOC_CONF`.
* - For backward compatibility, `PYTORCH_CUDA_ALLOC_CONF` is also supported
* with lower priority.
*/
class C10_API AcceleratorAllocatorConfig {
public:
static AcceleratorAllocatorConfig& instance();
C10_DISABLE_COPY_AND_ASSIGN(AcceleratorAllocatorConfig);
AcceleratorAllocatorConfig(AcceleratorAllocatorConfig&&) = delete;
AcceleratorAllocatorConfig& operator=(AcceleratorAllocatorConfig&&) = delete;
~AcceleratorAllocatorConfig() = default;
/* Device allocator settings */
// Returns the maximum block size (in MB) that is allowed to be split. The
// default is unlimited (all blocks can be split).
static size_t max_split_size() {
return instance().max_split_size_;
}
// Returns the maximum block size (in MB) that is allowed to be rounded up
// without requiring splitting when searching for a free block. The default is
// 20 MiB.
static size_t max_non_split_rounding_size() {
return instance().max_non_split_rounding_size_;
}
// Return the number of divisions used when rounding up allocation sizes (in
// MB) to the nearest power-of-2 boundary.
static size_t roundup_power2_divisions(size_t size);
// Returns the vector of division factors used for rounding up allocation
// sizes. These divisions apply to size intervals between 1MB and 64GB.
static std::vector<size_t> roundup_power2_divisions() {
return instance().roundup_power2_divisions_;
}
// Returns the threshold that triggers garbage collection when the ratio of
// used memory to maximum allowed memory exceeds this value. The default is 0,
// meaning no garbage collection is triggered. The value should be in the
// range (0.0, 1.0).
static double garbage_collection_threshold() {
return instance().garbage_collection_threshold_;
}
// Returns whether the expandable segment feature is enabled. This allows the
// allocator to start with one segment that grows as needed, rather than
// creating a new segment for each allocation. Default is false (expandable
// segments disabled).
static bool use_expandable_segments() {
return instance().use_expandable_segments_;
}
/* Host allocator settings */
// Returns whether the pinned host allocator uses background threads for
// processing events. This is useful for improving performance in scenarios
// where many small allocations are made. Default is false (background threads
// disabled).
static bool pinned_use_background_threads() {
return instance().pinned_use_background_threads_;
}
/* Settings for both device and host allocator */
// Returns the current allocator settings as a string. This string is useful
// to expand device-specific allocator configurations
static std::string last_allocator_settings() {
std::lock_guard<std::mutex> lock(instance().last_allocator_settings_mutex_);
return instance().last_allocator_settings_;
}
// Parses the environment variable `env` to update the allocator settings.
// If the environment variable is not set, it does nothing.
// The configuration string should be a comma-separated list of key-value
// pairs, where each key is a configuration option and the value is the
// corresponding setting. For example:
// "max_split_size_mb:100,max_non_split_rounding_mb:20,garbage_collection_threshold:0.5,roundup_power2_divisions:[64:8,256:4,1024:4,>:1],expandable_segments:true,pinned_use_background_threads:true"
void parseArgs(const std::string& env);
// Registers a device-specific configuration parser hook. This allows
// backends to parse additional device-specific configuration options from the
// environment variable. The hook should be a function that takes a string
// (the environment variable value) and parses it to set device-specific
// configuration options.
// The hook will be called when the environment variable is parsed.
// If a hook is already registered, it will be replaced with the new one.
void registerDeviceConfigParserHook(
std::function<void(const std::string&)> hook) {
device_config_parser_hook_ = std::move(hook);
}
// Calls the registered device-specific configuration parser hook with the
// provided environment string. This allows backends to parse additional
// device-specific configuration options from the environment variable.
// If no hook is registered, this function does nothing.
void callDeviceConfigParserHook(const std::string& env) const {
if (device_config_parser_hook_) {
device_config_parser_hook_(env);
}
}
private:
AcceleratorAllocatorConfig();
/* Internal functions for device allocator */
// Parse `max_split_size_mb` from environment variable.
size_t parseMaxSplitSize(const ConfigTokenizer& tokenizer, size_t i);
// Parse `max_non_split_rounding_mb` from environment variable.
size_t parseMaxNonSplitRoundingSize(
const ConfigTokenizer& tokenizer,
size_t i);
// Parse `garbage_collection_threshold` from environment variable.
size_t parseGarbageCollectionThreshold(
const ConfigTokenizer& tokenizer,
size_t i);
// Parse `roundup_power2_divisions` from environment variable.
size_t parseRoundUpPower2Divisions(
const ConfigTokenizer& tokenizer,
size_t i);
// Parse `expandable_segments` from environment variable.
size_t parseExpandableSegments(const ConfigTokenizer& tokenizer, size_t i);
/* Internal functions for host allocator */
// Parse `pinned_use_background_threads` from environment variable.
size_t parsePinnedUseBackgroundThreads(
const ConfigTokenizer& tokenizer,
size_t i);
/* The following members are specifically used for the device allocator. */
// The maximum block size that is allowed to be split.
std::atomic<size_t> max_split_size_{std::numeric_limits<size_t>::max()};
// The maximum allowable extra size of a memory block without requiring
// splitting when searching for a free block.
std::atomic<size_t> max_non_split_rounding_size_{kLargeBuffer};
// Used to store how memory allocations of different sizes should be rounded
// up to the nearest power of 2 divisions.
std::vector<size_t> roundup_power2_divisions_;
// The threshold that triggers garbage collection when the ratio of used
// memory to maximum allowed memory exceeds this value.
std::atomic<double> garbage_collection_threshold_{0};
// A flag to enable expandable segments feature.
std::atomic<bool> use_expandable_segments_{false};
/* The following members are specifically used for the host allocator. */
// A flag to enable background thread for processing events.
std::atomic<bool> pinned_use_background_threads_{false};
/* The following members are used for both device and host allocator. */
// Record the last allocator config environment setting.
std::mutex last_allocator_settings_mutex_;
std::string last_allocator_settings_;
// Optional hook for parsing additional device-specific allocator settings.
// This allows backends (e.g., CUDA, XPU) to register a custom parser for
// their own environment configuration extensions.
std::function<void(const std::string&)> device_config_parser_hook_{nullptr};
};
C10_API inline void setAllocatorSettings(const std::string& env) {
AcceleratorAllocatorConfig::instance().parseArgs(env);
AcceleratorAllocatorConfig::instance().callDeviceConfigParserHook(env);
}
C10_API inline std::string getAllocatorSettings() {
return AcceleratorAllocatorConfig::instance().last_allocator_settings();
}
struct DeviceConfigParserHookRegistry {
explicit DeviceConfigParserHookRegistry(
std::function<void(const std::string&)> hook) {
AcceleratorAllocatorConfig::instance().registerDeviceConfigParserHook(
std::move(hook));
}
};
#define REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(hook) \
namespace { \
static at::CachingAllocator::DeviceConfigParserHookRegistry \
g_device_config_parse_hook_registry_instance(hook); \
}
} // namespace c10::CachingAllocator
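A short usage sketch of the public surface declared above, following the comma-separated `key:value` format from the design note; the values are illustrative only, and the program assumes it is linked against c10 with this header on the include path:

```cpp
#include <c10/core/AllocatorConfig.h>
#include <iostream>

int main() {
  using namespace c10::CachingAllocator;
  // Same format accepted via the PYTORCH_ALLOC_CONF environment variable.
  setAllocatorSettings(
      "max_split_size_mb:256,"
      "garbage_collection_threshold:0.6,"
      "roundup_power2_divisions:[256:4,>:1]");

  std::cout << AcceleratorAllocatorConfig::max_split_size() << '\n';               // 256 MiB, in bytes
  std::cout << AcceleratorAllocatorConfig::garbage_collection_threshold() << '\n'; // 0.6
  std::cout << getAllocatorSettings() << '\n';                                     // echoes the string above
}
```

The `allocator_config_test.cpp` added later in this diff exercises the same knobs more exhaustively, including a device-specific parser hook registered through `REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK`.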

View File

@ -1,30 +1,27 @@
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/driver_api.h>
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <c10/util/Logging.h>
#include <cuda_runtime.h>
#include <dlfcn.h>
namespace c10::cuda {
namespace {
void* get_symbol(const char* name, int version);
DriverAPI create_driver_api() {
void* handle_0 = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_NOLOAD);
TORCH_CHECK(handle_0, "Can't open libcuda.so.1: ", dlerror());
void* handle_1 = DriverAPI::get_nvml_handle();
DriverAPI r{};
#define LOOKUP_LIBCUDA_ENTRY(name) \
r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \
TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name, ": ", dlerror())
C10_LIBCUDA_DRIVER_API(LOOKUP_LIBCUDA_ENTRY)
#undef LOOKUP_LIBCUDA_ENTRY
#define LOOKUP_LIBCUDA_ENTRY(name) \
r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \
dlerror();
C10_LIBCUDA_DRIVER_API_12030(LOOKUP_LIBCUDA_ENTRY)
#undef LOOKUP_LIBCUDA_ENTRY
#define LOOKUP_LIBCUDA_ENTRY_WITH_VERSION(name, version) \
r.name##_ = reinterpret_cast<decltype(&name)>(get_symbol(#name, version)); \
TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name)
C10_LIBCUDA_DRIVER_API(LOOKUP_LIBCUDA_ENTRY_WITH_VERSION)
#undef LOOKUP_LIBCUDA_ENTRY_WITH_VERSION
if (handle_1) {
#define LOOKUP_NVML_ENTRY(name) \
@ -35,6 +32,32 @@ DriverAPI create_driver_api() {
}
return r;
}
void* get_symbol(const char* name, int version) {
void* out = nullptr;
cudaDriverEntryPointQueryResult qres{};
// CUDA 12.5+ supports version-based lookup
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12050)
if (auto st = cudaGetDriverEntryPointByVersion(
name, &out, version, cudaEnableDefault, &qres);
st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
return out;
}
#endif
// Fall back to the old API and try the symbol lookup again.
if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres);
st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
return out;
}
// If the symbol cannot be resolved, report and return nullptr;
// the caller is responsible for checking the pointer.
LOG(INFO) << "Failed to resolve symbol " << name;
return nullptr;
}
} // namespace
void* DriverAPI::get_nvml_handle() {

View File

@ -20,30 +20,24 @@
} \
} while (0)
#define C10_LIBCUDA_DRIVER_API(_) \
_(cuDeviceGetAttribute) \
_(cuMemAddressReserve) \
_(cuMemRelease) \
_(cuMemMap) \
_(cuMemAddressFree) \
_(cuMemSetAccess) \
_(cuMemUnmap) \
_(cuMemCreate) \
_(cuMemGetAllocationGranularity) \
_(cuMemExportToShareableHandle) \
_(cuMemImportFromShareableHandle) \
_(cuMemsetD32Async) \
_(cuStreamWriteValue32) \
_(cuGetErrorString)
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030)
#define C10_LIBCUDA_DRIVER_API_12030(_) \
_(cuMulticastAddDevice) \
_(cuMulticastBindMem) \
_(cuMulticastCreate)
#else
#define C10_LIBCUDA_DRIVER_API_12030(_)
#endif
#define C10_LIBCUDA_DRIVER_API(_) \
_(cuDeviceGetAttribute, 12000) \
_(cuMemAddressReserve, 12000) \
_(cuMemRelease, 12000) \
_(cuMemMap, 12000) \
_(cuMemAddressFree, 12000) \
_(cuMemSetAccess, 12000) \
_(cuMemUnmap, 12000) \
_(cuMemCreate, 12000) \
_(cuMemGetAllocationGranularity, 12000) \
_(cuMemExportToShareableHandle, 12000) \
_(cuMemImportFromShareableHandle, 12000) \
_(cuMemsetD32Async, 12000) \
_(cuStreamWriteValue32, 12000) \
_(cuGetErrorString, 12000) \
_(cuMulticastAddDevice, 12030) \
_(cuMulticastBindMem, 12030) \
_(cuMulticastCreate, 12030)
#define C10_NVML_DRIVER_API(_) \
_(nvmlInit_v2) \
@ -56,11 +50,13 @@
namespace c10::cuda {
struct DriverAPI {
#define CREATE_MEMBER_VERSIONED(name, version) decltype(&name) name##_;
#define CREATE_MEMBER(name) decltype(&name) name##_;
C10_LIBCUDA_DRIVER_API(CREATE_MEMBER)
C10_LIBCUDA_DRIVER_API_12030(CREATE_MEMBER)
C10_LIBCUDA_DRIVER_API(CREATE_MEMBER_VERSIONED)
C10_NVML_DRIVER_API(CREATE_MEMBER)
#undef CREATE_MEMBER_VERSIONED
#undef CREATE_MEMBER
static DriverAPI* get();
static void* get_nvml_handle();
};
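The table above now pairs each driver symbol with the minimum CUDA version passed to the versioned lookup. For readers unfamiliar with the X-macro pattern, here is a tiny self-contained sketch (made-up names, no CUDA required) of how a `(name, version)` table expands into struct members; the real macro declares `decltype(&name)` function pointers rather than plain ints:

```cpp
#include <cstdio>

// A made-up two-entry table in the same (name, version) shape as
// C10_LIBCUDA_DRIVER_API above.
#define MY_DRIVER_API(_) \
  _(foo, 12000)          \
  _(bar, 12030)

struct Api {
#define CREATE_MEMBER_VERSIONED(name, version) int name##_ = version;
  MY_DRIVER_API(CREATE_MEMBER_VERSIONED)
#undef CREATE_MEMBER_VERSIONED
};

int main() {
  Api api;
  std::printf("%d %d\n", api.foo_, api.bar_);  // 12000 12030
}
```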

View File

@ -63,7 +63,6 @@ def define_c10_ovrsource(name, is_mobile):
"core/impl/*.h",
]),
reexport_all_header_dependencies = False,
# tests = C10_CPU_TEST_TARGETS,
visibility = [
"//xplat/caffe2/c10:c10_ovrsource",
],
@ -84,25 +83,6 @@ def define_c10_ovrsource(name, is_mobile):
)
def define_ovrsource_targets():
# C10_CPU_TEST_FILES = native.glob([
# "test/core/*.cpp",
# "test/util/*.cpp",
# ])
# C10_GPU_TEST_FILES = native.glob([
# "cuda/test/**/*.cpp",
# ])
# C10_CPU_TEST_TARGETS = [
# ":" + paths.basename(test)[:-len(".cpp")] + "_ovrsource"
# for test in C10_CPU_TEST_FILES
# ]
# C10_GPU_TEST_TARGETS = [
# ":" + paths.basename(test)[:-len(".cpp")] + "_ovrsource"
# for test in C10_GPU_TEST_FILES
# ]
common_c10_cmake_defines = [
("#cmakedefine C10_BUILD_SHARED_LIBS", ""),
("#cmakedefine C10_USE_NUMA", ""),
@ -207,7 +187,6 @@ def define_ovrsource_targets():
"cuda/impl/*.h",
]),
reexport_all_header_dependencies = False,
# tests = C10_GPU_TEST_TARGETS,
visibility = ["PUBLIC"],
deps = [
"//third-party/cuda:libcuda",
@ -217,64 +196,3 @@ def define_ovrsource_targets():
":c10_ovrsource",
],
)
# [
# oxx_test(
# name = paths.basename(test)[:-len(".cpp")] + "_ovrsource",
# srcs = [test],
# compatible_with = cpu_supported_platforms,
# compiler_flags = select({
# "DEFAULT": [],
# "ovr_config//compiler:cl": [
# "/w",
# ],
# "ovr_config//compiler:clang": [
# "-Wno-error",
# "-Wno-self-assign-overloaded",
# "-Wno-self-move",
# "-Wno-shadow",
# "-Wno-undef",
# "-Wno-unused-function",
# "-Wno-unused-variable",
# ],
# }),
# framework = "gtest",
# oncall = "ovrsource_pytorch",
# raw_headers = native.glob([
# "test/**/*.h",
# ]),
# deps = [
# ":c10_ovrsource",
# ],
# )
# for test in C10_CPU_TEST_FILES
# ]
# [
# oxx_test(
# name = paths.basename(test)[:-len(".cpp")] + "_ovrsource",
# srcs = [test],
# compatible_with = cuda_supported_platforms,
# compiler_flags = select({
# "DEFAULT": [],
# "ovr_config//compiler:cl": [
# "/w",
# ],
# "ovr_config//compiler:clang": [
# "-Wno-error",
# ],
# }),
# framework = "gtest",
# oncall = "ovrsource_pytorch",
# raw_headers = native.glob([
# "test/**/*.h",
# ]),
# runtime_shared_libraries = [
# "//third-party/cuda:cudart",
# ],
# deps = [
# ":c10_cuda_ovrsource",
# ],
# )
# for test in C10_GPU_TEST_FILES
# ]

View File

@ -0,0 +1,123 @@
#include <c10/core/AllocatorConfig.h>
#include <gtest/gtest.h>
using namespace c10::CachingAllocator;
constexpr size_t kMB = 1024 * 1024ul;
struct ExtendedAllocatorConfig {
static ExtendedAllocatorConfig& instance() {
static ExtendedAllocatorConfig instance;
return instance;
}
// Returns the device-specific option value in bytes.
static size_t device_specific_option() {
return instance().device_specific_option_;
}
void parseArgs(const std::string& env) {
// Parse device-specific options from the environment variable
ConfigTokenizer tokenizer(env);
for (size_t i = 0; i < tokenizer.size(); i++) {
const auto& key = tokenizer[i];
if (key == "device_specific_option_mb") {
tokenizer.checkToken(++i, ":");
device_specific_option_ = tokenizer.toSizeT(++i) * kMB;
} else {
i = tokenizer.skipKey(i);
}
if (i + 1 < tokenizer.size()) {
tokenizer.checkToken(++i, ",");
}
}
}
private:
// Device-specific option, e.g., memory limit for a specific device.
std::atomic<size_t> device_specific_option_{0};
};
REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK([](const std::string& env) {
ExtendedAllocatorConfig::instance().parseArgs(env);
})
TEST(AllocatorConfigTest, allocator_config_test) {
std::string env =
"max_split_size_mb:40,"
"max_non_split_rounding_mb:30,"
"garbage_collection_threshold:0.5,"
"roundup_power2_divisions:[64:8,128:2,256:4,512:2,1024:4,>:1],"
"expandable_segments:True,"
"pinned_use_background_threads:True,"
"device_specific_option_mb:64";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::max_split_size(), 40 * kMB);
EXPECT_EQ(
AcceleratorAllocatorConfig::max_non_split_rounding_size(), 30 * kMB);
EXPECT_EQ(AcceleratorAllocatorConfig::garbage_collection_threshold(), 0.5);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(32 * kMB), 8);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(64 * kMB), 8);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(128 * kMB), 2);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 4);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 2);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 4);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 1);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(4096 * kMB), 1);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(8192 * kMB), 1);
EXPECT_EQ(AcceleratorAllocatorConfig::use_expandable_segments(), true);
EXPECT_EQ(AcceleratorAllocatorConfig::pinned_use_background_threads(), true);
EXPECT_EQ(ExtendedAllocatorConfig::device_specific_option(), 64 * kMB);
env =
"max_split_size_mb:20,"
"max_non_split_rounding_mb:40,"
"garbage_collection_threshold:0.8";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::max_split_size(), 20 * kMB);
EXPECT_EQ(
AcceleratorAllocatorConfig::max_non_split_rounding_size(), 40 * kMB);
EXPECT_EQ(AcceleratorAllocatorConfig::garbage_collection_threshold(), 0.8);
// roundup_power2_divisions knob array syntax
env = "roundup_power2_divisions:[128:8,256:16,512:1,2048:8,>:2]";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(64 * kMB), 8);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(128 * kMB), 8);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 16);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 1);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 0);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 8);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(4096 * kMB), 2);
// roundup_power2_divisions single value syntax for backward compatibility
env = "roundup_power2_divisions:4";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(64 * kMB), 4);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 4);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 4);
env = "expandable_segments:False,";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::use_expandable_segments(), false);
env = "pinned_use_background_threads:False";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::pinned_use_background_threads(), false);
}

View File

@ -4,6 +4,7 @@
// 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa.
#include <c10/macros/Macros.h>
#include <c10/util/bit_cast.h>
#include <cmath>
#include <cstdint>
#include <cstring>
@ -67,13 +68,7 @@ inline C10_HOST_DEVICE uint16_t round_to_nearest_even(float src) {
#endif
return UINT16_C(0x7FC0);
} else {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
union {
uint32_t U32; // NOLINT(facebook-hte-BadMemberName)
float F32; // NOLINT(facebook-hte-BadMemberName)
};
F32 = src;
const uint32_t U32 = c10::bit_cast<uint32_t>(src);
uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
return static_cast<uint16_t>((U32 + rounding_bias) >> 16);
}
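The hunk above swaps union-based type punning for `c10::bit_cast`. A standalone sketch of the same round-to-nearest-even float-to-bfloat16 conversion, written against C++20 `std::bit_cast` instead of the c10 helper (not part of this diff):

```cpp
#include <bit>
#include <cstdint>
#include <cstdio>

// Keep the upper 16 bits of the float; the bias is 0x7FFF plus the lowest
// kept bit, so exact halfway cases round to the even bfloat16 value.
uint16_t float_to_bf16_rne(float src) {
  const uint32_t u32 = std::bit_cast<uint32_t>(src);
  const uint32_t rounding_bias = ((u32 >> 16) & 1u) + 0x7FFFu;
  return static_cast<uint16_t>((u32 + rounding_bias) >> 16);
}

int main() {
  std::printf("0x%04x\n", float_to_bf16_rne(1.0f));        // 0x3f80
  std::printf("0x%04x\n", float_to_bf16_rne(1.00390625f)); // tie -> 0x3f80 (even)
}
```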

View File

@ -3,6 +3,8 @@
#include <cstring>
#include <type_traits>
#include <c10/macros/Macros.h>
#if __has_include(<bit>) && (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L)
#include <bit>
#define C10_HAVE_STD_BIT_CAST 1
@ -23,7 +25,7 @@ using std::bit_cast;
// See https://en.cppreference.com/w/cpp/numeric/bit_cast for more
// information as well as the source of our implementations.
template <class To, class From>
std::enable_if_t<
C10_HOST_DEVICE std::enable_if_t<
sizeof(To) == sizeof(From) && std::is_trivially_copyable_v<From> &&
std::is_trivially_copyable_v<To>,
To>

View File

@ -58,6 +58,9 @@ def define_targets(rules):
name = "bit_cast",
hdrs = ["bit_cast.h"],
visibility = ["//:__subpackages__"],
deps = [
"//c10/macros",
],
)
rules.cc_library(

View File

@ -0,0 +1,18 @@
```{eval-rst}
.. role:: hidden
:class: hidden-section
```
```{eval-rst}
.. automodule:: torch.distributed._dist2
:members:
:undoc-members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: torch.distributed.ProcessGroup
:members:
:undoc-members:
:show-inheritance:
```

View File

@ -224,6 +224,10 @@ inconsistent 'UUID' assignment across ranks, and to prevent races during initial
.. autofunction:: is_torchelastic_launched
```
```{eval-rst}
.. autofunction:: get_default_backend_for_device
```
______________________________________________________________________
Currently three initialization methods are supported:
@ -1471,3 +1475,9 @@ If you are running single node training, it may be convenient to interactively b
```{eval-rst}
.. py:module:: torch.distributed.checkpoint.state_dict
```
```{toctree}
:hidden:
distributed._dist2
```

View File

@ -8,16 +8,14 @@ higher-level API to automatically differentiate models split across several
machines.
```{warning}
APIs in the RPC package are stable. There are multiple ongoing work items
to improve performance and error handling, which will ship in future releases.
APIs in the RPC package are stable and in maintenance mode.
```
```{warning}
CUDA support was introduced in PyTorch 1.9 and is still a **beta** feature.
CUDA support is a **beta** feature.
Not all features of the RPC package are yet compatible with CUDA support and
thus their use is discouraged. These unsupported features include: RRefs,
JIT compatibility, dist autograd and dist optimizer, and profiling. These
shortcomings will be addressed in future releases.
JIT compatibility, dist autograd and dist optimizer, and profiling.
```
```{note}
@ -102,13 +100,6 @@ device lists on source and destination workers do not match. In such cases,
applications can always explicitly move the input tensors to CPU on the caller
and move it to the desired devices on the callee if necessary.
```{warning}
TorchScript support in RPC is a prototype feature and subject to change. Since
v1.5.0, ``torch.distributed.rpc`` supports calling TorchScript functions as
RPC target functions, and this will help improve parallelism on the callee
side as executing TorchScript functions does not require GIL.
```
```{eval-rst}
.. autofunction:: rpc_sync
.. autofunction:: rpc_async
@ -159,9 +150,7 @@ multiple different transports (TCP, of course, but also shared memory, NVLink,
InfiniBand, ...) and can automatically detect their availability and negotiate
the best transport to use for each pipe.
The TensorPipe backend has been introduced in PyTorch v1.6 and is being actively
developed. At the moment, it only supports CPU tensors, with GPU support coming
soon. It comes with a TCP-based transport, just like Gloo. It is also able to
The TensorPipe backend comes with a TCP-based transport, just like Gloo. It is also able to
automatically chunk and multiplex large tensors over multiple sockets and
threads in order to achieve very high bandwidths. The agent will be able to pick
the best transport on its own, with no intervention required.
@ -301,6 +290,4 @@ to use [the profiler](https://pytorch.org/docs/stable/autograd.html#profiler) to
- [Getting started with Distributed RPC Framework](https://pytorch.org/tutorials/intermediate/rpc_tutorial.html)
- [Implementing a Parameter Server using Distributed RPC Framework](https://pytorch.org/tutorials/intermediate/rpc_param_server_tutorial.html)
- [Combining Distributed DataParallel with Distributed RPC Framework](https://pytorch.org/tutorials/advanced/rpc_ddp_tutorial.html) (covers **RemoteModule** as well)
- [Profiling RPC-based Workloads](https://pytorch.org/tutorials/recipes/distributed_rpc_profiling.html)
- [Implementing batch RPC processing](https://pytorch.org/tutorials/intermediate/rpc_async_execution.html)
- [Distributed Pipeline Parallel](https://pytorch.org/tutorials/intermediate/dist_pipeline_parallel_tutorial.html)

View File

@ -9,13 +9,13 @@ requires = [
# 77.0.0: min version for SPDX expression support for project.license
"setuptools>=62.3.0,<80.0",
"wheel",
"astunparse",
"cmake>=3.27",
"ninja",
"numpy",
"packaging",
"pyyaml",
"requests",
"six", # dependency chain: NNPACK -> PeachPy -> six
"typing-extensions>=4.10.0",
]
build-backend = "setuptools.build_meta"

View File

@ -1,5 +1,4 @@
# Python dependencies required for development
astunparse
build[uv] # for building sdist and wheel
cmake>=3.27
expecttest>=0.3.0
@ -18,6 +17,8 @@ pyyaml
requests
# setuptools develop deprecated on 80.0
setuptools>=62.3.0,<80.0
six # dependency chain: NNPACK -> PeachPy -> six
sympy>=1.13.3
types-dataclasses
typing-extensions>=4.13.2
wheel

View File

@ -15,4 +15,4 @@ pip install --no-use-pep517 -e "$tp2_dir/onnx"
# Install caffe2 and pytorch
pip install -r "$top_dir/caffe2/requirements.txt"
pip install -r "$top_dir/requirements.txt"
python setup.py develop
python -m pip install --no-build-isolation -v -e .

View File

@ -35,4 +35,4 @@ _pip_install -b "$BUILD_DIR/onnx" "file://$tp2_dir/onnx#egg=onnx"
# Install caffe2 and pytorch
pip install -r "$top_dir/caffe2/requirements.txt"
pip install -r "$top_dir/requirements.txt"
python setup.py install
python -m pip install --no-build-isolation -v .

View File

@ -263,6 +263,7 @@ import json
import shutil
import subprocess
import sysconfig
import textwrap
import time
from collections import defaultdict
from pathlib import Path
@ -601,7 +602,7 @@ def build_deps() -> None:
report(
'Finished running cmake. Run "ccmake build" or '
'"cmake-gui build" to adjust build options and '
'"python setup.py install" to build.'
'"python -m pip install --no-build-isolation -v ." to build.'
)
sys.exit()
@ -1207,24 +1208,25 @@ def configure_extension_build() -> tuple[
# post run, warnings, printed at the end to make them more visible
build_update_message = """
It is no longer necessary to use the 'build' or 'rebuild' targets
It is no longer necessary to use the 'build' or 'rebuild' targets
To install:
$ python setup.py install
To develop locally:
$ python setup.py develop
To force cmake to re-generate native build files (off by default):
$ CMAKE_FRESH=1 python setup.py develop
"""
To install:
$ python -m pip install --no-build-isolation -v .
To develop locally:
$ python -m pip install --no-build-isolation -v -e .
To force cmake to re-generate native build files (off by default):
$ CMAKE_FRESH=1 python -m pip install --no-build-isolation -v -e .
""".strip()
def print_box(msg: str) -> None:
lines = msg.split("\n")
size = max(len(l) + 1 for l in lines)
print("-" * (size + 2))
for l in lines:
print("|{}{}|".format(l, " " * (size - len(l))))
print("-" * (size + 2))
msg = textwrap.dedent(msg).strip()
lines = ["", *msg.split("\n"), ""]
max_width = max(len(l) for l in lines)
print("+" + "-" * (max_width + 4) + "+", file=sys.stderr, flush=True)
for line in lines:
print(f"| {line:<{max_width}s} |", file=sys.stderr, flush=True)
print("+" + "-" * (max_width + 4) + "+", file=sys.stderr, flush=True)
def main() -> None:
@ -1308,7 +1310,9 @@ def main() -> None:
"include/**/*.hpp",
"include/*.cuh",
"include/**/*.cuh",
"csrc/inductor/aoti_runtime/model.h",
"_inductor/codegen/*.h",
"_inductor/codegen/aoti_runtime/*.h",
"_inductor/codegen/aoti_runtime/*.cpp",
"_inductor/script.ld",
"_export/serde/*.yaml",

View File

@ -36,7 +36,7 @@ The following commands assume you are in PyTorch root.
```bash
# ... Build PyTorch from source, e.g.
python setup.py develop
python -m pip install --no-build-isolation -v -e .
# (re)build just the binary
ninja -C build bin/test_jit
# run tests

View File

@ -4,8 +4,8 @@ This folder contains a self-contained example of a PyTorch out-of-tree backend l
## How to use
Install as standalone with `python setup.py develop` (or install) from this folder.
You can run test via `python {PYTORCH_ROOT_PATH}/test/test_openreg.py`.
Install as standalone with `python -m pip install -e .` (or `python -m pip install .`)
from this folder. You can run test via `python {PYTORCH_ROOT_PATH}/test/test_openreg.py`.
## Design principles

View File

@ -1570,5 +1570,54 @@ class TestFullyShardForceSumReduction(FSDPTest):
self.assertRegex(logs, all_reduce_sum_re)
class TestFullyShardReduceOpWorldSize1(FSDPTest):
@property
def world_size(self) -> int:
return 1
def test_size1_reduceop(self):
from torch.distributed.distributed_c10d import ReduceOp
model = nn.Linear(1024, 1025)
ref_model = copy.deepcopy(model).to(device_type)
ref_optim = torch.optim.Adam(ref_model.parameters())
fully_shard(
model,
mesh=init_device_mesh(device_type.type, (1,)),
reshard_after_forward=False,
)
optim = torch.optim.Adam(model.parameters())
inp = torch.randn(1025, 1024, device=device_type.type)
for _ in range(3):
ref_optim.zero_grad()
ref_loss = ref_model(inp).sum()
ref_loss.backward()
for param in ref_model.parameters():
dist.all_reduce(param.grad, op=dist.ReduceOp.SUM)
ref_optim.step()
optim.zero_grad()
loss = model(inp).sum()
loss.backward()
optim.step()
self.assertEqual(loss, ref_loss)
self.assertEqual(
model.bias.grad._local_tensor,
ref_model.bias.grad,
)
state = model._get_fsdp_state()
fsdp_param_group = state._fsdp_param_group
group = fsdp_param_group.mesh_info.shard_process_group
(
_,
_,
_,
all_reduce_op,
) = _get_gradient_divide_factors(group, None, torch.float32)
self.assertEqual(all_reduce_op, ReduceOp.SUM)
if __name__ == "__main__":
run_tests()

View File

@ -554,21 +554,6 @@ class TestNew2dParallelTraining(DTensorTestBase):
p2 = p2.redistribute(p2.device_mesh, [Replicate()]).to_local()
self.assertTrue(torch.allclose(p1, p2), f"{p1} vs {p2}")
@with_comms
@skip_if_lt_x_gpu(4)
def test_raise_invalid_tp_composition(self):
with self.assertRaisesRegex(
RuntimeError, r"Found TP device_mesh on the \d dimension of its parent mesh"
):
mesh_2d = init_device_mesh(
self.device_type, (2, self.world_size // 2), mesh_dim_names=("tp", "dp")
)
parallelize_plan = {
"net1": ColwiseParallel(),
"net2": RowwiseParallel(),
}
parallelize_module(SimpleModel().cuda(), mesh_2d["tp"], parallelize_plan)
@with_comms
@skip_if_lt_x_gpu(4)
def test_2d_fsdp_state_enable_extension(self):

View File

@ -3182,7 +3182,7 @@ class NcclRegistrationTest(MultiProcessTestCase):
# Use NCCL memory allocator
# enable symmetric memory usage in NCCL
pool = torch.cuda.MemPool(backend.mem_allocator, symm_mem=True)
pool = torch.cuda.MemPool(backend.mem_allocator, symmetric=True)
# allocate memory with ncclMemAlloc
# note: symmetric kernels are not available for dtypes like torch.int64

Some files were not shown because too many files have changed in this diff.