[ghstack-poisoned]
Xuehai Pan
2025-07-11 15:05:13 +08:00
229 changed files with 5089 additions and 1410 deletions

View File

@@ -4,7 +4,7 @@ set -eux -o pipefail
 GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
 if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
-export TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0"
+export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
 fi
 SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"

View File

@@ -52,6 +52,8 @@ fi
 if [[ "$image" == *-jammy* ]]; then
 UBUNTU_VERSION=22.04
+elif [[ "$image" == *-noble* ]]; then
+UBUNTU_VERSION=24.04
 elif [[ "$image" == *ubuntu* ]]; then
 extract_version_from_image_name ubuntu UBUNTU_VERSION
 fi
@@ -230,8 +232,12 @@ case "$tag" in
 UCC_COMMIT=${_UCC_COMMIT}
 INDUCTOR_BENCHMARKS=yes
 ;;
-pytorch-linux-jammy-rocm-n-py3)
-ANACONDA_PYTHON_VERSION=3.10
+pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3)
+if [[ $tag =~ "jammy" ]]; then
+ANACONDA_PYTHON_VERSION=3.10
+else
+ANACONDA_PYTHON_VERSION=3.12
+fi
 GCC_VERSION=11
 VISION=yes
 ROCM_VERSION=6.4
@@ -322,6 +328,8 @@ case "$tag" in
 GCC_VERSION=11
 ACL=yes
 VISION=yes
+CONDA_CMAKE=yes
+OPENBLAS=yes
 # snadampal: skipping llvm src build install because the current version
 # from pytorch/llvm:9.0.1 is x86 specific
 SKIP_LLVM_SRC_BUILD_INSTALL=yes
@@ -331,6 +339,8 @@ case "$tag" in
 GCC_VERSION=11
 ACL=yes
 VISION=yes
+CONDA_CMAKE=yes
+OPENBLAS=yes
 # snadampal: skipping llvm src build install because the current version
 # from pytorch/llvm:9.0.1 is x86 specific
 SKIP_LLVM_SRC_BUILD_INSTALL=yes
@@ -417,6 +427,7 @@ docker build \
 --build-arg "XPU_VERSION=${XPU_VERSION}" \
 --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
 --build-arg "ACL=${ACL:-}" \
+--build-arg "OPENBLAS=${OPENBLAS:-}" \
 --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
 --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
 -f $(dirname ${DOCKERFILE})/Dockerfile \

View File

@@ -23,6 +23,10 @@ conda_install() {
 as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
 }
+conda_install_through_forge() {
+as_jenkins conda install -c conda-forge -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
+}
 conda_run() {
 as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output $*
 }

View File

@@ -15,6 +15,9 @@ install_ubuntu() {
 elif [[ "$UBUNTU_VERSION" == "22.04"* ]]; then
 cmake3="cmake=3.22*"
 maybe_libiomp_dev=""
+elif [[ "$UBUNTU_VERSION" == "24.04"* ]]; then
+cmake3="cmake=3.28*"
+maybe_libiomp_dev=""
 else
 cmake3="cmake=3.5*"
 maybe_libiomp_dev="libiomp-dev"

View File

@@ -70,10 +70,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
 fi
 # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
-if [[ $(uname -m) == "aarch64" ]]; then
-conda_install "openblas==0.3.29=*openmp*"
-else
-conda_install "mkl=2021.4.0 mkl-include=2021.4.0"
+if [[ $(uname -m) != "aarch64" ]]; then
+pip_install mkl==2024.2.0
+pip_install mkl-static==2024.2.0
+pip_install mkl-include==2024.2.0
 fi
 # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
@@ -87,6 +87,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
 conda_run ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION})
 fi
+if [[ "$UBUNTU_VERSION" == "24.04"* ]] ; then
+conda_install_through_forge libstdcxx-ng=14
+fi
 # Install some other packages, including those needed for Python test reporting
 pip_install -r /opt/conda/requirements-ci.txt

View File

@@ -4,8 +4,9 @@
 set -ex
 cd /
-git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.29}" --depth 1 --shallow-submodules
+git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules
+OPENBLAS_CHECKOUT_DIR="OpenBLAS"
 OPENBLAS_BUILD_FLAGS="
 NUM_THREADS=128
 USE_OPENMP=1
@@ -13,9 +14,8 @@ NO_SHARED=0
 DYNAMIC_ARCH=1
 TARGET=ARMV8
 CFLAGS=-O3
+BUILD_BFLOAT16=1
 "
-OPENBLAS_CHECKOUT_DIR="OpenBLAS"
 make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
 make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}

View File

@@ -8,9 +8,11 @@ ver() {
 install_ubuntu() {
 apt-get update
-if [[ $UBUNTU_VERSION == 20.04 ]]; then
-# gpg-agent is not available by default on 20.04
-apt-get install -y --no-install-recommends gpg-agent
+# gpg-agent is not available by default
+apt-get install -y --no-install-recommends gpg-agent
+if [[ $(ver $UBUNTU_VERSION) -ge $(ver 22.04) ]]; then
+echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \
+| sudo tee /etc/apt/preferences.d/rocm-pin-600
 fi
 apt-get install -y kmod
 apt-get install -y wget
@@ -85,13 +87,14 @@ EOF
 VER_STR=6.3
 fi
 # clr build needs CppHeaderParser but can only find it using conda's python
-/opt/conda/bin/python -m pip install CppHeaderParser
+python -m pip install CppHeaderParser
 git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
 HIP_COMMON_DIR=$(readlink -f HIP)
 git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}${VER_PATCH}-statco-hotfix
 mkdir -p clr/build
 pushd clr/build
-cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
+# Need to point CMake to the correct python installation to find CppHeaderParser
+cmake .. -DPython3_EXECUTABLE=/opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}/bin/python3 -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
 make -j
 cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.*
 popd

View File

@@ -41,7 +41,7 @@ case ${image} in
 GPU_IMAGE=arm64v8/almalinux:8
 DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1"
 MANY_LINUX_VERSION="2_28_aarch64"
-OPENBLAS_VERSION="v0.3.29"
+OPENBLAS_VERSION="v0.3.30"
 ;;
 manylinuxcxx11-abi-builder:cpu-cxx11-abi)
 TARGET=final

View File

@@ -16,6 +16,7 @@ click
 #test that import:
 coremltools==5.0b5 ; python_version < "3.12"
+coremltools==8.3 ; python_version == "3.12"
 #Description: Apple framework for ML integration
 #Pinned versions: 5.0b5
 #test that import:
@@ -63,6 +64,7 @@ lark==0.12.0
 #test that import:
 librosa>=0.6.2 ; python_version < "3.11"
+librosa==0.10.2 ; python_version == "3.12"
 #Description: A python package for music and audio analysis
 #Pinned versions: >=0.6.2
 #test that import: test_spectral_ops.py
@@ -111,6 +113,7 @@ ninja==1.11.1.3
 numba==0.49.0 ; python_version < "3.9"
 numba==0.55.2 ; python_version == "3.9"
 numba==0.55.2 ; python_version == "3.10"
+numba==0.60.0 ; python_version == "3.12"
 #Description: Just-In-Time Compiler for Numerical Functions
 #Pinned versions: 0.54.1, 0.49.0, <=0.49.1
 #test that import: test_numba_integration.py
@@ -360,10 +363,11 @@ pwlf==2.2.1
 # To build PyTorch itself
-astunparse
-PyYAML
+pyyaml
 pyzstd
 setuptools
+six
+wheel
 scons==4.5.2 ; platform_machine == "aarch64"

View File

@@ -5,7 +5,7 @@ sphinx==5.3.0
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought is probably
-# something related to Docker setup. We can investigate this later
+# something related to Docker setup. We can investigate this later.
 sphinxcontrib.katex==0.8.6
 #Description: This is used to generate PyTorch docs

View File

@@ -147,6 +147,12 @@ RUN if [ -n "${ACL}" ]; then bash ./install_acl.sh; fi
 RUN rm install_acl.sh
 ENV INSTALLED_ACL ${ACL}
+ARG OPENBLAS
+COPY ./common/install_openblas.sh install_openblas.sh
+RUN if [ -n "${OPENBLAS}" ]; then bash ./install_openblas.sh; fi
+RUN rm install_openblas.sh
+ENV INSTALLED_OPENBLAS ${OPENBLAS}
 # Install ccache/sccache (do this last, so we get priority in PATH)
 ARG SKIP_SCCACHE_INSTALL
 COPY ./common/install_cache.sh install_cache.sh

View File

@@ -104,7 +104,7 @@ if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
 export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr
 fi
-echo "Calling setup.py install at $(date)"
+echo "Calling 'python -m pip install .' at $(date)"
 if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
 STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"
@@ -120,7 +120,7 @@ fi
 # TODO: Remove this flag once https://github.com/pytorch/pytorch/issues/55952 is closed
 CFLAGS='-Wno-deprecated-declarations' \
 BUILD_LIBTORCH_CPU_WITH_DEBUG=1 \
-python setup.py install
+python -m pip install --no-build-isolation -v .
 mkdir -p libtorch/{lib,bin,include,share}

View File

@@ -185,7 +185,7 @@ torchbench_setup_macos() {
 }
 pip_benchmark_deps() {
-python -mpip install --no-input astunparse requests cython scikit-learn
+python -mpip install --no-input requests cython scikit-learn six
 }

View File

@@ -436,11 +436,11 @@ test_inductor_aoti() {
 python3 tools/amd_build/build_amd.py
 fi
 if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
-BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop)
+BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
 # TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
 TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
 else
-BUILD_COMMAND=(python setup.py develop)
+BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
 TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
 fi
@@ -1579,7 +1579,7 @@ test_operator_benchmark() {
 test_inductor_set_cpu_affinity
 cd benchmarks/operator_benchmark/pt_extension
-python setup.py install
+python -m pip install .
 cd "${TEST_DIR}"/benchmarks/operator_benchmark
 $TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \

View File

@@ -42,7 +42,7 @@ call choco upgrade -y cmake --no-progress --installargs 'ADD_CMAKE_TO_PATH=Syste
 if errorlevel 1 goto fail
 if not errorlevel 0 goto fail
-call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0
+call pip install mkl==2024.2.0 mkl-static==2024.2.0 mkl-include==2024.2.0
 if errorlevel 1 goto fail
 if not errorlevel 0 goto fail

View File

@@ -61,8 +61,8 @@ You are now all set to start developing with PyTorch in a DevContainer environme
 ## Step 8: Build PyTorch
 To build pytorch from source, simply run:
-```
-python setup.py develop
+```bash
+python -m pip install --no-build-isolation -v -e .
 ```
 The process involves compiling thousands of files, and would take a long time. Fortunately, the compiled objects can be useful for your next build. When you modify some files, you only need to compile the changed files the next time.

View File

@@ -1 +1 @@
-70caf76066ef2c1054d6128b11769dc816a779e7
+6c57850358f34c47802db216b0746e4e9d08a95a

View File

@@ -6,7 +6,7 @@ set -euxo pipefail
 cd llm-target-determinator
 pip install -q -r requirements.txt
 cd ../codellama
-pip install -e .
+pip install --no-build-isolation -v -e .
 pip install numpy==1.26.0
 # Run indexer

View File

@@ -131,6 +131,9 @@ jobs:
 if: inputs.build-environment != 'linux-s390x-binary-manywheel'
 with:
 github-secret: ${{ secrets.GITHUB_TOKEN }}
+instructions: |
+Build is done inside the container, to start an interactive session run:
+docker exec -it $(docker container ps --format '{{.ID}}') bash
 # [pytorch repo ref]
 # Use a pytorch/pytorch reference instead of a reference to the local

View File

@@ -88,6 +88,14 @@ jobs:
 pkill "${PROCESS}" || true
 done
+- name: Clean up brew miniconda, if installed
+continue-on-error: true
+run: |
+if brew list miniconda; then
+brew uninstall miniconda
+echo "REINSTALL_BREW_MINICONDA=1" >> "${GITHUB_ENV}"
+fi
 - name: Clean up leftover local python3 site-packages on MacOS pet runner
 continue-on-error: true
 run: |
@@ -268,6 +276,14 @@ jobs:
 workflow_attempt: ${{github.run_attempt}}
 local_path: usage_log.txt
+- name: Reinstall brew miniconda, if was installed
+if: always()
+continue-on-error: true
+run: |
+if [[ -n "$REINSTALL_BREW_MINICONDA" ]]; then
+brew install miniconda
+fi
 - name: Clean up disk space
 if: always()
 continue-on-error: true

View File

@@ -63,6 +63,7 @@ jobs:
 pytorch-linux-jammy-py3.13-clang12,
 pytorch-linux-jammy-rocm-n-1-py3,
 pytorch-linux-jammy-rocm-n-py3,
+pytorch-linux-noble-rocm-n-py3,
 pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
 pytorch-linux-jammy-py3.9-gcc11,
 pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,

View File

@@ -36,15 +36,15 @@ jobs:
 curr_branch: ${{ github.head_ref || github.ref_name }}
 curr_ref_type: ${{ github.ref_type }}
-linux-jammy-rocm-py3_10-build:
+linux-noble-rocm-py3_12-build:
 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-name: linux-jammy-rocm-py3.10-mi300
+name: linux-noble-rocm-py3.12-mi300
 uses: ./.github/workflows/_linux-build.yml
 needs: get-label-type
 with:
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-build-environment: linux-jammy-rocm-py3.10-mi300
-docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
+build-environment: linux-noble-rocm-py3.12-mi300
+docker-image-name: ci-image:pytorch-linux-noble-rocm-n-py3
 sync-tag: rocm-build
 test-matrix: |
 { include: [
@@ -57,17 +57,17 @@ jobs:
 ]}
 secrets: inherit
-linux-jammy-rocm-py3_10-test:
+linux-noble-rocm-py3_12-test:
 permissions:
 id-token: write
 contents: read
-name: linux-jammy-rocm-py3.10-mi300
+name: linux-noble-rocm-py3.12-mi300
 uses: ./.github/workflows/_rocm-test.yml
 needs:
-- linux-jammy-rocm-py3_10-build
+- linux-noble-rocm-py3_12-build
 - target-determination
 with:
-build-environment: linux-jammy-rocm-py3.10-mi300
+build-environment: linux-noble-rocm-py3.12-mi300
-docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
+docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
-test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
+test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
 secrets: inherit

View File

@@ -231,7 +231,8 @@ include_patterns = [
 'c10/**/*.cpp',
 'c10/**/*.h',
 'torch/*.h',
-'torch/_inductor/codegen/aoti_runtime/interface.cpp',
+'torch/_inductor/codegen/aoti_runtime/*.h',
+'torch/_inductor/codegen/aoti_runtime/*.cpp',
 'torch/csrc/*.h',
 'torch/csrc/*.cpp',
 'torch/csrc/**/*.h',
@@ -1476,6 +1477,31 @@ init_command = [
 'tomli==2.2.1 ; python_version < "3.11"',
 ]
+[[linter]]
+code = 'CMAKE_MINIMUM_REQUIRED'
+command = [
+'python3',
+'tools/linter/adapters/cmake_minimum_required_linter.py',
+'--',
+'@{{PATHSFILE}}'
+]
+include_patterns = [
+"**/pyproject.toml",
+"**/CMakeLists.txt",
+"**/CMakeLists.txt.in",
+"**/*.cmake",
+"**/*.cmake.in",
+"**/*requirements*.txt",
+"**/*requirements*.in",
+]
+init_command = [
+'python3',
+'tools/linter/adapters/pip_init.py',
+'--dry-run={{DRYRUN}}',
+'packaging==25.0',
+'tomli==2.2.1 ; python_version < "3.11"',
+]
 [[linter]]
 code = 'COPYRIGHT'
 include_patterns = ['**']

View File

@@ -88,20 +88,19 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
 * If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below.
-* When installing with `python setup.py develop` (in contrast to `python setup.py install`) Python runtime will use
+* When installing with `python -m pip install -e .` (in contrast to `python -m pip install .`) Python runtime will use
 the current local source-tree when importing `torch` package. (This is done by creating [`.egg-link`](https://wiki.python.org/moin/PythonPackagingTerminology#egg-link) file in `site-packages` folder)
 This way you do not need to repeatedly install after modifying Python files (`.py`).
-However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or
-non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...).
+However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...).
-One way to avoid running `python setup.py develop` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac,
+One way to avoid running `python -m pip install -e .` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac,
 is to create a symbolic link from `build` folder to `torch/lib`, for example, by issuing following:
 ```bash
 pushd torch/lib; sh -c "ln -sf ../../build/lib/libtorch_cpu.* ."; popd
 ```
 Afterwards rebuilding a library (for example to rebuild `libtorch_cpu.so` issue `ninja torch_cpu` from `build` folder),
 would be sufficient to make change visible in `torch` package.
 To reinstall, first uninstall all existing PyTorch installs. You may need to run `pip
@@ -115,9 +114,9 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
 pip uninstall torch
 ```
-Next run `python setup.py clean`. After that, you can install in `develop` mode again.
+Next run `python setup.py clean`. After that, you can install in editable mode again.
-* If you run into errors when running `python setup.py develop`, here are some debugging steps:
+* If you run into errors when running `python -m pip install -e .`, here are some debugging steps:
 1. Run `printf '#include <stdio.h>\nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure
 your CMake works and can compile this simple Hello World program without errors.
 2. Nuke your `build` directory. The `setup.py` script compiles binaries into the `build` folder and caches many
@@ -130,13 +129,20 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
 git clean -xdf
 python setup.py clean
 git submodule update --init --recursive
-python setup.py develop
+python -m pip install -r requirements.txt
+python -m pip install --no-build-isolation -v -e .
 ```
-4. The main step within `python setup.py develop` is running `make` from the `build` directory. If you want to
+4. The main step within `python -m pip install -e .` is running `cmake --build build` from the `build` directory. If you want to
 experiment with some environment variables, you can pass them into the command:
 ```bash
-ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* python setup.py develop
+ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* CMAKE_FRESH=1 python -m pip install --no-build-isolation -v -e .
 ```
+5. Try installing PyTorch without build isolation by adding `--no-build-isolation` to the `pip install` command.
+This will use the current environment's packages instead of creating a new isolated environment for the build.
+```bash
+python -m pip install --no-build-isolation -v -e .
+```
 * If you run into issue running `git submodule update --init --recursive`. Please try the following:
 - If you encounter an error such as
@@ -639,9 +645,9 @@ can be selected interactively with your mouse to zoom in on a particular part of
 the program execution timeline. The `--native` command-line option tells
 `py-spy` to record stack frame entries for PyTorch C++ code. To get line numbers
 for C++ code it may be necessary to compile PyTorch in debug mode by prepending
-your `setup.py develop` call to compile PyTorch with `DEBUG=1`. Depending on
-your operating system it may also be necessary to run `py-spy` with root
-privileges.
+your `python -m pip install -e .` call to compile PyTorch with `DEBUG=1`.
+Depending on your operating system it may also be necessary to run `py-spy` with
+root privileges.
 `py-spy` can also work in an `htop`-like "live profiling" mode and can be
 tweaked to adjust the stack sampling rate, see the `py-spy` readme for more
@@ -649,7 +655,7 @@ details.
 ## Managing multiple build trees
-One downside to using `python setup.py develop` is that your development
+One downside to using `python -m pip install -e .` is that your development
 version of PyTorch will be installed globally on your account (e.g., if
 you run `import torch` anywhere else, the development version will be
 used).
@@ -663,7 +669,7 @@ specific build of PyTorch. To set one up:
 python -m venv pytorch-myfeature
 source pytorch-myfeature/bin/activate # or `& .\pytorch-myfeature\Scripts\Activate.ps1` on Windows
 # if you run python now, torch will NOT be installed
-python setup.py develop
+python -m pip install --no-build-isolation -v -e .
 ```
 ## C++ development tips
@@ -701,7 +707,9 @@ variables `DEBUG`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_CUDA`, `USE_FLASH_ATTEN
 For example:
 ```bash
-DEBUG=1 USE_DISTRIBUTED=0 USE_MKLDNN=0 USE_CUDA=0 BUILD_TEST=0 USE_FBGEMM=0 USE_NNPACK=0 USE_QNNPACK=0 USE_XNNPACK=0 python setup.py develop
+DEBUG=1 USE_DISTRIBUTED=0 USE_MKLDNN=0 USE_CUDA=0 BUILD_TEST=0 \
+USE_FBGEMM=0 USE_NNPACK=0 USE_QNNPACK=0 USE_XNNPACK=0 \
+python -m pip install --no-build-isolation -v -e .
 ```
 For subsequent builds (i.e., when `build/CMakeCache.txt` exists), the build
@@ -711,7 +719,7 @@ options.
 ### Code completion and IDE support
-When using `python setup.py develop`, PyTorch will generate
+When using `python -m pip install -e .`, PyTorch will generate
 a `compile_commands.json` file that can be used by many editors
 to provide command completion and error highlighting for PyTorch's
 C++ code. You need to `pip install ninja` to generate accurate
@@ -772,7 +780,7 @@ If not, you can define these variables on the command line before invoking `setu
 export CMAKE_C_COMPILER_LAUNCHER=ccache
 export CMAKE_CXX_COMPILER_LAUNCHER=ccache
 export CMAKE_CUDA_COMPILER_LAUNCHER=ccache
-python setup.py develop
+python -m pip install --no-build-isolation -v -e .
 ```
 #### Use a faster linker
@@ -785,7 +793,7 @@ If you are editing a single file and rebuilding in a tight loop, the time spent
 Starting with CMake 3.29, you can specify the linker type using the [`CMAKE_LINKER_TYPE`](https://cmake.org/cmake/help/latest/variable/CMAKE_LINKER_TYPE.html) variable. For example, with `mold` installed:
 ```sh
-CMAKE_LINKER_TYPE=MOLD python setup.py develop
+CMAKE_LINKER_TYPE=MOLD python -m pip install --no-build-isolation -v -e .
 ```
 #### Use pre-compiled headers
@@ -797,7 +805,7 @@ setting `USE_PRECOMPILED_HEADERS=1` either on first setup, or in the
 `CMakeCache.txt` file.
 ```sh
-USE_PRECOMPILED_HEADERS=1 python setup.py develop
+USE_PRECOMPILED_HEADERS=1 python -m pip install --no-build-isolation -v -e .
 ```
 This adds a build step where the compiler takes `<ATen/ATen.h>` and essentially
@@ -820,7 +828,7 @@ A compiler-wrapper to fix this is provided in `tools/nvcc_fix_deps.py`. You can
 this as a compiler launcher, similar to `ccache`
 ```bash
 export CMAKE_CUDA_COMPILER_LAUNCHER="python;`pwd`/tools/nvcc_fix_deps.py;ccache"
-python setup.py develop
+python -m pip install --no-build-isolation -v -e .
 ```
 ### Rebuild few files with debug information
@@ -1171,7 +1179,7 @@ build_with_asan()
 CFLAGS="-fsanitize=address -fno-sanitize-recover=all -shared-libasan -pthread" \
 CXX_FLAGS="-pthread" \
 USE_CUDA=0 USE_OPENMP=0 USE_DISTRIBUTED=0 DEBUG=1 \
-python setup.py develop
+python -m pip install --no-build-isolation -v -e .
 }
 run_with_asan()

View File

@@ -57,7 +57,7 @@ RUN --mount=type=cache,target=/opt/ccache \
 export eval ${CMAKE_VARS} && \
 TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
 CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
-python setup.py install
+python -m pip install --no-build-isolation -v .
 FROM conda as conda-installs
 ARG PYTHON_VERSION=3.11

View File

@@ -228,6 +228,7 @@ If you want to disable Intel GPU support, export the environment variable `USE_X
 Other potentially useful environment variables may be found in `setup.py`.
 #### Get the PyTorch Source
 ```bash
 git clone https://github.com/pytorch/pytorch
 cd pytorch
@@ -279,24 +280,29 @@ conda install -c conda-forge libuv=1.39
 ```
 #### Install PyTorch
 **On Linux**
 If you're compiling for AMD ROCm then first run this command:
 ```bash
 # Only run this if you're compiling for ROCm
 python tools/amd_build/build_amd.py
 ```
 Install PyTorch
 ```bash
 export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
-python setup.py develop
+python -m pip install -r requirements.txt
+python -m pip install --no-build-isolation -v -e .
 ```
 **On macOS**
 ```bash
-python3 setup.py develop
+python -m pip install -r requirements.txt
+python -m pip install --no-build-isolation -v -e .
 ```
 **On Windows**
@@ -308,7 +314,7 @@ If you want to build legacy python code, please refer to [Building on legacy cod
 In this mode PyTorch computations will run on your CPU, not your GPU.
 ```cmd
-python setup.py develop
+python -m pip install --no-build-isolation -v -e .
 ```
 Note on OpenMP: The desired OpenMP implementation is Intel OpenMP (iomp). In order to link against iomp, you'll need to manually download the library and set up the building environment by tweaking `CMAKE_INCLUDE_PATH` and `LIB`. The instruction [here](https://github.com/pytorch/pytorch/blob/main/docs/source/notes/windows.rst#building-from-source) is an example for setting up both MKL and Intel OpenMP. Without these configurations for CMake, Microsoft Visual C OpenMP runtime (vcomp) will be used.
@@ -329,7 +335,6 @@ Additional libraries such as
 You can refer to the [build_pytorch.bat](https://github.com/pytorch/pytorch/blob/main/.ci/pytorch/win-test-helpers/build_pytorch.bat) script for some other environment variables configurations
 ```cmd
 cmd
@@ -349,8 +354,7 @@ for /f "usebackq tokens=*" %i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\
 :: [Optional] If you want to override the CUDA host compiler
 set CUDAHOSTCXX=C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\HostX64\x64\cl.exe
-python setup.py develop
+python -m pip install --no-build-isolation -v -e .
 ```
 **Intel GPU builds**
@@ -372,7 +376,7 @@ if defined CMAKE_PREFIX_PATH (
 set "CMAKE_PREFIX_PATH=%CONDA_PREFIX%\Library"
 )
-python setup.py develop
+python -m pip install --no-build-isolation -v -e .
 ```
 ##### Adjust Build Options (Optional)
@@ -382,6 +386,7 @@ the following. For example, adjusting the pre-detected directories for CuDNN or
 with such a step.
 On Linux
 ```bash
 export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
 CMAKE_ONLY=1 python setup.py build
@@ -389,6 +394,7 @@ ccmake build # or cmake-gui build
 ```
 On macOS
 ```bash
 export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
 MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ CMAKE_ONLY=1 python setup.py build

View File

@@ -131,69 +131,25 @@ uint64_t CPUGeneratorImpl::seed() {
 /**
  * Sets the internal state of CPUGeneratorImpl. The new internal state
- * must be a strided CPU byte tensor and of the same size as either
- * CPUGeneratorImplStateLegacy (for legacy CPU generator state) or
- * CPUGeneratorImplState (for new state).
- *
- * FIXME: Remove support of the legacy state in the future?
+ * must be a strided CPU byte tensor and of the same size as CPUGeneratorImplState.
  */
 void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
 using detail::CPUGeneratorImplState;
 using detail::CPUGeneratorImplStateLegacy;
-static_assert(std::is_standard_layout_v<CPUGeneratorImplStateLegacy>, "CPUGeneratorImplStateLegacy is not a PODType");
 static_assert(std::is_standard_layout_v<CPUGeneratorImplState>, "CPUGeneratorImplState is not a PODType");
-static const size_t size_legacy = sizeof(CPUGeneratorImplStateLegacy);
-static const size_t size_current = sizeof(CPUGeneratorImplState);
-static_assert(size_legacy != size_current, "CPUGeneratorImplStateLegacy and CPUGeneratorImplState can't be of the same size");
+constexpr size_t size = sizeof(CPUGeneratorImplState);
 detail::check_rng_state(new_state);
 at::mt19937 engine;
-auto float_normal_sample = std::optional<float>();
-auto double_normal_sample = std::optional<double>();
-// Construct the state of at::CPUGeneratorImpl based on input byte tensor size.
-CPUGeneratorImplStateLegacy* legacy_pod{nullptr};
 auto new_state_size = new_state.numel();
-if (new_state_size == size_legacy) {
-legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data();
-// Note that in CPUGeneratorImplStateLegacy, we didn't have float version
-// of normal sample and hence we leave the std::optional<float> as is
-// Update next_double_normal_sample.
-// Note that CPUGeneratorImplStateLegacy stores two uniform values (normal_x, normal_y)
-// and a rho value (normal_rho). These three values were redundant and in the new
-// DistributionsHelper.h, we store the actual extra normal sample, rather than three
-// intermediate values.
-if (legacy_pod->normal_is_valid) {
-auto r = legacy_pod->normal_rho;
-auto theta = 2.0 * c10::pi<double> * legacy_pod->normal_x;
-// we return the sin version of the normal sample when in caching mode
-double_normal_sample = std::optional<double>(r * ::sin(theta));
-}
-} else if (new_state_size == size_current) {
-auto rng_state = (CPUGeneratorImplState*)new_state.data();
-legacy_pod = &rng_state->legacy_pod;
-// update next_float_normal_sample
-if (rng_state->is_next_float_normal_sample_valid) {
-float_normal_sample = std::optional<float>(rng_state->next_float_normal_sample);
-}
-// Update next_double_normal_sample.
-// Note that in getRNGState, we now return the actual normal sample in normal_y
-// and if it's valid in normal_is_valid. The redundant normal_x and normal_rho
-// are squashed to 0.0.
-if (legacy_pod->normal_is_valid) {
-double_normal_sample = std::optional<double>(legacy_pod->normal_y);
-}
-} else {
-TORCH_CHECK(false, "Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy,
-" or a CPUGeneratorImplState of size ", size_current,
-" but found the input RNG state size to be ", new_state_size);
-}
+TORCH_CHECK(new_state_size == size, "Expected a CPUGeneratorImplState of size ", size,
+" but found the input RNG state size to be ", new_state_size);
+auto rng_state = new_state.data_ptr_impl<CPUGeneratorImplState>();
+auto legacy_pod = &(rng_state->legacy_pod);
 // construct engine_
 // Note that CPUGeneratorImplStateLegacy stored a state array of 64 bit uints, whereas in our
 // redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are
@@ -207,8 +163,12 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
 engine.set_data(rng_data);
 TORCH_CHECK(engine.is_valid(), "Invalid mt19937 state");
 this->engine_ = engine;
-this->next_float_normal_sample_ = float_normal_sample;
-this->next_double_normal_sample_ = double_normal_sample;
+this->next_float_normal_sample_ = rng_state->is_next_float_normal_sample_valid
+? std::optional<float>(rng_state->next_float_normal_sample)
+: std::optional<float>();
+this->next_double_normal_sample_ = legacy_pod->normal_is_valid
+? std::optional<double>(legacy_pod->normal_y)
+: std::optional<double>();
 }
 /**

View File

@@ -431,7 +431,8 @@ class TORCH_API Context {
 at::SDPBackend::flash_attention,
 at::SDPBackend::efficient_attention,
 at::SDPBackend::math,
-at::SDPBackend::cudnn_attention};
+at::SDPBackend::cudnn_attention,
+at::SDPBackend::overrideable};
 bool enabled_flashSDP = true;
 bool enabled_mem_efficientSDP = true;
 bool enabled_mathSDP = true;

View File

@@ -26,9 +26,7 @@ inline void infer_size_impl(
 std::optional<int64_t> infer_dim;
 for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) {
 if (TORCH_GUARD_OR_FALSE(sym_eq(shape[dim], -1))) {
-if (infer_dim) {
-throw std::runtime_error("only one dimension can be inferred");
-}
+TORCH_CHECK(!infer_dim, "only one dimension can be inferred");
 infer_dim = dim;
 } else {
 // in case of unbacked shape[dim] we assume it's not -1 and add a runtime

View File

@@ -214,7 +214,7 @@ inline Tensor applySlice(
 "step must be greater than zero");
 // See NOTE [nested tensor size for indexing]
-if (self_sizes.has_value()) {
+if (self_sizes.has_value() && self_sizes.value().size() > 0) {
 // Skip this optimization if we are tracing, as the trace may be polymorphic
 // over the shape of the `self` tensor, and we still want to record
 // the slice.
@@ -223,7 +223,7 @@
 : self.sym_size(dim);
 if (!disable_slice_optimization &&
 TORCH_STATICALLY_KNOWN_TRUE(start.sym_eq(0)) &&
-TORCH_STATICALLY_KNOWN_TRUE(length.sym_eq(stop)) && step == 1) {
+TORCH_STATICALLY_KNOWN_TRUE(length.sym_le(stop)) && step == 1) {
 return self;
 }
 }

View File

@@ -59,9 +59,7 @@ struct TORCH_API Generator {
 explicit Generator(c10::intrusive_ptr<c10::GeneratorImpl> gen_impl)
 : impl_(std::move(gen_impl)) {
-if (impl_.get() == nullptr) {
-throw std::runtime_error("GeneratorImpl with nullptr is not supported");
-}
+TORCH_CHECK(impl_, "GeneratorImpl with nullptr is not supported");
 }
 bool operator==(const Generator& rhs) const {

View File

@@ -98,9 +98,7 @@ class TORCH_API TensorBase {
 explicit TensorBase(
 c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
 : impl_(std::move(tensor_impl)) {
-if (impl_.get() == nullptr) {
-throw std::runtime_error("TensorImpl with nullptr is not supported");
-}
+TORCH_CHECK(impl_, "TensorImpl with nullptr is not supported");
 }
 TensorBase(const TensorBase&) = default;
 TensorBase(TensorBase&&) noexcept = default;

View File

@@ -68,11 +68,10 @@ Symbol InternedStrings::_symbol(const std::string& s) {
 return it->second;
 auto pos = s.find("::");
-if (pos == std::string::npos) {
-std::stringstream ss;
-ss << "all symbols must have a namespace, <namespace>::<string>, but found: " << s;
-throw std::runtime_error(ss.str());
-}
+TORCH_CHECK(
+pos != std::string::npos,
+"all symbols must have a namespace, <namespace>::<string>, but found: ",
+s);
 Symbol ns = _symbol("namespaces::" + s.substr(0, pos));
 Symbol sym(sym_to_info_.size());
@@ -121,12 +120,11 @@ std::string Symbol::domainString() const {
 }
 Symbol Symbol::fromDomainAndUnqualString(const std::string & d, const std::string & s) {
-if (d.compare(0, domain_prefix().size(), domain_prefix()) != 0) {
-std::ostringstream ss;
-ss << "Symbol: domain string is expected to be prefixed with '"
-<< domain_prefix() << "', e.g. 'org.pytorch.aten'";
-throw std::runtime_error(ss.str());
-}
+TORCH_CHECK(
+d.compare(0, domain_prefix().size(), domain_prefix()) == 0,
+"Symbol: domain string is expected to be prefixed with '",
+domain_prefix(),
+"', e.g. 'org.pytorch.aten'");
 std::string qualString = d.substr(domain_prefix().size()) + "::" + s;
 return fromQualString(qualString);
 }

View File

@@ -406,8 +406,7 @@ size_t IValue::hash(const IValue& v) {
 case Tag::Enum:
 case Tag::Stream:
 case Tag::Uninitialized:
-throw std::runtime_error(
-"unhashable type: '" + v.type()->repr_str() + "'");
+TORCH_CHECK(false, "unhashable type: '" + v.type()->repr_str() + "'");
 }
 // the above switch should be exhaustive
 TORCH_INTERNAL_ASSERT(false, "we should never reach here")

View File

@@ -116,10 +116,9 @@ struct SingleElementType : public SharedType {
 protected:
 SingleElementType(TypePtr elem) : SharedType(Kind), elem(std::move(elem)) {
-if (!this->elem) {
-throw std::runtime_error(c10::str(
-"Can not create ", typeKindToString(Kind), " with None type"));
-}
+TORCH_CHECK(
+this->elem,
+c10::str("Can not create ", typeKindToString(Kind), " with None type"));
 }
 private:
@@ -416,16 +415,12 @@ struct TORCH_API SymbolicShape {
 }
 ShapeSymbol operator[](size_t i) const {
-if (!dims_) {
-throw std::runtime_error("Rank isn't fixed");
-}
+TORCH_CHECK(dims_, "Rank isn't fixed");
 return (*dims_).at(i);
 }
 ShapeSymbol at(size_t i) const {
-if (!dims_) {
-throw std::runtime_error("Rank isn't fixed");
-}
+TORCH_CHECK(dims_, "Rank isn't fixed");
 return (*dims_).at(i);
 }
@@ -520,9 +515,7 @@ struct VaryingShape {
 }
 const std::optional<T> &operator[](size_t i) const {
-if (!dims_) {
-throw std::runtime_error("Rank isn't fixed");
-}
+TORCH_CHECK(dims_, "Rank isn't fixed");
 return (*dims_).at(i);
 }
@@ -957,9 +950,7 @@ struct TORCH_API DictType : public SharedType {
 TypePtr createWithContained(
 std::vector<TypePtr> contained_types) const override {
-if (contained_types.size() != 2) {
-throw std::runtime_error("Expected 2 contained types");
-}
+TORCH_CHECK(contained_types.size() == 2, "Expected 2 contained types");
 return create(std::move(contained_types.at(0)), std::move(contained_types.at(1)));
 }

View File

@@ -826,9 +826,7 @@ TupleType::TupleType(
 : NamedType(TypeKind::TupleType, std::move(name)),
 elements_(std::move(elements)),
 has_free_variables_(std::any_of(elements_.begin(), elements_.end(), [](const TypePtr& v) {
-if (!v) {
-throw std::runtime_error("Can not create tuple with None type");
-}
+TORCH_CHECK(v, "Can not create tuple with None type");
 return v->hasFreeVariables();
 })), schema_(std::move(schema)) {

View File

@@ -163,6 +163,9 @@ class Vectorized<BFloat16> {
 Vectorized<BFloat16> exp_u20() const {
 return exp();
 }
+Vectorized<BFloat16> fexp_u20() const {
+return exp();
+}
 Vectorized<BFloat16> fmod(const Vectorized<BFloat16>& q) const;
 Vectorized<BFloat16> hypot(const Vectorized<BFloat16>& b) const;
 Vectorized<BFloat16> i0() const;

View File

@@ -249,6 +249,9 @@ class Vectorized<double> {
 Vectorized<double> exp_u20() const {
 return exp();
 }
+Vectorized<double> fexp_u20() const {
+return exp();
+}
 Vectorized<double> fmod(const Vectorized<double>& q) const {USE_SLEEF(
 { return Vectorized<double>(Sleef_fmoddx_sve(values, q)); },
 {

View File

@@ -314,6 +314,9 @@ class Vectorized<float> {
 Vectorized<float> exp_u20() const {
 return exp();
 }
+Vectorized<float> fexp_u20() const {
+return exp();
+}
 Vectorized<float> fmod(const Vectorized<float>& q) const {USE_SLEEF(
 { return Vectorized<float>(Sleef_fmodfx_sve(values, q)); },
 {

View File

@@ -308,6 +308,9 @@ class Vectorized<float> {
 Vectorized<float> exp_u20() const {
 return exp();
 }
+Vectorized<float> fexp_u20() const {
+return exp();
+}
 DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(
 fmod,
 Sleef_fmodf4)

View File

@@ -206,6 +206,10 @@ struct Vectorized16 {
 return static_cast<const Derived*>(this)->map_with_vec_float_method(
 &Vectorized<float>::exp_u20);
 }
+Derived fexp_u20() const {
+return static_cast<const Derived*>(this)->map_with_vec_float_method(
+&Vectorized<float>::exp_u20);
+}
 Derived fmod(const Derived& q) const {
 // This function is questionable with a conversion, so we use map2
 return map2(q, std::fmod);

View File

@@ -488,6 +488,9 @@ class Vectorized16 {
 Vectorized<T> expm1() const {
 return map(Sleef_expm1f8_u10);
 }
+Vectorized<T> fexp_u20() const {
+return exp();
+}
 Vectorized<T> exp_u20() const {
 return exp();
 }

View File

@@ -198,6 +198,9 @@ class Vectorized<double> {
 Vectorized<double> exp_u20() const {
 return exp();
 }
+Vectorized<double> fexp_u20() const {
+return exp();
+}
 Vectorized<double> fmod(const Vectorized<double>& q) const {
 return Vectorized<double>(Sleef_fmodd4(values, q));
 }

View File

@ -1,5 +1,4 @@
#pragma once #pragma once
// DO NOT DEFINE STATIC DATA IN THIS HEADER! // DO NOT DEFINE STATIC DATA IN THIS HEADER!
// See Note [Do not compile initializers with AVX] // See Note [Do not compile initializers with AVX]
@ -256,6 +255,63 @@ class Vectorized<float> {
Vectorized<float> expm1() const { Vectorized<float> expm1() const {
return Vectorized<float>(Sleef_expm1f8_u10(values)); return Vectorized<float>(Sleef_expm1f8_u10(values));
} }
Vectorized<float> fexp_u20() const {
const __m256 vec_c0 = _mm256_set1_ps(0.00010703434948458272f);
const __m256 vec_c1 = _mm256_set1_ps(0.30354260500649682f);
const __m256 vec_c2 = _mm256_set1_ps(-0.22433836478672356);
const __m256 vec_c3 = _mm256_set1_ps(-0.079204240219773236);
const __m256 vec_exp_log2ef =
_mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e)
const __m256 vec_a = _mm256_set1_ps(std::pow(2, 23) / std::log2(2));
const __m256 vec_b = _mm256_set1_ps(std::pow(2, 23) * 127.f);
const __m256 vec_ln_flt_min =
_mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50));
const __m256 vec_ln_flt_max =
_mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218));
const __m256 vec_inf = _mm256_set1_ps(INFINITY);
const __m256 zero = _mm256_setzero_ps();
// exp(x) = 2**(x * log2(e))
// = 2**xi * 2**xf - TIPS we are using the EEEE floating point
// representation with identification to the exponent and the
// mentissa
// 2**xf will be approximated to a polynomial of degree 3 computed with
// Horner method
// compute the min/max for the mask
// Masks
__m256 mask_too_small =
_mm256_cmp_ps(values, vec_ln_flt_min, _CMP_LT_OS); // x < min
__m256 mask_too_large =
_mm256_cmp_ps(values, vec_ln_flt_max, _CMP_GT_OS); // x > max
// transformation with log2(e)
auto vec_src = _mm256_mul_ps(values, vec_exp_log2ef);
auto vec_fractional = _mm256_sub_ps(vec_src, _mm256_floor_ps(vec_src));
// compute polynomial using Horner Scheme
auto vec_res = _mm256_fmadd_ps(vec_fractional, vec_c3, vec_c2);
vec_res = _mm256_fmadd_ps(vec_fractional, vec_res, vec_c1);
vec_res = _mm256_fmadd_ps(vec_fractional, vec_res, vec_c0);
vec_src = _mm256_sub_ps(vec_src, vec_res);
// the trick is here, headache in perspective
auto tmp = _mm256_fmadd_ps(vec_a, vec_src, vec_b);
// headache bis
__m256i casted_integer = _mm256_cvttps_epi32(tmp);
// bitwise to float for the final transformation
auto result = _mm256_castsi256_ps(casted_integer);
// boundary condition
// Set to 0 where x < ln(FLT_MIN)
result = _mm256_blendv_ps(result, zero, mask_too_small);
// Set to +inf where x > ln(FLT_MAX)
result = _mm256_blendv_ps(result, vec_inf, mask_too_large);
// final interpretation to float
return result;
}
Vectorized<float> exp_u20() const { Vectorized<float> exp_u20() const {
// A faster version of exp with ULP=20 // A faster version of exp with ULP=20
const __m256 vec_factorial_1 = const __m256 vec_factorial_1 =
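
To make the comments above concrete, here is a minimal scalar sketch of the same fast-exp bit trick (illustrative only, not part of this change; the helper name is made up and the mask-based boundary handling done above is omitted). Reinterpreting the integer 2**23 * (t + 127) as a float yields roughly 2**t, and the cubic polynomial corrects the error of treating the mantissa linearly:

```cpp
#include <cmath>
#include <cstdint>
#include <cstring>

// Hypothetical scalar analogue of fexp_u20 above (no AVX, no range checks).
inline float fast_exp_sketch(float x) {
  // Same polynomial coefficients as the vectorized code.
  const float c0 = 0.00010703434948458272f;
  const float c1 = 0.30354260500649682f;
  const float c2 = -0.22433836478672356f;
  const float c3 = -0.079204240219773236f;
  const float log2e = 1.442695041f;             // log2(e)
  const float a = 8388608.0f;                   // 2**23
  const float b = 8388608.0f * 127.0f;          // 2**23 * 127
  float t = x * log2e;                          // exp(x) = 2**t
  float f = t - std::floor(t);                  // fractional part of t
  float p = ((c3 * f + c2) * f + c1) * f + c0;  // Horner correction
  float biased = a * (t - p) + b;               // exponent-field arithmetic
  int32_t bits = static_cast<int32_t>(biased);
  float out;
  std::memcpy(&out, &bits, sizeof(out));        // reinterpret bits as float
  return out;                                   // roughly 1e-4 relative error
}
```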

View File

@ -121,27 +121,52 @@ typename std::enable_if_t<
} }
template <typename T> template <typename T>
typename std::enable_if_t< at::vec::Vectorized<T> inline convert_float_to_int8(
std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>, at::vec::Vectorized<float> src);
at::vec::Vectorized<
T>> inline convert_float_to_int8(at::vec::Vectorized<float> src) { template <>
at::vec::Vectorized<int8_t> inline convert_float_to_int8(
at::vec::Vectorized<float> src) {
// Convert from float32 to int32 with truncation // Convert from float32 to int32 with truncation
__m256i x_values_int32 = _mm256_cvttps_epi32(src); __m256i x_values_int32 = _mm256_cvttps_epi32(src);
// Convert from int32 to int16 using signed saturation // Convert from int32 to int16 using signed saturation
__m256i xy_packed_v = _mm256_packs_epi32(x_values_int32, x_values_int32); __m256i xy_packed_v = _mm256_packs_epi32(x_values_int32, x_values_int32);
constexpr auto min_val = std::numeric_limits<T>::min(); constexpr auto min_val = std::numeric_limits<int8_t>::min();
constexpr auto max_val = std::numeric_limits<T>::max(); constexpr auto max_val = std::numeric_limits<int8_t>::max();
// Convert from int16 to uint8/int8 using unsigned saturation // Convert from int16 to int8 using unsigned saturation
__m256i xyzw_clamped_v = __m256i xyzw_clamped_v = pack_saturate_and_clamp<int8_t>(
pack_saturate_and_clamp<T>(xy_packed_v, xy_packed_v, min_val, max_val); xy_packed_v, xy_packed_v, min_val, max_val);
__m256i permute_mask_v = __m256i permute_mask_v =
_mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
return _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v); return _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v);
} }
template <>
at::vec::Vectorized<uint8_t> inline convert_float_to_int8(
at::vec::Vectorized<float> src) {
// The type of *_val should be int32_t to ensure correct clamping behavior.
constexpr auto min_val = std::numeric_limits<int32_t>::min();
constexpr auto max_val = std::numeric_limits<int32_t>::max();
__m256 float32_min_val = _mm256_set1_ps(float(min_val));
__m256 float32_max_val = _mm256_set1_ps(float(max_val));
__m256 float32_src = _mm256_max_ps(src, float32_min_val);
float32_src = _mm256_min_ps(float32_src, float32_max_val);
__m256i truncated_src = _mm256_cvttps_epi32(float32_src);
__m128i r1 = _mm256_castsi256_si128(truncated_src);
__m128i mask = _mm_setr_epi8(
0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m128i r1_shuffled = _mm_shuffle_epi8(r1, mask);
__m128i r2 = _mm256_extractf128_si256(truncated_src, 1);
__m128i r2_shuffled = _mm_shuffle_epi8(r2, mask);
__m128i result = _mm_unpacklo_epi32(r1_shuffled, r2_shuffled);
return _mm256_castsi128_si256(result);
}
template <typename T> template <typename T>
__FORCE_INLINE void QuantizeAvx2( __FORCE_INLINE void QuantizeAvx2(
const float* src, const float* src,

View File

@ -273,6 +273,9 @@ class Vectorized<double> {
Vectorized<double> C10_ALWAYS_INLINE exp_u20() const { Vectorized<double> C10_ALWAYS_INLINE exp_u20() const {
return exp(); return exp();
} }
Vectorized<double> C10_ALWAYS_INLINE fexp_u20() const {
return exp();
}
Vectorized<double> lgamma() const __ubsan_ignore_undefined__ { Vectorized<double> lgamma() const __ubsan_ignore_undefined__ {
return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)}; return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)};

View File

@ -352,6 +352,9 @@ class Vectorized<float> {
Vectorized<float> C10_ALWAYS_INLINE exp_u20() const { Vectorized<float> C10_ALWAYS_INLINE exp_u20() const {
return exp(); return exp();
} }
Vectorized<float> C10_ALWAYS_INLINE fexp_u20() const {
return exp();
}
Vectorized<float> C10_ALWAYS_INLINE log() const { Vectorized<float> C10_ALWAYS_INLINE log() const {
return {Sleef_logf4_u10(_vec0), Sleef_logf4_u10(_vec1)}; return {Sleef_logf4_u10(_vec0), Sleef_logf4_u10(_vec1)};

View File

@ -1023,6 +1023,9 @@ struct Vectorized<T, std::enable_if_t<is_zarch_implemented<T>()>> {
Vectorized<T> exp_u20() const { Vectorized<T> exp_u20() const {
return exp(); return exp();
} }
Vectorized<T> fexp_u20() const {
return exp();
}
Vectorized<T> log() const { Vectorized<T> log() const {
return mapSleef(Sleef_logf4_u10, Sleef_logd2_u10); return mapSleef(Sleef_logf4_u10, Sleef_logd2_u10);

View File

@ -535,6 +535,9 @@ class Vectorized16 {
Vectorized<T> expm1() const { Vectorized<T> expm1() const {
return map(Sleef_expm1f16_u10); return map(Sleef_expm1f16_u10);
} }
Vectorized<T> fexp_u20() const {
return exp();
}
Vectorized<T> exp_u20() const { Vectorized<T> exp_u20() const {
return exp(); return exp();
} }

View File

@ -221,6 +221,9 @@ class Vectorized<double> {
Vectorized<double> exp_u20() const { Vectorized<double> exp_u20() const {
return exp(); return exp();
} }
Vectorized<double> fexp_u20() const {
return exp();
}
Vectorized<double> fmod(const Vectorized<double>& q) const { Vectorized<double> fmod(const Vectorized<double>& q) const {
return Vectorized<double>(Sleef_fmodd8(values, q)); return Vectorized<double>(Sleef_fmodd8(values, q));
} }

View File

@ -310,6 +310,60 @@ class Vectorized<float> {
Vectorized<float> expm1() const { Vectorized<float> expm1() const {
return Vectorized<float>(Sleef_expm1f16_u10(values)); return Vectorized<float>(Sleef_expm1f16_u10(values));
} }
Vectorized<float> fexp_u20() const {
const __m512 vec_c0 = _mm512_set1_ps(0.00010703434948458272f);
const __m512 vec_c1 = _mm512_set1_ps(0.30354260500649682f);
const __m512 vec_c2 = _mm512_set1_ps(-0.22433836478672356);
const __m512 vec_c3 = _mm512_set1_ps(-0.079204240219773236);
const __m512 vec_exp_log2ef =
_mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e)
const __m512 vec_a = _mm512_set1_ps(std::pow(2, 23) / std::log2(2));
const __m512 vec_b = _mm512_set1_ps(std::pow(2, 23) * 127.f);
const __m512 vec_ln_flt_min =
_mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
const __m512 vec_ln_flt_max =
_mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
__m512i vec_infinity = _mm512_set1_epi32(0x7F800000);
__m512i vec_zero = _mm512_setzero_epi32();
// Fast Exponential Computation on SIMD Architectures
// A. Cristiano I. Malossi, Yves Ineichen, Costas Bekas, and Alessandro
// Curioni
// exp(x) = 2**(x * log2(e))
//        = 2**xi * 2**xf - the trick: we use the IEEE floating point
// representation, identifying the exponent and the mantissa fields directly
// 2**xf will be approximated to a polynomial of degree 3 computed with
// Horner method
// mask for the boundary condition
auto min_mask = _mm512_cmp_ps_mask(values, vec_ln_flt_min, _CMP_LT_OS);
auto max_mask = _mm512_cmp_ps_mask(values, vec_ln_flt_max, _CMP_GT_OS);
// transformation with log2(e)
auto vec_src = _mm512_mul_ps(values, vec_exp_log2ef);
auto vec_fractional = _mm512_sub_ps(vec_src, _mm512_floor_ps(vec_src));
// compute polynomial using Horner Scheme, for superscalar processor
auto vec_res = _mm512_fmadd_ps(vec_fractional, vec_c3, vec_c2);
vec_res = _mm512_fmadd_ps(vec_fractional, vec_res, vec_c1);
vec_res = _mm512_fmadd_ps(vec_fractional, vec_res, vec_c0);
vec_src = _mm512_sub_ps(vec_src, vec_res);
// the trick is here, headache in perspective
auto tmp = _mm512_fmadd_ps(vec_a, vec_src, vec_b);
// headache bis - we lose precision with the cast, but it "fits", which is ok
// after the later f32 -> f16 conversion
__m512i casted_integer = _mm512_cvttps_epi32(tmp);
// boundary condition, lower than the min -> 0
casted_integer = _mm512_mask_mov_epi32(casted_integer, min_mask, vec_zero);
// boundary condition, larger than the max -> +oo
casted_integer =
_mm512_mask_mov_epi32(casted_integer, max_mask, vec_infinity);
// final interpretation to float
return _mm512_castsi512_ps(casted_integer);
}
Vectorized<float> exp_u20() const { Vectorized<float> exp_u20() const {
// A faster version of exp with ULP=20 // A faster version of exp with ULP=20
const __m512 vec_factorial_1 = const __m512 vec_factorial_1 =

View File

@ -123,22 +123,24 @@ typename std::enable_if_t<
} }
template <typename T> template <typename T>
typename std::enable_if_t< at::vec::Vectorized<T> inline convert_float_to_int8(
std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>, at::vec::Vectorized<float> src);
at::vec::Vectorized<
T>> inline convert_float_to_int8(at::vec::Vectorized<float> src) { template <>
at::vec::Vectorized<int8_t> inline convert_float_to_int8(
at::vec::Vectorized<float> src) {
// Convert from float32 to int32 with truncation // Convert from float32 to int32 with truncation
__m512i x_values_int32 = _mm512_cvttps_epi32(src); __m512i x_values_int32 = _mm512_cvttps_epi32(src);
// Convert from int32 to int16 using signed saturation // Convert from int32 to int16 using signed saturation
__m512i xy_packed_v = _mm512_packs_epi32(x_values_int32, x_values_int32); __m512i xy_packed_v = _mm512_packs_epi32(x_values_int32, x_values_int32);
constexpr auto min_val = std::numeric_limits<T>::min(); constexpr auto min_val = std::numeric_limits<int8_t>::min();
constexpr auto max_val = std::numeric_limits<T>::max(); constexpr auto max_val = std::numeric_limits<int8_t>::max();
// Convert from int16 to uint8/int8 using unsigned saturation // Convert from int16 to int8 using unsigned saturation
__m512i xyzw_clamped_v = __m512i xyzw_clamped_v = pack_saturate_and_clamp<int8_t>(
pack_saturate_and_clamp<T>(xy_packed_v, xy_packed_v, min_val, max_val); xy_packed_v, xy_packed_v, min_val, max_val);
__m512i permute_mask_v = _mm512_set_epi32( __m512i permute_mask_v = _mm512_set_epi32(
0x0f, 0x0f,
0x0b, 0x0b,
@ -159,6 +161,21 @@ typename std::enable_if_t<
return _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v); return _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v);
} }
template <>
at::vec::Vectorized<uint8_t> inline convert_float_to_int8(
at::vec::Vectorized<float> src) {
// The type of *_val should be int32_t to ensure correct clamping behavior.
constexpr auto min_val = std::numeric_limits<int32_t>::min();
constexpr auto max_val = std::numeric_limits<int32_t>::max();
__m512 float32_min_val = _mm512_set1_ps(float(min_val));
__m512 float32_max_val = _mm512_set1_ps(float(max_val));
__m512 float32_src = _mm512_max_ps(src, float32_min_val);
float32_src = _mm512_min_ps(float32_src, float32_max_val);
__m512i int32_src_clamped = _mm512_cvttps_epi32(float32_src);
__m128i int8_src = _mm512_cvtepi32_epi8(int32_src_clamped);
return _mm512_castsi128_si512(int8_src);
}
template <typename T> template <typename T>
__FORCE_INLINE void QuantizeAvx512( __FORCE_INLINE void QuantizeAvx512(
const float* src, const float* src,

View File

@ -238,9 +238,6 @@ struct Vectorized {
Vectorized vector; Vectorized vector;
int_same_size_t<T> buffer[size()]; int_same_size_t<T> buffer[size()];
mask.store(buffer); mask.store(buffer);
#if defined(__clang__) && __ARM_FEATURE_SVE
#pragma clang loop vectorize(disable)
#endif
for (const auto i : c10::irange(size())) { for (const auto i : c10::irange(size())) {
if (buffer[i] & 0x01) { if (buffer[i] & 0x01) {
vector[i] = b[i]; vector[i] = b[i];
@ -547,6 +544,9 @@ struct Vectorized {
Vectorized<T> exp_u20() const { Vectorized<T> exp_u20() const {
return map(std::exp); return map(std::exp);
} }
Vectorized<T> fexp_u20() const {
return map(std::exp);
}
Vectorized<T> frac() const { Vectorized<T> frac() const {
return *this - this->trunc(); return *this - this->trunc();
} }

View File

@ -263,6 +263,7 @@ class VectorizedN {
VECTORIZEDN_DEFINE_UNARY_OP(exp2) VECTORIZEDN_DEFINE_UNARY_OP(exp2)
VECTORIZEDN_DEFINE_UNARY_OP(expm1) VECTORIZEDN_DEFINE_UNARY_OP(expm1)
VECTORIZEDN_DEFINE_UNARY_OP(exp_u20) VECTORIZEDN_DEFINE_UNARY_OP(exp_u20)
VECTORIZEDN_DEFINE_UNARY_OP(fexp_u20)
VECTORIZEDN_DEFINE_UNARY_OP(frac) VECTORIZEDN_DEFINE_UNARY_OP(frac)
VECTORIZEDN_DEFINE_BINARY_OP(fmod) VECTORIZEDN_DEFINE_BINARY_OP(fmod)
VECTORIZEDN_DEFINE_UNARY_OP(log) VECTORIZEDN_DEFINE_UNARY_OP(log)

View File

@ -94,9 +94,10 @@ static std::vector<std::optional<Tensor>> batchIndices(
if (index.has_value() && index->sym_numel() != 0) { if (index.has_value() && index->sym_numel() != 0) {
const auto idx_bdim = indices_bdims[i]; const auto idx_bdim = indices_bdims[i];
indices_.emplace_back(maybePadToLogicalRank(moveBatchDimToFront(index.value(), idx_bdim), idx_bdim, maxLogicalRank)); indices_.emplace_back(maybePadToLogicalRank(moveBatchDimToFront(index.value(), idx_bdim), idx_bdim, maxLogicalRank));
if (index.value().dtype() == kBool && indices_bdims[i].has_value()) { TORCH_CHECK(
throw std::runtime_error("vmap: We do not support batching operators that can support dynamic shape. Attempting to batch over indexing with a boolean mask."); index.value().dtype() != kBool || !indices_bdims[i].has_value(),
} "vmap: We do not support batching operators that can support ",
"dynamic shape. Attempting to batch over indexing with a boolean mask.");
} else { } else {
indices_.push_back(index); indices_.push_back(index);
} }

View File

@ -16,11 +16,14 @@ template<typename O, typename C>
static void _assert_match(const O& original, const C& compared, const std::string& name) { static void _assert_match(const O& original, const C& compared, const std::string& name) {
if (compared) { if (compared) {
bool equal = (original == compared.value()); bool equal = (original == compared.value());
if (!equal) { TORCH_CHECK(
std::stringstream msg; equal,
msg << "Tensor " << name << " mismatch! Expected: " << compared.value() << ", Got: " << original; "Tensor ",
throw std::runtime_error(msg.str()); name,
} " mismatch! Expected: ",
compared.value(),
", Got: ",
original);
} }
} }

View File

@ -424,6 +424,14 @@ Tensor _dirichlet_grad_cpu(const Tensor& x, const Tensor& alpha, const Tensor& t
*/ */
Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, std::optional<Generator> gen) { Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, std::optional<Generator> gen) {
TORCH_CHECK_VALUE(
at::isFloatingType(count.scalar_type()),
"binomial only supports floating-point dtypes for count, got: ",
count.scalar_type());
TORCH_CHECK_VALUE(
at::isFloatingType(prob.scalar_type()),
"binomial only supports floating-point dtypes for prob, got: ",
prob.scalar_type());
Tensor ret = at::zeros(count.sizes(), count.options()); Tensor ret = at::zeros(count.sizes(), count.options());
auto iter = TensorIteratorConfig() auto iter = TensorIteratorConfig()
.add_output(ret) .add_output(ret)

View File

@ -180,9 +180,7 @@ TORCH_IMPL_FUNC(triu_cpu)(const Tensor& self, int64_t k, const Tensor &result) {
} }
Tensor trace_backward_symint(const Tensor& grad, c10::SymIntArrayRef sizes) { Tensor trace_backward_symint(const Tensor& grad, c10::SymIntArrayRef sizes) {
if (sizes.size() != 2) { TORCH_CHECK(sizes.size() == 2, "expected matrix input");
throw std::runtime_error("expected matrix input");
}
auto grad_input = at::zeros_symint(sizes[0] * sizes[1], grad.options()); auto grad_input = at::zeros_symint(sizes[0] * sizes[1], grad.options());
auto indices = at::arange(0, grad_input.numel(), sizes[1] + 1, grad.options().dtype(at::kLong)); auto indices = at::arange(0, grad_input.numel(), sizes[1] + 1, grad.options().dtype(at::kLong));

View File

@ -62,7 +62,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
virtual std::optional<at::Tensor> bias() = 0; virtual std::optional<at::Tensor> bias() = 0;
virtual void set_bias(const std::optional<at::Tensor>& bias) { virtual void set_bias(const std::optional<at::Tensor>& bias) {
throw std::runtime_error( TORCH_CHECK(
false,
"set_bias is not implemented for this packed " "set_bias is not implemented for this packed "
"parameter type"); "parameter type");
} }

View File

@ -96,7 +96,14 @@ inline void _exp_reduce_sum_fusion_kernel(
for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) { for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) {
auto tmp0 = vec::Vectorized<T1>::loadu(a + i); auto tmp0 = vec::Vectorized<T1>::loadu(a + i);
auto tmp1 = tmp0 - vec_max; auto tmp1 = tmp0 - vec_max;
auto tmp2 = tmp1.exp_u20(); Vectorized<T1> tmp2;
if constexpr (std::is_same_v<T1, float> &&
(std::is_same_v<T2, at::BFloat16> || std::is_same_v<T2, at::Half>))
{
tmp2 = tmp1.fexp_u20();
} else {
tmp2 = tmp1.exp_u20();
}
vec_tmp_sum += tmp2; vec_tmp_sum += tmp2;
_store(out + i, tmp2); _store(out + i, tmp2);
} }

View File

@ -48,12 +48,7 @@ __global__ void prepare_grouped_gemm_data(
int32_t start = tid == 0 ? 0 : offs[tid - 1]; int32_t start = tid == 0 ? 0 : offs[tid - 1];
delta = offs[tid] - start; delta = offs[tid] - start;
if (K < 0) { if (K < 0) {
if (!a_row_major && b_row_major) { CUDA_KERNEL_ASSERT(delta >=0 && "expected offsets to be greater or equal 0\n");
CUDA_KERNEL_ASSERT(delta >=0 && "expected offsets to be greater or equal 0\n");
} else {
// CUTLASS cannot handle delta=0 here.
CUDA_KERNEL_ASSERT(delta >0 && "expected offsets to be greater than 0\n");
}
} }
// TMA transfers require global memory tensor addresses to be // TMA transfers require global memory tensor addresses to be

View File

@ -337,6 +337,7 @@ Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization,
#include <cmath> #include <cmath>
#include <mkl_dfti.h> #include <mkl_dfti.h>
#include <mkl_version.h>
#include <ATen/mkl/Exceptions.h> #include <ATen/mkl/Exceptions.h>
#include <ATen/mkl/Descriptors.h> #include <ATen/mkl/Descriptors.h>
#include <ATen/mkl/Limits.h> #include <ATen/mkl/Limits.h>
@ -479,6 +480,19 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes,
const auto value_type = c10::toRealValueType(input.scalar_type()); const auto value_type = c10::toRealValueType(input.scalar_type());
out.resize_(batched_out_sizes, MemoryFormat::Contiguous); out.resize_(batched_out_sizes, MemoryFormat::Contiguous);
// fix mkl issue
// https://github.com/pytorch/pytorch/issues/154477
#ifdef INTEL_MKL_VERSION
#if INTEL_MKL_VERSION > 20210400L
for (const auto& stride : input.strides()) {
if (stride == 0) {
input = input.clone(MemoryFormat::Contiguous);
break;
}
}
#endif
#endif
auto descriptor = _plan_mkl_fft( auto descriptor = _plan_mkl_fft(
input.strides(), out.strides(), signal_size, input.is_complex(), input.strides(), out.strides(), signal_size, input.is_complex(),
out.is_complex(), normalization, forward, value_type); out.is_complex(), normalization, forward, value_type);

View File

@ -79,14 +79,16 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) {
// 2. Math fallback // 2. Math fallback
auto& ctx = at::globalContext(); auto& ctx = at::globalContext();
// use overrideable linked to onednn as overrideable implementation // use overrideable linked to onednn as overrideable implementation
if (!ctx.userEnabledMathSDP() && !ctx.userEnabledOverrideableSDP()) { if (!ctx.userEnabledMathSDP() && !ctx.userEnabledOverrideableSDP() &&
!ctx.userEnabledFlashSDP()) {
return sdp::SDPBackend::error; return sdp::SDPBackend::error;
} }
// Get ideal kernel ordering // Get ideal kernel ordering
const std::array<sdp::SDPBackend, 2> priority_order{ const std::array<sdp::SDPBackend, 3> priority_order{
sdp::SDPBackend::overrideable, sdp::SDPBackend::overrideable,
sdp::SDPBackend::math, sdp::SDPBackend::math,
sdp::SDPBackend::flash_attention,
}; };
// Because TORCHCHECK checks if condition is true we negate debug so that // Because TORCHCHECK checks if condition is true we negate debug so that
@ -105,6 +107,14 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) {
return sdp::SDPBackend::math; return sdp::SDPBackend::math;
} }
break; break;
case sdp::SDPBackend::flash_attention:
if (ctx.userEnabledFlashSDP() &&
use_overrideable_xpu(kernel_params, print_debug)) {
TORCH_WARN(
"Flash Attention is not supported on XPU, falling back to overrideable kernel.");
return sdp::SDPBackend::overrideable;
}
break;
default: default:
TORCH_CHECK(false, "Invalid backend"); TORCH_CHECK(false, "Invalid backend");
} }
@ -141,7 +151,7 @@ int64_t _fused_sdp_choice_xpu(
TORCH_CHECK( TORCH_CHECK(
false, false,
"No viable backend for scaled_dot_product_attention was found. ", "No viable backend for scaled_dot_product_attention was found. ",
"This is likely due to turning off both the math kernel and the fused kernels."); "This is likely due to turning off both the math kernel and the overrideable kernels.");
} }
return static_cast<int64_t>(backend); return static_cast<int64_t>(backend);
} }

View File

@ -1,8 +0,0 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/TensorFactory.h>
#include <c10/core/ScalarType.h>
#include <unordered_map>
using namespace at::mps;

View File

@ -18,8 +18,6 @@
#include <ATen/native/Resize.h> #include <ATen/native/Resize.h>
#include <ATen/native/TensorAdvancedIndexing.h> #include <ATen/native/TensorAdvancedIndexing.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h> #include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/operations/Indexing.h>
#include <c10/core/QScheme.h>
#include <c10/util/SmallVector.h> #include <c10/util/SmallVector.h>
#include <c10/util/irange.h> #include <c10/util/irange.h>
#include <fmt/format.h> #include <fmt/format.h>

View File

@ -746,7 +746,7 @@ inline std::tuple<bool, Tensor, Tensor> NestedTensor_compute_size_stride(
} }
else if (size_reshaped == -1) { else if (size_reshaped == -1) {
if (infer_index > -1) { if (infer_index > -1) {
throw std::runtime_error("only one dimension can be inferred"); TORCH_CHECK(false, "only one dimension can be inferred");
} }
else { else {
infer_index = idim; infer_index = idim;

View File

@ -19,7 +19,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
double /*output_scale*/, double /*output_scale*/,
int64_t /*output_zero_point*/, int64_t /*output_zero_point*/,
at::Tensor& output) { at::Tensor& output) {
throw std::runtime_error( TORCH_CHECK(
false,
"apply_out is not implemented for this packed " "apply_out is not implemented for this packed "
"parameter type"); "parameter type");
return output; return output;
@ -30,7 +31,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
double /*output_scale*/, double /*output_scale*/,
int64_t /*output_zero_point*/, int64_t /*output_zero_point*/,
at::Tensor& output) { at::Tensor& output) {
throw std::runtime_error( TORCH_CHECK(
false,
"apply_relu_out is not implemented for this packed " "apply_relu_out is not implemented for this packed "
"parameter type"); "parameter type");
return output; return output;
@ -55,7 +57,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
at::Tensor input, at::Tensor input,
double input_scale, double input_scale,
int64_t input_zero_point) { int64_t input_zero_point) {
throw std::runtime_error( TORCH_CHECK(
false,
"apply_with_input_q_dq_qweight_dq_output_fp32 is not implemented for this packed " "apply_with_input_q_dq_qweight_dq_output_fp32 is not implemented for this packed "
"parameter type"); "parameter type");
return {}; return {};
@ -79,7 +82,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
at::Tensor input, at::Tensor input,
double input_scale, double input_scale,
int64_t input_zero_point) { int64_t input_zero_point) {
throw std::runtime_error( TORCH_CHECK(
false,
"apply_with_input_q_dq_qweight_dq_relu_output_fp32 is not implemented for this packed " "apply_with_input_q_dq_qweight_dq_relu_output_fp32 is not implemented for this packed "
"parameter type"); "parameter type");
return {}; return {};
@ -96,7 +100,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
const at::Tensor& /* input */, const at::Tensor& /* input */,
at::Tensor& output, at::Tensor& output,
bool /* reduce_range */) { bool /* reduce_range */) {
throw std::runtime_error( TORCH_CHECK(
false,
"apply_dynamic_out is not implemented for this packed " "apply_dynamic_out is not implemented for this packed "
"parameter type"); "parameter type");
return output; return output;
@ -105,7 +110,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
const at::Tensor& /* input */, const at::Tensor& /* input */,
at::Tensor& output, at::Tensor& output,
bool /* reduce_range */) { bool /* reduce_range */) {
throw std::runtime_error( TORCH_CHECK(
false,
"apply_dynamic_relu_out is not implemented for this packed " "apply_dynamic_relu_out is not implemented for this packed "
"parameter type"); "parameter type");
return output; return output;
@ -116,7 +122,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
virtual std::optional<at::Tensor> bias() = 0; virtual std::optional<at::Tensor> bias() = 0;
virtual void set_bias(std::optional<at::Tensor> /*bias*/) { virtual void set_bias(std::optional<at::Tensor> /*bias*/) {
throw std::runtime_error( TORCH_CHECK(
false,
"set_bias is not implemented for this packed " "set_bias is not implemented for this packed "
"parameter type"); "parameter type");
} }

View File

@ -843,6 +843,11 @@ SDPBackend select_sdp_backend(sdp_params const& kernel_params) {
return SDPBackend::math; return SDPBackend::math;
} }
break; break;
case SDPBackend::overrideable:
if (ctx.userEnabledOverrideableSDP()) {
TORCH_CHECK(false, "Invalid backend");
}
break;
default: default:
TORCH_CHECK(false, "Invalid backend"); TORCH_CHECK(false, "Invalid backend");
} }

View File

@ -8,12 +8,12 @@ It also provides mechanisms to compare PyTorch with other frameworks.
Make sure you're on a machine with CUDA, torchvision, and pytorch installed. Install in the following order: Make sure you're on a machine with CUDA, torchvision, and pytorch installed. Install in the following order:
``` ```
# Install torchvision. It comes with the pytorch stable release binary # Install torchvision. It comes with the pytorch stable release binary
pip3 install torch torchvision python -m pip install torch torchvision
# Install the latest pytorch master from source. # Install the latest pytorch master from source.
# It should supersede the installation from the release binary. # It should supersede the installation from the release binary.
cd $PYTORCH_HOME cd $PYTORCH_HOME
python setup.py build develop python -m pip install --no-build-isolation -v -e .
# Check the pytorch installation version # Check the pytorch installation version
python -c "import torch; print(torch.__version__)" python -c "import torch; print(torch.__version__)"

View File

@ -27,7 +27,7 @@ pull-deps: clone-deps
(cd ../../../torchbenchmark && git fetch && git checkout "$$(cat ../pytorch/.github/ci_commit_pins/torchbench.txt)" && git submodule update --init --recursive) (cd ../../../torchbenchmark && git fetch && git checkout "$$(cat ../pytorch/.github/ci_commit_pins/torchbench.txt)" && git submodule update --init --recursive)
build-deps: clone-deps build-deps: clone-deps
uv pip install astunparse numpy scipy ninja pyyaml mkl mkl-include setuptools cmake \ uv pip install numpy scipy ninja pyyaml six mkl mkl-include setuptools wheel cmake \
typing-extensions requests protobuf numba cython scikit-learn librosa typing-extensions requests protobuf numba cython scikit-learn librosa
(cd ../../../torchvision && uv pip install -e . --no-build-isolation) (cd ../../../torchvision && uv pip install -e . --no-build-isolation)
(cd ../../../torchdata && uv pip install -e .) (cd ../../../torchdata && uv pip install -e .)

View File

@ -210,7 +210,7 @@ mobilenet_v2,pass,0
mobilenet_v2_quantized_qat,pass,2 mobilenet_v2_quantized_qat,pass,3
@ -274,7 +274,7 @@ resnet50,pass,0
resnet50_quantized_qat,pass,2 resnet50_quantized_qat,pass,3


View File

@ -210,7 +210,7 @@ mobilenet_v2,pass,0
mobilenet_v2_quantized_qat,pass,2 mobilenet_v2_quantized_qat,pass,3
@ -274,7 +274,7 @@ resnet50,pass,0
resnet50_quantized_qat,pass,2 resnet50_quantized_qat,pass,3


View File

@ -210,7 +210,7 @@ mobilenet_v2,pass,0
mobilenet_v2_quantized_qat,pass,2 mobilenet_v2_quantized_qat,pass,3
@ -274,7 +274,7 @@ resnet50,pass,0
resnet50_quantized_qat,pass,2 resnet50_quantized_qat,pass,3


View File

@ -194,7 +194,7 @@ mobilenet_v2,pass,0
mobilenet_v2_quantized_qat,pass,2 mobilenet_v2_quantized_qat,pass,3
@ -258,7 +258,7 @@ resnet50,pass,0
resnet50_quantized_qat,pass,2 resnet50_quantized_qat,pass,3


View File

@ -210,7 +210,7 @@ mobilenet_v2,pass,0
mobilenet_v2_quantized_qat,pass,2 mobilenet_v2_quantized_qat,pass,3
@ -274,7 +274,7 @@ resnet50,pass,0
resnet50_quantized_qat,pass,2 resnet50_quantized_qat,pass,3


View File

@ -17,8 +17,8 @@ export DEBUG=0
export OMP_NUM_THREADS=10 export OMP_NUM_THREADS=10
# Compile pytorch with the base revision # Compile pytorch with the base revision
git checkout master git checkout main
python setup.py develop python -m pip install --no-build-isolation -v -e .
# Install dependencies: # Install dependencies:
# Scipy is required by detr # Scipy is required by detr
@ -32,7 +32,7 @@ python functional_autograd_benchmark.py --output before.txt
# Compile pytorch with your change # Compile pytorch with your change
popd popd
git checkout your_feature_branch git checkout your_feature_branch
python setup.py develop python -m pip install --no-build-isolation -v -e .
# Run the benchmark for the new version # Run the benchmark for the new version
pushd benchmarks/functional_autograd_benchmark pushd benchmarks/functional_autograd_benchmark

View File

@ -20,7 +20,7 @@ Key Features:
The instruction below installs a cpp\_extension for PyTorch and it is required to run the benchmark suite. The instruction below installs a cpp\_extension for PyTorch and it is required to run the benchmark suite.
```bash ```bash
cd pt_extension cd pt_extension
python setup.py install python -m pip install .
``` ```
## How to run the benchmarks: ## How to run the benchmarks:

View File

@ -11,7 +11,7 @@ export USE_MKL=1
CMAKE_ONLY=1 python setup.py build CMAKE_ONLY=1 python setup.py build
ccmake build # or cmake-gui build ccmake build # or cmake-gui build
python setup.py install python -m pip install --no-build-isolation -v .
cd benchmarks cd benchmarks
echo "!! SPARSE SPMM TIME BENCHMARK!! " >> $OUTFILE echo "!! SPARSE SPMM TIME BENCHMARK!! " >> $OUTFILE
@ -28,7 +28,7 @@ echo "----- USE_MKL=0 ------" >> $OUTFILE
rm -rf build rm -rf build
export USE_MKL=0 export USE_MKL=0
python setup.py install python -m pip install --no-build-isolation -v .
cd benchmarks cd benchmarks
for dim0 in 1000 5000 10000; do for dim0 in 1000 5000 10000; do

View File

@ -0,0 +1,233 @@
#include <c10/core/AllocatorConfig.h>
#include <c10/core/DeviceType.h>
#include <c10/util/env.h>
#include <c10/util/irange.h>
namespace c10::CachingAllocator {
namespace {
constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
constexpr size_t kMB = 1024 * 1024ul;
constexpr size_t kRoundUpPowerOfTwoStart = 1 * kMB; // 1MB
constexpr size_t kRoundUpPowerOfTwoEnd = 64 * 1024ul * kMB; // 64GB
} // anonymous namespace
AcceleratorAllocatorConfig& AcceleratorAllocatorConfig::instance() {
static AcceleratorAllocatorConfig instance;
#define C10_ALLOCATOR_CONFIG_PARSE_ENV(env, deprecated) \
auto env##_name = c10::utils::get_env(#env); \
if (env##_name.has_value()) { \
if (deprecated) { \
TORCH_WARN_ONCE(#env " is deprecated, use PYTORCH_ALLOC_CONF instead"); \
} \
instance.parseArgs(env##_name.value()); \
return true; \
}
static bool env_flag [[maybe_unused]] = []() {
C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_ALLOC_CONF, false)
// Keep this for backwards compatibility
C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_CUDA_ALLOC_CONF, /*deprecated=*/true)
C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_HIP_ALLOC_CONF, /*deprecated=*/true)
return false;
}();
#undef C10_ALLOCATOR_CONFIG_PARSE_ENV
return instance;
}
AcceleratorAllocatorConfig::AcceleratorAllocatorConfig() {
roundup_power2_divisions_.assign(kRoundUpPowerOfTwoIntervals, 0);
}
size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) {
size_t log_size = (63 - llvm::countLeadingZeros(size));
// Our intervals start at 1MB and end at 64GB
const size_t interval_start =
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart);
const size_t interval_end =
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd);
TORCH_CHECK(
interval_end - interval_start == kRoundUpPowerOfTwoIntervals,
"kRoundUpPowerOfTwoIntervals mismatch");
size_t index =
(log_size > interval_start) ? (log_size - interval_start) : 0ul;
index = std::min(index, kRoundUpPowerOfTwoIntervals - 1);
return instance().roundup_power2_divisions_[index];
}
size_t AcceleratorAllocatorConfig::parseMaxSplitSize(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
constexpr size_t min_allowed_split_size_mb = kLargeBuffer / kMB;
constexpr size_t max_allowed_split_size_mb =
std::numeric_limits<size_t>::max() / kMB;
size_t val_env = tokenizer.toSizeT(++i);
TORCH_CHECK(
val_env >= min_allowed_split_size_mb,
"CachingAllocator option max_split_size_mb too small, must be >= ",
min_allowed_split_size_mb);
val_env = std::min(val_env, max_allowed_split_size_mb);
max_split_size_ = val_env * kMB;
return i;
}
size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
constexpr size_t min_allowed_split_size_mb = kLargeBuffer / kMB;
constexpr size_t max_allowed_split_size_mb =
std::numeric_limits<size_t>::max() / kMB;
size_t val_env = tokenizer.toSizeT(++i);
TORCH_CHECK(
val_env >= min_allowed_split_size_mb,
"CachingAllocator option max_non_split_rounding_mb too small, must be >= ",
min_allowed_split_size_mb);
val_env = std::min(val_env, max_allowed_split_size_mb);
max_non_split_rounding_size_ = val_env * kMB;
return i;
}
size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
double val_env = tokenizer.toDouble(++i);
TORCH_CHECK(
val_env > 0 && val_env < 1.0,
"garbage_collect_threshold is invalid, set it in (0.0, 1.0)");
garbage_collection_threshold_ = val_env;
return i;
}
size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
bool first_value = true;
if (tokenizer[++i] == "[") {
size_t last_index = 0;
// NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
while (++i < tokenizer.size() && tokenizer[i] != "]") {
size_t value_index = i;
tokenizer.checkToken(++i, ":");
size_t value = tokenizer.toSizeT(++i);
TORCH_CHECK(
value == 0 || llvm::isPowerOf2_64(value),
"For roundups, the divisions has to be power of 2 or 0 to disable roundup ");
if (tokenizer[value_index] == ">") {
std::fill(
std::next(
roundup_power2_divisions_.begin(),
static_cast<std::vector<size_t>::difference_type>(
last_index + 1)),
roundup_power2_divisions_.end(),
value);
} else {
size_t boundary = tokenizer.toSizeT(value_index);
TORCH_CHECK(
llvm::isPowerOf2_64(boundary),
"For roundups, the intervals have to be power of 2 ");
size_t index = 63 - llvm::countLeadingZeros(boundary);
index =
std::clamp(index, size_t{0}, roundup_power2_divisions_.size() - 1);
if (first_value) {
std::fill(
roundup_power2_divisions_.begin(),
std::next(
roundup_power2_divisions_.begin(),
static_cast<std::vector<size_t>::difference_type>(index)),
value);
first_value = false;
}
roundup_power2_divisions_[index] = value;
last_index = index;
}
if (tokenizer[i + 1] != "]") {
tokenizer.checkToken(++i, ",");
}
}
TORCH_INTERNAL_ASSERT(
i < tokenizer.size(),
"Expected closing bracket ']' in ConfigTokenizer but reached end of config");
} else { // Keep this for backwards compatibility
size_t value = tokenizer.toSizeT(i);
TORCH_CHECK(
llvm::isPowerOf2_64(value),
"For roundups, the divisions has to be power of 2 ");
std::fill(
roundup_power2_divisions_.begin(),
roundup_power2_divisions_.end(),
value);
}
return i;
}
size_t AcceleratorAllocatorConfig::parseExpandableSegments(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
use_expandable_segments_ = tokenizer.toBool(++i);
return i;
}
size_t AcceleratorAllocatorConfig::parsePinnedUseBackgroundThreads(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
pinned_use_background_threads_ = tokenizer.toBool(++i);
return i;
}
void AcceleratorAllocatorConfig::parseArgs(const std::string& env) {
// The following options will be reset to their default values if not
// explicitly set each time.
max_split_size_ = std::numeric_limits<size_t>::max();
roundup_power2_divisions_.assign(kRoundUpPowerOfTwoIntervals, 0);
garbage_collection_threshold_ = 0;
{
std::lock_guard<std::mutex> lock(last_allocator_settings_mutex_);
last_allocator_settings_ = env;
}
ConfigTokenizer tokenizer(env);
for (size_t i = 0; i < tokenizer.size(); i++) {
const auto& key = tokenizer[i];
if (key == "max_split_size_mb") {
i = parseMaxSplitSize(tokenizer, i);
} else if (key == "max_non_split_rounding_mb") {
i = parseMaxNonSplitRoundingSize(tokenizer, i);
} else if (key == "garbage_collection_threshold") {
i = parseGarbageCollectionThreshold(tokenizer, i);
} else if (key == "roundup_power2_divisions") {
i = parseRoundUpPower2Divisions(tokenizer, i);
} else if (key == "expandable_segments") {
i = parseExpandableSegments(tokenizer, i);
} else if (key == "pinned_use_background_threads") {
i = parsePinnedUseBackgroundThreads(tokenizer, i);
} else {
i = tokenizer.skipKey(i);
}
if (i + 1 < tokenizer.size()) {
tokenizer.checkToken(++i, ",");
}
}
}
} // namespace c10::CachingAllocator

c10/core/AllocatorConfig.h (new file)
View File

@ -0,0 +1,337 @@
#pragma once
#include <c10/core/DeviceType.h>
#include <c10/util/Exception.h>
#include <c10/util/llvmMathExtras.h>
#include <atomic>
#include <mutex>
#include <string>
#include <vector>
namespace c10::CachingAllocator {
// "large" allocations may be packed in 20 MiB blocks
const size_t kLargeBuffer = 20971520;
// A utility class for tokenizing allocator configuration strings into discrete
// parts. For example, the config string:
// "key1:val1,key2:[val2,val3]"
// is tokenized into:
// "key1", ":", "val1", ",", "key2", ":", "[", "val2", ",", "val3", "]",
//
// Tokens include keys, values, and special characters (':', ',', '[', ']').
// Whitespace is ignored.
class ConfigTokenizer {
public:
explicit ConfigTokenizer(const std::string& env) {
std::string buffer;
for (char ch : env) {
if (ch == ',' || ch == ':' || ch == '[' || ch == ']') {
if (!buffer.empty()) {
config_.emplace_back(std::move(buffer));
buffer.clear();
}
config_.emplace_back(1, ch);
} else if (!std::isspace(static_cast<unsigned char>(ch))) {
buffer += ch;
}
}
if (!buffer.empty()) {
config_.emplace_back(std::move(buffer));
}
}
const std::string& operator[](size_t i) const {
TORCH_INTERNAL_ASSERT(
i < config_.size(), "Index out of bounds in ConfigTokenizer");
return config_[i];
}
size_t size() const {
return config_.size();
}
bool checkToken(size_t i, const std::string& token) const {
checkIndex(i);
return config_[i] == token;
}
size_t toSizeT(size_t i) const {
checkIndex(i);
return std::stoull(config_[i]);
}
double toDouble(size_t i) const {
checkIndex(i);
return std::stod(config_[i]);
}
bool toBool(size_t i) const {
checkIndex(i);
const auto& token = config_[i];
if (token == "True") {
return true;
} else if (token == "False") {
return false;
} else {
TORCH_CHECK(
false,
"Expected 'True' or 'False' at index ",
i,
" in ConfigTokenizer but got '",
token,
"'");
}
}
// Skips the current token group and returns the index of the value token.
// Assumes the current index `i` points to a key name in a key-value pair.
size_t skipKey(size_t i) const {
// Expect a colon after the key
checkToken(++i, ":");
++i; // Move to the value
checkIndex(i);
if (config_[i] != "[") {
// Value is a single token (not a list) -> return its index
return i;
}
// Skip tokens inside the list until matching ']'
// NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
while (++i < config_.size() && config_[i] != "]") {
}
TORCH_INTERNAL_ASSERT(
i < config_.size(),
"Expected closing bracket ']' in ConfigTokenizer but reached end of config");
return i; // Return the index of the closing ']'
}
private:
void checkIndex(size_t i) const {
TORCH_INTERNAL_ASSERT(
i < config_.size(), "Index out of bounds in ConfigTokenizer");
}
std::vector<std::string> config_;
};
/**
* Note [AcceleratorAllocatorConfig design]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* This class configures memory allocation for both device and host memory. A
* single `AcceleratorAllocatorConfig` instance is shared across all accelerator
* backends, such as CUDA and XPU, under the assumption that relevant
* environment variables apply uniformly to all accelerators. Device-specific
* configuration extensions are supported via hooks (see
* `registerDeviceConfigParserHook`).
*
* Recommended design:
* - Place common configurations in `AcceleratorAllocatorConfig`.
* - Extend backend-specific configurations in corresponding device-specific
* classes, such as `CUDAAllocatorConfig`, etc.
*
* Scope:
* - Configuration options must be environment-variable driven.
*
* Naming Convention:
* - Public API names in `AcceleratorAllocatorConfig` should be device-generic.
* - Members prefixed with `pinned_` are specific to the host/pinned allocator.
* - Environment variable names should be generic across backends.
* - Comma-separated key-value pairs in the format: `key:value`. Use square
* brackets `[]` for list values. Example: `key1:123, key2:[val1,val2]`
*
* Environment Variables:
* - The primary environment variable for configuration is `PYTORCH_ALLOC_CONF`.
* - For backward compatibility, `PYTORCH_CUDA_ALLOC_CONF` is also supported
* with lower priority.
*/
class C10_API AcceleratorAllocatorConfig {
public:
static AcceleratorAllocatorConfig& instance();
C10_DISABLE_COPY_AND_ASSIGN(AcceleratorAllocatorConfig);
AcceleratorAllocatorConfig(AcceleratorAllocatorConfig&&) = delete;
AcceleratorAllocatorConfig& operator=(AcceleratorAllocatorConfig&&) = delete;
~AcceleratorAllocatorConfig() = default;
/* Device allocator settings */
// Returns the maximum block size (in MB) that is allowed to be split. The
// default is unlimited (all blocks can be split).
static size_t max_split_size() {
return instance().max_split_size_;
}
// Returns the maximum block size (in MB) that is allowed to be rounded up
// without requiring splitting when searching for a free block. The default is
// 20 MiB.
static size_t max_non_split_rounding_size() {
return instance().max_non_split_rounding_size_;
}
// Return the number of divisions used when rounding up allocation sizes (in
// MB) to the nearest power-of-2 boundary.
static size_t roundup_power2_divisions(size_t size);
// Returns the vector of division factors used for rounding up allocation
// sizes. These divisions apply to size intervals between 1MB and 64GB.
static std::vector<size_t> roundup_power2_divisions() {
return instance().roundup_power2_divisions_;
}
// Returns the threshold that triggers garbage collection when the ratio of
// used memory to maximum allowed memory exceeds this value. The default is 0,
// meaning no garbage collection is triggered. The value should be in the
// range (0.0, 1.0).
static double garbage_collection_threshold() {
return instance().garbage_collection_threshold_;
}
// Returns whether the expandable segment feature is enabled. This allows the
// allocator to start with one segment that grows as needed, rather than
// creating a new segment for each allocation. Default is false (expandable
// segments disabled).
static bool use_expandable_segments() {
return instance().use_expandable_segments_;
}
/* Host allocator settings */
// Returns whether the pinned host allocator uses background threads for
// processing events. This is useful for improving performance in scenarios
// where many small allocations are made. Default is false (background threads
// disabled).
static bool pinned_use_background_threads() {
return instance().pinned_use_background_threads_;
}
/* Settings for both device and host allocator */
// Returns the current allocator settings as a string. This string is useful
// to expand device-specific allocator configurations
static std::string last_allocator_settings() {
std::lock_guard<std::mutex> lock(instance().last_allocator_settings_mutex_);
return instance().last_allocator_settings_;
}
// Parses the environment variable `env` to update the allocator settings.
// If the environment variable is not set, it does nothing.
// The configuration string should be a comma-separated list of key-value
// pairs, where each key is a configuration option and the value is the
// corresponding setting. For example:
// "max_split_size_mb:100,max_non_split_rounding_mb:20,garbage_collection_threshold:0.5,roundup_power2_divisions:[64:8,256:4,1024:4,>:1],expandable_segments:true,pinned_use_background_threads:true"
void parseArgs(const std::string& env);
// Registers a device-specific configuration parser hook. This allows
// backends to parse additional device-specific configuration options from the
// environment variable. The hook should be a function that takes a string
// (the environment variable value) and parses it to set device-specific
// configuration options.
// The hook will be called when the environment variable is parsed.
// If a hook is already registered, it will be replaced with the new one.
void registerDeviceConfigParserHook(
std::function<void(const std::string&)> hook) {
device_config_parser_hook_ = std::move(hook);
}
// Calls the registered device-specific configuration parser hook with the
// provided environment string. This allows backends to parse additional
// device-specific configuration options from the environment variable.
// If no hook is registered, this function does nothing.
void callDeviceConfigParserHook(const std::string& env) const {
if (device_config_parser_hook_) {
device_config_parser_hook_(env);
}
}
private:
AcceleratorAllocatorConfig();
/* Internal functions for device allocator */
// Parse `max_split_size_mb` from environment variable.
size_t parseMaxSplitSize(const ConfigTokenizer& tokenizer, size_t i);
// Parse `max_non_split_rounding_mb` from environment variable.
size_t parseMaxNonSplitRoundingSize(
const ConfigTokenizer& tokenizer,
size_t i);
// Parse `garbage_collection_threshold` from environment variable.
size_t parseGarbageCollectionThreshold(
const ConfigTokenizer& tokenizer,
size_t i);
// Parse `roundup_power2_divisions` from environment variable.
size_t parseRoundUpPower2Divisions(
const ConfigTokenizer& tokenizer,
size_t i);
// Parse `expandable_segments` from environment variable.
size_t parseExpandableSegments(const ConfigTokenizer& tokenizer, size_t i);
/* Internal functions for host allocator */
// Parse `pinned_use_background_threads` from environment variable.
size_t parsePinnedUseBackgroundThreads(
const ConfigTokenizer& tokenizer,
size_t i);
/* The following members are specifically used for the device allocator. */
// The maximum block size that is allowed to be split.
std::atomic<size_t> max_split_size_{std::numeric_limits<size_t>::max()};
// The maximum allowable extra size of a memory block without requiring
// splitting when searching for a free block.
std::atomic<size_t> max_non_split_rounding_size_{kLargeBuffer};
// Used to store how memory allocations of different sizes should be rounded
// up to the nearest power of 2 divisions.
std::vector<size_t> roundup_power2_divisions_;
// The threshold that triggers garbage collection when the ratio of used
// memory to maximum allowed memory exceeds this value.
std::atomic<double> garbage_collection_threshold_{0};
// A flag to enable expandable segments feature.
std::atomic<bool> use_expandable_segments_{false};
/* The following members are specifically used for the host allocator. */
// A flag to enable background thread for processing events.
std::atomic<bool> pinned_use_background_threads_{false};
/* The following members are used for both device and host allocator. */
// Record the last allocator config environment setting.
std::mutex last_allocator_settings_mutex_;
std::string last_allocator_settings_;
// Optional hook for parsing additional device-specific allocator settings.
// This allows backends (e.g., CUDA, XPU) to register a custom parser for
// their own environment configuration extensions.
std::function<void(const std::string&)> device_config_parser_hook_{nullptr};
};
C10_API inline void setAllocatorSettings(const std::string& env) {
AcceleratorAllocatorConfig::instance().parseArgs(env);
AcceleratorAllocatorConfig::instance().callDeviceConfigParserHook(env);
}
C10_API inline std::string getAllocatorSettings() {
return AcceleratorAllocatorConfig::instance().last_allocator_settings();
}
struct DeviceConfigParserHookRegistry {
explicit DeviceConfigParserHookRegistry(
std::function<void(const std::string&)> hook) {
AcceleratorAllocatorConfig::instance().registerDeviceConfigParserHook(
std::move(hook));
}
};
#define REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(hook) \
namespace { \
static at::CachingAllocator::DeviceConfigParserHookRegistry \
g_device_config_parse_hook_registry_instance(hook); \
}
} // namespace c10::CachingAllocator
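
To make the intended flow concrete, here is a short sketch (illustrative only; the `main` below and the printed values are assumptions, not part of the change) of what a process gets by exporting `PYTORCH_ALLOC_CONF` before startup: the string is parsed once on first use and the shared options are then read through the static accessors.

```cpp
#include <c10/core/AllocatorConfig.h>
#include <iostream>

int main() {
  using namespace c10::CachingAllocator;
  // Equivalent to launching with
  //   PYTORCH_ALLOC_CONF="max_split_size_mb:128,expandable_segments:True"
  // but applied programmatically for the sake of the example.
  setAllocatorSettings("max_split_size_mb:128,expandable_segments:True");
  std::cout << AcceleratorAllocatorConfig::max_split_size() << '\n';           // 134217728 (128 MiB)
  std::cout << AcceleratorAllocatorConfig::use_expandable_segments() << '\n';  // 1
  return 0;
}
```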

View File

@ -1,30 +1,27 @@
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/driver_api.h> #include <c10/cuda/driver_api.h>
#include <c10/util/CallOnce.h> #include <c10/util/CallOnce.h>
#include <c10/util/Exception.h> #include <c10/util/Exception.h>
#include <c10/util/Logging.h>
#include <cuda_runtime.h>
#include <dlfcn.h> #include <dlfcn.h>
namespace c10::cuda { namespace c10::cuda {
namespace { namespace {
void* get_symbol(const char* name, int version);
DriverAPI create_driver_api() { DriverAPI create_driver_api() {
void* handle_0 = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_NOLOAD);
TORCH_CHECK(handle_0, "Can't open libcuda.so.1: ", dlerror());
void* handle_1 = DriverAPI::get_nvml_handle(); void* handle_1 = DriverAPI::get_nvml_handle();
DriverAPI r{}; DriverAPI r{};
#define LOOKUP_LIBCUDA_ENTRY(name) \ #define LOOKUP_LIBCUDA_ENTRY_WITH_VERSION(name, version) \
r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \ r.name##_ = reinterpret_cast<decltype(&name)>(get_symbol(#name, version)); \
TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name, ": ", dlerror()) TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name)
C10_LIBCUDA_DRIVER_API(LOOKUP_LIBCUDA_ENTRY) C10_LIBCUDA_DRIVER_API(LOOKUP_LIBCUDA_ENTRY_WITH_VERSION)
#undef LOOKUP_LIBCUDA_ENTRY #undef LOOKUP_LIBCUDA_ENTRY_WITH_VERSION
#define LOOKUP_LIBCUDA_ENTRY(name) \
r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \
dlerror();
C10_LIBCUDA_DRIVER_API_12030(LOOKUP_LIBCUDA_ENTRY)
#undef LOOKUP_LIBCUDA_ENTRY
if (handle_1) { if (handle_1) {
#define LOOKUP_NVML_ENTRY(name) \ #define LOOKUP_NVML_ENTRY(name) \
@ -35,6 +32,32 @@ DriverAPI create_driver_api() {
} }
return r; return r;
} }
void* get_symbol(const char* name, int version) {
void* out = nullptr;
cudaDriverEntryPointQueryResult qres{};
// CUDA 12.5+ supports version-based lookup
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12050)
if (auto st = cudaGetDriverEntryPointByVersion(
name, &out, version, cudaEnableDefault, &qres);
st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
return out;
}
#endif
// Fall back to the old API and try to get the symbol again.
if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres);
st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
return out;
}
// If the symbol cannot be resolved, report and return nullptr;
// the caller is responsible for checking the pointer.
LOG(INFO) << "Failed to resolve symbol " << name;
return nullptr;
}
} // namespace } // namespace
void* DriverAPI::get_nvml_handle() { void* DriverAPI::get_nvml_handle() {

View File

@ -20,30 +20,24 @@
} \ } \
} while (0) } while (0)
#define C10_LIBCUDA_DRIVER_API(_) \ #define C10_LIBCUDA_DRIVER_API(_) \
_(cuDeviceGetAttribute) \ _(cuDeviceGetAttribute, 12000) \
_(cuMemAddressReserve) \ _(cuMemAddressReserve, 12000) \
_(cuMemRelease) \ _(cuMemRelease, 12000) \
_(cuMemMap) \ _(cuMemMap, 12000) \
_(cuMemAddressFree) \ _(cuMemAddressFree, 12000) \
_(cuMemSetAccess) \ _(cuMemSetAccess, 12000) \
_(cuMemUnmap) \ _(cuMemUnmap, 12000) \
_(cuMemCreate) \ _(cuMemCreate, 12000) \
_(cuMemGetAllocationGranularity) \ _(cuMemGetAllocationGranularity, 12000) \
_(cuMemExportToShareableHandle) \ _(cuMemExportToShareableHandle, 12000) \
_(cuMemImportFromShareableHandle) \ _(cuMemImportFromShareableHandle, 12000) \
_(cuMemsetD32Async) \ _(cuMemsetD32Async, 12000) \
_(cuStreamWriteValue32) \ _(cuStreamWriteValue32, 12000) \
_(cuGetErrorString) _(cuGetErrorString, 12000) \
_(cuMulticastAddDevice, 12030) \
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030) _(cuMulticastBindMem, 12030) \
#define C10_LIBCUDA_DRIVER_API_12030(_) \ _(cuMulticastCreate, 12030)
_(cuMulticastAddDevice) \
_(cuMulticastBindMem) \
_(cuMulticastCreate)
#else
#define C10_LIBCUDA_DRIVER_API_12030(_)
#endif
#define C10_NVML_DRIVER_API(_) \ #define C10_NVML_DRIVER_API(_) \
_(nvmlInit_v2) \ _(nvmlInit_v2) \
@ -56,11 +50,13 @@
namespace c10::cuda { namespace c10::cuda {
struct DriverAPI { struct DriverAPI {
#define CREATE_MEMBER_VERSIONED(name, version) decltype(&name) name##_;
#define CREATE_MEMBER(name) decltype(&name) name##_; #define CREATE_MEMBER(name) decltype(&name) name##_;
C10_LIBCUDA_DRIVER_API(CREATE_MEMBER) C10_LIBCUDA_DRIVER_API(CREATE_MEMBER_VERSIONED)
C10_LIBCUDA_DRIVER_API_12030(CREATE_MEMBER)
C10_NVML_DRIVER_API(CREATE_MEMBER) C10_NVML_DRIVER_API(CREATE_MEMBER)
#undef CREATE_MEMBER_VERSIONED
#undef CREATE_MEMBER #undef CREATE_MEMBER
static DriverAPI* get(); static DriverAPI* get();
static void* get_nvml_handle(); static void* get_nvml_handle();
}; };

View File

@ -63,7 +63,6 @@ def define_c10_ovrsource(name, is_mobile):
"core/impl/*.h", "core/impl/*.h",
]), ]),
reexport_all_header_dependencies = False, reexport_all_header_dependencies = False,
# tests = C10_CPU_TEST_TARGETS,
visibility = [ visibility = [
"//xplat/caffe2/c10:c10_ovrsource", "//xplat/caffe2/c10:c10_ovrsource",
], ],
@ -84,25 +83,6 @@ def define_c10_ovrsource(name, is_mobile):
) )
def define_ovrsource_targets(): def define_ovrsource_targets():
# C10_CPU_TEST_FILES = native.glob([
# "test/core/*.cpp",
# "test/util/*.cpp",
# ])
# C10_GPU_TEST_FILES = native.glob([
# "cuda/test/**/*.cpp",
# ])
# C10_CPU_TEST_TARGETS = [
# ":" + paths.basename(test)[:-len(".cpp")] + "_ovrsource"
# for test in C10_CPU_TEST_FILES
# ]
# C10_GPU_TEST_TARGETS = [
# ":" + paths.basename(test)[:-len(".cpp")] + "_ovrsource"
# for test in C10_GPU_TEST_FILES
# ]
common_c10_cmake_defines = [ common_c10_cmake_defines = [
("#cmakedefine C10_BUILD_SHARED_LIBS", ""), ("#cmakedefine C10_BUILD_SHARED_LIBS", ""),
("#cmakedefine C10_USE_NUMA", ""), ("#cmakedefine C10_USE_NUMA", ""),
@ -207,7 +187,6 @@ def define_ovrsource_targets():
"cuda/impl/*.h", "cuda/impl/*.h",
]), ]),
reexport_all_header_dependencies = False, reexport_all_header_dependencies = False,
# tests = C10_GPU_TEST_TARGETS,
visibility = ["PUBLIC"], visibility = ["PUBLIC"],
deps = [ deps = [
"//third-party/cuda:libcuda", "//third-party/cuda:libcuda",
@ -217,64 +196,3 @@ def define_ovrsource_targets():
":c10_ovrsource", ":c10_ovrsource",
], ],
) )
# [
# oxx_test(
# name = paths.basename(test)[:-len(".cpp")] + "_ovrsource",
# srcs = [test],
# compatible_with = cpu_supported_platforms,
# compiler_flags = select({
# "DEFAULT": [],
# "ovr_config//compiler:cl": [
# "/w",
# ],
# "ovr_config//compiler:clang": [
# "-Wno-error",
# "-Wno-self-assign-overloaded",
# "-Wno-self-move",
# "-Wno-shadow",
# "-Wno-undef",
# "-Wno-unused-function",
# "-Wno-unused-variable",
# ],
# }),
# framework = "gtest",
# oncall = "ovrsource_pytorch",
# raw_headers = native.glob([
# "test/**/*.h",
# ]),
# deps = [
# ":c10_ovrsource",
# ],
# )
# for test in C10_CPU_TEST_FILES
# ]
# [
# oxx_test(
# name = paths.basename(test)[:-len(".cpp")] + "_ovrsource",
# srcs = [test],
# compatible_with = cuda_supported_platforms,
# compiler_flags = select({
# "DEFAULT": [],
# "ovr_config//compiler:cl": [
# "/w",
# ],
# "ovr_config//compiler:clang": [
# "-Wno-error",
# ],
# }),
# framework = "gtest",
# oncall = "ovrsource_pytorch",
# raw_headers = native.glob([
# "test/**/*.h",
# ]),
# runtime_shared_libraries = [
# "//third-party/cuda:cudart",
# ],
# deps = [
# ":c10_cuda_ovrsource",
# ],
# )
# for test in C10_GPU_TEST_FILES
# ]


@ -0,0 +1,123 @@
#include <c10/core/AllocatorConfig.h>
#include <gtest/gtest.h>
using namespace c10::CachingAllocator;
constexpr size_t kMB = 1024 * 1024ul;
struct ExtendedAllocatorConfig {
static ExtendedAllocatorConfig& instance() {
static ExtendedAllocatorConfig instance;
return instance;
}
// Returns the device-specific option value in bytes.
static size_t device_specific_option() {
return instance().device_specific_option_;
}
void parseArgs(const std::string& env) {
// Parse device-specific options from the environment variable
ConfigTokenizer tokenizer(env);
for (size_t i = 0; i < tokenizer.size(); i++) {
const auto& key = tokenizer[i];
if (key == "device_specific_option_mb") {
tokenizer.checkToken(++i, ":");
device_specific_option_ = tokenizer.toSizeT(++i) * kMB;
} else {
i = tokenizer.skipKey(i);
}
if (i + 1 < tokenizer.size()) {
tokenizer.checkToken(++i, ",");
}
}
}
private:
// Device-specific option, e.g., memory limit for a specific device.
std::atomic<size_t> device_specific_option_{0};
};
REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK([](const std::string& env) {
ExtendedAllocatorConfig::instance().parseArgs(env);
})
TEST(AllocatorConfigTest, allocator_config_test) {
std::string env =
"max_split_size_mb:40,"
"max_non_split_rounding_mb:30,"
"garbage_collection_threshold:0.5,"
"roundup_power2_divisions:[64:8,128:2,256:4,512:2,1024:4,>:1],"
"expandable_segments:True,"
"pinned_use_background_threads:True,"
"device_specific_option_mb:64";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::max_split_size(), 40 * kMB);
EXPECT_EQ(
AcceleratorAllocatorConfig::max_non_split_rounding_size(), 30 * kMB);
EXPECT_EQ(AcceleratorAllocatorConfig::garbage_collection_threshold(), 0.5);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(32 * kMB), 8);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(64 * kMB), 8);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(128 * kMB), 2);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 4);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 2);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 4);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 1);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(4096 * kMB), 1);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(8192 * kMB), 1);
EXPECT_EQ(AcceleratorAllocatorConfig::use_expandable_segments(), true);
EXPECT_EQ(AcceleratorAllocatorConfig::pinned_use_background_threads(), true);
EXPECT_EQ(ExtendedAllocatorConfig::device_specific_option(), 64 * kMB);
env =
"max_split_size_mb:20,"
"max_non_split_rounding_mb:40,"
"garbage_collection_threshold:0.8";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::max_split_size(), 20 * kMB);
EXPECT_EQ(
AcceleratorAllocatorConfig::max_non_split_rounding_size(), 40 * kMB);
EXPECT_EQ(AcceleratorAllocatorConfig::garbage_collection_threshold(), 0.8);
// roundup_power2_divisions knob array syntax
env = "roundup_power2_divisions:[128:8,256:16,512:1,2048:8,>:2]";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(64 * kMB), 8);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(128 * kMB), 8);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 16);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 1);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 0);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 8);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(4096 * kMB), 2);
// roundup_power2_divisions single value syntax for backward compatibility
env = "roundup_power2_divisions:4";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(64 * kMB), 4);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 4);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 4);
env = "expandable_segments:False,";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::use_expandable_segments(), false);
env = "pinned_use_background_threads:False";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::pinned_use_background_threads(), false);
}
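The test above also doubles as a template for backend-specific knobs: a device allocator can register its own parse hook and pick out only the keys it owns, while `skipKey` leaves everything else to the other registered parsers. A compact sketch of that pattern follows; the `my_backend_reserve_mb` key is invented purely for illustration, and only the `ConfigTokenizer` calls already exercised in the test are used.

```cpp
#include <atomic>
#include <cstddef>
#include <string>

#include <c10/core/AllocatorConfig.h>

using namespace c10::CachingAllocator;

// Hypothetical backend-specific knob, kept in bytes. In a real backend this
// would live in its own config singleton, as ExtendedAllocatorConfig does above.
static std::atomic<size_t> my_backend_reserve_bytes{0};

REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK([](const std::string& env) {
  ConfigTokenizer tokenizer(env);
  for (size_t i = 0; i < tokenizer.size(); i++) {
    if (tokenizer[i] == "my_backend_reserve_mb") {
      tokenizer.checkToken(++i, ":");
      my_backend_reserve_bytes = tokenizer.toSizeT(++i) * 1024 * 1024;
    } else {
      // Not ours: skip the key and its value so other parsers can claim it.
      i = tokenizer.skipKey(i);
    }
    if (i + 1 < tokenizer.size()) {
      tokenizer.checkToken(++i, ",");
    }
  }
})
```

With such a hook registered, a call like `setAllocatorSettings("max_split_size_mb:20,my_backend_reserve_mb:128")` should update both the shared `AcceleratorAllocatorConfig` values and the backend-specific one, mirroring what the test verifies for `device_specific_option_mb`.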


@ -4,6 +4,7 @@
// 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa.
#include <c10/macros/Macros.h>
+#include <c10/util/bit_cast.h>
#include <cmath>
#include <cstdint>
#include <cstring>
@ -67,13 +68,7 @@ inline C10_HOST_DEVICE uint16_t round_to_nearest_even(float src) {
#endif
return UINT16_C(0x7FC0);
} else {
-// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-union {
-uint32_t U32; // NOLINT(facebook-hte-BadMemberName)
-float F32; // NOLINT(facebook-hte-BadMemberName)
-};
-F32 = src;
+const uint32_t U32 = c10::bit_cast<uint32_t>(src);
uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
return static_cast<uint16_t>((U32 + rounding_bias) >> 16);
}
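The switch to `c10::bit_cast` does not change the arithmetic: the low 16 bits of the float plus a bias decide whether the retained high half is incremented, which is round-half-to-even on the last kept mantissa bit. A self-contained sketch of the same computation, where plain `memcpy` stands in for `c10::bit_cast` so the snippet builds without the c10 headers and the NaN branch is omitted:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Standalone version of the round-to-nearest-even float -> bfloat16
// truncation shown in the diff above.
static uint16_t float_to_bf16_rne(float src) {
  uint32_t u32;
  std::memcpy(&u32, &src, sizeof(u32));
  // If the dropped low half is > 0x8000, or exactly 0x8000 with an odd kept
  // low bit, the bias carries into the upper 16 bits and rounds them up.
  const uint32_t rounding_bias = ((u32 >> 16) & 1) + UINT32_C(0x7FFF);
  return static_cast<uint16_t>((u32 + rounding_bias) >> 16);
}

int main() {
  // 1.0f (0x3F800000) is exactly representable: stays 0x3F80.
  std::printf("%04X\n", static_cast<unsigned>(float_to_bf16_rne(1.0f)));
  // 1.00390625f (0x3F808000) is exactly halfway; ties go to the even
  // result 0x3F80 instead of 0x3F81.
  std::printf("%04X\n", static_cast<unsigned>(float_to_bf16_rne(1.00390625f)));
  return 0;
}
```

Run as-is this should print `3F80` twice, matching the behavior of the union-based code it replaces.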


@ -3,6 +3,8 @@
#include <cstring>
#include <type_traits>
+#include <c10/macros/Macros.h>
#if __has_include(<bit>) && (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L)
#include <bit>
#define C10_HAVE_STD_BIT_CAST 1
@ -23,7 +25,7 @@ using std::bit_cast;
// See https://en.cppreference.com/w/cpp/numeric/bit_cast for more
// information as well as the source of our implementations.
template <class To, class From>
-std::enable_if_t<
+C10_HOST_DEVICE std::enable_if_t<
sizeof(To) == sizeof(From) && std::is_trivially_copyable_v<From> &&
std::is_trivially_copyable_v<To>,
To>


@ -58,6 +58,9 @@ def define_targets(rules):
name = "bit_cast", name = "bit_cast",
hdrs = ["bit_cast.h"], hdrs = ["bit_cast.h"],
visibility = ["//:__subpackages__"], visibility = ["//:__subpackages__"],
deps = [
"//c10/macros",
],
) )
rules.cc_library( rules.cc_library(


@ -0,0 +1,18 @@
```{eval-rst}
.. role:: hidden
:class: hidden-section
```
```{eval-rst}
.. automodule:: torch.distributed._dist2
:members:
:undoc-members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: torch.distributed.ProcessGroup
:members:
:undoc-members:
:show-inheritance:
```


@ -224,6 +224,10 @@ inconsistent 'UUID' assignment across ranks, and to prevent races during initial
.. autofunction:: is_torchelastic_launched
```
+```{eval-rst}
+.. autofunction:: get_default_backend_for_device
+```
______________________________________________________________________
Currently three initialization methods are supported:
@ -1471,3 +1475,9 @@ If you are running single node training, it may be convenient to interactively b
```{eval-rst}
.. py:module:: torch.distributed.checkpoint.state_dict
```
+```{toctree}
+:hidden:
+distributed._dist2
+```


@ -8,16 +8,14 @@ higher-level API to automatically differentiate models split across several
machines.
```{warning}
-APIs in the RPC package are stable. There are multiple ongoing work items
-to improve performance and error handling, which will ship in future releases.
+APIs in the RPC package are stable and in maintenance mode.
```
```{warning}
-CUDA support was introduced in PyTorch 1.9 and is still a **beta** feature.
+CUDA support is a **beta** feature.
Not all features of the RPC package are yet compatible with CUDA support and
thus their use is discouraged. These unsupported features include: RRefs,
-JIT compatibility, dist autograd and dist optimizer, and profiling. These
-shortcomings will be addressed in future releases.
+JIT compatibility, dist autograd and dist optimizer, and profiling.
```
```{note}
@ -102,13 +100,6 @@ device lists on source and destination workers do not match. In such cases,
applications can always explicitly move the input tensors to CPU on the caller
and move it to the desired devices on the callee if necessary.
-```{warning}
-TorchScript support in RPC is a prototype feature and subject to change. Since
-v1.5.0, ``torch.distributed.rpc`` supports calling TorchScript functions as
-RPC target functions, and this will help improve parallelism on the callee
-side as executing TorchScript functions does not require GIL.
-```
```{eval-rst}
.. autofunction:: rpc_sync
.. autofunction:: rpc_async
@ -159,9 +150,7 @@ multiple different transports (TCP, of course, but also shared memory, NVLink,
InfiniBand, ...) and can automatically detect their availability and negotiate
the best transport to use for each pipe.
-The TensorPipe backend has been introduced in PyTorch v1.6 and is being actively
-developed. At the moment, it only supports CPU tensors, with GPU support coming
-soon. It comes with a TCP-based transport, just like Gloo. It is also able to
+The TensorPipe backend comes with a TCP-based transport, just like Gloo. It is also able to
automatically chunk and multiplex large tensors over multiple sockets and
threads in order to achieve very high bandwidths. The agent will be able to pick
the best transport on its own, with no intervention required.
@ -301,6 +290,4 @@ to use [the profiler](https://pytorch.org/docs/stable/autograd.html#profiler) to
- [Getting started with Distributed RPC Framework](https://pytorch.org/tutorials/intermediate/rpc_tutorial.html)
- [Implementing a Parameter Server using Distributed RPC Framework](https://pytorch.org/tutorials/intermediate/rpc_param_server_tutorial.html)
- [Combining Distributed DataParallel with Distributed RPC Framework](https://pytorch.org/tutorials/advanced/rpc_ddp_tutorial.html) (covers **RemoteModule** as well)
-- [Profiling RPC-based Workloads](https://pytorch.org/tutorials/recipes/distributed_rpc_profiling.html)
- [Implementing batch RPC processing](https://pytorch.org/tutorials/intermediate/rpc_async_execution.html)
-- [Distributed Pipeline Parallel](https://pytorch.org/tutorials/intermediate/dist_pipeline_parallel_tutorial.html)


@ -9,13 +9,13 @@ requires = [
# 77.0.0: min version for SPDX expression support for project.license
"setuptools>=62.3.0,<80.0",
"wheel",
-"astunparse",
"cmake>=3.27",
"ninja",
"numpy",
"packaging",
"pyyaml",
"requests",
-"six", # dependency chain: NNPACK -> PeachPy -> six
"typing-extensions>=4.10.0",
]
build-backend = "setuptools.build_meta"


@ -1,5 +1,4 @@
# Python dependencies required for development
-astunparse
build[uv] # for building sdist and wheel
cmake>=3.27
expecttest>=0.3.0
@ -18,6 +17,8 @@ pyyaml
requests
# setuptools develop deprecated on 80.0
setuptools>=62.3.0,<80.0
+six # dependency chain: NNPACK -> PeachPy -> six
sympy>=1.13.3
types-dataclasses
typing-extensions>=4.13.2
+wheel


@ -15,4 +15,4 @@ pip install --no-use-pep517 -e "$tp2_dir/onnx"
# Install caffe2 and pytorch
pip install -r "$top_dir/caffe2/requirements.txt"
pip install -r "$top_dir/requirements.txt"
-python setup.py develop
+python -m pip install --no-build-isolation -v -e .


@ -35,4 +35,4 @@ _pip_install -b "$BUILD_DIR/onnx" "file://$tp2_dir/onnx#egg=onnx"
# Install caffe2 and pytorch
pip install -r "$top_dir/caffe2/requirements.txt"
pip install -r "$top_dir/requirements.txt"
-python setup.py install
+python -m pip install --no-build-isolation -v .


@ -263,6 +263,7 @@ import json
import shutil
import subprocess
import sysconfig
+import textwrap
import time
from collections import defaultdict
from pathlib import Path
@ -601,7 +602,7 @@ def build_deps() -> None:
report(
'Finished running cmake. Run "ccmake build" or '
'"cmake-gui build" to adjust build options and '
-'"python setup.py install" to build.'
+'"python -m pip install --no-build-isolation -v ." to build.'
)
sys.exit()
@ -1207,24 +1208,25 @@ def configure_extension_build() -> tuple[
# post run, warnings, printed at the end to make them more visible
build_update_message = """
It is no longer necessary to use the 'build' or 'rebuild' targets
To install:
-  $ python setup.py install
+  $ python -m pip install --no-build-isolation -v .
To develop locally:
-  $ python setup.py develop
+  $ python -m pip install --no-build-isolation -v -e .
To force cmake to re-generate native build files (off by default):
-  $ CMAKE_FRESH=1 python setup.py develop
+  $ CMAKE_FRESH=1 python -m pip install --no-build-isolation -v -e .
-"""
+""".strip()
def print_box(msg: str) -> None:
-    lines = msg.split("\n")
-    size = max(len(l) + 1 for l in lines)
-    print("-" * (size + 2))
-    for l in lines:
-        print("|{}{}|".format(l, " " * (size - len(l))))
-    print("-" * (size + 2))
+    msg = textwrap.dedent(msg).strip()
+    lines = ["", *msg.split("\n"), ""]
+    max_width = max(len(l) for l in lines)
+    print("+" + "-" * (max_width + 4) + "+", file=sys.stderr, flush=True)
+    for line in lines:
+        print(f"| {line:<{max_width}s} |", file=sys.stderr, flush=True)
+    print("+" + "-" * (max_width + 4) + "+", file=sys.stderr, flush=True)
def main() -> None:
@ -1308,7 +1310,9 @@ def main() -> None:
"include/**/*.hpp", "include/**/*.hpp",
"include/*.cuh", "include/*.cuh",
"include/**/*.cuh", "include/**/*.cuh",
"csrc/inductor/aoti_runtime/model.h",
"_inductor/codegen/*.h", "_inductor/codegen/*.h",
"_inductor/codegen/aoti_runtime/*.h",
"_inductor/codegen/aoti_runtime/*.cpp", "_inductor/codegen/aoti_runtime/*.cpp",
"_inductor/script.ld", "_inductor/script.ld",
"_export/serde/*.yaml", "_export/serde/*.yaml",


@ -36,7 +36,7 @@ The following commands assume you are in PyTorch root.
```bash
# ... Build PyTorch from source, e.g.
-python setup.py develop
+python -m pip install --no-build-isolation -v -e .
# (re)build just the binary
ninja -C build bin/test_jit
# run tests


@ -4,8 +4,8 @@ This folder contains a self-contained example of a PyTorch out-of-tree backend l
## How to use
-Install as standalone with `python setup.py develop` (or install) from this folder.
-You can run test via `python {PYTORCH_ROOT_PATH}/test/test_openreg.py`.
+Install as standalone with `python -m pip install -e .` (or `python -m pip install .`)
+from this folder. You can run test via `python {PYTORCH_ROOT_PATH}/test/test_openreg.py`.
## Design principles


@ -1570,5 +1570,54 @@ class TestFullyShardForceSumReduction(FSDPTest):
self.assertRegex(logs, all_reduce_sum_re)
class TestFullyShardReduceOpWorldSize1(FSDPTest):
@property
def world_size(self) -> int:
return 1
def test_size1_reduceop(self):
from torch.distributed.distributed_c10d import ReduceOp
model = nn.Linear(1024, 1025)
ref_model = copy.deepcopy(model).to(device_type)
ref_optim = torch.optim.Adam(ref_model.parameters())
fully_shard(
model,
mesh=init_device_mesh(device_type.type, (1,)),
reshard_after_forward=False,
)
optim = torch.optim.Adam(model.parameters())
inp = torch.randn(1025, 1024, device=device_type.type)
for _ in range(3):
ref_optim.zero_grad()
ref_loss = ref_model(inp).sum()
ref_loss.backward()
for param in ref_model.parameters():
dist.all_reduce(param.grad, op=dist.ReduceOp.SUM)
ref_optim.step()
optim.zero_grad()
loss = model(inp).sum()
loss.backward()
optim.step()
self.assertEqual(loss, ref_loss)
self.assertEqual(
model.bias.grad._local_tensor,
ref_model.bias.grad,
)
state = model._get_fsdp_state()
fsdp_param_group = state._fsdp_param_group
group = fsdp_param_group.mesh_info.shard_process_group
(
_,
_,
_,
all_reduce_op,
) = _get_gradient_divide_factors(group, None, torch.float32)
self.assertEqual(all_reduce_op, ReduceOp.SUM)
if __name__ == "__main__":
    run_tests()


@ -554,21 +554,6 @@ class TestNew2dParallelTraining(DTensorTestBase):
p2 = p2.redistribute(p2.device_mesh, [Replicate()]).to_local()
self.assertTrue(torch.allclose(p1, p2), f"{p1} vs {p2}")
@with_comms
@skip_if_lt_x_gpu(4)
def test_raise_invalid_tp_composition(self):
with self.assertRaisesRegex(
RuntimeError, r"Found TP device_mesh on the \d dimension of its parent mesh"
):
mesh_2d = init_device_mesh(
self.device_type, (2, self.world_size // 2), mesh_dim_names=("tp", "dp")
)
parallelize_plan = {
"net1": ColwiseParallel(),
"net2": RowwiseParallel(),
}
parallelize_module(SimpleModel().cuda(), mesh_2d["tp"], parallelize_plan)
@with_comms
@skip_if_lt_x_gpu(4)
def test_2d_fsdp_state_enable_extension(self):


@ -3182,7 +3182,7 @@ class NcclRegistrationTest(MultiProcessTestCase):
# Use NCCL memory allocator
# enable symmetric memory usage in NCCL
-pool = torch.cuda.MemPool(backend.mem_allocator, symm_mem=True)
+pool = torch.cuda.MemPool(backend.mem_allocator, symmetric=True)
# allocate memory with ncclMemAlloc
# note: symmetric kernels are not available for dtypes like torch.int64
